parquet/file/metadata/options.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Options used to control metadata parsing
19
20use std::collections::HashSet;
21use std::sync::Arc;
22
23use crate::schema::types::SchemaDescPtr;
24
/// Policy selecting the columns for which certain Parquet statistics fields are decoded.
///
/// # Example
/// ```rust
/// use parquet::file::metadata::ParquetStatisticsPolicy;
/// use parquet::file::serialized_reader::ReadOptionsBuilder;
/// use parquet::arrow::arrow_reader::ArrowReaderOptions;
///
/// // Arrow reader: skip the encoding statistics of every column.
/// let options =
///     ArrowReaderOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll);
///
/// // Serialized reader: decode the encoding statistics of every column.
/// let options =
///     ReadOptionsBuilder::new().with_encoding_stats_policy(ParquetStatisticsPolicy::KeepAll)
///     .build();
///
/// // Arrow reader: decode the encoding statistics of columns 0 and 1 only and skip
/// // them for every other column.
/// let options = ArrowReaderOptions::new()
///     .with_encoding_stats_policy(ParquetStatisticsPolicy::skip_except(&[0, 1]));
/// ```
#[derive(Clone, Debug, Default)]
pub enum ParquetStatisticsPolicy {
    /// Every column has the relevant statistics decoded. This is the default policy.
    #[default]
    KeepAll,
    /// No column has the relevant statistics decoded.
    SkipAll,
    /// Only the columns whose indices are in the contained set have the relevant
    /// statistics decoded; all other columns are skipped.
    SkipExcept(Arc<HashSet<usize>>),
}

impl ParquetStatisticsPolicy {
    /// Build a policy that skips every column other than the ones listed in `keep`.
    ///
    /// An empty `keep` slice yields [`Self::SkipAll`]. Otherwise the indices are
    /// gathered into a set (so repeated indices collapse into one entry) and
    /// wrapped in [`Self::SkipExcept`].
    pub fn skip_except(keep: &[usize]) -> Self {
        // Nothing to keep: the simpler variant expresses the same policy.
        if keep.is_empty() {
            return Self::SkipAll;
        }

        let retained: HashSet<usize> = keep.iter().copied().collect();
        Self::SkipExcept(Arc::new(retained))
    }

    /// Reports whether this policy asks for the statistics of column `col_index`
    /// to be skipped (`true`) or decoded (`false`).
    pub(crate) fn is_skip(&self, col_index: usize) -> bool {
        match self {
            // A column is skipped exactly when it is absent from the keep-set.
            Self::SkipExcept(retained) => !retained.contains(&col_index),
            Self::SkipAll => true,
            Self::KeepAll => false,
        }
    }
}
82
83/// Options that can be set to control what parts of the Parquet file footer
84/// metadata will be decoded and made present in the [`ParquetMetaData`] returned
85/// by [`ParquetMetaDataReader`] and [`ParquetMetaDataPushDecoder`].
86///
87/// [`ParquetMetaData`]: crate::file::metadata::ParquetMetaData
88/// [`ParquetMetaDataReader`]: crate::file::metadata::ParquetMetaDataReader
89/// [`ParquetMetaDataPushDecoder`]: crate::file::metadata::ParquetMetaDataPushDecoder
90#[derive(Default, Debug, Clone)]
91pub struct ParquetMetaDataOptions {
92 schema_descr: Option<SchemaDescPtr>,
93 encoding_stats_as_mask: bool,
94 encoding_stats_policy: ParquetStatisticsPolicy,
95}
96
97impl ParquetMetaDataOptions {
98 /// Return a new default [`ParquetMetaDataOptions`].
99 pub fn new() -> Self {
100 Default::default()
101 }
102
103 /// Returns an optional [`SchemaDescPtr`] to use when decoding. If this is not `None` then
104 /// the schema in the footer will be skipped.
105 pub fn schema(&self) -> Option<&SchemaDescPtr> {
106 self.schema_descr.as_ref()
107 }
108
109 /// Provide a schema to use when decoding the metadata.
110 pub fn set_schema(&mut self, val: SchemaDescPtr) {
111 self.schema_descr = Some(val);
112 }
113
114 /// Call [`Self::set_schema`] and return `Self` for chaining.
115 pub fn with_schema(mut self, val: SchemaDescPtr) -> Self {
116 self.set_schema(val);
117 self
118 }
119
120 /// Returns whether to present the [`encoding_stats`] field of the Parquet `ColumnMetaData`
121 /// as a bitmask (defaults to `false`).
122 ///
123 /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
124 /// might be desirable.
125 ///
126 /// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
127 /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
128 /// [`encoding_stats`]:
129 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
130 pub fn encoding_stats_as_mask(&self) -> bool {
131 self.encoding_stats_as_mask
132 }
133
134 /// Convert [`encoding_stats`] from a vector of [`PageEncodingStats`] to a bitmask. This can
135 /// speed up metadata decoding while still enabling some use cases served by the full stats.
136 ///
137 /// Note that if for a given column both this option and `skip_encoding_stats` are `true`, the
138 /// stats will be skipped and not be returned as a mask.
139 ///
140 /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for more information.
141 ///
142 /// [`PageEncodingStats`]: crate::file::metadata::PageEncodingStats
143 /// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
144 /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
145 /// [`encoding_stats`]:
146 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
147 pub fn set_encoding_stats_as_mask(&mut self, val: bool) {
148 self.encoding_stats_as_mask = val;
149 }
150
151 /// Call [`Self::set_encoding_stats_as_mask`] and return `Self` for chaining.
152 pub fn with_encoding_stats_as_mask(mut self, val: bool) -> Self {
153 self.set_encoding_stats_as_mask(val);
154 self
155 }
156
157 /// Returns whether to skip decoding the [`encoding_stats`] in the Parquet `ColumnMetaData`
158 /// for the column indexed by `col_index`.
159 ///
160 /// [`encoding_stats`]:
161 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
162 pub fn skip_encoding_stats(&self, col_index: usize) -> bool {
163 self.encoding_stats_policy.is_skip(col_index)
164 }
165
166 /// Sets the decoding policy for [`encoding_stats`] in the Parquet `ColumnMetaData`.
167 ///
168 /// The default policy is to decode all `encoding_stats`.
169 ///
170 /// This option takes precedence over [`Self::encoding_stats_as_mask`].
171 ///
172 /// [`encoding_stats`]:
173 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
174 pub fn set_encoding_stats_policy(&mut self, policy: ParquetStatisticsPolicy) {
175 self.encoding_stats_policy = policy;
176 }
177
178 /// Call [`Self::set_encoding_stats_policy`] and return `Self` for chaining.
179 pub fn with_encoding_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self {
180 self.set_encoding_stats_policy(policy);
181 self
182 }
183}
184
#[cfg(test)]
mod tests {
    use bytes::Bytes;

    use crate::{
        DecodeResult,
        file::metadata::{ParquetMetaDataOptions, ParquetMetaDataPushDecoder},
        util::test_common::file_util::get_test_file,
    };
    use std::{io::Read, sync::Arc};

    /// Decoding a file with a caller-provided schema must produce metadata equal to a
    /// plain decode, and the resulting metadata must share the provided schema pointer.
    #[test]
    fn test_provide_schema() {
        let mut raw = Vec::<u8>::new();
        get_test_file("alltypes_plain.parquet")
            .read_to_end(&mut raw)
            .unwrap();

        let data = Bytes::from(raw);
        let file_len = data.len() as u64;

        // Feed the whole file to `decoder` and return the metadata it decodes.
        let decode = |mut decoder: ParquetMetaDataPushDecoder, bytes: Bytes| {
            decoder.push_range(0..file_len, bytes).unwrap();
            let DecodeResult::Data(decoded) = decoder.try_decode().unwrap() else {
                panic!("could not parse metadata");
            };
            decoded
        };

        // Baseline: decode without any options so the schema comes from the footer.
        let expected = decode(
            ParquetMetaDataPushDecoder::try_new(file_len).unwrap(),
            data.clone(),
        );

        // Second pass: hand the baseline schema to the decoder through the options.
        let options = Arc::new(
            ParquetMetaDataOptions::new().with_schema(expected.file_metadata().schema_descr_ptr()),
        );
        let metadata = decode(
            ParquetMetaDataPushDecoder::try_new(file_len)
                .unwrap()
                .with_metadata_options(Some(options)),
            data,
        );

        assert_eq!(expected, metadata);
        // the schema pointers should be the same
        assert!(Arc::ptr_eq(
            &expected.file_metadata().schema_descr_ptr(),
            &metadata.file_metadata().schema_descr_ptr()
        ));
    }
}
235}