1use crate::basic::Type;
21use crate::data_type::private::ParquetValueType;
22use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96};
23use crate::errors::ParquetError;
24use crate::file::metadata::LevelHistogram;
25use crate::format::{BoundaryOrder, ColumnIndex};
26use std::fmt::Debug;
27
28#[derive(Debug, Clone, PartialEq, Eq, Hash)]
32pub struct PageIndex<T> {
33 pub min: Option<T>,
35 pub max: Option<T>,
37 pub null_count: Option<i64>,
39 pub repetition_level_histogram: Option<LevelHistogram>,
44 pub definition_level_histogram: Option<LevelHistogram>,
50}
51
52impl<T> PageIndex<T> {
53 pub fn min(&self) -> Option<&T> {
57 self.min.as_ref()
58 }
59
60 pub fn max(&self) -> Option<&T> {
64 self.max.as_ref()
65 }
66
67 pub fn null_count(&self) -> Option<i64> {
69 self.null_count
70 }
71
72 pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
74 self.repetition_level_histogram.as_ref()
75 }
76
77 pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
79 self.definition_level_histogram.as_ref()
80 }
81}
82
83impl<T> PageIndex<T>
84where
85 T: AsBytes,
86{
87 pub fn max_bytes(&self) -> Option<&[u8]> {
91 self.max.as_ref().map(|x| x.as_bytes())
92 }
93
94 pub fn min_bytes(&self) -> Option<&[u8]> {
98 self.min.as_ref().map(|x| x.as_bytes())
99 }
100}
101
102#[derive(Debug, Clone, PartialEq)]
103#[allow(non_camel_case_types)]
104pub enum Index {
108 NONE,
112 BOOLEAN(NativeIndex<bool>),
114 INT32(NativeIndex<i32>),
116 INT64(NativeIndex<i64>),
118 INT96(NativeIndex<Int96>),
120 FLOAT(NativeIndex<f32>),
122 DOUBLE(NativeIndex<f64>),
124 BYTE_ARRAY(NativeIndex<ByteArray>),
126 FIXED_LEN_BYTE_ARRAY(NativeIndex<FixedLenByteArray>),
128}
129
130impl Index {
131 pub fn is_sorted(&self) -> bool {
133 if let Some(order) = self.get_boundary_order() {
135 order.0 > (BoundaryOrder::UNORDERED.0)
136 } else {
137 false
138 }
139 }
140
141 pub fn get_boundary_order(&self) -> Option<BoundaryOrder> {
143 match self {
144 Index::NONE => None,
145 Index::BOOLEAN(index) => Some(index.boundary_order),
146 Index::INT32(index) => Some(index.boundary_order),
147 Index::INT64(index) => Some(index.boundary_order),
148 Index::INT96(index) => Some(index.boundary_order),
149 Index::FLOAT(index) => Some(index.boundary_order),
150 Index::DOUBLE(index) => Some(index.boundary_order),
151 Index::BYTE_ARRAY(index) => Some(index.boundary_order),
152 Index::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order),
153 }
154 }
155}
156
157#[derive(Debug, Clone, PartialEq, Eq, Hash)]
174pub struct NativeIndex<T: ParquetValueType> {
175 pub indexes: Vec<PageIndex<T>>,
177 pub boundary_order: BoundaryOrder,
182}
183
184impl<T: ParquetValueType> NativeIndex<T> {
185 pub const PHYSICAL_TYPE: Type = T::PHYSICAL_TYPE;
187
188 pub(crate) fn try_new(index: ColumnIndex) -> Result<Self, ParquetError> {
190 let len = index.min_values.len();
191
192 let null_counts = index
193 .null_counts
194 .map(|x| x.into_iter().map(Some).collect::<Vec<_>>())
195 .unwrap_or_else(|| vec![None; len]);
196
197 let to_page_histograms = |opt_hist: Option<Vec<i64>>| {
199 if let Some(hist) = opt_hist {
200 let num_levels = hist.len() / len;
202 let mut res = Vec::with_capacity(len);
203 for i in 0..len {
204 let page_idx = i * num_levels;
205 let page_hist = hist[page_idx..page_idx + num_levels].to_vec();
206 res.push(Some(LevelHistogram::from(page_hist)));
207 }
208 res
209 } else {
210 vec![None; len]
211 }
212 };
213
214 let rep_hists: Vec<Option<LevelHistogram>> =
215 to_page_histograms(index.repetition_level_histograms);
216 let def_hists: Vec<Option<LevelHistogram>> =
217 to_page_histograms(index.definition_level_histograms);
218
219 let indexes = index
220 .min_values
221 .iter()
222 .zip(index.max_values.iter())
223 .zip(index.null_pages.into_iter())
224 .zip(null_counts.into_iter())
225 .zip(rep_hists.into_iter())
226 .zip(def_hists.into_iter())
227 .map(
228 |(
229 ((((min, max), is_null), null_count), repetition_level_histogram),
230 definition_level_histogram,
231 )| {
232 let (min, max) = if is_null {
233 (None, None)
234 } else {
235 (
236 Some(T::try_from_le_slice(min)?),
237 Some(T::try_from_le_slice(max)?),
238 )
239 };
240 Ok(PageIndex {
241 min,
242 max,
243 null_count,
244 repetition_level_histogram,
245 definition_level_histogram,
246 })
247 },
248 )
249 .collect::<Result<Vec<_>, ParquetError>>()?;
250
251 Ok(Self {
252 indexes,
253 boundary_order: index.boundary_order,
254 })
255 }
256
257 pub(crate) fn to_thrift(&self) -> ColumnIndex {
258 let min_values = self
259 .indexes
260 .iter()
261 .map(|x| x.min_bytes().unwrap_or(&[]).to_vec())
262 .collect::<Vec<_>>();
263
264 let max_values = self
265 .indexes
266 .iter()
267 .map(|x| x.max_bytes().unwrap_or(&[]).to_vec())
268 .collect::<Vec<_>>();
269
270 let null_counts = self
271 .indexes
272 .iter()
273 .map(|x| x.null_count())
274 .collect::<Option<Vec<_>>>();
275
276 let repetition_level_histograms = self
278 .indexes
279 .iter()
280 .map(|x| x.repetition_level_histogram().map(|v| v.values()))
281 .collect::<Option<Vec<&[i64]>>>()
282 .map(|hists| hists.concat());
283
284 let definition_level_histograms = self
285 .indexes
286 .iter()
287 .map(|x| x.definition_level_histogram().map(|v| v.values()))
288 .collect::<Option<Vec<&[i64]>>>()
289 .map(|hists| hists.concat());
290
291 ColumnIndex::new(
292 self.indexes.iter().map(|x| x.min().is_none()).collect(),
293 min_values,
294 max_values,
295 self.boundary_order,
296 null_counts,
297 repetition_level_histograms,
298 definition_level_histograms,
299 )
300 }
301}
302
303#[cfg(test)]
304mod tests {
305 use super::*;
306
307 #[test]
308 fn test_page_index_min_max_null() {
309 let page_index = PageIndex {
310 min: Some(-123),
311 max: Some(234),
312 null_count: Some(0),
313 repetition_level_histogram: Some(LevelHistogram::from(vec![1, 2])),
314 definition_level_histogram: Some(LevelHistogram::from(vec![1, 2, 3])),
315 };
316
317 assert_eq!(page_index.min().unwrap(), &-123);
318 assert_eq!(page_index.max().unwrap(), &234);
319 assert_eq!(page_index.min_bytes().unwrap(), (-123).as_bytes());
320 assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes());
321 assert_eq!(page_index.null_count().unwrap(), 0);
322 assert_eq!(
323 page_index.repetition_level_histogram().unwrap().values(),
324 &vec![1, 2]
325 );
326 assert_eq!(
327 page_index.definition_level_histogram().unwrap().values(),
328 &vec![1, 2, 3]
329 );
330 }
331
332 #[test]
333 fn test_page_index_min_max_null_none() {
334 let page_index: PageIndex<i32> = PageIndex {
335 min: None,
336 max: None,
337 null_count: None,
338 repetition_level_histogram: None,
339 definition_level_histogram: None,
340 };
341
342 assert_eq!(page_index.min(), None);
343 assert_eq!(page_index.max(), None);
344 assert_eq!(page_index.min_bytes(), None);
345 assert_eq!(page_index.max_bytes(), None);
346 assert_eq!(page_index.null_count(), None);
347 assert_eq!(page_index.repetition_level_histogram(), None);
348 assert_eq!(page_index.definition_level_histogram(), None);
349 }
350
351 #[test]
352 fn test_invalid_column_index() {
353 let column_index = ColumnIndex {
354 null_pages: vec![true, false],
355 min_values: vec![
356 vec![],
357 vec![], ],
359 max_values: vec![
360 vec![],
361 vec![], ],
363 null_counts: None,
364 repetition_level_histograms: None,
365 definition_level_histograms: None,
366 boundary_order: BoundaryOrder::UNORDERED,
367 };
368
369 let err = NativeIndex::<i32>::try_new(column_index).unwrap_err();
370 assert_eq!(
371 err.to_string(),
372 "Parquet error: error converting value, expected 4 bytes got 0"
373 );
374 }
375}