parquet/file/page_index/
index.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`Index`] structures holding decoded [`ColumnIndex`] information
19
20use crate::basic::Type;
21use crate::data_type::private::ParquetValueType;
22use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96};
23use crate::errors::ParquetError;
24use crate::file::metadata::LevelHistogram;
25use crate::format::{BoundaryOrder, ColumnIndex};
26use std::fmt::Debug;
27
28/// Typed statistics for one data page
29///
30/// See [`NativeIndex`] for more details
31#[derive(Debug, Clone, PartialEq, Eq, Hash)]
32pub struct PageIndex<T> {
33    /// The minimum value, It is None when all values are null
34    pub min: Option<T>,
35    /// The maximum value, It is None when all values are null
36    pub max: Option<T>,
37    /// Null values in the page
38    pub null_count: Option<i64>,
39    /// Repetition level histogram for the page
40    ///
41    /// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`.
42    /// For example, `repetition_level_histogram[0]` indicates how many rows the page contains.
43    pub repetition_level_histogram: Option<LevelHistogram>,
44    /// Definition level histogram for the page
45    ///
46    /// `definition_level_histogram[i]` is a count of how many values are at definition level `i`.
47    /// For example, `definition_level_histogram[max_definition_level]` indicates how many
48    /// non-null values are present in the page.
49    pub definition_level_histogram: Option<LevelHistogram>,
50}
51
52impl<T> PageIndex<T> {
53    /// Returns the minimum value in the page
54    ///
55    /// It is `None` when all values are null
56    pub fn min(&self) -> Option<&T> {
57        self.min.as_ref()
58    }
59
60    /// Returns the maximum value in the page
61    ///
62    /// It is `None` when all values are null
63    pub fn max(&self) -> Option<&T> {
64        self.max.as_ref()
65    }
66
67    /// Returns the number of null values in the page
68    pub fn null_count(&self) -> Option<i64> {
69        self.null_count
70    }
71
72    /// Returns the repetition level histogram for the page
73    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
74        self.repetition_level_histogram.as_ref()
75    }
76
77    /// Returns the definition level histogram for the page
78    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
79        self.definition_level_histogram.as_ref()
80    }
81}
82
83impl<T> PageIndex<T>
84where
85    T: AsBytes,
86{
87    /// Returns the minimum value in the page as bytes
88    ///
89    /// It is `None` when all values are null
90    pub fn max_bytes(&self) -> Option<&[u8]> {
91        self.max.as_ref().map(|x| x.as_bytes())
92    }
93
94    /// Returns the maximum value in the page as bytes
95    ///
96    /// It is `None` when all values are null
97    pub fn min_bytes(&self) -> Option<&[u8]> {
98        self.min.as_ref().map(|x| x.as_bytes())
99    }
100}
101
102#[derive(Debug, Clone, PartialEq)]
103#[allow(non_camel_case_types)]
104/// Statistics for data pages in a column chunk.
105///
106/// See [`NativeIndex`] for more information
107pub enum Index {
108    /// Sometimes reading page index from parquet file
109    /// will only return pageLocations without min_max index,
110    /// `NONE` represents this lack of index information
111    NONE,
112    /// Boolean type index
113    BOOLEAN(NativeIndex<bool>),
114    /// 32-bit integer type index
115    INT32(NativeIndex<i32>),
116    /// 64-bit integer type index
117    INT64(NativeIndex<i64>),
118    /// 96-bit integer type (timestamp) index
119    INT96(NativeIndex<Int96>),
120    /// 32-bit floating point type index
121    FLOAT(NativeIndex<f32>),
122    /// 64-bit floating point type index
123    DOUBLE(NativeIndex<f64>),
124    /// Byte array type index
125    BYTE_ARRAY(NativeIndex<ByteArray>),
126    /// Fixed length byte array type index
127    FIXED_LEN_BYTE_ARRAY(NativeIndex<FixedLenByteArray>),
128}
129
130impl Index {
131    /// Return min/max elements inside ColumnIndex are ordered or not.
132    pub fn is_sorted(&self) -> bool {
133        // 0:UNORDERED, 1:ASCENDING ,2:DESCENDING,
134        if let Some(order) = self.get_boundary_order() {
135            order.0 > (BoundaryOrder::UNORDERED.0)
136        } else {
137            false
138        }
139    }
140
141    /// Get boundary_order of this page index.
142    pub fn get_boundary_order(&self) -> Option<BoundaryOrder> {
143        match self {
144            Index::NONE => None,
145            Index::BOOLEAN(index) => Some(index.boundary_order),
146            Index::INT32(index) => Some(index.boundary_order),
147            Index::INT64(index) => Some(index.boundary_order),
148            Index::INT96(index) => Some(index.boundary_order),
149            Index::FLOAT(index) => Some(index.boundary_order),
150            Index::DOUBLE(index) => Some(index.boundary_order),
151            Index::BYTE_ARRAY(index) => Some(index.boundary_order),
152            Index::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order),
153        }
154    }
155}
156
157/// Strongly typed statistics for data pages in a column chunk.
158///
159/// This structure is a natively typed, in memory representation of the
160/// [`ColumnIndex`] structure in a parquet file footer, as described in the
161/// Parquet [PageIndex documentation]. The statistics stored in this structure
162/// can be used by query engines to skip decoding pages while reading parquet
163/// data.
164///
165/// # Differences with Row Group Level Statistics
166///
167/// One significant difference between `NativeIndex` and row group level
168/// [`Statistics`] is that page level statistics may not store actual column
169/// values as min and max (e.g. they may store truncated strings to save space)
170///
171/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
172/// [`Statistics`]: crate::file::statistics::Statistics
173#[derive(Debug, Clone, PartialEq, Eq, Hash)]
174pub struct NativeIndex<T: ParquetValueType> {
175    /// The actual column indexes, one item per page
176    pub indexes: Vec<PageIndex<T>>,
177    /// If the min/max elements are ordered, and if so in which
178    /// direction. See [source] for details.
179    ///
180    /// [source]: https://github.com/apache/parquet-format/blob/bfc549b93e6927cb1fc425466e4084f76edc6d22/src/main/thrift/parquet.thrift#L959-L964
181    pub boundary_order: BoundaryOrder,
182}
183
184impl<T: ParquetValueType> NativeIndex<T> {
185    /// The physical data type of the column
186    pub const PHYSICAL_TYPE: Type = T::PHYSICAL_TYPE;
187
188    /// Creates a new [`NativeIndex`]
189    pub(crate) fn try_new(index: ColumnIndex) -> Result<Self, ParquetError> {
190        let len = index.min_values.len();
191
192        let null_counts = index
193            .null_counts
194            .map(|x| x.into_iter().map(Some).collect::<Vec<_>>())
195            .unwrap_or_else(|| vec![None; len]);
196
197        // histograms are a 1D array encoding a 2D num_pages X num_levels matrix.
198        let to_page_histograms = |opt_hist: Option<Vec<i64>>| {
199            if let Some(hist) = opt_hist {
200                // TODO: should we assert (hist.len() % len) == 0?
201                let num_levels = hist.len() / len;
202                let mut res = Vec::with_capacity(len);
203                for i in 0..len {
204                    let page_idx = i * num_levels;
205                    let page_hist = hist[page_idx..page_idx + num_levels].to_vec();
206                    res.push(Some(LevelHistogram::from(page_hist)));
207                }
208                res
209            } else {
210                vec![None; len]
211            }
212        };
213
214        let rep_hists: Vec<Option<LevelHistogram>> =
215            to_page_histograms(index.repetition_level_histograms);
216        let def_hists: Vec<Option<LevelHistogram>> =
217            to_page_histograms(index.definition_level_histograms);
218
219        let indexes = index
220            .min_values
221            .iter()
222            .zip(index.max_values.iter())
223            .zip(index.null_pages.into_iter())
224            .zip(null_counts.into_iter())
225            .zip(rep_hists.into_iter())
226            .zip(def_hists.into_iter())
227            .map(
228                |(
229                    ((((min, max), is_null), null_count), repetition_level_histogram),
230                    definition_level_histogram,
231                )| {
232                    let (min, max) = if is_null {
233                        (None, None)
234                    } else {
235                        (
236                            Some(T::try_from_le_slice(min)?),
237                            Some(T::try_from_le_slice(max)?),
238                        )
239                    };
240                    Ok(PageIndex {
241                        min,
242                        max,
243                        null_count,
244                        repetition_level_histogram,
245                        definition_level_histogram,
246                    })
247                },
248            )
249            .collect::<Result<Vec<_>, ParquetError>>()?;
250
251        Ok(Self {
252            indexes,
253            boundary_order: index.boundary_order,
254        })
255    }
256
257    pub(crate) fn to_thrift(&self) -> ColumnIndex {
258        let min_values = self
259            .indexes
260            .iter()
261            .map(|x| x.min_bytes().unwrap_or(&[]).to_vec())
262            .collect::<Vec<_>>();
263
264        let max_values = self
265            .indexes
266            .iter()
267            .map(|x| x.max_bytes().unwrap_or(&[]).to_vec())
268            .collect::<Vec<_>>();
269
270        let null_counts = self
271            .indexes
272            .iter()
273            .map(|x| x.null_count())
274            .collect::<Option<Vec<_>>>();
275
276        // Concatenate page histograms into a single Option<Vec>
277        let repetition_level_histograms = self
278            .indexes
279            .iter()
280            .map(|x| x.repetition_level_histogram().map(|v| v.values()))
281            .collect::<Option<Vec<&[i64]>>>()
282            .map(|hists| hists.concat());
283
284        let definition_level_histograms = self
285            .indexes
286            .iter()
287            .map(|x| x.definition_level_histogram().map(|v| v.values()))
288            .collect::<Option<Vec<&[i64]>>>()
289            .map(|hists| hists.concat());
290
291        ColumnIndex::new(
292            self.indexes.iter().map(|x| x.min().is_none()).collect(),
293            min_values,
294            max_values,
295            self.boundary_order,
296            null_counts,
297            repetition_level_histograms,
298            definition_level_histograms,
299        )
300    }
301}
302
303#[cfg(test)]
304mod tests {
305    use super::*;
306
307    #[test]
308    fn test_page_index_min_max_null() {
309        let page_index = PageIndex {
310            min: Some(-123),
311            max: Some(234),
312            null_count: Some(0),
313            repetition_level_histogram: Some(LevelHistogram::from(vec![1, 2])),
314            definition_level_histogram: Some(LevelHistogram::from(vec![1, 2, 3])),
315        };
316
317        assert_eq!(page_index.min().unwrap(), &-123);
318        assert_eq!(page_index.max().unwrap(), &234);
319        assert_eq!(page_index.min_bytes().unwrap(), (-123).as_bytes());
320        assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes());
321        assert_eq!(page_index.null_count().unwrap(), 0);
322        assert_eq!(
323            page_index.repetition_level_histogram().unwrap().values(),
324            &vec![1, 2]
325        );
326        assert_eq!(
327            page_index.definition_level_histogram().unwrap().values(),
328            &vec![1, 2, 3]
329        );
330    }
331
332    #[test]
333    fn test_page_index_min_max_null_none() {
334        let page_index: PageIndex<i32> = PageIndex {
335            min: None,
336            max: None,
337            null_count: None,
338            repetition_level_histogram: None,
339            definition_level_histogram: None,
340        };
341
342        assert_eq!(page_index.min(), None);
343        assert_eq!(page_index.max(), None);
344        assert_eq!(page_index.min_bytes(), None);
345        assert_eq!(page_index.max_bytes(), None);
346        assert_eq!(page_index.null_count(), None);
347        assert_eq!(page_index.repetition_level_histogram(), None);
348        assert_eq!(page_index.definition_level_histogram(), None);
349    }
350
351    #[test]
352    fn test_invalid_column_index() {
353        let column_index = ColumnIndex {
354            null_pages: vec![true, false],
355            min_values: vec![
356                vec![],
357                vec![], // this shouldn't be empty as null_pages[1] is false
358            ],
359            max_values: vec![
360                vec![],
361                vec![], // this shouldn't be empty as null_pages[1] is false
362            ],
363            null_counts: None,
364            repetition_level_histograms: None,
365            definition_level_histograms: None,
366            boundary_order: BoundaryOrder::UNORDERED,
367        };
368
369        let err = NativeIndex::<i32>::try_new(column_index).unwrap_err();
370        assert_eq!(
371            err.to_string(),
372            "Parquet error: error converting value, expected 4 bytes got 0"
373        );
374    }
375}