Skip to main content

parquet/file/page_index/
index_reader.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Support for reading [`ColumnIndexMetaData`] and [`OffsetIndexMetaData`] from parquet metadata.
19
20use crate::basic::{BoundaryOrder, Type};
21use crate::data_type::Int96;
22use crate::errors::{ParquetError, Result};
23use crate::file::page_index::column_index::{
24    ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex,
25};
26use crate::file::page_index::offset_index::OffsetIndexMetaData;
27use crate::parquet_thrift::{
28    ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
29    ThriftSliceInputProtocol, WriteThrift, WriteThriftField, read_thrift_vec,
30};
31use crate::thrift_struct;
32use std::io::Write;
33use std::ops::Range;
34
/// Returns the smallest range that covers both optional input ranges.
///
/// When only one side is `Some`, that range is returned unchanged; when both
/// are `None`, the result is `None`.
///
/// For example `acc_range(Some(7..9), Some(1..3)) = Some(1..9)`
pub(crate) fn acc_range(a: Option<Range<u64>>, b: Option<Range<u64>>) -> Option<Range<u64>> {
    // Nothing on the left: the answer is whatever the right side holds.
    let Some(lhs) = a else {
        return b;
    };
    // Nothing on the right: the left range already covers everything.
    let Some(rhs) = b else {
        return Some(lhs);
    };

    let start = lhs.start.min(rhs.start);
    let end = lhs.end.max(rhs.end);
    Some(start..end)
}
44
45pub(crate) fn decode_offset_index(data: &[u8]) -> Result<OffsetIndexMetaData, ParquetError> {
46    let mut prot = ThriftSliceInputProtocol::new(data);
47
48    // Try to read fast-path first. If that fails, fall back to slower but more robust
49    // decoder.
50    match OffsetIndexMetaData::try_from_fast(&mut prot) {
51        Ok(offset_index) => Ok(offset_index),
52        Err(_) => {
53            prot = ThriftSliceInputProtocol::new(data);
54            OffsetIndexMetaData::read_thrift(&mut prot)
55        }
56    }
57}
58
// Internal struct only used for decoding, then discarded.
//
// `decode_column_index` reads it from the input bytes and immediately converts it
// into a typed `ColumnIndexMetaData` variant.
// NOTE(review): the field ids and types presumably mirror the `ColumnIndex` struct
// of the Parquet Thrift definition — confirm against parquet.thrift.
thrift_struct!(
pub(super) struct ThriftColumnIndex<'a> {
  1: required list<bool> null_pages
  // NOTE(review): the `'a` on the two binary lists presumably lets the values
  // borrow from the input buffer rather than being copied — confirm against the
  // `thrift_struct!` expansion.
  2: required list<'a><binary> min_values
  3: required list<'a><binary> max_values
  4: required BoundaryOrder boundary_order
  // Fields 5-7 are declared optional.
  5: optional list<i64> null_counts
  6: optional list<i64> repetition_level_histograms;
  7: optional list<i64> definition_level_histograms;
}
);
71
72pub(crate) fn decode_column_index(
73    data: &[u8],
74    column_type: Type,
75) -> Result<ColumnIndexMetaData, ParquetError> {
76    let mut prot = ThriftSliceInputProtocol::new(data);
77    let index = ThriftColumnIndex::read_thrift(&mut prot)?;
78
79    let index = match column_type {
80        Type::BOOLEAN => {
81            ColumnIndexMetaData::BOOLEAN(PrimitiveColumnIndex::<bool>::try_from_thrift(index)?)
82        }
83        Type::INT32 => {
84            ColumnIndexMetaData::INT32(PrimitiveColumnIndex::<i32>::try_from_thrift(index)?)
85        }
86        Type::INT64 => {
87            ColumnIndexMetaData::INT64(PrimitiveColumnIndex::<i64>::try_from_thrift(index)?)
88        }
89        Type::INT96 => {
90            ColumnIndexMetaData::INT96(PrimitiveColumnIndex::<Int96>::try_from_thrift(index)?)
91        }
92        Type::FLOAT => {
93            ColumnIndexMetaData::FLOAT(PrimitiveColumnIndex::<f32>::try_from_thrift(index)?)
94        }
95        Type::DOUBLE => {
96            ColumnIndexMetaData::DOUBLE(PrimitiveColumnIndex::<f64>::try_from_thrift(index)?)
97        }
98        Type::BYTE_ARRAY => {
99            ColumnIndexMetaData::BYTE_ARRAY(ByteArrayColumnIndex::try_from_thrift(index)?)
100        }
101        Type::FIXED_LEN_BYTE_ARRAY => {
102            ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex::try_from_thrift(index)?)
103        }
104    };
105
106    Ok(index)
107}