parquet_index/
parquet-index.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Binary that prints the [page index] of a parquet file
19//!
20//! # Install
21//!
//! `parquet-index` can be installed using `cargo`:
23//! ```
24//! cargo install parquet --features=cli
25//! ```
26//! After this `parquet-index` should be available:
27//! ```
28//! parquet-index XYZ.parquet COLUMN_NAME
29//! ```
30//!
31//! The binary can also be built from the source code and run as follows:
32//! ```
33//! cargo run --features=cli --bin parquet-index XYZ.parquet COLUMN_NAME
//! ```
//!
35//! [page index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
36
37use clap::Parser;
38use parquet::errors::{ParquetError, Result};
39use parquet::file::page_index::index::{Index, PageIndex};
40use parquet::file::page_index::offset_index::OffsetIndexMetaData;
41use parquet::file::reader::{FileReader, SerializedFileReader};
42use parquet::file::serialized_reader::ReadOptionsBuilder;
43use parquet::format::PageLocation;
44use std::fs::File;
45
46#[derive(Debug, Parser)]
47#[clap(author, version, about("Prints the page index of a parquet file"), long_about = None)]
48struct Args {
49    #[clap(help("Path to a parquet file"))]
50    file: String,
51
52    #[clap(help("Column name to print"))]
53    column: String,
54}
55
56impl Args {
57    fn run(&self) -> Result<()> {
58        let file = File::open(&self.file)?;
59        let options = ReadOptionsBuilder::new().with_page_index().build();
60        let reader = SerializedFileReader::new_with_options(file, options)?;
61
62        let schema = reader.metadata().file_metadata().schema_descr();
63        let column_idx = schema
64            .columns()
65            .iter()
66            .position(|x| x.name() == self.column.as_str())
67            .ok_or_else(|| {
68                ParquetError::General(format!("Failed to find column {}", self.column))
69            })?;
70
71        // Column index data for all row groups and columns
72        let column_index = reader
73            .metadata()
74            .column_index()
75            .ok_or_else(|| ParquetError::General("Column index not found".to_string()))?;
76
77        // Offset index data for all row groups and columns
78        let offset_index = reader
79            .metadata()
80            .offset_index()
81            .ok_or_else(|| ParquetError::General("Offset index not found".to_string()))?;
82
83        // Iterate through each row group
84        for (row_group_idx, ((column_indices, offset_indices), row_group)) in column_index
85            .iter()
86            .zip(offset_index)
87            .zip(reader.metadata().row_groups())
88            .enumerate()
89        {
90            println!("Row Group: {row_group_idx}");
91            let offset_index = offset_indices.get(column_idx).ok_or_else(|| {
92                ParquetError::General(format!(
93                    "No offset index for row group {row_group_idx} column chunk {column_idx}"
94                ))
95            })?;
96
97            let row_counts =
98                compute_row_counts(offset_index.page_locations.as_slice(), row_group.num_rows());
99            match &column_indices[column_idx] {
100                Index::NONE => println!("NO INDEX"),
101                Index::BOOLEAN(v) => print_index(&v.indexes, offset_index, &row_counts)?,
102                Index::INT32(v) => print_index(&v.indexes, offset_index, &row_counts)?,
103                Index::INT64(v) => print_index(&v.indexes, offset_index, &row_counts)?,
104                Index::INT96(v) => print_index(&v.indexes, offset_index, &row_counts)?,
105                Index::FLOAT(v) => print_index(&v.indexes, offset_index, &row_counts)?,
106                Index::DOUBLE(v) => print_index(&v.indexes, offset_index, &row_counts)?,
107                Index::BYTE_ARRAY(v) => print_index(&v.indexes, offset_index, &row_counts)?,
108                Index::FIXED_LEN_BYTE_ARRAY(v) => {
109                    print_index(&v.indexes, offset_index, &row_counts)?
110                }
111            }
112        }
113        Ok(())
114    }
115}
116
117/// Computes the number of rows in each page within a column chunk
118fn compute_row_counts(offset_index: &[PageLocation], rows: i64) -> Vec<i64> {
119    if offset_index.is_empty() {
120        return vec![];
121    }
122
123    let mut last = offset_index[0].first_row_index;
124    let mut out = Vec::with_capacity(offset_index.len());
125    for o in offset_index.iter().skip(1) {
126        out.push(o.first_row_index - last);
127        last = o.first_row_index;
128    }
129    out.push(rows - last);
130    out
131}
132
133/// Prints index information for a single column chunk
134fn print_index<T: std::fmt::Display>(
135    column_index: &[PageIndex<T>],
136    offset_index: &OffsetIndexMetaData,
137    row_counts: &[i64],
138) -> Result<()> {
139    if column_index.len() != offset_index.page_locations.len() {
140        return Err(ParquetError::General(format!(
141            "Index length mismatch, got {} and {}",
142            column_index.len(),
143            offset_index.page_locations.len()
144        )));
145    }
146
147    for (idx, ((c, o), row_count)) in column_index
148        .iter()
149        .zip(offset_index.page_locations())
150        .zip(row_counts)
151        .enumerate()
152    {
153        print!(
154            "Page {:>5} at offset {:#010x} with length {:>10} and row count {:>10}",
155            idx, o.offset, o.compressed_page_size, row_count
156        );
157        match &c.min {
158            Some(m) => print!(", min {m:>10}"),
159            None => print!(", min {:>10}", "NONE"),
160        }
161
162        match &c.max {
163            Some(m) => print!(", max {m:>10}"),
164            None => print!(", max {:>10}", "NONE"),
165        }
166        println!()
167    }
168
169    Ok(())
170}
171
172fn main() -> Result<()> {
173    Args::parse().run()
174}