parquet_index/
parquet-index.rs1use clap::Parser;
38use parquet::errors::{ParquetError, Result};
39use parquet::file::page_index::index::{Index, PageIndex};
40use parquet::file::page_index::offset_index::OffsetIndexMetaData;
41use parquet::file::reader::{FileReader, SerializedFileReader};
42use parquet::file::serialized_reader::ReadOptionsBuilder;
43use parquet::format::PageLocation;
44use std::fs::File;
45
46#[derive(Debug, Parser)]
47#[clap(author, version, about("Prints the page index of a parquet file"), long_about = None)]
48struct Args {
49 #[clap(help("Path to a parquet file"))]
50 file: String,
51
52 #[clap(help("Column name to print"))]
53 column: String,
54}
55
56impl Args {
57 fn run(&self) -> Result<()> {
58 let file = File::open(&self.file)?;
59 let options = ReadOptionsBuilder::new().with_page_index().build();
60 let reader = SerializedFileReader::new_with_options(file, options)?;
61
62 let schema = reader.metadata().file_metadata().schema_descr();
63 let column_idx = schema
64 .columns()
65 .iter()
66 .position(|x| x.name() == self.column.as_str())
67 .ok_or_else(|| {
68 ParquetError::General(format!("Failed to find column {}", self.column))
69 })?;
70
71 let column_index = reader
73 .metadata()
74 .column_index()
75 .ok_or_else(|| ParquetError::General("Column index not found".to_string()))?;
76
77 let offset_index = reader
79 .metadata()
80 .offset_index()
81 .ok_or_else(|| ParquetError::General("Offset index not found".to_string()))?;
82
83 for (row_group_idx, ((column_indices, offset_indices), row_group)) in column_index
85 .iter()
86 .zip(offset_index)
87 .zip(reader.metadata().row_groups())
88 .enumerate()
89 {
90 println!("Row Group: {row_group_idx}");
91 let offset_index = offset_indices.get(column_idx).ok_or_else(|| {
92 ParquetError::General(format!(
93 "No offset index for row group {row_group_idx} column chunk {column_idx}"
94 ))
95 })?;
96
97 let row_counts =
98 compute_row_counts(offset_index.page_locations.as_slice(), row_group.num_rows());
99 match &column_indices[column_idx] {
100 Index::NONE => println!("NO INDEX"),
101 Index::BOOLEAN(v) => print_index(&v.indexes, offset_index, &row_counts)?,
102 Index::INT32(v) => print_index(&v.indexes, offset_index, &row_counts)?,
103 Index::INT64(v) => print_index(&v.indexes, offset_index, &row_counts)?,
104 Index::INT96(v) => print_index(&v.indexes, offset_index, &row_counts)?,
105 Index::FLOAT(v) => print_index(&v.indexes, offset_index, &row_counts)?,
106 Index::DOUBLE(v) => print_index(&v.indexes, offset_index, &row_counts)?,
107 Index::BYTE_ARRAY(v) => print_index(&v.indexes, offset_index, &row_counts)?,
108 Index::FIXED_LEN_BYTE_ARRAY(v) => {
109 print_index(&v.indexes, offset_index, &row_counts)?
110 }
111 }
112 }
113 Ok(())
114 }
115}
116
117fn compute_row_counts(offset_index: &[PageLocation], rows: i64) -> Vec<i64> {
119 if offset_index.is_empty() {
120 return vec![];
121 }
122
123 let mut last = offset_index[0].first_row_index;
124 let mut out = Vec::with_capacity(offset_index.len());
125 for o in offset_index.iter().skip(1) {
126 out.push(o.first_row_index - last);
127 last = o.first_row_index;
128 }
129 out.push(rows - last);
130 out
131}
132
133fn print_index<T: std::fmt::Display>(
135 column_index: &[PageIndex<T>],
136 offset_index: &OffsetIndexMetaData,
137 row_counts: &[i64],
138) -> Result<()> {
139 if column_index.len() != offset_index.page_locations.len() {
140 return Err(ParquetError::General(format!(
141 "Index length mismatch, got {} and {}",
142 column_index.len(),
143 offset_index.page_locations.len()
144 )));
145 }
146
147 for (idx, ((c, o), row_count)) in column_index
148 .iter()
149 .zip(offset_index.page_locations())
150 .zip(row_counts)
151 .enumerate()
152 {
153 print!(
154 "Page {:>5} at offset {:#010x} with length {:>10} and row count {:>10}",
155 idx, o.offset, o.compressed_page_size, row_count
156 );
157 match &c.min {
158 Some(m) => print!(", min {m:>10}"),
159 None => print!(", min {:>10}", "NONE"),
160 }
161
162 match &c.max {
163 Some(m) => print!(", max {m:>10}"),
164 None => print!(", max {:>10}", "NONE"),
165 }
166 println!()
167 }
168
169 Ok(())
170}
171
172fn main() -> Result<()> {
173 Args::parse().run()
174}