parquet_layout/
parquet-layout.rs1use std::fs::File;
42
43use clap::Parser;
44use parquet::file::metadata::ParquetMetaDataReader;
45use serde::{Serialize, Serializer};
46
47use parquet::basic::{CompressionCodec, Encoding};
48use parquet::errors::Result;
49use parquet::file::reader::ChunkReader;
50
51#[derive(Serialize, Debug)]
52struct Index {
53 offset: i64,
54 length: Option<i32>,
55}
56
57#[derive(Serialize, Debug)]
58struct Footer {
59 metadata_size: Option<usize>,
60}
61
62#[derive(Serialize, Debug)]
63struct ParquetFile {
64 row_groups: Vec<RowGroup>,
65 footer: Footer,
66}
67
68#[derive(Serialize, Debug)]
69struct RowGroup {
70 columns: Vec<ColumnChunk>,
71 row_count: i64,
72}
73
74#[derive(Serialize, Debug)]
75struct ColumnChunk {
76 path: String,
77 has_offset_index: bool,
78 has_column_index: bool,
79 has_bloom_filter: bool,
80 offset_index: Option<Index>,
81 column_index: Option<Index>,
82 bloom_filter: Option<Index>,
83 compression: DebugSerialize<CompressionCodec>,
84 encodings: Vec<DebugSerialize<Encoding>>,
85}
86
/// Newtype wrapper whose `Serialize` implementation emits the inner value's
/// `Debug` rendering as a string.
///
/// The `Debug` requirement is placed on the trait implementations rather than
/// on the type itself, so the wrapper can be named and constructed for any `T`.
#[derive(Debug)]
struct DebugSerialize<T>(T);
89
90impl<T: std::fmt::Debug> Serialize for DebugSerialize<T> {
91 fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
92 where
93 S: Serializer,
94 {
95 serializer.serialize_str(&format!("{:?}", &self.0))
96 }
97}
98
99fn do_layout<C: ChunkReader>(reader: &C) -> Result<ParquetFile> {
100 let mut metadata_reader = ParquetMetaDataReader::new();
101 metadata_reader.try_parse(reader)?;
102 let metadata_size = metadata_reader.metadata_size();
103 let metadata = metadata_reader.finish()?;
104 let schema = metadata.file_metadata().schema_descr();
105
106 let row_groups = (0..metadata.num_row_groups())
107 .map(|row_group_idx| {
108 let row_group = metadata.row_group(row_group_idx);
109 let columns = row_group
110 .columns()
111 .iter()
112 .zip(schema.columns())
113 .map(|(column, column_schema)| {
114 let compression = DebugSerialize(column.compression_codec());
115 let encodings = column.encodings().map(DebugSerialize).collect();
116
117 Ok(ColumnChunk {
118 path: column_schema.path().parts().join("."),
119 has_offset_index: column.offset_index_offset().is_some(),
120 has_column_index: column.column_index_offset().is_some(),
121 has_bloom_filter: column.bloom_filter_offset().is_some(),
122 offset_index: column.offset_index_offset().map(|offset| Index {
123 offset,
124 length: column.offset_index_length(),
125 }),
126 column_index: column.column_index_offset().map(|offset| Index {
127 offset,
128 length: column.column_index_length(),
129 }),
130 bloom_filter: column.bloom_filter_offset().map(|offset| Index {
131 offset,
132 length: column.bloom_filter_length(),
133 }),
134 compression,
135 encodings,
136 })
137 })
138 .collect::<Result<Vec<_>>>()?;
139
140 Ok(RowGroup {
141 columns,
142 row_count: row_group.num_rows(),
143 })
144 })
145 .collect::<Result<Vec<_>>>()?;
146
147 Ok(ParquetFile {
148 row_groups,
149 footer: Footer { metadata_size },
150 })
151}
152
153#[derive(Debug, Parser)]
154#[clap(author, version, about("Prints the physical layout of a parquet file"), long_about = None)]
155struct Args {
156 #[clap(help("Path to a parquet file"))]
157 file: String,
158}
159
160impl Args {
161 fn run(&self) -> Result<()> {
162 let file = File::open(&self.file)?;
163 let layout = do_layout(&file)?;
164
165 let out = std::io::stdout();
166 let writer = out.lock();
167
168 serde_json::to_writer_pretty(writer, &layout).unwrap();
169 Ok(())
170 }
171}
172
173fn main() -> Result<()> {
174 Args::parse().run()
175}