Skip to main content

parquet_layout/
parquet-layout.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Binary that prints the physical layout of a parquet file
19//!
20//! Alternatives to this binary include [`parquet-cli`] and [`parquet-viewer`].
21//!
22//! # Install
23//!
24//! `parquet-layout` can be installed using `cargo`:
25//! ```
26//! cargo install parquet --features=cli
27//! ```
28//! After this `parquet-layout` should be available:
29//! ```
30//! parquet-layout XYZ.parquet
31//! ```
32//!
33//! The binary can also be built from the source code and run as follows:
34//! ```
35//! cargo run --features=cli --bin parquet-layout XYZ.parquet
36//! ```
37//!
38//! [`parquet-cli`]: https://github.com/apache/parquet-java/tree/master/parquet-cli
39//! [`parquet-viewer`]: https://github.com/xiangpenghao/parquet-viewer
40
41use std::fs::File;
42
43use clap::Parser;
44use parquet::file::metadata::ParquetMetaDataReader;
45use serde::{Serialize, Serializer};
46
47use parquet::basic::{CompressionCodec, Encoding};
48use parquet::errors::Result;
49use parquet::file::reader::ChunkReader;
50
51#[derive(Serialize, Debug)]
52struct Index {
53    offset: i64,
54    length: Option<i32>,
55}
56
57#[derive(Serialize, Debug)]
58struct Footer {
59    metadata_size: Option<usize>,
60}
61
62#[derive(Serialize, Debug)]
63struct ParquetFile {
64    row_groups: Vec<RowGroup>,
65    footer: Footer,
66}
67
68#[derive(Serialize, Debug)]
69struct RowGroup {
70    columns: Vec<ColumnChunk>,
71    row_count: i64,
72}
73
74#[derive(Serialize, Debug)]
75struct ColumnChunk {
76    path: String,
77    has_offset_index: bool,
78    has_column_index: bool,
79    has_bloom_filter: bool,
80    offset_index: Option<Index>,
81    column_index: Option<Index>,
82    bloom_filter: Option<Index>,
83    compression: DebugSerialize<CompressionCodec>,
84    encodings: Vec<DebugSerialize<Encoding>>,
85}
86
/// Newtype wrapper around a value that implements `Debug`.
///
/// It exists so that such a value can be given a `Serialize` implementation
/// based on its `Debug` output.
#[derive(Debug)]
struct DebugSerialize<T>(T)
where
    T: std::fmt::Debug;
89
90impl<T: std::fmt::Debug> Serialize for DebugSerialize<T> {
91    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
92    where
93        S: Serializer,
94    {
95        serializer.serialize_str(&format!("{:?}", &self.0))
96    }
97}
98
99fn do_layout<C: ChunkReader>(reader: &C) -> Result<ParquetFile> {
100    let mut metadata_reader = ParquetMetaDataReader::new();
101    metadata_reader.try_parse(reader)?;
102    let metadata_size = metadata_reader.metadata_size();
103    let metadata = metadata_reader.finish()?;
104    let schema = metadata.file_metadata().schema_descr();
105
106    let row_groups = (0..metadata.num_row_groups())
107        .map(|row_group_idx| {
108            let row_group = metadata.row_group(row_group_idx);
109            let columns = row_group
110                .columns()
111                .iter()
112                .zip(schema.columns())
113                .map(|(column, column_schema)| {
114                    let compression = DebugSerialize(column.compression_codec());
115                    let encodings = column.encodings().map(DebugSerialize).collect();
116
117                    Ok(ColumnChunk {
118                        path: column_schema.path().parts().join("."),
119                        has_offset_index: column.offset_index_offset().is_some(),
120                        has_column_index: column.column_index_offset().is_some(),
121                        has_bloom_filter: column.bloom_filter_offset().is_some(),
122                        offset_index: column.offset_index_offset().map(|offset| Index {
123                            offset,
124                            length: column.offset_index_length(),
125                        }),
126                        column_index: column.column_index_offset().map(|offset| Index {
127                            offset,
128                            length: column.column_index_length(),
129                        }),
130                        bloom_filter: column.bloom_filter_offset().map(|offset| Index {
131                            offset,
132                            length: column.bloom_filter_length(),
133                        }),
134                        compression,
135                        encodings,
136                    })
137                })
138                .collect::<Result<Vec<_>>>()?;
139
140            Ok(RowGroup {
141                columns,
142                row_count: row_group.num_rows(),
143            })
144        })
145        .collect::<Result<Vec<_>>>()?;
146
147    Ok(ParquetFile {
148        row_groups,
149        footer: Footer { metadata_size },
150    })
151}
152
153#[derive(Debug, Parser)]
154#[clap(author, version, about("Prints the physical layout of a parquet file"), long_about = None)]
155struct Args {
156    #[clap(help("Path to a parquet file"))]
157    file: String,
158}
159
160impl Args {
161    fn run(&self) -> Result<()> {
162        let file = File::open(&self.file)?;
163        let layout = do_layout(&file)?;
164
165        let out = std::io::stdout();
166        let writer = out.lock();
167
168        serde_json::to_writer_pretty(writer, &layout).unwrap();
169        Ok(())
170    }
171}
172
173fn main() -> Result<()> {
174    Args::parse().run()
175}