parquet_read/
parquet-read.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Binary file to read data from a Parquet file.
19//!
20//! # Install
21//!
22//! `parquet-read` can be installed using `cargo`:
23//! ```
24//! cargo install parquet --features=cli
25//! ```
26//! After this `parquet-read` should be available:
27//! ```
28//! parquet-read XYZ.parquet
29//! ```
30//!
31//! The binary can also be built from the source code and run as follows:
32//! ```
33//! cargo run --features=cli --bin parquet-read XYZ.parquet
34//! ```
35//!
//! Note that `parquet-read` reads the full file schema; no projection or filtering
//! is applied.
38
39use clap::Parser;
40use parquet::file::reader::{FileReader, SerializedFileReader};
41use parquet::record::Row;
42use std::io::{self, Read};
43use std::{fs::File, path::Path};
44
45#[derive(Debug, Parser)]
46#[clap(author, version, about("Binary file to read data from a Parquet file"), long_about = None)]
47struct Args {
48    #[clap(help("Path to a parquet file, or - for stdin"))]
49    file_name: String,
50    #[clap(
51        short,
52        long,
53        default_value_t = 0_usize,
54        help("Number of records to read. When not provided or 0, all records are read")
55    )]
56    num_records: usize,
57    #[clap(short, long, help("Print Parquet file in JSON lines format"))]
58    json: bool,
59}
60
61fn main() {
62    let args = Args::parse();
63
64    let filename = args.file_name;
65    let num_records = args.num_records;
66    let json = args.json;
67
68    let parquet_reader: Box<dyn FileReader> = if filename == "-" {
69        let mut buf = Vec::new();
70        io::stdin()
71            .read_to_end(&mut buf)
72            .expect("Failed to read stdin into a buffer");
73        Box::new(
74            SerializedFileReader::new(bytes::Bytes::from(buf)).expect("Failed to create reader"),
75        )
76    } else {
77        let path = Path::new(&filename);
78        let file = File::open(path).expect("Unable to open file");
79        Box::new(SerializedFileReader::new(file).expect("Failed to create reader"))
80    };
81
82    // Use full schema as projected schema
83    let mut iter = parquet_reader
84        .get_row_iter(None)
85        .expect("Failed to create row iterator");
86
87    let mut start = 0;
88    let end = num_records;
89    let all_records = end == 0;
90
91    while all_records || start < end {
92        match iter.next() {
93            Some(row) => print_row(&row.unwrap(), json),
94            None => break,
95        };
96        start += 1;
97    }
98}
99
100fn print_row(row: &Row, json: bool) {
101    if json {
102        println!("{}", row.to_json_value())
103    } else {
104        println!("{row}");
105    }
106}