parquet-show-bloom-filter.rs

use clap::Parser;
use parquet::basic::Type;
use parquet::bloom_filter::Sbbf;
use parquet::file::metadata::ColumnChunkMetaData;
use parquet::file::{
    properties::ReaderProperties,
    reader::{FileReader, SerializedFileReader},
    serialized_reader::ReadOptionsBuilder,
};
use std::{fs::File, path::Path};

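// Example invocation (the file, column, and values below are hypothetical):
//
//   parquet-show-bloom-filter data.parquet id 100 200 300
//
// Each value is parsed to the column's physical type and checked against that
// column's bloom filter in every row group.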
#[derive(Debug, Parser)]
#[clap(author, version, about("Binary file to read bloom filter data from a Parquet file"), long_about = None)]
struct Args {
    #[clap(help("Path to the parquet file"))]
    file_name: String,
    #[clap(help(
        "Check the bloom filter indexes for the given column. Only string typed columns or columns with an Int32 or Int64 physical type are supported"
    ))]
    column: String,
    #[clap(
        help(
            "Check if the given values match the bloom filter; the values will be parsed to the physical type of the column"
        ),
        required = true
    )]
    values: Vec<String>,
}

fn main() {
    let args = Args::parse();
    let file_name = args.file_name;
    let path = Path::new(&file_name);
    let file = File::open(path).expect("Unable to open file");

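    // Bloom filters are not read by default; enable them here so that
    // get_column_bloom_filter below can return the per-column filters.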
    let file_reader = SerializedFileReader::new_with_options(
        file,
        ReadOptionsBuilder::new()
            .with_reader_properties(
                ReaderProperties::builder()
                    .set_read_bloom_filter(true)
                    .build(),
            )
            .build(),
    )
    .expect("Unable to open file as Parquet");
    let metadata = file_reader.metadata();
    for (ri, row_group) in metadata.row_groups().iter().enumerate() {
        println!("Row group #{ri}");
        println!("{}", "=".repeat(80));
        if let Some((column_index, column)) = row_group
            .columns()
            .iter()
            .enumerate()
            .find(|(_, column)| column.column_path().string() == args.column)
        {
            let row_group_reader = file_reader
                .get_row_group(ri)
                .expect("Unable to read row group");
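            // Bloom filters are optional in Parquet, so a row group may not
            // carry one for this column; get_column_bloom_filter returns an Option.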
            if let Some(sbbf) = row_group_reader.get_column_bloom_filter(column_index) {
                args.values.iter().for_each(|value| {
                    match check_filter(sbbf, value, column) {
                        Ok(present) => {
                            println!(
                                "Value {} is {} in bloom filter",
                                value,
                                if present { "present" } else { "absent" }
                            )
                        }
                        Err(err) => {
                            println!("{err}");
                        }
                    };
                });
            } else {
                println!("No bloom filter found for column {}", args.column);
            }
        } else {
            println!(
                "No column named {} found, candidate columns are: {}",
                args.column,
                row_group
                    .columns()
                    .iter()
                    .map(|c| c.column_path().string())
                    .collect::<Vec<_>>()
                    .join(", ")
            );
        }
    }
}

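/// Parse `value` according to the column's physical type and probe the bloom
/// filter. `Ok(true)` means the value may be present (bloom filters can report
/// false positives); `Ok(false)` means the value is definitely absent.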
fn check_filter(sbbf: &Sbbf, value: &String, column: &ColumnChunkMetaData) -> Result<bool, String> {
    match column.column_type() {
        Type::INT32 => {
            let value: i32 = value
                .parse()
                .map_err(|e| format!("Unable to parse value '{value}' to i32: {e}"))?;
            Ok(sbbf.check(&value))
        }
        Type::INT64 => {
            let value: i64 = value
                .parse()
                .map_err(|e| format!("Unable to parse value '{value}' to i64: {e}"))?;
            Ok(sbbf.check(&value))
        }
        Type::BYTE_ARRAY => Ok(sbbf.check(&value.as_str())),
        _ => Err(format!(
            "Unsupported column type for checking bloom filter: {}",
            column.column_type()
        )),
    }
}
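
For this tool to report anything, the input file must have been written with bloom filters, which is opt-in on the writer side. The following is a minimal sketch of a separate program, assuming the parquet crate (with its arrow feature) and the arrow-array crate; the path "data.parquet", the column "id", and the values are hypothetical and chosen to match the invocation comment above, and bloom filter support may require additional crate features depending on the parquet version.

use std::{fs::File, sync::Arc};

use arrow_array::{ArrayRef, Int64Array, RecordBatch};
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;

fn main() {
    // Hypothetical output file and column name.
    let file = File::create("data.parquet").expect("Unable to create file");
    let batch = RecordBatch::try_from_iter([(
        "id",
        Arc::new(Int64Array::from(vec![100i64, 200, 300])) as ArrayRef,
    )])
    .expect("Unable to build record batch");
    // Bloom filter writing is opt-in via WriterProperties.
    let props = WriterProperties::builder()
        .set_bloom_filter_enabled(true)
        .build();
    let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props))
        .expect("Unable to create Arrow writer");
    writer.write(&batch).expect("Unable to write batch");
    writer.close().expect("Unable to close writer");
}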