parquet/bloom_filter/mod.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Bloom filter implementation specific to Parquet, as described
19//! in the [spec][parquet-bf-spec].
20//!
21//! # Bloom Filter Size
22//!
23//! Parquet uses the [Split Block Bloom Filter][sbbf-paper] (SBBF) as its bloom filter
24//! implementation. For each column upon which bloom filters are enabled, the offset and length of an SBBF
25//! is stored in the metadata for each row group in the parquet file. The size of each filter is
26//! initialized using a calculation based on the desired number of distinct values (NDV) and false
27//! positive probability (FPP). The FPP for a SBBF can be approximated as<sup>[1][bf-formulae]</sup>:
28//!
29//! ```text
30//! f = (1 - e^(-k * n / m))^k
31//! ```
32//!
33//! Where, `f` is the FPP, `k` the number of hash functions, `n` the NDV, and `m` the total number
34//! of bits in the bloom filter. This can be re-arranged to determine the total number of bits
35//! required to achieve a given FPP and NDV:
36//!
37//! ```text
38//! m = -k * n / ln(1 - f^(1/k))
39//! ```
40//!
41//! SBBFs use eight hash functions to cleanly fit in SIMD lanes<sup>[2][sbbf-paper]</sup>, therefore
//! `k` is set to 8. The SBBF will spread those `m` bits across a set of `b` blocks that
43//! are each 256 bits, i.e., 32 bytes, in size. The number of blocks is chosen as:
44//!
45//! ```text
46//! b = NP2(m/8) / 32
47//! ```
48//!
49//! Where, `NP2` denotes *the next power of two*, and `m` is divided by 8 to be represented as bytes.
50//!
51//! Here is a table of calculated sizes for various FPP and NDV:
52//!
53//! | NDV | FPP | b | Size (KB) |
54//! |-----------|-----------|---------|-----------|
55//! | 10,000 | 0.1 | 256 | 8 |
56//! | 10,000 | 0.01 | 512 | 16 |
57//! | 10,000 | 0.001 | 1,024 | 32 |
58//! | 10,000 | 0.0001 | 1,024 | 32 |
59//! | 100,000 | 0.1 | 4,096 | 128 |
60//! | 100,000 | 0.01 | 4,096 | 128 |
61//! | 100,000 | 0.001 | 8,192 | 256 |
62//! | 100,000 | 0.0001 | 16,384 | 512 |
63//! | 100,000 | 0.00001 | 16,384 | 512 |
64//! | 1,000,000 | 0.1 | 32,768 | 1,024 |
65//! | 1,000,000 | 0.01 | 65,536 | 2,048 |
66//! | 1,000,000 | 0.001 | 65,536 | 2,048 |
67//! | 1,000,000 | 0.0001 | 131,072 | 4,096 |
68//! | 1,000,000 | 0.00001 | 131,072 | 4,096 |
69//! | 1,000,000 | 0.000001 | 262,144 | 8,192 |
70//!
71//! # Structure: Filter → Blocks → Words → Bits
72//!
73//! An SBBF is an array of **blocks**. Each block is 256 bits (32 bytes),
74//! divided into eight 32-bit **words**. A word is just a `u32` — an array of
75//! 32 individual bits that can each be "set" (1) or "not set" (0).
76//!
77//! ```text
78//! Sbbf (the whole filter)
79//! ┌──────────┬──────────┬──────────┬─── ─── ──┬──────────┐
80//! │ Block 0 │ Block 1 │ Block 2 │ ... │ Block N-1│
81//! └──────────┴──────────┴──────────┴─── ─── ──┴──────────┘
82//! │
83//! ▼
84//! One Block = 256 bits = 8 words
85//! ┌────────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
86//! │ word 0 │ word 1 │ word 2 │ word 3 │ word 4 │ word 5 │ word 6 │ word 7 │
87//! │ (u32) │ (u32) │ (u32) │ (u32) │ (u32) │ (u32) │ (u32) │ (u32) │
88//! └────────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
89//! │
90//! ▼
91//! One Word = 32 individual bits
92//! ┌─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┐
93//! │0│0│1│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│ ← bit 29 is set
94//! └─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘
95//! ```
96//!
97//! **Inserting** a value hashes it to a 64-bit number, then:
98//! 1. The upper 32 bits pick which **block** (via `Sbbf::hash_to_block_index`).
99//! 2. The lower 32 bits pick one bit position in each of the 8 **words** (via `Block::mask`).
100//! So each insert sets exactly **8 bits** (one per word) in a single block.
101//!
102//! **Checking** does the same two steps and returns `true` only if all 8 bits
103//! are already set — meaning the value was *probably* inserted (or is a false
104//! positive).
105//!
106//! # Bloom Filter Folding
107//!
//! After inserting all values into a bloom filter it can be "folded" to minimize its size.
109//! See [`Sbbf::fold_to_target_fpp`] for details on the algorithm and its mathematical basis.
110//!
111//! [parquet-bf-spec]: https://github.com/apache/parquet-format/blob/master/BloomFilter.md
112//! [sbbf-paper]: https://arxiv.org/pdf/2101.01719
113//! [bf-formulae]: http://tfk.mit.edu/pdf/bloom.pdf
114
115use crate::basic::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash};
116use crate::data_type::AsBytes;
117use crate::errors::{ParquetError, Result};
118use crate::file::metadata::ColumnChunkMetaData;
119use crate::file::reader::ChunkReader;
120use crate::parquet_thrift::{
121 ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
122 ThriftSliceInputProtocol, WriteThrift, WriteThriftField,
123};
124use crate::thrift_struct;
125use bytes::Bytes;
126use std::io::Write;
127use twox_hash::XxHash64;
128
/// Salt as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach).
///
/// One constant per word in a block: [`Block::mask`] multiplies the hash by
/// `SALT[i]` and keeps the top 5 bits of the product to pick a bit position
/// in word `i`.
const SALT: [u32; 8] = [
    0x47b6137b_u32,
    0x44974d91_u32,
    0x8824ad5b_u32,
    0xa2b7289d_u32,
    0x705495c7_u32,
    0x2df1424b_u32,
    0x9efc4947_u32,
    0x5c6bfb31_u32,
];
140
thrift_struct!(
/// Bloom filter header is stored at the beginning of the Bloom filter data of
/// each column and is immediately followed by its bitset.
pub struct BloomFilterHeader {
  /// The size of bitset in bytes
  1: required i32 num_bytes;
  /// The algorithm for setting bits.
  2: required BloomFilterAlgorithm algorithm;
  /// The hash function used for Bloom filter
  3: required BloomFilterHash hash;
  /// The compression used in the Bloom filter
  4: required BloomFilterCompression compression;
}
);
156
/// A single 256-bit block, the basic unit of the Split Block Bloom Filter.
///
/// A block is eight contiguous 32-bit **words** (`[u32; 8]`).
/// Each word is an independent bit-array of 32 positions:
///
/// ```text
/// Block (256 bits total)
/// ┌────────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
/// │ word 0 │ word 1 │ word 2 │ word 3 │ word 4 │ word 5 │ word 6 │ word 7 │
/// │ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│
/// └────────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
/// ```
///
/// When a value is inserted, [`Block::mask`] picks one bit in each word
/// (8 bits total), and those bits are OR'd in. When checking, we verify
/// all 8 bits are set.
// `repr(transparent)` guarantees the same layout as `[u32; 8]`; the
// little-endian `write_bitset` relies on this to reinterpret blocks as bytes.
#[derive(Debug, Copy, Clone)]
#[repr(transparent)]
struct Block([u32; 8]);
176impl Block {
177 const ZERO: Block = Block([0; 8]);
178
179 /// Produce a block where each of the 8 words has exactly one bit set.
180 ///
181 /// For each word `i` the bit position is derived from `x`:
182 ///
183 /// ```text
184 /// y = (x wrapping* SALT[i]) >> 27 // top 5 bits → value in 0..31
185 /// word[i] = 1 << y // exactly one bit set per word
186 /// ```
187 ///
188 /// Because only the top 5 bits survive the shift, each word picks one of
189 /// 32 possible bit positions. The eight SALT constants spread the choices
190 /// so different words usually light up different positions.
191 ///
192 /// Key property: the mask depends *only* on `x` (a u32) and the fixed
193 /// SALT constants — it is independent of the filter size. This is why
194 /// folding preserves bit patterns (see Lemma 2 in tests).
195 fn mask(x: u32) -> Self {
196 let mut result = [0_u32; 8];
197 for i in 0..8 {
198 let y = x.wrapping_mul(SALT[i]); // spread bits via multiply
199 let y = y >> 27; // keep top 5 bits → 0..31
200 result[i] = 1 << y; // set exactly that one bit
201 }
202 Self(result)
203 }
204
205 #[inline]
206 #[cfg(not(target_endian = "little"))]
207 fn to_ne_bytes(self) -> [u8; 32] {
208 // SAFETY: [u32; 8] and [u8; 32] have the same size and neither has invalid bit patterns.
209 unsafe { std::mem::transmute(self.0) }
210 }
211
212 #[inline]
213 #[cfg(not(target_endian = "little"))]
214 fn to_le_bytes(self) -> [u8; 32] {
215 self.swap_bytes().to_ne_bytes()
216 }
217
218 #[inline]
219 #[cfg(not(target_endian = "little"))]
220 fn swap_bytes(mut self) -> Self {
221 self.0.iter_mut().for_each(|x| *x = x.swap_bytes());
222 self
223 }
224
225 /// OR the mask bits into this block (`block[i] |= mask[i]`).
226 ///
227 /// After insertion the 8 bits chosen by `mask(hash)` are guaranteed set;
228 /// bits previously set by other hashes are preserved.
229 fn insert(&mut self, hash: u32) {
230 let mask = Self::mask(hash);
231 for i in 0..8 {
232 self[i] |= mask[i];
233 }
234 }
235
236 /// Check membership: returns `true` when *every* bit from `mask(hash)` is
237 /// already set in this block (`block[i] & mask[i] != 0` for all 8 words).
238 ///
239 /// A `true` result means "probably present" (other inserts may have set
240 /// the same bits). A `false` is definitive — the value was never inserted.
241 fn check(&self, hash: u32) -> bool {
242 let mask = Self::mask(hash);
243 for i in 0..8 {
244 if self[i] & mask[i] == 0 {
245 return false;
246 }
247 }
248 true
249 }
250}
251
252impl std::ops::Index<usize> for Block {
253 type Output = u32;
254
255 #[inline]
256 fn index(&self, index: usize) -> &Self::Output {
257 self.0.index(index)
258 }
259}
260
261impl std::ops::IndexMut<usize> for Block {
262 #[inline]
263 fn index_mut(&mut self, index: usize) -> &mut Self::Output {
264 self.0.index_mut(index)
265 }
266}
267
268impl std::ops::BitOr for Block {
269 type Output = Self;
270
271 #[inline]
272 fn bitor(self, rhs: Self) -> Self {
273 let mut result = [0u32; 8];
274 for (i, item) in result.iter_mut().enumerate() {
275 *item = self.0[i] | rhs.0[i];
276 }
277 Self(result)
278 }
279}
280
281impl std::ops::BitOrAssign for Block {
282 #[inline]
283 fn bitor_assign(&mut self, rhs: Self) {
284 for i in 0..8 {
285 self.0[i] |= rhs.0[i];
286 }
287 }
288}
289
290impl Block {
291 /// Count the total number of set bits across all 8 words.
292 ///
293 /// Computes popcount on each word separately and sums. Keeping the popcount
294 /// separate from the OR allows the compiler to batch SIMD popcount instructions
295 /// (e.g., `cnt.16b` on ARM NEON) instead of interleaving them with OR operations.
296 #[inline]
297 fn count_ones(self) -> u32 {
298 // Written as a fold over the array so the compiler sees 8 independent
299 // popcount operations it can vectorize into cnt.16b + horizontal sum.
300 self.0.iter().map(|w| w.count_ones()).sum()
301 }
302}
303
/// A split block Bloom filter (SBBF).
///
/// An SBBF partitions its bit space into fixed-size 256-bit (32-byte) blocks, each fitting in a
/// single CPU cache line. Each block contains eight 32-bit words, aligned with SIMD lanes for
/// parallel bit manipulation. When checking membership, only one block is accessed per query,
/// eliminating the cache-miss penalty of standard Bloom filters.
///
/// The wrapped `Vec<Block>` is the entire filter state.
///
/// ## Sizing and folding
///
/// Filters are initially sized for a maximum expected number of distinct values (NDV) via
/// [`Sbbf::new_with_ndv_fpp`]. After all values are inserted, the filter is compacted by
/// calling [`Sbbf::fold_to_target_fpp`], which folds the filter down to the smallest size
/// that still meets the target false positive probability.
///
/// The creation of this structure is based on the [`crate::file::properties::BloomFilterProperties`]
/// struct set via [`crate::file::properties::WriterProperties`] and is thus hidden by default.
#[derive(Debug, Clone)]
pub struct Sbbf(Vec<Block>);
322
/// Conservative upper bound on the thrift-encoded size of a [`BloomFilterHeader`]:
/// four fields, each at most 5 bytes when varint-encoded
/// (see `test_bloom_filter_header_size_assumption`).
pub(crate) const SBBF_HEADER_SIZE_ESTIMATE: usize = 20;
324
325/// given an initial offset, and a byte buffer, try to read out a bloom filter header and return
326/// both the header and the offset after it (for bitset).
327pub(crate) fn chunk_read_bloom_filter_header_and_offset(
328 offset: u64,
329 buffer: Bytes,
330) -> Result<(BloomFilterHeader, u64), ParquetError> {
331 let (header, length) = read_bloom_filter_header_and_length(buffer)?;
332 Ok((header, offset + length))
333}
334
335/// given a [Bytes] buffer, try to read out a bloom filter header and return both the header and
336/// length of the header.
337#[inline]
338pub(crate) fn read_bloom_filter_header_and_length(
339 buffer: Bytes,
340) -> Result<(BloomFilterHeader, u64), ParquetError> {
341 read_bloom_filter_header_and_length_from_bytes(buffer.as_ref())
342}
343
344/// Given a byte slice, try to read out a bloom filter header and return both the header and
345/// length of the header.
346#[inline]
347fn read_bloom_filter_header_and_length_from_bytes(
348 buffer: &[u8],
349) -> Result<(BloomFilterHeader, u64), ParquetError> {
350 let total_length = buffer.len();
351 let mut prot = ThriftSliceInputProtocol::new(buffer);
352 let header = BloomFilterHeader::read_thrift(&mut prot)
353 .map_err(|e| ParquetError::General(format!("Could not read bloom filter header: {e}")))?;
354 Ok((header, (total_length - prot.as_slice().len()) as u64))
355}
356
/// The minimum number of bytes for a bloom filter bitset.
pub const BITSET_MIN_LENGTH: usize = 32;
/// The maximum number of bytes for a bloom filter bitset.
pub const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024;

/// Round `num_bytes` up to the next power of two, clamped to
/// `[BITSET_MIN_LENGTH, BITSET_MAX_LENGTH]`.
///
/// Both bounds are themselves powers of two, so the result always stays in
/// range (rounding up from `BITSET_MAX_LENGTH` is a no-op).
#[inline]
fn optimal_num_of_bytes(num_bytes: usize) -> usize {
    // `clamp` replaces the previous `.min(MAX).max(MIN)` chain; identical
    // because BITSET_MIN_LENGTH < BITSET_MAX_LENGTH.
    num_bytes
        .clamp(BITSET_MIN_LENGTH, BITSET_MAX_LENGTH)
        .next_power_of_two()
}
368
// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
// given fpp = (1 - e^(-k * n / m)) ^ k
// we have m = - k * n / ln(1 - fpp ^ (1 / k))
// where k = number of hash functions, m = number of bits, n = number of distinct values
#[inline]
fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
    // SBBFs fix k = 8 (one probed bit per word of a block).
    const K: f64 = 8.0;
    let num_bits = -K * ndv as f64 / (1.0 - fpp.powf(1.0 / K)).ln();
    // Truncating cast; out-of-range or NaN results saturate per Rust's
    // float-to-int cast rules.
    num_bits as usize
}
378
379impl Sbbf {
380 /// Create a new [Sbbf] with given number of distinct values and false positive probability.
381 /// Will return an error if `fpp` is greater than or equal to 1.0 or less than 0.0.
382 pub fn new_with_ndv_fpp(ndv: u64, fpp: f64) -> Result<Self, ParquetError> {
383 if !(0.0..1.0).contains(&fpp) {
384 return Err(ParquetError::General(format!(
385 "False positive probability must be between 0.0 and 1.0, got {fpp}"
386 )));
387 }
388 let num_bits = num_of_bits_from_ndv_fpp(ndv, fpp);
389 Ok(Self::new_with_num_of_bytes(num_bits / 8))
390 }
391
392 /// Create a new [Sbbf] with given number of bytes, the exact number of bytes will be adjusted
393 /// to the next power of two bounded by [BITSET_MIN_LENGTH] and [BITSET_MAX_LENGTH].
394 pub fn new_with_num_of_bytes(num_bytes: usize) -> Self {
395 let num_bytes = optimal_num_of_bytes(num_bytes);
396 assert_eq!(num_bytes % size_of::<Block>(), 0);
397 let num_blocks = num_bytes / size_of::<Block>();
398 let bitset = vec![Block::ZERO; num_blocks];
399 Self(bitset)
400 }
401
402 /// Creates a new [Sbbf] from a raw byte slice.
403 pub fn new(bitset: &[u8]) -> Self {
404 let data = bitset
405 .chunks_exact(4 * 8)
406 .map(|chunk| {
407 let mut block = Block::ZERO;
408 for (i, word) in chunk.chunks_exact(4).enumerate() {
409 block[i] = u32::from_le_bytes(word.try_into().unwrap());
410 }
411 block
412 })
413 .collect::<Vec<Block>>();
414 Self(data)
415 }
416
417 /// Write the bloom filter data (header and then bitset) to the output. This doesn't
418 /// flush the writer in order to boost performance of bulk writing all blocks. Caller
419 /// must remember to flush the writer.
420 /// This method usually is used in conjunction with [`Self::from_bytes`] for serialization/deserialization.
421 pub fn write<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
422 let mut protocol = ThriftCompactOutputProtocol::new(&mut writer);
423 self.header().write_thrift(&mut protocol).map_err(|e| {
424 ParquetError::General(format!("Could not write bloom filter header: {e}"))
425 })?;
426 self.write_bitset(&mut writer)?;
427 Ok(())
428 }
429
430 /// Write the bitset in serialized form to the writer.
431 #[cfg(not(target_endian = "little"))]
432 pub fn write_bitset<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
433 for block in &self.0 {
434 writer
435 .write_all(block.to_le_bytes().as_slice())
436 .map_err(|e| {
437 ParquetError::General(format!("Could not write bloom filter bit set: {e}"))
438 })?;
439 }
440 Ok(())
441 }
442
    /// Write the bitset in serialized form to the writer.
    ///
    /// On little-endian targets the in-memory representation of the blocks is
    /// already the on-disk (little-endian) layout, so the whole `Vec<Block>`
    /// is written with a single reinterpreting borrow and no copying.
    #[cfg(target_endian = "little")]
    pub fn write_bitset<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
        // Safety: Block is repr(transparent) over [u32; 8], so every Block is
        // exactly 32 bytes with no padding and any byte pattern is valid to
        // read; the pointer and byte length both come from the same live Vec
        // allocation, so the slice covers initialized memory.
        let slice = unsafe {
            std::slice::from_raw_parts(
                self.0.as_ptr() as *const u8,
                self.0.len() * size_of::<Block>(),
            )
        };
        writer.write_all(slice).map_err(|e| {
            ParquetError::General(format!("Could not write bloom filter bit set: {e}"))
        })?;
        Ok(())
    }
458
459 /// Create and populate [`BloomFilterHeader`] from this bitset for writing to serialized form
460 fn header(&self) -> BloomFilterHeader {
461 BloomFilterHeader {
462 // 8 i32 per block, 4 bytes per i32
463 num_bytes: self.0.len() as i32 * 4 * 8,
464 algorithm: BloomFilterAlgorithm::BLOCK,
465 hash: BloomFilterHash::XXHASH,
466 compression: BloomFilterCompression::UNCOMPRESSED,
467 }
468 }
469
    /// Read a new bloom filter from the given offset in the given reader.
    ///
    /// Returns `Ok(None)` when the column chunk metadata records no bloom
    /// filter offset; errors if the recorded offset is invalid or any read
    /// fails.
    pub fn read_from_column_chunk<R: ChunkReader>(
        column_metadata: &ColumnChunkMetaData,
        reader: &R,
    ) -> Result<Option<Self>, ParquetError> {
        let offset: u64 = if let Some(offset) = column_metadata.bloom_filter_offset() {
            offset
                .try_into()
                .map_err(|_| ParquetError::General("Bloom filter offset is invalid".to_string()))?
        } else {
            return Ok(None);
        };

        // If the metadata records the total filter length, a single read
        // fetches header + bitset; otherwise read just enough to cover the
        // header (SBBF_HEADER_SIZE_ESTIMATE is its maximum encoded size).
        let buffer = match column_metadata.bloom_filter_length() {
            Some(length) => reader.get_bytes(offset, length as usize),
            None => reader.get_bytes(offset, SBBF_HEADER_SIZE_ESTIMATE),
        }?;

        let (header, bitset_offset) =
            chunk_read_bloom_filter_header_and_offset(offset, buffer.clone())?;

        match header.algorithm {
            BloomFilterAlgorithm::BLOCK => {
                // this match exists to future proof the singleton algorithm enum
            }
        }
        match header.compression {
            BloomFilterCompression::UNCOMPRESSED => {
                // this match exists to future proof the singleton compression enum
            }
        }
        match header.hash {
            BloomFilterHash::XXHASH => {
                // this match exists to future proof the singleton hash enum
            }
        }

        let bitset = match column_metadata.bloom_filter_length() {
            // Whole filter already in `buffer`: the bitset starts right after
            // the header, i.e. `bitset_offset - offset` bytes in.
            Some(_) => buffer.slice((bitset_offset - offset) as usize..),
            // Otherwise issue a second read for exactly the bitset bytes.
            None => {
                let bitset_length: usize = header.num_bytes.try_into().map_err(|_| {
                    ParquetError::General("Bloom filter length is invalid".to_string())
                })?;
                reader.get_bytes(bitset_offset, bitset_length)?
            }
        };

        Ok(Some(Self::new(&bitset)))
    }
519
    /// Map a 64-bit hash to a block index in `[0, num_blocks)`.
    ///
    /// Uses the "multiply-and-shift" trick (a fast alternative to modulo):
    ///
    /// ```text
    /// upper32 = hash >> 32 // take the top 32 bits of the hash
    /// index = (upper32 * N) >> 32 // ∈ [0, N) where N = num_blocks
    /// ```
    ///
    /// Why this matters for folding (Lemma 1): when N is a power of two and
    /// you halve it to N/2, the index also halves:
    ///
    /// ```text
    /// index_N = (upper32 * N) >> 32
    /// index_N/2 = (upper32 * N/2) >> 32 = index_N / 2 (integer division)
    /// ```
    ///
    /// So the block that held hash `h` in the big filter is at `index / 2` in
    /// the half-sized filter — exactly where `fold` ORs it.
    #[inline]
    fn hash_to_block_index(&self, hash: u64) -> usize {
        // `hash >> 32` is < 2^32 and num_blocks is at most 2^22
        // (BITSET_MAX_LENGTH / 32 bytes), so the product fits in a u64;
        // the saturation is purely defensive.
        (((hash >> 32).saturating_mul(self.0.len() as u64)) >> 32) as usize
    }
543
544 /// Insert an [AsBytes] value into the filter
545 pub fn insert<T: AsBytes + ?Sized>(&mut self, value: &T) {
546 self.insert_hash(hash_as_bytes(value));
547 }
548
549 /// Insert a hash into the filter
550 fn insert_hash(&mut self, hash: u64) {
551 let block_index = self.hash_to_block_index(hash);
552 self.0[block_index].insert(hash as u32)
553 }
554
555 /// Check if an [AsBytes] value is probably present or definitely absent in the filter
556 pub fn check<T: AsBytes + ?Sized>(&self, value: &T) -> bool {
557 self.check_hash(hash_as_bytes(value))
558 }
559
560 /// Check if a hash is in the filter. May return
561 /// true for values that was never inserted ("false positive")
562 /// but will always return false if a hash has not been inserted.
563 fn check_hash(&self, hash: u64) -> bool {
564 let block_index = self.hash_to_block_index(hash);
565 self.0[block_index].check(hash as u32)
566 }
567
568 /// Return the total in memory size of this bloom filter in bytes
569 pub(crate) fn estimated_memory_size(&self) -> usize {
570 self.0.capacity() * std::mem::size_of::<Block>()
571 }
572
573 /// Returns the number of blocks in this bloom filter.
574 pub fn num_blocks(&self) -> usize {
575 self.0.len()
576 }
577
    /// Fold the bloom filter down to the smallest size that still meets the target FPP
    /// (False Positive Percentage).
    ///
    /// Folds the filter by merging groups of adjacent blocks via bitwise OR, where each
    /// fold level halves the number of blocks. The fold count is chosen as the maximum
    /// number of folds whose estimated FPP stays within `target_fpp`. The filter stops
    /// at a minimum size of 1 block (32 bytes).
    ///
    /// ## How it works
    ///
    /// SBBFs use multiplicative hashing for block selection:
    ///
    /// ```text
    /// block_index = ((hash >> 32) * num_blocks) >> 32
    /// ```
    ///
    /// A single fold halves the block count: when `num_blocks` is halved, the new index
    /// becomes `floor(original_index / 2)`, so blocks `2i` and `2i+1` map to the same
    /// position. More generally, `k` folds reduce the block count by `2^k`, merging
    /// groups of `2^k` adjacent blocks in a single pass:
    ///
    /// ```text
    /// folded[i] = blocks[i*2^k] | blocks[i*2^k + 1] | ... | blocks[i*2^k + 2^k - 1]
    /// ```
    ///
    /// This differs from standard Bloom filter folding, which merges the two halves
    /// (`B[i] | B[i + m/2]`) because standard filters use modular hashing where
    /// `h(x) mod (m/2)` maps indices `i` and `i + m/2` to the same position.
    ///
    /// ## Correctness
    ///
    /// Folding **never introduces false negatives**. Every bit that was set in the original
    /// filter remains set in the folded filter (via bitwise OR). The only effect is a controlled
    /// increase in FPP as set bits from different blocks are merged together.
    /// This was originally proven in [Sailhan & Stehr 2012] for standard bloom filters and is empirically
    /// demonstrated for SBBFs in Lemma 1 and Lemma 2 of the tests.
    ///
    /// ## References
    ///
    /// [Sailhan & Stehr 2012]: https://doi.org/10.1109/GreenCom.2012.16
    pub fn fold_to_target_fpp(&mut self, target_fpp: f64) {
        // Zero folds means the filter is already as small as the target allows.
        let num_folds = self.num_folds_for_target_fpp(target_fpp);
        if num_folds > 0 {
            self.fold_n(num_folds);
        }
    }
624
    /// Determine how many folds can be applied without exceeding `target_fpp`.
    ///
    /// Computes the average per-block fill rate in a single pass (no allocation),
    /// then analytically estimates the FPP at each fold level.
    ///
    /// When two blocks with independent fill rate `f` are OR'd, the expected fill
    /// of the merged block is `1 - (1-f)^2`. After `k` folds (merging `2^k` blocks):
    ///
    /// ```text
    /// f_k = 1 - (1 - f)^(2^k)
    /// ```
    ///
    /// SBBF membership checks perform `k=8` bit checks within one 256-bit block,
    /// so the estimated FPP at fold level k is `f_k^8`.
    fn num_folds_for_target_fpp(&self, target_fpp: f64) -> u32 {
        let len = self.0.len();
        if len < 2 {
            // Already a single block (or empty): nothing to fold.
            return 0;
        }

        // Single pass: compute average per-block fill rate.
        let total_set_bits: u64 = self.0.iter().map(|b| u64::from(b.count_ones())).sum();
        let avg_fill = total_set_bits as f64 / (len as f64 * 256.0);

        // Empty filter: can fold all the way down.
        // NOTE(review): this path bypasses the power-of-two assert below, and
        // `trailing_zeros` equals log2(len) only for powers of two — confirm
        // empty filters always have power-of-two block counts here.
        if avg_fill == 0.0 {
            return len.trailing_zeros();
        }

        // Find max folds where estimated FPP stays within target.
        // f_k = 1 - (1 - avg_fill)^(2^k), FPP_k = f_k^8
        // NOTE(review): writer-created filters always satisfy this assert, but
        // `Sbbf::new` accepts arbitrary block counts — confirm folding is only
        // applied to writer-created filters.
        assert!(
            len.is_power_of_two(),
            "Number of blocks must be a power of 2 for folding"
        );
        let max_folds = len.trailing_zeros(); // log2(len) since len is power of 2
        let one_minus_f = 1.0 - avg_fill;
        let mut num_folds = 0u32;
        let mut one_minus_fk = one_minus_f; // (1-f)^1 initially

        for _ in 0..max_folds {
            // After one more fold: (1-f)^(2^(k+1)) = ((1-f)^(2^k))^2
            one_minus_fk = one_minus_fk * one_minus_fk;
            let fk = 1.0 - one_minus_fk;
            let estimated_fpp = fk.powi(8);
            if estimated_fpp > target_fpp {
                break;
            }
            num_folds += 1;
        }

        num_folds
    }
678
679 /// Fold the filter `num_folds` times in a single pass.
680 ///
681 /// Merges groups of `2^num_folds` adjacent blocks via bitwise OR, producing
682 /// `len / 2^num_folds` output blocks. The original allocation is reused.
683 ///
684 /// # Panics
685 ///
686 /// Panics if `num_folds` is 0 or would reduce the filter below 1 block.
687 fn fold_n(&mut self, num_folds: u32) {
688 assert!(num_folds > 0, "num_folds must be at least 1");
689 let len = self.0.len();
690 let group_size = 1usize << num_folds;
691 assert!(
692 group_size <= len,
693 "Cannot fold {num_folds} times: need at least {group_size} blocks, have {len}"
694 );
695 let new_len = len / group_size;
696 for i in 0..new_len {
697 let start = i * group_size;
698 let mut merged = self.0[start];
699 for j in 1..group_size {
700 merged |= self.0[start + j];
701 }
702 self.0[i] = merged;
703 }
704 self.0.truncate(new_len);
705 }
706
707 /// Reads a Sbff from Thrift encoded bytes
708 ///
709 /// # Examples
710 ///
711 /// ```no_run
712 /// # use parquet::errors::Result;
713 /// # use parquet::bloom_filter::Sbbf;
714 /// # fn main() -> Result<()> {
715 /// // In a real application, you would read serialized bloom filter bytes from a cache.
716 /// // This example demonstrates the deserialization process.
717 /// // Assuming you have bloom filter bytes from a Parquet file:
718 /// # let serialized_bytes: Vec<u8> = vec![];
719 /// let bloom_filter = Sbbf::from_bytes(&serialized_bytes)?;
720 /// // Now you can use the bloom filter to check for values
721 /// if bloom_filter.check(&"some_value") {
722 /// println!("Value might be present (or false positive)");
723 /// } else {
724 /// println!("Value is definitely not present");
725 /// }
726 /// # Ok(())
727 /// # }
728 /// ```
729 pub fn from_bytes(bytes: &[u8]) -> Result<Self, ParquetError> {
730 let (header, header_len) = read_bloom_filter_header_and_length_from_bytes(bytes)?;
731
732 let bitset_length: u64 = header
733 .num_bytes
734 .try_into()
735 .map_err(|_| ParquetError::General("Bloom filter length is invalid".to_string()))?;
736
737 // Validate that bitset consumes all remaining bytes
738 if header_len + bitset_length != bytes.len() as u64 {
739 return Err(ParquetError::General(format!(
740 "Bloom filter data contains extra bytes: expected {} total bytes, got {}",
741 header_len + bitset_length,
742 bytes.len()
743 )));
744 }
745
746 let start = header_len as usize;
747 let end = (header_len + bitset_length) as usize;
748 let bitset = bytes
749 .get(start..end)
750 .ok_or_else(|| ParquetError::General("Bloom filter bitset is invalid".to_string()))?;
751
752 Ok(Self::new(bitset))
753 }
754}
755
// per spec we use xxHash with seed=0
const SEED: u64 = 0;

/// Hash an [`AsBytes`] value with xxHash64 (seed 0), as required by the
/// Parquet bloom filter spec.
#[inline]
fn hash_as_bytes<A: AsBytes + ?Sized>(value: &A) -> u64 {
    XxHash64::oneshot(SEED, value.as_bytes())
}
763
764#[cfg(test)]
765mod tests {
766 use super::*;
767
    /// Pin the hash function choice: xxHash64 with seed 0 of the empty string.
    #[test]
    fn test_hash_bytes() {
        assert_eq!(hash_as_bytes(""), 17241709254077376921);
    }

    /// Each of the 8 mask words must have exactly one bit set (a power of two).
    #[test]
    fn test_mask_set_quick_check() {
        for i in 0..1_000_000 {
            let result = Block::mask(i);
            assert!(result.0.iter().all(|&x| x.is_power_of_two()));
        }
    }

    /// Inserting a hash into a block must make `check` for that hash succeed.
    #[test]
    fn test_block_insert_and_check() {
        for i in 0..1_000_000 {
            let mut block = Block::ZERO;
            block.insert(i);
            assert!(block.check(i));
        }
    }

    /// No false negatives: every inserted value must be reported as present.
    #[test]
    fn test_sbbf_insert_and_check() {
        let mut sbbf = Sbbf(vec![Block::ZERO; 1_000]);
        for i in 0..1_000_000 {
            sbbf.insert(&i);
            assert!(sbbf.check(&i));
        }
    }
798
    /// Cross-implementation compatibility: a bitset produced by parquet-mr/spark
    /// must report all of its inserted values as present.
    #[test]
    fn test_with_fixture() {
        // bloom filter produced by parquet-mr/spark for a column of i64 f"a{i}" for i in 0..10
        let bitset: &[u8] = &[
            200, 1, 80, 20, 64, 68, 8, 109, 6, 37, 4, 67, 144, 80, 96, 32, 8, 132, 43, 33, 0, 5,
            99, 65, 2, 0, 224, 44, 64, 78, 96, 4,
        ];
        let sbbf = Sbbf::new(bitset);
        for a in 0..10i64 {
            let value = format!("a{a}");
            assert!(sbbf.check(&value.as_str()));
        }
    }

    /// test the assumption that bloom filter header size should not exceed SBBF_HEADER_SIZE_ESTIMATE
    /// essentially we are testing that the struct is packed with 4 i32 fields, each can be 1-5 bytes
    /// so altogether it'll be 20 bytes at most.
    #[test]
    fn test_bloom_filter_header_size_assumption() {
        // 16-byte buffer whose final byte (99) lies past the encoded header:
        // read_length == 15 proves the reader stops exactly at the header end.
        let buffer: &[u8; 16] = &[21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 99];
        let (
            BloomFilterHeader {
                algorithm,
                compression,
                hash,
                num_bytes,
            },
            read_length,
        ) = read_bloom_filter_header_and_length(Bytes::copy_from_slice(buffer)).unwrap();
        assert_eq!(read_length, 15);
        assert_eq!(algorithm, BloomFilterAlgorithm::BLOCK);
        assert_eq!(compression, BloomFilterCompression::UNCOMPRESSED);
        assert_eq!(hash, BloomFilterHash::XXHASH);
        assert_eq!(num_bytes, 32_i32);
        assert_eq!(20, SBBF_HEADER_SIZE_ESTIMATE);
    }
835
    /// `optimal_num_of_bytes` rounds up to a power of two within
    /// [BITSET_MIN_LENGTH, BITSET_MAX_LENGTH].
    #[test]
    fn test_optimal_num_of_bytes() {
        for (input, expected) in &[
            (0, 32),
            (9, 32),
            (31, 32),
            (32, 32),
            (33, 64),
            (99, 128),
            (1024, 1024),
            (999_000_000, 128 * 1024 * 1024),
        ] {
            assert_eq!(*expected, optimal_num_of_bytes(*input));
        }
    }

    /// Spot-check the bit-count formula against precomputed (fpp, ndv) pairs;
    /// the final row exercises a very large result.
    #[test]
    fn test_num_of_bits_from_ndv_fpp() {
        for (fpp, ndv, num_bits) in &[
            (0.1, 10, 57),
            (0.01, 10, 96),
            (0.001, 10, 146),
            (0.1, 100, 577),
            (0.01, 100, 968),
            (0.001, 100, 1460),
            (0.1, 1000, 5772),
            (0.01, 1000, 9681),
            (0.001, 1000, 14607),
            (0.1, 10000, 57725),
            (0.01, 10000, 96815),
            (0.001, 10000, 146076),
            (0.1, 100000, 577254),
            (0.01, 100000, 968152),
            (0.001, 100000, 1460769),
            (0.1, 1000000, 5772541),
            (0.01, 1000000, 9681526),
            (0.001, 1000000, 14607697),
            (1e-50, 1_000_000_000_000, 14226231280773240832),
        ] {
            assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64);
        }
    }
878
    /// Each single fold must halve the block count.
    #[test]
    fn test_fold_n_halves_block_count() {
        let mut sbbf = Sbbf::new_with_num_of_bytes(1024); // 32 blocks
        assert_eq!(sbbf.num_blocks(), 32);
        sbbf.fold_n(1);
        assert_eq!(sbbf.num_blocks(), 16);
        sbbf.fold_n(1);
        assert_eq!(sbbf.num_blocks(), 8);
    }

    /// Folding must never introduce false negatives for inserted values.
    #[test]
    fn test_fold_preserves_inserted_values() {
        // Create a large filter, insert values, fold, verify no false negatives
        let mut sbbf = Sbbf::new_with_num_of_bytes(32 * 1024); // 32KB = 1024 blocks
        let values: Vec<String> = (0..1000).map(|i| format!("value_{i}")).collect();
        for v in &values {
            sbbf.insert(v.as_str());
        }

        // Fold several times
        let original_blocks = sbbf.num_blocks();
        sbbf.fold_to_target_fpp(0.05);
        assert!(
            sbbf.num_blocks() < original_blocks,
            "should have folded at least once"
        );

        // All inserted values must still be found (no false negatives)
        for v in &values {
            assert!(
                sbbf.check(v.as_str()),
                "Value '{}' missing after folding (false negative!)",
                v
            );
        }
    }

    /// Folding must stop before the estimated FPP exceeds the target.
    #[test]
    fn test_fold_to_target_fpp_stops_before_exceeding_target() {
        let mut sbbf = Sbbf::new_with_num_of_bytes(64 * 1024); // 64KB
        // Insert enough values to set some bits
        for i in 0..5000 {
            sbbf.insert(&i);
        }

        let target_fpp = 0.01;
        sbbf.fold_to_target_fpp(target_fpp);

        // After folding, the estimated FPP should be at or below target
        // (the current state should not exceed target — we stopped before that would happen)
        let total_bits = (sbbf.num_blocks() * 256) as f64;
        let set_bits: u64 = sbbf
            .0
            .iter()
            .flat_map(|b| b.0.iter())
            .map(|w| w.count_ones() as u64)
            .sum();
        let fill = set_bits as f64 / total_bits;
        let current_fpp = fill.powi(8);
        assert!(
            current_fpp <= target_fpp,
            "FPP {current_fpp} exceeds target {target_fpp}"
        );
    }
943
944 #[test]
945 fn test_fold_empty_filter_folds_to_minimum() {
946 // An empty filter has fill=0, so estimated FPP is always 0 — should fold all the way down
947 let mut sbbf = Sbbf::new_with_num_of_bytes(1024); // 32 blocks
948 sbbf.fold_to_target_fpp(0.01);
949 assert_eq!(sbbf.num_blocks(), 1);
950 }
951
952 #[test]
953 #[should_panic(expected = "Cannot fold 1 times: need at least 2 blocks, have 1")]
954 fn test_fold_n_panics_at_minimum_size() {
955 let mut sbbf = Sbbf::new_with_num_of_bytes(32); // 1 block (minimum)
956 sbbf.fold_n(1);
957 }
958
959 #[test]
960 fn test_sbbf_write_round_trip() {
961 // Create a bloom filter with a 32-byte bitset (minimum size)
962 let bitset_bytes = vec![0u8; 32];
963 let mut original = Sbbf::new(&bitset_bytes);
964
965 // Insert some test values
966 let test_values = ["hello", "world", "rust", "parquet", "bloom", "filter"];
967 for value in &test_values {
968 original.insert(value);
969 }
970
971 // Serialize to bytes
972 let mut output = Vec::new();
973 original.write(&mut output).unwrap();
974
975 // Validate header was written correctly
976 let mut protocol = ThriftSliceInputProtocol::new(&output);
977 let header = BloomFilterHeader::read_thrift(&mut protocol).unwrap();
978 assert_eq!(header.num_bytes, bitset_bytes.len() as i32);
979 assert_eq!(header.algorithm, BloomFilterAlgorithm::BLOCK);
980 assert_eq!(header.hash, BloomFilterHash::XXHASH);
981 assert_eq!(header.compression, BloomFilterCompression::UNCOMPRESSED);
982
983 // Deserialize using from_bytes
984 let reconstructed = Sbbf::from_bytes(&output).unwrap();
985
986 // Most importantly: verify the bloom filter WORKS correctly after round-trip
987 // Note: bloom filters can have false positives, but should never have false negatives
988 // So we can't assert !check(), but we should verify inserted values are found
989 for value in &test_values {
990 assert!(
991 reconstructed.check(value),
992 "Value '{}' should be present after round-trip",
993 value
994 );
995 }
996 }
997
998 /// Prove that folding an SBBF by one level produces the exact same bits
999 /// as building a fresh filter at the smaller size from scratch.
1000 ///
1001 /// # What is folding?
1002 ///
1003 /// ```text
1004 /// Original (N = 8 blocks):
1005 /// ┌───┬───┬───┬───┬───┬───┬───┬───┐
1006 /// │ 0 │ 1 │ 2 │ 3 │ 4 │ 5 │ 6 │ 7 │
1007 /// └─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┘
1008 /// │ │ │ │ │ │ │ │
1009 /// └─OR┘ └─OR┘ └─OR┘ └─OR┘ pair-wise OR
1010 /// │ │ │ │
1011 /// ┌───┴──┬────┴──┬────┴──┬────┴──┐
1012 /// │ 0|1 │ 2|3 │ 4|5 │ 6|7 │ Folded (N/2 = 4 blocks)
1013 /// └──────┴───────┴───────┴───────┘
1014 /// ```
1015 ///
1016 /// # Why folded == fresh (the two lemmas)
1017 ///
1018 /// An SBBF insertion does two things with a 64-bit hash `h`:
1019 ///
1020 /// 1. **Pick a block** — uses the upper 32 bits via `hash_to_block_index`
1021 /// 2. **Set 8 bits in that block** — uses the lower 32 bits via `Block::mask`
1022 ///
1023 /// **Lemma 1 (block index halves):** `hash_to_block_index` uses
1024 /// `(upper32 * N) >> 32`. When N halves, the index halves too:
1025 /// `index_in(N/2) == index_in(N) / 2`. So the hash lands in the same
1026 /// destination block whether you fold or build fresh.
1027 ///
1028 /// **Lemma 2 (mask is size-independent):** `Block::mask(h as u32)` depends
1029 /// only on the lower 32 bits and the fixed SALT constants — the filter
1030 /// size N is not involved. So the same 8 bits get set regardless.
1031 ///
1032 /// Combined: every hash sets the *same bits* in the *same destination
1033 /// block* whether you fold or build fresh → filters are bit-identical.
1034 #[test]
1035 fn test_sbbf_folded_equals_fresh() {
1036 let values = (0..5000).map(|i| format!("elem_{i}")).collect::<Vec<_>>();
1037 let hashes = values
1038 .iter()
1039 .map(|v| hash_as_bytes(v.as_str()))
1040 .collect::<Vec<_>>();
1041
1042 for num_blocks in [64, 256, 1024] {
1043 let half = num_blocks / 2;
1044
1045 // Build a filter with N blocks and insert all values.
1046 let mut original = Sbbf::new_with_num_of_bytes(num_blocks * 32);
1047 assert_eq!(original.num_blocks(), num_blocks);
1048 for &h in &hashes {
1049 original.insert_hash(h);
1050 }
1051
1052 // --- Per-hash verification of the two lemmas ---
1053 for &h in hashes.iter() {
1054 // mask(h as u32) gives the 8-bit pattern that this hash sets
1055 // inside whichever block it lands in. It uses only the lower
1056 // 32 bits of h, so it's the same regardless of filter size.
1057 let mask = Block::mask(h as u32);
1058
1059 // Lemma 1 check: the block index in the original N-block
1060 // filter, divided by 2, should equal the block index in a
1061 // fresh N/2-block filter.
1062 let orig_idx = original.hash_to_block_index(h);
1063 assert!(orig_idx < num_blocks);
1064
1065 let fresh_idx = {
1066 let tmp = Sbbf(vec![Block::ZERO; half]);
1067 tmp.hash_to_block_index(h)
1068 };
1069 let folded_idx = orig_idx / 2;
1070 assert_eq!(
1071 fresh_idx, folded_idx,
1072 "Lemma 1 failed: fresh index {fresh_idx} != folded index {folded_idx}"
1073 );
1074
1075 // Lemma 2 check: every bit that mask wants to set is actually
1076 // present in the original block.
1077 //
1078 // mask.0[w] has exactly ONE bit set (see Block::mask: `1 << y`).
1079 // The block at orig_idx has many bits set from many inserts, so
1080 // we can't test equality — we test that the specific mask bit is
1081 // *present*:
1082 //
1083 // block_word & mask_word != 0
1084 // ⟺ "the one bit in the mask is set in the block"
1085 //
1086 // (Since mask_word has exactly 1 bit, `& mask != 0` is the same
1087 // as `& mask == mask` — but `!= 0` reads more naturally.)
1088 for w in 0..8 {
1089 assert_ne!(
1090 original.0[orig_idx].0[w] & mask.0[w],
1091 0,
1092 "Lemma 2 failed: mask bit not set in word {w} of block {orig_idx}"
1093 );
1094 }
1095 }
1096
1097 // --- Final bit-identical comparison ---
1098 // Fold the original N-block filter down to N/2 blocks.
1099 let mut folded = original.clone();
1100 folded.fold_n(1);
1101 assert_eq!(folded.num_blocks(), half);
1102
1103 // Build a fresh N/2-block filter with the same values.
1104 let mut fresh = Sbbf::new_with_num_of_bytes(half * 32);
1105 for &h in &hashes {
1106 fresh.insert_hash(h);
1107 }
1108
1109 // By lemmas 1 + 2, every block should be bit-identical.
1110 for j in 0..half {
1111 assert_eq!(
1112 folded.0[j].0, fresh.0[j].0,
1113 "Block {j} differs after fold (N={num_blocks} → {half})"
1114 );
1115 }
1116 }
1117 }
1118
1119 /// Inductive multi-step folding: folding k times from N blocks produces
1120 /// a filter bit-identical to a fresh N/2^k-block filter.
1121 ///
1122 /// `test_sbbf_folded_equals_fresh` proves the base case (one fold).
1123 /// This test applies folds *repeatedly*, checking after each step:
1124 ///
1125 /// ```text
1126 /// 512 ─fold→ 256 ─fold→ 128 ─…→ 1 (9 folds total)
1127 /// ```
1128 ///
1129 /// At each intermediate size we build a fresh filter and assert
1130 /// bit-equality, confirming the lemma composes across folds.
1131 #[test]
1132 fn test_multi_step_fold() {
1133 let values = (0..3000).map(|i| format!("x_{i}")).collect::<Vec<_>>();
1134
1135 // Start with a 512-block filter.
1136 let mut filter = Sbbf::new_with_num_of_bytes(512 * 32);
1137 for v in &values {
1138 filter.insert(v.as_str());
1139 }
1140
1141 // Fold one level at a time, comparing against a fresh filter each step.
1142 for expected_blocks in [256, 128, 64, 32, 16, 8, 4, 2, 1] {
1143 filter.fold_n(1);
1144 assert_eq!(filter.num_blocks(), expected_blocks);
1145
1146 let mut fresh = Sbbf::new_with_num_of_bytes(expected_blocks * 32);
1147 for v in &values {
1148 fresh.insert(v.as_str());
1149 }
1150 for (fb, rb) in filter.0.iter().zip(fresh.0.iter()) {
1151 assert_eq!(fb.0, rb.0);
1152 }
1153 }
1154 }
1155
1156 /// test that the fpp estimator's overestimation doesn't cause fold_to_target_fpp
1157 /// to produce significantly oversized filters
1158 ///
1159 /// compare the final size after folding against the theoretical optimal size
1160 #[test]
1161 fn test_fold_size_vs_optimal_fixed_size() {
1162 for (ndv, target_fpp) in [
1163 (1000, 0.05),
1164 (1000, 0.01),
1165 (5000, 0.05),
1166 (5000, 0.01),
1167 (10000, 0.05),
1168 ] {
1169 let values = (0..ndv).map(|i| format!("d_{i}")).collect::<Vec<_>>();
1170
1171 let mut folded = Sbbf::new_with_num_of_bytes(128 * 1024); // 128KB
1172 for v in &values {
1173 folded.insert(v.as_str());
1174 }
1175 folded.fold_to_target_fpp(target_fpp);
1176
1177 let folded_bytes = folded.num_blocks() * 32;
1178
1179 let optimal = Sbbf::new_with_ndv_fpp(ndv as u64, target_fpp).unwrap();
1180 let optimal_bytes = optimal.num_blocks() * 32;
1181
1182 let ratio = folded_bytes as f64 / optimal_bytes as f64;
1183
1184 assert_eq!(ratio, 1.0);
1185 }
1186 }
1187
1188 /// verify that a folded sbbf has the same empirical fpp as a fresh filter of the same size
1189 /// this bridges the bit-identity proof above with the FPP guarantee from the folding paper
1190 /// since the bits are identical, the false-positive rate must be too
1191 ///
1192 /// we measure fpp empirically by probing with values that were never inserted
1193 /// and counting how many are incorrectly marked as present
1194 #[test]
1195 fn test_folded_fpp_matches_fresh_fpp() {
1196 let ndv = 2000;
1197 let num_probes = 50_000;
1198 let inserted = (0..ndv)
1199 .map(|i| format!("ins_{i}"))
1200 .collect::<Vec<String>>();
1201
1202 // probe values that were NOT inserted (different prefix guarantees no overlap)
1203 let probes = (0..num_probes)
1204 .map(|i| format!("probe_{i}"))
1205 .collect::<Vec<String>>();
1206
1207 // build a large filter and fold it down several times
1208 let mut folded = Sbbf::new_with_num_of_bytes(512 * 32); // 512 blocks
1209 for v in &inserted {
1210 folded.insert(v.as_str());
1211 }
1212
1213 // check FPP at each fold level
1214 for expected_blocks in [256, 128, 64, 32, 16, 8, 4, 2, 1] {
1215 folded.fold_n(1);
1216 assert_eq!(folded.num_blocks(), expected_blocks);
1217
1218 // build a fresh filter of the same size with the same values
1219 let mut fresh = Sbbf::new_with_num_of_bytes(expected_blocks * 32);
1220 for v in &inserted {
1221 fresh.insert(v.as_str());
1222 }
1223
1224 // measure empirical FPP on both
1225 let mut folded_fp = 0u64;
1226 let mut fresh_fp = 0u64;
1227 for p in &probes {
1228 if folded.check(p.as_str()) {
1229 folded_fp += 1;
1230 }
1231 if fresh.check(p.as_str()) {
1232 fresh_fp += 1;
1233 }
1234 }
1235
1236 // bit-identity means these must be exactly equal
1237 assert_eq!(folded_fp, fresh_fp);
1238 }
1239 }
1240}