// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Bloom filter implementation specific to Parquet, as described
//! in the [spec][parquet-bf-spec].
//!
//! # Bloom Filter Size
//!
//! Parquet uses the [Split Block Bloom Filter][sbbf-paper] (SBBF) as its bloom filter
//! implementation. For each column upon which bloom filters are enabled, the offset and length of an SBBF
//! is stored in the metadata for each row group in the parquet file. The size of each filter is
//! initialized using a calculation based on the desired number of distinct values (NDV) and false
//! positive probability (FPP). The FPP for a SBBF can be approximated as<sup>[1][bf-formulae]</sup>:
//!
//! ```text
//! f = (1 - e^(-k * n / m))^k
//! ```
//!
//! Where, `f` is the FPP, `k` the number of hash functions, `n` the NDV, and `m` the total number
//! of bits in the bloom filter. This can be re-arranged to determine the total number of bits
//! required to achieve a given FPP and NDV:
//!
//! ```text
//! m = -k * n / ln(1 - f^(1/k))
//! ```
//!
//! SBBFs use eight hash functions to cleanly fit in SIMD lanes<sup>[2][sbbf-paper]</sup>, therefore
//! `k` is set to 8. The SBBF will spread those `m` bits across a set of `b` blocks that
//! are each 256 bits, i.e., 32 bytes, in size. The number of blocks is chosen as:
//!
//! ```text
//! b = NP2(m/8) / 32
//! ```
//!
//! Where, `NP2` denotes *the next power of two*, and `m` is divided by 8 to be represented as bytes.
//!
//! Here is a table of calculated sizes for various FPP and NDV:
//!
//! | NDV       | FPP       | b       | Size (KB) |
//! |-----------|-----------|---------|-----------|
//! | 10,000    | 0.1       | 256     | 8         |
//! | 10,000    | 0.01      | 512     | 16        |
//! | 10,000    | 0.001     | 1,024   | 32        |
//! | 10,000    | 0.0001    | 1,024   | 32        |
//! | 100,000   | 0.1       | 4,096   | 128       |
//! | 100,000   | 0.01      | 4,096   | 128       |
//! | 100,000   | 0.001     | 8,192   | 256       |
//! | 100,000   | 0.0001    | 16,384  | 512       |
//! | 100,000   | 0.00001   | 16,384  | 512       |
//! | 1,000,000 | 0.1       | 32,768  | 1,024     |
//! | 1,000,000 | 0.01      | 65,536  | 2,048     |
//! | 1,000,000 | 0.001     | 65,536  | 2,048     |
//! | 1,000,000 | 0.0001    | 131,072 | 4,096     |
//! | 1,000,000 | 0.00001   | 131,072 | 4,096     |
//! | 1,000,000 | 0.000001  | 262,144 | 8,192     |
//!
//! # Structure: Filter → Blocks → Words → Bits
//!
//! An SBBF is an array of **blocks**. Each block is 256 bits (32 bytes),
//! divided into eight 32-bit **words**. A word is just a `u32` — an array of
//! 32 individual bits that can each be "set" (1) or "not set" (0).
//!
//! ```text
//!   Sbbf (the whole filter)
//!   ┌──────────┬──────────┬──────────┬─── ─── ──┬──────────┐
//!   │ Block 0  │ Block 1  │ Block 2  │   ...    │ Block N-1│
//!   └──────────┴──────────┴──────────┴─── ─── ──┴──────────┘
//!        │
//!        ▼
//!   One Block = 256 bits = 8 words
//!   ┌────────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
//!   │ word 0 │ word 1 │ word 2 │ word 3 │ word 4 │ word 5 │ word 6 │ word 7 │
//!   │ (u32)  │ (u32)  │ (u32)  │ (u32)  │ (u32)  │ (u32)  │ (u32)  │ (u32)  │
//!   └────────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
//!        │
//!        ▼
//!   One Word = 32 individual bits
//!   ┌─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┐
//!   │0│0│1│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│  ← bit 29 is set
//!   └─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘
//! ```
//!
//! **Inserting** a value hashes it to a 64-bit number, then:
//!  1. The upper 32 bits pick which **block** (via `Sbbf::hash_to_block_index`).
//!  2. The lower 32 bits pick one bit position in each of the 8 **words** (via `Block::mask`).
//!     So each insert sets exactly **8 bits** (one per word) in a single block.
//!
//! **Checking** does the same two steps and returns `true` only if all 8 bits
//! are already set — meaning the value was *probably* inserted (or is a false
//! positive).
//!
//! # Bloom Filter Folding
//!
//! After inserting all values into a bloom filter it can be "folded" to minimize its size.
//! See [`Sbbf::fold_to_target_fpp`] for details on the algorithm and its mathematical basis.
//!
//! [parquet-bf-spec]: https://github.com/apache/parquet-format/blob/master/BloomFilter.md
//! [sbbf-paper]: https://arxiv.org/pdf/2101.01719
//! [bf-formulae]: http://tfk.mit.edu/pdf/bloom.pdf

use crate::basic::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash};
use crate::data_type::AsBytes;
use crate::errors::{ParquetError, Result};
use crate::file::metadata::ColumnChunkMetaData;
use crate::file::reader::ChunkReader;
use crate::parquet_thrift::{
    ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
    ThriftSliceInputProtocol, WriteThrift, WriteThriftField,
};
use crate::thrift_struct;
use bytes::Bytes;
use std::io::Write;
use twox_hash::XxHash64;

/// Salt as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach).
///
/// One multiplier per word of a [`Block`]: the top 5 bits of
/// `x.wrapping_mul(SALT[i])` select the bit position used in word `i`
/// (see `Block::mask`).
const SALT: [u32; 8] = [
    0x47b6137b_u32,
    0x44974d91_u32,
    0x8824ad5b_u32,
    0xa2b7289d_u32,
    0x705495c7_u32,
    0x2df1424b_u32,
    0x9efc4947_u32,
    0x5c6bfb31_u32,
];

// Generates the Thrift compact-protocol (de)serialization impls for the
// header struct declared below; field ids (1..4) are part of the wire format.
thrift_struct!(
/// Bloom filter header is stored at beginning of Bloom filter data of each column
/// and followed by its bitset.
///
pub struct BloomFilterHeader {
  /// The size of bitset in bytes
  1: required i32 num_bytes;
  /// The algorithm for setting bits.
  2: required BloomFilterAlgorithm algorithm;
  /// The hash function used for Bloom filter
  3: required BloomFilterHash hash;
  /// The compression used in the Bloom filter
  4: required BloomFilterCompression compression;
}
);

/// A single 256-bit block, the basic unit of the Split Block Bloom Filter.
///
/// A block is eight contiguous 32-bit **words** (`[u32; 8]`).
/// Each word is an independent bit-array of 32 positions:
///
/// ```text
///   Block (256 bits total)
///   ┌────────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
///   │ word 0 │ word 1 │ word 2 │ word 3 │ word 4 │ word 5 │ word 6 │ word 7 │
///   │ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│
///   └────────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
/// ```
///
/// When a value is inserted, [`Block::mask`] picks one bit in each word
/// (8 bits total), and those bits are OR'd in. When checking, we verify
/// all 8 bits are set.
#[derive(Debug, Copy, Clone)]
// repr(transparent) guarantees the same layout as [u32; 8]; the little-endian
// `Sbbf::write_bitset` relies on this to serialize blocks as raw bytes.
#[repr(transparent)]
struct Block([u32; 8]);
176impl Block {
177    const ZERO: Block = Block([0; 8]);
178
179    /// Produce a block where each of the 8 words has exactly one bit set.
180    ///
181    /// For each word `i` the bit position is derived from `x`:
182    ///
183    /// ```text
184    ///   y = (x wrapping* SALT[i]) >> 27   // top 5 bits → value in 0..31
185    ///   word[i] = 1 << y                  // exactly one bit set per word
186    /// ```
187    ///
188    /// Because only the top 5 bits survive the shift, each word picks one of
189    /// 32 possible bit positions. The eight SALT constants spread the choices
190    /// so different words usually light up different positions.
191    ///
192    /// Key property: the mask depends *only* on `x` (a u32) and the fixed
193    /// SALT constants — it is independent of the filter size. This is why
194    /// folding preserves bit patterns (see Lemma 2 in tests).
195    fn mask(x: u32) -> Self {
196        let mut result = [0_u32; 8];
197        for i in 0..8 {
198            let y = x.wrapping_mul(SALT[i]); // spread bits via multiply
199            let y = y >> 27; // keep top 5 bits → 0..31
200            result[i] = 1 << y; // set exactly that one bit
201        }
202        Self(result)
203    }
204
205    #[inline]
206    #[cfg(not(target_endian = "little"))]
207    fn to_ne_bytes(self) -> [u8; 32] {
208        // SAFETY: [u32; 8] and [u8; 32] have the same size and neither has invalid bit patterns.
209        unsafe { std::mem::transmute(self.0) }
210    }
211
212    #[inline]
213    #[cfg(not(target_endian = "little"))]
214    fn to_le_bytes(self) -> [u8; 32] {
215        self.swap_bytes().to_ne_bytes()
216    }
217
218    #[inline]
219    #[cfg(not(target_endian = "little"))]
220    fn swap_bytes(mut self) -> Self {
221        self.0.iter_mut().for_each(|x| *x = x.swap_bytes());
222        self
223    }
224
225    /// OR the mask bits into this block (`block[i] |= mask[i]`).
226    ///
227    /// After insertion the 8 bits chosen by `mask(hash)` are guaranteed set;
228    /// bits previously set by other hashes are preserved.
229    fn insert(&mut self, hash: u32) {
230        let mask = Self::mask(hash);
231        for i in 0..8 {
232            self[i] |= mask[i];
233        }
234    }
235
236    /// Check membership: returns `true` when *every* bit from `mask(hash)` is
237    /// already set in this block (`block[i] & mask[i] != 0` for all 8 words).
238    ///
239    /// A `true` result means "probably present" (other inserts may have set
240    /// the same bits). A `false` is definitive — the value was never inserted.
241    fn check(&self, hash: u32) -> bool {
242        let mask = Self::mask(hash);
243        for i in 0..8 {
244            if self[i] & mask[i] == 0 {
245                return false;
246            }
247        }
248        true
249    }
250}
251
252impl std::ops::Index<usize> for Block {
253    type Output = u32;
254
255    #[inline]
256    fn index(&self, index: usize) -> &Self::Output {
257        self.0.index(index)
258    }
259}
260
261impl std::ops::IndexMut<usize> for Block {
262    #[inline]
263    fn index_mut(&mut self, index: usize) -> &mut Self::Output {
264        self.0.index_mut(index)
265    }
266}
267
268impl std::ops::BitOr for Block {
269    type Output = Self;
270
271    #[inline]
272    fn bitor(self, rhs: Self) -> Self {
273        let mut result = [0u32; 8];
274        for (i, item) in result.iter_mut().enumerate() {
275            *item = self.0[i] | rhs.0[i];
276        }
277        Self(result)
278    }
279}
280
281impl std::ops::BitOrAssign for Block {
282    #[inline]
283    fn bitor_assign(&mut self, rhs: Self) {
284        for i in 0..8 {
285            self.0[i] |= rhs.0[i];
286        }
287    }
288}
289
290impl Block {
291    /// Count the total number of set bits across all 8 words.
292    ///
293    /// Computes popcount on each word separately and sums. Keeping the popcount
294    /// separate from the OR allows the compiler to batch SIMD popcount instructions
295    /// (e.g., `cnt.16b` on ARM NEON) instead of interleaving them with OR operations.
296    #[inline]
297    fn count_ones(self) -> u32 {
298        // Written as a fold over the array so the compiler sees 8 independent
299        // popcount operations it can vectorize into cnt.16b + horizontal sum.
300        self.0.iter().map(|w| w.count_ones()).sum()
301    }
302}
303
/// A split block Bloom filter (SBBF).
///
/// An SBBF partitions its bit space into fixed-size 256-bit (32-byte) blocks, each fitting in a
/// single CPU cache line. Each block contains eight 32-bit words, aligned with SIMD lanes for
/// parallel bit manipulation. When checking membership, only one block is accessed per query,
/// eliminating the cache-miss penalty of standard Bloom filters.
///
/// ## Sizing and folding
///
/// Filters are initially sized for a maximum expected number of distinct values (NDV) via
/// [`Sbbf::new_with_ndv_fpp`]. After all values are inserted, the filter is compacted by
/// calling [`Sbbf::fold_to_target_fpp`], which folds the filter down to the smallest size
/// that still meets the target false positive probability.
///
/// The creation of this structure is based on the [`crate::file::properties::BloomFilterProperties`]
/// struct set via [`crate::file::properties::WriterProperties`] and is thus hidden by default.
#[derive(Debug, Clone)]
// Invariant: constructors produce a power-of-two block count (see
// `optimal_num_of_bytes`); `num_folds_for_target_fpp` asserts this.
pub struct Sbbf(Vec<Block>);

/// Conservative upper bound on the serialized [`BloomFilterHeader`] size:
/// four Thrift varint-encoded fields at up to 5 bytes each = 20 bytes
/// (see `test_bloom_filter_header_size_assumption`).
pub(crate) const SBBF_HEADER_SIZE_ESTIMATE: usize = 20;

325/// given an initial offset, and a byte buffer, try to read out a bloom filter header and return
326/// both the header and the offset after it (for bitset).
327pub(crate) fn chunk_read_bloom_filter_header_and_offset(
328    offset: u64,
329    buffer: Bytes,
330) -> Result<(BloomFilterHeader, u64), ParquetError> {
331    let (header, length) = read_bloom_filter_header_and_length(buffer)?;
332    Ok((header, offset + length))
333}
334
335/// given a [Bytes] buffer, try to read out a bloom filter header and return both the header and
336/// length of the header.
337#[inline]
338pub(crate) fn read_bloom_filter_header_and_length(
339    buffer: Bytes,
340) -> Result<(BloomFilterHeader, u64), ParquetError> {
341    read_bloom_filter_header_and_length_from_bytes(buffer.as_ref())
342}
343
344/// Given a byte slice, try to read out a bloom filter header and return both the header and
345/// length of the header.
346#[inline]
347fn read_bloom_filter_header_and_length_from_bytes(
348    buffer: &[u8],
349) -> Result<(BloomFilterHeader, u64), ParquetError> {
350    let total_length = buffer.len();
351    let mut prot = ThriftSliceInputProtocol::new(buffer);
352    let header = BloomFilterHeader::read_thrift(&mut prot)
353        .map_err(|e| ParquetError::General(format!("Could not read bloom filter header: {e}")))?;
354    Ok((header, (total_length - prot.as_slice().len()) as u64))
355}
356
/// The minimum number of bytes for a bloom filter bitset.
pub const BITSET_MIN_LENGTH: usize = 32;
/// The maximum number of bytes for a bloom filter bitset.
pub const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024;

/// Clamp `num_bytes` into `[BITSET_MIN_LENGTH, BITSET_MAX_LENGTH]` and round
/// up to the next power of two (both bounds are themselves powers of two).
#[inline]
fn optimal_num_of_bytes(num_bytes: usize) -> usize {
    num_bytes
        .clamp(BITSET_MIN_LENGTH, BITSET_MAX_LENGTH)
        .next_power_of_two()
}

// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
// given fpp = (1 - e^(-k * n / m)) ^ k
// we have m = - k * n / ln(1 - fpp ^ (1 / k))
// where k = number of hash functions, m = number of bits, n = number of distinct values
#[inline]
fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
    // SBBFs fix k = 8 hash functions (one bit per word of a block).
    const K: f64 = 8.0;
    let denominator = (1.0 - fpp.powf(1.0 / K)).ln();
    (-(K * ndv as f64) / denominator) as usize
}

impl Sbbf {
    /// Create a new [Sbbf] with given number of distinct values and false positive probability.
    /// Will return an error if `fpp` is greater than or equal to 1.0 or less than 0.0.
    pub fn new_with_ndv_fpp(ndv: u64, fpp: f64) -> Result<Self, ParquetError> {
        if !(0.0..1.0).contains(&fpp) {
            return Err(ParquetError::General(format!(
                "False positive probability must be between 0.0 and 1.0, got {fpp}"
            )));
        }
        // bits → bytes; `new_with_num_of_bytes` clamps into the legal range
        // and rounds up to a power of two.
        let num_bits = num_of_bits_from_ndv_fpp(ndv, fpp);
        Ok(Self::new_with_num_of_bytes(num_bits / 8))
    }

    /// Create a new [Sbbf] with given number of bytes, the exact number of bytes will be adjusted
    /// to the next power of two bounded by [BITSET_MIN_LENGTH] and [BITSET_MAX_LENGTH].
    pub fn new_with_num_of_bytes(num_bytes: usize) -> Self {
        let num_bytes = optimal_num_of_bytes(num_bytes);
        // Block is 32 bytes; any power of two ≥ 32 divides evenly.
        assert_eq!(num_bytes % size_of::<Block>(), 0);
        let num_blocks = num_bytes / size_of::<Block>();
        let bitset = vec![Block::ZERO; num_blocks];
        Self(bitset)
    }

    /// Creates a new [Sbbf] from a raw byte slice.
    ///
    /// Words are decoded little-endian per the Parquet spec.
    /// NOTE(review): trailing bytes that do not fill a whole 32-byte block are
    /// silently dropped by `chunks_exact` — presumably callers always pass a
    /// whole number of blocks; confirm at call sites.
    pub fn new(bitset: &[u8]) -> Self {
        let data = bitset
            .chunks_exact(4 * 8)
            .map(|chunk| {
                let mut block = Block::ZERO;
                for (i, word) in chunk.chunks_exact(4).enumerate() {
                    block[i] = u32::from_le_bytes(word.try_into().unwrap());
                }
                block
            })
            .collect::<Vec<Block>>();
        Self(data)
    }

    /// Write the bloom filter data (header and then bitset) to the output. This doesn't
    /// flush the writer in order to boost performance of bulk writing all blocks. Caller
    /// must remember to flush the writer.
    /// This method usually is used in conjunction with [`Self::from_bytes`] for serialization/deserialization.
    pub fn write<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
        let mut protocol = ThriftCompactOutputProtocol::new(&mut writer);
        self.header().write_thrift(&mut protocol).map_err(|e| {
            ParquetError::General(format!("Could not write bloom filter header: {e}"))
        })?;
        self.write_bitset(&mut writer)?;
        Ok(())
    }

    /// Write the bitset in serialized form to the writer.
    ///
    /// Big-endian fallback: each block is byte-swapped to little-endian
    /// before being written, one block at a time.
    #[cfg(not(target_endian = "little"))]
    pub fn write_bitset<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
        for block in &self.0 {
            writer
                .write_all(block.to_le_bytes().as_slice())
                .map_err(|e| {
                    ParquetError::General(format!("Could not write bloom filter bit set: {e}"))
                })?;
        }
        Ok(())
    }

    /// Write the bitset in serialized form to the writer.
    ///
    /// Little-endian fast path: the in-memory representation already matches
    /// the on-disk byte order, so the whole block vector is written as one
    /// contiguous byte slice.
    #[cfg(target_endian = "little")]
    pub fn write_bitset<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
        // Safety: Block is repr(transparent) and [u32; 8] can be reinterpreted as [u8; 32].
        let slice = unsafe {
            std::slice::from_raw_parts(
                self.0.as_ptr() as *const u8,
                self.0.len() * size_of::<Block>(),
            )
        };
        writer.write_all(slice).map_err(|e| {
            ParquetError::General(format!("Could not write bloom filter bit set: {e}"))
        })?;
        Ok(())
    }

    /// Create and populate [`BloomFilterHeader`] from this bitset for writing to serialized form
    fn header(&self) -> BloomFilterHeader {
        BloomFilterHeader {
            // 8 i32 per block, 4 bytes per i32
            num_bytes: self.0.len() as i32 * 4 * 8,
            algorithm: BloomFilterAlgorithm::BLOCK,
            hash: BloomFilterHash::XXHASH,
            compression: BloomFilterCompression::UNCOMPRESSED,
        }
    }

    /// Read a new bloom filter from the given offset in the given reader.
    ///
    /// Returns `Ok(None)` when the column has no bloom filter offset recorded.
    /// When the metadata lacks a bloom filter length, the header is read from
    /// a [`SBBF_HEADER_SIZE_ESTIMATE`]-byte prefix and the bitset is fetched
    /// with a second read.
    pub fn read_from_column_chunk<R: ChunkReader>(
        column_metadata: &ColumnChunkMetaData,
        reader: &R,
    ) -> Result<Option<Self>, ParquetError> {
        let offset: u64 = if let Some(offset) = column_metadata.bloom_filter_offset() {
            offset
                .try_into()
                .map_err(|_| ParquetError::General("Bloom filter offset is invalid".to_string()))?
        } else {
            return Ok(None);
        };

        let buffer = match column_metadata.bloom_filter_length() {
            Some(length) => reader.get_bytes(offset, length as usize),
            None => reader.get_bytes(offset, SBBF_HEADER_SIZE_ESTIMATE),
        }?;

        let (header, bitset_offset) =
            chunk_read_bloom_filter_header_and_offset(offset, buffer.clone())?;

        match header.algorithm {
            BloomFilterAlgorithm::BLOCK => {
                // this match exists to future proof the singleton algorithm enum
            }
        }
        match header.compression {
            BloomFilterCompression::UNCOMPRESSED => {
                // this match exists to future proof the singleton compression enum
            }
        }
        match header.hash {
            BloomFilterHash::XXHASH => {
                // this match exists to future proof the singleton hash enum
            }
        }

        let bitset = match column_metadata.bloom_filter_length() {
            // Length known: the buffer already holds header + bitset.
            Some(_) => buffer.slice((bitset_offset - offset) as usize..),
            // Length unknown: issue a second read for exactly the bitset.
            None => {
                let bitset_length: usize = header.num_bytes.try_into().map_err(|_| {
                    ParquetError::General("Bloom filter length is invalid".to_string())
                })?;
                reader.get_bytes(bitset_offset, bitset_length)?
            }
        };

        Ok(Some(Self::new(&bitset)))
    }

    /// Map a 64-bit hash to a block index in `[0, num_blocks)`.
    ///
    /// Uses the "multiply-and-shift" trick (a fast alternative to modulo):
    ///
    /// ```text
    ///   upper32 = hash >> 32           // take the top 32 bits of the hash
    ///   index   = (upper32 * N) >> 32  // ∈ [0, N)  where N = num_blocks
    /// ```
    ///
    /// Why this matters for folding (Lemma 1): when N is a power of two and
    /// you halve it to N/2, the index also halves:
    ///
    /// ```text
    ///   index_N   = (upper32 * N)   >> 32
    ///   index_N/2 = (upper32 * N/2) >> 32 = index_N / 2  (integer division)
    /// ```
    ///
    /// So the block that held hash `h` in the big filter is at `index / 2` in
    /// the half-sized filter — exactly where `fold` ORs it.
    ///
    /// NOTE(review): with `BITSET_MAX_LENGTH` bounding the block count, the
    /// product fits in a `u64`, so `saturating_mul` appears purely defensive.
    #[inline]
    fn hash_to_block_index(&self, hash: u64) -> usize {
        (((hash >> 32).saturating_mul(self.0.len() as u64)) >> 32) as usize
    }

    /// Insert an [AsBytes] value into the filter
    pub fn insert<T: AsBytes + ?Sized>(&mut self, value: &T) {
        self.insert_hash(hash_as_bytes(value));
    }

    /// Insert a hash into the filter
    fn insert_hash(&mut self, hash: u64) {
        // Upper 32 bits select the block; lower 32 bits select the bits.
        let block_index = self.hash_to_block_index(hash);
        self.0[block_index].insert(hash as u32)
    }

    /// Check if an [AsBytes] value is probably present or definitely absent in the filter
    pub fn check<T: AsBytes + ?Sized>(&self, value: &T) -> bool {
        self.check_hash(hash_as_bytes(value))
    }

    /// Check if a hash is in the filter. May return
    /// true for values that was never inserted ("false positive")
    /// but will always return false if a hash has not been inserted.
    fn check_hash(&self, hash: u64) -> bool {
        let block_index = self.hash_to_block_index(hash);
        self.0[block_index].check(hash as u32)
    }

    /// Return the total in memory size of this bloom filter in bytes
    ///
    /// Uses `capacity` rather than `len` to account for the full allocation.
    pub(crate) fn estimated_memory_size(&self) -> usize {
        self.0.capacity() * std::mem::size_of::<Block>()
    }

    /// Returns the number of blocks in this bloom filter.
    pub fn num_blocks(&self) -> usize {
        self.0.len()
    }

    /// Fold the bloom filter down to the smallest size that still meets the target FPP
    /// (False Positive Percentage).
    ///
    /// Folds the filter by merging groups of adjacent blocks via bitwise OR, where each
    /// fold level halves the number of blocks. The fold count is chosen as the maximum
    /// number of folds whose estimated FPP stays within `target_fpp`. The filter stops
    /// at a minimum size of 1 block (32 bytes).
    ///
    /// ## How it works
    ///
    /// SBBFs use multiplicative hashing for block selection:
    ///
    /// ```text
    /// block_index = ((hash >> 32) * num_blocks) >> 32
    /// ```
    ///
    /// A single fold halves the block count: when `num_blocks` is halved, the new index
    /// becomes `floor(original_index / 2)`, so blocks `2i` and `2i+1` map to the same
    /// position. More generally, `k` folds reduce the block count by `2^k`, merging
    /// groups of `2^k` adjacent blocks in a single pass:
    ///
    /// ```text
    /// folded[i] = blocks[i*2^k] | blocks[i*2^k + 1] | ... | blocks[i*2^k + 2^k - 1]
    /// ```
    ///
    /// This differs from standard Bloom filter folding, which merges the two halves
    /// (`B[i] | B[i + m/2]`) because standard filters use modular hashing where
    /// `h(x) mod (m/2)` maps indices `i` and `i + m/2` to the same position.
    ///
    /// ## Correctness
    ///
    /// Folding **never introduces false negatives**. Every bit that was set in the original
    /// filter remains set in the folded filter (via bitwise OR). The only effect is a controlled
    /// increase in FPP as set bits from different blocks are merged together.
    /// This was originally proven in [Sailhan & Stehr 2012] for standard bloom filters and is empirically
    /// demonstrated for SBBFs in Lemma 1 and Lemma 2 of the tests.
    ///
    /// ## References
    ///
    /// [Sailhan & Stehr 2012]: https://doi.org/10.1109/GreenCom.2012.16
    pub fn fold_to_target_fpp(&mut self, target_fpp: f64) {
        let num_folds = self.num_folds_for_target_fpp(target_fpp);
        if num_folds > 0 {
            self.fold_n(num_folds);
        }
    }

    /// Determine how many folds can be applied without exceeding `target_fpp`.
    ///
    /// Computes the average per-block fill rate in a single pass (no allocation),
    /// then analytically estimates the FPP at each fold level.
    ///
    /// When two blocks with independent fill rate `f` are OR'd, the expected fill
    /// of the merged block is `1 - (1-f)^2`. After `k` folds (merging `2^k` blocks):
    ///
    /// ```text
    /// f_k = 1 - (1 - f)^(2^k)
    /// ```
    ///
    /// SBBF membership checks perform `k=8` bit checks within one 256-bit block,
    /// so the estimated FPP at fold level k is `f_k^8`.
    fn num_folds_for_target_fpp(&self, target_fpp: f64) -> u32 {
        let len = self.0.len();
        // A single block cannot be folded further.
        if len < 2 {
            return 0;
        }

        // Single pass: compute average per-block fill rate.
        let total_set_bits: u64 = self.0.iter().map(|b| u64::from(b.count_ones())).sum();
        let avg_fill = total_set_bits as f64 / (len as f64 * 256.0);

        // Empty filter: can fold all the way down.
        if avg_fill == 0.0 {
            return len.trailing_zeros();
        }

        // Find max folds where estimated FPP stays within target.
        // f_k = 1 - (1 - avg_fill)^(2^k), FPP_k = f_k^8
        assert!(
            len.is_power_of_two(),
            "Number of blocks must be a power of 2 for folding"
        );
        let max_folds = len.trailing_zeros(); // log2(len) since len is power of 2
        let one_minus_f = 1.0 - avg_fill;
        let mut num_folds = 0u32;
        let mut one_minus_fk = one_minus_f; // (1-f)^1 initially

        for _ in 0..max_folds {
            // After one more fold: (1-f)^(2^(k+1)) = ((1-f)^(2^k))^2
            one_minus_fk = one_minus_fk * one_minus_fk;
            let fk = 1.0 - one_minus_fk;
            let estimated_fpp = fk.powi(8);
            if estimated_fpp > target_fpp {
                break;
            }
            num_folds += 1;
        }

        num_folds
    }

    /// Fold the filter `num_folds` times in a single pass.
    ///
    /// Merges groups of `2^num_folds` adjacent blocks via bitwise OR, producing
    /// `len / 2^num_folds` output blocks. The original allocation is reused.
    ///
    /// # Panics
    ///
    /// Panics if `num_folds` is 0 or would reduce the filter below 1 block.
    fn fold_n(&mut self, num_folds: u32) {
        assert!(num_folds > 0, "num_folds must be at least 1");
        let len = self.0.len();
        let group_size = 1usize << num_folds;
        assert!(
            group_size <= len,
            "Cannot fold {num_folds} times: need at least {group_size} blocks, have {len}"
        );
        let new_len = len / group_size;
        // Write each merged group to the front of the vec, then truncate;
        // reads (start + j) always stay ahead of the write cursor (i).
        for i in 0..new_len {
            let start = i * group_size;
            let mut merged = self.0[start];
            for j in 1..group_size {
                merged |= self.0[start + j];
            }
            self.0[i] = merged;
        }
        self.0.truncate(new_len);
    }

    /// Reads a [`Sbbf`] from Thrift encoded bytes
    ///
    /// # Examples
    ///
    /// ```no_run
    /// # use parquet::errors::Result;
    /// # use parquet::bloom_filter::Sbbf;
    /// # fn main() -> Result<()> {
    /// // In a real application, you would read serialized bloom filter bytes from a cache.
    /// // This example demonstrates the deserialization process.
    /// // Assuming you have bloom filter bytes from a Parquet file:
    /// # let serialized_bytes: Vec<u8> = vec![];
    /// let bloom_filter = Sbbf::from_bytes(&serialized_bytes)?;
    /// // Now you can use the bloom filter to check for values
    /// if bloom_filter.check(&"some_value") {
    ///     println!("Value might be present (or false positive)");
    /// } else {
    ///     println!("Value is definitely not present");
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub fn from_bytes(bytes: &[u8]) -> Result<Self, ParquetError> {
        let (header, header_len) = read_bloom_filter_header_and_length_from_bytes(bytes)?;

        let bitset_length: u64 = header
            .num_bytes
            .try_into()
            .map_err(|_| ParquetError::General("Bloom filter length is invalid".to_string()))?;

        // Validate that bitset consumes all remaining bytes
        // NOTE(review): this also rejects inputs with *too few* bytes, though
        // the error message only mentions extra bytes.
        if header_len + bitset_length != bytes.len() as u64 {
            return Err(ParquetError::General(format!(
                "Bloom filter data contains extra bytes: expected {} total bytes, got {}",
                header_len + bitset_length,
                bytes.len()
            )));
        }

        let start = header_len as usize;
        let end = (header_len + bitset_length) as usize;
        let bitset = bytes
            .get(start..end)
            .ok_or_else(|| ParquetError::General("Bloom filter bitset is invalid".to_string()))?;

        Ok(Self::new(bitset))
    }
}

756// per spec we use xxHash with seed=0
757const SEED: u64 = 0;
758
759#[inline]
760fn hash_as_bytes<A: AsBytes + ?Sized>(value: &A) -> u64 {
761    XxHash64::oneshot(SEED, value.as_bytes())
762}
763
764#[cfg(test)]
765mod tests {
766    use super::*;
767
768    #[test]
769    fn test_hash_bytes() {
770        assert_eq!(hash_as_bytes(""), 17241709254077376921);
771    }
772
773    #[test]
774    fn test_mask_set_quick_check() {
775        for i in 0..1_000_000 {
776            let result = Block::mask(i);
777            assert!(result.0.iter().all(|&x| x.is_power_of_two()));
778        }
779    }
780
781    #[test]
782    fn test_block_insert_and_check() {
783        for i in 0..1_000_000 {
784            let mut block = Block::ZERO;
785            block.insert(i);
786            assert!(block.check(i));
787        }
788    }
789
790    #[test]
791    fn test_sbbf_insert_and_check() {
792        let mut sbbf = Sbbf(vec![Block::ZERO; 1_000]);
793        for i in 0..1_000_000 {
794            sbbf.insert(&i);
795            assert!(sbbf.check(&i));
796        }
797    }
798
799    #[test]
800    fn test_with_fixture() {
801        // bloom filter produced by parquet-mr/spark for a column of i64 f"a{i}" for i in 0..10
802        let bitset: &[u8] = &[
803            200, 1, 80, 20, 64, 68, 8, 109, 6, 37, 4, 67, 144, 80, 96, 32, 8, 132, 43, 33, 0, 5,
804            99, 65, 2, 0, 224, 44, 64, 78, 96, 4,
805        ];
806        let sbbf = Sbbf::new(bitset);
807        for a in 0..10i64 {
808            let value = format!("a{a}");
809            assert!(sbbf.check(&value.as_str()));
810        }
811    }
812
813    /// test the assumption that bloom filter header size should not exceed SBBF_HEADER_SIZE_ESTIMATE
814    /// essentially we are testing that the struct is packed with 4 i32 fields, each can be 1-5 bytes
815    /// so altogether it'll be 20 bytes at most.
816    #[test]
817    fn test_bloom_filter_header_size_assumption() {
818        let buffer: &[u8; 16] = &[21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 99];
819        let (
820            BloomFilterHeader {
821                algorithm,
822                compression,
823                hash,
824                num_bytes,
825            },
826            read_length,
827        ) = read_bloom_filter_header_and_length(Bytes::copy_from_slice(buffer)).unwrap();
828        assert_eq!(read_length, 15);
829        assert_eq!(algorithm, BloomFilterAlgorithm::BLOCK);
830        assert_eq!(compression, BloomFilterCompression::UNCOMPRESSED);
831        assert_eq!(hash, BloomFilterHash::XXHASH);
832        assert_eq!(num_bytes, 32_i32);
833        assert_eq!(20, SBBF_HEADER_SIZE_ESTIMATE);
834    }
835
836    #[test]
837    fn test_optimal_num_of_bytes() {
838        for (input, expected) in &[
839            (0, 32),
840            (9, 32),
841            (31, 32),
842            (32, 32),
843            (33, 64),
844            (99, 128),
845            (1024, 1024),
846            (999_000_000, 128 * 1024 * 1024),
847        ] {
848            assert_eq!(*expected, optimal_num_of_bytes(*input));
849        }
850    }
851
852    #[test]
853    fn test_num_of_bits_from_ndv_fpp() {
854        for (fpp, ndv, num_bits) in &[
855            (0.1, 10, 57),
856            (0.01, 10, 96),
857            (0.001, 10, 146),
858            (0.1, 100, 577),
859            (0.01, 100, 968),
860            (0.001, 100, 1460),
861            (0.1, 1000, 5772),
862            (0.01, 1000, 9681),
863            (0.001, 1000, 14607),
864            (0.1, 10000, 57725),
865            (0.01, 10000, 96815),
866            (0.001, 10000, 146076),
867            (0.1, 100000, 577254),
868            (0.01, 100000, 968152),
869            (0.001, 100000, 1460769),
870            (0.1, 1000000, 5772541),
871            (0.01, 1000000, 9681526),
872            (0.001, 1000000, 14607697),
873            (1e-50, 1_000_000_000_000, 14226231280773240832),
874        ] {
875            assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64);
876        }
877    }
878
879    #[test]
880    fn test_fold_n_halves_block_count() {
881        let mut sbbf = Sbbf::new_with_num_of_bytes(1024); // 32 blocks
882        assert_eq!(sbbf.num_blocks(), 32);
883        sbbf.fold_n(1);
884        assert_eq!(sbbf.num_blocks(), 16);
885        sbbf.fold_n(1);
886        assert_eq!(sbbf.num_blocks(), 8);
887    }
888
889    #[test]
890    fn test_fold_preserves_inserted_values() {
891        // Create a large filter, insert values, fold, verify no false negatives
892        let mut sbbf = Sbbf::new_with_num_of_bytes(32 * 1024); // 32KB = 1024 blocks
893        let values: Vec<String> = (0..1000).map(|i| format!("value_{i}")).collect();
894        for v in &values {
895            sbbf.insert(v.as_str());
896        }
897
898        // Fold several times
899        let original_blocks = sbbf.num_blocks();
900        sbbf.fold_to_target_fpp(0.05);
901        assert!(
902            sbbf.num_blocks() < original_blocks,
903            "should have folded at least once"
904        );
905
906        // All inserted values must still be found (no false negatives)
907        for v in &values {
908            assert!(
909                sbbf.check(v.as_str()),
910                "Value '{}' missing after folding (false negative!)",
911                v
912            );
913        }
914    }
915
916    #[test]
917    fn test_fold_to_target_fpp_stops_before_exceeding_target() {
918        let mut sbbf = Sbbf::new_with_num_of_bytes(64 * 1024); // 64KB
919        // Insert enough values to set some bits
920        for i in 0..5000 {
921            sbbf.insert(&i);
922        }
923
924        let target_fpp = 0.01;
925        sbbf.fold_to_target_fpp(target_fpp);
926
927        // After folding, the estimated FPP should be at or below target
928        // (the current state should not exceed target — we stopped before that would happen)
929        let total_bits = (sbbf.num_blocks() * 256) as f64;
930        let set_bits: u64 = sbbf
931            .0
932            .iter()
933            .flat_map(|b| b.0.iter())
934            .map(|w| w.count_ones() as u64)
935            .sum();
936        let fill = set_bits as f64 / total_bits;
937        let current_fpp = fill.powi(8);
938        assert!(
939            current_fpp <= target_fpp,
940            "FPP {current_fpp} exceeds target {target_fpp}"
941        );
942    }
943
944    #[test]
945    fn test_fold_empty_filter_folds_to_minimum() {
946        // An empty filter has fill=0, so estimated FPP is always 0 — should fold all the way down
947        let mut sbbf = Sbbf::new_with_num_of_bytes(1024); // 32 blocks
948        sbbf.fold_to_target_fpp(0.01);
949        assert_eq!(sbbf.num_blocks(), 1);
950    }
951
952    #[test]
953    #[should_panic(expected = "Cannot fold 1 times: need at least 2 blocks, have 1")]
954    fn test_fold_n_panics_at_minimum_size() {
955        let mut sbbf = Sbbf::new_with_num_of_bytes(32); // 1 block (minimum)
956        sbbf.fold_n(1);
957    }
958
959    #[test]
960    fn test_sbbf_write_round_trip() {
961        // Create a bloom filter with a 32-byte bitset (minimum size)
962        let bitset_bytes = vec![0u8; 32];
963        let mut original = Sbbf::new(&bitset_bytes);
964
965        // Insert some test values
966        let test_values = ["hello", "world", "rust", "parquet", "bloom", "filter"];
967        for value in &test_values {
968            original.insert(value);
969        }
970
971        // Serialize to bytes
972        let mut output = Vec::new();
973        original.write(&mut output).unwrap();
974
975        // Validate header was written correctly
976        let mut protocol = ThriftSliceInputProtocol::new(&output);
977        let header = BloomFilterHeader::read_thrift(&mut protocol).unwrap();
978        assert_eq!(header.num_bytes, bitset_bytes.len() as i32);
979        assert_eq!(header.algorithm, BloomFilterAlgorithm::BLOCK);
980        assert_eq!(header.hash, BloomFilterHash::XXHASH);
981        assert_eq!(header.compression, BloomFilterCompression::UNCOMPRESSED);
982
983        // Deserialize using from_bytes
984        let reconstructed = Sbbf::from_bytes(&output).unwrap();
985
986        // Most importantly: verify the bloom filter WORKS correctly after round-trip
987        // Note: bloom filters can have false positives, but should never have false negatives
988        // So we can't assert !check(), but we should verify inserted values are found
989        for value in &test_values {
990            assert!(
991                reconstructed.check(value),
992                "Value '{}' should be present after round-trip",
993                value
994            );
995        }
996    }
997
998    /// Prove that folding an SBBF by one level produces the exact same bits
999    /// as building a fresh filter at the smaller size from scratch.
1000    ///
1001    /// # What is folding?
1002    ///
1003    /// ```text
1004    ///   Original (N = 8 blocks):
1005    ///   ┌───┬───┬───┬───┬───┬───┬───┬───┐
1006    ///   │ 0 │ 1 │ 2 │ 3 │ 4 │ 5 │ 6 │ 7 │
1007    ///   └─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┘
1008    ///     │   │   │   │   │   │   │   │
1009    ///     └─OR┘   └─OR┘   └─OR┘   └─OR┘    pair-wise OR
1010    ///       │       │       │       │
1011    ///   ┌───┴──┬────┴──┬────┴──┬────┴──┐
1012    ///   │ 0|1  │ 2|3   │ 4|5   │ 6|7   │   Folded (N/2 = 4 blocks)
1013    ///   └──────┴───────┴───────┴───────┘
1014    /// ```
1015    ///
1016    /// # Why folded == fresh (the two lemmas)
1017    ///
1018    /// An SBBF insertion does two things with a 64-bit hash `h`:
1019    ///
1020    ///   1. **Pick a block** — uses the upper 32 bits via `hash_to_block_index`
1021    ///   2. **Set 8 bits in that block** — uses the lower 32 bits via `Block::mask`
1022    ///
1023    /// **Lemma 1 (block index halves):** `hash_to_block_index` uses
1024    /// `(upper32 * N) >> 32`. When N halves, the index halves too:
1025    /// `index_in(N/2) == index_in(N) / 2`. So the hash lands in the same
1026    /// destination block whether you fold or build fresh.
1027    ///
1028    /// **Lemma 2 (mask is size-independent):** `Block::mask(h as u32)` depends
1029    /// only on the lower 32 bits and the fixed SALT constants — the filter
1030    /// size N is not involved. So the same 8 bits get set regardless.
1031    ///
1032    /// Combined: every hash sets the *same bits* in the *same destination
1033    /// block* whether you fold or build fresh → filters are bit-identical.
    #[test]
    fn test_sbbf_folded_equals_fresh() {
        // Pre-hash every value once so the original and fresh filters are fed
        // the exact same 64-bit hashes via insert_hash below.
        let values = (0..5000).map(|i| format!("elem_{i}")).collect::<Vec<_>>();
        let hashes = values
            .iter()
            .map(|v| hash_as_bytes(v.as_str()))
            .collect::<Vec<_>>();

        // Repeat the proof at several filter sizes.
        for num_blocks in [64, 256, 1024] {
            let half = num_blocks / 2;

            // Build a filter with N blocks and insert all values.
            let mut original = Sbbf::new_with_num_of_bytes(num_blocks * 32);
            assert_eq!(original.num_blocks(), num_blocks);
            for &h in &hashes {
                original.insert_hash(h);
            }

            // --- Per-hash verification of the two lemmas ---
            for &h in hashes.iter() {
                // mask(h as u32) gives the 8-bit pattern that this hash sets
                // inside whichever block it lands in. It uses only the lower
                // 32 bits of h, so it's the same regardless of filter size.
                let mask = Block::mask(h as u32);

                // Lemma 1 check: the block index in the original N-block
                // filter, divided by 2, should equal the block index in a
                // fresh N/2-block filter.
                let orig_idx = original.hash_to_block_index(h);
                assert!(orig_idx < num_blocks);

                let fresh_idx = {
                    // An empty N/2-block filter is enough: hash_to_block_index
                    // depends only on the block count, not on the contents.
                    let tmp = Sbbf(vec![Block::ZERO; half]);
                    tmp.hash_to_block_index(h)
                };
                let folded_idx = orig_idx / 2;
                assert_eq!(
                    fresh_idx, folded_idx,
                    "Lemma 1 failed: fresh index {fresh_idx} != folded index {folded_idx}"
                );

                // Lemma 2 check: every bit that mask wants to set is actually
                // present in the original block.
                //
                // mask.0[w] has exactly ONE bit set (see Block::mask: `1 << y`).
                // The block at orig_idx has many bits set from many inserts, so
                // we can't test equality — we test that the specific mask bit is
                // *present*:
                //
                //   block_word & mask_word != 0
                //     ⟺  "the one bit in the mask is set in the block"
                //
                // (Since mask_word has exactly 1 bit, `& mask != 0` is the same
                //  as `& mask == mask` — but `!= 0` reads more naturally.)
                for w in 0..8 {
                    assert_ne!(
                        original.0[orig_idx].0[w] & mask.0[w],
                        0,
                        "Lemma 2 failed: mask bit not set in word {w} of block {orig_idx}"
                    );
                }
            }

            // --- Final bit-identical comparison ---
            // Fold the original N-block filter down to N/2 blocks.
            let mut folded = original.clone();
            folded.fold_n(1);
            assert_eq!(folded.num_blocks(), half);

            // Build a fresh N/2-block filter with the same values.
            let mut fresh = Sbbf::new_with_num_of_bytes(half * 32);
            for &h in &hashes {
                fresh.insert_hash(h);
            }

            // By lemmas 1 + 2, every block should be bit-identical.
            for j in 0..half {
                assert_eq!(
                    folded.0[j].0, fresh.0[j].0,
                    "Block {j} differs after fold (N={num_blocks} → {half})"
                );
            }
        }
    }
1118
1119    /// Inductive multi-step folding: folding k times from N blocks produces
1120    /// a filter bit-identical to a fresh N/2^k-block filter.
1121    ///
1122    /// `test_sbbf_folded_equals_fresh` proves the base case (one fold).
1123    /// This test applies folds *repeatedly*, checking after each step:
1124    ///
1125    /// ```text
1126    ///   512 ─fold→ 256 ─fold→ 128 ─…→ 1  (9 folds total)
1127    /// ```
1128    ///
1129    /// At each intermediate size we build a fresh filter and assert
1130    /// bit-equality, confirming the lemma composes across folds.
1131    #[test]
1132    fn test_multi_step_fold() {
1133        let values = (0..3000).map(|i| format!("x_{i}")).collect::<Vec<_>>();
1134
1135        // Start with a 512-block filter.
1136        let mut filter = Sbbf::new_with_num_of_bytes(512 * 32);
1137        for v in &values {
1138            filter.insert(v.as_str());
1139        }
1140
1141        // Fold one level at a time, comparing against a fresh filter each step.
1142        for expected_blocks in [256, 128, 64, 32, 16, 8, 4, 2, 1] {
1143            filter.fold_n(1);
1144            assert_eq!(filter.num_blocks(), expected_blocks);
1145
1146            let mut fresh = Sbbf::new_with_num_of_bytes(expected_blocks * 32);
1147            for v in &values {
1148                fresh.insert(v.as_str());
1149            }
1150            for (fb, rb) in filter.0.iter().zip(fresh.0.iter()) {
1151                assert_eq!(fb.0, rb.0);
1152            }
1153        }
1154    }
1155
1156    /// test that the fpp estimator's overestimation doesn't cause fold_to_target_fpp
1157    /// to produce significantly oversized filters
1158    ///
1159    /// compare the final size after folding against the theoretical optimal size
1160    #[test]
1161    fn test_fold_size_vs_optimal_fixed_size() {
1162        for (ndv, target_fpp) in [
1163            (1000, 0.05),
1164            (1000, 0.01),
1165            (5000, 0.05),
1166            (5000, 0.01),
1167            (10000, 0.05),
1168        ] {
1169            let values = (0..ndv).map(|i| format!("d_{i}")).collect::<Vec<_>>();
1170
1171            let mut folded = Sbbf::new_with_num_of_bytes(128 * 1024); // 128KB
1172            for v in &values {
1173                folded.insert(v.as_str());
1174            }
1175            folded.fold_to_target_fpp(target_fpp);
1176
1177            let folded_bytes = folded.num_blocks() * 32;
1178
1179            let optimal = Sbbf::new_with_ndv_fpp(ndv as u64, target_fpp).unwrap();
1180            let optimal_bytes = optimal.num_blocks() * 32;
1181
1182            let ratio = folded_bytes as f64 / optimal_bytes as f64;
1183
1184            assert_eq!(ratio, 1.0);
1185        }
1186    }
1187
1188    /// verify that a folded sbbf has the same empirical fpp as a fresh filter of the same size
1189    /// this bridges the bit-identity proof above with the FPP guarantee from the folding paper
1190    ///     since the bits are identical, the false-positive rate must be too
1191    ///
1192    /// we measure fpp empirically by probing with values that were never inserted
1193    /// and counting how many are incorrectly marked as present
1194    #[test]
1195    fn test_folded_fpp_matches_fresh_fpp() {
1196        let ndv = 2000;
1197        let num_probes = 50_000;
1198        let inserted = (0..ndv)
1199            .map(|i| format!("ins_{i}"))
1200            .collect::<Vec<String>>();
1201
1202        // probe values that were NOT inserted (different prefix guarantees no overlap)
1203        let probes = (0..num_probes)
1204            .map(|i| format!("probe_{i}"))
1205            .collect::<Vec<String>>();
1206
1207        // build a large filter and fold it down several times
1208        let mut folded = Sbbf::new_with_num_of_bytes(512 * 32); // 512 blocks
1209        for v in &inserted {
1210            folded.insert(v.as_str());
1211        }
1212
1213        // check FPP at each fold level
1214        for expected_blocks in [256, 128, 64, 32, 16, 8, 4, 2, 1] {
1215            folded.fold_n(1);
1216            assert_eq!(folded.num_blocks(), expected_blocks);
1217
1218            // build a fresh filter of the same size with the same values
1219            let mut fresh = Sbbf::new_with_num_of_bytes(expected_blocks * 32);
1220            for v in &inserted {
1221                fresh.insert(v.as_str());
1222            }
1223
1224            // measure empirical FPP on both
1225            let mut folded_fp = 0u64;
1226            let mut fresh_fp = 0u64;
1227            for p in &probes {
1228                if folded.check(p.as_str()) {
1229                    folded_fp += 1;
1230                }
1231                if fresh.check(p.as_str()) {
1232                    fresh_fp += 1;
1233                }
1234            }
1235
1236            // bit-identity means these must be exactly equal
1237            assert_eq!(folded_fp, fresh_fp);
1238        }
1239    }
1240}