// parquet/bloom_filter/mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Bloom filter implementation specific to Parquet, as described
19//! in the [spec][parquet-bf-spec].
20//!
21//! # Bloom Filter Size
22//!
23//! Parquet uses the [Split Block Bloom Filter][sbbf-paper] (SBBF) as its bloom filter
24//! implementation. For each column upon which bloom filters are enabled, the offset and length of an SBBF
25//! is stored in  the metadata for each row group in the parquet file. The size of each filter is
26//! initialized using a calculation based on the desired number of distinct values (NDV) and false
27//! positive probability (FPP). The FPP for a SBBF can be approximated as<sup>[1][bf-formulae]</sup>:
28//!
29//! ```text
30//! f = (1 - e^(-k * n / m))^k
31//! ```
32//!
33//! Where, `f` is the FPP, `k` the number of hash functions, `n` the NDV, and `m` the total number
34//! of bits in the bloom filter. This can be re-arranged to determine the total number of bits
35//! required to achieve a given FPP and NDV:
36//!
37//! ```text
38//! m = -k * n / ln(1 - f^(1/k))
39//! ```
40//!
41//! SBBFs use eight hash functions to cleanly fit in SIMD lanes<sup>[2][sbbf-paper]</sup>, therefore
//! `k` is set to 8. The SBBF will spread those `m` bits across a set of `b` blocks that
43//! are each 256 bits, i.e., 32 bytes, in size. The number of blocks is chosen as:
44//!
45//! ```text
46//! b = NP2(m/8) / 32
47//! ```
48//!
49//! Where, `NP2` denotes *the next power of two*, and `m` is divided by 8 to be represented as bytes.
50//!
51//! Here is a table of calculated sizes for various FPP and NDV:
52//!
53//! | NDV       | FPP       | b       | Size (KB) |
54//! |-----------|-----------|---------|-----------|
55//! | 10,000    | 0.1       | 256     | 8         |
56//! | 10,000    | 0.01      | 512     | 16        |
57//! | 10,000    | 0.001     | 1,024   | 32        |
58//! | 10,000    | 0.0001    | 1,024   | 32        |
59//! | 100,000   | 0.1       | 4,096   | 128       |
60//! | 100,000   | 0.01      | 4,096   | 128       |
61//! | 100,000   | 0.001     | 8,192   | 256       |
62//! | 100,000   | 0.0001    | 16,384  | 512       |
63//! | 100,000   | 0.00001   | 16,384  | 512       |
64//! | 1,000,000 | 0.1       | 32,768  | 1,024     |
65//! | 1,000,000 | 0.01      | 65,536  | 2,048     |
66//! | 1,000,000 | 0.001     | 65,536  | 2,048     |
67//! | 1,000,000 | 0.0001    | 131,072 | 4,096     |
68//! | 1,000,000 | 0.00001   | 131,072 | 4,096     |
69//! | 1,000,000 | 0.000001  | 262,144 | 8,192     |
70//!
71//! [parquet-bf-spec]: https://github.com/apache/parquet-format/blob/master/BloomFilter.md
72//! [sbbf-paper]: https://arxiv.org/pdf/2101.01719
73//! [bf-formulae]: http://tfk.mit.edu/pdf/bloom.pdf
74
75use crate::basic::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash};
76use crate::data_type::AsBytes;
77use crate::errors::{ParquetError, Result};
78use crate::file::metadata::ColumnChunkMetaData;
79use crate::file::reader::ChunkReader;
80use crate::parquet_thrift::{
81    ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
82    ThriftSliceInputProtocol, WriteThrift, WriteThriftField,
83};
84use crate::thrift_struct;
85use bytes::Bytes;
86use std::io::Write;
87use twox_hash::XxHash64;
88
89/// Salt as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach).
90const SALT: [u32; 8] = [
91    0x47b6137b_u32,
92    0x44974d91_u32,
93    0x8824ad5b_u32,
94    0xa2b7289d_u32,
95    0x705495c7_u32,
96    0x2df1424b_u32,
97    0x9efc4947_u32,
98    0x5c6bfb31_u32,
99];
100
thrift_struct!(
/// Bloom filter header is stored at beginning of Bloom filter data of each column
/// and followed by its bitset.
///
/// The field ids and types below are written in a thrift-IDL-like form; the
/// `thrift_struct!` macro generates the Rust struct plus its thrift
/// read/write implementations from this definition.
pub struct BloomFilterHeader {
  /// The size of bitset in bytes
  1: required i32 num_bytes;
  /// The algorithm for setting bits.
  2: required BloomFilterAlgorithm algorithm;
  /// The hash function used for Bloom filter
  3: required BloomFilterHash hash;
  /// The compression used in the Bloom filter
  4: required BloomFilterCompression compression;
}
);
116
/// Each block is 256 bits, broken up into eight contiguous "words", each consisting of 32 bits.
/// Each word is thought of as an array of bits; each bit is either "set" or "not set".
///
/// `repr(transparent)` guarantees this has exactly the layout of `[u32; 8]`,
/// which the little-endian `Sbbf::write_bitset` relies on to reinterpret a
/// `&[Block]` as raw bytes.
#[derive(Debug, Copy, Clone)]
#[repr(transparent)]
struct Block([u32; 8]);
122impl Block {
123    const ZERO: Block = Block([0; 8]);
124
125    /// takes as its argument a single unsigned 32-bit integer and returns a block in which each
126    /// word has exactly one bit set.
127    fn mask(x: u32) -> Self {
128        let mut result = [0_u32; 8];
129        for i in 0..8 {
130            // wrapping instead of checking for overflow
131            let y = x.wrapping_mul(SALT[i]);
132            let y = y >> 27;
133            result[i] = 1 << y;
134        }
135        Self(result)
136    }
137
138    #[inline]
139    #[cfg(not(target_endian = "little"))]
140    fn to_ne_bytes(self) -> [u8; 32] {
141        // SAFETY: [u32; 8] and [u8; 32] have the same size and neither has invalid bit patterns.
142        unsafe { std::mem::transmute(self.0) }
143    }
144
145    #[inline]
146    #[cfg(not(target_endian = "little"))]
147    fn to_le_bytes(self) -> [u8; 32] {
148        self.swap_bytes().to_ne_bytes()
149    }
150
151    #[inline]
152    #[cfg(not(target_endian = "little"))]
153    fn swap_bytes(mut self) -> Self {
154        self.0.iter_mut().for_each(|x| *x = x.swap_bytes());
155        self
156    }
157
158    /// setting every bit in the block that was also set in the result from mask
159    fn insert(&mut self, hash: u32) {
160        let mask = Self::mask(hash);
161        for i in 0..8 {
162            self[i] |= mask[i];
163        }
164    }
165
166    /// returns true when every bit that is set in the result of mask is also set in the block.
167    fn check(&self, hash: u32) -> bool {
168        let mask = Self::mask(hash);
169        for i in 0..8 {
170            if self[i] & mask[i] == 0 {
171                return false;
172            }
173        }
174        true
175    }
176}
177
178impl std::ops::Index<usize> for Block {
179    type Output = u32;
180
181    #[inline]
182    fn index(&self, index: usize) -> &Self::Output {
183        self.0.index(index)
184    }
185}
186
187impl std::ops::IndexMut<usize> for Block {
188    #[inline]
189    fn index_mut(&mut self, index: usize) -> &mut Self::Output {
190        self.0.index_mut(index)
191    }
192}
193
/// A split block Bloom filter.
///
/// The creation of this structure is based on the [`crate::file::properties::BloomFilterProperties`]
/// struct set via [`crate::file::properties::WriterProperties`] and is thus hidden by default.
#[derive(Debug, Clone)]
pub struct Sbbf(Vec<Block>);

/// Upper bound on the serialized header size: four thrift fields, each taking at
/// most 5 bytes when compact-encoded (see `test_bloom_filter_header_size_assumption`).
pub(crate) const SBBF_HEADER_SIZE_ESTIMATE: usize = 20;
202
203/// given an initial offset, and a byte buffer, try to read out a bloom filter header and return
204/// both the header and the offset after it (for bitset).
205pub(crate) fn chunk_read_bloom_filter_header_and_offset(
206    offset: u64,
207    buffer: Bytes,
208) -> Result<(BloomFilterHeader, u64), ParquetError> {
209    let (header, length) = read_bloom_filter_header_and_length(buffer)?;
210    Ok((header, offset + length))
211}
212
213/// given a [Bytes] buffer, try to read out a bloom filter header and return both the header and
214/// length of the header.
215#[inline]
216pub(crate) fn read_bloom_filter_header_and_length(
217    buffer: Bytes,
218) -> Result<(BloomFilterHeader, u64), ParquetError> {
219    read_bloom_filter_header_and_length_from_bytes(buffer.as_ref())
220}
221
222/// Given a byte slice, try to read out a bloom filter header and return both the header and
223/// length of the header.
224#[inline]
225fn read_bloom_filter_header_and_length_from_bytes(
226    buffer: &[u8],
227) -> Result<(BloomFilterHeader, u64), ParquetError> {
228    let total_length = buffer.len();
229    let mut prot = ThriftSliceInputProtocol::new(buffer);
230    let header = BloomFilterHeader::read_thrift(&mut prot)
231        .map_err(|e| ParquetError::General(format!("Could not read bloom filter header: {e}")))?;
232    Ok((header, (total_length - prot.as_slice().len()) as u64))
233}
234
/// The minimum number of bytes for a bloom filter bitset.
pub const BITSET_MIN_LENGTH: usize = 32;
/// The maximum number of bytes for a bloom filter bitset.
pub const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024;

/// Round the requested bitset size up to the next power of two, clamped to
/// `[BITSET_MIN_LENGTH, BITSET_MAX_LENGTH]`.
///
/// Both bounds are themselves powers of two, so clamping before rounding keeps
/// the result within the valid range (a value already at the max rounds to itself).
#[inline]
fn optimal_num_of_bytes(num_bytes: usize) -> usize {
    num_bytes
        .clamp(BITSET_MIN_LENGTH, BITSET_MAX_LENGTH)
        .next_power_of_two()
}
246
// See http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
// Given fpp = (1 - e^(-k * n / m)) ^ k, solving for m yields
// m = -k * n / ln(1 - fpp ^ (1 / k)),
// where k = number of hash functions, m = number of bits, n = number of distinct values.
#[inline]
fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
    // SBBF fixes k at 8 hash functions (one salt per word).
    let denominator = (1.0 - fpp.powf(1.0 / 8.0)).ln();
    let num_bits = -8.0 * ndv as f64 / denominator;
    // `as` saturates at usize::MAX and truncates the fractional part.
    num_bits as usize
}
256
impl Sbbf {
    /// Create a new [Sbbf] with given number of distinct values and false positive probability.
    /// Will return an error if `fpp` is greater than or equal to 1.0 or less than 0.0.
    pub fn new_with_ndv_fpp(ndv: u64, fpp: f64) -> Result<Self, ParquetError> {
        if !(0.0..1.0).contains(&fpp) {
            return Err(ParquetError::General(format!(
                "False positive probability must be between 0.0 and 1.0, got {fpp}"
            )));
        }
        // Convert (ndv, fpp) into a bit count, then down to bytes for sizing.
        let num_bits = num_of_bits_from_ndv_fpp(ndv, fpp);
        Ok(Self::new_with_num_of_bytes(num_bits / 8))
    }

    /// Create a new [Sbbf] with given number of bytes, the exact number of bytes will be adjusted
    /// to the next power of two bounded by [BITSET_MIN_LENGTH] and [BITSET_MAX_LENGTH].
    pub fn new_with_num_of_bytes(num_bytes: usize) -> Self {
        let num_bytes = optimal_num_of_bytes(num_bytes);
        // optimal_num_of_bytes returns a power of two >= 32, so it is always a
        // whole number of 32-byte blocks.
        assert_eq!(num_bytes % size_of::<Block>(), 0);
        let num_blocks = num_bytes / size_of::<Block>();
        let bitset = vec![Block::ZERO; num_blocks];
        Self(bitset)
    }

    /// Creates a new [Sbbf] from a raw byte slice.
    ///
    /// Words are decoded little-endian per the Parquet spec. Note that
    /// `chunks_exact` drops any trailing bytes that do not form a whole
    /// 32-byte block; a well-formed bitset is always a multiple of 32 bytes.
    pub fn new(bitset: &[u8]) -> Self {
        let data = bitset
            .chunks_exact(4 * 8)
            .map(|chunk| {
                let mut block = Block::ZERO;
                for (i, word) in chunk.chunks_exact(4).enumerate() {
                    block[i] = u32::from_le_bytes(word.try_into().unwrap());
                }
                block
            })
            .collect::<Vec<Block>>();
        Self(data)
    }

    /// Write the bloom filter data (header and then bitset) to the output. This doesn't
    /// flush the writer in order to boost performance of bulk writing all blocks. Caller
    /// must remember to flush the writer.
    /// This method usually is used in conjunction with [`Self::from_bytes`] for serialization/deserialization.
    pub fn write<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
        // Thrift-compact-encode the header first, then append the raw bitset.
        let mut protocol = ThriftCompactOutputProtocol::new(&mut writer);
        self.header().write_thrift(&mut protocol).map_err(|e| {
            ParquetError::General(format!("Could not write bloom filter header: {e}"))
        })?;
        self.write_bitset(&mut writer)?;
        Ok(())
    }

    /// Write the bitset in serialized form to the writer.
    ///
    /// Big-endian fallback: each block's words must be byte-swapped to
    /// little-endian before writing, so blocks are written one at a time.
    #[cfg(not(target_endian = "little"))]
    pub fn write_bitset<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
        for block in &self.0 {
            writer
                .write_all(block.to_le_bytes().as_slice())
                .map_err(|e| {
                    ParquetError::General(format!("Could not write bloom filter bit set: {e}"))
                })?;
        }
        Ok(())
    }

    /// Write the bitset in serialized form to the writer.
    ///
    /// Little-endian fast path: the in-memory representation already matches
    /// the on-disk byte order, so the whole block vector is written in one call.
    #[cfg(target_endian = "little")]
    pub fn write_bitset<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
        // Safety: Block is repr(transparent) and [u32; 8] can be reinterpreted as [u8; 32].
        let slice = unsafe {
            std::slice::from_raw_parts(
                self.0.as_ptr() as *const u8,
                self.0.len() * size_of::<Block>(),
            )
        };
        writer.write_all(slice).map_err(|e| {
            ParquetError::General(format!("Could not write bloom filter bit set: {e}"))
        })?;
        Ok(())
    }

    /// Create and populate [`BloomFilterHeader`] from this bitset for writing to serialized form
    fn header(&self) -> BloomFilterHeader {
        BloomFilterHeader {
            // 8 i32 per block, 4 bytes per i32
            num_bytes: self.0.len() as i32 * 4 * 8,
            algorithm: BloomFilterAlgorithm::BLOCK,
            hash: BloomFilterHash::XXHASH,
            compression: BloomFilterCompression::UNCOMPRESSED,
        }
    }

    /// Read a new bloom filter from the given offset in the given reader.
    ///
    /// Returns `Ok(None)` when the column has no bloom filter offset recorded.
    /// When the metadata carries a bloom filter length, header and bitset are
    /// fetched with a single read; otherwise the header is read using a size
    /// estimate and the bitset is fetched with a second read.
    pub fn read_from_column_chunk<R: ChunkReader>(
        column_metadata: &ColumnChunkMetaData,
        reader: &R,
    ) -> Result<Option<Self>, ParquetError> {
        let offset: u64 = if let Some(offset) = column_metadata.bloom_filter_offset() {
            offset
                .try_into()
                .map_err(|_| ParquetError::General("Bloom filter offset is invalid".to_string()))?
        } else {
            return Ok(None);
        };

        let buffer = match column_metadata.bloom_filter_length() {
            Some(length) => reader.get_bytes(offset, length as usize),
            None => reader.get_bytes(offset, SBBF_HEADER_SIZE_ESTIMATE),
        }?;

        let (header, bitset_offset) =
            chunk_read_bloom_filter_header_and_offset(offset, buffer.clone())?;

        match header.algorithm {
            BloomFilterAlgorithm::BLOCK => {
                // this match exists to future proof the singleton algorithm enum
            }
        }
        match header.compression {
            BloomFilterCompression::UNCOMPRESSED => {
                // this match exists to future proof the singleton compression enum
            }
        }
        match header.hash {
            BloomFilterHash::XXHASH => {
                // this match exists to future proof the singleton hash enum
            }
        }

        let bitset = match column_metadata.bloom_filter_length() {
            // Length known: bitset is the remainder of the already-read buffer.
            Some(_) => buffer.slice((bitset_offset - offset) as usize..),
            // Length unknown: fetch exactly `num_bytes` starting after the header.
            None => {
                let bitset_length: usize = header.num_bytes.try_into().map_err(|_| {
                    ParquetError::General("Bloom filter length is invalid".to_string())
                })?;
                reader.get_bytes(bitset_offset, bitset_length)?
            }
        };

        Ok(Some(Self::new(&bitset)))
    }

    /// Map the upper 32 bits of `hash` to a block index in `[0, num_blocks)`
    /// using the multiply-shift trick (avoids a modulo).
    #[inline]
    fn hash_to_block_index(&self, hash: u64) -> usize {
        // unchecked_mul is unstable, but in reality this is safe, we'd just use saturating mul
        // but it will not saturate
        (((hash >> 32).saturating_mul(self.0.len() as u64)) >> 32) as usize
    }

    /// Insert an [AsBytes] value into the filter
    pub fn insert<T: AsBytes + ?Sized>(&mut self, value: &T) {
        self.insert_hash(hash_as_bytes(value));
    }

    /// Insert a hash into the filter
    fn insert_hash(&mut self, hash: u64) {
        // Upper 32 bits pick the block; lower 32 bits drive the per-block mask.
        let block_index = self.hash_to_block_index(hash);
        self.0[block_index].insert(hash as u32)
    }

    /// Check if an [AsBytes] value is probably present or definitely absent in the filter
    pub fn check<T: AsBytes + ?Sized>(&self, value: &T) -> bool {
        self.check_hash(hash_as_bytes(value))
    }

    /// Check if a hash is in the filter. May return
    /// true for values that was never inserted ("false positive")
    /// but will always return false if a hash has not been inserted.
    fn check_hash(&self, hash: u64) -> bool {
        let block_index = self.hash_to_block_index(hash);
        self.0[block_index].check(hash as u32)
    }

    /// Return the total in memory size of this bloom filter in bytes
    pub(crate) fn estimated_memory_size(&self) -> usize {
        // Uses capacity (not len) to account for any over-allocation of the Vec.
        self.0.capacity() * std::mem::size_of::<Block>()
    }

    /// Reads a Sbff from Thrift encoded bytes
    ///
    /// # Examples
    ///
    /// ```no_run
    /// # use parquet::errors::Result;
    /// # use parquet::bloom_filter::Sbbf;
    /// # fn main() -> Result<()> {
    /// // In a real application, you would read serialized bloom filter bytes from a cache.
    /// // This example demonstrates the deserialization process.
    /// // Assuming you have bloom filter bytes from a Parquet file:
    /// # let serialized_bytes: Vec<u8> = vec![];
    /// let bloom_filter = Sbbf::from_bytes(&serialized_bytes)?;
    /// // Now you can use the bloom filter to check for values
    /// if bloom_filter.check(&"some_value") {
    ///     println!("Value might be present (or false positive)");
    /// } else {
    ///     println!("Value is definitely not present");
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub fn from_bytes(bytes: &[u8]) -> Result<Self, ParquetError> {
        let (header, header_len) = read_bloom_filter_header_and_length_from_bytes(bytes)?;

        let bitset_length: u64 = header
            .num_bytes
            .try_into()
            .map_err(|_| ParquetError::General("Bloom filter length is invalid".to_string()))?;

        // Validate that bitset consumes all remaining bytes
        if header_len + bitset_length != bytes.len() as u64 {
            return Err(ParquetError::General(format!(
                "Bloom filter data contains extra bytes: expected {} total bytes, got {}",
                header_len + bitset_length,
                bytes.len()
            )));
        }

        let start = header_len as usize;
        let end = (header_len + bitset_length) as usize;
        let bitset = bytes
            .get(start..end)
            .ok_or_else(|| ParquetError::General("Bloom filter bitset is invalid".to_string()))?;

        Ok(Self::new(bitset))
    }
}
482
// per spec we use xxHash with seed=0
const SEED: u64 = 0;

/// Hash a value's byte representation with xxHash64 using the spec-mandated seed.
#[inline]
fn hash_as_bytes<A: AsBytes + ?Sized>(value: &A) -> u64 {
    XxHash64::oneshot(SEED, value.as_bytes())
}
490
#[cfg(test)]
mod tests {
    use super::*;

    // Pins the xxHash64(seed=0) value for the empty input, guarding against
    // an accidental hasher or seed change.
    #[test]
    fn test_hash_bytes() {
        assert_eq!(hash_as_bytes(""), 17241709254077376921);
    }

    // Every word of a mask must have exactly one bit set (i.e. be a power of two).
    #[test]
    fn test_mask_set_quick_check() {
        for i in 0..1_000_000 {
            let result = Block::mask(i);
            assert!(result.0.iter().all(|&x| x.is_power_of_two()));
        }
    }

    // No false negatives at the block level: anything inserted must check true.
    #[test]
    fn test_block_insert_and_check() {
        for i in 0..1_000_000 {
            let mut block = Block::ZERO;
            block.insert(i);
            assert!(block.check(i));
        }
    }

    // No false negatives at the filter level across many blocks.
    #[test]
    fn test_sbbf_insert_and_check() {
        let mut sbbf = Sbbf(vec![Block::ZERO; 1_000]);
        for i in 0..1_000_000 {
            sbbf.insert(&i);
            assert!(sbbf.check(&i));
        }
    }

    // Cross-implementation compatibility check against a parquet-mr/spark fixture.
    #[test]
    fn test_with_fixture() {
        // bloom filter produced by parquet-mr/spark for a column of i64 f"a{i}" for i in 0..10
        let bitset: &[u8] = &[
            200, 1, 80, 20, 64, 68, 8, 109, 6, 37, 4, 67, 144, 80, 96, 32, 8, 132, 43, 33, 0, 5,
            99, 65, 2, 0, 224, 44, 64, 78, 96, 4,
        ];
        let sbbf = Sbbf::new(bitset);
        for a in 0..10i64 {
            let value = format!("a{a}");
            assert!(sbbf.check(&value.as_str()));
        }
    }

    /// test the assumption that bloom filter header size should not exceed SBBF_HEADER_SIZE_ESTIMATE
    /// essentially we are testing that the struct is packed with 4 i32 fields, each can be 1-5 bytes
    /// so altogether it'll be 20 bytes at most.
    #[test]
    fn test_bloom_filter_header_size_assumption() {
        let buffer: &[u8; 16] = &[21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 99];
        let (
            BloomFilterHeader {
                algorithm,
                compression,
                hash,
                num_bytes,
            },
            read_length,
        ) = read_bloom_filter_header_and_length(Bytes::copy_from_slice(buffer)).unwrap();
        assert_eq!(read_length, 15);
        assert_eq!(algorithm, BloomFilterAlgorithm::BLOCK);
        assert_eq!(compression, BloomFilterCompression::UNCOMPRESSED);
        assert_eq!(hash, BloomFilterHash::XXHASH);
        assert_eq!(num_bytes, 32_i32);
        assert_eq!(20, SBBF_HEADER_SIZE_ESTIMATE);
    }

    // Verifies clamping to [32, 128 MiB] and rounding up to a power of two.
    #[test]
    fn test_optimal_num_of_bytes() {
        for (input, expected) in &[
            (0, 32),
            (9, 32),
            (31, 32),
            (32, 32),
            (33, 64),
            (99, 128),
            (1024, 1024),
            (999_000_000, 128 * 1024 * 1024),
        ] {
            assert_eq!(*expected, optimal_num_of_bytes(*input));
        }
    }

    // Spot-checks the m = -k*n / ln(1 - fpp^(1/k)) sizing formula, including a
    // saturating-cast case (the 1e-50 row) where the result exceeds f64 precision.
    #[test]
    fn test_num_of_bits_from_ndv_fpp() {
        for (fpp, ndv, num_bits) in &[
            (0.1, 10, 57),
            (0.01, 10, 96),
            (0.001, 10, 146),
            (0.1, 100, 577),
            (0.01, 100, 968),
            (0.001, 100, 1460),
            (0.1, 1000, 5772),
            (0.01, 1000, 9681),
            (0.001, 1000, 14607),
            (0.1, 10000, 57725),
            (0.01, 10000, 96815),
            (0.001, 10000, 146076),
            (0.1, 100000, 577254),
            (0.01, 100000, 968152),
            (0.001, 100000, 1460769),
            (0.1, 1000000, 5772541),
            (0.01, 1000000, 9681526),
            (0.001, 1000000, 14607697),
            (1e-50, 1_000_000_000_000, 14226231280773240832),
        ] {
            assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64);
        }
    }

    // write() followed by from_bytes() must preserve header fields and membership.
    #[test]
    fn test_sbbf_write_round_trip() {
        // Create a bloom filter with a 32-byte bitset (minimum size)
        let bitset_bytes = vec![0u8; 32];
        let mut original = Sbbf::new(&bitset_bytes);

        // Insert some test values
        let test_values = ["hello", "world", "rust", "parquet", "bloom", "filter"];
        for value in &test_values {
            original.insert(value);
        }

        // Serialize to bytes
        let mut output = Vec::new();
        original.write(&mut output).unwrap();

        // Validate header was written correctly
        let mut protocol = ThriftSliceInputProtocol::new(&output);
        let header = BloomFilterHeader::read_thrift(&mut protocol).unwrap();
        assert_eq!(header.num_bytes, bitset_bytes.len() as i32);
        assert_eq!(header.algorithm, BloomFilterAlgorithm::BLOCK);
        assert_eq!(header.hash, BloomFilterHash::XXHASH);
        assert_eq!(header.compression, BloomFilterCompression::UNCOMPRESSED);

        // Deserialize using from_bytes
        let reconstructed = Sbbf::from_bytes(&output).unwrap();

        // Most importantly: verify the bloom filter WORKS correctly after round-trip
        // Note: bloom filters can have false positives, but should never have false negatives
        // So we can't assert !check(), but we should verify inserted values are found
        for value in &test_values {
            assert!(
                reconstructed.check(value),
                "Value '{}' should be present after round-trip",
                value
            );
        }
    }
}