arrow_buffer/util/bit_iterator.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Types for iterating over packed bitmasks

use crate::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator};
use crate::bit_util::{ceil, get_bit_raw};

/// Iterator over the bits within a packed bitmask
///
/// To efficiently iterate over just the set bits see [`BitIndexIterator`] and [`BitSliceIterator`]
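///
/// # Example
///
/// A minimal usage sketch (assuming this module is re-exported as
/// `arrow_buffer::bit_iterator`); the mask bytes are illustrative:
///
/// ```
/// # use arrow_buffer::bit_iterator::BitIterator;
/// let mask = [0b0000_0110u8]; // bits 1 and 2 set
/// let bits: Vec<bool> = BitIterator::new(&mask, 0, 3).collect();
/// assert_eq!(bits, vec![false, true, true]);
///
/// // `BitIterator` is double-ended, so it can also be reversed
/// let bits: Vec<bool> = BitIterator::new(&mask, 0, 3).rev().collect();
/// assert_eq!(bits, vec![true, true, false]);
/// ```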
pub struct BitIterator<'a> {
    buffer: &'a [u8],
    current_offset: usize,
    end_offset: usize,
}

impl<'a> BitIterator<'a> {
    /// Create a new [`BitIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    ///
    /// # Panics
    ///
    /// Panics if `buffer` is too short for the provided `offset` and `len`
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let end_offset = offset.checked_add(len).unwrap();
        let required_len = ceil(end_offset, 8);
        assert!(
            buffer.len() >= required_len,
            "BitIterator buffer too small, expected {required_len} got {}",
            buffer.len()
        );

        Self {
            buffer,
            current_offset: offset,
            end_offset,
        }
    }
}

impl Iterator for BitIterator<'_> {
    type Item = bool;

    fn next(&mut self) -> Option<Self::Item> {
        if self.current_offset == self.end_offset {
            return None;
        }
        // Safety: `current_offset < end_offset <= buffer.len() * 8`,
        // as verified by the assertion in `new`
        let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.current_offset) };
        self.current_offset += 1;
        Some(v)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining_bits = self.end_offset - self.current_offset;
        (remaining_bits, Some(remaining_bits))
    }
}

impl ExactSizeIterator for BitIterator<'_> {}

impl DoubleEndedIterator for BitIterator<'_> {
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.current_offset == self.end_offset {
            return None;
        }
        self.end_offset -= 1;
        // Safety: `current_offset <= end_offset < buffer.len() * 8`,
        // as verified by the assertion in `new`
        let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.end_offset) };
        Some(v)
    }
}

/// Iterator over contiguous ranges of set bits within a provided packed bitmask
///
/// Yields `(usize, usize)` pairs, each representing an interval where the corresponding
/// bits in the provided mask are set. The first value is the start of the range (inclusive)
/// and the second value is the end of the range (exclusive)
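///
/// # Example
///
/// A minimal usage sketch (assuming this module is re-exported as
/// `arrow_buffer::bit_iterator`); the mask byte is illustrative:
///
/// ```
/// # use arrow_buffer::bit_iterator::BitSliceIterator;
/// let mask = [0b0011_1100u8]; // bits 2 through 5 set
/// let ranges: Vec<(usize, usize)> = BitSliceIterator::new(&mask, 0, 8).collect();
/// assert_eq!(ranges, vec![(2, 6)]); // start inclusive, end exclusive
/// ```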
#[derive(Debug)]
pub struct BitSliceIterator<'a> {
    iter: UnalignedBitChunkIterator<'a>,
    len: usize,
    current_offset: i64,
    current_chunk: u64,
}

impl<'a> BitSliceIterator<'a> {
    /// Create a new [`BitSliceIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let chunk = UnalignedBitChunk::new(buffer, offset, len);
        let mut iter = chunk.iter();

        let current_offset = -(chunk.lead_padding() as i64);
        let current_chunk = iter.next().unwrap_or(0);

        Self {
            iter,
            len,
            current_offset,
            current_chunk,
        }
    }

    /// Returns `Some((chunk_offset, bit_offset))` for the next chunk that has at
    /// least one bit set, or `None` if there is no such chunk.
    ///
    /// Here `chunk_offset` is the bit offset of the current `u64` chunk
    /// and `bit_offset` is the offset of the first `1` bit in that chunk
    fn advance_to_set_bit(&mut self) -> Option<(i64, u32)> {
        loop {
            if self.current_chunk != 0 {
                // Find the index of the first 1
                let bit_pos = self.current_chunk.trailing_zeros();
                return Some((self.current_offset, bit_pos));
            }

            self.current_chunk = self.iter.next()?;
            self.current_offset += 64;
        }
    }
}

impl Iterator for BitSliceIterator<'_> {
    type Item = (usize, usize);

    fn next(&mut self) -> Option<Self::Item> {
        // Used as the termination condition
        if self.len == 0 {
            return None;
        }

        let (start_chunk, start_bit) = self.advance_to_set_bit()?;

        // Set the bits below `start_bit`, so that `trailing_ones` below
        // finds the first 0 at or after `start_bit`
        self.current_chunk |= (1 << start_bit) - 1;

        loop {
            if self.current_chunk != u64::MAX {
                // Find the index of the first 0
                let end_bit = self.current_chunk.trailing_ones();

                // Zero out the bits below `end_bit`
                self.current_chunk &= !((1 << end_bit) - 1);

                return Some((
                    (start_chunk + start_bit as i64) as usize,
                    (self.current_offset + end_bit as i64) as usize,
                ));
            }

            match self.iter.next() {
                Some(next) => {
                    self.current_chunk = next;
                    self.current_offset += 64;
                }
                None => {
                    // The run of set bits extends to the end of the mask
                    return Some((
                        (start_chunk + start_bit as i64) as usize,
                        std::mem::replace(&mut self.len, 0),
                    ));
                }
            }
        }
    }
}

/// An iterator over the `usize` indexes of the set bits in a provided bitmask
///
/// This provides the best performance on most masks, apart from those which contain
/// large runs and therefore favour [`BitSliceIterator`]
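///
/// # Example
///
/// A minimal usage sketch (assuming the `arrow_buffer::bit_iterator` re-export,
/// as above):
///
/// ```
/// # use arrow_buffer::bit_iterator::BitIndexIterator;
/// let mask = [0b0001_0010u8]; // bits 1 and 4 set
/// let indexes: Vec<usize> = BitIndexIterator::new(&mask, 0, 8).collect();
/// assert_eq!(indexes, vec![1, 4]);
/// ```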
#[derive(Debug)]
pub struct BitIndexIterator<'a> {
    current_chunk: u64,
    chunk_offset: i64,
    iter: UnalignedBitChunkIterator<'a>,
}

impl<'a> BitIndexIterator<'a> {
    /// Create a new [`BitIndexIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let chunks = UnalignedBitChunk::new(buffer, offset, len);
        let mut iter = chunks.iter();

        let current_chunk = iter.next().unwrap_or(0);
        let chunk_offset = -(chunks.lead_padding() as i64);

        Self {
            current_chunk,
            chunk_offset,
            iter,
        }
    }
}

impl Iterator for BitIndexIterator<'_> {
    type Item = usize;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if self.current_chunk != 0 {
                let bit_pos = self.current_chunk.trailing_zeros();
                self.current_chunk ^= 1 << bit_pos;
                return Some((self.chunk_offset + bit_pos as i64) as usize);
            }

            self.current_chunk = self.iter.next()?;
            self.chunk_offset += 64;
        }
    }
}

/// An iterator over the `u32` indexes of the set bits in a provided bitmask
///
/// Respects arbitrary offsets and slice lead/trail padding, exactly like [`BitIndexIterator`]
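///
/// # Example
///
/// A minimal usage sketch (assuming the `arrow_buffer::bit_iterator` re-export,
/// as above):
///
/// ```
/// # use arrow_buffer::bit_iterator::BitIndexU32Iterator;
/// let mask = [0b0001_0010u8]; // bits 1 and 4 set
/// let indexes: Vec<u32> = BitIndexU32Iterator::new(&mask, 0, 8).collect();
/// assert_eq!(indexes, vec![1, 4]);
/// ```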
#[derive(Debug)]
pub struct BitIndexU32Iterator<'a> {
    curr: u64,
    chunk_offset: i64,
    iter: UnalignedBitChunkIterator<'a>,
}

impl<'a> BitIndexU32Iterator<'a> {
    /// Create a new [`BitIndexU32Iterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        // Build the aligned chunks (with prefix/suffix bits masked off)
        let chunks = UnalignedBitChunk::new(buffer, offset, len);
        let mut iter = chunks.iter();

        // First 64-bit word (masked for lead padding), or 0 if empty
        let curr = iter.next().unwrap_or(0);
        // Negative lead padding ensures the first bit in `curr` maps to index 0
        let chunk_offset = -(chunks.lead_padding() as i64);

        Self {
            curr,
            chunk_offset,
            iter,
        }
    }
}

impl Iterator for BitIndexU32Iterator<'_> {
    type Item = u32;

    #[inline(always)]
    fn next(&mut self) -> Option<u32> {
        loop {
            if self.curr != 0 {
                // Position of the least-significant set bit
                let tz = self.curr.trailing_zeros();
                // Clear that bit
                self.curr &= self.curr - 1;
                // Return the global index: chunk_offset + tz
                return Some((self.chunk_offset + tz as i64) as u32);
            }
            // Advance to the next 64-bit chunk
            self.curr = self.iter.next()?;
            self.chunk_offset += 64;
        }
    }
}

/// Calls the provided closure for each index in the provided null mask that is set,
/// using an adaptive strategy based on the null count
///
/// Ideally this would be encapsulated in an [`Iterator`] that would determine the optimal
/// strategy up front, and then yield indexes based on this.
///
/// Unfortunately, external iteration based on the resulting [`Iterator`] would match the strategy
/// variant on each call to [`Iterator::next`], and LLVM generally cannot eliminate this.
///
/// One solution to this might be internal iteration, e.g. [`Iterator::try_fold`], however,
/// it is currently [not possible] to override this for custom iterators in stable Rust.
///
/// As such this is the next best option
///
/// [not possible]: https://github.com/rust-lang/rust/issues/69595
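///
/// # Example
///
/// A minimal usage sketch (assuming the `arrow_buffer::bit_iterator` re-export,
/// as above); the mask below marks index 2 as null:
///
/// ```
/// # use arrow_buffer::bit_iterator::try_for_each_valid_idx;
/// let nulls: &[u8] = &[0b0000_1011]; // bits 0, 1 and 3 set => those indexes are valid
/// let mut valid = Vec::new();
/// let r: Result<(), ()> = try_for_each_valid_idx(4, 0, 1, Some(nulls), |idx| {
///     valid.push(idx);
///     Ok(())
/// });
/// assert!(r.is_ok());
/// assert_eq!(valid, vec![0, 1, 3]);
/// ```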
#[inline]
pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>(
    len: usize,
    offset: usize,
    null_count: usize,
    nulls: Option<&[u8]>,
    f: F,
) -> Result<(), E> {
    let valid_count = len - null_count;

    if valid_count == len {
        // All indexes are valid: skip the bitmask entirely
        (0..len).try_for_each(f)
    } else if null_count != len {
        // Some indexes are valid: iterate the set bits of the mask
        BitIndexIterator::new(nulls.unwrap(), offset, len).try_for_each(f)
    } else {
        // All indexes are null: nothing to do
        Ok(())
    }
}

// Note: further tests are located in the `arrow_select::filter` module

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bit_iterator_size_hint() {
        let mut b = BitIterator::new(&[0b00000011], 0, 2);
        assert_eq!(
            b.size_hint(),
            (2, Some(2)),
            "Expected size_hint to be (2, Some(2))"
        );

        b.next();
        assert_eq!(
            b.size_hint(),
            (1, Some(1)),
            "Expected size_hint to be (1, Some(1)) after one bit consumed"
        );

        b.next();
        assert_eq!(
            b.size_hint(),
            (0, Some(0)),
            "Expected size_hint to be (0, Some(0)) after all bits consumed"
        );
    }

    #[test]
    fn test_bit_iterator() {
        let mask = &[0b00010010, 0b00100011, 0b00000101, 0b00010001, 0b10010011];
        let actual: Vec<_> = BitIterator::new(mask, 0, 5).collect();
        assert_eq!(actual, &[false, true, false, false, true]);

        let actual: Vec<_> = BitIterator::new(mask, 4, 5).collect();
        assert_eq!(actual, &[true, false, false, false, true]);

        let actual: Vec<_> = BitIterator::new(mask, 12, 14).collect();
        assert_eq!(
            actual,
            &[
                false, true, false, false, true, false, true, false, false, false, false, false,
                true, false
            ]
        );

        assert_eq!(BitIterator::new(mask, 0, 0).count(), 0);
        assert_eq!(BitIterator::new(mask, 40, 0).count(), 0);
    }

    #[test]
    #[should_panic(expected = "BitIterator buffer too small, expected 3 got 2")]
    fn test_bit_iterator_bounds() {
        let mask = &[223, 23];
        BitIterator::new(mask, 17, 0);
    }

    #[test]
    fn test_bit_index_u32_iterator_basic() {
        let mask = &[0b00010010, 0b00100011];

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 4, 8).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 4, 8)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 10, 4).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 10, 4)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 0).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 0)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_iterator_all_set() {
        let mask = &[0xFF, 0xFF];
        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_iterator_none_set() {
        let mask = &[0x00, 0x00];
        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_cross_chunk() {
        let mut buf = vec![0u8; 16];
        for bit in 60..68 {
            let byte = (bit / 8) as usize;
            let bit_in_byte = bit % 8;
            buf[byte] |= 1 << bit_in_byte;
        }
        let offset = 58;
        let len = 10;

        let result: Vec<u32> = BitIndexU32Iterator::new(&buf, offset, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(&buf, offset, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_unaligned_offset() {
        let mask = &[0b0110_1100, 0b1010_0000];
        let offset = 2;
        let len = 12;

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, offset, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, offset, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_long_all_set() {
        let len = 200;
        let num_bytes = ceil(len, 8);
        let bytes = vec![0xFFu8; num_bytes];

        let result: Vec<u32> = BitIndexU32Iterator::new(&bytes, 0, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(&bytes, 0, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_none_set() {
        let len = 50;
        let num_bytes = ceil(len, 8);
        let bytes = vec![0u8; num_bytes];

        let result: Vec<u32> = BitIndexU32Iterator::new(&bytes, 0, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(&bytes, 0, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }
}