Skip to main content

arrow_buffer/buffer/
null.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator};
19use crate::buffer::BooleanBuffer;
20use crate::{Buffer, MutableBuffer};
21
22/// A [`BooleanBuffer`] used to encode validity (null values) for Arrow arrays
23///
24/// In the [Arrow specification], array validity is encoded in a packed bitmask with a
25/// `true` value indicating the corresponding slot is not null, and `false` indicating
26/// that it is null.
27///
28/// # See also
29/// * [`NullBufferBuilder`] for creating `NullBuffer`s
30///
31/// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
32/// [`NullBufferBuilder`]: crate::NullBufferBuilder
33#[derive(Debug, Clone, Eq, PartialEq)]
34pub struct NullBuffer {
35    buffer: BooleanBuffer,
36    null_count: usize,
37}
38
39impl NullBuffer {
40    /// Create a new [`NullBuffer`] computing the null count
41    pub fn new(buffer: BooleanBuffer) -> Self {
42        let null_count = buffer.len() - buffer.count_set_bits();
43        Self { buffer, null_count }
44    }
45
46    /// Create a new [`NullBuffer`] of length `len` where all values are null
47    pub fn new_null(len: usize) -> Self {
48        Self {
49            buffer: BooleanBuffer::new_unset(len),
50            null_count: len,
51        }
52    }
53
54    /// Create a new [`NullBuffer`] of length `len` where all values are valid
55    ///
56    /// Note: it is more efficient to not set the null buffer if it is known to
57    /// be all valid (aka all values are not null)
58    pub fn new_valid(len: usize) -> Self {
59        Self {
60            buffer: BooleanBuffer::new_set(len),
61            null_count: 0,
62        }
63    }
64
65    /// Create a new [`NullBuffer`] with the provided `buffer` and `null_count`
66    ///
67    /// # Safety
68    ///
69    /// `buffer` must contain `null_count` `0` bits
70    pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self {
71        Self { buffer, null_count }
72    }
73
74    /// Computes the union of the nulls in two optional [`NullBuffer`]
75    ///
76    /// This is commonly used by binary operations where the result is NULL if either
77    /// of the input values is NULL. Handling the null mask separately in this way
78    /// can yield significant performance improvements over an iterator approach
79    pub fn union(lhs: Option<&NullBuffer>, rhs: Option<&NullBuffer>) -> Option<NullBuffer> {
80        match (lhs, rhs) {
81            (Some(lhs), Some(rhs)) => Some(Self::new(lhs.inner() & rhs.inner())),
82            (Some(n), None) | (None, Some(n)) => Some(n.clone()),
83            (None, None) => None,
84        }
85    }
86
87    /// Computes the union of the nulls in multiple optional [`NullBuffer`]s
88    ///
89    /// See [`union`](Self::union)
90    pub fn union_many<'a>(
91        nulls: impl IntoIterator<Item = Option<&'a NullBuffer>>,
92    ) -> Option<NullBuffer> {
93        // Unwrap to BooleanBuffer because BitAndAssign is not implemented for NullBuffer
94        let mut buffers = nulls.into_iter().filter_map(|nb| nb.map(NullBuffer::inner));
95        let first = buffers.next()?;
96        let mut result = first.clone();
97        for buf in buffers {
98            result &= buf;
99        }
100        Some(Self::new(result))
101    }
102
103    /// Returns true if all nulls in `other` also exist in self
104    pub fn contains(&self, other: &NullBuffer) -> bool {
105        if other.null_count == 0 {
106            return true;
107        }
108        let lhs = self.inner().bit_chunks().iter_padded();
109        let rhs = other.inner().bit_chunks().iter_padded();
110        lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
111    }
112
113    /// Returns a new [`NullBuffer`] where each bit in the current null buffer
114    /// is repeated `count` times. This is useful for masking the nulls of
115    /// the child of a FixedSizeListArray based on its parent
116    pub fn expand(&self, count: usize) -> Self {
117        let capacity = self.buffer.len().checked_mul(count).unwrap();
118        let mut buffer = MutableBuffer::new_null(capacity);
119
120        // Expand each bit within `null_mask` into `element_len`
121        // bits, constructing the implicit mask of the child elements
122        for i in 0..self.buffer.len() {
123            if self.is_null(i) {
124                continue;
125            }
126            for j in 0..count {
127                crate::bit_util::set_bit(buffer.as_mut(), i * count + j)
128            }
129        }
130        Self {
131            buffer: BooleanBuffer::new(buffer.into(), 0, capacity),
132            null_count: self.null_count * count,
133        }
134    }
135
136    /// Returns the length of this [`NullBuffer`] in bits
137    #[inline]
138    pub fn len(&self) -> usize {
139        self.buffer.len()
140    }
141
142    /// Returns the offset of this [`NullBuffer`] in bits
143    #[inline]
144    pub fn offset(&self) -> usize {
145        self.buffer.offset()
146    }
147
148    /// Returns true if this [`NullBuffer`] is empty
149    #[inline]
150    pub fn is_empty(&self) -> bool {
151        self.buffer.is_empty()
152    }
153
154    /// Free up unused memory.
155    pub fn shrink_to_fit(&mut self) {
156        self.buffer.shrink_to_fit();
157    }
158
159    /// Returns the null count for this [`NullBuffer`]
160    #[inline]
161    pub fn null_count(&self) -> usize {
162        self.null_count
163    }
164
165    /// Returns `true` if the value at `idx` is not null
166    #[inline]
167    pub fn is_valid(&self, idx: usize) -> bool {
168        self.buffer.value(idx)
169    }
170
171    /// Returns `true` if the value at `idx` is null
172    #[inline]
173    pub fn is_null(&self, idx: usize) -> bool {
174        !self.is_valid(idx)
175    }
176
177    /// Returns the packed validity of this [`NullBuffer`] not including any offset
178    #[inline]
179    pub fn validity(&self) -> &[u8] {
180        self.buffer.values()
181    }
182
183    /// Slices this [`NullBuffer`] by the provided `offset` and `length`
184    pub fn slice(&self, offset: usize, len: usize) -> Self {
185        Self::new(self.buffer.slice(offset, len))
186    }
187
188    /// Returns an iterator over the bits in this [`NullBuffer`]
189    ///
190    /// * `true` indicates that the corresponding value is not NULL
191    /// * `false` indicates that the corresponding value is NULL
192    ///
193    /// Note: [`Self::valid_indices`] will be significantly faster for most use-cases
194    pub fn iter(&self) -> BitIterator<'_> {
195        self.buffer.iter()
196    }
197
198    /// Returns a [`BitIndexIterator`] over the valid indices in this [`NullBuffer`]
199    ///
200    /// Valid indices indicate the corresponding value is not NULL
201    pub fn valid_indices(&self) -> BitIndexIterator<'_> {
202        self.buffer.set_indices()
203    }
204
205    /// Returns a [`BitSliceIterator`] yielding contiguous ranges of valid indices
206    ///
207    /// Valid indices indicate the corresponding value is not NULL
208    pub fn valid_slices(&self) -> BitSliceIterator<'_> {
209        self.buffer.set_slices()
210    }
211
212    /// Calls the provided closure for each index in this null mask that is set
213    #[inline]
214    pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>(
215        &self,
216        f: F,
217    ) -> Result<(), E> {
218        if self.null_count == self.len() {
219            return Ok(());
220        }
221        self.valid_indices().try_for_each(f)
222    }
223
224    /// Returns the inner [`BooleanBuffer`]
225    #[inline]
226    pub fn inner(&self) -> &BooleanBuffer {
227        &self.buffer
228    }
229
230    /// Returns the inner [`BooleanBuffer`]
231    #[inline]
232    pub fn into_inner(self) -> BooleanBuffer {
233        self.buffer
234    }
235
236    /// Returns the underlying [`Buffer`]
237    #[inline]
238    pub fn buffer(&self) -> &Buffer {
239        self.buffer.inner()
240    }
241
242    /// Create a [`NullBuffer`] from an *unsliced* validity bitmap (`offset = 0` **bits**) of length `len`.
243    ///
244    /// Returns `None` if there are no nulls (all values valid).
245    pub fn from_unsliced_buffer(buffer: impl Into<Buffer>, len: usize) -> Option<Self> {
246        let bb = BooleanBuffer::new(buffer.into(), 0, len);
247        let nb = NullBuffer::new(bb);
248        (nb.null_count() > 0).then_some(nb)
249    }
250
251    /// Claim memory used by this null buffer in the provided memory pool.
252    #[cfg(feature = "pool")]
253    pub fn claim(&self, pool: &dyn crate::MemoryPool) {
254        // NullBuffer wraps a BooleanBuffer which wraps a Buffer
255        self.buffer.inner().claim(pool);
256    }
257}
258
259impl<'a> IntoIterator for &'a NullBuffer {
260    type Item = bool;
261    type IntoIter = BitIterator<'a>;
262
263    fn into_iter(self) -> Self::IntoIter {
264        self.buffer.iter()
265    }
266}
267
268impl From<BooleanBuffer> for NullBuffer {
269    fn from(value: BooleanBuffer) -> Self {
270        Self::new(value)
271    }
272}
273
274impl From<&[bool]> for NullBuffer {
275    fn from(value: &[bool]) -> Self {
276        BooleanBuffer::from(value).into()
277    }
278}
279
280impl<const N: usize> From<&[bool; N]> for NullBuffer {
281    fn from(value: &[bool; N]) -> Self {
282        value[..].into()
283    }
284}
285
286impl From<Vec<bool>> for NullBuffer {
287    fn from(value: Vec<bool>) -> Self {
288        BooleanBuffer::from(value).into()
289    }
290}
291
292impl FromIterator<bool> for NullBuffer {
293    fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self {
294        BooleanBuffer::from_iter(iter).into()
295    }
296}
297
298#[cfg(test)]
299mod tests {
300    use super::*;
301
302    #[test]
303    fn test_size() {
304        // This tests that the niche optimisation eliminates the overhead of an option
305        assert_eq!(
306            std::mem::size_of::<NullBuffer>(),
307            std::mem::size_of::<Option<NullBuffer>>()
308        );
309    }
310
311    #[test]
312    fn test_from_unsliced_buffer_with_nulls() {
313        // 0b10110010 → null(0), valid(1), null(2), null(3), valid(4), valid(5), null(6), valid(7)
314        let buf = Buffer::from([0b10110010u8]);
315        let result = NullBuffer::from_unsliced_buffer(buf, 8);
316        assert!(result.is_some());
317        let nb = result.unwrap();
318        assert_eq!(nb.len(), 8);
319        assert_eq!(nb.null_count(), 4);
320        assert!(nb.is_null(0));
321        assert!(nb.is_valid(1));
322        assert!(nb.is_null(2));
323        assert!(nb.is_null(3));
324        assert!(nb.is_valid(4));
325        assert!(nb.is_valid(5));
326        assert!(nb.is_null(6));
327        assert!(nb.is_valid(7));
328    }
329
330    #[test]
331    fn test_from_unsliced_buffer_all_valid() {
332        // All bits set = all valid, no nulls
333        let buf = Buffer::from([0b11111111u8]);
334        let result = NullBuffer::from_unsliced_buffer(buf, 8);
335        assert!(result.is_none());
336    }
337
338    #[test]
339    fn test_from_unsliced_buffer_all_null() {
340        // No bits set = all null
341        let buf = Buffer::from([0b00000000u8]);
342        let result = NullBuffer::from_unsliced_buffer(buf, 8);
343        assert!(result.is_some());
344        let nb = result.unwrap();
345        assert_eq!(nb.len(), 8);
346        assert_eq!(nb.null_count(), 8);
347    }
348
349    #[test]
350    fn test_from_unsliced_buffer_empty() {
351        let buf = Buffer::from([]);
352        let result = NullBuffer::from_unsliced_buffer(buf, 0);
353        assert!(result.is_none());
354    }
355
356    #[test]
357    fn test_union_many_all_none() {
358        let result = NullBuffer::union_many([None, None, None]);
359        assert!(result.is_none());
360    }
361
362    #[test]
363    fn test_union_many_single_some() {
364        let a = NullBuffer::from(&[true, false, true, true]);
365        let result = NullBuffer::union_many([Some(&a)]);
366        assert_eq!(result, Some(a));
367    }
368
369    #[test]
370    fn test_union_many_two_inputs() {
371        let a = NullBuffer::from(&[true, false, true, true]);
372        let b = NullBuffer::from(&[true, true, false, true]);
373        let result = NullBuffer::union_many([Some(&a), Some(&b)]);
374        let expected = NullBuffer::union(Some(&a), Some(&b));
375        assert_eq!(result, expected);
376    }
377
378    #[test]
379    fn test_union_many_three_inputs() {
380        let a = NullBuffer::from(&[true, false, true, true]);
381        let b = NullBuffer::from(&[true, true, false, true]);
382        let c = NullBuffer::from(&[false, true, true, true]);
383        let result = NullBuffer::union_many([Some(&a), Some(&b), Some(&c)]);
384        let expected = NullBuffer::from(&[false, false, false, true]);
385        assert_eq!(result, Some(expected));
386    }
387
388    #[test]
389    fn test_union_many_mixed_none() {
390        let a = NullBuffer::from(&[true, false, true, true]);
391        let b = NullBuffer::from(&[false, true, true, true]);
392        let result = NullBuffer::union_many([Some(&a), None, Some(&b)]);
393        let expected = NullBuffer::union(Some(&a), Some(&b));
394        assert_eq!(result, expected);
395    }
396
397    #[test]
398    fn test_union_many_empty_slice() {
399        let result = NullBuffer::union_many([] as [Option<&NullBuffer>; 0]);
400        assert!(result.is_none());
401    }
402}