arrow_buffer/buffer/null.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator};
19use crate::buffer::BooleanBuffer;
20use crate::{Buffer, MutableBuffer};
21
22/// A [`BooleanBuffer`] used to encode validity (null values) for Arrow arrays
23///
24/// In the [Arrow specification], array validity is encoded in a packed bitmask with a
25/// `true` value indicating the corresponding slot is not null, and `false` indicating
26/// that it is null.
27///
28/// # See also
29/// * [`NullBufferBuilder`] for creating `NullBuffer`s
30///
31/// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
32/// [`NullBufferBuilder`]: crate::NullBufferBuilder
33#[derive(Debug, Clone, Eq, PartialEq)]
34pub struct NullBuffer {
35 buffer: BooleanBuffer,
36 null_count: usize,
37}
38
39impl NullBuffer {
40 /// Create a new [`NullBuffer`] computing the null count
41 pub fn new(buffer: BooleanBuffer) -> Self {
42 let null_count = buffer.len() - buffer.count_set_bits();
43 Self { buffer, null_count }
44 }
45
46 /// Create a new [`NullBuffer`] of length `len` where all values are null
47 pub fn new_null(len: usize) -> Self {
48 Self {
49 buffer: BooleanBuffer::new_unset(len),
50 null_count: len,
51 }
52 }
53
54 /// Create a new [`NullBuffer`] of length `len` where all values are valid
55 ///
56 /// Note: it is more efficient to not set the null buffer if it is known to
57 /// be all valid (aka all values are not null)
58 pub fn new_valid(len: usize) -> Self {
59 Self {
60 buffer: BooleanBuffer::new_set(len),
61 null_count: 0,
62 }
63 }
64
65 /// Create a new [`NullBuffer`] with the provided `buffer` and `null_count`
66 ///
67 /// # Safety
68 ///
69 /// `buffer` must contain `null_count` `0` bits
70 pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self {
71 Self { buffer, null_count }
72 }
73
74 /// Computes the union of the nulls in two optional [`NullBuffer`]
75 ///
76 /// This is commonly used by binary operations where the result is NULL if either
77 /// of the input values is NULL. Handling the null mask separately in this way
78 /// can yield significant performance improvements over an iterator approach
79 pub fn union(lhs: Option<&NullBuffer>, rhs: Option<&NullBuffer>) -> Option<NullBuffer> {
80 match (lhs, rhs) {
81 (Some(lhs), Some(rhs)) => Some(Self::new(lhs.inner() & rhs.inner())),
82 (Some(n), None) | (None, Some(n)) => Some(n.clone()),
83 (None, None) => None,
84 }
85 }
86
87 /// Returns true if all nulls in `other` also exist in self
88 pub fn contains(&self, other: &NullBuffer) -> bool {
89 if other.null_count == 0 {
90 return true;
91 }
92 let lhs = self.inner().bit_chunks().iter_padded();
93 let rhs = other.inner().bit_chunks().iter_padded();
94 lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
95 }
96
97 /// Returns a new [`NullBuffer`] where each bit in the current null buffer
98 /// is repeated `count` times. This is useful for masking the nulls of
99 /// the child of a FixedSizeListArray based on its parent
100 pub fn expand(&self, count: usize) -> Self {
101 let capacity = self.buffer.len().checked_mul(count).unwrap();
102 let mut buffer = MutableBuffer::new_null(capacity);
103
104 // Expand each bit within `null_mask` into `element_len`
105 // bits, constructing the implicit mask of the child elements
106 for i in 0..self.buffer.len() {
107 if self.is_null(i) {
108 continue;
109 }
110 for j in 0..count {
111 crate::bit_util::set_bit(buffer.as_mut(), i * count + j)
112 }
113 }
114 Self {
115 buffer: BooleanBuffer::new(buffer.into(), 0, capacity),
116 null_count: self.null_count * count,
117 }
118 }
119
120 /// Returns the length of this [`NullBuffer`] in bits
121 #[inline]
122 pub fn len(&self) -> usize {
123 self.buffer.len()
124 }
125
126 /// Returns the offset of this [`NullBuffer`] in bits
127 #[inline]
128 pub fn offset(&self) -> usize {
129 self.buffer.offset()
130 }
131
132 /// Returns true if this [`NullBuffer`] is empty
133 #[inline]
134 pub fn is_empty(&self) -> bool {
135 self.buffer.is_empty()
136 }
137
138 /// Free up unused memory.
139 pub fn shrink_to_fit(&mut self) {
140 self.buffer.shrink_to_fit();
141 }
142
143 /// Returns the null count for this [`NullBuffer`]
144 #[inline]
145 pub fn null_count(&self) -> usize {
146 self.null_count
147 }
148
149 /// Returns `true` if the value at `idx` is not null
150 #[inline]
151 pub fn is_valid(&self, idx: usize) -> bool {
152 self.buffer.value(idx)
153 }
154
155 /// Returns `true` if the value at `idx` is null
156 #[inline]
157 pub fn is_null(&self, idx: usize) -> bool {
158 !self.is_valid(idx)
159 }
160
161 /// Returns the packed validity of this [`NullBuffer`] not including any offset
162 #[inline]
163 pub fn validity(&self) -> &[u8] {
164 self.buffer.values()
165 }
166
167 /// Slices this [`NullBuffer`] by the provided `offset` and `length`
168 pub fn slice(&self, offset: usize, len: usize) -> Self {
169 Self::new(self.buffer.slice(offset, len))
170 }
171
172 /// Returns an iterator over the bits in this [`NullBuffer`]
173 ///
174 /// * `true` indicates that the corresponding value is not NULL
175 /// * `false` indicates that the corresponding value is NULL
176 ///
177 /// Note: [`Self::valid_indices`] will be significantly faster for most use-cases
178 pub fn iter(&self) -> BitIterator<'_> {
179 self.buffer.iter()
180 }
181
182 /// Returns a [`BitIndexIterator`] over the valid indices in this [`NullBuffer`]
183 ///
184 /// Valid indices indicate the corresponding value is not NULL
185 pub fn valid_indices(&self) -> BitIndexIterator<'_> {
186 self.buffer.set_indices()
187 }
188
189 /// Returns a [`BitSliceIterator`] yielding contiguous ranges of valid indices
190 ///
191 /// Valid indices indicate the corresponding value is not NULL
192 pub fn valid_slices(&self) -> BitSliceIterator<'_> {
193 self.buffer.set_slices()
194 }
195
196 /// Calls the provided closure for each index in this null mask that is set
197 #[inline]
198 pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>(
199 &self,
200 f: F,
201 ) -> Result<(), E> {
202 if self.null_count == self.len() {
203 return Ok(());
204 }
205 self.valid_indices().try_for_each(f)
206 }
207
208 /// Returns the inner [`BooleanBuffer`]
209 #[inline]
210 pub fn inner(&self) -> &BooleanBuffer {
211 &self.buffer
212 }
213
214 /// Returns the inner [`BooleanBuffer`]
215 #[inline]
216 pub fn into_inner(self) -> BooleanBuffer {
217 self.buffer
218 }
219
220 /// Returns the underlying [`Buffer`]
221 #[inline]
222 pub fn buffer(&self) -> &Buffer {
223 self.buffer.inner()
224 }
225}
226
227impl<'a> IntoIterator for &'a NullBuffer {
228 type Item = bool;
229 type IntoIter = BitIterator<'a>;
230
231 fn into_iter(self) -> Self::IntoIter {
232 self.buffer.iter()
233 }
234}
235
236impl From<BooleanBuffer> for NullBuffer {
237 fn from(value: BooleanBuffer) -> Self {
238 Self::new(value)
239 }
240}
241
242impl From<&[bool]> for NullBuffer {
243 fn from(value: &[bool]) -> Self {
244 BooleanBuffer::from(value).into()
245 }
246}
247
248impl<const N: usize> From<&[bool; N]> for NullBuffer {
249 fn from(value: &[bool; N]) -> Self {
250 value[..].into()
251 }
252}
253
254impl From<Vec<bool>> for NullBuffer {
255 fn from(value: Vec<bool>) -> Self {
256 BooleanBuffer::from(value).into()
257 }
258}
259
260impl FromIterator<bool> for NullBuffer {
261 fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self {
262 BooleanBuffer::from_iter(iter).into()
263 }
264}
265
266#[cfg(test)]
267mod tests {
268 use super::*;
269 #[test]
270 fn test_size() {
271 // This tests that the niche optimisation eliminates the overhead of an option
272 assert_eq!(
273 std::mem::size_of::<NullBuffer>(),
274 std::mem::size_of::<Option<NullBuffer>>()
275 );
276 }
277}