Skip to main content

arrow_buffer/buffer/
offset.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::buffer::ScalarBuffer;
19use crate::{ArrowNativeType, MutableBuffer, OffsetBufferBuilder};
20use std::ops::Deref;
21
22/// A non-empty buffer of monotonically increasing, positive integers.
23///
24/// [`OffsetBuffer`] are used to represent ranges of offsets. An
25/// `OffsetBuffer` of `N+1` items contains `N` such ranges. The start
26/// offset for element `i` is `offsets[i]` and the end offset is
27/// `offsets[i+1]`. Equal offsets represent an empty range.
28///
29/// # Example
30///
31/// This example shows how 5 distinct ranges, are represented using a
32/// 6 entry `OffsetBuffer`. The first entry `(0, 3)` represents the
33/// three offsets `0, 1, 2`. The entry `(3,3)` represent no offsets
34/// (e.g. an empty list).
35///
36/// ```text
37///   ┌───────┐                ┌───┐
38///   │ (0,3) │                │ 0 │
39///   ├───────┤                ├───┤
40///   │ (3,3) │                │ 3 │
41///   ├───────┤                ├───┤
42///   │ (3,4) │                │ 3 │
43///   ├───────┤                ├───┤
44///   │ (4,5) │                │ 4 │
45///   ├───────┤                ├───┤
46///   │ (5,7) │                │ 5 │
47///   └───────┘                ├───┤
48///                            │ 7 │
49///                            └───┘
50///
51///                        Offsets Buffer
52///    Logical
53///    Offsets
54///
55///  (offsets[i],
56///   offsets[i+1])
57/// ```
58#[derive(Debug, Clone, PartialEq, Eq)]
59pub struct OffsetBuffer<O: ArrowNativeType>(ScalarBuffer<O>);
60
61impl<O: ArrowNativeType> OffsetBuffer<O> {
62    /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`]
63    ///
64    /// # Panics
65    ///
66    /// Panics if `buffer` is not a non-empty buffer containing
67    /// monotonically increasing values greater than or equal to zero
68    pub fn new(buffer: ScalarBuffer<O>) -> Self {
69        assert!(!buffer.is_empty(), "offsets cannot be empty");
70        assert!(
71            buffer[0] >= O::usize_as(0),
72            "offsets must be greater than 0"
73        );
74        assert!(
75            buffer.windows(2).all(|w| w[0] <= w[1]),
76            "offsets must be monotonically increasing"
77        );
78        Self(buffer)
79    }
80
81    /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`]
82    ///
83    /// # Safety
84    ///
85    /// `buffer` must be a non-empty buffer containing monotonically increasing
86    /// values greater than or equal to zero
87    pub unsafe fn new_unchecked(buffer: ScalarBuffer<O>) -> Self {
88        Self(buffer)
89    }
90
91    /// Create a new [`OffsetBuffer`] containing a single 0 value
92    pub fn new_empty() -> Self {
93        let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::<O>());
94        Self(buffer.into_buffer().into())
95    }
96
97    /// Create a new [`OffsetBuffer`] containing `len + 1` `0` values
98    pub fn new_zeroed(len: usize) -> Self {
99        let len_bytes = len
100            .checked_add(1)
101            .and_then(|o| o.checked_mul(std::mem::size_of::<O>()))
102            .expect("overflow");
103        let buffer = MutableBuffer::from_len_zeroed(len_bytes);
104        Self(buffer.into_buffer().into())
105    }
106
107    /// Create a new [`OffsetBuffer`] from the iterator of slice lengths
108    ///
109    /// ```
110    /// # use arrow_buffer::OffsetBuffer;
111    /// let offsets = OffsetBuffer::<i32>::from_lengths([1, 3, 5]);
112    /// assert_eq!(offsets.as_ref(), &[0, 1, 4, 9]);
113    /// ```
114    ///
115    /// If you want to create an [`OffsetBuffer`] where all lengths are the same,
116    /// consider using the faster [`OffsetBuffer::from_repeated_length`] instead.
117    ///
118    /// # Panics
119    ///
120    /// Panics on overflow
121    pub fn from_lengths<I>(lengths: I) -> Self
122    where
123        I: IntoIterator<Item = usize>,
124    {
125        let iter = lengths.into_iter();
126        let mut out = Vec::with_capacity(iter.size_hint().0 + 1);
127        out.push(O::usize_as(0));
128
129        let mut acc = 0_usize;
130        for length in iter {
131            acc = acc.checked_add(length).expect("usize overflow");
132            out.push(O::usize_as(acc))
133        }
134        // Check for overflow
135        O::from_usize(acc).expect("offset overflow");
136        Self(out.into())
137    }
138
139    /// Create a new [`OffsetBuffer`] where each slice has the same length
140    /// `length`, repeated `n` times.
141    ///
142    ///
143    /// Example
144    /// ```
145    /// # use arrow_buffer::OffsetBuffer;
146    /// let offsets = OffsetBuffer::<i32>::from_repeated_length(4, 3);
147    /// assert_eq!(offsets.as_ref(), &[0, 4, 8, 12]);
148    /// ```
149    ///
150    /// # Panics
151    ///
152    /// Panics on overflow
153    pub fn from_repeated_length(length: usize, n: usize) -> Self {
154        if n == 0 {
155            return Self::new_empty();
156        }
157
158        if length == 0 {
159            return Self::new_zeroed(n);
160        }
161
162        // Check for overflow
163        // Making sure we don't overflow usize or O when calculating the total length
164        length.checked_mul(n).expect("usize overflow");
165
166        // Check for overflow
167        O::from_usize(length * n).expect("offset overflow");
168
169        let offsets = (0..=n)
170            .map(|index| O::usize_as(index * length))
171            .collect::<Vec<O>>();
172
173        Self(ScalarBuffer::from(offsets))
174    }
175
176    /// Get an Iterator over the lengths of this [`OffsetBuffer`]
177    ///
178    /// ```
179    /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer};
180    /// let offsets = OffsetBuffer::<_>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
181    /// assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]);
182    /// ```
183    ///
184    /// Empty [`OffsetBuffer`] will return an empty iterator
185    /// ```
186    /// # use arrow_buffer::OffsetBuffer;
187    /// let offsets = OffsetBuffer::<i32>::new_empty();
188    /// assert_eq!(offsets.lengths().count(), 0);
189    /// ```
190    ///
191    /// This can be used to merge multiple [`OffsetBuffer`]s to one
192    /// ```
193    /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer};
194    ///
195    /// let buffer1 = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]);
196    /// let buffer2 = OffsetBuffer::<i32>::from_lengths([1, 3, 5, 7, 9]);
197    ///
198    /// let merged = OffsetBuffer::<i32>::from_lengths(
199    ///     vec![buffer1, buffer2].iter().flat_map(|x| x.lengths())
200    /// );
201    ///
202    /// assert_eq!(merged.lengths().collect::<Vec<_>>(), &[2, 6, 3, 7, 2, 1, 3, 5, 7, 9]);
203    /// ```
204    pub fn lengths(&self) -> impl ExactSizeIterator<Item = usize> + '_ {
205        self.0.windows(2).map(|x| x[1].as_usize() - x[0].as_usize())
206    }
207
208    /// Free up unused memory.
209    pub fn shrink_to_fit(&mut self) {
210        self.0.shrink_to_fit();
211    }
212
213    /// Returns the inner [`ScalarBuffer`]
214    pub fn inner(&self) -> &ScalarBuffer<O> {
215        &self.0
216    }
217
218    /// Returns the inner [`ScalarBuffer`], consuming self
219    pub fn into_inner(self) -> ScalarBuffer<O> {
220        self.0
221    }
222
223    /// Claim memory used by this buffer in the provided memory pool.
224    #[cfg(feature = "pool")]
225    pub fn claim(&self, pool: &dyn crate::MemoryPool) {
226        self.0.claim(pool);
227    }
228
229    /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset`
230    pub fn slice(&self, offset: usize, len: usize) -> Self {
231        Self(self.0.slice(offset, len.saturating_add(1)))
232    }
233
234    /// Returns true if this [`OffsetBuffer`] is equal to `other`, using pointer comparisons
235    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
236    /// return false when the arrays are logically equal
237    #[inline]
238    pub fn ptr_eq(&self, other: &Self) -> bool {
239        self.0.ptr_eq(&other.0)
240    }
241}
242
243impl<T: ArrowNativeType> Deref for OffsetBuffer<T> {
244    type Target = [T];
245
246    #[inline]
247    fn deref(&self) -> &Self::Target {
248        &self.0
249    }
250}
251
252impl<T: ArrowNativeType> AsRef<[T]> for OffsetBuffer<T> {
253    #[inline]
254    fn as_ref(&self) -> &[T] {
255        self
256    }
257}
258
259impl<O: ArrowNativeType> From<OffsetBufferBuilder<O>> for OffsetBuffer<O> {
260    fn from(value: OffsetBufferBuilder<O>) -> Self {
261        value.finish()
262    }
263}
264
265impl<O: ArrowNativeType> Default for OffsetBuffer<O> {
266    fn default() -> Self {
267        Self::new_empty()
268    }
269}
270
271#[cfg(test)]
272mod tests {
273    use super::*;
274
275    #[test]
276    #[should_panic(expected = "offsets cannot be empty")]
277    fn empty_offsets() {
278        OffsetBuffer::new(Vec::<i32>::new().into());
279    }
280
281    #[test]
282    #[should_panic(expected = "offsets must be greater than 0")]
283    fn negative_offsets() {
284        OffsetBuffer::new(vec![-1, 0, 1].into());
285    }
286
287    #[test]
288    fn offsets() {
289        OffsetBuffer::new(vec![0, 1, 2, 3].into());
290
291        let offsets = OffsetBuffer::<i32>::new_zeroed(3);
292        assert_eq!(offsets.as_ref(), &[0; 4]);
293
294        let offsets = OffsetBuffer::<i32>::new_zeroed(0);
295        assert_eq!(offsets.as_ref(), &[0; 1]);
296    }
297
298    #[test]
299    #[should_panic(expected = "overflow")]
300    fn offsets_new_zeroed_overflow() {
301        OffsetBuffer::<i32>::new_zeroed(usize::MAX);
302    }
303
304    #[test]
305    #[should_panic(expected = "offsets must be monotonically increasing")]
306    fn non_monotonic_offsets() {
307        OffsetBuffer::new(vec![1, 2, 0].into());
308    }
309
310    #[test]
311    fn from_lengths() {
312        let buffer = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]);
313        assert_eq!(buffer.as_ref(), &[0, 2, 8, 11, 18, 20]);
314
315        let half_max = i32::MAX / 2;
316        let buffer = OffsetBuffer::<i32>::from_lengths([half_max as usize, half_max as usize]);
317        assert_eq!(buffer.as_ref(), &[0, half_max, half_max * 2]);
318    }
319
320    #[test]
321    #[should_panic(expected = "offset overflow")]
322    fn from_lengths_offset_overflow() {
323        OffsetBuffer::<i32>::from_lengths([i32::MAX as usize, 1]);
324    }
325
326    #[test]
327    #[should_panic(expected = "usize overflow")]
328    fn from_lengths_usize_overflow() {
329        OffsetBuffer::<i32>::from_lengths([usize::MAX, 1]);
330    }
331
332    #[test]
333    #[should_panic(expected = "offset overflow")]
334    fn from_repeated_lengths_offset_length_overflow() {
335        OffsetBuffer::<i32>::from_repeated_length(i32::MAX as usize / 4, 5);
336    }
337
338    #[test]
339    #[should_panic(expected = "offset overflow")]
340    fn from_repeated_lengths_offset_repeat_overflow() {
341        OffsetBuffer::<i32>::from_repeated_length(1, i32::MAX as usize + 1);
342    }
343
344    #[test]
345    #[should_panic(expected = "offset overflow")]
346    fn from_repeated_lengths_usize_length_overflow() {
347        OffsetBuffer::<i32>::from_repeated_length(usize::MAX, 1);
348    }
349
350    #[test]
351    #[should_panic(expected = "usize overflow")]
352    fn from_repeated_lengths_usize_length_usize_overflow() {
353        OffsetBuffer::<i32>::from_repeated_length(usize::MAX, 2);
354    }
355
356    #[test]
357    #[should_panic(expected = "offset overflow")]
358    fn from_repeated_lengths_usize_repeat_overflow() {
359        OffsetBuffer::<i32>::from_repeated_length(1, usize::MAX);
360    }
361
362    #[test]
363    fn get_lengths() {
364        let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
365        assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]);
366    }
367
368    #[test]
369    fn get_lengths_should_be_with_fixed_size() {
370        let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
371        let iter = offsets.lengths();
372        assert_eq!(iter.size_hint(), (3, Some(3)));
373        assert_eq!(iter.len(), 3);
374    }
375
376    #[test]
377    fn get_lengths_from_empty_offset_buffer_should_be_empty_iterator() {
378        let offsets = OffsetBuffer::<i32>::new_empty();
379        assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![]);
380    }
381
382    #[test]
383    fn impl_eq() {
384        fn are_equal<T: Eq>(a: &T, b: &T) -> bool {
385            a.eq(b)
386        }
387
388        assert!(
389            are_equal(
390                &OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9])),
391                &OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]))
392            ),
393            "OffsetBuffer should implement Eq."
394        );
395    }
396
397    #[test]
398    fn impl_default() {
399        let default = OffsetBuffer::<i32>::default();
400        assert_eq!(default.as_ref(), &[0]);
401    }
402
403    #[test]
404    fn from_repeated_length_basic() {
405        // Basic case with length 4, repeated 3 times
406        let buffer = OffsetBuffer::<i32>::from_repeated_length(4, 3);
407        assert_eq!(buffer.as_ref(), &[0, 4, 8, 12]);
408
409        // Verify the lengths are correct
410        let lengths: Vec<usize> = buffer.lengths().collect();
411        assert_eq!(lengths, vec![4, 4, 4]);
412    }
413
414    #[test]
415    fn from_repeated_length_single_repeat() {
416        // Length 5, repeated once
417        let buffer = OffsetBuffer::<i32>::from_repeated_length(5, 1);
418        assert_eq!(buffer.as_ref(), &[0, 5]);
419
420        let lengths: Vec<usize> = buffer.lengths().collect();
421        assert_eq!(lengths, vec![5]);
422    }
423
424    #[test]
425    fn from_repeated_length_zero_repeats() {
426        let buffer = OffsetBuffer::<i32>::from_repeated_length(10, 0);
427        assert_eq!(buffer, OffsetBuffer::<i32>::new_empty());
428    }
429
430    #[test]
431    fn from_repeated_length_zero_length() {
432        // Zero length, repeated 5 times (all zeros)
433        let buffer = OffsetBuffer::<i32>::from_repeated_length(0, 5);
434        assert_eq!(buffer.as_ref(), &[0, 0, 0, 0, 0, 0]);
435
436        // All lengths should be 0
437        let lengths: Vec<usize> = buffer.lengths().collect();
438        assert_eq!(lengths, vec![0, 0, 0, 0, 0]);
439    }
440
441    #[test]
442    fn from_repeated_length_large_values() {
443        // Test with larger values that don't overflow
444        let buffer = OffsetBuffer::<i32>::from_repeated_length(1000, 100);
445        assert_eq!(buffer[0], 0);
446
447        // Verify all lengths are 1000
448        let lengths: Vec<usize> = buffer.lengths().collect();
449        assert_eq!(lengths.len(), 100);
450        assert!(lengths.iter().all(|&len| len == 1000));
451    }
452
453    #[test]
454    fn from_repeated_length_unit_length() {
455        // Length 1, repeated multiple times
456        let buffer = OffsetBuffer::<i32>::from_repeated_length(1, 10);
457        assert_eq!(buffer.as_ref(), &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
458
459        let lengths: Vec<usize> = buffer.lengths().collect();
460        assert_eq!(lengths, vec![1; 10]);
461    }
462
463    #[test]
464    fn from_repeated_length_max_safe_values() {
465        // Test with maximum safe values for i32
466        // i32::MAX / 3 ensures we don't overflow when repeated twice
467        let third_max = (i32::MAX / 3) as usize;
468        let buffer = OffsetBuffer::<i32>::from_repeated_length(third_max, 2);
469        assert_eq!(
470            buffer.as_ref(),
471            &[0, third_max as i32, (third_max * 2) as i32]
472        );
473    }
474}