arrow_array/array/
byte_view_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
26use arrow_schema::{ArrowError, DataType};
27use core::str;
28use num::ToPrimitive;
29use std::any::Any;
30use std::cmp::Ordering;
31use std::fmt::Debug;
32use std::marker::PhantomData;
33use std::sync::Arc;
34
35use super::ByteArrayType;
36
37/// [Variable-size Binary View Layout]: An array of variable length bytes views.
38///
39/// This array type is used to store variable length byte data (e.g. Strings, Binary)
40/// and has efficient operations such as `take`, `filter`, and comparison.
41///
42/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
43///
44/// This is different from [`GenericByteArray`], which also stores variable
45/// length byte data, as it represents strings with an offset and length. `take`
46/// and `filter` like operations are implemented by manipulating the "views"
47/// (`u128`) without modifying the bytes. Each view also stores an inlined
48/// prefix which speed up comparisons.
49///
50/// # See Also
51///
52/// * [`StringViewArray`] for storing utf8 encoded string data
53/// * [`BinaryViewArray`] for storing bytes
54/// * [`ByteView`] to interpret `u128`s layout of the views.
55///
56/// [`ByteView`]: arrow_data::ByteView
57///
58/// # Layout: "views" and buffers
59///
60/// A `GenericByteViewArray` stores variable length byte strings. An array of
61/// `N` elements is stored as `N` fixed length "views" and a variable number
62/// of variable length "buffers".
63///
64/// Each view is a `u128` value whose layout is different depending on the
65/// length of the string stored at that location:
66///
67/// ```text
68///                         ┌──────┬────────────────────────┐
69///                         │length│      string value      │
70///    Strings (len <= 12)  │      │    (padded with 0)     │
71///                         └──────┴────────────────────────┘
72///                          0    31                      127
73///
74///                         ┌───────┬───────┬───────┬───────┐
75///                         │length │prefix │  buf  │offset │
76///    Strings (len > 12)   │       │       │ index │       │
77///                         └───────┴───────┴───────┴───────┘
78///                          0    31       63      95    127
79/// ```
80///
81/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
82///   the view. See [`Self::inline_value`] to access the inlined prefix from a
83///   short view.
84///
85/// * Strings with length > 12: The first four bytes are stored inline in the
86///   view and the entire string is stored in one of the buffers. See [`ByteView`]
87///   to access the fields of the these views.
88///
89/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
90/// the easiest and fastest way to work with this data. However, it is possible
91/// to access the views and buffers directly for more control.
92///
93/// For example
94///
95/// ```rust
96/// # use arrow_array::StringViewArray;
97/// # use arrow_array::Array;
98/// use arrow_data::ByteView;
99/// let array = StringViewArray::from(vec![
100///   "hello",
101///   "this string is longer than 12 bytes",
102///   "this string is also longer than 12 bytes"
103/// ]);
104///
105/// // ** Examine the first view (short string) **
106/// assert!(array.is_valid(0)); // Check for nulls
107/// let short_view: u128 = array.views()[0]; // "hello"
108/// // get length of the string
109/// let len = short_view as u32;
110/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view
111/// // SAFETY: `view` is a valid view
112/// let value = unsafe {
113///   StringViewArray::inline_value(&short_view, len as usize)
114/// };
115/// assert_eq!(value, b"hello");
116///
117/// // ** Examine the third view (long string) **
118/// assert!(array.is_valid(12)); // Check for nulls
119/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
120/// let len = long_view as u32;
121/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
122/// let view = ByteView::from(long_view); // use ByteView to access the fields
123/// assert_eq!(view.length, 40);
124/// assert_eq!(view.buffer_index, 0);
125/// assert_eq!(view.offset, 35); // data starts after the first long string
126/// // Views for long strings store a 4 byte prefix
127/// let prefix = view.prefix.to_le_bytes();
128/// assert_eq!(&prefix, b"this");
129/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
130/// assert_eq!(value, "this string is also longer than 12 bytes");
131/// ```
132///
133/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
134/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
135///
136/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
137/// than they must point into a valid buffer. However, they can be out of order,
138/// non continuous and overlapping.
139///
140/// For example, in the following diagram, the strings "FishWasInTownToday" and
141/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
142/// separate buffer while the string "LavaMonster" is stored inlined in the
143/// view. In this case, the same bytes for "Fish" are used to store both strings.
144///
145/// [`ByteView`]: arrow_data::ByteView
146///
147/// ```text
148///                                                                            ┌───┐
149///                         ┌──────┬──────┬──────┬──────┐               offset │...│
150/// "FishWasInTownTodayYay" │  21  │ Fish │  0   │ 115  │─ ─              103  │Mr.│
151///                         └──────┴──────┴──────┴──────┘   │      ┌ ─ ─ ─ ─ ▶ │Cru│
152///                         ┌──────┬──────┬──────┬──────┐                      │mpl│
153/// "CrumpleFacedFish"      │  16  │ Crum │  0   │ 103  │─ ─│─ ─ ─ ┘           │eFa│
154///                         └──────┴──────┴──────┴──────┘                      │ced│
155///                         ┌──────┬────────────────────┐   └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
156/// "LavaMonster"           │  11  │   LavaMonster      │                      │hWa│
157///                         └──────┴────────────────────┘               offset │sIn│
158///                                                                       115  │Tow│
159///                                                                            │nTo│
160///                                                                            │day│
161///                                  u128 "views"                              │Yay│
162///                                                                   buffer 0 │...│
163///                                                                            └───┘
164/// ```
165pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
166    data_type: DataType,
167    views: ScalarBuffer<u128>,
168    buffers: Vec<Buffer>,
169    phantom: PhantomData<T>,
170    nulls: Option<NullBuffer>,
171}
172
173impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
174    fn clone(&self) -> Self {
175        Self {
176            data_type: T::DATA_TYPE,
177            views: self.views.clone(),
178            buffers: self.buffers.clone(),
179            nulls: self.nulls.clone(),
180            phantom: Default::default(),
181        }
182    }
183}
184
185impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
186    /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
187    ///
188    /// # Panics
189    ///
190    /// Panics if [`GenericByteViewArray::try_new`] returns an error
191    pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
192        Self::try_new(views, buffers, nulls).unwrap()
193    }
194
195    /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
196    ///
197    /// # Errors
198    ///
199    /// * `views.len() != nulls.len()`
200    /// * [ByteViewType::validate] fails
201    pub fn try_new(
202        views: ScalarBuffer<u128>,
203        buffers: Vec<Buffer>,
204        nulls: Option<NullBuffer>,
205    ) -> Result<Self, ArrowError> {
206        T::validate(&views, &buffers)?;
207
208        if let Some(n) = nulls.as_ref() {
209            if n.len() != views.len() {
210                return Err(ArrowError::InvalidArgumentError(format!(
211                    "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
212                    T::PREFIX,
213                    views.len(),
214                    n.len(),
215                )));
216            }
217        }
218
219        Ok(Self {
220            data_type: T::DATA_TYPE,
221            views,
222            buffers,
223            nulls,
224            phantom: Default::default(),
225        })
226    }
227
228    /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
229    ///
230    /// # Safety
231    ///
232    /// Safe if [`Self::try_new`] would not error
233    pub unsafe fn new_unchecked(
234        views: ScalarBuffer<u128>,
235        buffers: Vec<Buffer>,
236        nulls: Option<NullBuffer>,
237    ) -> Self {
238        if cfg!(feature = "force_validate") {
239            return Self::new(views, buffers, nulls);
240        }
241
242        Self {
243            data_type: T::DATA_TYPE,
244            phantom: Default::default(),
245            views,
246            buffers,
247            nulls,
248        }
249    }
250
251    /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
252    pub fn new_null(len: usize) -> Self {
253        Self {
254            data_type: T::DATA_TYPE,
255            views: vec![0; len].into(),
256            buffers: vec![],
257            nulls: Some(NullBuffer::new_null(len)),
258            phantom: Default::default(),
259        }
260    }
261
262    /// Create a new [`Scalar`] from `value`
263    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
264        Scalar::new(Self::from_iter_values(std::iter::once(value)))
265    }
266
267    /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
268    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
269    where
270        Ptr: AsRef<T::Native>,
271        I: IntoIterator<Item = Ptr>,
272    {
273        let iter = iter.into_iter();
274        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
275        for v in iter {
276            builder.append_value(v);
277        }
278        builder.finish()
279    }
280
281    /// Deconstruct this array into its constituent parts
282    pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
283        (self.views, self.buffers, self.nulls)
284    }
285
286    /// Returns the views buffer
287    #[inline]
288    pub fn views(&self) -> &ScalarBuffer<u128> {
289        &self.views
290    }
291
292    /// Returns the buffers storing string data
293    #[inline]
294    pub fn data_buffers(&self) -> &[Buffer] {
295        &self.buffers
296    }
297
298    /// Returns the element at index `i`
299    /// # Panics
300    /// Panics if index `i` is out of bounds.
301    pub fn value(&self, i: usize) -> &T::Native {
302        assert!(
303            i < self.len(),
304            "Trying to access an element at index {} from a {}ViewArray of length {}",
305            i,
306            T::PREFIX,
307            self.len()
308        );
309
310        unsafe { self.value_unchecked(i) }
311    }
312
313    /// Returns the element at index `i` without bounds checking
314    ///
315    /// # Safety
316    ///
317    /// Caller is responsible for ensuring that the index is within the bounds
318    /// of the array
319    pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
320        let v = self.views.get_unchecked(idx);
321        let len = *v as u32;
322        let b = if len <= MAX_INLINE_VIEW_LEN {
323            Self::inline_value(v, len as usize)
324        } else {
325            let view = ByteView::from(*v);
326            let data = self.buffers.get_unchecked(view.buffer_index as usize);
327            let offset = view.offset as usize;
328            data.get_unchecked(offset..offset + len as usize)
329        };
330        T::Native::from_bytes_unchecked(b)
331    }
332
333    /// Returns the first `len` bytes the inline value of the view.
334    ///
335    /// # Safety
336    /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
337    /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
338    #[inline(always)]
339    pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
340        debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
341        std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
342    }
343
344    /// Constructs a new iterator for iterating over the values of this array
345    pub fn iter(&self) -> ArrayIter<&Self> {
346        ArrayIter::new(self)
347    }
348
349    /// Returns an iterator over the bytes of this array, including null values
350    pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
351        self.views.iter().map(move |v| {
352            let len = *v as u32;
353            if len <= MAX_INLINE_VIEW_LEN {
354                unsafe { Self::inline_value(v, len as usize) }
355            } else {
356                let view = ByteView::from(*v);
357                let data = &self.buffers[view.buffer_index as usize];
358                let offset = view.offset as usize;
359                unsafe { data.get_unchecked(offset..offset + len as usize) }
360            }
361        })
362    }
363
364    /// Returns an iterator over the first `prefix_len` bytes of each array
365    /// element, including null values.
366    ///
367    /// If `prefix_len` is larger than the element's length, the iterator will
368    /// return an empty slice (`&[]`).
369    pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
370        self.views().into_iter().map(move |v| {
371            let len = (*v as u32) as usize;
372
373            if len < prefix_len {
374                return &[] as &[u8];
375            }
376
377            if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
378                unsafe { StringViewArray::inline_value(v, prefix_len) }
379            } else {
380                let view = ByteView::from(*v);
381                let data = unsafe {
382                    self.data_buffers()
383                        .get_unchecked(view.buffer_index as usize)
384                };
385                let offset = view.offset as usize;
386                unsafe { data.get_unchecked(offset..offset + prefix_len) }
387            }
388        })
389    }
390
391    /// Returns an iterator over the last `suffix_len` bytes of each array
392    /// element, including null values.
393    ///
394    /// Note that for [`StringViewArray`] the last bytes may start in the middle
395    /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
396    ///
397    /// If `suffix_len` is larger than the element's length, the iterator will
398    /// return an empty slice (`&[]`).
399    pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
400        self.views().into_iter().map(move |v| {
401            let len = (*v as u32) as usize;
402
403            if len < suffix_len {
404                return &[] as &[u8];
405            }
406
407            if len as u32 <= MAX_INLINE_VIEW_LEN {
408                unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
409            } else {
410                let view = ByteView::from(*v);
411                let data = unsafe {
412                    self.data_buffers()
413                        .get_unchecked(view.buffer_index as usize)
414                };
415                let offset = view.offset as usize;
416                unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
417            }
418        })
419    }
420
421    /// Returns a zero-copy slice of this array with the indicated offset and length.
422    pub fn slice(&self, offset: usize, length: usize) -> Self {
423        Self {
424            data_type: T::DATA_TYPE,
425            views: self.views.slice(offset, length),
426            buffers: self.buffers.clone(),
427            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
428            phantom: Default::default(),
429        }
430    }
431
432    /// Returns a "compacted" version of this array
433    ///
434    /// The original array will *not* be modified
435    ///
436    /// # Garbage Collection
437    ///
438    /// Before GC:
439    /// ```text
440    ///                                        ┌──────┐
441    ///                                        │......│
442    ///                                        │......│
443    /// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer
444    /// │       View 1       │─ ─ ─ ─          │......│  with data that
445    /// ├────────────────────┤                 │......│ is not referred
446    /// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
447    /// └────────────────────┘                 │......│      View 2
448    ///                                        │......│
449    ///    2 views, refer to                   │......│
450    ///   small portions of a                  └──────┘
451    ///      large buffer
452    /// ```
453    ///
454    /// After GC:
455    ///
456    /// ```text
457    /// ┌────────────────────┐                 ┌─────┐    After gc, only
458    /// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is
459    /// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by
460    /// │       View 2       │─ ─ ─ ─          └─────┘     the views is
461    /// └────────────────────┘                                 left
462    ///
463    ///
464    ///         2 views
465    /// ```
466    /// This method will compact the data buffers by recreating the view array and only include the data
467    /// that is pointed to by the views.
468    ///
469    /// Note that it will copy the array regardless of whether the original array is compact.
470    /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
471    /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
472    ///
473    /// Note: this function does not attempt to canonicalize / deduplicate values. For this
474    /// feature see  [`GenericByteViewBuilder::with_deduplicate_strings`].
475    pub fn gc(&self) -> Self {
476        let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());
477
478        for v in self.iter() {
479            builder.append_option(v);
480        }
481
482        builder.finish()
483    }
484
485    /// Returns the total number of bytes used by all non inlined views in all
486    /// buffers.
487    ///
488    /// Note this does not account for views that point at the same underlying
489    /// data in buffers
490    ///
491    /// For example, if the array has three strings views:
492    /// * View with length = 9 (inlined)
493    /// * View with length = 32 (non inlined)
494    /// * View with length = 16 (non inlined)
495    ///
496    /// Then this method would report 48
497    pub fn total_buffer_bytes_used(&self) -> usize {
498        self.views()
499            .iter()
500            .map(|v| {
501                let len = *v as u32;
502                if len > MAX_INLINE_VIEW_LEN {
503                    len as usize
504                } else {
505                    0
506                }
507            })
508            .sum()
509    }
510
511    /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
512    ///
513    /// Comparing two ByteView types are non-trivial.
514    /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
515    ///
516    /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
517    /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
518    ///     Meaning that reading one array element requires only one memory access
519    ///     (two memory access required for StringArray, one for offset buffer, the other for value buffer).
520    ///
521    /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
522    ///     thanks to the inlined 4 bytes.
523    ///     Consider equality check:
524    ///     If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
525    ///
526    /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
527    /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
528    ///   e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
529    ///
530    /// # Order check flow
531    /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
532    /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
533    ///     (2.1) if the inlined 4 bytes are different, we can return the result immediately.
534    ///     (2.2) o.w., we need to compare the full string.
535    ///
536    /// # Safety
537    /// The left/right_idx must within range of each array
538    pub unsafe fn compare_unchecked(
539        left: &GenericByteViewArray<T>,
540        left_idx: usize,
541        right: &GenericByteViewArray<T>,
542        right_idx: usize,
543    ) -> Ordering {
544        let l_view = left.views().get_unchecked(left_idx);
545        let l_byte_view = ByteView::from(*l_view);
546
547        let r_view = right.views().get_unchecked(right_idx);
548        let r_byte_view = ByteView::from(*r_view);
549
550        let l_len = l_byte_view.length;
551        let r_len = r_byte_view.length;
552
553        if l_len <= 12 && r_len <= 12 {
554            return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view));
555        }
556
557        // one of the string is larger than 12 bytes,
558        // we then try to compare the inlined data first
559
560        // Note: In theory, ByteView is only used for string which is larger than 12 bytes,
561        // but we can still use it to get the inlined prefix for shorter strings.
562        // The prefix is always the first 4 bytes of the view, for both short and long strings.
563        let l_inlined_be = l_byte_view.prefix.swap_bytes();
564        let r_inlined_be = r_byte_view.prefix.swap_bytes();
565        if l_inlined_be != r_inlined_be {
566            return l_inlined_be.cmp(&r_inlined_be);
567        }
568
569        // unfortunately, we need to compare the full data
570        let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
571        let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
572
573        l_full_data.cmp(r_full_data)
574    }
575
576    /// Builds a 128-bit composite key for an inline value:
577    ///
578    /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting).
579    /// - Low  32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings
580    ///   (or those with fewer meaningful bytes) always numerically sort before longer ones.
581    ///
582    /// This function extracts the length and the 12-byte inline string data from the raw
583    /// little-endian `u128` representation, converts them to big-endian ordering, and packs them
584    /// into a single `u128` value suitable for fast, branchless comparisons.
585    ///
586    /// ### Why include length?
587    ///
588    /// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes
589    /// compare equal—either because one is a true prefix of the other or because zero-padding
590    /// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare
591    /// handles both content and length in one go.
592    ///
593    /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes)
594    ///
595    /// | String     | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding)    |
596    /// |------------|-----------------------|---------------------------------|
597    /// | `"bar"`   | `03 00 00 00`         | `62 61 72` + 9 × `00`           |
598    /// | `"bar\0"`| `04 00 00 00`         | `62 61 72 00` + 8 × `00`        |
599    ///
600    /// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field
601    /// then differentiates:
602    ///
603    /// ```text
604    /// key("bar")   = 0x0000000000000000000062617200000003
605    /// key("bar\0") = 0x0000000000000000000062617200000004
606    /// ⇒ key("bar") < key("bar\0")
607    /// ```
608    #[inline(always)]
609    pub fn inline_key_fast(raw: u128) -> u128 {
610        // Convert the raw u128 (little-endian) into bytes for manipulation
611        let raw_bytes = raw.to_le_bytes();
612
613        // Extract the length (first 4 bytes), convert to big-endian u32, and promote to u128
614        let len_le = &raw_bytes[0..4];
615        let len_be = u32::from_le_bytes(len_le.try_into().unwrap()).to_be() as u128;
616
617        // Extract the inline string bytes (next 12 bytes), place them into the lower 12 bytes of a 16-byte array,
618        // padding the upper 4 bytes with zero to form a little-endian u128 value
619        let mut inline_bytes = [0u8; 16];
620        inline_bytes[4..16].copy_from_slice(&raw_bytes[4..16]);
621
622        // Convert to big-endian to ensure correct lexical ordering
623        let inline_u128 = u128::from_le_bytes(inline_bytes).to_be();
624
625        // Shift right by 32 bits to discard the zero padding (upper 4 bytes),
626        // so that the inline string occupies the high 96 bits
627        let inline_part = inline_u128 >> 32;
628
629        // Combine the inline string part (high 96 bits) and length (low 32 bits) into the final key
630        (inline_part << 32) | len_be
631    }
632}
633
634impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
635    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
636        write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
637        print_long_array(self, f, |array, index, f| {
638            std::fmt::Debug::fmt(&array.value(index), f)
639        })?;
640        write!(f, "]")
641    }
642}
643
644impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
645    fn as_any(&self) -> &dyn Any {
646        self
647    }
648
649    fn to_data(&self) -> ArrayData {
650        self.clone().into()
651    }
652
653    fn into_data(self) -> ArrayData {
654        self.into()
655    }
656
657    fn data_type(&self) -> &DataType {
658        &self.data_type
659    }
660
661    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
662        Arc::new(self.slice(offset, length))
663    }
664
665    fn len(&self) -> usize {
666        self.views.len()
667    }
668
669    fn is_empty(&self) -> bool {
670        self.views.is_empty()
671    }
672
673    fn shrink_to_fit(&mut self) {
674        self.views.shrink_to_fit();
675        self.buffers.iter_mut().for_each(|b| b.shrink_to_fit());
676        self.buffers.shrink_to_fit();
677        if let Some(nulls) = &mut self.nulls {
678            nulls.shrink_to_fit();
679        }
680    }
681
682    fn offset(&self) -> usize {
683        0
684    }
685
686    fn nulls(&self) -> Option<&NullBuffer> {
687        self.nulls.as_ref()
688    }
689
690    fn logical_null_count(&self) -> usize {
691        // More efficient that the default implementation
692        self.null_count()
693    }
694
695    fn get_buffer_memory_size(&self) -> usize {
696        let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
697        sum += self.views.inner().capacity();
698        if let Some(x) = &self.nulls {
699            sum += x.buffer().capacity()
700        }
701        sum
702    }
703
704    fn get_array_memory_size(&self) -> usize {
705        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
706    }
707}
708
709impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
710    type Item = &'a T::Native;
711
712    fn value(&self, index: usize) -> Self::Item {
713        GenericByteViewArray::value(self, index)
714    }
715
716    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
717        GenericByteViewArray::value_unchecked(self, index)
718    }
719}
720
721impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
722    type Item = Option<&'a T::Native>;
723    type IntoIter = ArrayIter<Self>;
724
725    fn into_iter(self) -> Self::IntoIter {
726        ArrayIter::new(self)
727    }
728}
729
730impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
731    fn from(value: ArrayData) -> Self {
732        let views = value.buffers()[0].clone();
733        let views = ScalarBuffer::new(views, value.offset(), value.len());
734        let buffers = value.buffers()[1..].to_vec();
735        Self {
736            data_type: T::DATA_TYPE,
737            views,
738            buffers,
739            nulls: value.nulls().cloned(),
740            phantom: Default::default(),
741        }
742    }
743}
744
745/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
746///
747/// For example this method can convert a [`StringArray`] to a
748/// [`StringViewArray`].
749///
750/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
751/// is built without copying the underlying string data (views are created
752/// directly into the existing buffer)
753///
754/// [`StringArray`]: crate::StringArray
755impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
756where
757    FROM: ByteArrayType,
758    FROM::Offset: OffsetSizeTrait + ToPrimitive,
759    V: ByteViewType<Native = FROM::Native>,
760{
761    fn from(byte_array: &GenericByteArray<FROM>) -> Self {
762        let offsets = byte_array.offsets();
763
764        let can_reuse_buffer = match offsets.last() {
765            Some(offset) => offset.as_usize() < u32::MAX as usize,
766            None => true,
767        };
768
769        if can_reuse_buffer {
770            // build views directly pointing to the existing buffer
771            let len = byte_array.len();
772            let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
773            let str_values_buf = byte_array.values().clone();
774            let block = views_builder.append_block(str_values_buf);
775            for (i, w) in offsets.windows(2).enumerate() {
776                let offset = w[0].as_usize();
777                let end = w[1].as_usize();
778                let length = end - offset;
779
780                if byte_array.is_null(i) {
781                    views_builder.append_null();
782                } else {
783                    // Safety: the input was a valid array so it valid UTF8 (if string). And
784                    // all offsets were valid
785                    unsafe {
786                        views_builder.append_view_unchecked(block, offset as u32, length as u32)
787                    }
788                }
789            }
790            assert_eq!(views_builder.len(), len);
791            views_builder.finish()
792        } else {
793            // Otherwise, create a new buffer for large strings
794            // TODO: the original buffer could still be used
795            // by making multiple slices of u32::MAX length
796            GenericByteViewArray::<V>::from_iter(byte_array.iter())
797        }
798    }
799}
800
801impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
802    fn from(mut array: GenericByteViewArray<T>) -> Self {
803        let len = array.len();
804        array.buffers.insert(0, array.views.into_inner());
805        let builder = ArrayDataBuilder::new(T::DATA_TYPE)
806            .len(len)
807            .buffers(array.buffers)
808            .nulls(array.nulls);
809
810        unsafe { builder.build_unchecked() }
811    }
812}
813
814impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
815where
816    Ptr: AsRef<T::Native> + 'a,
817    T: ByteViewType + ?Sized,
818{
819    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
820        iter.into_iter()
821            .map(|o| o.as_ref().map(|p| p.as_ref()))
822            .collect()
823    }
824}
825
826impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
827where
828    Ptr: AsRef<T::Native>,
829{
830    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
831        let iter = iter.into_iter();
832        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
833        builder.extend(iter);
834        builder.finish()
835    }
836}
837
838/// A [`GenericByteViewArray`] of `[u8]`
839///
840/// See [`GenericByteViewArray`] for format and layout details.
841///
842/// # Example
843/// ```
844/// use arrow_array::BinaryViewArray;
845/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
846/// assert_eq!(array.value(0), b"hello");
847/// assert_eq!(array.value(3), b"large payload over 12 bytes");
848/// ```
849pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
850
851impl BinaryViewArray {
852    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
853    /// If items not utf8 data, validate will fail and error returned.
854    pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
855        StringViewType::validate(self.views(), self.data_buffers())?;
856        unsafe { Ok(self.to_string_view_unchecked()) }
857    }
858
859    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
860    /// # Safety
861    /// Caller is responsible for ensuring that items in array are utf8 data.
862    pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
863        StringViewArray::new_unchecked(self.views, self.buffers, self.nulls)
864    }
865}
866
867impl From<Vec<&[u8]>> for BinaryViewArray {
868    fn from(v: Vec<&[u8]>) -> Self {
869        Self::from_iter_values(v)
870    }
871}
872
873impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
874    fn from(v: Vec<Option<&[u8]>>) -> Self {
875        v.into_iter().collect()
876    }
877}
878
879/// A [`GenericByteViewArray`] that stores utf8 data
880///
881/// See [`GenericByteViewArray`] for format and layout details.
882///
883/// # Example
884/// ```
885/// use arrow_array::StringViewArray;
886/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
887/// assert_eq!(array.value(0), "hello");
888/// assert_eq!(array.value(3), "large payload over 12 bytes");
889/// ```
890pub type StringViewArray = GenericByteViewArray<StringViewType>;
891
892impl StringViewArray {
893    /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
894    pub fn to_binary_view(self) -> BinaryViewArray {
895        unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
896    }
897
898    /// Returns true if all data within this array is ASCII
899    pub fn is_ascii(&self) -> bool {
900        // Alternative (but incorrect): directly check the underlying buffers
901        // (1) Our string view might be sparse, i.e., a subset of the buffers,
902        //      so even if the buffer is not ascii, we can still be ascii.
903        // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
904        // This means that this operation is quite expensive, shall we cache the result?
905        //  i.e. track `is_ascii` in the builder.
906        self.iter().all(|v| match v {
907            Some(v) => v.is_ascii(),
908            None => true,
909        })
910    }
911}
912
913impl From<Vec<&str>> for StringViewArray {
914    fn from(v: Vec<&str>) -> Self {
915        Self::from_iter_values(v)
916    }
917}
918
919impl From<Vec<Option<&str>>> for StringViewArray {
920    fn from(v: Vec<Option<&str>>) -> Self {
921        v.into_iter().collect()
922    }
923}
924
925impl From<Vec<String>> for StringViewArray {
926    fn from(v: Vec<String>) -> Self {
927        Self::from_iter_values(v)
928    }
929}
930
931impl From<Vec<Option<String>>> for StringViewArray {
932    fn from(v: Vec<Option<String>>) -> Self {
933        v.into_iter().collect()
934    }
935}
936
937#[cfg(test)]
938mod tests {
939    use crate::builder::{BinaryViewBuilder, StringViewBuilder};
940    use crate::types::BinaryViewType;
941    use crate::{
942        Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
943    };
944    use arrow_buffer::{Buffer, ScalarBuffer};
945    use arrow_data::ByteView;
946
947    #[test]
948    fn try_new_string() {
949        let array = StringViewArray::from_iter_values(vec![
950            "hello",
951            "world",
952            "lulu",
953            "large payload over 12 bytes",
954        ]);
955        assert_eq!(array.value(0), "hello");
956        assert_eq!(array.value(3), "large payload over 12 bytes");
957    }
958
959    #[test]
960    fn try_new_binary() {
961        let array = BinaryViewArray::from_iter_values(vec![
962            b"hello".as_slice(),
963            b"world".as_slice(),
964            b"lulu".as_slice(),
965            b"large payload over 12 bytes".as_slice(),
966        ]);
967        assert_eq!(array.value(0), b"hello");
968        assert_eq!(array.value(3), b"large payload over 12 bytes");
969    }
970
971    #[test]
972    fn try_new_empty_string() {
973        // test empty array
974        let array = {
975            let mut builder = StringViewBuilder::new();
976            builder.finish()
977        };
978        assert!(array.is_empty());
979    }
980
981    #[test]
982    fn try_new_empty_binary() {
983        // test empty array
984        let array = {
985            let mut builder = BinaryViewBuilder::new();
986            builder.finish()
987        };
988        assert!(array.is_empty());
989    }
990
991    #[test]
992    fn test_append_string() {
993        // test builder append
994        let array = {
995            let mut builder = StringViewBuilder::new();
996            builder.append_value("hello");
997            builder.append_null();
998            builder.append_option(Some("large payload over 12 bytes"));
999            builder.finish()
1000        };
1001        assert_eq!(array.value(0), "hello");
1002        assert!(array.is_null(1));
1003        assert_eq!(array.value(2), "large payload over 12 bytes");
1004    }
1005
1006    #[test]
1007    fn test_append_binary() {
1008        // test builder append
1009        let array = {
1010            let mut builder = BinaryViewBuilder::new();
1011            builder.append_value(b"hello");
1012            builder.append_null();
1013            builder.append_option(Some(b"large payload over 12 bytes"));
1014            builder.finish()
1015        };
1016        assert_eq!(array.value(0), b"hello");
1017        assert!(array.is_null(1));
1018        assert_eq!(array.value(2), b"large payload over 12 bytes");
1019    }
1020
1021    #[test]
1022    fn test_in_progress_recreation() {
1023        let array = {
1024            // make a builder with small block size.
1025            let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
1026            builder.append_value("large payload over 12 bytes");
1027            builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
1028            builder.finish()
1029        };
1030        assert_eq!(array.value(0), "large payload over 12 bytes");
1031        assert_eq!(array.value(1), "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created");
1032        assert_eq!(2, array.buffers.len());
1033    }
1034
1035    #[test]
1036    #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
1037    fn new_with_invalid_view_data() {
1038        let v = "large payload over 12 bytes";
1039        let view = ByteView::new(13, &v.as_bytes()[0..4])
1040            .with_buffer_index(3)
1041            .with_offset(1);
1042        let views = ScalarBuffer::from(vec![view.into()]);
1043        let buffers = vec![Buffer::from_slice_ref(v)];
1044        StringViewArray::new(views, buffers, None);
1045    }
1046
1047    #[test]
1048    #[should_panic(
1049        expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
1050    )]
1051    fn new_with_invalid_utf8_data() {
1052        let v: Vec<u8> = vec![
1053            // invalid UTF8
1054            0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
1055            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1056        ];
1057        let view = ByteView::new(v.len() as u32, &v[0..4]);
1058        let views = ScalarBuffer::from(vec![view.into()]);
1059        let buffers = vec![Buffer::from_slice_ref(v)];
1060        StringViewArray::new(views, buffers, None);
1061    }
1062
1063    #[test]
1064    #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
1065    fn new_with_invalid_zero_padding() {
1066        let mut data = [0; 12];
1067        data[0] = b'H';
1068        data[11] = 1; // no zero padding
1069
1070        let mut view_buffer = [0; 16];
1071        view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
1072        view_buffer[4..].copy_from_slice(&data);
1073
1074        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1075        let views = ScalarBuffer::from(vec![view.into()]);
1076        let buffers = vec![];
1077        StringViewArray::new(views, buffers, None);
1078    }
1079
1080    #[test]
1081    #[should_panic(expected = "Mismatch between embedded prefix and data")]
1082    fn test_mismatch_between_embedded_prefix_and_data() {
1083        let input_str_1 = "Hello, Rustaceans!";
1084        let input_str_2 = "Hallo, Rustaceans!";
1085        let length = input_str_1.len() as u32;
1086        assert!(input_str_1.len() > 12);
1087
1088        let mut view_buffer = [0; 16];
1089        view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
1090        view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
1091        view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
1092        view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
1093        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1094        let views = ScalarBuffer::from(vec![view.into()]);
1095        let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
1096
1097        StringViewArray::new(views, buffers, None);
1098    }
1099
1100    #[test]
1101    fn test_gc() {
1102        let test_data = [
1103            Some("longer than 12 bytes"),
1104            Some("short"),
1105            Some("t"),
1106            Some("longer than 12 bytes"),
1107            None,
1108            Some("short"),
1109        ];
1110
1111        let array = {
1112            let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1113            test_data.into_iter().for_each(|v| builder.append_option(v));
1114            builder.finish()
1115        };
1116        assert!(array.buffers.len() > 1);
1117
1118        fn check_gc(to_test: &StringViewArray) {
1119            let gc = to_test.gc();
1120            assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1121
1122            to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1123                assert_eq!(a, b);
1124            });
1125            assert_eq!(to_test.len(), gc.len());
1126        }
1127
1128        check_gc(&array);
1129        check_gc(&array.slice(1, 3));
1130        check_gc(&array.slice(2, 1));
1131        check_gc(&array.slice(2, 2));
1132        check_gc(&array.slice(3, 1));
1133    }
1134
1135    #[test]
1136    fn test_eq() {
1137        let test_data = [
1138            Some("longer than 12 bytes"),
1139            None,
1140            Some("short"),
1141            Some("again, this is longer than 12 bytes"),
1142        ];
1143
1144        let array1 = {
1145            let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
1146            test_data.into_iter().for_each(|v| builder.append_option(v));
1147            builder.finish()
1148        };
1149        let array2 = {
1150            // create a new array with the same data but different layout
1151            let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
1152            test_data.into_iter().for_each(|v| builder.append_option(v));
1153            builder.finish()
1154        };
1155        assert_eq!(array1, array1.clone());
1156        assert_eq!(array2, array2.clone());
1157        assert_eq!(array1, array2);
1158    }
1159
1160    /// Integration tests for `inline_key_fast` covering:
1161    ///
1162    /// 1. Monotonic ordering across increasing lengths and lexical variations.
1163    /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence.
1164    ///
1165    /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why
1166    /// the length field is required even when all inline bytes fit in 12 bytes.
1167    #[test]
1168    fn test_inline_key_fast_various_lengths_and_lexical() {
1169        /// Helper to create a raw u128 value representing an inline ByteView
1170        /// - `length`: number of meaningful bytes (≤ 12)
1171        /// - `data`: the actual inline data
1172        fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
1173            assert!(length as usize <= 12, "Inline length must be ≤ 12");
1174            assert!(data.len() == length as usize, "Data must match length");
1175
1176            let mut raw_bytes = [0u8; 16];
1177            raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // little-endian length
1178            raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data
1179            u128::from_le_bytes(raw_bytes)
1180        }
1181
1182        // Test inputs: include the specific "bar" vs "bar\0" case, plus length and lexical variations
1183        let test_inputs: Vec<&[u8]> = vec![
1184            b"a",
1185            b"aa",
1186            b"aaa",
1187            b"aab",
1188            b"abcd",
1189            b"abcde",
1190            b"abcdef",
1191            b"abcdefg",
1192            b"abcdefgh",
1193            b"abcdefghi",
1194            b"abcdefghij",
1195            b"abcdefghijk",
1196            b"abcdefghijkl", // 12 bytes, max inline
1197            b"bar",
1198            b"bar\0", // special case to test length tiebreaker
1199            b"xyy",
1200            b"xyz",
1201        ];
1202
1203        // Monotonic key order: content then length,and cross-check against GenericBinaryArray comparison
1204        let array: GenericBinaryArray<i32> =
1205            GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>());
1206
1207        for i in 0..array.len() - 1 {
1208            let v1 = array.value(i);
1209            let v2 = array.value(i + 1);
1210            // Ensure lexical ordering matches
1211            assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");
1212            // Ensure fast key compare matches
1213            let key1 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1214                v1.len() as u32,
1215                v1,
1216            ));
1217            let key2 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1218                v2.len() as u32,
1219                v2,
1220            ));
1221            assert!(
1222                key1 < key2,
1223                "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
1224            );
1225        }
1226    }
1227}