Skip to main content

arrow_array/array/
byte_view_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
26use arrow_schema::{ArrowError, DataType};
27use core::str;
28use num_traits::ToPrimitive;
29use std::any::Any;
30use std::cmp::Ordering;
31use std::fmt::Debug;
32use std::marker::PhantomData;
33use std::sync::Arc;
34
35use super::ByteArrayType;
36
37/// [Variable-size Binary View Layout]: An array of variable length bytes views.
38///
39/// This array type is used to store variable length byte data (e.g. Strings, Binary)
40/// and has efficient operations such as `take`, `filter`, and comparison.
41///
42/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
43///
44/// This is different from [`GenericByteArray`], which also stores variable
45/// length byte data, as it represents strings with an offset and length. `take`
46/// and `filter` like operations are implemented by manipulating the "views"
47/// (`u128`) without modifying the bytes. Each view also stores an inlined
48/// prefix which speed up comparisons.
49///
50/// # See Also
51///
52/// * [`StringViewArray`] for storing utf8 encoded string data
53/// * [`BinaryViewArray`] for storing bytes
54/// * [`ByteView`] to interpret `u128`s layout of the views.
55///
56/// [`ByteView`]: arrow_data::ByteView
57///
58/// # Layout: "views" and buffers
59///
60/// A `GenericByteViewArray` stores variable length byte strings. An array of
61/// `N` elements is stored as `N` fixed length "views" and a variable number
62/// of variable length "buffers".
63///
64/// Each view is a `u128` value whose layout is different depending on the
65/// length of the string stored at that location:
66///
67/// ```text
68///                         ┌──────┬────────────────────────┐
69///                         │length│      string value      │
70///    Strings (len <= 12)  │      │    (padded with 0)     │
71///                         └──────┴────────────────────────┘
72///                          0    31                      127
73///
74///                         ┌───────┬───────┬───────┬───────┐
75///                         │length │prefix │  buf  │offset │
76///    Strings (len > 12)   │       │       │ index │       │
77///                         └───────┴───────┴───────┴───────┘
78///                          0    31       63      95    127
79/// ```
80///
81/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
82///   the view. See [`Self::inline_value`] to access the inlined prefix from a
83///   short view.
84///
85/// * Strings with length > 12: The first four bytes are stored inline in the
86///   view and the entire string is stored in one of the buffers. See [`ByteView`]
87///   to access the fields of the these views.
88///
89/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
90/// the easiest and fastest way to work with this data. However, it is possible
91/// to access the views and buffers directly for more control.
92///
93/// For example
94///
95/// ```rust
96/// # use arrow_array::StringViewArray;
97/// # use arrow_array::Array;
98/// use arrow_data::ByteView;
99/// let array = StringViewArray::from(vec![
100///   "hello",
101///   "this string is longer than 12 bytes",
102///   "this string is also longer than 12 bytes"
103/// ]);
104///
105/// // ** Examine the first view (short string) **
106/// assert!(array.is_valid(0)); // Check for nulls
107/// let short_view: u128 = array.views()[0]; // "hello"
108/// // get length of the string
109/// let len = short_view as u32;
110/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view
111/// // SAFETY: `view` is a valid view
112/// let value = unsafe {
113///   StringViewArray::inline_value(&short_view, len as usize)
114/// };
115/// assert_eq!(value, b"hello");
116///
117/// // ** Examine the third view (long string) **
118/// assert!(array.is_valid(12)); // Check for nulls
119/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
120/// let len = long_view as u32;
121/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
122/// let view = ByteView::from(long_view); // use ByteView to access the fields
123/// assert_eq!(view.length, 40);
124/// assert_eq!(view.buffer_index, 0);
125/// assert_eq!(view.offset, 35); // data starts after the first long string
126/// // Views for long strings store a 4 byte prefix
127/// let prefix = view.prefix.to_le_bytes();
128/// assert_eq!(&prefix, b"this");
129/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
130/// assert_eq!(value, "this string is also longer than 12 bytes");
131/// ```
132///
133/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
134/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
135///
136/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
137/// than they must point into a valid buffer. However, they can be out of order,
138/// non continuous and overlapping.
139///
140/// For example, in the following diagram, the strings "FishWasInTownToday" and
141/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
142/// separate buffer while the string "LavaMonster" is stored inlined in the
143/// view. In this case, the same bytes for "Fish" are used to store both strings.
144///
145/// [`ByteView`]: arrow_data::ByteView
146///
147/// ```text
148///                                                                            ┌───┐
149///                         ┌──────┬──────┬──────┬──────┐               offset │...│
150/// "FishWasInTownTodayYay" │  21  │ Fish │  0   │ 115  │─ ─              103  │Mr.│
151///                         └──────┴──────┴──────┴──────┘   │      ┌ ─ ─ ─ ─ ▶ │Cru│
152///                         ┌──────┬──────┬──────┬──────┐                      │mpl│
153/// "CrumpleFacedFish"      │  16  │ Crum │  0   │ 103  │─ ─│─ ─ ─ ┘           │eFa│
154///                         └──────┴──────┴──────┴──────┘                      │ced│
155///                         ┌──────┬────────────────────┐   └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
156/// "LavaMonster"           │  11  │   LavaMonster      │                      │hWa│
157///                         └──────┴────────────────────┘               offset │sIn│
158///                                                                       115  │Tow│
159///                                                                            │nTo│
160///                                                                            │day│
161///                                  u128 "views"                              │Yay│
162///                                                                   buffer 0 │...│
163///                                                                            └───┘
164/// ```
165pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
166    data_type: DataType,
167    views: ScalarBuffer<u128>,
168    buffers: Arc<[Buffer]>,
169    phantom: PhantomData<T>,
170    nulls: Option<NullBuffer>,
171}
172
173impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
174    fn clone(&self) -> Self {
175        Self {
176            data_type: T::DATA_TYPE,
177            views: self.views.clone(),
178            buffers: self.buffers.clone(),
179            nulls: self.nulls.clone(),
180            phantom: Default::default(),
181        }
182    }
183}
184
185impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
186    /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
187    ///
188    /// # Panics
189    ///
190    /// Panics if [`GenericByteViewArray::try_new`] returns an error
191    pub fn new<U>(views: ScalarBuffer<u128>, buffers: U, nulls: Option<NullBuffer>) -> Self
192    where
193        U: Into<Arc<[Buffer]>>,
194    {
195        Self::try_new(views, buffers, nulls).unwrap()
196    }
197
198    /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
199    ///
200    /// # Errors
201    ///
202    /// * `views.len() != nulls.len()`
203    /// * [ByteViewType::validate] fails
204    pub fn try_new<U>(
205        views: ScalarBuffer<u128>,
206        buffers: U,
207        nulls: Option<NullBuffer>,
208    ) -> Result<Self, ArrowError>
209    where
210        U: Into<Arc<[Buffer]>>,
211    {
212        let buffers: Arc<[Buffer]> = buffers.into();
213
214        T::validate(&views, &buffers)?;
215
216        if let Some(n) = nulls.as_ref() {
217            if n.len() != views.len() {
218                return Err(ArrowError::InvalidArgumentError(format!(
219                    "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
220                    T::PREFIX,
221                    views.len(),
222                    n.len(),
223                )));
224            }
225        }
226
227        Ok(Self {
228            data_type: T::DATA_TYPE,
229            views,
230            buffers,
231            nulls,
232            phantom: Default::default(),
233        })
234    }
235
236    /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
237    ///
238    /// # Safety
239    ///
240    /// Safe if [`Self::try_new`] would not error
241    pub unsafe fn new_unchecked<U>(
242        views: ScalarBuffer<u128>,
243        buffers: U,
244        nulls: Option<NullBuffer>,
245    ) -> Self
246    where
247        U: Into<Arc<[Buffer]>>,
248    {
249        if cfg!(feature = "force_validate") {
250            return Self::new(views, buffers, nulls);
251        }
252
253        Self {
254            data_type: T::DATA_TYPE,
255            phantom: Default::default(),
256            views,
257            buffers: buffers.into(),
258            nulls,
259        }
260    }
261
262    /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
263    pub fn new_null(len: usize) -> Self {
264        Self {
265            data_type: T::DATA_TYPE,
266            views: vec![0; len].into(),
267            buffers: vec![].into(),
268            nulls: Some(NullBuffer::new_null(len)),
269            phantom: Default::default(),
270        }
271    }
272
273    /// Create a new [`Scalar`] from `value`
274    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
275        Scalar::new(Self::from_iter_values(std::iter::once(value)))
276    }
277
278    /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
279    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
280    where
281        Ptr: AsRef<T::Native>,
282        I: IntoIterator<Item = Ptr>,
283    {
284        let iter = iter.into_iter();
285        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
286        for v in iter {
287            builder.append_value(v);
288        }
289        builder.finish()
290    }
291
292    /// Deconstruct this array into its constituent parts
293    pub fn into_parts(self) -> (ScalarBuffer<u128>, Arc<[Buffer]>, Option<NullBuffer>) {
294        (self.views, self.buffers, self.nulls)
295    }
296
297    /// Returns the views buffer
298    #[inline]
299    pub fn views(&self) -> &ScalarBuffer<u128> {
300        &self.views
301    }
302
303    /// Returns the buffers storing string data
304    #[inline]
305    pub fn data_buffers(&self) -> &[Buffer] {
306        &self.buffers
307    }
308
309    /// Returns the element at index `i`
310    ///
311    /// Note: This method does not check for nulls and the value is arbitrary
312    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
313    ///
314    /// # Panics
315    /// Panics if index `i` is out of bounds.
316    pub fn value(&self, i: usize) -> &T::Native {
317        assert!(
318            i < self.len(),
319            "Trying to access an element at index {} from a {}ViewArray of length {}",
320            i,
321            T::PREFIX,
322            self.len()
323        );
324
325        unsafe { self.value_unchecked(i) }
326    }
327
328    /// Returns the element at index `i` without bounds checking
329    ///
330    /// Note: This method does not check for nulls and the value is arbitrary
331    /// if [`is_null`](Self::is_null) returns true for the index.
332    ///
333    /// # Safety
334    ///
335    /// Caller is responsible for ensuring that the index is within the bounds
336    /// of the array
337    pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
338        let v = unsafe { self.views.get_unchecked(idx) };
339        let len = *v as u32;
340        let b = if len <= MAX_INLINE_VIEW_LEN {
341            unsafe { Self::inline_value(v, len as usize) }
342        } else {
343            let view = ByteView::from(*v);
344            let data = unsafe { self.buffers.get_unchecked(view.buffer_index as usize) };
345            let offset = view.offset as usize;
346            unsafe { data.get_unchecked(offset..offset + len as usize) }
347        };
348        unsafe { T::Native::from_bytes_unchecked(b) }
349    }
350
351    /// Returns the first `len` bytes the inline value of the view.
352    ///
353    /// # Safety
354    /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
355    /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
356    #[inline(always)]
357    pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
358        debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
359        unsafe {
360            std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
361        }
362    }
363
364    /// Constructs a new iterator for iterating over the values of this array
365    pub fn iter(&self) -> ArrayIter<&Self> {
366        ArrayIter::new(self)
367    }
368
369    /// Returns an iterator over the bytes of this array, including null values
370    pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
371        self.views.iter().map(move |v| {
372            let len = *v as u32;
373            if len <= MAX_INLINE_VIEW_LEN {
374                unsafe { Self::inline_value(v, len as usize) }
375            } else {
376                let view = ByteView::from(*v);
377                let data = &self.buffers[view.buffer_index as usize];
378                let offset = view.offset as usize;
379                unsafe { data.get_unchecked(offset..offset + len as usize) }
380            }
381        })
382    }
383
384    /// Returns an iterator over the first `prefix_len` bytes of each array
385    /// element, including null values.
386    ///
387    /// If `prefix_len` is larger than the element's length, the iterator will
388    /// return an empty slice (`&[]`).
389    pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
390        self.views().into_iter().map(move |v| {
391            let len = (*v as u32) as usize;
392
393            if len < prefix_len {
394                return &[] as &[u8];
395            }
396
397            if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
398                unsafe { StringViewArray::inline_value(v, prefix_len) }
399            } else {
400                let view = ByteView::from(*v);
401                let data = unsafe {
402                    self.data_buffers()
403                        .get_unchecked(view.buffer_index as usize)
404                };
405                let offset = view.offset as usize;
406                unsafe { data.get_unchecked(offset..offset + prefix_len) }
407            }
408        })
409    }
410
411    /// Returns an iterator over the last `suffix_len` bytes of each array
412    /// element, including null values.
413    ///
414    /// Note that for [`StringViewArray`] the last bytes may start in the middle
415    /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
416    ///
417    /// If `suffix_len` is larger than the element's length, the iterator will
418    /// return an empty slice (`&[]`).
419    pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
420        self.views().into_iter().map(move |v| {
421            let len = (*v as u32) as usize;
422
423            if len < suffix_len {
424                return &[] as &[u8];
425            }
426
427            if len as u32 <= MAX_INLINE_VIEW_LEN {
428                unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
429            } else {
430                let view = ByteView::from(*v);
431                let data = unsafe {
432                    self.data_buffers()
433                        .get_unchecked(view.buffer_index as usize)
434                };
435                let offset = view.offset as usize;
436                unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
437            }
438        })
439    }
440
441    /// Return an iterator over the length of each array element, including null values.
442    ///
443    /// Null values length would equal to the underlying bytes length and NOT 0
444    ///
445    /// Example of getting 0 for null values
446    /// ```rust
447    /// # use arrow_array::StringViewArray;
448    /// # use arrow_array::Array;
449    /// use arrow_data::ByteView;
450    ///
451    /// fn lengths_with_zero_for_nulls(view: &StringViewArray) -> impl Iterator<Item = u32> {
452    ///     view.lengths()
453    ///         .enumerate()
454    ///         .map(|(index, length)| if view.is_null(index) { 0 } else { length })
455    /// }
456    /// ```
457    pub fn lengths(&self) -> impl ExactSizeIterator<Item = u32> + Clone {
458        self.views().iter().map(|v| *v as u32)
459    }
460
461    /// Returns a zero-copy slice of this array with the indicated offset and length.
462    pub fn slice(&self, offset: usize, length: usize) -> Self {
463        Self {
464            data_type: T::DATA_TYPE,
465            views: self.views.slice(offset, length),
466            buffers: self.buffers.clone(),
467            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
468            phantom: Default::default(),
469        }
470    }
471
472    /// Returns a "compacted" version of this array
473    ///
474    /// The original array will *not* be modified
475    ///
476    /// # Garbage Collection
477    ///
478    /// Before GC:
479    /// ```text
480    ///                                        ┌──────┐
481    ///                                        │......│
482    ///                                        │......│
483    /// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer
484    /// │       View 1       │─ ─ ─ ─          │......│  with data that
485    /// ├────────────────────┤                 │......│ is not referred
486    /// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
487    /// └────────────────────┘                 │......│      View 2
488    ///                                        │......│
489    ///    2 views, refer to                   │......│
490    ///   small portions of a                  └──────┘
491    ///      large buffer
492    /// ```
493    ///
494    /// After GC:
495    ///
496    /// ```text
497    /// ┌────────────────────┐                 ┌─────┐    After gc, only
498    /// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is
499    /// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by
500    /// │       View 2       │─ ─ ─ ─          └─────┘     the views is
501    /// └────────────────────┘                                 left
502    ///
503    ///
504    ///         2 views
505    /// ```
506    /// This method will compact the data buffers by recreating the view array and only include the data
507    /// that is pointed to by the views.
508    ///
509    /// Note that it will copy the array regardless of whether the original array is compact.
510    /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
511    /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
512    ///
513    /// Note: this function does not attempt to canonicalize / deduplicate values. For this
514    /// feature see  [`GenericByteViewBuilder::with_deduplicate_strings`].
515    pub fn gc(&self) -> Self {
516        // 1) Read basic properties once
517        let len = self.len(); // number of elements
518        let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap
519
520        // 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks
521        if self.data_buffers().is_empty() {
522            return unsafe {
523                GenericByteViewArray::new_unchecked(
524                    self.views().clone(),
525                    vec![], // empty data blocks
526                    nulls,
527                )
528            };
529        }
530
531        // 2) Calculate total size of all non-inline data and detect if any exists
532        let total_large = self.total_buffer_bytes_used();
533
534        // 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing
535        if total_large == 0 {
536            // Views are inline-only or all null; just reuse original views and no data blocks
537            return unsafe {
538                GenericByteViewArray::new_unchecked(
539                    self.views().clone(),
540                    vec![], // empty data blocks
541                    nulls,
542                )
543            };
544        }
545
546        let (views_buf, data_blocks) = if total_large < i32::MAX as usize {
547            // fast path, the entire data fits in a single buffer
548            // 3) Allocate exactly capacity for all non-inline data
549            let mut data_buf = Vec::with_capacity(total_large);
550
551            // 4) Iterate over views and process each inline/non-inline view
552            let views_buf: Vec<u128> = (0..len)
553                .map(|i| unsafe { self.copy_view_to_buffer(i, 0, &mut data_buf) })
554                .collect();
555            let data_block = Buffer::from_vec(data_buf);
556            let data_blocks = vec![data_block];
557            (views_buf, data_blocks)
558        } else {
559            // slow path, need to split into multiple buffers
560
561            struct GcCopyGroup {
562                total_buffer_bytes: usize,
563                total_len: usize,
564            }
565
566            impl GcCopyGroup {
567                fn new(total_buffer_bytes: u32, total_len: usize) -> Self {
568                    Self {
569                        total_buffer_bytes: total_buffer_bytes as usize,
570                        total_len,
571                    }
572                }
573            }
574
575            let mut groups = Vec::new();
576            let mut current_length = 0;
577            let mut current_elements = 0;
578
579            for view in self.views() {
580                let len = *view as u32;
581                if len > MAX_INLINE_VIEW_LEN {
582                    if current_length + len > i32::MAX as u32 {
583                        // Start a new group
584                        groups.push(GcCopyGroup::new(current_length, current_elements));
585                        current_length = 0;
586                        current_elements = 0;
587                    }
588                    current_length += len;
589                    current_elements += 1;
590                }
591            }
592            if current_elements != 0 {
593                groups.push(GcCopyGroup::new(current_length, current_elements));
594            }
595            debug_assert!(groups.len() <= i32::MAX as usize);
596
597            // 3) Copy the buffers group by group
598            let mut views_buf = Vec::with_capacity(len);
599            let mut data_blocks = Vec::with_capacity(groups.len());
600
601            let mut current_view_idx = 0;
602
603            for (group_idx, gc_copy_group) in groups.iter().enumerate() {
604                let mut data_buf = Vec::with_capacity(gc_copy_group.total_buffer_bytes);
605
606                // Directly push views to avoid intermediate Vec allocation
607                let new_views = (current_view_idx..current_view_idx + gc_copy_group.total_len).map(
608                    |view_idx| {
609                        // safety: the view index came from iterating over valid range
610                        unsafe {
611                            self.copy_view_to_buffer(view_idx, group_idx as i32, &mut data_buf)
612                        }
613                    },
614                );
615                views_buf.extend(new_views);
616
617                data_blocks.push(Buffer::from_vec(data_buf));
618                current_view_idx += gc_copy_group.total_len;
619            }
620            (views_buf, data_blocks)
621        };
622
623        // 5) Wrap up views buffer
624        let views_scalar = ScalarBuffer::from(views_buf);
625
626        // SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized
627        unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) }
628    }
629
630    /// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer.
631    ///
632    /// # Safety
633    ///
634    /// - `i < self.len()`.
635    /// - Every element in `self.views()` must currently refer to a valid slice
636    ///   inside one of `self.buffers`.
637    /// - `data_buf` must be ready to have additional bytes appended.
638    /// - After this call, the returned view will have its
639    ///   `buffer_index` reset to `buffer_idx` and its `offset` updated so that it points
640    ///   into the bytes just appended at the end of `data_buf`.
641    #[inline(always)]
642    unsafe fn copy_view_to_buffer(
643        &self,
644        i: usize,
645        buffer_idx: i32,
646        data_buf: &mut Vec<u8>,
647    ) -> u128 {
648        // SAFETY: `i < self.len()` ensures this is in‑bounds.
649        let raw_view = unsafe { *self.views().get_unchecked(i) };
650        let mut bv = ByteView::from(raw_view);
651
652        // Inline‑small views stay as‑is.
653        if bv.length <= MAX_INLINE_VIEW_LEN {
654            raw_view
655        } else {
656            // SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length`
657            // must both lie within valid ranges for `self.buffers`.
658            let buffer = unsafe { self.buffers.get_unchecked(bv.buffer_index as usize) };
659            let start = bv.offset as usize;
660            let end = start + bv.length as usize;
661            let slice = unsafe { buffer.get_unchecked(start..end) };
662
663            // Copy out‑of‑line data into our single “0” buffer.
664            let new_offset = data_buf.len() as u32;
665            data_buf.extend_from_slice(slice);
666
667            bv.buffer_index = buffer_idx as u32;
668            bv.offset = new_offset;
669            bv.into()
670        }
671    }
672
673    /// Returns the total number of bytes used by all non inlined views in all
674    /// buffers.
675    ///
676    /// Note this does not account for views that point at the same underlying
677    /// data in buffers
678    ///
679    /// For example, if the array has three strings views:
680    /// * View with length = 9 (inlined)
681    /// * View with length = 32 (non inlined)
682    /// * View with length = 16 (non inlined)
683    ///
684    /// Then this method would report 48
685    pub fn total_buffer_bytes_used(&self) -> usize {
686        self.views()
687            .iter()
688            .map(|v| {
689                let len = *v as u32;
690                if len > MAX_INLINE_VIEW_LEN {
691                    len as usize
692                } else {
693                    0
694                }
695            })
696            .sum()
697    }
698
699    /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
700    ///
701    /// Comparing two ByteView types are non-trivial.
702    /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
703    ///
704    /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
705    /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
706    ///     Meaning that reading one array element requires only one memory access
707    ///     (two memory access required for StringArray, one for offset buffer, the other for value buffer).
708    ///
709    /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
710    ///     thanks to the inlined 4 bytes.
711    ///     Consider equality check:
712    ///     If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
713    ///
714    /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
715    /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
716    ///   e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
717    ///
718    /// # Order check flow
719    /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
720    /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
721    ///     (2.1) if the inlined 4 bytes are different, we can return the result immediately.
722    ///     (2.2) o.w., we need to compare the full string.
723    ///
724    /// # Safety
725    /// The left/right_idx must within range of each array
726    pub unsafe fn compare_unchecked(
727        left: &GenericByteViewArray<T>,
728        left_idx: usize,
729        right: &GenericByteViewArray<T>,
730        right_idx: usize,
731    ) -> Ordering {
732        let l_view = unsafe { left.views().get_unchecked(left_idx) };
733        let l_byte_view = ByteView::from(*l_view);
734
735        let r_view = unsafe { right.views().get_unchecked(right_idx) };
736        let r_byte_view = ByteView::from(*r_view);
737
738        let l_len = l_byte_view.length;
739        let r_len = r_byte_view.length;
740
741        if l_len <= 12 && r_len <= 12 {
742            return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view));
743        }
744
745        // one of the string is larger than 12 bytes,
746        // we then try to compare the inlined data first
747
748        // Note: In theory, ByteView is only used for string which is larger than 12 bytes,
749        // but we can still use it to get the inlined prefix for shorter strings.
750        // The prefix is always the first 4 bytes of the view, for both short and long strings.
751        let l_inlined_be = l_byte_view.prefix.swap_bytes();
752        let r_inlined_be = r_byte_view.prefix.swap_bytes();
753        if l_inlined_be != r_inlined_be {
754            return l_inlined_be.cmp(&r_inlined_be);
755        }
756
757        // unfortunately, we need to compare the full data
758        let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
759        let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
760
761        l_full_data.cmp(r_full_data)
762    }
763
764    /// Builds a 128-bit composite key for an inline value:
765    ///
766    /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting).
767    /// - Low  32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings
768    ///   (or those with fewer meaningful bytes) always numerically sort before longer ones.
769    ///
770    /// This function extracts the length and the 12-byte inline string data from the raw
771    /// little-endian `u128` representation, converts them to big-endian ordering, and packs them
772    /// into a single `u128` value suitable for fast, branchless comparisons.
773    ///
774    /// # Why include length?
775    ///
776    /// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes
777    /// compare equal—either because one is a true prefix of the other or because zero-padding
778    /// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare
779    /// handles both content and length in one go.
780    ///
781    /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes)
782    ///
783    /// | String     | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding)    |
784    /// |------------|-----------------------|---------------------------------|
785    /// | `"bar"`   | `03 00 00 00`         | `62 61 72` + 9 × `00`           |
786    /// | `"bar\0"`| `04 00 00 00`         | `62 61 72 00` + 8 × `00`        |
787    ///
788    /// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field
789    /// then differentiates:
790    ///
791    /// ```text
792    /// key("bar")   = 0x0000000000000000000062617200000003
793    /// key("bar\0") = 0x0000000000000000000062617200000004
794    /// ⇒ key("bar") < key("bar\0")
795    /// ```
796    /// - `raw` is treated as a 128-bit integer with its bits laid out as follows:
797    ///   - bits 0–31: length (little-endian)
798    ///   - bits 32–127: data (little-endian)
799    ///
800    /// # Inlining and Endianness
801    ///
802    /// This function uses platform-independent bitwise operations to construct a 128-bit key:
803    /// - `raw.swap_bytes() << 32` effectively clears the length bits and shifts the 12-byte inline data
804    ///   into the high 96 bits in Big-Endian order. This ensures the first byte of the string
805    ///   is the most significant byte of the resulting `u128`.
806    /// - `raw as u32` extracts the length as a numeric integer, which is then placed in the low 32 bits.
807    #[inline(always)]
808    pub fn inline_key_fast(raw: u128) -> u128 {
809        (raw.swap_bytes() << 32) | (raw as u32 as u128)
810    }
811}
812
813impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
814    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
815        write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
816        print_long_array(self, f, |array, index, f| {
817            std::fmt::Debug::fmt(&array.value(index), f)
818        })?;
819        write!(f, "]")
820    }
821}
822
823/// SAFETY: Correctly implements the contract of Arrow Arrays
824unsafe impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
825    fn as_any(&self) -> &dyn Any {
826        self
827    }
828
829    fn to_data(&self) -> ArrayData {
830        self.clone().into()
831    }
832
833    fn into_data(self) -> ArrayData {
834        self.into()
835    }
836
837    fn data_type(&self) -> &DataType {
838        &self.data_type
839    }
840
841    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
842        Arc::new(self.slice(offset, length))
843    }
844
845    fn len(&self) -> usize {
846        self.views.len()
847    }
848
849    fn is_empty(&self) -> bool {
850        self.views.is_empty()
851    }
852
853    fn shrink_to_fit(&mut self) {
854        self.views.shrink_to_fit();
855
856        // The goal of `shrink_to_fit` is to minimize the space used by any of
857        // its allocations. The use of `Arc::get_mut` over `Arc::make_mut` is
858        // because if the reference count is greater than 1, `Arc::make_mut`
859        // will first clone its contents. So, any large allocations will first
860        // be cloned before being shrunk, leaving the pre-cloned allocations
861        // intact, before adding the extra (used) space of the new clones.
862        if let Some(buffers) = Arc::get_mut(&mut self.buffers) {
863            buffers.iter_mut().for_each(|b| b.shrink_to_fit());
864        }
865
866        // With the assumption that this is a best-effort function, no attempt
867        // is made to shrink `self.buffers`, which it can't because it's type
868        // does not expose a `shrink_to_fit` method.
869
870        if let Some(nulls) = &mut self.nulls {
871            nulls.shrink_to_fit();
872        }
873    }
874
875    fn offset(&self) -> usize {
876        0
877    }
878
879    fn nulls(&self) -> Option<&NullBuffer> {
880        self.nulls.as_ref()
881    }
882
883    fn logical_null_count(&self) -> usize {
884        // More efficient that the default implementation
885        self.null_count()
886    }
887
888    fn get_buffer_memory_size(&self) -> usize {
889        let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
890        sum += self.views.inner().capacity();
891        if let Some(x) = &self.nulls {
892            sum += x.buffer().capacity()
893        }
894        sum
895    }
896
897    fn get_array_memory_size(&self) -> usize {
898        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
899    }
900}
901
902impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
903    type Item = &'a T::Native;
904
905    fn value(&self, index: usize) -> Self::Item {
906        GenericByteViewArray::value(self, index)
907    }
908
909    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
910        unsafe { GenericByteViewArray::value_unchecked(self, index) }
911    }
912}
913
914impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
915    type Item = Option<&'a T::Native>;
916    type IntoIter = ArrayIter<Self>;
917
918    fn into_iter(self) -> Self::IntoIter {
919        ArrayIter::new(self)
920    }
921}
922
923impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
924    fn from(data: ArrayData) -> Self {
925        let (data_type, len, nulls, offset, buffers, _child_data) = data.into_parts();
926        assert_eq!(
927            data_type,
928            T::DATA_TYPE,
929            "Mismatched data type, expected {}, got {data_type}",
930            T::DATA_TYPE
931        );
932        let mut buffers = buffers.into_iter();
933        // first buffer is views, remaining are data buffers
934        let views = ScalarBuffer::new(buffers.next().unwrap(), offset, len);
935        Self {
936            data_type,
937            views,
938            buffers: Arc::from_iter(buffers),
939            nulls,
940            phantom: Default::default(),
941        }
942    }
943}
944
945/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
946///
947/// For example this method can convert a [`StringArray`] to a
948/// [`StringViewArray`].
949///
950/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
951/// is built without copying the underlying string data (views are created
952/// directly into the existing buffer)
953///
954/// [`StringArray`]: crate::StringArray
955impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
956where
957    FROM: ByteArrayType,
958    FROM::Offset: OffsetSizeTrait + ToPrimitive,
959    V: ByteViewType<Native = FROM::Native>,
960{
961    fn from(byte_array: &GenericByteArray<FROM>) -> Self {
962        let offsets = byte_array.offsets();
963
964        let can_reuse_buffer = match offsets.last() {
965            Some(offset) => offset.as_usize() < u32::MAX as usize,
966            None => true,
967        };
968
969        if can_reuse_buffer {
970            // build views directly pointing to the existing buffer
971            let len = byte_array.len();
972            let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
973            let str_values_buf = byte_array.values().clone();
974            let block = views_builder.append_block(str_values_buf);
975            for (i, w) in offsets.windows(2).enumerate() {
976                let offset = w[0].as_usize();
977                let end = w[1].as_usize();
978                let length = end - offset;
979
980                if byte_array.is_null(i) {
981                    views_builder.append_null();
982                } else {
983                    // Safety: the input was a valid array so it valid UTF8 (if string). And
984                    // all offsets were valid
985                    unsafe {
986                        views_builder.append_view_unchecked(block, offset as u32, length as u32)
987                    }
988                }
989            }
990            assert_eq!(views_builder.len(), len);
991            views_builder.finish()
992        } else {
993            // Otherwise, create a new buffer for large strings
994            // TODO: the original buffer could still be used
995            // by making multiple slices of u32::MAX length
996            GenericByteViewArray::<V>::from_iter(byte_array.iter())
997        }
998    }
999}
1000
1001impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
1002    fn from(array: GenericByteViewArray<T>) -> Self {
1003        let len = array.len();
1004
1005        let mut buffers = array.buffers.to_vec();
1006        buffers.insert(0, array.views.into_inner());
1007
1008        let builder = ArrayDataBuilder::new(T::DATA_TYPE)
1009            .len(len)
1010            .buffers(buffers)
1011            .nulls(array.nulls);
1012
1013        unsafe { builder.build_unchecked() }
1014    }
1015}
1016
1017impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
1018where
1019    Ptr: AsRef<T::Native> + 'a,
1020    T: ByteViewType + ?Sized,
1021{
1022    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
1023        iter.into_iter()
1024            .map(|o| o.as_ref().map(|p| p.as_ref()))
1025            .collect()
1026    }
1027}
1028
1029impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
1030where
1031    Ptr: AsRef<T::Native>,
1032{
1033    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
1034        let iter = iter.into_iter();
1035        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
1036        builder.extend(iter);
1037        builder.finish()
1038    }
1039}
1040
1041/// A [`GenericByteViewArray`] of `[u8]`
1042///
1043/// See [`GenericByteViewArray`] for format and layout details.
1044///
1045/// # Example
1046/// ```
1047/// use arrow_array::BinaryViewArray;
1048/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
1049/// assert_eq!(array.value(0), b"hello");
1050/// assert_eq!(array.value(3), b"large payload over 12 bytes");
1051/// ```
1052pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
1053
1054impl BinaryViewArray {
1055    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1056    /// If items not utf8 data, validate will fail and error returned.
1057    pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
1058        StringViewType::validate(self.views(), self.data_buffers())?;
1059        unsafe { Ok(self.to_string_view_unchecked()) }
1060    }
1061
1062    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1063    /// # Safety
1064    /// Caller is responsible for ensuring that items in array are utf8 data.
1065    pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
1066        unsafe { StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1067    }
1068}
1069
1070impl From<Vec<&[u8]>> for BinaryViewArray {
1071    fn from(v: Vec<&[u8]>) -> Self {
1072        Self::from_iter_values(v)
1073    }
1074}
1075
1076impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
1077    fn from(v: Vec<Option<&[u8]>>) -> Self {
1078        v.into_iter().collect()
1079    }
1080}
1081
1082/// A [`GenericByteViewArray`] that stores utf8 data
1083///
1084/// See [`GenericByteViewArray`] for format and layout details.
1085///
1086/// # Example
1087/// ```
1088/// use arrow_array::StringViewArray;
1089/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
1090/// assert_eq!(array.value(0), "hello");
1091/// assert_eq!(array.value(3), "large payload over 12 bytes");
1092/// ```
1093pub type StringViewArray = GenericByteViewArray<StringViewType>;
1094
1095impl StringViewArray {
1096    /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
1097    pub fn to_binary_view(self) -> BinaryViewArray {
1098        unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1099    }
1100
1101    /// Returns true if all data within this array is ASCII
1102    pub fn is_ascii(&self) -> bool {
1103        // Alternative (but incorrect): directly check the underlying buffers
1104        // (1) Our string view might be sparse, i.e., a subset of the buffers,
1105        //      so even if the buffer is not ascii, we can still be ascii.
1106        // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
1107        // This means that this operation is quite expensive, shall we cache the result?
1108        //  i.e. track `is_ascii` in the builder.
1109        self.iter().all(|v| match v {
1110            Some(v) => v.is_ascii(),
1111            None => true,
1112        })
1113    }
1114}
1115
1116impl From<Vec<&str>> for StringViewArray {
1117    fn from(v: Vec<&str>) -> Self {
1118        Self::from_iter_values(v)
1119    }
1120}
1121
1122impl From<Vec<Option<&str>>> for StringViewArray {
1123    fn from(v: Vec<Option<&str>>) -> Self {
1124        v.into_iter().collect()
1125    }
1126}
1127
1128impl From<Vec<String>> for StringViewArray {
1129    fn from(v: Vec<String>) -> Self {
1130        Self::from_iter_values(v)
1131    }
1132}
1133
1134impl From<Vec<Option<String>>> for StringViewArray {
1135    fn from(v: Vec<Option<String>>) -> Self {
1136        v.into_iter().collect()
1137    }
1138}
1139
1140#[cfg(test)]
1141mod tests {
1142    use crate::builder::{BinaryViewBuilder, StringViewBuilder};
1143    use crate::types::BinaryViewType;
1144    use crate::{
1145        Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
1146    };
1147    use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
1148    use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
1149    use arrow_schema::DataType;
1150    use rand::prelude::StdRng;
1151    use rand::{Rng, SeedableRng};
1152    use std::str::from_utf8;
1153
1154    const BLOCK_SIZE: u32 = 8;
1155
1156    #[test]
1157    fn try_new_string() {
1158        let array = StringViewArray::from_iter_values(vec![
1159            "hello",
1160            "world",
1161            "lulu",
1162            "large payload over 12 bytes",
1163        ]);
1164        assert_eq!(array.value(0), "hello");
1165        assert_eq!(array.value(3), "large payload over 12 bytes");
1166    }
1167
1168    #[test]
1169    fn try_new_binary() {
1170        let array = BinaryViewArray::from_iter_values(vec![
1171            b"hello".as_slice(),
1172            b"world".as_slice(),
1173            b"lulu".as_slice(),
1174            b"large payload over 12 bytes".as_slice(),
1175        ]);
1176        assert_eq!(array.value(0), b"hello");
1177        assert_eq!(array.value(3), b"large payload over 12 bytes");
1178    }
1179
1180    #[test]
1181    fn try_new_empty_string() {
1182        // test empty array
1183        let array = {
1184            let mut builder = StringViewBuilder::new();
1185            builder.finish()
1186        };
1187        assert!(array.is_empty());
1188    }
1189
1190    #[test]
1191    fn try_new_empty_binary() {
1192        // test empty array
1193        let array = {
1194            let mut builder = BinaryViewBuilder::new();
1195            builder.finish()
1196        };
1197        assert!(array.is_empty());
1198    }
1199
1200    #[test]
1201    fn test_append_string() {
1202        // test builder append
1203        let array = {
1204            let mut builder = StringViewBuilder::new();
1205            builder.append_value("hello");
1206            builder.append_null();
1207            builder.append_option(Some("large payload over 12 bytes"));
1208            builder.finish()
1209        };
1210        assert_eq!(array.value(0), "hello");
1211        assert!(array.is_null(1));
1212        assert_eq!(array.value(2), "large payload over 12 bytes");
1213    }
1214
1215    #[test]
1216    fn test_append_binary() {
1217        // test builder append
1218        let array = {
1219            let mut builder = BinaryViewBuilder::new();
1220            builder.append_value(b"hello");
1221            builder.append_null();
1222            builder.append_option(Some(b"large payload over 12 bytes"));
1223            builder.finish()
1224        };
1225        assert_eq!(array.value(0), b"hello");
1226        assert!(array.is_null(1));
1227        assert_eq!(array.value(2), b"large payload over 12 bytes");
1228    }
1229
1230    #[test]
1231    fn test_in_progress_recreation() {
1232        let array = {
1233            // make a builder with small block size.
1234            let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
1235            builder.append_value("large payload over 12 bytes");
1236            builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
1237            builder.finish()
1238        };
1239        assert_eq!(array.value(0), "large payload over 12 bytes");
1240        assert_eq!(
1241            array.value(1),
1242            "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"
1243        );
1244        assert_eq!(2, array.buffers.len());
1245    }
1246
1247    #[test]
1248    #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
1249    fn new_with_invalid_view_data() {
1250        let v = "large payload over 12 bytes";
1251        let view = ByteView::new(13, &v.as_bytes()[0..4])
1252            .with_buffer_index(3)
1253            .with_offset(1);
1254        let views = ScalarBuffer::from(vec![view.into()]);
1255        let buffers = vec![Buffer::from_slice_ref(v)];
1256        StringViewArray::new(views, buffers, None);
1257    }
1258
1259    #[test]
1260    #[should_panic(
1261        expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
1262    )]
1263    fn new_with_invalid_utf8_data() {
1264        let v: Vec<u8> = vec![
1265            // invalid UTF8
1266            0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
1267            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1268        ];
1269        let view = ByteView::new(v.len() as u32, &v[0..4]);
1270        let views = ScalarBuffer::from(vec![view.into()]);
1271        let buffers = vec![Buffer::from_slice_ref(v)];
1272        StringViewArray::new(views, buffers, None);
1273    }
1274
1275    #[test]
1276    #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
1277    fn new_with_invalid_zero_padding() {
1278        let mut data = [0; 12];
1279        data[0] = b'H';
1280        data[11] = 1; // no zero padding
1281
1282        let mut view_buffer = [0; 16];
1283        view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
1284        view_buffer[4..].copy_from_slice(&data);
1285
1286        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1287        let views = ScalarBuffer::from(vec![view.into()]);
1288        let buffers = vec![];
1289        StringViewArray::new(views, buffers, None);
1290    }
1291
1292    #[test]
1293    #[should_panic(expected = "Mismatch between embedded prefix and data")]
1294    fn test_mismatch_between_embedded_prefix_and_data() {
1295        let input_str_1 = "Hello, Rustaceans!";
1296        let input_str_2 = "Hallo, Rustaceans!";
1297        let length = input_str_1.len() as u32;
1298        assert!(input_str_1.len() > 12);
1299
1300        let mut view_buffer = [0; 16];
1301        view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
1302        view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
1303        view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
1304        view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
1305        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1306        let views = ScalarBuffer::from(vec![view.into()]);
1307        let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
1308
1309        StringViewArray::new(views, buffers, None);
1310    }
1311
1312    #[test]
1313    fn test_gc() {
1314        let test_data = [
1315            Some("longer than 12 bytes"),
1316            Some("short"),
1317            Some("t"),
1318            Some("longer than 12 bytes"),
1319            None,
1320            Some("short"),
1321        ];
1322
1323        let array = {
1324            let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1325            test_data.into_iter().for_each(|v| builder.append_option(v));
1326            builder.finish()
1327        };
1328        assert!(array.buffers.len() > 1);
1329
1330        fn check_gc(to_test: &StringViewArray) {
1331            let gc = to_test.gc();
1332            assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1333
1334            to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1335                assert_eq!(a, b);
1336            });
1337            assert_eq!(to_test.len(), gc.len());
1338        }
1339
1340        check_gc(&array);
1341        check_gc(&array.slice(1, 3));
1342        check_gc(&array.slice(2, 1));
1343        check_gc(&array.slice(2, 2));
1344        check_gc(&array.slice(3, 1));
1345    }
1346
1347    /// 1) Empty array: no elements, expect gc to return empty with no data buffers
1348    #[test]
1349    fn test_gc_empty_array() {
1350        let array = StringViewBuilder::new()
1351            .with_fixed_block_size(BLOCK_SIZE)
1352            .finish();
1353        let gced = array.gc();
1354        // length and null count remain zero
1355        assert_eq!(gced.len(), 0);
1356        assert_eq!(gced.null_count(), 0);
1357        // no underlying data buffers should be allocated
1358        assert!(
1359            gced.data_buffers().is_empty(),
1360            "Expected no data buffers for empty array"
1361        );
1362    }
1363
1364    /// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values
1365    #[test]
1366    fn test_gc_all_inline() {
1367        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1368        // append many short strings, each exactly INLINE_LEN long
1369        for _ in 0..100 {
1370            let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize);
1371            builder.append_option(Some(&s));
1372        }
1373        let array = builder.finish();
1374        let gced = array.gc();
1375        // Since all views fit inline, data buffer is empty
1376        assert_eq!(
1377            gced.data_buffers().len(),
1378            0,
1379            "Should have no data buffers for inline values"
1380        );
1381        assert_eq!(gced.len(), 100);
1382        // verify element-wise equality
1383        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1384            assert_eq!(orig, got, "Inline value mismatch after gc");
1385        });
1386    }
1387
1388    /// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer
1389    #[test]
1390    fn test_gc_all_large() {
1391        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1392        let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5);
1393        // append multiple large strings
1394        for _ in 0..50 {
1395            builder.append_option(Some(&large_str));
1396        }
1397        let array = builder.finish();
1398        let gced = array.gc();
1399        // New data buffers should be populated (one or more blocks)
1400        assert!(
1401            !gced.data_buffers().is_empty(),
1402            "Expected data buffers for large values"
1403        );
1404        assert_eq!(gced.len(), 50);
1405        // verify that every large string emerges unchanged
1406        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1407            assert_eq!(orig, got, "Large view mismatch after gc");
1408        });
1409    }
1410
1411    /// 4) All null elements: ensure null bitmap handling path is correct
1412    #[test]
1413    fn test_gc_all_nulls() {
1414        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1415        for _ in 0..20 {
1416            builder.append_null();
1417        }
1418        let array = builder.finish();
1419        let gced = array.gc();
1420        // length and null count match
1421        assert_eq!(gced.len(), 20);
1422        assert_eq!(gced.null_count(), 20);
1423        // data buffers remain empty for null-only array
1424        assert!(
1425            gced.data_buffers().is_empty(),
1426            "No data should be stored for nulls"
1427        );
1428    }
1429
1430    /// 5) Random mix of inline, large, and null values with slicing tests
1431    #[test]
1432    fn test_gc_random_mixed_and_slices() {
1433        let mut rng = StdRng::seed_from_u64(42);
1434        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1435        // Keep a Vec of original Option<String> for later comparison
1436        let mut original: Vec<Option<String>> = Vec::new();
1437
1438        for _ in 0..200 {
1439            if rng.random_bool(0.1) {
1440                // 10% nulls
1441                builder.append_null();
1442                original.push(None);
1443            } else {
1444                // random length between 0 and twice the inline limit
1445                let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2));
1446                let s: String = "A".repeat(len as usize);
1447                builder.append_option(Some(&s));
1448                original.push(Some(s));
1449            }
1450        }
1451
1452        let array = builder.finish();
1453        // Test multiple slice ranges to ensure offset logic is correct
1454        for (offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] {
1455            let sliced = array.slice(*offset, *slice_len);
1456            let gced = sliced.gc();
1457            // Build expected slice of Option<&str>
1458            let expected: Vec<Option<&str>> = original[*offset..(*offset + *slice_len)]
1459                .iter()
1460                .map(|opt| opt.as_deref())
1461                .collect();
1462
1463            assert_eq!(gced.len(), *slice_len, "Slice length mismatch");
1464            // Compare element-wise
1465            gced.iter().zip(expected.iter()).for_each(|(got, expect)| {
1466                assert_eq!(got, *expect, "Value mismatch in mixed slice after gc");
1467            });
1468        }
1469    }
1470
1471    #[test]
1472    #[cfg_attr(miri, ignore)] // Takes too long
1473    fn test_gc_huge_array() {
1474        // Construct multiple 128 MiB BinaryView entries so total > 4 GiB
1475        let block_len: usize = 128 * 1024 * 1024; // 128 MiB per view
1476        let num_views: usize = 36;
1477
1478        // Create a single 128 MiB data block with a simple byte pattern
1479        let buffer = Buffer::from_vec(vec![0xAB; block_len]);
1480        let buffer2 = Buffer::from_vec(vec![0xFF; block_len]);
1481
1482        // Append this block and then add many views pointing to it
1483        let mut builder = BinaryViewBuilder::new();
1484        let block_id = builder.append_block(buffer);
1485        for _ in 0..num_views / 2 {
1486            builder
1487                .try_append_view(block_id, 0, block_len as u32)
1488                .expect("append view into 128MiB block");
1489        }
1490        let block_id2 = builder.append_block(buffer2);
1491        for _ in 0..num_views / 2 {
1492            builder
1493                .try_append_view(block_id2, 0, block_len as u32)
1494                .expect("append view into 128MiB block");
1495        }
1496
1497        let array = builder.finish();
1498        let total = array.total_buffer_bytes_used();
1499        assert!(
1500            total > u32::MAX as usize,
1501            "Expected total non-inline bytes to exceed 4 GiB, got {}",
1502            total
1503        );
1504
1505        // Run gc and verify correctness
1506        let gced = array.gc();
1507        assert_eq!(gced.len(), num_views, "Length mismatch after gc");
1508        assert_eq!(gced.null_count(), 0, "Null count mismatch after gc");
1509        assert_ne!(
1510            gced.data_buffers().len(),
1511            1,
1512            "gc with huge buffer should not consolidate data into a single buffer"
1513        );
1514
1515        // Element-wise equality check across the entire array
1516        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1517            assert_eq!(orig, got, "Value mismatch after gc on huge array");
1518        });
1519    }
1520
1521    #[test]
1522    fn test_eq() {
1523        let test_data = [
1524            Some("longer than 12 bytes"),
1525            None,
1526            Some("short"),
1527            Some("again, this is longer than 12 bytes"),
1528        ];
1529
1530        let array1 = {
1531            let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
1532            test_data.into_iter().for_each(|v| builder.append_option(v));
1533            builder.finish()
1534        };
1535        let array2 = {
1536            // create a new array with the same data but different layout
1537            let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
1538            test_data.into_iter().for_each(|v| builder.append_option(v));
1539            builder.finish()
1540        };
1541        assert_eq!(array1, array1.clone());
1542        assert_eq!(array2, array2.clone());
1543        assert_eq!(array1, array2);
1544    }
1545
1546    /// Integration tests for `inline_key_fast` covering:
1547    ///
1548    /// 1. Monotonic ordering across increasing lengths and lexical variations.
1549    /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence.
1550    ///
1551    /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why
1552    /// the length field is required even when all inline bytes fit in 12 bytes.
1553    ///
1554    /// The test includes strings that verify correct byte order (prevent reversal bugs),
1555    /// and length-based tie-breaking in the composite key.
1556    ///
1557    /// The test confirms that `inline_key_fast` produces keys which sort consistently
1558    /// with the expected lexicographical order of the raw byte arrays.
1559    #[test]
1560    fn test_inline_key_fast_various_lengths_and_lexical() {
1561        /// Helper to create a raw u128 value representing an inline ByteView:
1562        /// - `length`: number of meaningful bytes (must be ≤ 12)
1563        /// - `data`: the actual inline data bytes
1564        ///
1565        /// The first 4 bytes encode length in little-endian,
1566        /// the following 12 bytes contain the inline string data (unpadded).
1567        fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
1568            assert!(length as usize <= 12, "Inline length must be ≤ 12");
1569            assert!(
1570                data.len() == length as usize,
1571                "Data length must match `length`"
1572            );
1573
1574            let mut raw_bytes = [0u8; 16];
1575            raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian
1576            raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data
1577            u128::from_le_bytes(raw_bytes)
1578        }
1579
1580        // Test inputs: various lengths and lexical orders,
1581        // plus special cases for byte order and length tie-breaking
1582        let test_inputs: Vec<&[u8]> = vec![
1583            b"a",
1584            b"aa",
1585            b"aaa",
1586            b"aab",
1587            b"abcd",
1588            b"abcde",
1589            b"abcdef",
1590            b"abcdefg",
1591            b"abcdefgh",
1592            b"abcdefghi",
1593            b"abcdefghij",
1594            b"abcdefghijk",
1595            b"abcdefghijkl",
1596            // Tests for byte-order reversal bug:
1597            // Without the fix, "backend one" would compare as "eno dnekcab",
1598            // causing incorrect sort order relative to "backend two".
1599            b"backend one",
1600            b"backend two",
1601            // Tests length-tiebreaker logic:
1602            // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data,
1603            // so only the length differentiates their ordering.
1604            b"bar",
1605            b"bar\0",
1606            // Additional lexical and length tie-breaking cases with same prefix, in correct lex order:
1607            b"than12Byt",
1608            b"than12Bytes",
1609            b"than12Bytes\0",
1610            b"than12Bytesx",
1611            b"than12Bytex",
1612            b"than12Bytez",
1613            // Additional lexical tests
1614            b"xyy",
1615            b"xyz",
1616            b"xza",
1617        ];
1618
1619        // Create a GenericBinaryArray for cross-comparison of lex order
1620        let array: GenericBinaryArray<i32> =
1621            GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>());
1622
1623        for i in 0..array.len() - 1 {
1624            let v1 = array.value(i);
1625            let v2 = array.value(i + 1);
1626
1627            // Assert the array's natural lexical ordering is correct
1628            assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");
1629
1630            // Assert the keys produced by inline_key_fast reflect the same ordering
1631            let key1 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1632                v1.len() as u32,
1633                v1,
1634            ));
1635            let key2 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1636                v2.len() as u32,
1637                v2,
1638            ));
1639
1640            assert!(
1641                key1 < key2,
1642                "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
1643            );
1644        }
1645    }
1646
1647    #[test]
1648    fn empty_array_should_return_empty_lengths_iterator() {
1649        let empty = GenericByteViewArray::<BinaryViewType>::from(Vec::<&[u8]>::new());
1650
1651        let mut lengths_iter = empty.lengths();
1652        assert_eq!(lengths_iter.len(), 0);
1653        assert_eq!(lengths_iter.next(), None);
1654    }
1655
1656    #[test]
1657    fn array_lengths_should_return_correct_length_for_both_inlined_and_non_inlined() {
1658        let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1659            // Not inlined as longer than 12 bytes
1660            b"Supercalifragilisticexpialidocious" as &[u8],
1661            // Inlined as shorter than 12 bytes
1662            b"Hello",
1663            // Empty value
1664            b"",
1665            // Exactly 12 bytes
1666            b"abcdefghijkl",
1667        ]);
1668
1669        let mut lengths_iter = cases.lengths();
1670
1671        assert_eq!(lengths_iter.len(), cases.len());
1672
1673        let cases_iter = cases.iter();
1674
1675        for case in cases_iter {
1676            let case_value = case.unwrap();
1677            let length = lengths_iter.next().expect("Should have a length");
1678
1679            assert_eq!(case_value.len(), length as usize);
1680        }
1681
1682        assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1683    }
1684
1685    #[test]
1686    fn array_lengths_should_return_the_underlying_length_for_null_values() {
1687        let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1688            // Not inlined as longer than 12 bytes
1689            b"Supercalifragilisticexpialidocious" as &[u8],
1690            // Inlined as shorter than 12 bytes
1691            b"Hello",
1692            // Empty value
1693            b"",
1694            // Exactly 12 bytes
1695            b"abcdefghijkl",
1696        ]);
1697
1698        let (views, buffer, _) = cases.clone().into_parts();
1699
1700        // Keeping the values but just adding nulls on top
1701        let cases_with_all_nulls = GenericByteViewArray::<BinaryViewType>::new(
1702            views,
1703            buffer,
1704            Some(NullBuffer::new_null(cases.len())),
1705        );
1706
1707        let lengths_iter = cases.lengths();
1708        let mut all_nulls_lengths_iter = cases_with_all_nulls.lengths();
1709
1710        assert_eq!(lengths_iter.len(), all_nulls_lengths_iter.len());
1711
1712        for expected_length in lengths_iter {
1713            let actual_length = all_nulls_lengths_iter.next().expect("Should have a length");
1714
1715            assert_eq!(expected_length, actual_length);
1716        }
1717
1718        assert_eq!(
1719            all_nulls_lengths_iter.next(),
1720            None,
1721            "Should not have more lengths"
1722        );
1723    }
1724
1725    #[test]
1726    fn array_lengths_on_sliced_should_only_return_lengths_for_sliced_data() {
1727        let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1728            b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1729            b"Hello",
1730            b"something great",
1731            b"is",
1732            b"coming soon!",
1733            b"when you find what it is",
1734            b"let me know",
1735            b"cause",
1736            b"I",
1737            b"have no idea",
1738            b"what it",
1739            b"is",
1740        ]);
1741
1742        let sliced_array = array.slice(2, array.len() - 3);
1743
1744        let mut lengths_iter = sliced_array.lengths();
1745
1746        assert_eq!(lengths_iter.len(), sliced_array.len());
1747
1748        let values_iter = sliced_array.iter();
1749
1750        for value in values_iter {
1751            let value = value.unwrap();
1752            let length = lengths_iter.next().expect("Should have a length");
1753
1754            assert_eq!(value.len(), length as usize);
1755        }
1756
1757        assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1758    }
1759
1760    #[should_panic(expected = "Mismatched data type, expected Utf8View, got BinaryView")]
1761    #[test]
1762    fn invalid_casting_from_array_data() {
1763        // Should not be able to cast to StringViewArray due to invalid UTF-8
1764        let array_data = binary_view_array_with_invalid_utf8_data().into_data();
1765        let _ = StringViewArray::from(array_data);
1766    }
1767
1768    #[should_panic(expected = "invalid utf-8 sequence")]
1769    #[test]
1770    fn invalid_array_data() {
1771        let (views, buffers, nulls) = binary_view_array_with_invalid_utf8_data().into_parts();
1772
1773        // manually try and add invalid array data with Utf8View data type
1774        let mut builder = ArrayDataBuilder::new(DataType::Utf8View)
1775            .add_buffer(views.into_inner())
1776            .len(3);
1777        for buffer in buffers.iter() {
1778            builder = builder.add_buffer(buffer.clone())
1779        }
1780        builder = builder.nulls(nulls);
1781
1782        let data = builder.build().unwrap(); // should fail validation
1783        let _arr = StringViewArray::from(data);
1784    }
1785
1786    /// Returns a BinaryViewArray with one invalid UTF-8 value
1787    fn binary_view_array_with_invalid_utf8_data() -> BinaryViewArray {
1788        let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1789            b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1790            &[
1791                0xf0, 0x80, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1792                0x00, 0x00,
1793            ],
1794            b"good",
1795        ]);
1796        assert!(from_utf8(array.value(0)).is_ok());
1797        assert!(from_utf8(array.value(1)).is_err()); // value 1 is invalid utf8
1798        assert!(from_utf8(array.value(2)).is_ok());
1799        array
1800    }
1801}