Skip to main content

arrow_array/array/
byte_view_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
26use arrow_schema::{ArrowError, DataType};
27use core::str;
28use num_traits::ToPrimitive;
29use std::any::Any;
30use std::cmp::Ordering;
31use std::fmt::Debug;
32use std::marker::PhantomData;
33use std::sync::Arc;
34
35use super::ByteArrayType;
36
37/// [Variable-size Binary View Layout]: An array of variable length bytes views.
38///
39/// This array type is used to store variable length byte data (e.g. Strings, Binary)
40/// and has efficient operations such as `take`, `filter`, and comparison.
41///
42/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
43///
44/// This is different from [`GenericByteArray`], which also stores variable
45/// length byte data, as it represents strings with an offset and length. `take`
46/// and `filter` like operations are implemented by manipulating the "views"
47/// (`u128`) without modifying the bytes. Each view also stores an inlined
48/// prefix which speed up comparisons.
49///
50/// # See Also
51///
52/// * [`StringViewArray`] for storing utf8 encoded string data
53/// * [`BinaryViewArray`] for storing bytes
54/// * [`ByteView`] to interpret `u128`s layout of the views.
55///
56/// [`ByteView`]: arrow_data::ByteView
57///
58/// # Layout: "views" and buffers
59///
60/// A `GenericByteViewArray` stores variable length byte strings. An array of
61/// `N` elements is stored as `N` fixed length "views" and a variable number
62/// of variable length "buffers".
63///
64/// Each view is a `u128` value whose layout is different depending on the
65/// length of the string stored at that location:
66///
67/// ```text
68///                         ┌──────┬────────────────────────┐
69///                         │length│      string value      │
70///    Strings (len <= 12)  │      │    (padded with 0)     │
71///                         └──────┴────────────────────────┘
72///                          0    31                      127
73///
74///                         ┌───────┬───────┬───────┬───────┐
75///                         │length │prefix │  buf  │offset │
76///    Strings (len > 12)   │       │       │ index │       │
77///                         └───────┴───────┴───────┴───────┘
78///                          0    31       63      95    127
79/// ```
80///
81/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
82///   the view. See [`Self::inline_value`] to access the inlined prefix from a
83///   short view.
84///
85/// * Strings with length > 12: The first four bytes are stored inline in the
86///   view and the entire string is stored in one of the buffers. See [`ByteView`]
87///   to access the fields of the these views.
88///
89/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
90/// the easiest and fastest way to work with this data. However, it is possible
91/// to access the views and buffers directly for more control.
92///
93/// For example
94///
95/// ```rust
96/// # use arrow_array::StringViewArray;
97/// # use arrow_array::Array;
98/// use arrow_data::ByteView;
99/// let array = StringViewArray::from(vec![
100///   "hello",
101///   "this string is longer than 12 bytes",
102///   "this string is also longer than 12 bytes"
103/// ]);
104///
105/// // ** Examine the first view (short string) **
106/// assert!(array.is_valid(0)); // Check for nulls
107/// let short_view: u128 = array.views()[0]; // "hello"
108/// // get length of the string
109/// let len = short_view as u32;
110/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view
111/// // SAFETY: `view` is a valid view
112/// let value = unsafe {
113///   StringViewArray::inline_value(&short_view, len as usize)
114/// };
115/// assert_eq!(value, b"hello");
116///
117/// // ** Examine the third view (long string) **
118/// assert!(array.is_valid(12)); // Check for nulls
119/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
120/// let len = long_view as u32;
121/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
122/// let view = ByteView::from(long_view); // use ByteView to access the fields
123/// assert_eq!(view.length, 40);
124/// assert_eq!(view.buffer_index, 0);
125/// assert_eq!(view.offset, 35); // data starts after the first long string
126/// // Views for long strings store a 4 byte prefix
127/// let prefix = view.prefix.to_le_bytes();
128/// assert_eq!(&prefix, b"this");
129/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
130/// assert_eq!(value, "this string is also longer than 12 bytes");
131/// ```
132///
133/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
134/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
135///
136/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
137/// than they must point into a valid buffer. However, they can be out of order,
138/// non continuous and overlapping.
139///
140/// For example, in the following diagram, the strings "FishWasInTownToday" and
141/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
142/// separate buffer while the string "LavaMonster" is stored inlined in the
143/// view. In this case, the same bytes for "Fish" are used to store both strings.
144///
145/// [`ByteView`]: arrow_data::ByteView
146///
147/// ```text
148///                                                                            ┌───┐
149///                         ┌──────┬──────┬──────┬──────┐               offset │...│
150/// "FishWasInTownTodayYay" │  21  │ Fish │  0   │ 115  │─ ─              103  │Mr.│
151///                         └──────┴──────┴──────┴──────┘   │      ┌ ─ ─ ─ ─ ▶ │Cru│
152///                         ┌──────┬──────┬──────┬──────┐                      │mpl│
153/// "CrumpleFacedFish"      │  16  │ Crum │  0   │ 103  │─ ─│─ ─ ─ ┘           │eFa│
154///                         └──────┴──────┴──────┴──────┘                      │ced│
155///                         ┌──────┬────────────────────┐   └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
156/// "LavaMonster"           │  11  │   LavaMonster      │                      │hWa│
157///                         └──────┴────────────────────┘               offset │sIn│
158///                                                                       115  │Tow│
159///                                                                            │nTo│
160///                                                                            │day│
161///                                  u128 "views"                              │Yay│
162///                                                                   buffer 0 │...│
163///                                                                            └───┘
164/// ```
165pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
166    data_type: DataType,
167    views: ScalarBuffer<u128>,
168    buffers: Arc<[Buffer]>,
169    phantom: PhantomData<T>,
170    nulls: Option<NullBuffer>,
171}
172
173impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
174    fn clone(&self) -> Self {
175        Self {
176            data_type: T::DATA_TYPE,
177            views: self.views.clone(),
178            buffers: self.buffers.clone(),
179            nulls: self.nulls.clone(),
180            phantom: Default::default(),
181        }
182    }
183}
184
185impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
186    /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
187    ///
188    /// # Panics
189    ///
190    /// Panics if [`GenericByteViewArray::try_new`] returns an error
191    pub fn new<U>(views: ScalarBuffer<u128>, buffers: U, nulls: Option<NullBuffer>) -> Self
192    where
193        U: Into<Arc<[Buffer]>>,
194    {
195        Self::try_new(views, buffers, nulls).unwrap()
196    }
197
198    /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
199    ///
200    /// # Errors
201    ///
202    /// * `views.len() != nulls.len()`
203    /// * [ByteViewType::validate] fails
204    pub fn try_new<U>(
205        views: ScalarBuffer<u128>,
206        buffers: U,
207        nulls: Option<NullBuffer>,
208    ) -> Result<Self, ArrowError>
209    where
210        U: Into<Arc<[Buffer]>>,
211    {
212        let buffers: Arc<[Buffer]> = buffers.into();
213
214        T::validate(&views, &buffers)?;
215
216        if let Some(n) = nulls.as_ref() {
217            if n.len() != views.len() {
218                return Err(ArrowError::InvalidArgumentError(format!(
219                    "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
220                    T::PREFIX,
221                    views.len(),
222                    n.len(),
223                )));
224            }
225        }
226
227        Ok(Self {
228            data_type: T::DATA_TYPE,
229            views,
230            buffers,
231            nulls,
232            phantom: Default::default(),
233        })
234    }
235
236    /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
237    ///
238    /// # Safety
239    ///
240    /// Safe if [`Self::try_new`] would not error
241    pub unsafe fn new_unchecked<U>(
242        views: ScalarBuffer<u128>,
243        buffers: U,
244        nulls: Option<NullBuffer>,
245    ) -> Self
246    where
247        U: Into<Arc<[Buffer]>>,
248    {
249        if cfg!(feature = "force_validate") {
250            return Self::new(views, buffers, nulls);
251        }
252
253        Self {
254            data_type: T::DATA_TYPE,
255            phantom: Default::default(),
256            views,
257            buffers: buffers.into(),
258            nulls,
259        }
260    }
261
262    /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
263    pub fn new_null(len: usize) -> Self {
264        Self {
265            data_type: T::DATA_TYPE,
266            views: vec![0; len].into(),
267            buffers: vec![].into(),
268            nulls: Some(NullBuffer::new_null(len)),
269            phantom: Default::default(),
270        }
271    }
272
273    /// Create a new [`Scalar`] from `value`
274    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
275        Scalar::new(Self::from_iter_values(std::iter::once(value)))
276    }
277
278    /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
279    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
280    where
281        Ptr: AsRef<T::Native>,
282        I: IntoIterator<Item = Ptr>,
283    {
284        let iter = iter.into_iter();
285        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
286        for v in iter {
287            builder.append_value(v);
288        }
289        builder.finish()
290    }
291
292    /// Deconstruct this array into its constituent parts
293    pub fn into_parts(self) -> (ScalarBuffer<u128>, Arc<[Buffer]>, Option<NullBuffer>) {
294        (self.views, self.buffers, self.nulls)
295    }
296
297    /// Returns the views buffer
298    #[inline]
299    pub fn views(&self) -> &ScalarBuffer<u128> {
300        &self.views
301    }
302
303    /// Returns the buffers storing string data
304    #[inline]
305    pub fn data_buffers(&self) -> &[Buffer] {
306        &self.buffers
307    }
308
309    /// Returns the element at index `i`
310    ///
311    /// Note: This method does not check for nulls and the value is arbitrary
312    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
313    ///
314    /// # Panics
315    /// Panics if index `i` is out of bounds.
316    pub fn value(&self, i: usize) -> &T::Native {
317        assert!(
318            i < self.len(),
319            "Trying to access an element at index {} from a {}ViewArray of length {}",
320            i,
321            T::PREFIX,
322            self.len()
323        );
324
325        unsafe { self.value_unchecked(i) }
326    }
327
328    /// Returns the element at index `i` without bounds checking
329    ///
330    /// Note: This method does not check for nulls and the value is arbitrary
331    /// if [`is_null`](Self::is_null) returns true for the index.
332    ///
333    /// # Safety
334    ///
335    /// Caller is responsible for ensuring that the index is within the bounds
336    /// of the array
337    pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
338        let v = unsafe { self.views.get_unchecked(idx) };
339        let len = *v as u32;
340        let b = if len <= MAX_INLINE_VIEW_LEN {
341            unsafe { Self::inline_value(v, len as usize) }
342        } else {
343            let view = ByteView::from(*v);
344            let data = unsafe { self.buffers.get_unchecked(view.buffer_index as usize) };
345            let offset = view.offset as usize;
346            unsafe { data.get_unchecked(offset..offset + len as usize) }
347        };
348        unsafe { T::Native::from_bytes_unchecked(b) }
349    }
350
351    /// Returns the first `len` bytes the inline value of the view.
352    ///
353    /// # Safety
354    /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
355    /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
356    #[inline(always)]
357    pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
358        debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
359        unsafe {
360            std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
361        }
362    }
363
364    /// Constructs a new iterator for iterating over the values of this array
365    pub fn iter(&self) -> ArrayIter<&Self> {
366        ArrayIter::new(self)
367    }
368
369    /// Returns an iterator over the bytes of this array, including null values
370    pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
371        self.views.iter().map(move |v| {
372            let len = *v as u32;
373            if len <= MAX_INLINE_VIEW_LEN {
374                unsafe { Self::inline_value(v, len as usize) }
375            } else {
376                let view = ByteView::from(*v);
377                let data = &self.buffers[view.buffer_index as usize];
378                let offset = view.offset as usize;
379                unsafe { data.get_unchecked(offset..offset + len as usize) }
380            }
381        })
382    }
383
384    /// Returns an iterator over the first `prefix_len` bytes of each array
385    /// element, including null values.
386    ///
387    /// If `prefix_len` is larger than the element's length, the iterator will
388    /// return an empty slice (`&[]`).
389    pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
390        self.views().into_iter().map(move |v| {
391            let len = (*v as u32) as usize;
392
393            if len < prefix_len {
394                return &[] as &[u8];
395            }
396
397            if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
398                unsafe { StringViewArray::inline_value(v, prefix_len) }
399            } else {
400                let view = ByteView::from(*v);
401                let data = unsafe {
402                    self.data_buffers()
403                        .get_unchecked(view.buffer_index as usize)
404                };
405                let offset = view.offset as usize;
406                unsafe { data.get_unchecked(offset..offset + prefix_len) }
407            }
408        })
409    }
410
411    /// Returns an iterator over the last `suffix_len` bytes of each array
412    /// element, including null values.
413    ///
414    /// Note that for [`StringViewArray`] the last bytes may start in the middle
415    /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
416    ///
417    /// If `suffix_len` is larger than the element's length, the iterator will
418    /// return an empty slice (`&[]`).
419    pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
420        self.views().into_iter().map(move |v| {
421            let len = (*v as u32) as usize;
422
423            if len < suffix_len {
424                return &[] as &[u8];
425            }
426
427            if len as u32 <= MAX_INLINE_VIEW_LEN {
428                unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
429            } else {
430                let view = ByteView::from(*v);
431                let data = unsafe {
432                    self.data_buffers()
433                        .get_unchecked(view.buffer_index as usize)
434                };
435                let offset = view.offset as usize;
436                unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
437            }
438        })
439    }
440
441    /// Return an iterator over the length of each array element, including null values.
442    ///
443    /// Null values length would equal to the underlying bytes length and NOT 0
444    ///
445    /// Example of getting 0 for null values
446    /// ```rust
447    /// # use arrow_array::StringViewArray;
448    /// # use arrow_array::Array;
449    /// use arrow_data::ByteView;
450    ///
451    /// fn lengths_with_zero_for_nulls(view: &StringViewArray) -> impl Iterator<Item = u32> {
452    ///     view.lengths()
453    ///         .enumerate()
454    ///         .map(|(index, length)| if view.is_null(index) { 0 } else { length })
455    /// }
456    /// ```
457    pub fn lengths(&self) -> impl ExactSizeIterator<Item = u32> + Clone {
458        self.views().iter().map(|v| *v as u32)
459    }
460
461    /// Returns a zero-copy slice of this array with the indicated offset and length.
462    pub fn slice(&self, offset: usize, length: usize) -> Self {
463        Self {
464            data_type: T::DATA_TYPE,
465            views: self.views.slice(offset, length),
466            buffers: self.buffers.clone(),
467            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
468            phantom: Default::default(),
469        }
470    }
471
472    /// Returns a "compacted" version of this array
473    ///
474    /// The original array will *not* be modified
475    ///
476    /// # Garbage Collection
477    ///
478    /// Before GC:
479    /// ```text
480    ///                                        ┌──────┐
481    ///                                        │......│
482    ///                                        │......│
483    /// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer
484    /// │       View 1       │─ ─ ─ ─          │......│  with data that
485    /// ├────────────────────┤                 │......│ is not referred
486    /// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
487    /// └────────────────────┘                 │......│      View 2
488    ///                                        │......│
489    ///    2 views, refer to                   │......│
490    ///   small portions of a                  └──────┘
491    ///      large buffer
492    /// ```
493    ///
494    /// After GC:
495    ///
496    /// ```text
497    /// ┌────────────────────┐                 ┌─────┐    After gc, only
498    /// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is
499    /// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by
500    /// │       View 2       │─ ─ ─ ─          └─────┘     the views is
501    /// └────────────────────┘                                 left
502    ///
503    ///
504    ///         2 views
505    /// ```
506    /// This method will compact the data buffers by recreating the view array and only include the data
507    /// that is pointed to by the views.
508    ///
509    /// Note that it will copy the array regardless of whether the original array is compact.
510    /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
511    /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
512    ///
513    /// Note: this function does not attempt to canonicalize / deduplicate values. For this
514    /// feature see  [`GenericByteViewBuilder::with_deduplicate_strings`].
515    pub fn gc(&self) -> Self {
516        // 1) Read basic properties once
517        let len = self.len(); // number of elements
518        let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap
519
520        // 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks
521        if self.data_buffers().is_empty() {
522            return unsafe {
523                GenericByteViewArray::new_unchecked(
524                    self.views().clone(),
525                    vec![], // empty data blocks
526                    nulls,
527                )
528            };
529        }
530
531        // 2) Calculate total size of all non-inline data and detect if any exists
532        let total_large = self.total_buffer_bytes_used();
533
534        // 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing
535        if total_large == 0 {
536            // Views are inline-only or all null; just reuse original views and no data blocks
537            return unsafe {
538                GenericByteViewArray::new_unchecked(
539                    self.views().clone(),
540                    vec![], // empty data blocks
541                    nulls,
542                )
543            };
544        }
545
546        let (views_buf, data_blocks) = if total_large < i32::MAX as usize {
547            // fast path, the entire data fits in a single buffer
548            // 3) Allocate exactly capacity for all non-inline data
549            let mut data_buf = Vec::with_capacity(total_large);
550
551            // 4) Iterate over views and process each inline/non-inline view
552            let views_buf: Vec<u128> = (0..len)
553                .map(|i| unsafe { self.copy_view_to_buffer(i, 0, &mut data_buf) })
554                .collect();
555            let data_block = Buffer::from_vec(data_buf);
556            let data_blocks = vec![data_block];
557            (views_buf, data_blocks)
558        } else {
559            // slow path, need to split into multiple buffers
560
561            struct GcCopyGroup {
562                total_buffer_bytes: usize,
563                total_len: usize,
564            }
565
566            impl GcCopyGroup {
567                fn new(total_buffer_bytes: u32, total_len: usize) -> Self {
568                    Self {
569                        total_buffer_bytes: total_buffer_bytes as usize,
570                        total_len,
571                    }
572                }
573            }
574
575            let mut groups = Vec::new();
576            let mut current_length = 0;
577            let mut current_elements = 0;
578
579            for view in self.views() {
580                let len = *view as u32;
581                if len > MAX_INLINE_VIEW_LEN {
582                    if current_length + len > i32::MAX as u32 {
583                        // Start a new group
584                        groups.push(GcCopyGroup::new(current_length, current_elements));
585                        current_length = 0;
586                        current_elements = 0;
587                    }
588                    current_length += len;
589                    current_elements += 1;
590                }
591            }
592            if current_elements != 0 {
593                groups.push(GcCopyGroup::new(current_length, current_elements));
594            }
595            debug_assert!(groups.len() <= i32::MAX as usize);
596
597            // 3) Copy the buffers group by group
598            let mut views_buf = Vec::with_capacity(len);
599            let mut data_blocks = Vec::with_capacity(groups.len());
600
601            let mut current_view_idx = 0;
602
603            for (group_idx, gc_copy_group) in groups.iter().enumerate() {
604                let mut data_buf = Vec::with_capacity(gc_copy_group.total_buffer_bytes);
605
606                // Directly push views to avoid intermediate Vec allocation
607                let new_views = (current_view_idx..current_view_idx + gc_copy_group.total_len).map(
608                    |view_idx| {
609                        // safety: the view index came from iterating over valid range
610                        unsafe {
611                            self.copy_view_to_buffer(view_idx, group_idx as i32, &mut data_buf)
612                        }
613                    },
614                );
615                views_buf.extend(new_views);
616
617                data_blocks.push(Buffer::from_vec(data_buf));
618                current_view_idx += gc_copy_group.total_len;
619            }
620            (views_buf, data_blocks)
621        };
622
623        // 5) Wrap up views buffer
624        let views_scalar = ScalarBuffer::from(views_buf);
625
626        // SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized
627        unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) }
628    }
629
630    /// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer.
631    ///
632    /// # Safety
633    ///
634    /// - `i < self.len()`.
635    /// - Every element in `self.views()` must currently refer to a valid slice
636    ///   inside one of `self.buffers`.
637    /// - `data_buf` must be ready to have additional bytes appended.
638    /// - After this call, the returned view will have its
639    ///   `buffer_index` reset to `buffer_idx` and its `offset` updated so that it points
640    ///   into the bytes just appended at the end of `data_buf`.
641    #[inline(always)]
642    unsafe fn copy_view_to_buffer(
643        &self,
644        i: usize,
645        buffer_idx: i32,
646        data_buf: &mut Vec<u8>,
647    ) -> u128 {
648        // SAFETY: `i < self.len()` ensures this is in‑bounds.
649        let raw_view = unsafe { *self.views().get_unchecked(i) };
650        let mut bv = ByteView::from(raw_view);
651
652        // Inline‑small views stay as‑is.
653        if bv.length <= MAX_INLINE_VIEW_LEN {
654            raw_view
655        } else {
656            // SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length`
657            // must both lie within valid ranges for `self.buffers`.
658            let buffer = unsafe { self.buffers.get_unchecked(bv.buffer_index as usize) };
659            let start = bv.offset as usize;
660            let end = start + bv.length as usize;
661            let slice = unsafe { buffer.get_unchecked(start..end) };
662
663            // Copy out‑of‑line data into our single “0” buffer.
664            let new_offset = data_buf.len() as u32;
665            data_buf.extend_from_slice(slice);
666
667            bv.buffer_index = buffer_idx as u32;
668            bv.offset = new_offset;
669            bv.into()
670        }
671    }
672
673    /// Returns the total number of bytes used by all non inlined views in all
674    /// buffers.
675    ///
676    /// Note this does not account for views that point at the same underlying
677    /// data in buffers
678    ///
679    /// For example, if the array has three strings views:
680    /// * View with length = 9 (inlined)
681    /// * View with length = 32 (non inlined)
682    /// * View with length = 16 (non inlined)
683    ///
684    /// Then this method would report 48
685    pub fn total_buffer_bytes_used(&self) -> usize {
686        self.views()
687            .iter()
688            .map(|v| {
689                let len = *v as u32;
690                if len > MAX_INLINE_VIEW_LEN {
691                    len as usize
692                } else {
693                    0
694                }
695            })
696            .sum()
697    }
698
699    /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
700    ///
701    /// Comparing two ByteView types are non-trivial.
702    /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
703    ///
704    /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
705    /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
706    ///     Meaning that reading one array element requires only one memory access
707    ///     (two memory access required for StringArray, one for offset buffer, the other for value buffer).
708    ///
709    /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
710    ///     thanks to the inlined 4 bytes.
711    ///     Consider equality check:
712    ///     If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
713    ///
714    /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
715    /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
716    ///   e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
717    ///
718    /// # Order check flow
719    /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
720    /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
721    ///     (2.1) if the inlined 4 bytes are different, we can return the result immediately.
722    ///     (2.2) o.w., we need to compare the full string.
723    ///
724    /// # Safety
725    /// The left/right_idx must within range of each array
726    pub unsafe fn compare_unchecked(
727        left: &GenericByteViewArray<T>,
728        left_idx: usize,
729        right: &GenericByteViewArray<T>,
730        right_idx: usize,
731    ) -> Ordering {
732        let l_view = unsafe { left.views().get_unchecked(left_idx) };
733        let l_byte_view = ByteView::from(*l_view);
734
735        let r_view = unsafe { right.views().get_unchecked(right_idx) };
736        let r_byte_view = ByteView::from(*r_view);
737
738        let l_len = l_byte_view.length;
739        let r_len = r_byte_view.length;
740
741        if l_len <= 12 && r_len <= 12 {
742            return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view));
743        }
744
745        // one of the string is larger than 12 bytes,
746        // we then try to compare the inlined data first
747
748        // Note: In theory, ByteView is only used for string which is larger than 12 bytes,
749        // but we can still use it to get the inlined prefix for shorter strings.
750        // The prefix is always the first 4 bytes of the view, for both short and long strings.
751        let l_inlined_be = l_byte_view.prefix.swap_bytes();
752        let r_inlined_be = r_byte_view.prefix.swap_bytes();
753        if l_inlined_be != r_inlined_be {
754            return l_inlined_be.cmp(&r_inlined_be);
755        }
756
757        // unfortunately, we need to compare the full data
758        let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
759        let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
760
761        l_full_data.cmp(r_full_data)
762    }
763
764    /// Builds a 128-bit composite key for an inline value:
765    ///
766    /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting).
767    /// - Low  32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings
768    ///   (or those with fewer meaningful bytes) always numerically sort before longer ones.
769    ///
770    /// This function extracts the length and the 12-byte inline string data from the raw
771    /// little-endian `u128` representation, converts them to big-endian ordering, and packs them
772    /// into a single `u128` value suitable for fast, branchless comparisons.
773    ///
774    /// # Why include length?
775    ///
776    /// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes
777    /// compare equal—either because one is a true prefix of the other or because zero-padding
778    /// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare
779    /// handles both content and length in one go.
780    ///
781    /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes)
782    ///
783    /// | String     | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding)    |
784    /// |------------|-----------------------|---------------------------------|
785    /// | `"bar"`   | `03 00 00 00`         | `62 61 72` + 9 × `00`           |
786    /// | `"bar\0"`| `04 00 00 00`         | `62 61 72 00` + 8 × `00`        |
787    ///
788    /// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field
789    /// then differentiates:
790    ///
791    /// ```text
792    /// key("bar")   = 0x0000000000000000000062617200000003
793    /// key("bar\0") = 0x0000000000000000000062617200000004
794    /// ⇒ key("bar") < key("bar\0")
795    /// ```
796    /// - `raw` is treated as a 128-bit integer with its bits laid out as follows:
797    ///   - bits 0–31: length (little-endian)
798    ///   - bits 32–127: data (little-endian)
799    ///
800    /// # Inlining and Endianness
801    ///
802    /// This function uses platform-independent bitwise operations to construct a 128-bit key:
803    /// - `raw.swap_bytes() << 32` effectively clears the length bits and shifts the 12-byte inline data
804    ///   into the high 96 bits in Big-Endian order. This ensures the first byte of the string
805    ///   is the most significant byte of the resulting `u128`.
806    /// - `raw as u32` extracts the length as a numeric integer, which is then placed in the low 32 bits.
807    #[inline(always)]
808    pub fn inline_key_fast(raw: u128) -> u128 {
809        (raw.swap_bytes() << 32) | (raw as u32 as u128)
810    }
811}
812
813impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
814    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
815        write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
816        print_long_array(self, f, |array, index, f| {
817            std::fmt::Debug::fmt(&array.value(index), f)
818        })?;
819        write!(f, "]")
820    }
821}
822
823/// SAFETY: Correctly implements the contract of Arrow Arrays
824unsafe impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
825    fn as_any(&self) -> &dyn Any {
826        self
827    }
828
829    fn to_data(&self) -> ArrayData {
830        self.clone().into()
831    }
832
833    fn into_data(self) -> ArrayData {
834        self.into()
835    }
836
837    fn data_type(&self) -> &DataType {
838        &self.data_type
839    }
840
841    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
842        Arc::new(self.slice(offset, length))
843    }
844
845    fn len(&self) -> usize {
846        self.views.len()
847    }
848
849    fn is_empty(&self) -> bool {
850        self.views.is_empty()
851    }
852
853    fn shrink_to_fit(&mut self) {
854        self.views.shrink_to_fit();
855
856        // The goal of `shrink_to_fit` is to minimize the space used by any of
857        // its allocations. The use of `Arc::get_mut` over `Arc::make_mut` is
858        // because if the reference count is greater than 1, `Arc::make_mut`
859        // will first clone its contents. So, any large allocations will first
860        // be cloned before being shrunk, leaving the pre-cloned allocations
861        // intact, before adding the extra (used) space of the new clones.
862        if let Some(buffers) = Arc::get_mut(&mut self.buffers) {
863            buffers.iter_mut().for_each(|b| b.shrink_to_fit());
864        }
865
866        // With the assumption that this is a best-effort function, no attempt
867        // is made to shrink `self.buffers`, which it can't because it's type
868        // does not expose a `shrink_to_fit` method.
869
870        if let Some(nulls) = &mut self.nulls {
871            nulls.shrink_to_fit();
872        }
873    }
874
875    fn offset(&self) -> usize {
876        0
877    }
878
879    fn nulls(&self) -> Option<&NullBuffer> {
880        self.nulls.as_ref()
881    }
882
883    fn logical_null_count(&self) -> usize {
884        // More efficient that the default implementation
885        self.null_count()
886    }
887
888    fn get_buffer_memory_size(&self) -> usize {
889        let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
890        sum += self.views.inner().capacity();
891        if let Some(x) = &self.nulls {
892            sum += x.buffer().capacity()
893        }
894        sum
895    }
896
897    fn get_array_memory_size(&self) -> usize {
898        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
899    }
900
901    #[cfg(feature = "pool")]
902    fn claim(&self, pool: &dyn arrow_buffer::MemoryPool) {
903        self.views.claim(pool);
904        for buffer in self.buffers.iter() {
905            buffer.claim(pool);
906        }
907        if let Some(nulls) = &self.nulls {
908            nulls.claim(pool);
909        }
910    }
911}
912
913impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
914    type Item = &'a T::Native;
915
916    fn value(&self, index: usize) -> Self::Item {
917        GenericByteViewArray::value(self, index)
918    }
919
920    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
921        unsafe { GenericByteViewArray::value_unchecked(self, index) }
922    }
923}
924
925impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
926    type Item = Option<&'a T::Native>;
927    type IntoIter = ArrayIter<Self>;
928
929    fn into_iter(self) -> Self::IntoIter {
930        ArrayIter::new(self)
931    }
932}
933
934impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
935    fn from(data: ArrayData) -> Self {
936        let (data_type, len, nulls, offset, buffers, _child_data) = data.into_parts();
937        assert_eq!(
938            data_type,
939            T::DATA_TYPE,
940            "Mismatched data type, expected {}, got {data_type}",
941            T::DATA_TYPE
942        );
943        let mut buffers = buffers.into_iter();
944        // first buffer is views, remaining are data buffers
945        let views = ScalarBuffer::new(buffers.next().unwrap(), offset, len);
946        Self {
947            data_type,
948            views,
949            buffers: Arc::from_iter(buffers),
950            nulls,
951            phantom: Default::default(),
952        }
953    }
954}
955
956/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
957///
958/// For example this method can convert a [`StringArray`] to a
959/// [`StringViewArray`].
960///
961/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
962/// is built without copying the underlying string data (views are created
963/// directly into the existing buffer)
964///
965/// [`StringArray`]: crate::StringArray
966impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
967where
968    FROM: ByteArrayType,
969    FROM::Offset: OffsetSizeTrait + ToPrimitive,
970    V: ByteViewType<Native = FROM::Native>,
971{
972    fn from(byte_array: &GenericByteArray<FROM>) -> Self {
973        let offsets = byte_array.offsets();
974
975        let can_reuse_buffer = match offsets.last() {
976            Some(offset) => offset.as_usize() < u32::MAX as usize,
977            None => true,
978        };
979
980        if can_reuse_buffer {
981            // build views directly pointing to the existing buffer
982            let len = byte_array.len();
983            let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
984            let str_values_buf = byte_array.values().clone();
985            let block = views_builder.append_block(str_values_buf);
986            for (i, w) in offsets.windows(2).enumerate() {
987                let offset = w[0].as_usize();
988                let end = w[1].as_usize();
989                let length = end - offset;
990
991                if byte_array.is_null(i) {
992                    views_builder.append_null();
993                } else {
994                    // Safety: the input was a valid array so it valid UTF8 (if string). And
995                    // all offsets were valid
996                    unsafe {
997                        views_builder.append_view_unchecked(block, offset as u32, length as u32)
998                    }
999                }
1000            }
1001            assert_eq!(views_builder.len(), len);
1002            views_builder.finish()
1003        } else {
1004            // Otherwise, create a new buffer for large strings
1005            // TODO: the original buffer could still be used
1006            // by making multiple slices of u32::MAX length
1007            GenericByteViewArray::<V>::from_iter(byte_array.iter())
1008        }
1009    }
1010}
1011
1012impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
1013    fn from(array: GenericByteViewArray<T>) -> Self {
1014        let len = array.len();
1015
1016        let mut buffers = array.buffers.to_vec();
1017        buffers.insert(0, array.views.into_inner());
1018
1019        let builder = ArrayDataBuilder::new(T::DATA_TYPE)
1020            .len(len)
1021            .buffers(buffers)
1022            .nulls(array.nulls);
1023
1024        unsafe { builder.build_unchecked() }
1025    }
1026}
1027
1028impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
1029where
1030    Ptr: AsRef<T::Native> + 'a,
1031    T: ByteViewType + ?Sized,
1032{
1033    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
1034        iter.into_iter()
1035            .map(|o| o.as_ref().map(|p| p.as_ref()))
1036            .collect()
1037    }
1038}
1039
1040impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
1041where
1042    Ptr: AsRef<T::Native>,
1043{
1044    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
1045        let iter = iter.into_iter();
1046        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
1047        builder.extend(iter);
1048        builder.finish()
1049    }
1050}
1051
1052/// A [`GenericByteViewArray`] of `[u8]`
1053///
1054/// See [`GenericByteViewArray`] for format and layout details.
1055///
1056/// # Example
1057/// ```
1058/// use arrow_array::BinaryViewArray;
1059/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
1060/// assert_eq!(array.value(0), b"hello");
1061/// assert_eq!(array.value(3), b"large payload over 12 bytes");
1062/// ```
1063pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
1064
1065impl BinaryViewArray {
1066    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1067    /// If items not utf8 data, validate will fail and error returned.
1068    pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
1069        StringViewType::validate(self.views(), self.data_buffers())?;
1070        unsafe { Ok(self.to_string_view_unchecked()) }
1071    }
1072
1073    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1074    /// # Safety
1075    /// Caller is responsible for ensuring that items in array are utf8 data.
1076    pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
1077        unsafe { StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1078    }
1079}
1080
1081impl From<Vec<&[u8]>> for BinaryViewArray {
1082    fn from(v: Vec<&[u8]>) -> Self {
1083        Self::from_iter_values(v)
1084    }
1085}
1086
1087impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
1088    fn from(v: Vec<Option<&[u8]>>) -> Self {
1089        v.into_iter().collect()
1090    }
1091}
1092
1093/// A [`GenericByteViewArray`] that stores utf8 data
1094///
1095/// See [`GenericByteViewArray`] for format and layout details.
1096///
1097/// # Example
1098/// ```
1099/// use arrow_array::StringViewArray;
1100/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
1101/// assert_eq!(array.value(0), "hello");
1102/// assert_eq!(array.value(3), "large payload over 12 bytes");
1103/// ```
1104pub type StringViewArray = GenericByteViewArray<StringViewType>;
1105
1106impl StringViewArray {
1107    /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
1108    pub fn to_binary_view(self) -> BinaryViewArray {
1109        unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1110    }
1111
1112    /// Returns true if all data within this array is ASCII
1113    pub fn is_ascii(&self) -> bool {
1114        // Alternative (but incorrect): directly check the underlying buffers
1115        // (1) Our string view might be sparse, i.e., a subset of the buffers,
1116        //      so even if the buffer is not ascii, we can still be ascii.
1117        // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
1118        // This means that this operation is quite expensive, shall we cache the result?
1119        //  i.e. track `is_ascii` in the builder.
1120        self.iter().all(|v| match v {
1121            Some(v) => v.is_ascii(),
1122            None => true,
1123        })
1124    }
1125}
1126
1127impl From<Vec<&str>> for StringViewArray {
1128    fn from(v: Vec<&str>) -> Self {
1129        Self::from_iter_values(v)
1130    }
1131}
1132
1133impl From<Vec<Option<&str>>> for StringViewArray {
1134    fn from(v: Vec<Option<&str>>) -> Self {
1135        v.into_iter().collect()
1136    }
1137}
1138
1139impl From<Vec<String>> for StringViewArray {
1140    fn from(v: Vec<String>) -> Self {
1141        Self::from_iter_values(v)
1142    }
1143}
1144
1145impl From<Vec<Option<String>>> for StringViewArray {
1146    fn from(v: Vec<Option<String>>) -> Self {
1147        v.into_iter().collect()
1148    }
1149}
1150
1151#[cfg(test)]
1152mod tests {
1153    use crate::builder::{BinaryViewBuilder, StringViewBuilder};
1154    use crate::types::BinaryViewType;
1155    use crate::{
1156        Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
1157    };
1158    use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
1159    use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
1160    use arrow_schema::DataType;
1161    use rand::prelude::StdRng;
1162    use rand::{Rng, SeedableRng};
1163    use std::str::from_utf8;
1164
1165    const BLOCK_SIZE: u32 = 8;
1166
1167    #[test]
1168    fn try_new_string() {
1169        let array = StringViewArray::from_iter_values(vec![
1170            "hello",
1171            "world",
1172            "lulu",
1173            "large payload over 12 bytes",
1174        ]);
1175        assert_eq!(array.value(0), "hello");
1176        assert_eq!(array.value(3), "large payload over 12 bytes");
1177    }
1178
1179    #[test]
1180    fn try_new_binary() {
1181        let array = BinaryViewArray::from_iter_values(vec![
1182            b"hello".as_slice(),
1183            b"world".as_slice(),
1184            b"lulu".as_slice(),
1185            b"large payload over 12 bytes".as_slice(),
1186        ]);
1187        assert_eq!(array.value(0), b"hello");
1188        assert_eq!(array.value(3), b"large payload over 12 bytes");
1189    }
1190
1191    #[test]
1192    fn try_new_empty_string() {
1193        // test empty array
1194        let array = {
1195            let mut builder = StringViewBuilder::new();
1196            builder.finish()
1197        };
1198        assert!(array.is_empty());
1199    }
1200
1201    #[test]
1202    fn try_new_empty_binary() {
1203        // test empty array
1204        let array = {
1205            let mut builder = BinaryViewBuilder::new();
1206            builder.finish()
1207        };
1208        assert!(array.is_empty());
1209    }
1210
1211    #[test]
1212    fn test_append_string() {
1213        // test builder append
1214        let array = {
1215            let mut builder = StringViewBuilder::new();
1216            builder.append_value("hello");
1217            builder.append_null();
1218            builder.append_option(Some("large payload over 12 bytes"));
1219            builder.finish()
1220        };
1221        assert_eq!(array.value(0), "hello");
1222        assert!(array.is_null(1));
1223        assert_eq!(array.value(2), "large payload over 12 bytes");
1224    }
1225
1226    #[test]
1227    fn test_append_binary() {
1228        // test builder append
1229        let array = {
1230            let mut builder = BinaryViewBuilder::new();
1231            builder.append_value(b"hello");
1232            builder.append_null();
1233            builder.append_option(Some(b"large payload over 12 bytes"));
1234            builder.finish()
1235        };
1236        assert_eq!(array.value(0), b"hello");
1237        assert!(array.is_null(1));
1238        assert_eq!(array.value(2), b"large payload over 12 bytes");
1239    }
1240
1241    #[test]
1242    fn test_in_progress_recreation() {
1243        let array = {
1244            // make a builder with small block size.
1245            let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
1246            builder.append_value("large payload over 12 bytes");
1247            builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
1248            builder.finish()
1249        };
1250        assert_eq!(array.value(0), "large payload over 12 bytes");
1251        assert_eq!(
1252            array.value(1),
1253            "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"
1254        );
1255        assert_eq!(2, array.buffers.len());
1256    }
1257
1258    #[test]
1259    #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
1260    fn new_with_invalid_view_data() {
1261        let v = "large payload over 12 bytes";
1262        let view = ByteView::new(13, &v.as_bytes()[0..4])
1263            .with_buffer_index(3)
1264            .with_offset(1);
1265        let views = ScalarBuffer::from(vec![view.into()]);
1266        let buffers = vec![Buffer::from_slice_ref(v)];
1267        StringViewArray::new(views, buffers, None);
1268    }
1269
1270    #[test]
1271    #[should_panic(
1272        expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
1273    )]
1274    fn new_with_invalid_utf8_data() {
1275        let v: Vec<u8> = vec![
1276            // invalid UTF8
1277            0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
1278            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1279        ];
1280        let view = ByteView::new(v.len() as u32, &v[0..4]);
1281        let views = ScalarBuffer::from(vec![view.into()]);
1282        let buffers = vec![Buffer::from_slice_ref(v)];
1283        StringViewArray::new(views, buffers, None);
1284    }
1285
1286    #[test]
1287    #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
1288    fn new_with_invalid_zero_padding() {
1289        let mut data = [0; 12];
1290        data[0] = b'H';
1291        data[11] = 1; // no zero padding
1292
1293        let mut view_buffer = [0; 16];
1294        view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
1295        view_buffer[4..].copy_from_slice(&data);
1296
1297        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1298        let views = ScalarBuffer::from(vec![view.into()]);
1299        let buffers = vec![];
1300        StringViewArray::new(views, buffers, None);
1301    }
1302
1303    #[test]
1304    #[should_panic(expected = "Mismatch between embedded prefix and data")]
1305    fn test_mismatch_between_embedded_prefix_and_data() {
1306        let input_str_1 = "Hello, Rustaceans!";
1307        let input_str_2 = "Hallo, Rustaceans!";
1308        let length = input_str_1.len() as u32;
1309        assert!(input_str_1.len() > 12);
1310
1311        let mut view_buffer = [0; 16];
1312        view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
1313        view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
1314        view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
1315        view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
1316        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1317        let views = ScalarBuffer::from(vec![view.into()]);
1318        let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
1319
1320        StringViewArray::new(views, buffers, None);
1321    }
1322
1323    #[test]
1324    fn test_gc() {
1325        let test_data = [
1326            Some("longer than 12 bytes"),
1327            Some("short"),
1328            Some("t"),
1329            Some("longer than 12 bytes"),
1330            None,
1331            Some("short"),
1332        ];
1333
1334        let array = {
1335            let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1336            test_data.into_iter().for_each(|v| builder.append_option(v));
1337            builder.finish()
1338        };
1339        assert!(array.buffers.len() > 1);
1340
1341        fn check_gc(to_test: &StringViewArray) {
1342            let gc = to_test.gc();
1343            assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1344
1345            to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1346                assert_eq!(a, b);
1347            });
1348            assert_eq!(to_test.len(), gc.len());
1349        }
1350
1351        check_gc(&array);
1352        check_gc(&array.slice(1, 3));
1353        check_gc(&array.slice(2, 1));
1354        check_gc(&array.slice(2, 2));
1355        check_gc(&array.slice(3, 1));
1356    }
1357
1358    /// 1) Empty array: no elements, expect gc to return empty with no data buffers
1359    #[test]
1360    fn test_gc_empty_array() {
1361        let array = StringViewBuilder::new()
1362            .with_fixed_block_size(BLOCK_SIZE)
1363            .finish();
1364        let gced = array.gc();
1365        // length and null count remain zero
1366        assert_eq!(gced.len(), 0);
1367        assert_eq!(gced.null_count(), 0);
1368        // no underlying data buffers should be allocated
1369        assert!(
1370            gced.data_buffers().is_empty(),
1371            "Expected no data buffers for empty array"
1372        );
1373    }
1374
1375    /// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values
1376    #[test]
1377    fn test_gc_all_inline() {
1378        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1379        // append many short strings, each exactly INLINE_LEN long
1380        for _ in 0..100 {
1381            let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize);
1382            builder.append_option(Some(&s));
1383        }
1384        let array = builder.finish();
1385        let gced = array.gc();
1386        // Since all views fit inline, data buffer is empty
1387        assert_eq!(
1388            gced.data_buffers().len(),
1389            0,
1390            "Should have no data buffers for inline values"
1391        );
1392        assert_eq!(gced.len(), 100);
1393        // verify element-wise equality
1394        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1395            assert_eq!(orig, got, "Inline value mismatch after gc");
1396        });
1397    }
1398
1399    /// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer
1400    #[test]
1401    fn test_gc_all_large() {
1402        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1403        let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5);
1404        // append multiple large strings
1405        for _ in 0..50 {
1406            builder.append_option(Some(&large_str));
1407        }
1408        let array = builder.finish();
1409        let gced = array.gc();
1410        // New data buffers should be populated (one or more blocks)
1411        assert!(
1412            !gced.data_buffers().is_empty(),
1413            "Expected data buffers for large values"
1414        );
1415        assert_eq!(gced.len(), 50);
1416        // verify that every large string emerges unchanged
1417        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1418            assert_eq!(orig, got, "Large view mismatch after gc");
1419        });
1420    }
1421
1422    /// 4) All null elements: ensure null bitmap handling path is correct
1423    #[test]
1424    fn test_gc_all_nulls() {
1425        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1426        for _ in 0..20 {
1427            builder.append_null();
1428        }
1429        let array = builder.finish();
1430        let gced = array.gc();
1431        // length and null count match
1432        assert_eq!(gced.len(), 20);
1433        assert_eq!(gced.null_count(), 20);
1434        // data buffers remain empty for null-only array
1435        assert!(
1436            gced.data_buffers().is_empty(),
1437            "No data should be stored for nulls"
1438        );
1439    }
1440
1441    /// 5) Random mix of inline, large, and null values with slicing tests
1442    #[test]
1443    fn test_gc_random_mixed_and_slices() {
1444        let mut rng = StdRng::seed_from_u64(42);
1445        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1446        // Keep a Vec of original Option<String> for later comparison
1447        let mut original: Vec<Option<String>> = Vec::new();
1448
1449        for _ in 0..200 {
1450            if rng.random_bool(0.1) {
1451                // 10% nulls
1452                builder.append_null();
1453                original.push(None);
1454            } else {
1455                // random length between 0 and twice the inline limit
1456                let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2));
1457                let s: String = "A".repeat(len as usize);
1458                builder.append_option(Some(&s));
1459                original.push(Some(s));
1460            }
1461        }
1462
1463        let array = builder.finish();
1464        // Test multiple slice ranges to ensure offset logic is correct
1465        for (offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] {
1466            let sliced = array.slice(*offset, *slice_len);
1467            let gced = sliced.gc();
1468            // Build expected slice of Option<&str>
1469            let expected: Vec<Option<&str>> = original[*offset..(*offset + *slice_len)]
1470                .iter()
1471                .map(|opt| opt.as_deref())
1472                .collect();
1473
1474            assert_eq!(gced.len(), *slice_len, "Slice length mismatch");
1475            // Compare element-wise
1476            gced.iter().zip(expected.iter()).for_each(|(got, expect)| {
1477                assert_eq!(got, *expect, "Value mismatch in mixed slice after gc");
1478            });
1479        }
1480    }
1481
1482    #[test]
1483    #[cfg_attr(miri, ignore)] // Takes too long
1484    fn test_gc_huge_array() {
1485        // Construct multiple 128 MiB BinaryView entries so total > 4 GiB
1486        let block_len: usize = 128 * 1024 * 1024; // 128 MiB per view
1487        let num_views: usize = 36;
1488
1489        // Create a single 128 MiB data block with a simple byte pattern
1490        let buffer = Buffer::from_vec(vec![0xAB; block_len]);
1491        let buffer2 = Buffer::from_vec(vec![0xFF; block_len]);
1492
1493        // Append this block and then add many views pointing to it
1494        let mut builder = BinaryViewBuilder::new();
1495        let block_id = builder.append_block(buffer);
1496        for _ in 0..num_views / 2 {
1497            builder
1498                .try_append_view(block_id, 0, block_len as u32)
1499                .expect("append view into 128MiB block");
1500        }
1501        let block_id2 = builder.append_block(buffer2);
1502        for _ in 0..num_views / 2 {
1503            builder
1504                .try_append_view(block_id2, 0, block_len as u32)
1505                .expect("append view into 128MiB block");
1506        }
1507
1508        let array = builder.finish();
1509        let total = array.total_buffer_bytes_used();
1510        assert!(
1511            total > u32::MAX as usize,
1512            "Expected total non-inline bytes to exceed 4 GiB, got {}",
1513            total
1514        );
1515
1516        // Run gc and verify correctness
1517        let gced = array.gc();
1518        assert_eq!(gced.len(), num_views, "Length mismatch after gc");
1519        assert_eq!(gced.null_count(), 0, "Null count mismatch after gc");
1520        assert_ne!(
1521            gced.data_buffers().len(),
1522            1,
1523            "gc with huge buffer should not consolidate data into a single buffer"
1524        );
1525
1526        // Element-wise equality check across the entire array
1527        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1528            assert_eq!(orig, got, "Value mismatch after gc on huge array");
1529        });
1530    }
1531
1532    #[test]
1533    fn test_eq() {
1534        let test_data = [
1535            Some("longer than 12 bytes"),
1536            None,
1537            Some("short"),
1538            Some("again, this is longer than 12 bytes"),
1539        ];
1540
1541        let array1 = {
1542            let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
1543            test_data.into_iter().for_each(|v| builder.append_option(v));
1544            builder.finish()
1545        };
1546        let array2 = {
1547            // create a new array with the same data but different layout
1548            let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
1549            test_data.into_iter().for_each(|v| builder.append_option(v));
1550            builder.finish()
1551        };
1552        assert_eq!(array1, array1.clone());
1553        assert_eq!(array2, array2.clone());
1554        assert_eq!(array1, array2);
1555    }
1556
1557    /// Integration tests for `inline_key_fast` covering:
1558    ///
1559    /// 1. Monotonic ordering across increasing lengths and lexical variations.
1560    /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence.
1561    ///
1562    /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why
1563    /// the length field is required even when all inline bytes fit in 12 bytes.
1564    ///
1565    /// The test includes strings that verify correct byte order (prevent reversal bugs),
1566    /// and length-based tie-breaking in the composite key.
1567    ///
1568    /// The test confirms that `inline_key_fast` produces keys which sort consistently
1569    /// with the expected lexicographical order of the raw byte arrays.
1570    #[test]
1571    fn test_inline_key_fast_various_lengths_and_lexical() {
1572        /// Helper to create a raw u128 value representing an inline ByteView:
1573        /// - `length`: number of meaningful bytes (must be ≤ 12)
1574        /// - `data`: the actual inline data bytes
1575        ///
1576        /// The first 4 bytes encode length in little-endian,
1577        /// the following 12 bytes contain the inline string data (unpadded).
1578        fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
1579            assert!(length as usize <= 12, "Inline length must be ≤ 12");
1580            assert!(
1581                data.len() == length as usize,
1582                "Data length must match `length`"
1583            );
1584
1585            let mut raw_bytes = [0u8; 16];
1586            raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian
1587            raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data
1588            u128::from_le_bytes(raw_bytes)
1589        }
1590
1591        // Test inputs: various lengths and lexical orders,
1592        // plus special cases for byte order and length tie-breaking
1593        let test_inputs: Vec<&[u8]> = vec![
1594            b"a",
1595            b"aa",
1596            b"aaa",
1597            b"aab",
1598            b"abcd",
1599            b"abcde",
1600            b"abcdef",
1601            b"abcdefg",
1602            b"abcdefgh",
1603            b"abcdefghi",
1604            b"abcdefghij",
1605            b"abcdefghijk",
1606            b"abcdefghijkl",
1607            // Tests for byte-order reversal bug:
1608            // Without the fix, "backend one" would compare as "eno dnekcab",
1609            // causing incorrect sort order relative to "backend two".
1610            b"backend one",
1611            b"backend two",
1612            // Tests length-tiebreaker logic:
1613            // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data,
1614            // so only the length differentiates their ordering.
1615            b"bar",
1616            b"bar\0",
1617            // Additional lexical and length tie-breaking cases with same prefix, in correct lex order:
1618            b"than12Byt",
1619            b"than12Bytes",
1620            b"than12Bytes\0",
1621            b"than12Bytesx",
1622            b"than12Bytex",
1623            b"than12Bytez",
1624            // Additional lexical tests
1625            b"xyy",
1626            b"xyz",
1627            b"xza",
1628        ];
1629
1630        // Create a GenericBinaryArray for cross-comparison of lex order
1631        let array: GenericBinaryArray<i32> =
1632            GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>());
1633
1634        for i in 0..array.len() - 1 {
1635            let v1 = array.value(i);
1636            let v2 = array.value(i + 1);
1637
1638            // Assert the array's natural lexical ordering is correct
1639            assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");
1640
1641            // Assert the keys produced by inline_key_fast reflect the same ordering
1642            let key1 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1643                v1.len() as u32,
1644                v1,
1645            ));
1646            let key2 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1647                v2.len() as u32,
1648                v2,
1649            ));
1650
1651            assert!(
1652                key1 < key2,
1653                "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
1654            );
1655        }
1656    }
1657
1658    #[test]
1659    fn empty_array_should_return_empty_lengths_iterator() {
1660        let empty = GenericByteViewArray::<BinaryViewType>::from(Vec::<&[u8]>::new());
1661
1662        let mut lengths_iter = empty.lengths();
1663        assert_eq!(lengths_iter.len(), 0);
1664        assert_eq!(lengths_iter.next(), None);
1665    }
1666
1667    #[test]
1668    fn array_lengths_should_return_correct_length_for_both_inlined_and_non_inlined() {
1669        let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1670            // Not inlined as longer than 12 bytes
1671            b"Supercalifragilisticexpialidocious" as &[u8],
1672            // Inlined as shorter than 12 bytes
1673            b"Hello",
1674            // Empty value
1675            b"",
1676            // Exactly 12 bytes
1677            b"abcdefghijkl",
1678        ]);
1679
1680        let mut lengths_iter = cases.lengths();
1681
1682        assert_eq!(lengths_iter.len(), cases.len());
1683
1684        let cases_iter = cases.iter();
1685
1686        for case in cases_iter {
1687            let case_value = case.unwrap();
1688            let length = lengths_iter.next().expect("Should have a length");
1689
1690            assert_eq!(case_value.len(), length as usize);
1691        }
1692
1693        assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1694    }
1695
1696    #[test]
1697    fn array_lengths_should_return_the_underlying_length_for_null_values() {
1698        let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1699            // Not inlined as longer than 12 bytes
1700            b"Supercalifragilisticexpialidocious" as &[u8],
1701            // Inlined as shorter than 12 bytes
1702            b"Hello",
1703            // Empty value
1704            b"",
1705            // Exactly 12 bytes
1706            b"abcdefghijkl",
1707        ]);
1708
1709        let (views, buffer, _) = cases.clone().into_parts();
1710
1711        // Keeping the values but just adding nulls on top
1712        let cases_with_all_nulls = GenericByteViewArray::<BinaryViewType>::new(
1713            views,
1714            buffer,
1715            Some(NullBuffer::new_null(cases.len())),
1716        );
1717
1718        let lengths_iter = cases.lengths();
1719        let mut all_nulls_lengths_iter = cases_with_all_nulls.lengths();
1720
1721        assert_eq!(lengths_iter.len(), all_nulls_lengths_iter.len());
1722
1723        for expected_length in lengths_iter {
1724            let actual_length = all_nulls_lengths_iter.next().expect("Should have a length");
1725
1726            assert_eq!(expected_length, actual_length);
1727        }
1728
1729        assert_eq!(
1730            all_nulls_lengths_iter.next(),
1731            None,
1732            "Should not have more lengths"
1733        );
1734    }
1735
1736    #[test]
1737    fn array_lengths_on_sliced_should_only_return_lengths_for_sliced_data() {
1738        let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1739            b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1740            b"Hello",
1741            b"something great",
1742            b"is",
1743            b"coming soon!",
1744            b"when you find what it is",
1745            b"let me know",
1746            b"cause",
1747            b"I",
1748            b"have no idea",
1749            b"what it",
1750            b"is",
1751        ]);
1752
1753        let sliced_array = array.slice(2, array.len() - 3);
1754
1755        let mut lengths_iter = sliced_array.lengths();
1756
1757        assert_eq!(lengths_iter.len(), sliced_array.len());
1758
1759        let values_iter = sliced_array.iter();
1760
1761        for value in values_iter {
1762            let value = value.unwrap();
1763            let length = lengths_iter.next().expect("Should have a length");
1764
1765            assert_eq!(value.len(), length as usize);
1766        }
1767
1768        assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1769    }
1770
1771    #[should_panic(expected = "Mismatched data type, expected Utf8View, got BinaryView")]
1772    #[test]
1773    fn invalid_casting_from_array_data() {
1774        // Should not be able to cast to StringViewArray due to invalid UTF-8
1775        let array_data = binary_view_array_with_invalid_utf8_data().into_data();
1776        let _ = StringViewArray::from(array_data);
1777    }
1778
1779    #[should_panic(expected = "invalid utf-8 sequence")]
1780    #[test]
1781    fn invalid_array_data() {
1782        let (views, buffers, nulls) = binary_view_array_with_invalid_utf8_data().into_parts();
1783
1784        // manually try and add invalid array data with Utf8View data type
1785        let mut builder = ArrayDataBuilder::new(DataType::Utf8View)
1786            .add_buffer(views.into_inner())
1787            .len(3);
1788        for buffer in buffers.iter() {
1789            builder = builder.add_buffer(buffer.clone())
1790        }
1791        builder = builder.nulls(nulls);
1792
1793        let data = builder.build().unwrap(); // should fail validation
1794        let _arr = StringViewArray::from(data);
1795    }
1796
1797    /// Returns a BinaryViewArray with one invalid UTF-8 value
1798    fn binary_view_array_with_invalid_utf8_data() -> BinaryViewArray {
1799        let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1800            b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1801            &[
1802                0xf0, 0x80, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1803                0x00, 0x00,
1804            ],
1805            b"good",
1806        ]);
1807        assert!(from_utf8(array.value(0)).is_ok());
1808        assert!(from_utf8(array.value(1)).is_err()); // value 1 is invalid utf8
1809        assert!(from_utf8(array.value(2)).is_ok());
1810        array
1811    }
1812}