Skip to main content

arrow_array/array/
byte_view_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
26use arrow_schema::{ArrowError, DataType};
27use core::str;
28use num_traits::ToPrimitive;
29use std::any::Any;
30use std::cmp::Ordering;
31use std::fmt::Debug;
32use std::marker::PhantomData;
33use std::sync::Arc;
34
35use super::ByteArrayType;
36
37/// [Variable-size Binary View Layout]: An array of variable length bytes views.
38///
39/// This array type is used to store variable length byte data (e.g. Strings, Binary)
40/// and has efficient operations such as `take`, `filter`, and comparison.
41///
42/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
43///
44/// This is different from [`GenericByteArray`], which also stores variable
45/// length byte data, as it represents strings with an offset and length. `take`
46/// and `filter` like operations are implemented by manipulating the "views"
47/// (`u128`) without modifying the bytes. Each view also stores an inlined
48/// prefix which speed up comparisons.
49///
50/// # See Also
51///
52/// * [`StringViewArray`] for storing utf8 encoded string data
53/// * [`BinaryViewArray`] for storing bytes
54/// * [`ByteView`] to interpret `u128`s layout of the views.
55///
56/// [`ByteView`]: arrow_data::ByteView
57///
58/// # Layout: "views" and buffers
59///
60/// A `GenericByteViewArray` stores variable length byte strings. An array of
61/// `N` elements is stored as `N` fixed length "views" and a variable number
62/// of variable length "buffers".
63///
64/// Each view is a `u128` value whose layout is different depending on the
65/// length of the string stored at that location:
66///
67/// ```text
68///                         ┌──────┬────────────────────────┐
69///                         │length│      string value      │
70///    Strings (len <= 12)  │      │    (padded with 0)     │
71///                         └──────┴────────────────────────┘
72///                          0    31                      127
73///
74///                         ┌───────┬───────┬───────┬───────┐
75///                         │length │prefix │  buf  │offset │
76///    Strings (len > 12)   │       │       │ index │       │
77///                         └───────┴───────┴───────┴───────┘
78///                          0    31       63      95    127
79/// ```
80///
81/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
82///   the view. See [`Self::inline_value`] to access the inlined prefix from a
83///   short view.
84///
85/// * Strings with length > 12: The first four bytes are stored inline in the
86///   view and the entire string is stored in one of the buffers. See [`ByteView`]
87///   to access the fields of the these views.
88///
89/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
90/// the easiest and fastest way to work with this data. However, it is possible
91/// to access the views and buffers directly for more control.
92///
93/// For example
94///
95/// ```rust
96/// # use arrow_array::StringViewArray;
97/// # use arrow_array::Array;
98/// use arrow_data::ByteView;
99/// let array = StringViewArray::from(vec![
100///   "hello",
101///   "this string is longer than 12 bytes",
102///   "this string is also longer than 12 bytes"
103/// ]);
104///
105/// // ** Examine the first view (short string) **
106/// assert!(array.is_valid(0)); // Check for nulls
107/// let short_view: u128 = array.views()[0]; // "hello"
108/// // get length of the string
109/// let len = short_view as u32;
110/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view
111/// // SAFETY: `view` is a valid view
112/// let value = unsafe {
113///   StringViewArray::inline_value(&short_view, len as usize)
114/// };
115/// assert_eq!(value, b"hello");
116///
117/// // ** Examine the third view (long string) **
118/// assert!(array.is_valid(12)); // Check for nulls
119/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
120/// let len = long_view as u32;
121/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
122/// let view = ByteView::from(long_view); // use ByteView to access the fields
123/// assert_eq!(view.length, 40);
124/// assert_eq!(view.buffer_index, 0);
125/// assert_eq!(view.offset, 35); // data starts after the first long string
126/// // Views for long strings store a 4 byte prefix
127/// let prefix = view.prefix.to_le_bytes();
128/// assert_eq!(&prefix, b"this");
129/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
130/// assert_eq!(value, "this string is also longer than 12 bytes");
131/// ```
132///
133/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
134/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
135///
136/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
137/// than they must point into a valid buffer. However, they can be out of order,
138/// non continuous and overlapping.
139///
140/// For example, in the following diagram, the strings "FishWasInTownToday" and
141/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
142/// separate buffer while the string "LavaMonster" is stored inlined in the
143/// view. In this case, the same bytes for "Fish" are used to store both strings.
144///
145/// [`ByteView`]: arrow_data::ByteView
146///
147/// ```text
148///                                                                            ┌───┐
149///                         ┌──────┬──────┬──────┬──────┐               offset │...│
150/// "FishWasInTownTodayYay" │  21  │ Fish │  0   │ 115  │─ ─              103  │Mr.│
151///                         └──────┴──────┴──────┴──────┘   │      ┌ ─ ─ ─ ─ ▶ │Cru│
152///                         ┌──────┬──────┬──────┬──────┐                      │mpl│
153/// "CrumpleFacedFish"      │  16  │ Crum │  0   │ 103  │─ ─│─ ─ ─ ┘           │eFa│
154///                         └──────┴──────┴──────┴──────┘                      │ced│
155///                         ┌──────┬────────────────────┐   └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
156/// "LavaMonster"           │  11  │   LavaMonster      │                      │hWa│
157///                         └──────┴────────────────────┘               offset │sIn│
158///                                                                       115  │Tow│
159///                                                                            │nTo│
160///                                                                            │day│
161///                                  u128 "views"                              │Yay│
162///                                                                   buffer 0 │...│
163///                                                                            └───┘
164/// ```
165pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
166    data_type: DataType,
167    views: ScalarBuffer<u128>,
168    buffers: Arc<[Buffer]>,
169    phantom: PhantomData<T>,
170    nulls: Option<NullBuffer>,
171}
172
173impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
174    fn clone(&self) -> Self {
175        Self {
176            data_type: T::DATA_TYPE,
177            views: self.views.clone(),
178            buffers: self.buffers.clone(),
179            nulls: self.nulls.clone(),
180            phantom: Default::default(),
181        }
182    }
183}
184
185impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
186    /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
187    ///
188    /// # Panics
189    ///
190    /// Panics if [`GenericByteViewArray::try_new`] returns an error
191    pub fn new<U>(views: ScalarBuffer<u128>, buffers: U, nulls: Option<NullBuffer>) -> Self
192    where
193        U: Into<Arc<[Buffer]>>,
194    {
195        Self::try_new(views, buffers, nulls).unwrap()
196    }
197
198    /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
199    ///
200    /// # Errors
201    ///
202    /// * `views.len() != nulls.len()`
203    /// * [ByteViewType::validate] fails
204    pub fn try_new<U>(
205        views: ScalarBuffer<u128>,
206        buffers: U,
207        nulls: Option<NullBuffer>,
208    ) -> Result<Self, ArrowError>
209    where
210        U: Into<Arc<[Buffer]>>,
211    {
212        let buffers: Arc<[Buffer]> = buffers.into();
213
214        T::validate(&views, &buffers)?;
215
216        if let Some(n) = nulls.as_ref() {
217            if n.len() != views.len() {
218                return Err(ArrowError::InvalidArgumentError(format!(
219                    "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
220                    T::PREFIX,
221                    views.len(),
222                    n.len(),
223                )));
224            }
225        }
226
227        Ok(Self {
228            data_type: T::DATA_TYPE,
229            views,
230            buffers,
231            nulls,
232            phantom: Default::default(),
233        })
234    }
235
236    /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
237    ///
238    /// # Safety
239    ///
240    /// Safe if [`Self::try_new`] would not error
241    pub unsafe fn new_unchecked<U>(
242        views: ScalarBuffer<u128>,
243        buffers: U,
244        nulls: Option<NullBuffer>,
245    ) -> Self
246    where
247        U: Into<Arc<[Buffer]>>,
248    {
249        if cfg!(feature = "force_validate") {
250            return Self::new(views, buffers, nulls);
251        }
252
253        Self {
254            data_type: T::DATA_TYPE,
255            phantom: Default::default(),
256            views,
257            buffers: buffers.into(),
258            nulls,
259        }
260    }
261
262    /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
263    pub fn new_null(len: usize) -> Self {
264        Self {
265            data_type: T::DATA_TYPE,
266            views: vec![0; len].into(),
267            buffers: vec![].into(),
268            nulls: Some(NullBuffer::new_null(len)),
269            phantom: Default::default(),
270        }
271    }
272
273    /// Create a new [`Scalar`] from `value`
274    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
275        Scalar::new(Self::from_iter_values(std::iter::once(value)))
276    }
277
278    /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
279    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
280    where
281        Ptr: AsRef<T::Native>,
282        I: IntoIterator<Item = Ptr>,
283    {
284        let iter = iter.into_iter();
285        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
286        for v in iter {
287            builder.append_value(v);
288        }
289        builder.finish()
290    }
291
292    /// Deconstruct this array into its constituent parts
293    pub fn into_parts(self) -> (ScalarBuffer<u128>, Arc<[Buffer]>, Option<NullBuffer>) {
294        (self.views, self.buffers, self.nulls)
295    }
296
297    /// Returns the views buffer
298    #[inline]
299    pub fn views(&self) -> &ScalarBuffer<u128> {
300        &self.views
301    }
302
303    /// Returns the buffers storing string data
304    #[inline]
305    pub fn data_buffers(&self) -> &[Buffer] {
306        &self.buffers
307    }
308
309    /// Returns the element at index `i`
310    ///
311    /// Note: This method does not check for nulls and the value is arbitrary
312    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
313    ///
314    /// # Panics
315    /// Panics if index `i` is out of bounds.
316    pub fn value(&self, i: usize) -> &T::Native {
317        assert!(
318            i < self.len(),
319            "Trying to access an element at index {} from a {}ViewArray of length {}",
320            i,
321            T::PREFIX,
322            self.len()
323        );
324
325        unsafe { self.value_unchecked(i) }
326    }
327
328    /// Returns the element at index `i` without bounds checking
329    ///
330    /// Note: This method does not check for nulls and the value is arbitrary
331    /// if [`is_null`](Self::is_null) returns true for the index.
332    ///
333    /// # Safety
334    ///
335    /// Caller is responsible for ensuring that the index is within the bounds
336    /// of the array
337    pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
338        let v = unsafe { self.views.get_unchecked(idx) };
339        let len = *v as u32;
340        let b = if len <= MAX_INLINE_VIEW_LEN {
341            unsafe { Self::inline_value(v, len as usize) }
342        } else {
343            let view = ByteView::from(*v);
344            let data = unsafe { self.buffers.get_unchecked(view.buffer_index as usize) };
345            let offset = view.offset as usize;
346            unsafe { data.get_unchecked(offset..offset + len as usize) }
347        };
348        unsafe { T::Native::from_bytes_unchecked(b) }
349    }
350
351    /// Returns the first `len` bytes the inline value of the view.
352    ///
353    /// # Safety
354    /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
355    /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
356    #[inline(always)]
357    pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
358        debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
359        unsafe {
360            std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
361        }
362    }
363
364    /// Constructs a new iterator for iterating over the values of this array
365    pub fn iter(&self) -> ArrayIter<&Self> {
366        ArrayIter::new(self)
367    }
368
369    /// Returns an iterator over the bytes of this array, including null values
370    pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
371        self.views.iter().map(move |v| {
372            let len = *v as u32;
373            if len <= MAX_INLINE_VIEW_LEN {
374                unsafe { Self::inline_value(v, len as usize) }
375            } else {
376                let view = ByteView::from(*v);
377                let data = &self.buffers[view.buffer_index as usize];
378                let offset = view.offset as usize;
379                unsafe { data.get_unchecked(offset..offset + len as usize) }
380            }
381        })
382    }
383
384    /// Returns an iterator over the first `prefix_len` bytes of each array
385    /// element, including null values.
386    ///
387    /// If `prefix_len` is larger than the element's length, the iterator will
388    /// return an empty slice (`&[]`).
389    pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
390        self.views().into_iter().map(move |v| {
391            let len = (*v as u32) as usize;
392
393            if len < prefix_len {
394                return &[] as &[u8];
395            }
396
397            if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
398                unsafe { StringViewArray::inline_value(v, prefix_len) }
399            } else {
400                let view = ByteView::from(*v);
401                let data = unsafe {
402                    self.data_buffers()
403                        .get_unchecked(view.buffer_index as usize)
404                };
405                let offset = view.offset as usize;
406                unsafe { data.get_unchecked(offset..offset + prefix_len) }
407            }
408        })
409    }
410
411    /// Returns an iterator over the last `suffix_len` bytes of each array
412    /// element, including null values.
413    ///
414    /// Note that for [`StringViewArray`] the last bytes may start in the middle
415    /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
416    ///
417    /// If `suffix_len` is larger than the element's length, the iterator will
418    /// return an empty slice (`&[]`).
419    pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
420        self.views().into_iter().map(move |v| {
421            let len = (*v as u32) as usize;
422
423            if len < suffix_len {
424                return &[] as &[u8];
425            }
426
427            if len as u32 <= MAX_INLINE_VIEW_LEN {
428                unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
429            } else {
430                let view = ByteView::from(*v);
431                let data = unsafe {
432                    self.data_buffers()
433                        .get_unchecked(view.buffer_index as usize)
434                };
435                let offset = view.offset as usize;
436                unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
437            }
438        })
439    }
440
441    /// Return an iterator over the length of each array element, including null values.
442    ///
443    /// Null values length would equal to the underlying bytes length and NOT 0
444    ///
445    /// Example of getting 0 for null values
446    /// ```rust
447    /// # use arrow_array::StringViewArray;
448    /// # use arrow_array::Array;
449    /// use arrow_data::ByteView;
450    ///
451    /// fn lengths_with_zero_for_nulls(view: &StringViewArray) -> impl Iterator<Item = u32> {
452    ///     view.lengths()
453    ///         .enumerate()
454    ///         .map(|(index, length)| if view.is_null(index) { 0 } else { length })
455    /// }
456    /// ```
457    pub fn lengths(&self) -> impl ExactSizeIterator<Item = u32> + Clone {
458        self.views().iter().map(|v| *v as u32)
459    }
460
461    /// Returns a zero-copy slice of this array with the indicated offset and length.
462    pub fn slice(&self, offset: usize, length: usize) -> Self {
463        Self {
464            data_type: T::DATA_TYPE,
465            views: self.views.slice(offset, length),
466            buffers: self.buffers.clone(),
467            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
468            phantom: Default::default(),
469        }
470    }
471
472    /// Returns a "compacted" version of this array
473    ///
474    /// The original array will *not* be modified
475    ///
476    /// # Garbage Collection
477    ///
478    /// Before GC:
479    /// ```text
480    ///                                        ┌──────┐
481    ///                                        │......│
482    ///                                        │......│
483    /// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer
484    /// │       View 1       │─ ─ ─ ─          │......│  with data that
485    /// ├────────────────────┤                 │......│ is not referred
486    /// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
487    /// └────────────────────┘                 │......│      View 2
488    ///                                        │......│
489    ///    2 views, refer to                   │......│
490    ///   small portions of a                  └──────┘
491    ///      large buffer
492    /// ```
493    ///
494    /// After GC:
495    ///
496    /// ```text
497    /// ┌────────────────────┐                 ┌─────┐    After gc, only
498    /// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is
499    /// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by
500    /// │       View 2       │─ ─ ─ ─          └─────┘     the views is
501    /// └────────────────────┘                                 left
502    ///
503    ///
504    ///         2 views
505    /// ```
506    /// This method will compact the data buffers by recreating the view array and only include the data
507    /// that is pointed to by the views.
508    ///
509    /// Note that it will copy the array regardless of whether the original array is compact.
510    /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
511    /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
512    ///
513    /// Note: this function does not attempt to canonicalize / deduplicate values. For this
514    /// feature see  [`GenericByteViewBuilder::with_deduplicate_strings`].
515    pub fn gc(&self) -> Self {
516        // 1) Read basic properties once
517        let len = self.len(); // number of elements
518        let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap
519
520        // 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks
521        if self.data_buffers().is_empty() {
522            return unsafe {
523                GenericByteViewArray::new_unchecked(
524                    self.views().clone(),
525                    vec![], // empty data blocks
526                    nulls,
527                )
528            };
529        }
530
531        // 2) Calculate total size of all non-inline data and detect if any exists
532        let total_large = self.total_buffer_bytes_used();
533
534        // 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing
535        if total_large == 0 {
536            // Views are inline-only or all null; just reuse original views and no data blocks
537            return unsafe {
538                GenericByteViewArray::new_unchecked(
539                    self.views().clone(),
540                    vec![], // empty data blocks
541                    nulls,
542                )
543            };
544        }
545
546        let (views_buf, data_blocks) = if total_large < i32::MAX as usize {
547            // fast path, the entire data fits in a single buffer
548            // 3) Allocate exactly capacity for all non-inline data
549            let mut data_buf = Vec::with_capacity(total_large);
550
551            // 4) Iterate over views and process each inline/non-inline view
552            let views_buf: Vec<u128> = (0..len)
553                .map(|i| unsafe { self.copy_view_to_buffer(i, 0, &mut data_buf) })
554                .collect();
555            let data_block = Buffer::from_vec(data_buf);
556            let data_blocks = vec![data_block];
557            (views_buf, data_blocks)
558        } else {
559            // slow path, need to split into multiple buffers
560
561            struct GcCopyGroup {
562                total_buffer_bytes: usize,
563                total_len: usize,
564            }
565
566            impl GcCopyGroup {
567                fn new(total_buffer_bytes: u32, total_len: usize) -> Self {
568                    Self {
569                        total_buffer_bytes: total_buffer_bytes as usize,
570                        total_len,
571                    }
572                }
573            }
574
575            let mut groups = Vec::new();
576            let mut current_length = 0;
577            let mut current_elements = 0;
578
579            for view in self.views() {
580                let len = *view as u32;
581                if len > MAX_INLINE_VIEW_LEN {
582                    if current_length + len > i32::MAX as u32 {
583                        // Start a new group
584                        groups.push(GcCopyGroup::new(current_length, current_elements));
585                        current_length = 0;
586                        current_elements = 0;
587                    }
588                    current_length += len;
589                    current_elements += 1;
590                }
591            }
592            if current_elements != 0 {
593                groups.push(GcCopyGroup::new(current_length, current_elements));
594            }
595            debug_assert!(groups.len() <= i32::MAX as usize);
596
597            // 3) Copy the buffers group by group
598            let mut views_buf = Vec::with_capacity(len);
599            let mut data_blocks = Vec::with_capacity(groups.len());
600
601            let mut current_view_idx = 0;
602
603            for (group_idx, gc_copy_group) in groups.iter().enumerate() {
604                let mut data_buf = Vec::with_capacity(gc_copy_group.total_buffer_bytes);
605
606                // Directly push views to avoid intermediate Vec allocation
607                let new_views = (current_view_idx..current_view_idx + gc_copy_group.total_len).map(
608                    |view_idx| {
609                        // safety: the view index came from iterating over valid range
610                        unsafe {
611                            self.copy_view_to_buffer(view_idx, group_idx as i32, &mut data_buf)
612                        }
613                    },
614                );
615                views_buf.extend(new_views);
616
617                data_blocks.push(Buffer::from_vec(data_buf));
618                current_view_idx += gc_copy_group.total_len;
619            }
620            (views_buf, data_blocks)
621        };
622
623        // 5) Wrap up views buffer
624        let views_scalar = ScalarBuffer::from(views_buf);
625
626        // SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized
627        unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) }
628    }
629
630    /// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer.
631    ///
632    /// # Safety
633    ///
634    /// - `i < self.len()`.
635    /// - Every element in `self.views()` must currently refer to a valid slice
636    ///   inside one of `self.buffers`.
637    /// - `data_buf` must be ready to have additional bytes appended.
638    /// - After this call, the returned view will have its
639    ///   `buffer_index` reset to `buffer_idx` and its `offset` updated so that it points
640    ///   into the bytes just appended at the end of `data_buf`.
641    #[inline(always)]
642    unsafe fn copy_view_to_buffer(
643        &self,
644        i: usize,
645        buffer_idx: i32,
646        data_buf: &mut Vec<u8>,
647    ) -> u128 {
648        // SAFETY: `i < self.len()` ensures this is in‑bounds.
649        let raw_view = unsafe { *self.views().get_unchecked(i) };
650        let mut bv = ByteView::from(raw_view);
651
652        // Inline‑small views stay as‑is.
653        if bv.length <= MAX_INLINE_VIEW_LEN {
654            raw_view
655        } else {
656            // SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length`
657            // must both lie within valid ranges for `self.buffers`.
658            let buffer = unsafe { self.buffers.get_unchecked(bv.buffer_index as usize) };
659            let start = bv.offset as usize;
660            let end = start + bv.length as usize;
661            let slice = unsafe { buffer.get_unchecked(start..end) };
662
663            // Copy out‑of‑line data into our single “0” buffer.
664            let new_offset = data_buf.len() as u32;
665            data_buf.extend_from_slice(slice);
666
667            bv.buffer_index = buffer_idx as u32;
668            bv.offset = new_offset;
669            bv.into()
670        }
671    }
672
673    /// Returns the total number of bytes of all non-null values in this array.
674    ///
675    /// Unlike [`Self::total_buffer_bytes_used`], this method includes inlined strings
676    /// (those with length ≤ [`MAX_INLINE_VIEW_LEN`]), making it suitable as a
677    /// capacity hint when pre-allocating output buffers.
678    ///
679    /// Null values are excluded from the sum.
680    ///
681    /// # Example
682    ///
683    /// ```rust
684    /// # use arrow_array::StringViewArray;
685    /// let array = StringViewArray::from_iter(vec![
686    ///     Some("hello"),   // 5 bytes, inlined
687    ///     None,            // excluded
688    ///     Some("large payload over 12 bytes"),  // 27 bytes, non-inlined
689    /// ]);
690    /// assert_eq!(array.total_bytes_len(), 5 + 27);
691    /// ```
692    pub fn total_bytes_len(&self) -> usize {
693        match self.nulls() {
694            None => self.views().iter().map(|v| (*v as u32) as usize).sum(),
695            Some(nulls) => self
696                .views()
697                .iter()
698                .zip(nulls.iter())
699                .map(|(v, is_valid)| if is_valid { (*v as u32) as usize } else { 0 })
700                .sum(),
701        }
702    }
703
704    /// Returns the total number of bytes used by all non inlined views in all
705    /// buffers.
706    ///
707    /// Note this does not account for views that point at the same underlying
708    /// data in buffers
709    ///
710    /// For example, if the array has three strings views:
711    /// * View with length = 9 (inlined)
712    /// * View with length = 32 (non inlined)
713    /// * View with length = 16 (non inlined)
714    ///
715    /// Then this method would report 48
716    pub fn total_buffer_bytes_used(&self) -> usize {
717        self.views()
718            .iter()
719            .map(|v| {
720                let len = *v as u32;
721                if len > MAX_INLINE_VIEW_LEN {
722                    len as usize
723                } else {
724                    0
725                }
726            })
727            .sum()
728    }
729
730    /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
731    ///
732    /// Comparing two ByteView types are non-trivial.
733    /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
734    ///
735    /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
736    /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
737    ///     Meaning that reading one array element requires only one memory access
738    ///     (two memory access required for StringArray, one for offset buffer, the other for value buffer).
739    ///
740    /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
741    ///     thanks to the inlined 4 bytes.
742    ///     Consider equality check:
743    ///     If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
744    ///
745    /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
746    /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
747    ///   e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
748    ///
749    /// # Order check flow
750    /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
751    /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
752    ///     (2.1) if the inlined 4 bytes are different, we can return the result immediately.
753    ///     (2.2) o.w., we need to compare the full string.
754    ///
755    /// # Safety
756    /// The left/right_idx must within range of each array
757    pub unsafe fn compare_unchecked(
758        left: &GenericByteViewArray<T>,
759        left_idx: usize,
760        right: &GenericByteViewArray<T>,
761        right_idx: usize,
762    ) -> Ordering {
763        let l_view = unsafe { left.views().get_unchecked(left_idx) };
764        let l_byte_view = ByteView::from(*l_view);
765
766        let r_view = unsafe { right.views().get_unchecked(right_idx) };
767        let r_byte_view = ByteView::from(*r_view);
768
769        let l_len = l_byte_view.length;
770        let r_len = r_byte_view.length;
771
772        if l_len <= 12 && r_len <= 12 {
773            return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view));
774        }
775
776        // one of the string is larger than 12 bytes,
777        // we then try to compare the inlined data first
778
779        // Note: In theory, ByteView is only used for string which is larger than 12 bytes,
780        // but we can still use it to get the inlined prefix for shorter strings.
781        // The prefix is always the first 4 bytes of the view, for both short and long strings.
782        let l_inlined_be = l_byte_view.prefix.swap_bytes();
783        let r_inlined_be = r_byte_view.prefix.swap_bytes();
784        if l_inlined_be != r_inlined_be {
785            return l_inlined_be.cmp(&r_inlined_be);
786        }
787
788        // unfortunately, we need to compare the full data
789        let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
790        let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
791
792        l_full_data.cmp(r_full_data)
793    }
794
795    /// Builds a 128-bit composite key for an inline value:
796    ///
797    /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting).
798    /// - Low  32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings
799    ///   (or those with fewer meaningful bytes) always numerically sort before longer ones.
800    ///
801    /// This function extracts the length and the 12-byte inline string data from the raw
802    /// little-endian `u128` representation, converts them to big-endian ordering, and packs them
803    /// into a single `u128` value suitable for fast, branchless comparisons.
804    ///
805    /// # Why include length?
806    ///
807    /// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes
808    /// compare equal—either because one is a true prefix of the other or because zero-padding
809    /// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare
810    /// handles both content and length in one go.
811    ///
812    /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes)
813    ///
814    /// | String     | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding)    |
815    /// |------------|-----------------------|---------------------------------|
816    /// | `"bar"`   | `03 00 00 00`         | `62 61 72` + 9 × `00`           |
817    /// | `"bar\0"`| `04 00 00 00`         | `62 61 72 00` + 8 × `00`        |
818    ///
819    /// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field
820    /// then differentiates:
821    ///
822    /// ```text
823    /// key("bar")   = 0x0000000000000000000062617200000003
824    /// key("bar\0") = 0x0000000000000000000062617200000004
825    /// ⇒ key("bar") < key("bar\0")
826    /// ```
827    /// - `raw` is treated as a 128-bit integer with its bits laid out as follows:
828    ///   - bits 0–31: length (little-endian)
829    ///   - bits 32–127: data (little-endian)
830    ///
831    /// # Inlining and Endianness
832    ///
833    /// This function uses platform-independent bitwise operations to construct a 128-bit key:
834    /// - `raw.swap_bytes() << 32` effectively clears the length bits and shifts the 12-byte inline data
835    ///   into the high 96 bits in Big-Endian order. This ensures the first byte of the string
836    ///   is the most significant byte of the resulting `u128`.
837    /// - `raw as u32` extracts the length as a numeric integer, which is then placed in the low 32 bits.
838    #[inline(always)]
839    pub fn inline_key_fast(raw: u128) -> u128 {
840        (raw.swap_bytes() << 32) | (raw as u32 as u128)
841    }
842}
843
844impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
845    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
846        write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
847        print_long_array(self, f, |array, index, f| {
848            std::fmt::Debug::fmt(&array.value(index), f)
849        })?;
850        write!(f, "]")
851    }
852}
853
854/// SAFETY: Correctly implements the contract of Arrow Arrays
855unsafe impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
856    fn as_any(&self) -> &dyn Any {
857        self
858    }
859
860    fn to_data(&self) -> ArrayData {
861        self.clone().into()
862    }
863
864    fn into_data(self) -> ArrayData {
865        self.into()
866    }
867
868    fn data_type(&self) -> &DataType {
869        &self.data_type
870    }
871
872    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
873        Arc::new(self.slice(offset, length))
874    }
875
876    fn len(&self) -> usize {
877        self.views.len()
878    }
879
880    fn is_empty(&self) -> bool {
881        self.views.is_empty()
882    }
883
884    fn shrink_to_fit(&mut self) {
885        self.views.shrink_to_fit();
886
887        // The goal of `shrink_to_fit` is to minimize the space used by any of
888        // its allocations. The use of `Arc::get_mut` over `Arc::make_mut` is
889        // because if the reference count is greater than 1, `Arc::make_mut`
890        // will first clone its contents. So, any large allocations will first
891        // be cloned before being shrunk, leaving the pre-cloned allocations
892        // intact, before adding the extra (used) space of the new clones.
893        if let Some(buffers) = Arc::get_mut(&mut self.buffers) {
894            buffers.iter_mut().for_each(|b| b.shrink_to_fit());
895        }
896
897        // With the assumption that this is a best-effort function, no attempt
898        // is made to shrink `self.buffers`, which it can't because it's type
899        // does not expose a `shrink_to_fit` method.
900
901        if let Some(nulls) = &mut self.nulls {
902            nulls.shrink_to_fit();
903        }
904    }
905
906    fn offset(&self) -> usize {
907        0
908    }
909
910    fn nulls(&self) -> Option<&NullBuffer> {
911        self.nulls.as_ref()
912    }
913
914    fn logical_null_count(&self) -> usize {
915        // More efficient that the default implementation
916        self.null_count()
917    }
918
919    fn get_buffer_memory_size(&self) -> usize {
920        let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
921        sum += self.views.inner().capacity();
922        if let Some(x) = &self.nulls {
923            sum += x.buffer().capacity()
924        }
925        sum
926    }
927
928    fn get_array_memory_size(&self) -> usize {
929        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
930    }
931
932    #[cfg(feature = "pool")]
933    fn claim(&self, pool: &dyn arrow_buffer::MemoryPool) {
934        self.views.claim(pool);
935        for buffer in self.buffers.iter() {
936            buffer.claim(pool);
937        }
938        if let Some(nulls) = &self.nulls {
939            nulls.claim(pool);
940        }
941    }
942}
943
944impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
945    type Item = &'a T::Native;
946
947    fn value(&self, index: usize) -> Self::Item {
948        GenericByteViewArray::value(self, index)
949    }
950
951    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
952        unsafe { GenericByteViewArray::value_unchecked(self, index) }
953    }
954}
955
956impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
957    type Item = Option<&'a T::Native>;
958    type IntoIter = ArrayIter<Self>;
959
960    fn into_iter(self) -> Self::IntoIter {
961        ArrayIter::new(self)
962    }
963}
964
965impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
966    fn from(data: ArrayData) -> Self {
967        let (data_type, len, nulls, offset, buffers, _child_data) = data.into_parts();
968        assert_eq!(
969            data_type,
970            T::DATA_TYPE,
971            "Mismatched data type, expected {}, got {data_type}",
972            T::DATA_TYPE
973        );
974        let mut buffers = buffers.into_iter();
975        // first buffer is views, remaining are data buffers
976        let views = ScalarBuffer::new(buffers.next().unwrap(), offset, len);
977        Self {
978            data_type,
979            views,
980            buffers: Arc::from_iter(buffers),
981            nulls,
982            phantom: Default::default(),
983        }
984    }
985}
986
987/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
988///
989/// For example this method can convert a [`StringArray`] to a
990/// [`StringViewArray`].
991///
992/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
993/// is built without copying the underlying string data (views are created
994/// directly into the existing buffer)
995///
996/// [`StringArray`]: crate::StringArray
997impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
998where
999    FROM: ByteArrayType,
1000    FROM::Offset: OffsetSizeTrait + ToPrimitive,
1001    V: ByteViewType<Native = FROM::Native>,
1002{
1003    fn from(byte_array: &GenericByteArray<FROM>) -> Self {
1004        let offsets = byte_array.offsets();
1005
1006        let can_reuse_buffer = match offsets.last() {
1007            Some(offset) => offset.as_usize() < u32::MAX as usize,
1008            None => true,
1009        };
1010
1011        if can_reuse_buffer {
1012            // build views directly pointing to the existing buffer
1013            let len = byte_array.len();
1014            let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
1015            let str_values_buf = byte_array.values().clone();
1016            let block = views_builder.append_block(str_values_buf);
1017            for (i, w) in offsets.windows(2).enumerate() {
1018                let offset = w[0].as_usize();
1019                let end = w[1].as_usize();
1020                let length = end - offset;
1021
1022                if byte_array.is_null(i) {
1023                    views_builder.append_null();
1024                } else {
1025                    // Safety: the input was a valid array so it valid UTF8 (if string). And
1026                    // all offsets were valid
1027                    unsafe {
1028                        views_builder.append_view_unchecked(block, offset as u32, length as u32)
1029                    }
1030                }
1031            }
1032            assert_eq!(views_builder.len(), len);
1033            views_builder.finish()
1034        } else {
1035            // Otherwise, create a new buffer for large strings
1036            // TODO: the original buffer could still be used
1037            // by making multiple slices of u32::MAX length
1038            GenericByteViewArray::<V>::from_iter(byte_array.iter())
1039        }
1040    }
1041}
1042
1043impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
1044    fn from(array: GenericByteViewArray<T>) -> Self {
1045        let len = array.len();
1046
1047        let mut buffers = array.buffers.to_vec();
1048        buffers.insert(0, array.views.into_inner());
1049
1050        let builder = ArrayDataBuilder::new(T::DATA_TYPE)
1051            .len(len)
1052            .buffers(buffers)
1053            .nulls(array.nulls);
1054
1055        unsafe { builder.build_unchecked() }
1056    }
1057}
1058
1059impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
1060where
1061    Ptr: AsRef<T::Native> + 'a,
1062    T: ByteViewType + ?Sized,
1063{
1064    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
1065        iter.into_iter()
1066            .map(|o| o.as_ref().map(|p| p.as_ref()))
1067            .collect()
1068    }
1069}
1070
1071impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
1072where
1073    Ptr: AsRef<T::Native>,
1074{
1075    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
1076        let iter = iter.into_iter();
1077        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
1078        builder.extend(iter);
1079        builder.finish()
1080    }
1081}
1082
1083/// A [`GenericByteViewArray`] of `[u8]`
1084///
1085/// See [`GenericByteViewArray`] for format and layout details.
1086///
1087/// # Example
1088/// ```
1089/// use arrow_array::BinaryViewArray;
1090/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
1091/// assert_eq!(array.value(0), b"hello");
1092/// assert_eq!(array.value(3), b"large payload over 12 bytes");
1093/// ```
1094pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
1095
1096impl BinaryViewArray {
1097    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1098    /// If items not utf8 data, validate will fail and error returned.
1099    pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
1100        StringViewType::validate(self.views(), self.data_buffers())?;
1101        unsafe { Ok(self.to_string_view_unchecked()) }
1102    }
1103
1104    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1105    /// # Safety
1106    /// Caller is responsible for ensuring that items in array are utf8 data.
1107    pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
1108        unsafe { StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1109    }
1110}
1111
1112impl From<Vec<&[u8]>> for BinaryViewArray {
1113    fn from(v: Vec<&[u8]>) -> Self {
1114        Self::from_iter_values(v)
1115    }
1116}
1117
1118impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
1119    fn from(v: Vec<Option<&[u8]>>) -> Self {
1120        v.into_iter().collect()
1121    }
1122}
1123
1124/// A [`GenericByteViewArray`] that stores utf8 data
1125///
1126/// See [`GenericByteViewArray`] for format and layout details.
1127///
1128/// # Example
1129/// ```
1130/// use arrow_array::StringViewArray;
1131/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
1132/// assert_eq!(array.value(0), "hello");
1133/// assert_eq!(array.value(3), "large payload over 12 bytes");
1134/// ```
1135pub type StringViewArray = GenericByteViewArray<StringViewType>;
1136
1137impl StringViewArray {
1138    /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
1139    pub fn to_binary_view(self) -> BinaryViewArray {
1140        unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1141    }
1142
1143    /// Returns true if all data within this array is ASCII
1144    pub fn is_ascii(&self) -> bool {
1145        // Alternative (but incorrect): directly check the underlying buffers
1146        // (1) Our string view might be sparse, i.e., a subset of the buffers,
1147        //      so even if the buffer is not ascii, we can still be ascii.
1148        // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
1149        // This means that this operation is quite expensive, shall we cache the result?
1150        //  i.e. track `is_ascii` in the builder.
1151        self.iter().all(|v| match v {
1152            Some(v) => v.is_ascii(),
1153            None => true,
1154        })
1155    }
1156}
1157
1158impl From<Vec<&str>> for StringViewArray {
1159    fn from(v: Vec<&str>) -> Self {
1160        Self::from_iter_values(v)
1161    }
1162}
1163
1164impl From<Vec<Option<&str>>> for StringViewArray {
1165    fn from(v: Vec<Option<&str>>) -> Self {
1166        v.into_iter().collect()
1167    }
1168}
1169
1170impl From<Vec<String>> for StringViewArray {
1171    fn from(v: Vec<String>) -> Self {
1172        Self::from_iter_values(v)
1173    }
1174}
1175
1176impl From<Vec<Option<String>>> for StringViewArray {
1177    fn from(v: Vec<Option<String>>) -> Self {
1178        v.into_iter().collect()
1179    }
1180}
1181
1182#[cfg(test)]
1183mod tests {
1184    use crate::builder::{BinaryViewBuilder, StringViewBuilder};
1185    use crate::types::BinaryViewType;
1186    use crate::{
1187        Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
1188    };
1189    use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
1190    use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
1191    use arrow_schema::DataType;
1192    use rand::prelude::StdRng;
1193    use rand::{Rng, SeedableRng};
1194    use std::str::from_utf8;
1195
1196    const BLOCK_SIZE: u32 = 8;
1197
1198    #[test]
1199    fn try_new_string() {
1200        let array = StringViewArray::from_iter_values(vec![
1201            "hello",
1202            "world",
1203            "lulu",
1204            "large payload over 12 bytes",
1205        ]);
1206        assert_eq!(array.value(0), "hello");
1207        assert_eq!(array.value(3), "large payload over 12 bytes");
1208    }
1209
1210    #[test]
1211    fn try_new_binary() {
1212        let array = BinaryViewArray::from_iter_values(vec![
1213            b"hello".as_slice(),
1214            b"world".as_slice(),
1215            b"lulu".as_slice(),
1216            b"large payload over 12 bytes".as_slice(),
1217        ]);
1218        assert_eq!(array.value(0), b"hello");
1219        assert_eq!(array.value(3), b"large payload over 12 bytes");
1220    }
1221
1222    #[test]
1223    fn try_new_empty_string() {
1224        // test empty array
1225        let array = {
1226            let mut builder = StringViewBuilder::new();
1227            builder.finish()
1228        };
1229        assert!(array.is_empty());
1230    }
1231
1232    #[test]
1233    fn try_new_empty_binary() {
1234        // test empty array
1235        let array = {
1236            let mut builder = BinaryViewBuilder::new();
1237            builder.finish()
1238        };
1239        assert!(array.is_empty());
1240    }
1241
1242    #[test]
1243    fn test_append_string() {
1244        // test builder append
1245        let array = {
1246            let mut builder = StringViewBuilder::new();
1247            builder.append_value("hello");
1248            builder.append_null();
1249            builder.append_option(Some("large payload over 12 bytes"));
1250            builder.finish()
1251        };
1252        assert_eq!(array.value(0), "hello");
1253        assert!(array.is_null(1));
1254        assert_eq!(array.value(2), "large payload over 12 bytes");
1255    }
1256
1257    #[test]
1258    fn test_append_binary() {
1259        // test builder append
1260        let array = {
1261            let mut builder = BinaryViewBuilder::new();
1262            builder.append_value(b"hello");
1263            builder.append_null();
1264            builder.append_option(Some(b"large payload over 12 bytes"));
1265            builder.finish()
1266        };
1267        assert_eq!(array.value(0), b"hello");
1268        assert!(array.is_null(1));
1269        assert_eq!(array.value(2), b"large payload over 12 bytes");
1270    }
1271
1272    #[test]
1273    fn test_in_progress_recreation() {
1274        let array = {
1275            // make a builder with small block size.
1276            let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
1277            builder.append_value("large payload over 12 bytes");
1278            builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
1279            builder.finish()
1280        };
1281        assert_eq!(array.value(0), "large payload over 12 bytes");
1282        assert_eq!(
1283            array.value(1),
1284            "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"
1285        );
1286        assert_eq!(2, array.buffers.len());
1287    }
1288
1289    #[test]
1290    #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
1291    fn new_with_invalid_view_data() {
1292        let v = "large payload over 12 bytes";
1293        let view = ByteView::new(13, &v.as_bytes()[0..4])
1294            .with_buffer_index(3)
1295            .with_offset(1);
1296        let views = ScalarBuffer::from(vec![view.into()]);
1297        let buffers = vec![Buffer::from_slice_ref(v)];
1298        StringViewArray::new(views, buffers, None);
1299    }
1300
1301    #[test]
1302    #[should_panic(
1303        expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
1304    )]
1305    fn new_with_invalid_utf8_data() {
1306        let v: Vec<u8> = vec![
1307            // invalid UTF8
1308            0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
1309            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1310        ];
1311        let view = ByteView::new(v.len() as u32, &v[0..4]);
1312        let views = ScalarBuffer::from(vec![view.into()]);
1313        let buffers = vec![Buffer::from_slice_ref(v)];
1314        StringViewArray::new(views, buffers, None);
1315    }
1316
1317    #[test]
1318    #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
1319    fn new_with_invalid_zero_padding() {
1320        let mut data = [0; 12];
1321        data[0] = b'H';
1322        data[11] = 1; // no zero padding
1323
1324        let mut view_buffer = [0; 16];
1325        view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
1326        view_buffer[4..].copy_from_slice(&data);
1327
1328        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1329        let views = ScalarBuffer::from(vec![view.into()]);
1330        let buffers = vec![];
1331        StringViewArray::new(views, buffers, None);
1332    }
1333
1334    #[test]
1335    #[should_panic(expected = "Mismatch between embedded prefix and data")]
1336    fn test_mismatch_between_embedded_prefix_and_data() {
1337        let input_str_1 = "Hello, Rustaceans!";
1338        let input_str_2 = "Hallo, Rustaceans!";
1339        let length = input_str_1.len() as u32;
1340        assert!(input_str_1.len() > 12);
1341
1342        let mut view_buffer = [0; 16];
1343        view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
1344        view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
1345        view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
1346        view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
1347        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1348        let views = ScalarBuffer::from(vec![view.into()]);
1349        let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
1350
1351        StringViewArray::new(views, buffers, None);
1352    }
1353
1354    #[test]
1355    fn test_gc() {
1356        let test_data = [
1357            Some("longer than 12 bytes"),
1358            Some("short"),
1359            Some("t"),
1360            Some("longer than 12 bytes"),
1361            None,
1362            Some("short"),
1363        ];
1364
1365        let array = {
1366            let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1367            test_data.into_iter().for_each(|v| builder.append_option(v));
1368            builder.finish()
1369        };
1370        assert!(array.buffers.len() > 1);
1371
1372        fn check_gc(to_test: &StringViewArray) {
1373            let gc = to_test.gc();
1374            assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1375
1376            to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1377                assert_eq!(a, b);
1378            });
1379            assert_eq!(to_test.len(), gc.len());
1380        }
1381
1382        check_gc(&array);
1383        check_gc(&array.slice(1, 3));
1384        check_gc(&array.slice(2, 1));
1385        check_gc(&array.slice(2, 2));
1386        check_gc(&array.slice(3, 1));
1387    }
1388
1389    /// 1) Empty array: no elements, expect gc to return empty with no data buffers
1390    #[test]
1391    fn test_gc_empty_array() {
1392        let array = StringViewBuilder::new()
1393            .with_fixed_block_size(BLOCK_SIZE)
1394            .finish();
1395        let gced = array.gc();
1396        // length and null count remain zero
1397        assert_eq!(gced.len(), 0);
1398        assert_eq!(gced.null_count(), 0);
1399        // no underlying data buffers should be allocated
1400        assert!(
1401            gced.data_buffers().is_empty(),
1402            "Expected no data buffers for empty array"
1403        );
1404    }
1405
1406    /// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values
1407    #[test]
1408    fn test_gc_all_inline() {
1409        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1410        // append many short strings, each exactly INLINE_LEN long
1411        for _ in 0..100 {
1412            let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize);
1413            builder.append_option(Some(&s));
1414        }
1415        let array = builder.finish();
1416        let gced = array.gc();
1417        // Since all views fit inline, data buffer is empty
1418        assert_eq!(
1419            gced.data_buffers().len(),
1420            0,
1421            "Should have no data buffers for inline values"
1422        );
1423        assert_eq!(gced.len(), 100);
1424        // verify element-wise equality
1425        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1426            assert_eq!(orig, got, "Inline value mismatch after gc");
1427        });
1428    }
1429
1430    /// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer
1431    #[test]
1432    fn test_gc_all_large() {
1433        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1434        let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5);
1435        // append multiple large strings
1436        for _ in 0..50 {
1437            builder.append_option(Some(&large_str));
1438        }
1439        let array = builder.finish();
1440        let gced = array.gc();
1441        // New data buffers should be populated (one or more blocks)
1442        assert!(
1443            !gced.data_buffers().is_empty(),
1444            "Expected data buffers for large values"
1445        );
1446        assert_eq!(gced.len(), 50);
1447        // verify that every large string emerges unchanged
1448        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1449            assert_eq!(orig, got, "Large view mismatch after gc");
1450        });
1451    }
1452
1453    /// 4) All null elements: ensure null bitmap handling path is correct
1454    #[test]
1455    fn test_gc_all_nulls() {
1456        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1457        for _ in 0..20 {
1458            builder.append_null();
1459        }
1460        let array = builder.finish();
1461        let gced = array.gc();
1462        // length and null count match
1463        assert_eq!(gced.len(), 20);
1464        assert_eq!(gced.null_count(), 20);
1465        // data buffers remain empty for null-only array
1466        assert!(
1467            gced.data_buffers().is_empty(),
1468            "No data should be stored for nulls"
1469        );
1470    }
1471
1472    /// 5) Random mix of inline, large, and null values with slicing tests
1473    #[test]
1474    fn test_gc_random_mixed_and_slices() {
1475        let mut rng = StdRng::seed_from_u64(42);
1476        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1477        // Keep a Vec of original Option<String> for later comparison
1478        let mut original: Vec<Option<String>> = Vec::new();
1479
1480        for _ in 0..200 {
1481            if rng.random_bool(0.1) {
1482                // 10% nulls
1483                builder.append_null();
1484                original.push(None);
1485            } else {
1486                // random length between 0 and twice the inline limit
1487                let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2));
1488                let s: String = "A".repeat(len as usize);
1489                builder.append_option(Some(&s));
1490                original.push(Some(s));
1491            }
1492        }
1493
1494        let array = builder.finish();
1495        // Test multiple slice ranges to ensure offset logic is correct
1496        for (offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] {
1497            let sliced = array.slice(*offset, *slice_len);
1498            let gced = sliced.gc();
1499            // Build expected slice of Option<&str>
1500            let expected: Vec<Option<&str>> = original[*offset..(*offset + *slice_len)]
1501                .iter()
1502                .map(|opt| opt.as_deref())
1503                .collect();
1504
1505            assert_eq!(gced.len(), *slice_len, "Slice length mismatch");
1506            // Compare element-wise
1507            gced.iter().zip(expected.iter()).for_each(|(got, expect)| {
1508                assert_eq!(got, *expect, "Value mismatch in mixed slice after gc");
1509            });
1510        }
1511    }
1512
1513    #[test]
1514    #[cfg_attr(miri, ignore)] // Takes too long
1515    fn test_gc_huge_array() {
1516        // Construct multiple 128 MiB BinaryView entries so total > 4 GiB
1517        let block_len: usize = 128 * 1024 * 1024; // 128 MiB per view
1518        let num_views: usize = 36;
1519
1520        // Create a single 128 MiB data block with a simple byte pattern
1521        let buffer = Buffer::from_vec(vec![0xAB; block_len]);
1522        let buffer2 = Buffer::from_vec(vec![0xFF; block_len]);
1523
1524        // Append this block and then add many views pointing to it
1525        let mut builder = BinaryViewBuilder::new();
1526        let block_id = builder.append_block(buffer);
1527        for _ in 0..num_views / 2 {
1528            builder
1529                .try_append_view(block_id, 0, block_len as u32)
1530                .expect("append view into 128MiB block");
1531        }
1532        let block_id2 = builder.append_block(buffer2);
1533        for _ in 0..num_views / 2 {
1534            builder
1535                .try_append_view(block_id2, 0, block_len as u32)
1536                .expect("append view into 128MiB block");
1537        }
1538
1539        let array = builder.finish();
1540        let total = array.total_buffer_bytes_used();
1541        assert!(
1542            total > u32::MAX as usize,
1543            "Expected total non-inline bytes to exceed 4 GiB, got {}",
1544            total
1545        );
1546
1547        // Run gc and verify correctness
1548        let gced = array.gc();
1549        assert_eq!(gced.len(), num_views, "Length mismatch after gc");
1550        assert_eq!(gced.null_count(), 0, "Null count mismatch after gc");
1551        assert_ne!(
1552            gced.data_buffers().len(),
1553            1,
1554            "gc with huge buffer should not consolidate data into a single buffer"
1555        );
1556
1557        // Element-wise equality check across the entire array
1558        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1559            assert_eq!(orig, got, "Value mismatch after gc on huge array");
1560        });
1561    }
1562
1563    #[test]
1564    fn test_eq() {
1565        let test_data = [
1566            Some("longer than 12 bytes"),
1567            None,
1568            Some("short"),
1569            Some("again, this is longer than 12 bytes"),
1570        ];
1571
1572        let array1 = {
1573            let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
1574            test_data.into_iter().for_each(|v| builder.append_option(v));
1575            builder.finish()
1576        };
1577        let array2 = {
1578            // create a new array with the same data but different layout
1579            let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
1580            test_data.into_iter().for_each(|v| builder.append_option(v));
1581            builder.finish()
1582        };
1583        assert_eq!(array1, array1.clone());
1584        assert_eq!(array2, array2.clone());
1585        assert_eq!(array1, array2);
1586    }
1587
1588    /// Integration tests for `inline_key_fast` covering:
1589    ///
1590    /// 1. Monotonic ordering across increasing lengths and lexical variations.
1591    /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence.
1592    ///
1593    /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why
1594    /// the length field is required even when all inline bytes fit in 12 bytes.
1595    ///
1596    /// The test includes strings that verify correct byte order (prevent reversal bugs),
1597    /// and length-based tie-breaking in the composite key.
1598    ///
1599    /// The test confirms that `inline_key_fast` produces keys which sort consistently
1600    /// with the expected lexicographical order of the raw byte arrays.
1601    #[test]
1602    fn test_inline_key_fast_various_lengths_and_lexical() {
1603        /// Helper to create a raw u128 value representing an inline ByteView:
1604        /// - `length`: number of meaningful bytes (must be ≤ 12)
1605        /// - `data`: the actual inline data bytes
1606        ///
1607        /// The first 4 bytes encode length in little-endian,
1608        /// the following 12 bytes contain the inline string data (unpadded).
1609        fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
1610            assert!(length as usize <= 12, "Inline length must be ≤ 12");
1611            assert!(
1612                data.len() == length as usize,
1613                "Data length must match `length`"
1614            );
1615
1616            let mut raw_bytes = [0u8; 16];
1617            raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian
1618            raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data
1619            u128::from_le_bytes(raw_bytes)
1620        }
1621
1622        // Test inputs: various lengths and lexical orders,
1623        // plus special cases for byte order and length tie-breaking
1624        let test_inputs: Vec<&[u8]> = vec![
1625            b"a",
1626            b"aa",
1627            b"aaa",
1628            b"aab",
1629            b"abcd",
1630            b"abcde",
1631            b"abcdef",
1632            b"abcdefg",
1633            b"abcdefgh",
1634            b"abcdefghi",
1635            b"abcdefghij",
1636            b"abcdefghijk",
1637            b"abcdefghijkl",
1638            // Tests for byte-order reversal bug:
1639            // Without the fix, "backend one" would compare as "eno dnekcab",
1640            // causing incorrect sort order relative to "backend two".
1641            b"backend one",
1642            b"backend two",
1643            // Tests length-tiebreaker logic:
1644            // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data,
1645            // so only the length differentiates their ordering.
1646            b"bar",
1647            b"bar\0",
1648            // Additional lexical and length tie-breaking cases with same prefix, in correct lex order:
1649            b"than12Byt",
1650            b"than12Bytes",
1651            b"than12Bytes\0",
1652            b"than12Bytesx",
1653            b"than12Bytex",
1654            b"than12Bytez",
1655            // Additional lexical tests
1656            b"xyy",
1657            b"xyz",
1658            b"xza",
1659        ];
1660
1661        // Create a GenericBinaryArray for cross-comparison of lex order
1662        let array: GenericBinaryArray<i32> =
1663            GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>());
1664
1665        for i in 0..array.len() - 1 {
1666            let v1 = array.value(i);
1667            let v2 = array.value(i + 1);
1668
1669            // Assert the array's natural lexical ordering is correct
1670            assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");
1671
1672            // Assert the keys produced by inline_key_fast reflect the same ordering
1673            let key1 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1674                v1.len() as u32,
1675                v1,
1676            ));
1677            let key2 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1678                v2.len() as u32,
1679                v2,
1680            ));
1681
1682            assert!(
1683                key1 < key2,
1684                "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
1685            );
1686        }
1687    }
1688
1689    #[test]
1690    fn empty_array_should_return_empty_lengths_iterator() {
1691        let empty = GenericByteViewArray::<BinaryViewType>::from(Vec::<&[u8]>::new());
1692
1693        let mut lengths_iter = empty.lengths();
1694        assert_eq!(lengths_iter.len(), 0);
1695        assert_eq!(lengths_iter.next(), None);
1696    }
1697
1698    #[test]
1699    fn array_lengths_should_return_correct_length_for_both_inlined_and_non_inlined() {
1700        let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1701            // Not inlined as longer than 12 bytes
1702            b"Supercalifragilisticexpialidocious" as &[u8],
1703            // Inlined as shorter than 12 bytes
1704            b"Hello",
1705            // Empty value
1706            b"",
1707            // Exactly 12 bytes
1708            b"abcdefghijkl",
1709        ]);
1710
1711        let mut lengths_iter = cases.lengths();
1712
1713        assert_eq!(lengths_iter.len(), cases.len());
1714
1715        let cases_iter = cases.iter();
1716
1717        for case in cases_iter {
1718            let case_value = case.unwrap();
1719            let length = lengths_iter.next().expect("Should have a length");
1720
1721            assert_eq!(case_value.len(), length as usize);
1722        }
1723
1724        assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1725    }
1726
1727    #[test]
1728    fn array_lengths_should_return_the_underlying_length_for_null_values() {
1729        let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1730            // Not inlined as longer than 12 bytes
1731            b"Supercalifragilisticexpialidocious" as &[u8],
1732            // Inlined as shorter than 12 bytes
1733            b"Hello",
1734            // Empty value
1735            b"",
1736            // Exactly 12 bytes
1737            b"abcdefghijkl",
1738        ]);
1739
1740        let (views, buffer, _) = cases.clone().into_parts();
1741
1742        // Keeping the values but just adding nulls on top
1743        let cases_with_all_nulls = GenericByteViewArray::<BinaryViewType>::new(
1744            views,
1745            buffer,
1746            Some(NullBuffer::new_null(cases.len())),
1747        );
1748
1749        let lengths_iter = cases.lengths();
1750        let mut all_nulls_lengths_iter = cases_with_all_nulls.lengths();
1751
1752        assert_eq!(lengths_iter.len(), all_nulls_lengths_iter.len());
1753
1754        for expected_length in lengths_iter {
1755            let actual_length = all_nulls_lengths_iter.next().expect("Should have a length");
1756
1757            assert_eq!(expected_length, actual_length);
1758        }
1759
1760        assert_eq!(
1761            all_nulls_lengths_iter.next(),
1762            None,
1763            "Should not have more lengths"
1764        );
1765    }
1766
1767    #[test]
1768    fn array_lengths_on_sliced_should_only_return_lengths_for_sliced_data() {
1769        let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1770            b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1771            b"Hello",
1772            b"something great",
1773            b"is",
1774            b"coming soon!",
1775            b"when you find what it is",
1776            b"let me know",
1777            b"cause",
1778            b"I",
1779            b"have no idea",
1780            b"what it",
1781            b"is",
1782        ]);
1783
1784        let sliced_array = array.slice(2, array.len() - 3);
1785
1786        let mut lengths_iter = sliced_array.lengths();
1787
1788        assert_eq!(lengths_iter.len(), sliced_array.len());
1789
1790        let values_iter = sliced_array.iter();
1791
1792        for value in values_iter {
1793            let value = value.unwrap();
1794            let length = lengths_iter.next().expect("Should have a length");
1795
1796            assert_eq!(value.len(), length as usize);
1797        }
1798
1799        assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1800    }
1801
1802    #[should_panic(expected = "Mismatched data type, expected Utf8View, got BinaryView")]
1803    #[test]
1804    fn invalid_casting_from_array_data() {
1805        // Should not be able to cast to StringViewArray due to invalid UTF-8
1806        let array_data = binary_view_array_with_invalid_utf8_data().into_data();
1807        let _ = StringViewArray::from(array_data);
1808    }
1809
1810    #[should_panic(expected = "invalid utf-8 sequence")]
1811    #[test]
1812    fn invalid_array_data() {
1813        let (views, buffers, nulls) = binary_view_array_with_invalid_utf8_data().into_parts();
1814
1815        // manually try and add invalid array data with Utf8View data type
1816        let mut builder = ArrayDataBuilder::new(DataType::Utf8View)
1817            .add_buffer(views.into_inner())
1818            .len(3);
1819        for buffer in buffers.iter() {
1820            builder = builder.add_buffer(buffer.clone())
1821        }
1822        builder = builder.nulls(nulls);
1823
1824        let data = builder.build().unwrap(); // should fail validation
1825        let _arr = StringViewArray::from(data);
1826    }
1827
1828    /// Returns a BinaryViewArray with one invalid UTF-8 value
1829    fn binary_view_array_with_invalid_utf8_data() -> BinaryViewArray {
1830        let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1831            b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1832            &[
1833                0xf0, 0x80, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1834                0x00, 0x00,
1835            ],
1836            b"good",
1837        ]);
1838        assert!(from_utf8(array.value(0)).is_ok());
1839        assert!(from_utf8(array.value(1)).is_err()); // value 1 is invalid utf8
1840        assert!(from_utf8(array.value(2)).is_ok());
1841        array
1842    }
1843
1844    #[test]
1845    fn test_total_bytes_len() {
1846        // inlined: "hello"=5, "world"=5, "lulu"=4 → 14
1847        // non-inlined: "large payload over 12 bytes"=27
1848        // null: should not count
1849        let mut builder = StringViewBuilder::new();
1850        builder.append_value("hello");
1851        builder.append_value("world");
1852        builder.append_value("lulu");
1853        builder.append_null();
1854        builder.append_value("large payload over 12 bytes");
1855        let array = builder.finish();
1856        assert_eq!(array.total_bytes_len(), 5 + 5 + 4 + 27);
1857    }
1858
1859    #[test]
1860    fn test_total_bytes_len_empty() {
1861        let array = StringViewArray::from_iter::<Vec<Option<&str>>>(vec![]);
1862        assert_eq!(array.total_bytes_len(), 0);
1863    }
1864
1865    #[test]
1866    fn test_total_bytes_len_all_nulls() {
1867        let array = StringViewArray::new_null(5);
1868        assert_eq!(array.total_bytes_len(), 0);
1869    }
1870
1871    #[test]
1872    fn test_total_bytes_len_binary_view() {
1873        let array = BinaryViewArray::from_iter(vec![
1874            Some(b"hi".as_ref()),
1875            None,
1876            Some(b"large payload over 12 bytes".as_ref()),
1877        ]);
1878        assert_eq!(array.total_bytes_len(), 2 + 27);
1879    }
1880}