arrow_array/array/
byte_view_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
26use arrow_schema::{ArrowError, DataType};
27use core::str;
28use num::ToPrimitive;
29use std::any::Any;
30use std::fmt::Debug;
31use std::marker::PhantomData;
32use std::sync::Arc;
33
34use super::ByteArrayType;
35
36/// [Variable-size Binary View Layout]: An array of variable length bytes views.
37///
38/// This array type is used to store variable length byte data (e.g. Strings, Binary)
39/// and has efficient operations such as `take`, `filter`, and comparison.
40///
41/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
42///
43/// This is different from [`GenericByteArray`], which also stores variable
44/// length byte data, as it represents strings with an offset and length. `take`
45/// and `filter` like operations are implemented by manipulating the "views"
46/// (`u128`) without modifying the bytes. Each view also stores an inlined
47/// prefix which speed up comparisons.
48///
49/// # See Also
50///
51/// * [`StringViewArray`] for storing utf8 encoded string data
52/// * [`BinaryViewArray`] for storing bytes
53/// * [`ByteView`] to interpret `u128`s layout of the views.
54///
55/// [`ByteView`]: arrow_data::ByteView
56///
57/// # Layout: "views" and buffers
58///
59/// A `GenericByteViewArray` stores variable length byte strings. An array of
60/// `N` elements is stored as `N` fixed length "views" and a variable number
61/// of variable length "buffers".
62///
63/// Each view is a `u128` value whose layout is different depending on the
64/// length of the string stored at that location:
65///
66/// ```text
67///                         ┌──────┬────────────────────────┐
68///                         │length│      string value      │
69///    Strings (len <= 12)  │      │    (padded with 0)     │
70///                         └──────┴────────────────────────┘
71///                          0    31                      127
72///
73///                         ┌───────┬───────┬───────┬───────┐
74///                         │length │prefix │  buf  │offset │
75///    Strings (len > 12)   │       │       │ index │       │
76///                         └───────┴───────┴───────┴───────┘
77///                          0    31       63      95    127
78/// ```
79///
80/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
81///   the view. See [`Self::inline_value`] to access the inlined prefix from a
82///   short view.
83///
84/// * Strings with length > 12: The first four bytes are stored inline in the
85///   view and the entire string is stored in one of the buffers. See [`ByteView`]
86///   to access the fields of the these views.
87///
88/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
89/// the easiest and fastest way to work with this data. However, it is possible
90/// to access the views and buffers directly for more control.
91///
92/// For example
93///
94/// ```rust
95/// # use arrow_array::StringViewArray;
96/// # use arrow_array::Array;
97/// use arrow_data::ByteView;
98/// let array = StringViewArray::from(vec![
99///   "hello",
100///   "this string is longer than 12 bytes",
101///   "this string is also longer than 12 bytes"
102/// ]);
103///
104/// // ** Examine the first view (short string) **
105/// assert!(array.is_valid(0)); // Check for nulls
106/// let short_view: u128 = array.views()[0]; // "hello"
107/// // get length of the string
108/// let len = short_view as u32;
109/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view
110/// // SAFETY: `view` is a valid view
111/// let value = unsafe {
112///   StringViewArray::inline_value(&short_view, len as usize)
113/// };
114/// assert_eq!(value, b"hello");
115///
116/// // ** Examine the third view (long string) **
117/// assert!(array.is_valid(12)); // Check for nulls
118/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
119/// let len = long_view as u32;
120/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
121/// let view = ByteView::from(long_view); // use ByteView to access the fields
122/// assert_eq!(view.length, 40);
123/// assert_eq!(view.buffer_index, 0);
124/// assert_eq!(view.offset, 35); // data starts after the first long string
125/// // Views for long strings store a 4 byte prefix
126/// let prefix = view.prefix.to_le_bytes();
127/// assert_eq!(&prefix, b"this");
128/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
129/// assert_eq!(value, "this string is also longer than 12 bytes");
130/// ```
131///
132/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
133/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
134///
135/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
136/// than they must point into a valid buffer. However, they can be out of order,
137/// non continuous and overlapping.
138///
139/// For example, in the following diagram, the strings "FishWasInTownToday" and
140/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
141/// separate buffer while the string "LavaMonster" is stored inlined in the
142/// view. In this case, the same bytes for "Fish" are used to store both strings.
143///
144/// [`ByteView`]: arrow_data::ByteView
145///
146/// ```text
147///                                                                            ┌───┐
148///                         ┌──────┬──────┬──────┬──────┐               offset │...│
149/// "FishWasInTownTodayYay" │  21  │ Fish │  0   │ 115  │─ ─              103  │Mr.│
150///                         └──────┴──────┴──────┴──────┘   │      ┌ ─ ─ ─ ─ ▶ │Cru│
151///                         ┌──────┬──────┬──────┬──────┐                      │mpl│
152/// "CrumpleFacedFish"      │  16  │ Crum │  0   │ 103  │─ ─│─ ─ ─ ┘           │eFa│
153///                         └──────┴──────┴──────┴──────┘                      │ced│
154///                         ┌──────┬────────────────────┐   └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
155/// "LavaMonster"           │  11  │   LavaMonster      │                      │hWa│
156///                         └──────┴────────────────────┘               offset │sIn│
157///                                                                       115  │Tow│
158///                                                                            │nTo│
159///                                                                            │day│
160///                                  u128 "views"                              │Yay│
161///                                                                   buffer 0 │...│
162///                                                                            └───┘
163/// ```
164pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
165    data_type: DataType,
166    views: ScalarBuffer<u128>,
167    buffers: Vec<Buffer>,
168    phantom: PhantomData<T>,
169    nulls: Option<NullBuffer>,
170}
171
172impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
173    fn clone(&self) -> Self {
174        Self {
175            data_type: T::DATA_TYPE,
176            views: self.views.clone(),
177            buffers: self.buffers.clone(),
178            nulls: self.nulls.clone(),
179            phantom: Default::default(),
180        }
181    }
182}
183
184impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
185    /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
186    ///
187    /// # Panics
188    ///
189    /// Panics if [`GenericByteViewArray::try_new`] returns an error
190    pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
191        Self::try_new(views, buffers, nulls).unwrap()
192    }
193
194    /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
195    ///
196    /// # Errors
197    ///
198    /// * `views.len() != nulls.len()`
199    /// * [ByteViewType::validate] fails
200    pub fn try_new(
201        views: ScalarBuffer<u128>,
202        buffers: Vec<Buffer>,
203        nulls: Option<NullBuffer>,
204    ) -> Result<Self, ArrowError> {
205        T::validate(&views, &buffers)?;
206
207        if let Some(n) = nulls.as_ref() {
208            if n.len() != views.len() {
209                return Err(ArrowError::InvalidArgumentError(format!(
210                    "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
211                    T::PREFIX,
212                    views.len(),
213                    n.len(),
214                )));
215            }
216        }
217
218        Ok(Self {
219            data_type: T::DATA_TYPE,
220            views,
221            buffers,
222            nulls,
223            phantom: Default::default(),
224        })
225    }
226
227    /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
228    ///
229    /// # Safety
230    ///
231    /// Safe if [`Self::try_new`] would not error
232    pub unsafe fn new_unchecked(
233        views: ScalarBuffer<u128>,
234        buffers: Vec<Buffer>,
235        nulls: Option<NullBuffer>,
236    ) -> Self {
237        if cfg!(feature = "force_validate") {
238            return Self::new(views, buffers, nulls);
239        }
240
241        Self {
242            data_type: T::DATA_TYPE,
243            phantom: Default::default(),
244            views,
245            buffers,
246            nulls,
247        }
248    }
249
250    /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
251    pub fn new_null(len: usize) -> Self {
252        Self {
253            data_type: T::DATA_TYPE,
254            views: vec![0; len].into(),
255            buffers: vec![],
256            nulls: Some(NullBuffer::new_null(len)),
257            phantom: Default::default(),
258        }
259    }
260
261    /// Create a new [`Scalar`] from `value`
262    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
263        Scalar::new(Self::from_iter_values(std::iter::once(value)))
264    }
265
266    /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
267    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
268    where
269        Ptr: AsRef<T::Native>,
270        I: IntoIterator<Item = Ptr>,
271    {
272        let iter = iter.into_iter();
273        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
274        for v in iter {
275            builder.append_value(v);
276        }
277        builder.finish()
278    }
279
280    /// Deconstruct this array into its constituent parts
281    pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
282        (self.views, self.buffers, self.nulls)
283    }
284
285    /// Returns the views buffer
286    #[inline]
287    pub fn views(&self) -> &ScalarBuffer<u128> {
288        &self.views
289    }
290
291    /// Returns the buffers storing string data
292    #[inline]
293    pub fn data_buffers(&self) -> &[Buffer] {
294        &self.buffers
295    }
296
297    /// Returns the element at index `i`
298    /// # Panics
299    /// Panics if index `i` is out of bounds.
300    pub fn value(&self, i: usize) -> &T::Native {
301        assert!(
302            i < self.len(),
303            "Trying to access an element at index {} from a {}ViewArray of length {}",
304            i,
305            T::PREFIX,
306            self.len()
307        );
308
309        unsafe { self.value_unchecked(i) }
310    }
311
312    /// Returns the element at index `i` without bounds checking
313    ///
314    /// # Safety
315    ///
316    /// Caller is responsible for ensuring that the index is within the bounds
317    /// of the array
318    pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
319        let v = self.views.get_unchecked(idx);
320        let len = *v as u32;
321        let b = if len <= MAX_INLINE_VIEW_LEN {
322            Self::inline_value(v, len as usize)
323        } else {
324            let view = ByteView::from(*v);
325            let data = self.buffers.get_unchecked(view.buffer_index as usize);
326            let offset = view.offset as usize;
327            data.get_unchecked(offset..offset + len as usize)
328        };
329        T::Native::from_bytes_unchecked(b)
330    }
331
332    /// Returns the first `len` bytes the inline value of the view.
333    ///
334    /// # Safety
335    /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
336    /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
337    #[inline(always)]
338    pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
339        debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
340        std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
341    }
342
343    /// Constructs a new iterator for iterating over the values of this array
344    pub fn iter(&self) -> ArrayIter<&Self> {
345        ArrayIter::new(self)
346    }
347
348    /// Returns an iterator over the bytes of this array, including null values
349    pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
350        self.views.iter().map(move |v| {
351            let len = *v as u32;
352            if len <= MAX_INLINE_VIEW_LEN {
353                unsafe { Self::inline_value(v, len as usize) }
354            } else {
355                let view = ByteView::from(*v);
356                let data = &self.buffers[view.buffer_index as usize];
357                let offset = view.offset as usize;
358                unsafe { data.get_unchecked(offset..offset + len as usize) }
359            }
360        })
361    }
362
363    /// Returns an iterator over the first `prefix_len` bytes of each array
364    /// element, including null values.
365    ///
366    /// If `prefix_len` is larger than the element's length, the iterator will
367    /// return an empty slice (`&[]`).
368    pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
369        self.views().into_iter().map(move |v| {
370            let len = (*v as u32) as usize;
371
372            if len < prefix_len {
373                return &[] as &[u8];
374            }
375
376            if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
377                unsafe { StringViewArray::inline_value(v, prefix_len) }
378            } else {
379                let view = ByteView::from(*v);
380                let data = unsafe {
381                    self.data_buffers()
382                        .get_unchecked(view.buffer_index as usize)
383                };
384                let offset = view.offset as usize;
385                unsafe { data.get_unchecked(offset..offset + prefix_len) }
386            }
387        })
388    }
389
390    /// Returns an iterator over the last `suffix_len` bytes of each array
391    /// element, including null values.
392    ///
393    /// Note that for [`StringViewArray`] the last bytes may start in the middle
394    /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
395    ///
396    /// If `suffix_len` is larger than the element's length, the iterator will
397    /// return an empty slice (`&[]`).
398    pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
399        self.views().into_iter().map(move |v| {
400            let len = (*v as u32) as usize;
401
402            if len < suffix_len {
403                return &[] as &[u8];
404            }
405
406            if len as u32 <= MAX_INLINE_VIEW_LEN {
407                unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
408            } else {
409                let view = ByteView::from(*v);
410                let data = unsafe {
411                    self.data_buffers()
412                        .get_unchecked(view.buffer_index as usize)
413                };
414                let offset = view.offset as usize;
415                unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
416            }
417        })
418    }
419
420    /// Returns a zero-copy slice of this array with the indicated offset and length.
421    pub fn slice(&self, offset: usize, length: usize) -> Self {
422        Self {
423            data_type: T::DATA_TYPE,
424            views: self.views.slice(offset, length),
425            buffers: self.buffers.clone(),
426            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
427            phantom: Default::default(),
428        }
429    }
430
431    /// Returns a "compacted" version of this array
432    ///
433    /// The original array will *not* be modified
434    ///
435    /// # Garbage Collection
436    ///
437    /// Before GC:
438    /// ```text
439    ///                                        ┌──────┐
440    ///                                        │......│
441    ///                                        │......│
442    /// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer
443    /// │       View 1       │─ ─ ─ ─          │......│  with data that
444    /// ├────────────────────┤                 │......│ is not referred
445    /// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
446    /// └────────────────────┘                 │......│      View 2
447    ///                                        │......│
448    ///    2 views, refer to                   │......│
449    ///   small portions of a                  └──────┘
450    ///      large buffer
451    /// ```
452    ///
453    /// After GC:
454    ///
455    /// ```text
456    /// ┌────────────────────┐                 ┌─────┐    After gc, only
457    /// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is
458    /// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by
459    /// │       View 2       │─ ─ ─ ─          └─────┘     the views is
460    /// └────────────────────┘                                 left
461    ///
462    ///
463    ///         2 views
464    /// ```
465    /// This method will compact the data buffers by recreating the view array and only include the data
466    /// that is pointed to by the views.
467    ///
468    /// Note that it will copy the array regardless of whether the original array is compact.
469    /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
470    /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
471    ///
472    /// Note: this function does not attempt to canonicalize / deduplicate values. For this
473    /// feature see  [`GenericByteViewBuilder::with_deduplicate_strings`].
474    pub fn gc(&self) -> Self {
475        let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());
476
477        for v in self.iter() {
478            builder.append_option(v);
479        }
480
481        builder.finish()
482    }
483
484    /// Returns the total number of bytes used by all non inlined views in all
485    /// buffers.
486    ///
487    /// Note this does not account for views that point at the same underlying
488    /// data in buffers
489    ///
490    /// For example, if the array has three strings views:
491    /// * View with length = 9 (inlined)
492    /// * View with length = 32 (non inlined)
493    /// * View with length = 16 (non inlined)
494    ///
495    /// Then this method would report 48
496    pub fn total_buffer_bytes_used(&self) -> usize {
497        self.views()
498            .iter()
499            .map(|v| {
500                let len = *v as u32;
501                if len > MAX_INLINE_VIEW_LEN {
502                    len as usize
503                } else {
504                    0
505                }
506            })
507            .sum()
508    }
509
510    /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
511    ///
512    /// Comparing two ByteView types are non-trivial.
513    /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
514    ///
515    /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
516    /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
517    ///     Meaning that reading one array element requires only one memory access
518    ///     (two memory access required for StringArray, one for offset buffer, the other for value buffer).
519    ///
520    /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
521    ///     thanks to the inlined 4 bytes.
522    ///     Consider equality check:
523    ///     If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
524    ///
525    /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
526    /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
527    ///   e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
528    ///
529    /// # Order check flow
530    /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
531    /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
532    ///     (2.1) if the inlined 4 bytes are different, we can return the result immediately.
533    ///     (2.2) o.w., we need to compare the full string.
534    ///
535    /// # Safety
536    /// The left/right_idx must within range of each array
537    pub unsafe fn compare_unchecked(
538        left: &GenericByteViewArray<T>,
539        left_idx: usize,
540        right: &GenericByteViewArray<T>,
541        right_idx: usize,
542    ) -> std::cmp::Ordering {
543        let l_view = left.views().get_unchecked(left_idx);
544        let l_len = *l_view as u32;
545
546        let r_view = right.views().get_unchecked(right_idx);
547        let r_len = *r_view as u32;
548
549        if l_len <= MAX_INLINE_VIEW_LEN && r_len <= MAX_INLINE_VIEW_LEN {
550            let l_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
551            let r_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
552            return l_data.cmp(r_data);
553        }
554
555        // one of the string is larger than 12 bytes,
556        // we then try to compare the inlined data first
557        let l_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, 4) };
558        let r_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, 4) };
559        if r_inlined_data != l_inlined_data {
560            return l_inlined_data.cmp(r_inlined_data);
561        }
562
563        // unfortunately, we need to compare the full data
564        let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
565        let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
566
567        l_full_data.cmp(r_full_data)
568    }
569}
570
571impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
572    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
573        write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
574        print_long_array(self, f, |array, index, f| {
575            std::fmt::Debug::fmt(&array.value(index), f)
576        })?;
577        write!(f, "]")
578    }
579}
580
581impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
582    fn as_any(&self) -> &dyn Any {
583        self
584    }
585
586    fn to_data(&self) -> ArrayData {
587        self.clone().into()
588    }
589
590    fn into_data(self) -> ArrayData {
591        self.into()
592    }
593
594    fn data_type(&self) -> &DataType {
595        &self.data_type
596    }
597
598    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
599        Arc::new(self.slice(offset, length))
600    }
601
602    fn len(&self) -> usize {
603        self.views.len()
604    }
605
606    fn is_empty(&self) -> bool {
607        self.views.is_empty()
608    }
609
610    fn shrink_to_fit(&mut self) {
611        self.views.shrink_to_fit();
612        self.buffers.iter_mut().for_each(|b| b.shrink_to_fit());
613        self.buffers.shrink_to_fit();
614        if let Some(nulls) = &mut self.nulls {
615            nulls.shrink_to_fit();
616        }
617    }
618
619    fn offset(&self) -> usize {
620        0
621    }
622
623    fn nulls(&self) -> Option<&NullBuffer> {
624        self.nulls.as_ref()
625    }
626
627    fn logical_null_count(&self) -> usize {
628        // More efficient that the default implementation
629        self.null_count()
630    }
631
632    fn get_buffer_memory_size(&self) -> usize {
633        let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
634        sum += self.views.inner().capacity();
635        if let Some(x) = &self.nulls {
636            sum += x.buffer().capacity()
637        }
638        sum
639    }
640
641    fn get_array_memory_size(&self) -> usize {
642        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
643    }
644}
645
646impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
647    type Item = &'a T::Native;
648
649    fn value(&self, index: usize) -> Self::Item {
650        GenericByteViewArray::value(self, index)
651    }
652
653    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
654        GenericByteViewArray::value_unchecked(self, index)
655    }
656}
657
658impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
659    type Item = Option<&'a T::Native>;
660    type IntoIter = ArrayIter<Self>;
661
662    fn into_iter(self) -> Self::IntoIter {
663        ArrayIter::new(self)
664    }
665}
666
667impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
668    fn from(value: ArrayData) -> Self {
669        let views = value.buffers()[0].clone();
670        let views = ScalarBuffer::new(views, value.offset(), value.len());
671        let buffers = value.buffers()[1..].to_vec();
672        Self {
673            data_type: T::DATA_TYPE,
674            views,
675            buffers,
676            nulls: value.nulls().cloned(),
677            phantom: Default::default(),
678        }
679    }
680}
681
682/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
683///
684/// For example this method can convert a [`StringArray`] to a
685/// [`StringViewArray`].
686///
687/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
688/// is built without copying the underlying string data (views are created
689/// directly into the existing buffer)
690///
691/// [`StringArray`]: crate::StringArray
692impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
693where
694    FROM: ByteArrayType,
695    FROM::Offset: OffsetSizeTrait + ToPrimitive,
696    V: ByteViewType<Native = FROM::Native>,
697{
698    fn from(byte_array: &GenericByteArray<FROM>) -> Self {
699        let offsets = byte_array.offsets();
700
701        let can_reuse_buffer = match offsets.last() {
702            Some(offset) => offset.as_usize() < u32::MAX as usize,
703            None => true,
704        };
705
706        if can_reuse_buffer {
707            // build views directly pointing to the existing buffer
708            let len = byte_array.len();
709            let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
710            let str_values_buf = byte_array.values().clone();
711            let block = views_builder.append_block(str_values_buf);
712            for (i, w) in offsets.windows(2).enumerate() {
713                let offset = w[0].as_usize();
714                let end = w[1].as_usize();
715                let length = end - offset;
716
717                if byte_array.is_null(i) {
718                    views_builder.append_null();
719                } else {
720                    // Safety: the input was a valid array so it valid UTF8 (if string). And
721                    // all offsets were valid
722                    unsafe {
723                        views_builder.append_view_unchecked(block, offset as u32, length as u32)
724                    }
725                }
726            }
727            assert_eq!(views_builder.len(), len);
728            views_builder.finish()
729        } else {
730            // Otherwise, create a new buffer for large strings
731            // TODO: the original buffer could still be used
732            // by making multiple slices of u32::MAX length
733            GenericByteViewArray::<V>::from_iter(byte_array.iter())
734        }
735    }
736}
737
738impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
739    fn from(mut array: GenericByteViewArray<T>) -> Self {
740        let len = array.len();
741        array.buffers.insert(0, array.views.into_inner());
742        let builder = ArrayDataBuilder::new(T::DATA_TYPE)
743            .len(len)
744            .buffers(array.buffers)
745            .nulls(array.nulls);
746
747        unsafe { builder.build_unchecked() }
748    }
749}
750
751impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
752where
753    Ptr: AsRef<T::Native> + 'a,
754    T: ByteViewType + ?Sized,
755{
756    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
757        iter.into_iter()
758            .map(|o| o.as_ref().map(|p| p.as_ref()))
759            .collect()
760    }
761}
762
763impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
764where
765    Ptr: AsRef<T::Native>,
766{
767    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
768        let iter = iter.into_iter();
769        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
770        builder.extend(iter);
771        builder.finish()
772    }
773}
774
775/// A [`GenericByteViewArray`] of `[u8]`
776///
777/// See [`GenericByteViewArray`] for format and layout details.
778///
779/// # Example
780/// ```
781/// use arrow_array::BinaryViewArray;
782/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
783/// assert_eq!(array.value(0), b"hello");
784/// assert_eq!(array.value(3), b"large payload over 12 bytes");
785/// ```
786pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
787
788impl BinaryViewArray {
789    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
790    /// If items not utf8 data, validate will fail and error returned.
791    pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
792        StringViewType::validate(self.views(), self.data_buffers())?;
793        unsafe { Ok(self.to_string_view_unchecked()) }
794    }
795
796    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
797    /// # Safety
798    /// Caller is responsible for ensuring that items in array are utf8 data.
799    pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
800        StringViewArray::new_unchecked(self.views, self.buffers, self.nulls)
801    }
802}
803
804impl From<Vec<&[u8]>> for BinaryViewArray {
805    fn from(v: Vec<&[u8]>) -> Self {
806        Self::from_iter_values(v)
807    }
808}
809
810impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
811    fn from(v: Vec<Option<&[u8]>>) -> Self {
812        v.into_iter().collect()
813    }
814}
815
816/// A [`GenericByteViewArray`] that stores utf8 data
817///
818/// See [`GenericByteViewArray`] for format and layout details.
819///
820/// # Example
821/// ```
822/// use arrow_array::StringViewArray;
823/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
824/// assert_eq!(array.value(0), "hello");
825/// assert_eq!(array.value(3), "large payload over 12 bytes");
826/// ```
827pub type StringViewArray = GenericByteViewArray<StringViewType>;
828
829impl StringViewArray {
830    /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
831    pub fn to_binary_view(self) -> BinaryViewArray {
832        unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
833    }
834
835    /// Returns true if all data within this array is ASCII
836    pub fn is_ascii(&self) -> bool {
837        // Alternative (but incorrect): directly check the underlying buffers
838        // (1) Our string view might be sparse, i.e., a subset of the buffers,
839        //      so even if the buffer is not ascii, we can still be ascii.
840        // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
841        // This means that this operation is quite expensive, shall we cache the result?
842        //  i.e. track `is_ascii` in the builder.
843        self.iter().all(|v| match v {
844            Some(v) => v.is_ascii(),
845            None => true,
846        })
847    }
848}
849
850impl From<Vec<&str>> for StringViewArray {
851    fn from(v: Vec<&str>) -> Self {
852        Self::from_iter_values(v)
853    }
854}
855
856impl From<Vec<Option<&str>>> for StringViewArray {
857    fn from(v: Vec<Option<&str>>) -> Self {
858        v.into_iter().collect()
859    }
860}
861
862impl From<Vec<String>> for StringViewArray {
863    fn from(v: Vec<String>) -> Self {
864        Self::from_iter_values(v)
865    }
866}
867
868impl From<Vec<Option<String>>> for StringViewArray {
869    fn from(v: Vec<Option<String>>) -> Self {
870        v.into_iter().collect()
871    }
872}
873
874#[cfg(test)]
875mod tests {
876    use crate::builder::{BinaryViewBuilder, StringViewBuilder};
877    use crate::{Array, BinaryViewArray, StringViewArray};
878    use arrow_buffer::{Buffer, ScalarBuffer};
879    use arrow_data::ByteView;
880
881    #[test]
882    fn try_new_string() {
883        let array = StringViewArray::from_iter_values(vec![
884            "hello",
885            "world",
886            "lulu",
887            "large payload over 12 bytes",
888        ]);
889        assert_eq!(array.value(0), "hello");
890        assert_eq!(array.value(3), "large payload over 12 bytes");
891    }
892
893    #[test]
894    fn try_new_binary() {
895        let array = BinaryViewArray::from_iter_values(vec![
896            b"hello".as_slice(),
897            b"world".as_slice(),
898            b"lulu".as_slice(),
899            b"large payload over 12 bytes".as_slice(),
900        ]);
901        assert_eq!(array.value(0), b"hello");
902        assert_eq!(array.value(3), b"large payload over 12 bytes");
903    }
904
905    #[test]
906    fn try_new_empty_string() {
907        // test empty array
908        let array = {
909            let mut builder = StringViewBuilder::new();
910            builder.finish()
911        };
912        assert!(array.is_empty());
913    }
914
915    #[test]
916    fn try_new_empty_binary() {
917        // test empty array
918        let array = {
919            let mut builder = BinaryViewBuilder::new();
920            builder.finish()
921        };
922        assert!(array.is_empty());
923    }
924
925    #[test]
926    fn test_append_string() {
927        // test builder append
928        let array = {
929            let mut builder = StringViewBuilder::new();
930            builder.append_value("hello");
931            builder.append_null();
932            builder.append_option(Some("large payload over 12 bytes"));
933            builder.finish()
934        };
935        assert_eq!(array.value(0), "hello");
936        assert!(array.is_null(1));
937        assert_eq!(array.value(2), "large payload over 12 bytes");
938    }
939
940    #[test]
941    fn test_append_binary() {
942        // test builder append
943        let array = {
944            let mut builder = BinaryViewBuilder::new();
945            builder.append_value(b"hello");
946            builder.append_null();
947            builder.append_option(Some(b"large payload over 12 bytes"));
948            builder.finish()
949        };
950        assert_eq!(array.value(0), b"hello");
951        assert!(array.is_null(1));
952        assert_eq!(array.value(2), b"large payload over 12 bytes");
953    }
954
955    #[test]
956    fn test_in_progress_recreation() {
957        let array = {
958            // make a builder with small block size.
959            let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
960            builder.append_value("large payload over 12 bytes");
961            builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
962            builder.finish()
963        };
964        assert_eq!(array.value(0), "large payload over 12 bytes");
965        assert_eq!(array.value(1), "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created");
966        assert_eq!(2, array.buffers.len());
967    }
968
969    #[test]
970    #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
971    fn new_with_invalid_view_data() {
972        let v = "large payload over 12 bytes";
973        let view = ByteView::new(13, &v.as_bytes()[0..4])
974            .with_buffer_index(3)
975            .with_offset(1);
976        let views = ScalarBuffer::from(vec![view.into()]);
977        let buffers = vec![Buffer::from_slice_ref(v)];
978        StringViewArray::new(views, buffers, None);
979    }
980
981    #[test]
982    #[should_panic(
983        expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
984    )]
985    fn new_with_invalid_utf8_data() {
986        let v: Vec<u8> = vec![
987            // invalid UTF8
988            0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
989            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
990        ];
991        let view = ByteView::new(v.len() as u32, &v[0..4]);
992        let views = ScalarBuffer::from(vec![view.into()]);
993        let buffers = vec![Buffer::from_slice_ref(v)];
994        StringViewArray::new(views, buffers, None);
995    }
996
997    #[test]
998    #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
999    fn new_with_invalid_zero_padding() {
1000        let mut data = [0; 12];
1001        data[0] = b'H';
1002        data[11] = 1; // no zero padding
1003
1004        let mut view_buffer = [0; 16];
1005        view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
1006        view_buffer[4..].copy_from_slice(&data);
1007
1008        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1009        let views = ScalarBuffer::from(vec![view.into()]);
1010        let buffers = vec![];
1011        StringViewArray::new(views, buffers, None);
1012    }
1013
1014    #[test]
1015    #[should_panic(expected = "Mismatch between embedded prefix and data")]
1016    fn test_mismatch_between_embedded_prefix_and_data() {
1017        let input_str_1 = "Hello, Rustaceans!";
1018        let input_str_2 = "Hallo, Rustaceans!";
1019        let length = input_str_1.len() as u32;
1020        assert!(input_str_1.len() > 12);
1021
1022        let mut view_buffer = [0; 16];
1023        view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
1024        view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
1025        view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
1026        view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
1027        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1028        let views = ScalarBuffer::from(vec![view.into()]);
1029        let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
1030
1031        StringViewArray::new(views, buffers, None);
1032    }
1033
1034    #[test]
1035    fn test_gc() {
1036        let test_data = [
1037            Some("longer than 12 bytes"),
1038            Some("short"),
1039            Some("t"),
1040            Some("longer than 12 bytes"),
1041            None,
1042            Some("short"),
1043        ];
1044
1045        let array = {
1046            let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1047            test_data.into_iter().for_each(|v| builder.append_option(v));
1048            builder.finish()
1049        };
1050        assert!(array.buffers.len() > 1);
1051
1052        fn check_gc(to_test: &StringViewArray) {
1053            let gc = to_test.gc();
1054            assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1055
1056            to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1057                assert_eq!(a, b);
1058            });
1059            assert_eq!(to_test.len(), gc.len());
1060        }
1061
1062        check_gc(&array);
1063        check_gc(&array.slice(1, 3));
1064        check_gc(&array.slice(2, 1));
1065        check_gc(&array.slice(2, 2));
1066        check_gc(&array.slice(3, 1));
1067    }
1068
1069    #[test]
1070    fn test_eq() {
1071        let test_data = [
1072            Some("longer than 12 bytes"),
1073            None,
1074            Some("short"),
1075            Some("again, this is longer than 12 bytes"),
1076        ];
1077
1078        let array1 = {
1079            let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
1080            test_data.into_iter().for_each(|v| builder.append_option(v));
1081            builder.finish()
1082        };
1083        let array2 = {
1084            // create a new array with the same data but different layout
1085            let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
1086            test_data.into_iter().for_each(|v| builder.append_option(v));
1087            builder.finish()
1088        };
1089        assert_eq!(array1, array1.clone());
1090        assert_eq!(array2, array2.clone());
1091        assert_eq!(array1, array2);
1092    }
1093}