arrow_array/array/byte_view_array.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
26use arrow_schema::{ArrowError, DataType};
27use core::str;
28use num_traits::ToPrimitive;
29use std::any::Any;
30use std::cmp::Ordering;
31use std::fmt::Debug;
32use std::marker::PhantomData;
33use std::sync::Arc;
34
35use super::ByteArrayType;
36
37/// [Variable-size Binary View Layout]: An array of variable length bytes views.
38///
39/// This array type is used to store variable length byte data (e.g. Strings, Binary)
40/// and has efficient operations such as `take`, `filter`, and comparison.
41///
42/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
43///
44/// This is different from [`GenericByteArray`], which also stores variable
45/// length byte data, as it represents strings with an offset and length. `take`
46/// and `filter` like operations are implemented by manipulating the "views"
47/// (`u128`) without modifying the bytes. Each view also stores an inlined
48/// prefix which speed up comparisons.
49///
50/// # See Also
51///
52/// * [`StringViewArray`] for storing utf8 encoded string data
53/// * [`BinaryViewArray`] for storing bytes
54/// * [`ByteView`] to interpret `u128`s layout of the views.
55///
56/// [`ByteView`]: arrow_data::ByteView
57///
58/// # Layout: "views" and buffers
59///
60/// A `GenericByteViewArray` stores variable length byte strings. An array of
61/// `N` elements is stored as `N` fixed length "views" and a variable number
62/// of variable length "buffers".
63///
64/// Each view is a `u128` value whose layout is different depending on the
65/// length of the string stored at that location:
66///
67/// ```text
68/// ┌──────┬────────────────────────┐
69/// │length│ string value │
70/// Strings (len <= 12) │ │ (padded with 0) │
71/// └──────┴────────────────────────┘
72/// 0 31 127
73///
74/// ┌───────┬───────┬───────┬───────┐
75/// │length │prefix │ buf │offset │
76/// Strings (len > 12) │ │ │ index │ │
77/// └───────┴───────┴───────┴───────┘
78/// 0 31 63 95 127
79/// ```
80///
81/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
82/// the view. See [`Self::inline_value`] to access the inlined prefix from a
83/// short view.
84///
85/// * Strings with length > 12: The first four bytes are stored inline in the
86/// view and the entire string is stored in one of the buffers. See [`ByteView`]
87/// to access the fields of the these views.
88///
89/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
90/// the easiest and fastest way to work with this data. However, it is possible
91/// to access the views and buffers directly for more control.
92///
93/// For example
94///
95/// ```rust
96/// # use arrow_array::StringViewArray;
97/// # use arrow_array::Array;
98/// use arrow_data::ByteView;
99/// let array = StringViewArray::from(vec![
100/// "hello",
101/// "this string is longer than 12 bytes",
102/// "this string is also longer than 12 bytes"
103/// ]);
104///
105/// // ** Examine the first view (short string) **
106/// assert!(array.is_valid(0)); // Check for nulls
107/// let short_view: u128 = array.views()[0]; // "hello"
108/// // get length of the string
109/// let len = short_view as u32;
110/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view
111/// // SAFETY: `view` is a valid view
112/// let value = unsafe {
113/// StringViewArray::inline_value(&short_view, len as usize)
114/// };
115/// assert_eq!(value, b"hello");
116///
117/// // ** Examine the third view (long string) **
118/// assert!(array.is_valid(12)); // Check for nulls
119/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
120/// let len = long_view as u32;
121/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
122/// let view = ByteView::from(long_view); // use ByteView to access the fields
123/// assert_eq!(view.length, 40);
124/// assert_eq!(view.buffer_index, 0);
125/// assert_eq!(view.offset, 35); // data starts after the first long string
126/// // Views for long strings store a 4 byte prefix
127/// let prefix = view.prefix.to_le_bytes();
128/// assert_eq!(&prefix, b"this");
129/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
130/// assert_eq!(value, "this string is also longer than 12 bytes");
131/// ```
132///
133/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
134/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
135///
136/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
137/// than they must point into a valid buffer. However, they can be out of order,
138/// non continuous and overlapping.
139///
140/// For example, in the following diagram, the strings "FishWasInTownToday" and
141/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
142/// separate buffer while the string "LavaMonster" is stored inlined in the
143/// view. In this case, the same bytes for "Fish" are used to store both strings.
144///
145/// [`ByteView`]: arrow_data::ByteView
146///
147/// ```text
148/// ┌───┐
149/// ┌──────┬──────┬──────┬──────┐ offset │...│
150/// "FishWasInTownTodayYay" │ 21 │ Fish │ 0 │ 115 │─ ─ 103 │Mr.│
151/// └──────┴──────┴──────┴──────┘ │ ┌ ─ ─ ─ ─ ▶ │Cru│
152/// ┌──────┬──────┬──────┬──────┐ │mpl│
153/// "CrumpleFacedFish" │ 16 │ Crum │ 0 │ 103 │─ ─│─ ─ ─ ┘ │eFa│
154/// └──────┴──────┴──────┴──────┘ │ced│
155/// ┌──────┬────────────────────┐ └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
156/// "LavaMonster" │ 11 │ LavaMonster │ │hWa│
157/// └──────┴────────────────────┘ offset │sIn│
158/// 115 │Tow│
159/// │nTo│
160/// │day│
161/// u128 "views" │Yay│
162/// buffer 0 │...│
163/// └───┘
164/// ```
165pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
166 data_type: DataType,
167 views: ScalarBuffer<u128>,
168 buffers: Arc<[Buffer]>,
169 phantom: PhantomData<T>,
170 nulls: Option<NullBuffer>,
171}
172
173impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
174 fn clone(&self) -> Self {
175 Self {
176 data_type: T::DATA_TYPE,
177 views: self.views.clone(),
178 buffers: self.buffers.clone(),
179 nulls: self.nulls.clone(),
180 phantom: Default::default(),
181 }
182 }
183}
184
185impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
186 /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
187 ///
188 /// # Panics
189 ///
190 /// Panics if [`GenericByteViewArray::try_new`] returns an error
191 pub fn new<U>(views: ScalarBuffer<u128>, buffers: U, nulls: Option<NullBuffer>) -> Self
192 where
193 U: Into<Arc<[Buffer]>>,
194 {
195 Self::try_new(views, buffers, nulls).unwrap()
196 }
197
198 /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
199 ///
200 /// # Errors
201 ///
202 /// * `views.len() != nulls.len()`
203 /// * [ByteViewType::validate] fails
204 pub fn try_new<U>(
205 views: ScalarBuffer<u128>,
206 buffers: U,
207 nulls: Option<NullBuffer>,
208 ) -> Result<Self, ArrowError>
209 where
210 U: Into<Arc<[Buffer]>>,
211 {
212 let buffers: Arc<[Buffer]> = buffers.into();
213
214 T::validate(&views, &buffers)?;
215
216 if let Some(n) = nulls.as_ref() {
217 if n.len() != views.len() {
218 return Err(ArrowError::InvalidArgumentError(format!(
219 "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
220 T::PREFIX,
221 views.len(),
222 n.len(),
223 )));
224 }
225 }
226
227 Ok(Self {
228 data_type: T::DATA_TYPE,
229 views,
230 buffers,
231 nulls,
232 phantom: Default::default(),
233 })
234 }
235
236 /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
237 ///
238 /// # Safety
239 ///
240 /// Safe if [`Self::try_new`] would not error
241 pub unsafe fn new_unchecked<U>(
242 views: ScalarBuffer<u128>,
243 buffers: U,
244 nulls: Option<NullBuffer>,
245 ) -> Self
246 where
247 U: Into<Arc<[Buffer]>>,
248 {
249 if cfg!(feature = "force_validate") {
250 return Self::new(views, buffers, nulls);
251 }
252
253 Self {
254 data_type: T::DATA_TYPE,
255 phantom: Default::default(),
256 views,
257 buffers: buffers.into(),
258 nulls,
259 }
260 }
261
262 /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
263 pub fn new_null(len: usize) -> Self {
264 Self {
265 data_type: T::DATA_TYPE,
266 views: vec![0; len].into(),
267 buffers: vec![].into(),
268 nulls: Some(NullBuffer::new_null(len)),
269 phantom: Default::default(),
270 }
271 }
272
273 /// Create a new [`Scalar`] from `value`
274 pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
275 Scalar::new(Self::from_iter_values(std::iter::once(value)))
276 }
277
278 /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
279 pub fn from_iter_values<Ptr, I>(iter: I) -> Self
280 where
281 Ptr: AsRef<T::Native>,
282 I: IntoIterator<Item = Ptr>,
283 {
284 let iter = iter.into_iter();
285 let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
286 for v in iter {
287 builder.append_value(v);
288 }
289 builder.finish()
290 }
291
292 /// Deconstruct this array into its constituent parts
293 pub fn into_parts(self) -> (ScalarBuffer<u128>, Arc<[Buffer]>, Option<NullBuffer>) {
294 (self.views, self.buffers, self.nulls)
295 }
296
297 /// Returns the views buffer
298 #[inline]
299 pub fn views(&self) -> &ScalarBuffer<u128> {
300 &self.views
301 }
302
303 /// Returns the buffers storing string data
304 #[inline]
305 pub fn data_buffers(&self) -> &[Buffer] {
306 &self.buffers
307 }
308
309 /// Returns the element at index `i`
310 ///
311 /// Note: This method does not check for nulls and the value is arbitrary
312 /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
313 ///
314 /// # Panics
315 /// Panics if index `i` is out of bounds.
316 pub fn value(&self, i: usize) -> &T::Native {
317 assert!(
318 i < self.len(),
319 "Trying to access an element at index {} from a {}ViewArray of length {}",
320 i,
321 T::PREFIX,
322 self.len()
323 );
324
325 unsafe { self.value_unchecked(i) }
326 }
327
328 /// Returns the element at index `i` without bounds checking
329 ///
330 /// Note: This method does not check for nulls and the value is arbitrary
331 /// if [`is_null`](Self::is_null) returns true for the index.
332 ///
333 /// # Safety
334 ///
335 /// Caller is responsible for ensuring that the index is within the bounds
336 /// of the array
337 pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
338 let v = unsafe { self.views.get_unchecked(idx) };
339 let len = *v as u32;
340 let b = if len <= MAX_INLINE_VIEW_LEN {
341 unsafe { Self::inline_value(v, len as usize) }
342 } else {
343 let view = ByteView::from(*v);
344 let data = unsafe { self.buffers.get_unchecked(view.buffer_index as usize) };
345 let offset = view.offset as usize;
346 unsafe { data.get_unchecked(offset..offset + len as usize) }
347 };
348 unsafe { T::Native::from_bytes_unchecked(b) }
349 }
350
351 /// Returns the first `len` bytes the inline value of the view.
352 ///
353 /// # Safety
354 /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
355 /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
356 #[inline(always)]
357 pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
358 debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
359 unsafe {
360 std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
361 }
362 }
363
364 /// Constructs a new iterator for iterating over the values of this array
365 pub fn iter(&self) -> ArrayIter<&Self> {
366 ArrayIter::new(self)
367 }
368
369 /// Returns an iterator over the bytes of this array, including null values
370 pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
371 self.views.iter().map(move |v| {
372 let len = *v as u32;
373 if len <= MAX_INLINE_VIEW_LEN {
374 unsafe { Self::inline_value(v, len as usize) }
375 } else {
376 let view = ByteView::from(*v);
377 let data = &self.buffers[view.buffer_index as usize];
378 let offset = view.offset as usize;
379 unsafe { data.get_unchecked(offset..offset + len as usize) }
380 }
381 })
382 }
383
384 /// Returns an iterator over the first `prefix_len` bytes of each array
385 /// element, including null values.
386 ///
387 /// If `prefix_len` is larger than the element's length, the iterator will
388 /// return an empty slice (`&[]`).
389 pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
390 self.views().into_iter().map(move |v| {
391 let len = (*v as u32) as usize;
392
393 if len < prefix_len {
394 return &[] as &[u8];
395 }
396
397 if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
398 unsafe { StringViewArray::inline_value(v, prefix_len) }
399 } else {
400 let view = ByteView::from(*v);
401 let data = unsafe {
402 self.data_buffers()
403 .get_unchecked(view.buffer_index as usize)
404 };
405 let offset = view.offset as usize;
406 unsafe { data.get_unchecked(offset..offset + prefix_len) }
407 }
408 })
409 }
410
411 /// Returns an iterator over the last `suffix_len` bytes of each array
412 /// element, including null values.
413 ///
414 /// Note that for [`StringViewArray`] the last bytes may start in the middle
415 /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
416 ///
417 /// If `suffix_len` is larger than the element's length, the iterator will
418 /// return an empty slice (`&[]`).
419 pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
420 self.views().into_iter().map(move |v| {
421 let len = (*v as u32) as usize;
422
423 if len < suffix_len {
424 return &[] as &[u8];
425 }
426
427 if len as u32 <= MAX_INLINE_VIEW_LEN {
428 unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
429 } else {
430 let view = ByteView::from(*v);
431 let data = unsafe {
432 self.data_buffers()
433 .get_unchecked(view.buffer_index as usize)
434 };
435 let offset = view.offset as usize;
436 unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
437 }
438 })
439 }
440
441 /// Return an iterator over the length of each array element, including null values.
442 ///
443 /// Null values length would equal to the underlying bytes length and NOT 0
444 ///
445 /// Example of getting 0 for null values
446 /// ```rust
447 /// # use arrow_array::StringViewArray;
448 /// # use arrow_array::Array;
449 /// use arrow_data::ByteView;
450 ///
451 /// fn lengths_with_zero_for_nulls(view: &StringViewArray) -> impl Iterator<Item = u32> {
452 /// view.lengths()
453 /// .enumerate()
454 /// .map(|(index, length)| if view.is_null(index) { 0 } else { length })
455 /// }
456 /// ```
457 pub fn lengths(&self) -> impl ExactSizeIterator<Item = u32> + Clone {
458 self.views().iter().map(|v| *v as u32)
459 }
460
461 /// Returns a zero-copy slice of this array with the indicated offset and length.
462 pub fn slice(&self, offset: usize, length: usize) -> Self {
463 Self {
464 data_type: T::DATA_TYPE,
465 views: self.views.slice(offset, length),
466 buffers: self.buffers.clone(),
467 nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
468 phantom: Default::default(),
469 }
470 }
471
472 /// Returns a "compacted" version of this array
473 ///
474 /// The original array will *not* be modified
475 ///
476 /// # Garbage Collection
477 ///
478 /// Before GC:
479 /// ```text
480 /// ┌──────┐
481 /// │......│
482 /// │......│
483 /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
484 /// │ View 1 │─ ─ ─ ─ │......│ with data that
485 /// ├────────────────────┤ │......│ is not referred
486 /// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
487 /// └────────────────────┘ │......│ View 2
488 /// │......│
489 /// 2 views, refer to │......│
490 /// small portions of a └──────┘
491 /// large buffer
492 /// ```
493 ///
494 /// After GC:
495 ///
496 /// ```text
497 /// ┌────────────────────┐ ┌─────┐ After gc, only
498 /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
499 /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
500 /// │ View 2 │─ ─ ─ ─ └─────┘ the views is
501 /// └────────────────────┘ left
502 ///
503 ///
504 /// 2 views
505 /// ```
506 /// This method will compact the data buffers by recreating the view array and only include the data
507 /// that is pointed to by the views.
508 ///
509 /// Note that it will copy the array regardless of whether the original array is compact.
510 /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
511 /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
512 ///
513 /// Note: this function does not attempt to canonicalize / deduplicate values. For this
514 /// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
515 pub fn gc(&self) -> Self {
516 // 1) Read basic properties once
517 let len = self.len(); // number of elements
518 let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap
519
520 // 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks
521 if self.data_buffers().is_empty() {
522 return unsafe {
523 GenericByteViewArray::new_unchecked(
524 self.views().clone(),
525 vec![], // empty data blocks
526 nulls,
527 )
528 };
529 }
530
531 // 2) Calculate total size of all non-inline data and detect if any exists
532 let total_large = self.total_buffer_bytes_used();
533
534 // 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing
535 if total_large == 0 {
536 // Views are inline-only or all null; just reuse original views and no data blocks
537 return unsafe {
538 GenericByteViewArray::new_unchecked(
539 self.views().clone(),
540 vec![], // empty data blocks
541 nulls,
542 )
543 };
544 }
545
546 let (views_buf, data_blocks) = if total_large < i32::MAX as usize {
547 // fast path, the entire data fits in a single buffer
548 // 3) Allocate exactly capacity for all non-inline data
549 let mut data_buf = Vec::with_capacity(total_large);
550
551 // 4) Iterate over views and process each inline/non-inline view
552 let views_buf: Vec<u128> = (0..len)
553 .map(|i| unsafe { self.copy_view_to_buffer(i, 0, &mut data_buf) })
554 .collect();
555 let data_block = Buffer::from_vec(data_buf);
556 let data_blocks = vec![data_block];
557 (views_buf, data_blocks)
558 } else {
559 // slow path, need to split into multiple buffers
560
561 struct GcCopyGroup {
562 total_buffer_bytes: usize,
563 total_len: usize,
564 }
565
566 impl GcCopyGroup {
567 fn new(total_buffer_bytes: u32, total_len: usize) -> Self {
568 Self {
569 total_buffer_bytes: total_buffer_bytes as usize,
570 total_len,
571 }
572 }
573 }
574
575 let mut groups = Vec::new();
576 let mut current_length = 0;
577 let mut current_elements = 0;
578
579 for view in self.views() {
580 let len = *view as u32;
581 if len > MAX_INLINE_VIEW_LEN {
582 if current_length + len > i32::MAX as u32 {
583 // Start a new group
584 groups.push(GcCopyGroup::new(current_length, current_elements));
585 current_length = 0;
586 current_elements = 0;
587 }
588 current_length += len;
589 current_elements += 1;
590 }
591 }
592 if current_elements != 0 {
593 groups.push(GcCopyGroup::new(current_length, current_elements));
594 }
595 debug_assert!(groups.len() <= i32::MAX as usize);
596
597 // 3) Copy the buffers group by group
598 let mut views_buf = Vec::with_capacity(len);
599 let mut data_blocks = Vec::with_capacity(groups.len());
600
601 let mut current_view_idx = 0;
602
603 for (group_idx, gc_copy_group) in groups.iter().enumerate() {
604 let mut data_buf = Vec::with_capacity(gc_copy_group.total_buffer_bytes);
605
606 // Directly push views to avoid intermediate Vec allocation
607 let new_views = (current_view_idx..current_view_idx + gc_copy_group.total_len).map(
608 |view_idx| {
609 // safety: the view index came from iterating over valid range
610 unsafe {
611 self.copy_view_to_buffer(view_idx, group_idx as i32, &mut data_buf)
612 }
613 },
614 );
615 views_buf.extend(new_views);
616
617 data_blocks.push(Buffer::from_vec(data_buf));
618 current_view_idx += gc_copy_group.total_len;
619 }
620 (views_buf, data_blocks)
621 };
622
623 // 5) Wrap up views buffer
624 let views_scalar = ScalarBuffer::from(views_buf);
625
626 // SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized
627 unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) }
628 }
629
630 /// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer.
631 ///
632 /// # Safety
633 ///
634 /// - `i < self.len()`.
635 /// - Every element in `self.views()` must currently refer to a valid slice
636 /// inside one of `self.buffers`.
637 /// - `data_buf` must be ready to have additional bytes appended.
638 /// - After this call, the returned view will have its
639 /// `buffer_index` reset to `buffer_idx` and its `offset` updated so that it points
640 /// into the bytes just appended at the end of `data_buf`.
641 #[inline(always)]
642 unsafe fn copy_view_to_buffer(
643 &self,
644 i: usize,
645 buffer_idx: i32,
646 data_buf: &mut Vec<u8>,
647 ) -> u128 {
648 // SAFETY: `i < self.len()` ensures this is in‑bounds.
649 let raw_view = unsafe { *self.views().get_unchecked(i) };
650 let mut bv = ByteView::from(raw_view);
651
652 // Inline‑small views stay as‑is.
653 if bv.length <= MAX_INLINE_VIEW_LEN {
654 raw_view
655 } else {
656 // SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length`
657 // must both lie within valid ranges for `self.buffers`.
658 let buffer = unsafe { self.buffers.get_unchecked(bv.buffer_index as usize) };
659 let start = bv.offset as usize;
660 let end = start + bv.length as usize;
661 let slice = unsafe { buffer.get_unchecked(start..end) };
662
663 // Copy out‑of‑line data into our single “0” buffer.
664 let new_offset = data_buf.len() as u32;
665 data_buf.extend_from_slice(slice);
666
667 bv.buffer_index = buffer_idx as u32;
668 bv.offset = new_offset;
669 bv.into()
670 }
671 }
672
673 /// Returns the total number of bytes of all non-null values in this array.
674 ///
675 /// Unlike [`Self::total_buffer_bytes_used`], this method includes inlined strings
676 /// (those with length ≤ [`MAX_INLINE_VIEW_LEN`]), making it suitable as a
677 /// capacity hint when pre-allocating output buffers.
678 ///
679 /// Null values are excluded from the sum.
680 ///
681 /// # Example
682 ///
683 /// ```rust
684 /// # use arrow_array::StringViewArray;
685 /// let array = StringViewArray::from_iter(vec![
686 /// Some("hello"), // 5 bytes, inlined
687 /// None, // excluded
688 /// Some("large payload over 12 bytes"), // 27 bytes, non-inlined
689 /// ]);
690 /// assert_eq!(array.total_bytes_len(), 5 + 27);
691 /// ```
692 pub fn total_bytes_len(&self) -> usize {
693 match self.nulls() {
694 None => self.views().iter().map(|v| (*v as u32) as usize).sum(),
695 Some(nulls) => self
696 .views()
697 .iter()
698 .zip(nulls.iter())
699 .map(|(v, is_valid)| if is_valid { (*v as u32) as usize } else { 0 })
700 .sum(),
701 }
702 }
703
704 /// Returns the total number of bytes used by all non inlined views in all
705 /// buffers.
706 ///
707 /// Note this does not account for views that point at the same underlying
708 /// data in buffers
709 ///
710 /// For example, if the array has three strings views:
711 /// * View with length = 9 (inlined)
712 /// * View with length = 32 (non inlined)
713 /// * View with length = 16 (non inlined)
714 ///
715 /// Then this method would report 48
716 pub fn total_buffer_bytes_used(&self) -> usize {
717 self.views()
718 .iter()
719 .map(|v| {
720 let len = *v as u32;
721 if len > MAX_INLINE_VIEW_LEN {
722 len as usize
723 } else {
724 0
725 }
726 })
727 .sum()
728 }
729
730 /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
731 ///
732 /// Comparing two ByteView types are non-trivial.
733 /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
734 ///
735 /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
736 /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
737 /// Meaning that reading one array element requires only one memory access
738 /// (two memory access required for StringArray, one for offset buffer, the other for value buffer).
739 ///
740 /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
741 /// thanks to the inlined 4 bytes.
742 /// Consider equality check:
743 /// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
744 ///
745 /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
746 /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
747 /// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
748 ///
749 /// # Order check flow
750 /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
751 /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
752 /// (2.1) if the inlined 4 bytes are different, we can return the result immediately.
753 /// (2.2) o.w., we need to compare the full string.
754 ///
755 /// # Safety
756 /// The left/right_idx must within range of each array
757 pub unsafe fn compare_unchecked(
758 left: &GenericByteViewArray<T>,
759 left_idx: usize,
760 right: &GenericByteViewArray<T>,
761 right_idx: usize,
762 ) -> Ordering {
763 let l_view = unsafe { left.views().get_unchecked(left_idx) };
764 let l_byte_view = ByteView::from(*l_view);
765
766 let r_view = unsafe { right.views().get_unchecked(right_idx) };
767 let r_byte_view = ByteView::from(*r_view);
768
769 let l_len = l_byte_view.length;
770 let r_len = r_byte_view.length;
771
772 if l_len <= 12 && r_len <= 12 {
773 return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view));
774 }
775
776 // one of the string is larger than 12 bytes,
777 // we then try to compare the inlined data first
778
779 // Note: In theory, ByteView is only used for string which is larger than 12 bytes,
780 // but we can still use it to get the inlined prefix for shorter strings.
781 // The prefix is always the first 4 bytes of the view, for both short and long strings.
782 let l_inlined_be = l_byte_view.prefix.swap_bytes();
783 let r_inlined_be = r_byte_view.prefix.swap_bytes();
784 if l_inlined_be != r_inlined_be {
785 return l_inlined_be.cmp(&r_inlined_be);
786 }
787
788 // unfortunately, we need to compare the full data
789 let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
790 let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
791
792 l_full_data.cmp(r_full_data)
793 }
794
795 /// Builds a 128-bit composite key for an inline value:
796 ///
797 /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting).
798 /// - Low 32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings
799 /// (or those with fewer meaningful bytes) always numerically sort before longer ones.
800 ///
801 /// This function extracts the length and the 12-byte inline string data from the raw
802 /// little-endian `u128` representation, converts them to big-endian ordering, and packs them
803 /// into a single `u128` value suitable for fast, branchless comparisons.
804 ///
805 /// # Why include length?
806 ///
807 /// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes
808 /// compare equal—either because one is a true prefix of the other or because zero-padding
809 /// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare
810 /// handles both content and length in one go.
811 ///
812 /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes)
813 ///
814 /// | String | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding) |
815 /// |------------|-----------------------|---------------------------------|
816 /// | `"bar"` | `03 00 00 00` | `62 61 72` + 9 × `00` |
817 /// | `"bar\0"`| `04 00 00 00` | `62 61 72 00` + 8 × `00` |
818 ///
819 /// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field
820 /// then differentiates:
821 ///
822 /// ```text
823 /// key("bar") = 0x0000000000000000000062617200000003
824 /// key("bar\0") = 0x0000000000000000000062617200000004
825 /// ⇒ key("bar") < key("bar\0")
826 /// ```
827 /// - `raw` is treated as a 128-bit integer with its bits laid out as follows:
828 /// - bits 0–31: length (little-endian)
829 /// - bits 32–127: data (little-endian)
830 ///
831 /// # Inlining and Endianness
832 ///
833 /// This function uses platform-independent bitwise operations to construct a 128-bit key:
834 /// - `raw.swap_bytes() << 32` effectively clears the length bits and shifts the 12-byte inline data
835 /// into the high 96 bits in Big-Endian order. This ensures the first byte of the string
836 /// is the most significant byte of the resulting `u128`.
837 /// - `raw as u32` extracts the length as a numeric integer, which is then placed in the low 32 bits.
838 #[inline(always)]
839 pub fn inline_key_fast(raw: u128) -> u128 {
840 (raw.swap_bytes() << 32) | (raw as u32 as u128)
841 }
842}
843
844impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
845 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
846 write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
847 print_long_array(self, f, |array, index, f| {
848 std::fmt::Debug::fmt(&array.value(index), f)
849 })?;
850 write!(f, "]")
851 }
852}
853
854/// SAFETY: Correctly implements the contract of Arrow Arrays
855unsafe impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
856 fn as_any(&self) -> &dyn Any {
857 self
858 }
859
860 fn to_data(&self) -> ArrayData {
861 self.clone().into()
862 }
863
864 fn into_data(self) -> ArrayData {
865 self.into()
866 }
867
868 fn data_type(&self) -> &DataType {
869 &self.data_type
870 }
871
872 fn slice(&self, offset: usize, length: usize) -> ArrayRef {
873 Arc::new(self.slice(offset, length))
874 }
875
876 fn len(&self) -> usize {
877 self.views.len()
878 }
879
880 fn is_empty(&self) -> bool {
881 self.views.is_empty()
882 }
883
884 fn shrink_to_fit(&mut self) {
885 self.views.shrink_to_fit();
886
887 // The goal of `shrink_to_fit` is to minimize the space used by any of
888 // its allocations. The use of `Arc::get_mut` over `Arc::make_mut` is
889 // because if the reference count is greater than 1, `Arc::make_mut`
890 // will first clone its contents. So, any large allocations will first
891 // be cloned before being shrunk, leaving the pre-cloned allocations
892 // intact, before adding the extra (used) space of the new clones.
893 if let Some(buffers) = Arc::get_mut(&mut self.buffers) {
894 buffers.iter_mut().for_each(|b| b.shrink_to_fit());
895 }
896
897 // With the assumption that this is a best-effort function, no attempt
898 // is made to shrink `self.buffers`, which it can't because it's type
899 // does not expose a `shrink_to_fit` method.
900
901 if let Some(nulls) = &mut self.nulls {
902 nulls.shrink_to_fit();
903 }
904 }
905
906 fn offset(&self) -> usize {
907 0
908 }
909
910 fn nulls(&self) -> Option<&NullBuffer> {
911 self.nulls.as_ref()
912 }
913
914 fn logical_null_count(&self) -> usize {
915 // More efficient that the default implementation
916 self.null_count()
917 }
918
919 fn get_buffer_memory_size(&self) -> usize {
920 let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
921 sum += self.views.inner().capacity();
922 if let Some(x) = &self.nulls {
923 sum += x.buffer().capacity()
924 }
925 sum
926 }
927
928 fn get_array_memory_size(&self) -> usize {
929 std::mem::size_of::<Self>() + self.get_buffer_memory_size()
930 }
931
932 #[cfg(feature = "pool")]
933 fn claim(&self, pool: &dyn arrow_buffer::MemoryPool) {
934 self.views.claim(pool);
935 for buffer in self.buffers.iter() {
936 buffer.claim(pool);
937 }
938 if let Some(nulls) = &self.nulls {
939 nulls.claim(pool);
940 }
941 }
942}
943
944impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
945 type Item = &'a T::Native;
946
947 fn value(&self, index: usize) -> Self::Item {
948 GenericByteViewArray::value(self, index)
949 }
950
951 unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
952 unsafe { GenericByteViewArray::value_unchecked(self, index) }
953 }
954}
955
956impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
957 type Item = Option<&'a T::Native>;
958 type IntoIter = ArrayIter<Self>;
959
960 fn into_iter(self) -> Self::IntoIter {
961 ArrayIter::new(self)
962 }
963}
964
965impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
966 fn from(data: ArrayData) -> Self {
967 let (data_type, len, nulls, offset, buffers, _child_data) = data.into_parts();
968 assert_eq!(
969 data_type,
970 T::DATA_TYPE,
971 "Mismatched data type, expected {}, got {data_type}",
972 T::DATA_TYPE
973 );
974 let mut buffers = buffers.into_iter();
975 // first buffer is views, remaining are data buffers
976 let views = ScalarBuffer::new(buffers.next().unwrap(), offset, len);
977 Self {
978 data_type,
979 views,
980 buffers: Arc::from_iter(buffers),
981 nulls,
982 phantom: Default::default(),
983 }
984 }
985}
986
987/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
988///
989/// For example this method can convert a [`StringArray`] to a
990/// [`StringViewArray`].
991///
992/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
993/// is built without copying the underlying string data (views are created
994/// directly into the existing buffer)
995///
996/// [`StringArray`]: crate::StringArray
997impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
998where
999 FROM: ByteArrayType,
1000 FROM::Offset: OffsetSizeTrait + ToPrimitive,
1001 V: ByteViewType<Native = FROM::Native>,
1002{
1003 fn from(byte_array: &GenericByteArray<FROM>) -> Self {
1004 let offsets = byte_array.offsets();
1005
1006 let can_reuse_buffer = match offsets.last() {
1007 Some(offset) => offset.as_usize() < u32::MAX as usize,
1008 None => true,
1009 };
1010
1011 if can_reuse_buffer {
1012 // build views directly pointing to the existing buffer
1013 let len = byte_array.len();
1014 let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
1015 let str_values_buf = byte_array.values().clone();
1016 let block = views_builder.append_block(str_values_buf);
1017 for (i, w) in offsets.windows(2).enumerate() {
1018 let offset = w[0].as_usize();
1019 let end = w[1].as_usize();
1020 let length = end - offset;
1021
1022 if byte_array.is_null(i) {
1023 views_builder.append_null();
1024 } else {
1025 // Safety: the input was a valid array so it valid UTF8 (if string). And
1026 // all offsets were valid
1027 unsafe {
1028 views_builder.append_view_unchecked(block, offset as u32, length as u32)
1029 }
1030 }
1031 }
1032 assert_eq!(views_builder.len(), len);
1033 views_builder.finish()
1034 } else {
1035 // Otherwise, create a new buffer for large strings
1036 // TODO: the original buffer could still be used
1037 // by making multiple slices of u32::MAX length
1038 GenericByteViewArray::<V>::from_iter(byte_array.iter())
1039 }
1040 }
1041}
1042
1043impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
1044 fn from(array: GenericByteViewArray<T>) -> Self {
1045 let len = array.len();
1046
1047 let mut buffers = array.buffers.to_vec();
1048 buffers.insert(0, array.views.into_inner());
1049
1050 let builder = ArrayDataBuilder::new(T::DATA_TYPE)
1051 .len(len)
1052 .buffers(buffers)
1053 .nulls(array.nulls);
1054
1055 unsafe { builder.build_unchecked() }
1056 }
1057}
1058
1059impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
1060where
1061 Ptr: AsRef<T::Native> + 'a,
1062 T: ByteViewType + ?Sized,
1063{
1064 fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
1065 iter.into_iter()
1066 .map(|o| o.as_ref().map(|p| p.as_ref()))
1067 .collect()
1068 }
1069}
1070
1071impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
1072where
1073 Ptr: AsRef<T::Native>,
1074{
1075 fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
1076 let iter = iter.into_iter();
1077 let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
1078 builder.extend(iter);
1079 builder.finish()
1080 }
1081}
1082
1083/// A [`GenericByteViewArray`] of `[u8]`
1084///
1085/// See [`GenericByteViewArray`] for format and layout details.
1086///
1087/// # Example
1088/// ```
1089/// use arrow_array::BinaryViewArray;
1090/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
1091/// assert_eq!(array.value(0), b"hello");
1092/// assert_eq!(array.value(3), b"large payload over 12 bytes");
1093/// ```
1094pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
1095
1096impl BinaryViewArray {
1097 /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1098 /// If items not utf8 data, validate will fail and error returned.
1099 pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
1100 StringViewType::validate(self.views(), self.data_buffers())?;
1101 unsafe { Ok(self.to_string_view_unchecked()) }
1102 }
1103
1104 /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1105 /// # Safety
1106 /// Caller is responsible for ensuring that items in array are utf8 data.
1107 pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
1108 unsafe { StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1109 }
1110}
1111
1112impl From<Vec<&[u8]>> for BinaryViewArray {
1113 fn from(v: Vec<&[u8]>) -> Self {
1114 Self::from_iter_values(v)
1115 }
1116}
1117
1118impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
1119 fn from(v: Vec<Option<&[u8]>>) -> Self {
1120 v.into_iter().collect()
1121 }
1122}
1123
1124/// A [`GenericByteViewArray`] that stores utf8 data
1125///
1126/// See [`GenericByteViewArray`] for format and layout details.
1127///
1128/// # Example
1129/// ```
1130/// use arrow_array::StringViewArray;
1131/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
1132/// assert_eq!(array.value(0), "hello");
1133/// assert_eq!(array.value(3), "large payload over 12 bytes");
1134/// ```
1135pub type StringViewArray = GenericByteViewArray<StringViewType>;
1136
1137impl StringViewArray {
1138 /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
1139 pub fn to_binary_view(self) -> BinaryViewArray {
1140 unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1141 }
1142
1143 /// Returns true if all data within this array is ASCII
1144 pub fn is_ascii(&self) -> bool {
1145 // Alternative (but incorrect): directly check the underlying buffers
1146 // (1) Our string view might be sparse, i.e., a subset of the buffers,
1147 // so even if the buffer is not ascii, we can still be ascii.
1148 // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
1149 // This means that this operation is quite expensive, shall we cache the result?
1150 // i.e. track `is_ascii` in the builder.
1151 self.iter().all(|v| match v {
1152 Some(v) => v.is_ascii(),
1153 None => true,
1154 })
1155 }
1156}
1157
1158impl From<Vec<&str>> for StringViewArray {
1159 fn from(v: Vec<&str>) -> Self {
1160 Self::from_iter_values(v)
1161 }
1162}
1163
1164impl From<Vec<Option<&str>>> for StringViewArray {
1165 fn from(v: Vec<Option<&str>>) -> Self {
1166 v.into_iter().collect()
1167 }
1168}
1169
1170impl From<Vec<String>> for StringViewArray {
1171 fn from(v: Vec<String>) -> Self {
1172 Self::from_iter_values(v)
1173 }
1174}
1175
1176impl From<Vec<Option<String>>> for StringViewArray {
1177 fn from(v: Vec<Option<String>>) -> Self {
1178 v.into_iter().collect()
1179 }
1180}
1181
1182#[cfg(test)]
1183mod tests {
1184 use crate::builder::{BinaryViewBuilder, StringViewBuilder};
1185 use crate::types::BinaryViewType;
1186 use crate::{
1187 Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
1188 };
1189 use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
1190 use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
1191 use arrow_schema::DataType;
1192 use rand::prelude::StdRng;
1193 use rand::{Rng, SeedableRng};
1194 use std::str::from_utf8;
1195
1196 const BLOCK_SIZE: u32 = 8;
1197
1198 #[test]
1199 fn try_new_string() {
1200 let array = StringViewArray::from_iter_values(vec![
1201 "hello",
1202 "world",
1203 "lulu",
1204 "large payload over 12 bytes",
1205 ]);
1206 assert_eq!(array.value(0), "hello");
1207 assert_eq!(array.value(3), "large payload over 12 bytes");
1208 }
1209
1210 #[test]
1211 fn try_new_binary() {
1212 let array = BinaryViewArray::from_iter_values(vec![
1213 b"hello".as_slice(),
1214 b"world".as_slice(),
1215 b"lulu".as_slice(),
1216 b"large payload over 12 bytes".as_slice(),
1217 ]);
1218 assert_eq!(array.value(0), b"hello");
1219 assert_eq!(array.value(3), b"large payload over 12 bytes");
1220 }
1221
1222 #[test]
1223 fn try_new_empty_string() {
1224 // test empty array
1225 let array = {
1226 let mut builder = StringViewBuilder::new();
1227 builder.finish()
1228 };
1229 assert!(array.is_empty());
1230 }
1231
1232 #[test]
1233 fn try_new_empty_binary() {
1234 // test empty array
1235 let array = {
1236 let mut builder = BinaryViewBuilder::new();
1237 builder.finish()
1238 };
1239 assert!(array.is_empty());
1240 }
1241
1242 #[test]
1243 fn test_append_string() {
1244 // test builder append
1245 let array = {
1246 let mut builder = StringViewBuilder::new();
1247 builder.append_value("hello");
1248 builder.append_null();
1249 builder.append_option(Some("large payload over 12 bytes"));
1250 builder.finish()
1251 };
1252 assert_eq!(array.value(0), "hello");
1253 assert!(array.is_null(1));
1254 assert_eq!(array.value(2), "large payload over 12 bytes");
1255 }
1256
1257 #[test]
1258 fn test_append_binary() {
1259 // test builder append
1260 let array = {
1261 let mut builder = BinaryViewBuilder::new();
1262 builder.append_value(b"hello");
1263 builder.append_null();
1264 builder.append_option(Some(b"large payload over 12 bytes"));
1265 builder.finish()
1266 };
1267 assert_eq!(array.value(0), b"hello");
1268 assert!(array.is_null(1));
1269 assert_eq!(array.value(2), b"large payload over 12 bytes");
1270 }
1271
1272 #[test]
1273 fn test_in_progress_recreation() {
1274 let array = {
1275 // make a builder with small block size.
1276 let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
1277 builder.append_value("large payload over 12 bytes");
1278 builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
1279 builder.finish()
1280 };
1281 assert_eq!(array.value(0), "large payload over 12 bytes");
1282 assert_eq!(
1283 array.value(1),
1284 "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"
1285 );
1286 assert_eq!(2, array.buffers.len());
1287 }
1288
1289 #[test]
1290 #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
1291 fn new_with_invalid_view_data() {
1292 let v = "large payload over 12 bytes";
1293 let view = ByteView::new(13, &v.as_bytes()[0..4])
1294 .with_buffer_index(3)
1295 .with_offset(1);
1296 let views = ScalarBuffer::from(vec![view.into()]);
1297 let buffers = vec![Buffer::from_slice_ref(v)];
1298 StringViewArray::new(views, buffers, None);
1299 }
1300
1301 #[test]
1302 #[should_panic(
1303 expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
1304 )]
1305 fn new_with_invalid_utf8_data() {
1306 let v: Vec<u8> = vec![
1307 // invalid UTF8
1308 0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
1309 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1310 ];
1311 let view = ByteView::new(v.len() as u32, &v[0..4]);
1312 let views = ScalarBuffer::from(vec![view.into()]);
1313 let buffers = vec![Buffer::from_slice_ref(v)];
1314 StringViewArray::new(views, buffers, None);
1315 }
1316
1317 #[test]
1318 #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
1319 fn new_with_invalid_zero_padding() {
1320 let mut data = [0; 12];
1321 data[0] = b'H';
1322 data[11] = 1; // no zero padding
1323
1324 let mut view_buffer = [0; 16];
1325 view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
1326 view_buffer[4..].copy_from_slice(&data);
1327
1328 let view = ByteView::from(u128::from_le_bytes(view_buffer));
1329 let views = ScalarBuffer::from(vec![view.into()]);
1330 let buffers = vec![];
1331 StringViewArray::new(views, buffers, None);
1332 }
1333
1334 #[test]
1335 #[should_panic(expected = "Mismatch between embedded prefix and data")]
1336 fn test_mismatch_between_embedded_prefix_and_data() {
1337 let input_str_1 = "Hello, Rustaceans!";
1338 let input_str_2 = "Hallo, Rustaceans!";
1339 let length = input_str_1.len() as u32;
1340 assert!(input_str_1.len() > 12);
1341
1342 let mut view_buffer = [0; 16];
1343 view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
1344 view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
1345 view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
1346 view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
1347 let view = ByteView::from(u128::from_le_bytes(view_buffer));
1348 let views = ScalarBuffer::from(vec![view.into()]);
1349 let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
1350
1351 StringViewArray::new(views, buffers, None);
1352 }
1353
1354 #[test]
1355 fn test_gc() {
1356 let test_data = [
1357 Some("longer than 12 bytes"),
1358 Some("short"),
1359 Some("t"),
1360 Some("longer than 12 bytes"),
1361 None,
1362 Some("short"),
1363 ];
1364
1365 let array = {
1366 let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1367 test_data.into_iter().for_each(|v| builder.append_option(v));
1368 builder.finish()
1369 };
1370 assert!(array.buffers.len() > 1);
1371
1372 fn check_gc(to_test: &StringViewArray) {
1373 let gc = to_test.gc();
1374 assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1375
1376 to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1377 assert_eq!(a, b);
1378 });
1379 assert_eq!(to_test.len(), gc.len());
1380 }
1381
1382 check_gc(&array);
1383 check_gc(&array.slice(1, 3));
1384 check_gc(&array.slice(2, 1));
1385 check_gc(&array.slice(2, 2));
1386 check_gc(&array.slice(3, 1));
1387 }
1388
1389 /// 1) Empty array: no elements, expect gc to return empty with no data buffers
1390 #[test]
1391 fn test_gc_empty_array() {
1392 let array = StringViewBuilder::new()
1393 .with_fixed_block_size(BLOCK_SIZE)
1394 .finish();
1395 let gced = array.gc();
1396 // length and null count remain zero
1397 assert_eq!(gced.len(), 0);
1398 assert_eq!(gced.null_count(), 0);
1399 // no underlying data buffers should be allocated
1400 assert!(
1401 gced.data_buffers().is_empty(),
1402 "Expected no data buffers for empty array"
1403 );
1404 }
1405
1406 /// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values
1407 #[test]
1408 fn test_gc_all_inline() {
1409 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1410 // append many short strings, each exactly INLINE_LEN long
1411 for _ in 0..100 {
1412 let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize);
1413 builder.append_option(Some(&s));
1414 }
1415 let array = builder.finish();
1416 let gced = array.gc();
1417 // Since all views fit inline, data buffer is empty
1418 assert_eq!(
1419 gced.data_buffers().len(),
1420 0,
1421 "Should have no data buffers for inline values"
1422 );
1423 assert_eq!(gced.len(), 100);
1424 // verify element-wise equality
1425 array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1426 assert_eq!(orig, got, "Inline value mismatch after gc");
1427 });
1428 }
1429
1430 /// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer
1431 #[test]
1432 fn test_gc_all_large() {
1433 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1434 let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5);
1435 // append multiple large strings
1436 for _ in 0..50 {
1437 builder.append_option(Some(&large_str));
1438 }
1439 let array = builder.finish();
1440 let gced = array.gc();
1441 // New data buffers should be populated (one or more blocks)
1442 assert!(
1443 !gced.data_buffers().is_empty(),
1444 "Expected data buffers for large values"
1445 );
1446 assert_eq!(gced.len(), 50);
1447 // verify that every large string emerges unchanged
1448 array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1449 assert_eq!(orig, got, "Large view mismatch after gc");
1450 });
1451 }
1452
1453 /// 4) All null elements: ensure null bitmap handling path is correct
1454 #[test]
1455 fn test_gc_all_nulls() {
1456 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1457 for _ in 0..20 {
1458 builder.append_null();
1459 }
1460 let array = builder.finish();
1461 let gced = array.gc();
1462 // length and null count match
1463 assert_eq!(gced.len(), 20);
1464 assert_eq!(gced.null_count(), 20);
1465 // data buffers remain empty for null-only array
1466 assert!(
1467 gced.data_buffers().is_empty(),
1468 "No data should be stored for nulls"
1469 );
1470 }
1471
1472 /// 5) Random mix of inline, large, and null values with slicing tests
1473 #[test]
1474 fn test_gc_random_mixed_and_slices() {
1475 let mut rng = StdRng::seed_from_u64(42);
1476 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1477 // Keep a Vec of original Option<String> for later comparison
1478 let mut original: Vec<Option<String>> = Vec::new();
1479
1480 for _ in 0..200 {
1481 if rng.random_bool(0.1) {
1482 // 10% nulls
1483 builder.append_null();
1484 original.push(None);
1485 } else {
1486 // random length between 0 and twice the inline limit
1487 let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2));
1488 let s: String = "A".repeat(len as usize);
1489 builder.append_option(Some(&s));
1490 original.push(Some(s));
1491 }
1492 }
1493
1494 let array = builder.finish();
1495 // Test multiple slice ranges to ensure offset logic is correct
1496 for (offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] {
1497 let sliced = array.slice(*offset, *slice_len);
1498 let gced = sliced.gc();
1499 // Build expected slice of Option<&str>
1500 let expected: Vec<Option<&str>> = original[*offset..(*offset + *slice_len)]
1501 .iter()
1502 .map(|opt| opt.as_deref())
1503 .collect();
1504
1505 assert_eq!(gced.len(), *slice_len, "Slice length mismatch");
1506 // Compare element-wise
1507 gced.iter().zip(expected.iter()).for_each(|(got, expect)| {
1508 assert_eq!(got, *expect, "Value mismatch in mixed slice after gc");
1509 });
1510 }
1511 }
1512
1513 #[test]
1514 #[cfg_attr(miri, ignore)] // Takes too long
1515 fn test_gc_huge_array() {
1516 // Construct multiple 128 MiB BinaryView entries so total > 4 GiB
1517 let block_len: usize = 128 * 1024 * 1024; // 128 MiB per view
1518 let num_views: usize = 36;
1519
1520 // Create a single 128 MiB data block with a simple byte pattern
1521 let buffer = Buffer::from_vec(vec![0xAB; block_len]);
1522 let buffer2 = Buffer::from_vec(vec![0xFF; block_len]);
1523
1524 // Append this block and then add many views pointing to it
1525 let mut builder = BinaryViewBuilder::new();
1526 let block_id = builder.append_block(buffer);
1527 for _ in 0..num_views / 2 {
1528 builder
1529 .try_append_view(block_id, 0, block_len as u32)
1530 .expect("append view into 128MiB block");
1531 }
1532 let block_id2 = builder.append_block(buffer2);
1533 for _ in 0..num_views / 2 {
1534 builder
1535 .try_append_view(block_id2, 0, block_len as u32)
1536 .expect("append view into 128MiB block");
1537 }
1538
1539 let array = builder.finish();
1540 let total = array.total_buffer_bytes_used();
1541 assert!(
1542 total > u32::MAX as usize,
1543 "Expected total non-inline bytes to exceed 4 GiB, got {}",
1544 total
1545 );
1546
1547 // Run gc and verify correctness
1548 let gced = array.gc();
1549 assert_eq!(gced.len(), num_views, "Length mismatch after gc");
1550 assert_eq!(gced.null_count(), 0, "Null count mismatch after gc");
1551 assert_ne!(
1552 gced.data_buffers().len(),
1553 1,
1554 "gc with huge buffer should not consolidate data into a single buffer"
1555 );
1556
1557 // Element-wise equality check across the entire array
1558 array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1559 assert_eq!(orig, got, "Value mismatch after gc on huge array");
1560 });
1561 }
1562
1563 #[test]
1564 fn test_eq() {
1565 let test_data = [
1566 Some("longer than 12 bytes"),
1567 None,
1568 Some("short"),
1569 Some("again, this is longer than 12 bytes"),
1570 ];
1571
1572 let array1 = {
1573 let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
1574 test_data.into_iter().for_each(|v| builder.append_option(v));
1575 builder.finish()
1576 };
1577 let array2 = {
1578 // create a new array with the same data but different layout
1579 let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
1580 test_data.into_iter().for_each(|v| builder.append_option(v));
1581 builder.finish()
1582 };
1583 assert_eq!(array1, array1.clone());
1584 assert_eq!(array2, array2.clone());
1585 assert_eq!(array1, array2);
1586 }
1587
1588 /// Integration tests for `inline_key_fast` covering:
1589 ///
1590 /// 1. Monotonic ordering across increasing lengths and lexical variations.
1591 /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence.
1592 ///
1593 /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why
1594 /// the length field is required even when all inline bytes fit in 12 bytes.
1595 ///
1596 /// The test includes strings that verify correct byte order (prevent reversal bugs),
1597 /// and length-based tie-breaking in the composite key.
1598 ///
1599 /// The test confirms that `inline_key_fast` produces keys which sort consistently
1600 /// with the expected lexicographical order of the raw byte arrays.
1601 #[test]
1602 fn test_inline_key_fast_various_lengths_and_lexical() {
1603 /// Helper to create a raw u128 value representing an inline ByteView:
1604 /// - `length`: number of meaningful bytes (must be ≤ 12)
1605 /// - `data`: the actual inline data bytes
1606 ///
1607 /// The first 4 bytes encode length in little-endian,
1608 /// the following 12 bytes contain the inline string data (unpadded).
1609 fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
1610 assert!(length as usize <= 12, "Inline length must be ≤ 12");
1611 assert!(
1612 data.len() == length as usize,
1613 "Data length must match `length`"
1614 );
1615
1616 let mut raw_bytes = [0u8; 16];
1617 raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian
1618 raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data
1619 u128::from_le_bytes(raw_bytes)
1620 }
1621
1622 // Test inputs: various lengths and lexical orders,
1623 // plus special cases for byte order and length tie-breaking
1624 let test_inputs: Vec<&[u8]> = vec![
1625 b"a",
1626 b"aa",
1627 b"aaa",
1628 b"aab",
1629 b"abcd",
1630 b"abcde",
1631 b"abcdef",
1632 b"abcdefg",
1633 b"abcdefgh",
1634 b"abcdefghi",
1635 b"abcdefghij",
1636 b"abcdefghijk",
1637 b"abcdefghijkl",
1638 // Tests for byte-order reversal bug:
1639 // Without the fix, "backend one" would compare as "eno dnekcab",
1640 // causing incorrect sort order relative to "backend two".
1641 b"backend one",
1642 b"backend two",
1643 // Tests length-tiebreaker logic:
1644 // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data,
1645 // so only the length differentiates their ordering.
1646 b"bar",
1647 b"bar\0",
1648 // Additional lexical and length tie-breaking cases with same prefix, in correct lex order:
1649 b"than12Byt",
1650 b"than12Bytes",
1651 b"than12Bytes\0",
1652 b"than12Bytesx",
1653 b"than12Bytex",
1654 b"than12Bytez",
1655 // Additional lexical tests
1656 b"xyy",
1657 b"xyz",
1658 b"xza",
1659 ];
1660
1661 // Create a GenericBinaryArray for cross-comparison of lex order
1662 let array: GenericBinaryArray<i32> =
1663 GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>());
1664
1665 for i in 0..array.len() - 1 {
1666 let v1 = array.value(i);
1667 let v2 = array.value(i + 1);
1668
1669 // Assert the array's natural lexical ordering is correct
1670 assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");
1671
1672 // Assert the keys produced by inline_key_fast reflect the same ordering
1673 let key1 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1674 v1.len() as u32,
1675 v1,
1676 ));
1677 let key2 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1678 v2.len() as u32,
1679 v2,
1680 ));
1681
1682 assert!(
1683 key1 < key2,
1684 "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
1685 );
1686 }
1687 }
1688
1689 #[test]
1690 fn empty_array_should_return_empty_lengths_iterator() {
1691 let empty = GenericByteViewArray::<BinaryViewType>::from(Vec::<&[u8]>::new());
1692
1693 let mut lengths_iter = empty.lengths();
1694 assert_eq!(lengths_iter.len(), 0);
1695 assert_eq!(lengths_iter.next(), None);
1696 }
1697
1698 #[test]
1699 fn array_lengths_should_return_correct_length_for_both_inlined_and_non_inlined() {
1700 let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1701 // Not inlined as longer than 12 bytes
1702 b"Supercalifragilisticexpialidocious" as &[u8],
1703 // Inlined as shorter than 12 bytes
1704 b"Hello",
1705 // Empty value
1706 b"",
1707 // Exactly 12 bytes
1708 b"abcdefghijkl",
1709 ]);
1710
1711 let mut lengths_iter = cases.lengths();
1712
1713 assert_eq!(lengths_iter.len(), cases.len());
1714
1715 let cases_iter = cases.iter();
1716
1717 for case in cases_iter {
1718 let case_value = case.unwrap();
1719 let length = lengths_iter.next().expect("Should have a length");
1720
1721 assert_eq!(case_value.len(), length as usize);
1722 }
1723
1724 assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1725 }
1726
1727 #[test]
1728 fn array_lengths_should_return_the_underlying_length_for_null_values() {
1729 let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1730 // Not inlined as longer than 12 bytes
1731 b"Supercalifragilisticexpialidocious" as &[u8],
1732 // Inlined as shorter than 12 bytes
1733 b"Hello",
1734 // Empty value
1735 b"",
1736 // Exactly 12 bytes
1737 b"abcdefghijkl",
1738 ]);
1739
1740 let (views, buffer, _) = cases.clone().into_parts();
1741
1742 // Keeping the values but just adding nulls on top
1743 let cases_with_all_nulls = GenericByteViewArray::<BinaryViewType>::new(
1744 views,
1745 buffer,
1746 Some(NullBuffer::new_null(cases.len())),
1747 );
1748
1749 let lengths_iter = cases.lengths();
1750 let mut all_nulls_lengths_iter = cases_with_all_nulls.lengths();
1751
1752 assert_eq!(lengths_iter.len(), all_nulls_lengths_iter.len());
1753
1754 for expected_length in lengths_iter {
1755 let actual_length = all_nulls_lengths_iter.next().expect("Should have a length");
1756
1757 assert_eq!(expected_length, actual_length);
1758 }
1759
1760 assert_eq!(
1761 all_nulls_lengths_iter.next(),
1762 None,
1763 "Should not have more lengths"
1764 );
1765 }
1766
1767 #[test]
1768 fn array_lengths_on_sliced_should_only_return_lengths_for_sliced_data() {
1769 let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1770 b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1771 b"Hello",
1772 b"something great",
1773 b"is",
1774 b"coming soon!",
1775 b"when you find what it is",
1776 b"let me know",
1777 b"cause",
1778 b"I",
1779 b"have no idea",
1780 b"what it",
1781 b"is",
1782 ]);
1783
1784 let sliced_array = array.slice(2, array.len() - 3);
1785
1786 let mut lengths_iter = sliced_array.lengths();
1787
1788 assert_eq!(lengths_iter.len(), sliced_array.len());
1789
1790 let values_iter = sliced_array.iter();
1791
1792 for value in values_iter {
1793 let value = value.unwrap();
1794 let length = lengths_iter.next().expect("Should have a length");
1795
1796 assert_eq!(value.len(), length as usize);
1797 }
1798
1799 assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1800 }
1801
1802 #[should_panic(expected = "Mismatched data type, expected Utf8View, got BinaryView")]
1803 #[test]
1804 fn invalid_casting_from_array_data() {
1805 // Should not be able to cast to StringViewArray due to invalid UTF-8
1806 let array_data = binary_view_array_with_invalid_utf8_data().into_data();
1807 let _ = StringViewArray::from(array_data);
1808 }
1809
1810 #[should_panic(expected = "invalid utf-8 sequence")]
1811 #[test]
1812 fn invalid_array_data() {
1813 let (views, buffers, nulls) = binary_view_array_with_invalid_utf8_data().into_parts();
1814
1815 // manually try and add invalid array data with Utf8View data type
1816 let mut builder = ArrayDataBuilder::new(DataType::Utf8View)
1817 .add_buffer(views.into_inner())
1818 .len(3);
1819 for buffer in buffers.iter() {
1820 builder = builder.add_buffer(buffer.clone())
1821 }
1822 builder = builder.nulls(nulls);
1823
1824 let data = builder.build().unwrap(); // should fail validation
1825 let _arr = StringViewArray::from(data);
1826 }
1827
1828 /// Returns a BinaryViewArray with one invalid UTF-8 value
1829 fn binary_view_array_with_invalid_utf8_data() -> BinaryViewArray {
1830 let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1831 b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1832 &[
1833 0xf0, 0x80, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1834 0x00, 0x00,
1835 ],
1836 b"good",
1837 ]);
1838 assert!(from_utf8(array.value(0)).is_ok());
1839 assert!(from_utf8(array.value(1)).is_err()); // value 1 is invalid utf8
1840 assert!(from_utf8(array.value(2)).is_ok());
1841 array
1842 }
1843
1844 #[test]
1845 fn test_total_bytes_len() {
1846 // inlined: "hello"=5, "world"=5, "lulu"=4 → 14
1847 // non-inlined: "large payload over 12 bytes"=27
1848 // null: should not count
1849 let mut builder = StringViewBuilder::new();
1850 builder.append_value("hello");
1851 builder.append_value("world");
1852 builder.append_value("lulu");
1853 builder.append_null();
1854 builder.append_value("large payload over 12 bytes");
1855 let array = builder.finish();
1856 assert_eq!(array.total_bytes_len(), 5 + 5 + 4 + 27);
1857 }
1858
1859 #[test]
1860 fn test_total_bytes_len_empty() {
1861 let array = StringViewArray::from_iter::<Vec<Option<&str>>>(vec![]);
1862 assert_eq!(array.total_bytes_len(), 0);
1863 }
1864
1865 #[test]
1866 fn test_total_bytes_len_all_nulls() {
1867 let array = StringViewArray::new_null(5);
1868 assert_eq!(array.total_bytes_len(), 0);
1869 }
1870
1871 #[test]
1872 fn test_total_bytes_len_binary_view() {
1873 let array = BinaryViewArray::from_iter(vec![
1874 Some(b"hi".as_ref()),
1875 None,
1876 Some(b"large payload over 12 bytes".as_ref()),
1877 ]);
1878 assert_eq!(array.total_bytes_len(), 2 + 27);
1879 }
1880}