arrow_array/array/byte_view_array.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
26use arrow_schema::{ArrowError, DataType};
27use core::str;
28use num_traits::ToPrimitive;
29use std::any::Any;
30use std::cmp::Ordering;
31use std::fmt::Debug;
32use std::marker::PhantomData;
33use std::sync::Arc;
34
35use super::ByteArrayType;
36
37/// [Variable-size Binary View Layout]: An array of variable length bytes views.
38///
39/// This array type is used to store variable length byte data (e.g. Strings, Binary)
40/// and has efficient operations such as `take`, `filter`, and comparison.
41///
42/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
43///
44/// This is different from [`GenericByteArray`], which also stores variable
45/// length byte data, as it represents strings with an offset and length. `take`
46/// and `filter` like operations are implemented by manipulating the "views"
47/// (`u128`) without modifying the bytes. Each view also stores an inlined
48/// prefix which speed up comparisons.
49///
50/// # See Also
51///
52/// * [`StringViewArray`] for storing utf8 encoded string data
53/// * [`BinaryViewArray`] for storing bytes
54/// * [`ByteView`] to interpret `u128`s layout of the views.
55///
56/// [`ByteView`]: arrow_data::ByteView
57///
58/// # Layout: "views" and buffers
59///
60/// A `GenericByteViewArray` stores variable length byte strings. An array of
61/// `N` elements is stored as `N` fixed length "views" and a variable number
62/// of variable length "buffers".
63///
64/// Each view is a `u128` value whose layout is different depending on the
65/// length of the string stored at that location:
66///
67/// ```text
68/// ┌──────┬────────────────────────┐
69/// │length│ string value │
70/// Strings (len <= 12) │ │ (padded with 0) │
71/// └──────┴────────────────────────┘
72/// 0 31 127
73///
74/// ┌───────┬───────┬───────┬───────┐
75/// │length │prefix │ buf │offset │
76/// Strings (len > 12) │ │ │ index │ │
77/// └───────┴───────┴───────┴───────┘
78/// 0 31 63 95 127
79/// ```
80///
81/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
82/// the view. See [`Self::inline_value`] to access the inlined prefix from a
83/// short view.
84///
85/// * Strings with length > 12: The first four bytes are stored inline in the
86/// view and the entire string is stored in one of the buffers. See [`ByteView`]
87/// to access the fields of the these views.
88///
89/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
90/// the easiest and fastest way to work with this data. However, it is possible
91/// to access the views and buffers directly for more control.
92///
93/// For example
94///
95/// ```rust
96/// # use arrow_array::StringViewArray;
97/// # use arrow_array::Array;
98/// use arrow_data::ByteView;
99/// let array = StringViewArray::from(vec![
100/// "hello",
101/// "this string is longer than 12 bytes",
102/// "this string is also longer than 12 bytes"
103/// ]);
104///
105/// // ** Examine the first view (short string) **
106/// assert!(array.is_valid(0)); // Check for nulls
107/// let short_view: u128 = array.views()[0]; // "hello"
108/// // get length of the string
109/// let len = short_view as u32;
110/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view
111/// // SAFETY: `view` is a valid view
112/// let value = unsafe {
113/// StringViewArray::inline_value(&short_view, len as usize)
114/// };
115/// assert_eq!(value, b"hello");
116///
117/// // ** Examine the third view (long string) **
118/// assert!(array.is_valid(12)); // Check for nulls
119/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
120/// let len = long_view as u32;
121/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
122/// let view = ByteView::from(long_view); // use ByteView to access the fields
123/// assert_eq!(view.length, 40);
124/// assert_eq!(view.buffer_index, 0);
125/// assert_eq!(view.offset, 35); // data starts after the first long string
126/// // Views for long strings store a 4 byte prefix
127/// let prefix = view.prefix.to_le_bytes();
128/// assert_eq!(&prefix, b"this");
129/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
130/// assert_eq!(value, "this string is also longer than 12 bytes");
131/// ```
132///
133/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
134/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
135///
136/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
137/// than they must point into a valid buffer. However, they can be out of order,
138/// non continuous and overlapping.
139///
140/// For example, in the following diagram, the strings "FishWasInTownToday" and
141/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
142/// separate buffer while the string "LavaMonster" is stored inlined in the
143/// view. In this case, the same bytes for "Fish" are used to store both strings.
144///
145/// [`ByteView`]: arrow_data::ByteView
146///
147/// ```text
148/// ┌───┐
149/// ┌──────┬──────┬──────┬──────┐ offset │...│
150/// "FishWasInTownTodayYay" │ 21 │ Fish │ 0 │ 115 │─ ─ 103 │Mr.│
151/// └──────┴──────┴──────┴──────┘ │ ┌ ─ ─ ─ ─ ▶ │Cru│
152/// ┌──────┬──────┬──────┬──────┐ │mpl│
153/// "CrumpleFacedFish" │ 16 │ Crum │ 0 │ 103 │─ ─│─ ─ ─ ┘ │eFa│
154/// └──────┴──────┴──────┴──────┘ │ced│
155/// ┌──────┬────────────────────┐ └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
156/// "LavaMonster" │ 11 │ LavaMonster │ │hWa│
157/// └──────┴────────────────────┘ offset │sIn│
158/// 115 │Tow│
159/// │nTo│
160/// │day│
161/// u128 "views" │Yay│
162/// buffer 0 │...│
163/// └───┘
164/// ```
165pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
166 data_type: DataType,
167 views: ScalarBuffer<u128>,
168 buffers: Arc<[Buffer]>,
169 phantom: PhantomData<T>,
170 nulls: Option<NullBuffer>,
171}
172
173impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
174 fn clone(&self) -> Self {
175 Self {
176 data_type: T::DATA_TYPE,
177 views: self.views.clone(),
178 buffers: self.buffers.clone(),
179 nulls: self.nulls.clone(),
180 phantom: Default::default(),
181 }
182 }
183}
184
185impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
186 /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
187 ///
188 /// # Panics
189 ///
190 /// Panics if [`GenericByteViewArray::try_new`] returns an error
191 pub fn new<U>(views: ScalarBuffer<u128>, buffers: U, nulls: Option<NullBuffer>) -> Self
192 where
193 U: Into<Arc<[Buffer]>>,
194 {
195 Self::try_new(views, buffers, nulls).unwrap()
196 }
197
198 /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
199 ///
200 /// # Errors
201 ///
202 /// * `views.len() != nulls.len()`
203 /// * [ByteViewType::validate] fails
204 pub fn try_new<U>(
205 views: ScalarBuffer<u128>,
206 buffers: U,
207 nulls: Option<NullBuffer>,
208 ) -> Result<Self, ArrowError>
209 where
210 U: Into<Arc<[Buffer]>>,
211 {
212 let buffers: Arc<[Buffer]> = buffers.into();
213
214 T::validate(&views, &buffers)?;
215
216 if let Some(n) = nulls.as_ref() {
217 if n.len() != views.len() {
218 return Err(ArrowError::InvalidArgumentError(format!(
219 "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
220 T::PREFIX,
221 views.len(),
222 n.len(),
223 )));
224 }
225 }
226
227 Ok(Self {
228 data_type: T::DATA_TYPE,
229 views,
230 buffers,
231 nulls,
232 phantom: Default::default(),
233 })
234 }
235
236 /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
237 ///
238 /// # Safety
239 ///
240 /// Safe if [`Self::try_new`] would not error
241 pub unsafe fn new_unchecked<U>(
242 views: ScalarBuffer<u128>,
243 buffers: U,
244 nulls: Option<NullBuffer>,
245 ) -> Self
246 where
247 U: Into<Arc<[Buffer]>>,
248 {
249 if cfg!(feature = "force_validate") {
250 return Self::new(views, buffers, nulls);
251 }
252
253 Self {
254 data_type: T::DATA_TYPE,
255 phantom: Default::default(),
256 views,
257 buffers: buffers.into(),
258 nulls,
259 }
260 }
261
262 /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
263 pub fn new_null(len: usize) -> Self {
264 Self {
265 data_type: T::DATA_TYPE,
266 views: vec![0; len].into(),
267 buffers: vec![].into(),
268 nulls: Some(NullBuffer::new_null(len)),
269 phantom: Default::default(),
270 }
271 }
272
273 /// Create a new [`Scalar`] from `value`
274 pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
275 Scalar::new(Self::from_iter_values(std::iter::once(value)))
276 }
277
278 /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
279 pub fn from_iter_values<Ptr, I>(iter: I) -> Self
280 where
281 Ptr: AsRef<T::Native>,
282 I: IntoIterator<Item = Ptr>,
283 {
284 let iter = iter.into_iter();
285 let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
286 for v in iter {
287 builder.append_value(v);
288 }
289 builder.finish()
290 }
291
292 /// Deconstruct this array into its constituent parts
293 pub fn into_parts(self) -> (ScalarBuffer<u128>, Arc<[Buffer]>, Option<NullBuffer>) {
294 (self.views, self.buffers, self.nulls)
295 }
296
297 /// Returns the views buffer
298 #[inline]
299 pub fn views(&self) -> &ScalarBuffer<u128> {
300 &self.views
301 }
302
303 /// Returns the buffers storing string data
304 #[inline]
305 pub fn data_buffers(&self) -> &[Buffer] {
306 &self.buffers
307 }
308
309 /// Returns the element at index `i`
310 ///
311 /// Note: This method does not check for nulls and the value is arbitrary
312 /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
313 ///
314 /// # Panics
315 /// Panics if index `i` is out of bounds.
316 pub fn value(&self, i: usize) -> &T::Native {
317 assert!(
318 i < self.len(),
319 "Trying to access an element at index {} from a {}ViewArray of length {}",
320 i,
321 T::PREFIX,
322 self.len()
323 );
324
325 unsafe { self.value_unchecked(i) }
326 }
327
328 /// Returns the element at index `i` without bounds checking
329 ///
330 /// Note: This method does not check for nulls and the value is arbitrary
331 /// if [`is_null`](Self::is_null) returns true for the index.
332 ///
333 /// # Safety
334 ///
335 /// Caller is responsible for ensuring that the index is within the bounds
336 /// of the array
337 pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
338 let v = unsafe { self.views.get_unchecked(idx) };
339 let len = *v as u32;
340 let b = if len <= MAX_INLINE_VIEW_LEN {
341 unsafe { Self::inline_value(v, len as usize) }
342 } else {
343 let view = ByteView::from(*v);
344 let data = unsafe { self.buffers.get_unchecked(view.buffer_index as usize) };
345 let offset = view.offset as usize;
346 unsafe { data.get_unchecked(offset..offset + len as usize) }
347 };
348 unsafe { T::Native::from_bytes_unchecked(b) }
349 }
350
351 /// Returns the first `len` bytes the inline value of the view.
352 ///
353 /// # Safety
354 /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
355 /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
356 #[inline(always)]
357 pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
358 debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
359 unsafe {
360 std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
361 }
362 }
363
364 /// Constructs a new iterator for iterating over the values of this array
365 pub fn iter(&self) -> ArrayIter<&Self> {
366 ArrayIter::new(self)
367 }
368
369 /// Returns an iterator over the bytes of this array, including null values
370 pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
371 self.views.iter().map(move |v| {
372 let len = *v as u32;
373 if len <= MAX_INLINE_VIEW_LEN {
374 unsafe { Self::inline_value(v, len as usize) }
375 } else {
376 let view = ByteView::from(*v);
377 let data = &self.buffers[view.buffer_index as usize];
378 let offset = view.offset as usize;
379 unsafe { data.get_unchecked(offset..offset + len as usize) }
380 }
381 })
382 }
383
384 /// Returns an iterator over the first `prefix_len` bytes of each array
385 /// element, including null values.
386 ///
387 /// If `prefix_len` is larger than the element's length, the iterator will
388 /// return an empty slice (`&[]`).
389 pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
390 self.views().into_iter().map(move |v| {
391 let len = (*v as u32) as usize;
392
393 if len < prefix_len {
394 return &[] as &[u8];
395 }
396
397 if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
398 unsafe { StringViewArray::inline_value(v, prefix_len) }
399 } else {
400 let view = ByteView::from(*v);
401 let data = unsafe {
402 self.data_buffers()
403 .get_unchecked(view.buffer_index as usize)
404 };
405 let offset = view.offset as usize;
406 unsafe { data.get_unchecked(offset..offset + prefix_len) }
407 }
408 })
409 }
410
411 /// Returns an iterator over the last `suffix_len` bytes of each array
412 /// element, including null values.
413 ///
414 /// Note that for [`StringViewArray`] the last bytes may start in the middle
415 /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
416 ///
417 /// If `suffix_len` is larger than the element's length, the iterator will
418 /// return an empty slice (`&[]`).
419 pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
420 self.views().into_iter().map(move |v| {
421 let len = (*v as u32) as usize;
422
423 if len < suffix_len {
424 return &[] as &[u8];
425 }
426
427 if len as u32 <= MAX_INLINE_VIEW_LEN {
428 unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
429 } else {
430 let view = ByteView::from(*v);
431 let data = unsafe {
432 self.data_buffers()
433 .get_unchecked(view.buffer_index as usize)
434 };
435 let offset = view.offset as usize;
436 unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
437 }
438 })
439 }
440
441 /// Return an iterator over the length of each array element, including null values.
442 ///
443 /// Null values length would equal to the underlying bytes length and NOT 0
444 ///
445 /// Example of getting 0 for null values
446 /// ```rust
447 /// # use arrow_array::StringViewArray;
448 /// # use arrow_array::Array;
449 /// use arrow_data::ByteView;
450 ///
451 /// fn lengths_with_zero_for_nulls(view: &StringViewArray) -> impl Iterator<Item = u32> {
452 /// view.lengths()
453 /// .enumerate()
454 /// .map(|(index, length)| if view.is_null(index) { 0 } else { length })
455 /// }
456 /// ```
457 pub fn lengths(&self) -> impl ExactSizeIterator<Item = u32> + Clone {
458 self.views().iter().map(|v| *v as u32)
459 }
460
461 /// Returns a zero-copy slice of this array with the indicated offset and length.
462 pub fn slice(&self, offset: usize, length: usize) -> Self {
463 Self {
464 data_type: T::DATA_TYPE,
465 views: self.views.slice(offset, length),
466 buffers: self.buffers.clone(),
467 nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
468 phantom: Default::default(),
469 }
470 }
471
472 /// Returns a "compacted" version of this array
473 ///
474 /// The original array will *not* be modified
475 ///
476 /// # Garbage Collection
477 ///
478 /// Before GC:
479 /// ```text
480 /// ┌──────┐
481 /// │......│
482 /// │......│
483 /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
484 /// │ View 1 │─ ─ ─ ─ │......│ with data that
485 /// ├────────────────────┤ │......│ is not referred
486 /// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
487 /// └────────────────────┘ │......│ View 2
488 /// │......│
489 /// 2 views, refer to │......│
490 /// small portions of a └──────┘
491 /// large buffer
492 /// ```
493 ///
494 /// After GC:
495 ///
496 /// ```text
497 /// ┌────────────────────┐ ┌─────┐ After gc, only
498 /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
499 /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
500 /// │ View 2 │─ ─ ─ ─ └─────┘ the views is
501 /// └────────────────────┘ left
502 ///
503 ///
504 /// 2 views
505 /// ```
506 /// This method will compact the data buffers by recreating the view array and only include the data
507 /// that is pointed to by the views.
508 ///
509 /// Note that it will copy the array regardless of whether the original array is compact.
510 /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
511 /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
512 ///
513 /// Note: this function does not attempt to canonicalize / deduplicate values. For this
514 /// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
515 pub fn gc(&self) -> Self {
516 // 1) Read basic properties once
517 let len = self.len(); // number of elements
518 let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap
519
520 // 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks
521 if self.data_buffers().is_empty() {
522 return unsafe {
523 GenericByteViewArray::new_unchecked(
524 self.views().clone(),
525 vec![], // empty data blocks
526 nulls,
527 )
528 };
529 }
530
531 // 2) Calculate total size of all non-inline data and detect if any exists
532 let total_large = self.total_buffer_bytes_used();
533
534 // 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing
535 if total_large == 0 {
536 // Views are inline-only or all null; just reuse original views and no data blocks
537 return unsafe {
538 GenericByteViewArray::new_unchecked(
539 self.views().clone(),
540 vec![], // empty data blocks
541 nulls,
542 )
543 };
544 }
545
546 let (views_buf, data_blocks) = if total_large < i32::MAX as usize {
547 // fast path, the entire data fits in a single buffer
548 // 3) Allocate exactly capacity for all non-inline data
549 let mut data_buf = Vec::with_capacity(total_large);
550
551 // 4) Iterate over views and process each inline/non-inline view
552 let views_buf: Vec<u128> = (0..len)
553 .map(|i| unsafe { self.copy_view_to_buffer(i, 0, &mut data_buf) })
554 .collect();
555 let data_block = Buffer::from_vec(data_buf);
556 let data_blocks = vec![data_block];
557 (views_buf, data_blocks)
558 } else {
559 // slow path, need to split into multiple buffers
560
561 struct GcCopyGroup {
562 total_buffer_bytes: usize,
563 total_len: usize,
564 }
565
566 impl GcCopyGroup {
567 fn new(total_buffer_bytes: u32, total_len: usize) -> Self {
568 Self {
569 total_buffer_bytes: total_buffer_bytes as usize,
570 total_len,
571 }
572 }
573 }
574
575 let mut groups = Vec::new();
576 let mut current_length = 0;
577 let mut current_elements = 0;
578
579 for view in self.views() {
580 let len = *view as u32;
581 if len > MAX_INLINE_VIEW_LEN {
582 if current_length + len > i32::MAX as u32 {
583 // Start a new group
584 groups.push(GcCopyGroup::new(current_length, current_elements));
585 current_length = 0;
586 current_elements = 0;
587 }
588 current_length += len;
589 current_elements += 1;
590 }
591 }
592 if current_elements != 0 {
593 groups.push(GcCopyGroup::new(current_length, current_elements));
594 }
595 debug_assert!(groups.len() <= i32::MAX as usize);
596
597 // 3) Copy the buffers group by group
598 let mut views_buf = Vec::with_capacity(len);
599 let mut data_blocks = Vec::with_capacity(groups.len());
600
601 let mut current_view_idx = 0;
602
603 for (group_idx, gc_copy_group) in groups.iter().enumerate() {
604 let mut data_buf = Vec::with_capacity(gc_copy_group.total_buffer_bytes);
605
606 // Directly push views to avoid intermediate Vec allocation
607 let new_views = (current_view_idx..current_view_idx + gc_copy_group.total_len).map(
608 |view_idx| {
609 // safety: the view index came from iterating over valid range
610 unsafe {
611 self.copy_view_to_buffer(view_idx, group_idx as i32, &mut data_buf)
612 }
613 },
614 );
615 views_buf.extend(new_views);
616
617 data_blocks.push(Buffer::from_vec(data_buf));
618 current_view_idx += gc_copy_group.total_len;
619 }
620 (views_buf, data_blocks)
621 };
622
623 // 5) Wrap up views buffer
624 let views_scalar = ScalarBuffer::from(views_buf);
625
626 // SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized
627 unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) }
628 }
629
630 /// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer.
631 ///
632 /// # Safety
633 ///
634 /// - `i < self.len()`.
635 /// - Every element in `self.views()` must currently refer to a valid slice
636 /// inside one of `self.buffers`.
637 /// - `data_buf` must be ready to have additional bytes appended.
638 /// - After this call, the returned view will have its
639 /// `buffer_index` reset to `buffer_idx` and its `offset` updated so that it points
640 /// into the bytes just appended at the end of `data_buf`.
641 #[inline(always)]
642 unsafe fn copy_view_to_buffer(
643 &self,
644 i: usize,
645 buffer_idx: i32,
646 data_buf: &mut Vec<u8>,
647 ) -> u128 {
648 // SAFETY: `i < self.len()` ensures this is in‑bounds.
649 let raw_view = unsafe { *self.views().get_unchecked(i) };
650 let mut bv = ByteView::from(raw_view);
651
652 // Inline‑small views stay as‑is.
653 if bv.length <= MAX_INLINE_VIEW_LEN {
654 raw_view
655 } else {
656 // SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length`
657 // must both lie within valid ranges for `self.buffers`.
658 let buffer = unsafe { self.buffers.get_unchecked(bv.buffer_index as usize) };
659 let start = bv.offset as usize;
660 let end = start + bv.length as usize;
661 let slice = unsafe { buffer.get_unchecked(start..end) };
662
663 // Copy out‑of‑line data into our single “0” buffer.
664 let new_offset = data_buf.len() as u32;
665 data_buf.extend_from_slice(slice);
666
667 bv.buffer_index = buffer_idx as u32;
668 bv.offset = new_offset;
669 bv.into()
670 }
671 }
672
673 /// Returns the total number of bytes used by all non inlined views in all
674 /// buffers.
675 ///
676 /// Note this does not account for views that point at the same underlying
677 /// data in buffers
678 ///
679 /// For example, if the array has three strings views:
680 /// * View with length = 9 (inlined)
681 /// * View with length = 32 (non inlined)
682 /// * View with length = 16 (non inlined)
683 ///
684 /// Then this method would report 48
685 pub fn total_buffer_bytes_used(&self) -> usize {
686 self.views()
687 .iter()
688 .map(|v| {
689 let len = *v as u32;
690 if len > MAX_INLINE_VIEW_LEN {
691 len as usize
692 } else {
693 0
694 }
695 })
696 .sum()
697 }
698
699 /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
700 ///
701 /// Comparing two ByteView types are non-trivial.
702 /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
703 ///
704 /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
705 /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
706 /// Meaning that reading one array element requires only one memory access
707 /// (two memory access required for StringArray, one for offset buffer, the other for value buffer).
708 ///
709 /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
710 /// thanks to the inlined 4 bytes.
711 /// Consider equality check:
712 /// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
713 ///
714 /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
715 /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
716 /// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
717 ///
718 /// # Order check flow
719 /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
720 /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
721 /// (2.1) if the inlined 4 bytes are different, we can return the result immediately.
722 /// (2.2) o.w., we need to compare the full string.
723 ///
724 /// # Safety
725 /// The left/right_idx must within range of each array
726 pub unsafe fn compare_unchecked(
727 left: &GenericByteViewArray<T>,
728 left_idx: usize,
729 right: &GenericByteViewArray<T>,
730 right_idx: usize,
731 ) -> Ordering {
732 let l_view = unsafe { left.views().get_unchecked(left_idx) };
733 let l_byte_view = ByteView::from(*l_view);
734
735 let r_view = unsafe { right.views().get_unchecked(right_idx) };
736 let r_byte_view = ByteView::from(*r_view);
737
738 let l_len = l_byte_view.length;
739 let r_len = r_byte_view.length;
740
741 if l_len <= 12 && r_len <= 12 {
742 return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view));
743 }
744
745 // one of the string is larger than 12 bytes,
746 // we then try to compare the inlined data first
747
748 // Note: In theory, ByteView is only used for string which is larger than 12 bytes,
749 // but we can still use it to get the inlined prefix for shorter strings.
750 // The prefix is always the first 4 bytes of the view, for both short and long strings.
751 let l_inlined_be = l_byte_view.prefix.swap_bytes();
752 let r_inlined_be = r_byte_view.prefix.swap_bytes();
753 if l_inlined_be != r_inlined_be {
754 return l_inlined_be.cmp(&r_inlined_be);
755 }
756
757 // unfortunately, we need to compare the full data
758 let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
759 let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
760
761 l_full_data.cmp(r_full_data)
762 }
763
764 /// Builds a 128-bit composite key for an inline value:
765 ///
766 /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting).
767 /// - Low 32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings
768 /// (or those with fewer meaningful bytes) always numerically sort before longer ones.
769 ///
770 /// This function extracts the length and the 12-byte inline string data from the raw
771 /// little-endian `u128` representation, converts them to big-endian ordering, and packs them
772 /// into a single `u128` value suitable for fast, branchless comparisons.
773 ///
774 /// # Why include length?
775 ///
776 /// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes
777 /// compare equal—either because one is a true prefix of the other or because zero-padding
778 /// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare
779 /// handles both content and length in one go.
780 ///
781 /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes)
782 ///
783 /// | String | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding) |
784 /// |------------|-----------------------|---------------------------------|
785 /// | `"bar"` | `03 00 00 00` | `62 61 72` + 9 × `00` |
786 /// | `"bar\0"`| `04 00 00 00` | `62 61 72 00` + 8 × `00` |
787 ///
788 /// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field
789 /// then differentiates:
790 ///
791 /// ```text
792 /// key("bar") = 0x0000000000000000000062617200000003
793 /// key("bar\0") = 0x0000000000000000000062617200000004
794 /// ⇒ key("bar") < key("bar\0")
795 /// ```
796 /// - `raw` is treated as a 128-bit integer with its bits laid out as follows:
797 /// - bits 0–31: length (little-endian)
798 /// - bits 32–127: data (little-endian)
799 ///
800 /// # Inlining and Endianness
801 ///
802 /// This function uses platform-independent bitwise operations to construct a 128-bit key:
803 /// - `raw.swap_bytes() << 32` effectively clears the length bits and shifts the 12-byte inline data
804 /// into the high 96 bits in Big-Endian order. This ensures the first byte of the string
805 /// is the most significant byte of the resulting `u128`.
806 /// - `raw as u32` extracts the length as a numeric integer, which is then placed in the low 32 bits.
807 #[inline(always)]
808 pub fn inline_key_fast(raw: u128) -> u128 {
809 (raw.swap_bytes() << 32) | (raw as u32 as u128)
810 }
811}
812
813impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
814 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
815 write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
816 print_long_array(self, f, |array, index, f| {
817 std::fmt::Debug::fmt(&array.value(index), f)
818 })?;
819 write!(f, "]")
820 }
821}
822
823/// SAFETY: Correctly implements the contract of Arrow Arrays
824unsafe impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
825 fn as_any(&self) -> &dyn Any {
826 self
827 }
828
829 fn to_data(&self) -> ArrayData {
830 self.clone().into()
831 }
832
833 fn into_data(self) -> ArrayData {
834 self.into()
835 }
836
837 fn data_type(&self) -> &DataType {
838 &self.data_type
839 }
840
841 fn slice(&self, offset: usize, length: usize) -> ArrayRef {
842 Arc::new(self.slice(offset, length))
843 }
844
845 fn len(&self) -> usize {
846 self.views.len()
847 }
848
849 fn is_empty(&self) -> bool {
850 self.views.is_empty()
851 }
852
853 fn shrink_to_fit(&mut self) {
854 self.views.shrink_to_fit();
855
856 // The goal of `shrink_to_fit` is to minimize the space used by any of
857 // its allocations. The use of `Arc::get_mut` over `Arc::make_mut` is
858 // because if the reference count is greater than 1, `Arc::make_mut`
859 // will first clone its contents. So, any large allocations will first
860 // be cloned before being shrunk, leaving the pre-cloned allocations
861 // intact, before adding the extra (used) space of the new clones.
862 if let Some(buffers) = Arc::get_mut(&mut self.buffers) {
863 buffers.iter_mut().for_each(|b| b.shrink_to_fit());
864 }
865
866 // With the assumption that this is a best-effort function, no attempt
867 // is made to shrink `self.buffers`, which it can't because it's type
868 // does not expose a `shrink_to_fit` method.
869
870 if let Some(nulls) = &mut self.nulls {
871 nulls.shrink_to_fit();
872 }
873 }
874
875 fn offset(&self) -> usize {
876 0
877 }
878
879 fn nulls(&self) -> Option<&NullBuffer> {
880 self.nulls.as_ref()
881 }
882
883 fn logical_null_count(&self) -> usize {
884 // More efficient that the default implementation
885 self.null_count()
886 }
887
888 fn get_buffer_memory_size(&self) -> usize {
889 let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
890 sum += self.views.inner().capacity();
891 if let Some(x) = &self.nulls {
892 sum += x.buffer().capacity()
893 }
894 sum
895 }
896
897 fn get_array_memory_size(&self) -> usize {
898 std::mem::size_of::<Self>() + self.get_buffer_memory_size()
899 }
900}
901
902impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
903 type Item = &'a T::Native;
904
905 fn value(&self, index: usize) -> Self::Item {
906 GenericByteViewArray::value(self, index)
907 }
908
909 unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
910 unsafe { GenericByteViewArray::value_unchecked(self, index) }
911 }
912}
913
914impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
915 type Item = Option<&'a T::Native>;
916 type IntoIter = ArrayIter<Self>;
917
918 fn into_iter(self) -> Self::IntoIter {
919 ArrayIter::new(self)
920 }
921}
922
923impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
924 fn from(data: ArrayData) -> Self {
925 let (data_type, len, nulls, offset, buffers, _child_data) = data.into_parts();
926 assert_eq!(
927 data_type,
928 T::DATA_TYPE,
929 "Mismatched data type, expected {}, got {data_type}",
930 T::DATA_TYPE
931 );
932 let mut buffers = buffers.into_iter();
933 // first buffer is views, remaining are data buffers
934 let views = ScalarBuffer::new(buffers.next().unwrap(), offset, len);
935 Self {
936 data_type,
937 views,
938 buffers: Arc::from_iter(buffers),
939 nulls,
940 phantom: Default::default(),
941 }
942 }
943}
944
945/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
946///
947/// For example this method can convert a [`StringArray`] to a
948/// [`StringViewArray`].
949///
950/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
951/// is built without copying the underlying string data (views are created
952/// directly into the existing buffer)
953///
954/// [`StringArray`]: crate::StringArray
955impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
956where
957 FROM: ByteArrayType,
958 FROM::Offset: OffsetSizeTrait + ToPrimitive,
959 V: ByteViewType<Native = FROM::Native>,
960{
961 fn from(byte_array: &GenericByteArray<FROM>) -> Self {
962 let offsets = byte_array.offsets();
963
964 let can_reuse_buffer = match offsets.last() {
965 Some(offset) => offset.as_usize() < u32::MAX as usize,
966 None => true,
967 };
968
969 if can_reuse_buffer {
970 // build views directly pointing to the existing buffer
971 let len = byte_array.len();
972 let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
973 let str_values_buf = byte_array.values().clone();
974 let block = views_builder.append_block(str_values_buf);
975 for (i, w) in offsets.windows(2).enumerate() {
976 let offset = w[0].as_usize();
977 let end = w[1].as_usize();
978 let length = end - offset;
979
980 if byte_array.is_null(i) {
981 views_builder.append_null();
982 } else {
983 // Safety: the input was a valid array so it valid UTF8 (if string). And
984 // all offsets were valid
985 unsafe {
986 views_builder.append_view_unchecked(block, offset as u32, length as u32)
987 }
988 }
989 }
990 assert_eq!(views_builder.len(), len);
991 views_builder.finish()
992 } else {
993 // Otherwise, create a new buffer for large strings
994 // TODO: the original buffer could still be used
995 // by making multiple slices of u32::MAX length
996 GenericByteViewArray::<V>::from_iter(byte_array.iter())
997 }
998 }
999}
1000
1001impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
1002 fn from(array: GenericByteViewArray<T>) -> Self {
1003 let len = array.len();
1004
1005 let mut buffers = array.buffers.to_vec();
1006 buffers.insert(0, array.views.into_inner());
1007
1008 let builder = ArrayDataBuilder::new(T::DATA_TYPE)
1009 .len(len)
1010 .buffers(buffers)
1011 .nulls(array.nulls);
1012
1013 unsafe { builder.build_unchecked() }
1014 }
1015}
1016
1017impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
1018where
1019 Ptr: AsRef<T::Native> + 'a,
1020 T: ByteViewType + ?Sized,
1021{
1022 fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
1023 iter.into_iter()
1024 .map(|o| o.as_ref().map(|p| p.as_ref()))
1025 .collect()
1026 }
1027}
1028
1029impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
1030where
1031 Ptr: AsRef<T::Native>,
1032{
1033 fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
1034 let iter = iter.into_iter();
1035 let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
1036 builder.extend(iter);
1037 builder.finish()
1038 }
1039}
1040
1041/// A [`GenericByteViewArray`] of `[u8]`
1042///
1043/// See [`GenericByteViewArray`] for format and layout details.
1044///
1045/// # Example
1046/// ```
1047/// use arrow_array::BinaryViewArray;
1048/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
1049/// assert_eq!(array.value(0), b"hello");
1050/// assert_eq!(array.value(3), b"large payload over 12 bytes");
1051/// ```
1052pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
1053
1054impl BinaryViewArray {
1055 /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1056 /// If items not utf8 data, validate will fail and error returned.
1057 pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
1058 StringViewType::validate(self.views(), self.data_buffers())?;
1059 unsafe { Ok(self.to_string_view_unchecked()) }
1060 }
1061
1062 /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1063 /// # Safety
1064 /// Caller is responsible for ensuring that items in array are utf8 data.
1065 pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
1066 unsafe { StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1067 }
1068}
1069
1070impl From<Vec<&[u8]>> for BinaryViewArray {
1071 fn from(v: Vec<&[u8]>) -> Self {
1072 Self::from_iter_values(v)
1073 }
1074}
1075
1076impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
1077 fn from(v: Vec<Option<&[u8]>>) -> Self {
1078 v.into_iter().collect()
1079 }
1080}
1081
1082/// A [`GenericByteViewArray`] that stores utf8 data
1083///
1084/// See [`GenericByteViewArray`] for format and layout details.
1085///
1086/// # Example
1087/// ```
1088/// use arrow_array::StringViewArray;
1089/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
1090/// assert_eq!(array.value(0), "hello");
1091/// assert_eq!(array.value(3), "large payload over 12 bytes");
1092/// ```
1093pub type StringViewArray = GenericByteViewArray<StringViewType>;
1094
1095impl StringViewArray {
1096 /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
1097 pub fn to_binary_view(self) -> BinaryViewArray {
1098 unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1099 }
1100
1101 /// Returns true if all data within this array is ASCII
1102 pub fn is_ascii(&self) -> bool {
1103 // Alternative (but incorrect): directly check the underlying buffers
1104 // (1) Our string view might be sparse, i.e., a subset of the buffers,
1105 // so even if the buffer is not ascii, we can still be ascii.
1106 // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
1107 // This means that this operation is quite expensive, shall we cache the result?
1108 // i.e. track `is_ascii` in the builder.
1109 self.iter().all(|v| match v {
1110 Some(v) => v.is_ascii(),
1111 None => true,
1112 })
1113 }
1114}
1115
1116impl From<Vec<&str>> for StringViewArray {
1117 fn from(v: Vec<&str>) -> Self {
1118 Self::from_iter_values(v)
1119 }
1120}
1121
1122impl From<Vec<Option<&str>>> for StringViewArray {
1123 fn from(v: Vec<Option<&str>>) -> Self {
1124 v.into_iter().collect()
1125 }
1126}
1127
1128impl From<Vec<String>> for StringViewArray {
1129 fn from(v: Vec<String>) -> Self {
1130 Self::from_iter_values(v)
1131 }
1132}
1133
1134impl From<Vec<Option<String>>> for StringViewArray {
1135 fn from(v: Vec<Option<String>>) -> Self {
1136 v.into_iter().collect()
1137 }
1138}
1139
1140#[cfg(test)]
1141mod tests {
1142 use crate::builder::{BinaryViewBuilder, StringViewBuilder};
1143 use crate::types::BinaryViewType;
1144 use crate::{
1145 Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
1146 };
1147 use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
1148 use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
1149 use arrow_schema::DataType;
1150 use rand::prelude::StdRng;
1151 use rand::{Rng, SeedableRng};
1152 use std::str::from_utf8;
1153
1154 const BLOCK_SIZE: u32 = 8;
1155
1156 #[test]
1157 fn try_new_string() {
1158 let array = StringViewArray::from_iter_values(vec![
1159 "hello",
1160 "world",
1161 "lulu",
1162 "large payload over 12 bytes",
1163 ]);
1164 assert_eq!(array.value(0), "hello");
1165 assert_eq!(array.value(3), "large payload over 12 bytes");
1166 }
1167
1168 #[test]
1169 fn try_new_binary() {
1170 let array = BinaryViewArray::from_iter_values(vec![
1171 b"hello".as_slice(),
1172 b"world".as_slice(),
1173 b"lulu".as_slice(),
1174 b"large payload over 12 bytes".as_slice(),
1175 ]);
1176 assert_eq!(array.value(0), b"hello");
1177 assert_eq!(array.value(3), b"large payload over 12 bytes");
1178 }
1179
1180 #[test]
1181 fn try_new_empty_string() {
1182 // test empty array
1183 let array = {
1184 let mut builder = StringViewBuilder::new();
1185 builder.finish()
1186 };
1187 assert!(array.is_empty());
1188 }
1189
1190 #[test]
1191 fn try_new_empty_binary() {
1192 // test empty array
1193 let array = {
1194 let mut builder = BinaryViewBuilder::new();
1195 builder.finish()
1196 };
1197 assert!(array.is_empty());
1198 }
1199
1200 #[test]
1201 fn test_append_string() {
1202 // test builder append
1203 let array = {
1204 let mut builder = StringViewBuilder::new();
1205 builder.append_value("hello");
1206 builder.append_null();
1207 builder.append_option(Some("large payload over 12 bytes"));
1208 builder.finish()
1209 };
1210 assert_eq!(array.value(0), "hello");
1211 assert!(array.is_null(1));
1212 assert_eq!(array.value(2), "large payload over 12 bytes");
1213 }
1214
1215 #[test]
1216 fn test_append_binary() {
1217 // test builder append
1218 let array = {
1219 let mut builder = BinaryViewBuilder::new();
1220 builder.append_value(b"hello");
1221 builder.append_null();
1222 builder.append_option(Some(b"large payload over 12 bytes"));
1223 builder.finish()
1224 };
1225 assert_eq!(array.value(0), b"hello");
1226 assert!(array.is_null(1));
1227 assert_eq!(array.value(2), b"large payload over 12 bytes");
1228 }
1229
1230 #[test]
1231 fn test_in_progress_recreation() {
1232 let array = {
1233 // make a builder with small block size.
1234 let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
1235 builder.append_value("large payload over 12 bytes");
1236 builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
1237 builder.finish()
1238 };
1239 assert_eq!(array.value(0), "large payload over 12 bytes");
1240 assert_eq!(
1241 array.value(1),
1242 "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"
1243 );
1244 assert_eq!(2, array.buffers.len());
1245 }
1246
1247 #[test]
1248 #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
1249 fn new_with_invalid_view_data() {
1250 let v = "large payload over 12 bytes";
1251 let view = ByteView::new(13, &v.as_bytes()[0..4])
1252 .with_buffer_index(3)
1253 .with_offset(1);
1254 let views = ScalarBuffer::from(vec![view.into()]);
1255 let buffers = vec![Buffer::from_slice_ref(v)];
1256 StringViewArray::new(views, buffers, None);
1257 }
1258
1259 #[test]
1260 #[should_panic(
1261 expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
1262 )]
1263 fn new_with_invalid_utf8_data() {
1264 let v: Vec<u8> = vec![
1265 // invalid UTF8
1266 0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
1267 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1268 ];
1269 let view = ByteView::new(v.len() as u32, &v[0..4]);
1270 let views = ScalarBuffer::from(vec![view.into()]);
1271 let buffers = vec![Buffer::from_slice_ref(v)];
1272 StringViewArray::new(views, buffers, None);
1273 }
1274
1275 #[test]
1276 #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
1277 fn new_with_invalid_zero_padding() {
1278 let mut data = [0; 12];
1279 data[0] = b'H';
1280 data[11] = 1; // no zero padding
1281
1282 let mut view_buffer = [0; 16];
1283 view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
1284 view_buffer[4..].copy_from_slice(&data);
1285
1286 let view = ByteView::from(u128::from_le_bytes(view_buffer));
1287 let views = ScalarBuffer::from(vec![view.into()]);
1288 let buffers = vec![];
1289 StringViewArray::new(views, buffers, None);
1290 }
1291
1292 #[test]
1293 #[should_panic(expected = "Mismatch between embedded prefix and data")]
1294 fn test_mismatch_between_embedded_prefix_and_data() {
1295 let input_str_1 = "Hello, Rustaceans!";
1296 let input_str_2 = "Hallo, Rustaceans!";
1297 let length = input_str_1.len() as u32;
1298 assert!(input_str_1.len() > 12);
1299
1300 let mut view_buffer = [0; 16];
1301 view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
1302 view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
1303 view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
1304 view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
1305 let view = ByteView::from(u128::from_le_bytes(view_buffer));
1306 let views = ScalarBuffer::from(vec![view.into()]);
1307 let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
1308
1309 StringViewArray::new(views, buffers, None);
1310 }
1311
1312 #[test]
1313 fn test_gc() {
1314 let test_data = [
1315 Some("longer than 12 bytes"),
1316 Some("short"),
1317 Some("t"),
1318 Some("longer than 12 bytes"),
1319 None,
1320 Some("short"),
1321 ];
1322
1323 let array = {
1324 let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1325 test_data.into_iter().for_each(|v| builder.append_option(v));
1326 builder.finish()
1327 };
1328 assert!(array.buffers.len() > 1);
1329
1330 fn check_gc(to_test: &StringViewArray) {
1331 let gc = to_test.gc();
1332 assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1333
1334 to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1335 assert_eq!(a, b);
1336 });
1337 assert_eq!(to_test.len(), gc.len());
1338 }
1339
1340 check_gc(&array);
1341 check_gc(&array.slice(1, 3));
1342 check_gc(&array.slice(2, 1));
1343 check_gc(&array.slice(2, 2));
1344 check_gc(&array.slice(3, 1));
1345 }
1346
1347 /// 1) Empty array: no elements, expect gc to return empty with no data buffers
1348 #[test]
1349 fn test_gc_empty_array() {
1350 let array = StringViewBuilder::new()
1351 .with_fixed_block_size(BLOCK_SIZE)
1352 .finish();
1353 let gced = array.gc();
1354 // length and null count remain zero
1355 assert_eq!(gced.len(), 0);
1356 assert_eq!(gced.null_count(), 0);
1357 // no underlying data buffers should be allocated
1358 assert!(
1359 gced.data_buffers().is_empty(),
1360 "Expected no data buffers for empty array"
1361 );
1362 }
1363
1364 /// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values
1365 #[test]
1366 fn test_gc_all_inline() {
1367 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1368 // append many short strings, each exactly INLINE_LEN long
1369 for _ in 0..100 {
1370 let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize);
1371 builder.append_option(Some(&s));
1372 }
1373 let array = builder.finish();
1374 let gced = array.gc();
1375 // Since all views fit inline, data buffer is empty
1376 assert_eq!(
1377 gced.data_buffers().len(),
1378 0,
1379 "Should have no data buffers for inline values"
1380 );
1381 assert_eq!(gced.len(), 100);
1382 // verify element-wise equality
1383 array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1384 assert_eq!(orig, got, "Inline value mismatch after gc");
1385 });
1386 }
1387
1388 /// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer
1389 #[test]
1390 fn test_gc_all_large() {
1391 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1392 let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5);
1393 // append multiple large strings
1394 for _ in 0..50 {
1395 builder.append_option(Some(&large_str));
1396 }
1397 let array = builder.finish();
1398 let gced = array.gc();
1399 // New data buffers should be populated (one or more blocks)
1400 assert!(
1401 !gced.data_buffers().is_empty(),
1402 "Expected data buffers for large values"
1403 );
1404 assert_eq!(gced.len(), 50);
1405 // verify that every large string emerges unchanged
1406 array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1407 assert_eq!(orig, got, "Large view mismatch after gc");
1408 });
1409 }
1410
1411 /// 4) All null elements: ensure null bitmap handling path is correct
1412 #[test]
1413 fn test_gc_all_nulls() {
1414 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1415 for _ in 0..20 {
1416 builder.append_null();
1417 }
1418 let array = builder.finish();
1419 let gced = array.gc();
1420 // length and null count match
1421 assert_eq!(gced.len(), 20);
1422 assert_eq!(gced.null_count(), 20);
1423 // data buffers remain empty for null-only array
1424 assert!(
1425 gced.data_buffers().is_empty(),
1426 "No data should be stored for nulls"
1427 );
1428 }
1429
1430 /// 5) Random mix of inline, large, and null values with slicing tests
1431 #[test]
1432 fn test_gc_random_mixed_and_slices() {
1433 let mut rng = StdRng::seed_from_u64(42);
1434 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1435 // Keep a Vec of original Option<String> for later comparison
1436 let mut original: Vec<Option<String>> = Vec::new();
1437
1438 for _ in 0..200 {
1439 if rng.random_bool(0.1) {
1440 // 10% nulls
1441 builder.append_null();
1442 original.push(None);
1443 } else {
1444 // random length between 0 and twice the inline limit
1445 let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2));
1446 let s: String = "A".repeat(len as usize);
1447 builder.append_option(Some(&s));
1448 original.push(Some(s));
1449 }
1450 }
1451
1452 let array = builder.finish();
1453 // Test multiple slice ranges to ensure offset logic is correct
1454 for (offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] {
1455 let sliced = array.slice(*offset, *slice_len);
1456 let gced = sliced.gc();
1457 // Build expected slice of Option<&str>
1458 let expected: Vec<Option<&str>> = original[*offset..(*offset + *slice_len)]
1459 .iter()
1460 .map(|opt| opt.as_deref())
1461 .collect();
1462
1463 assert_eq!(gced.len(), *slice_len, "Slice length mismatch");
1464 // Compare element-wise
1465 gced.iter().zip(expected.iter()).for_each(|(got, expect)| {
1466 assert_eq!(got, *expect, "Value mismatch in mixed slice after gc");
1467 });
1468 }
1469 }
1470
1471 #[test]
1472 #[cfg_attr(miri, ignore)] // Takes too long
1473 fn test_gc_huge_array() {
1474 // Construct multiple 128 MiB BinaryView entries so total > 4 GiB
1475 let block_len: usize = 128 * 1024 * 1024; // 128 MiB per view
1476 let num_views: usize = 36;
1477
1478 // Create a single 128 MiB data block with a simple byte pattern
1479 let buffer = Buffer::from_vec(vec![0xAB; block_len]);
1480 let buffer2 = Buffer::from_vec(vec![0xFF; block_len]);
1481
1482 // Append this block and then add many views pointing to it
1483 let mut builder = BinaryViewBuilder::new();
1484 let block_id = builder.append_block(buffer);
1485 for _ in 0..num_views / 2 {
1486 builder
1487 .try_append_view(block_id, 0, block_len as u32)
1488 .expect("append view into 128MiB block");
1489 }
1490 let block_id2 = builder.append_block(buffer2);
1491 for _ in 0..num_views / 2 {
1492 builder
1493 .try_append_view(block_id2, 0, block_len as u32)
1494 .expect("append view into 128MiB block");
1495 }
1496
1497 let array = builder.finish();
1498 let total = array.total_buffer_bytes_used();
1499 assert!(
1500 total > u32::MAX as usize,
1501 "Expected total non-inline bytes to exceed 4 GiB, got {}",
1502 total
1503 );
1504
1505 // Run gc and verify correctness
1506 let gced = array.gc();
1507 assert_eq!(gced.len(), num_views, "Length mismatch after gc");
1508 assert_eq!(gced.null_count(), 0, "Null count mismatch after gc");
1509 assert_ne!(
1510 gced.data_buffers().len(),
1511 1,
1512 "gc with huge buffer should not consolidate data into a single buffer"
1513 );
1514
1515 // Element-wise equality check across the entire array
1516 array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1517 assert_eq!(orig, got, "Value mismatch after gc on huge array");
1518 });
1519 }
1520
1521 #[test]
1522 fn test_eq() {
1523 let test_data = [
1524 Some("longer than 12 bytes"),
1525 None,
1526 Some("short"),
1527 Some("again, this is longer than 12 bytes"),
1528 ];
1529
1530 let array1 = {
1531 let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
1532 test_data.into_iter().for_each(|v| builder.append_option(v));
1533 builder.finish()
1534 };
1535 let array2 = {
1536 // create a new array with the same data but different layout
1537 let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
1538 test_data.into_iter().for_each(|v| builder.append_option(v));
1539 builder.finish()
1540 };
1541 assert_eq!(array1, array1.clone());
1542 assert_eq!(array2, array2.clone());
1543 assert_eq!(array1, array2);
1544 }
1545
1546 /// Integration tests for `inline_key_fast` covering:
1547 ///
1548 /// 1. Monotonic ordering across increasing lengths and lexical variations.
1549 /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence.
1550 ///
1551 /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why
1552 /// the length field is required even when all inline bytes fit in 12 bytes.
1553 ///
1554 /// The test includes strings that verify correct byte order (prevent reversal bugs),
1555 /// and length-based tie-breaking in the composite key.
1556 ///
1557 /// The test confirms that `inline_key_fast` produces keys which sort consistently
1558 /// with the expected lexicographical order of the raw byte arrays.
1559 #[test]
1560 fn test_inline_key_fast_various_lengths_and_lexical() {
1561 /// Helper to create a raw u128 value representing an inline ByteView:
1562 /// - `length`: number of meaningful bytes (must be ≤ 12)
1563 /// - `data`: the actual inline data bytes
1564 ///
1565 /// The first 4 bytes encode length in little-endian,
1566 /// the following 12 bytes contain the inline string data (unpadded).
1567 fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
1568 assert!(length as usize <= 12, "Inline length must be ≤ 12");
1569 assert!(
1570 data.len() == length as usize,
1571 "Data length must match `length`"
1572 );
1573
1574 let mut raw_bytes = [0u8; 16];
1575 raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian
1576 raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data
1577 u128::from_le_bytes(raw_bytes)
1578 }
1579
1580 // Test inputs: various lengths and lexical orders,
1581 // plus special cases for byte order and length tie-breaking
1582 let test_inputs: Vec<&[u8]> = vec![
1583 b"a",
1584 b"aa",
1585 b"aaa",
1586 b"aab",
1587 b"abcd",
1588 b"abcde",
1589 b"abcdef",
1590 b"abcdefg",
1591 b"abcdefgh",
1592 b"abcdefghi",
1593 b"abcdefghij",
1594 b"abcdefghijk",
1595 b"abcdefghijkl",
1596 // Tests for byte-order reversal bug:
1597 // Without the fix, "backend one" would compare as "eno dnekcab",
1598 // causing incorrect sort order relative to "backend two".
1599 b"backend one",
1600 b"backend two",
1601 // Tests length-tiebreaker logic:
1602 // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data,
1603 // so only the length differentiates their ordering.
1604 b"bar",
1605 b"bar\0",
1606 // Additional lexical and length tie-breaking cases with same prefix, in correct lex order:
1607 b"than12Byt",
1608 b"than12Bytes",
1609 b"than12Bytes\0",
1610 b"than12Bytesx",
1611 b"than12Bytex",
1612 b"than12Bytez",
1613 // Additional lexical tests
1614 b"xyy",
1615 b"xyz",
1616 b"xza",
1617 ];
1618
1619 // Create a GenericBinaryArray for cross-comparison of lex order
1620 let array: GenericBinaryArray<i32> =
1621 GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>());
1622
1623 for i in 0..array.len() - 1 {
1624 let v1 = array.value(i);
1625 let v2 = array.value(i + 1);
1626
1627 // Assert the array's natural lexical ordering is correct
1628 assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");
1629
1630 // Assert the keys produced by inline_key_fast reflect the same ordering
1631 let key1 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1632 v1.len() as u32,
1633 v1,
1634 ));
1635 let key2 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1636 v2.len() as u32,
1637 v2,
1638 ));
1639
1640 assert!(
1641 key1 < key2,
1642 "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
1643 );
1644 }
1645 }
1646
1647 #[test]
1648 fn empty_array_should_return_empty_lengths_iterator() {
1649 let empty = GenericByteViewArray::<BinaryViewType>::from(Vec::<&[u8]>::new());
1650
1651 let mut lengths_iter = empty.lengths();
1652 assert_eq!(lengths_iter.len(), 0);
1653 assert_eq!(lengths_iter.next(), None);
1654 }
1655
1656 #[test]
1657 fn array_lengths_should_return_correct_length_for_both_inlined_and_non_inlined() {
1658 let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1659 // Not inlined as longer than 12 bytes
1660 b"Supercalifragilisticexpialidocious" as &[u8],
1661 // Inlined as shorter than 12 bytes
1662 b"Hello",
1663 // Empty value
1664 b"",
1665 // Exactly 12 bytes
1666 b"abcdefghijkl",
1667 ]);
1668
1669 let mut lengths_iter = cases.lengths();
1670
1671 assert_eq!(lengths_iter.len(), cases.len());
1672
1673 let cases_iter = cases.iter();
1674
1675 for case in cases_iter {
1676 let case_value = case.unwrap();
1677 let length = lengths_iter.next().expect("Should have a length");
1678
1679 assert_eq!(case_value.len(), length as usize);
1680 }
1681
1682 assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1683 }
1684
1685 #[test]
1686 fn array_lengths_should_return_the_underlying_length_for_null_values() {
1687 let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1688 // Not inlined as longer than 12 bytes
1689 b"Supercalifragilisticexpialidocious" as &[u8],
1690 // Inlined as shorter than 12 bytes
1691 b"Hello",
1692 // Empty value
1693 b"",
1694 // Exactly 12 bytes
1695 b"abcdefghijkl",
1696 ]);
1697
1698 let (views, buffer, _) = cases.clone().into_parts();
1699
1700 // Keeping the values but just adding nulls on top
1701 let cases_with_all_nulls = GenericByteViewArray::<BinaryViewType>::new(
1702 views,
1703 buffer,
1704 Some(NullBuffer::new_null(cases.len())),
1705 );
1706
1707 let lengths_iter = cases.lengths();
1708 let mut all_nulls_lengths_iter = cases_with_all_nulls.lengths();
1709
1710 assert_eq!(lengths_iter.len(), all_nulls_lengths_iter.len());
1711
1712 for expected_length in lengths_iter {
1713 let actual_length = all_nulls_lengths_iter.next().expect("Should have a length");
1714
1715 assert_eq!(expected_length, actual_length);
1716 }
1717
1718 assert_eq!(
1719 all_nulls_lengths_iter.next(),
1720 None,
1721 "Should not have more lengths"
1722 );
1723 }
1724
1725 #[test]
1726 fn array_lengths_on_sliced_should_only_return_lengths_for_sliced_data() {
1727 let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1728 b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1729 b"Hello",
1730 b"something great",
1731 b"is",
1732 b"coming soon!",
1733 b"when you find what it is",
1734 b"let me know",
1735 b"cause",
1736 b"I",
1737 b"have no idea",
1738 b"what it",
1739 b"is",
1740 ]);
1741
1742 let sliced_array = array.slice(2, array.len() - 3);
1743
1744 let mut lengths_iter = sliced_array.lengths();
1745
1746 assert_eq!(lengths_iter.len(), sliced_array.len());
1747
1748 let values_iter = sliced_array.iter();
1749
1750 for value in values_iter {
1751 let value = value.unwrap();
1752 let length = lengths_iter.next().expect("Should have a length");
1753
1754 assert_eq!(value.len(), length as usize);
1755 }
1756
1757 assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1758 }
1759
1760 #[should_panic(expected = "Mismatched data type, expected Utf8View, got BinaryView")]
1761 #[test]
1762 fn invalid_casting_from_array_data() {
1763 // Should not be able to cast to StringViewArray due to invalid UTF-8
1764 let array_data = binary_view_array_with_invalid_utf8_data().into_data();
1765 let _ = StringViewArray::from(array_data);
1766 }
1767
1768 #[should_panic(expected = "invalid utf-8 sequence")]
1769 #[test]
1770 fn invalid_array_data() {
1771 let (views, buffers, nulls) = binary_view_array_with_invalid_utf8_data().into_parts();
1772
1773 // manually try and add invalid array data with Utf8View data type
1774 let mut builder = ArrayDataBuilder::new(DataType::Utf8View)
1775 .add_buffer(views.into_inner())
1776 .len(3);
1777 for buffer in buffers.iter() {
1778 builder = builder.add_buffer(buffer.clone())
1779 }
1780 builder = builder.nulls(nulls);
1781
1782 let data = builder.build().unwrap(); // should fail validation
1783 let _arr = StringViewArray::from(data);
1784 }
1785
1786 /// Returns a BinaryViewArray with one invalid UTF-8 value
1787 fn binary_view_array_with_invalid_utf8_data() -> BinaryViewArray {
1788 let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1789 b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1790 &[
1791 0xf0, 0x80, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1792 0x00, 0x00,
1793 ],
1794 b"good",
1795 ]);
1796 assert!(from_utf8(array.value(0)).is_ok());
1797 assert!(from_utf8(array.value(1)).is_err()); // value 1 is invalid utf8
1798 assert!(from_utf8(array.value(2)).is_ok());
1799 array
1800 }
1801}