arrow_array/array/byte_view_array.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
26use arrow_schema::{ArrowError, DataType};
27use core::str;
28use num_traits::ToPrimitive;
29use std::any::Any;
30use std::cmp::Ordering;
31use std::fmt::Debug;
32use std::marker::PhantomData;
33use std::sync::Arc;
34
35use super::ByteArrayType;
36
37/// [Variable-size Binary View Layout]: An array of variable length bytes views.
38///
39/// This array type is used to store variable length byte data (e.g. Strings, Binary)
40/// and has efficient operations such as `take`, `filter`, and comparison.
41///
42/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
43///
44/// This is different from [`GenericByteArray`], which also stores variable
45/// length byte data, as it represents strings with an offset and length. `take`
46/// and `filter` like operations are implemented by manipulating the "views"
47/// (`u128`) without modifying the bytes. Each view also stores an inlined
48/// prefix which speed up comparisons.
49///
50/// # See Also
51///
52/// * [`StringViewArray`] for storing utf8 encoded string data
53/// * [`BinaryViewArray`] for storing bytes
54/// * [`ByteView`] to interpret `u128`s layout of the views.
55///
56/// [`ByteView`]: arrow_data::ByteView
57///
58/// # Layout: "views" and buffers
59///
60/// A `GenericByteViewArray` stores variable length byte strings. An array of
61/// `N` elements is stored as `N` fixed length "views" and a variable number
62/// of variable length "buffers".
63///
64/// Each view is a `u128` value whose layout is different depending on the
65/// length of the string stored at that location:
66///
67/// ```text
68/// ┌──────┬────────────────────────┐
69/// │length│ string value │
70/// Strings (len <= 12) │ │ (padded with 0) │
71/// └──────┴────────────────────────┘
72/// 0 31 127
73///
74/// ┌───────┬───────┬───────┬───────┐
75/// │length │prefix │ buf │offset │
76/// Strings (len > 12) │ │ │ index │ │
77/// └───────┴───────┴───────┴───────┘
78/// 0 31 63 95 127
79/// ```
80///
81/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
82/// the view. See [`Self::inline_value`] to access the inlined prefix from a
83/// short view.
84///
85/// * Strings with length > 12: The first four bytes are stored inline in the
86/// view and the entire string is stored in one of the buffers. See [`ByteView`]
87/// to access the fields of the these views.
88///
89/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
90/// the easiest and fastest way to work with this data. However, it is possible
91/// to access the views and buffers directly for more control.
92///
93/// For example
94///
95/// ```rust
96/// # use arrow_array::StringViewArray;
97/// # use arrow_array::Array;
98/// use arrow_data::ByteView;
99/// let array = StringViewArray::from(vec![
100/// "hello",
101/// "this string is longer than 12 bytes",
102/// "this string is also longer than 12 bytes"
103/// ]);
104///
105/// // ** Examine the first view (short string) **
106/// assert!(array.is_valid(0)); // Check for nulls
107/// let short_view: u128 = array.views()[0]; // "hello"
108/// // get length of the string
109/// let len = short_view as u32;
110/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view
111/// // SAFETY: `view` is a valid view
112/// let value = unsafe {
113/// StringViewArray::inline_value(&short_view, len as usize)
114/// };
115/// assert_eq!(value, b"hello");
116///
117/// // ** Examine the third view (long string) **
118/// assert!(array.is_valid(12)); // Check for nulls
119/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
120/// let len = long_view as u32;
121/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
122/// let view = ByteView::from(long_view); // use ByteView to access the fields
123/// assert_eq!(view.length, 40);
124/// assert_eq!(view.buffer_index, 0);
125/// assert_eq!(view.offset, 35); // data starts after the first long string
126/// // Views for long strings store a 4 byte prefix
127/// let prefix = view.prefix.to_le_bytes();
128/// assert_eq!(&prefix, b"this");
129/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
130/// assert_eq!(value, "this string is also longer than 12 bytes");
131/// ```
132///
133/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
134/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
135///
136/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
137/// than they must point into a valid buffer. However, they can be out of order,
138/// non continuous and overlapping.
139///
140/// For example, in the following diagram, the strings "FishWasInTownToday" and
141/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
142/// separate buffer while the string "LavaMonster" is stored inlined in the
143/// view. In this case, the same bytes for "Fish" are used to store both strings.
144///
145/// [`ByteView`]: arrow_data::ByteView
146///
147/// ```text
148/// ┌───┐
149/// ┌──────┬──────┬──────┬──────┐ offset │...│
150/// "FishWasInTownTodayYay" │ 21 │ Fish │ 0 │ 115 │─ ─ 103 │Mr.│
151/// └──────┴──────┴──────┴──────┘ │ ┌ ─ ─ ─ ─ ▶ │Cru│
152/// ┌──────┬──────┬──────┬──────┐ │mpl│
153/// "CrumpleFacedFish" │ 16 │ Crum │ 0 │ 103 │─ ─│─ ─ ─ ┘ │eFa│
154/// └──────┴──────┴──────┴──────┘ │ced│
155/// ┌──────┬────────────────────┐ └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
156/// "LavaMonster" │ 11 │ LavaMonster │ │hWa│
157/// └──────┴────────────────────┘ offset │sIn│
158/// 115 │Tow│
159/// │nTo│
160/// │day│
161/// u128 "views" │Yay│
162/// buffer 0 │...│
163/// └───┘
164/// ```
165pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
166 data_type: DataType,
167 views: ScalarBuffer<u128>,
168 buffers: Vec<Buffer>,
169 phantom: PhantomData<T>,
170 nulls: Option<NullBuffer>,
171}
172
173impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
174 fn clone(&self) -> Self {
175 Self {
176 data_type: T::DATA_TYPE,
177 views: self.views.clone(),
178 buffers: self.buffers.clone(),
179 nulls: self.nulls.clone(),
180 phantom: Default::default(),
181 }
182 }
183}
184
185impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
186 /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
187 ///
188 /// # Panics
189 ///
190 /// Panics if [`GenericByteViewArray::try_new`] returns an error
191 pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
192 Self::try_new(views, buffers, nulls).unwrap()
193 }
194
195 /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
196 ///
197 /// # Errors
198 ///
199 /// * `views.len() != nulls.len()`
200 /// * [ByteViewType::validate] fails
201 pub fn try_new(
202 views: ScalarBuffer<u128>,
203 buffers: Vec<Buffer>,
204 nulls: Option<NullBuffer>,
205 ) -> Result<Self, ArrowError> {
206 T::validate(&views, &buffers)?;
207
208 if let Some(n) = nulls.as_ref() {
209 if n.len() != views.len() {
210 return Err(ArrowError::InvalidArgumentError(format!(
211 "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
212 T::PREFIX,
213 views.len(),
214 n.len(),
215 )));
216 }
217 }
218
219 Ok(Self {
220 data_type: T::DATA_TYPE,
221 views,
222 buffers,
223 nulls,
224 phantom: Default::default(),
225 })
226 }
227
228 /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
229 ///
230 /// # Safety
231 ///
232 /// Safe if [`Self::try_new`] would not error
233 pub unsafe fn new_unchecked(
234 views: ScalarBuffer<u128>,
235 buffers: Vec<Buffer>,
236 nulls: Option<NullBuffer>,
237 ) -> Self {
238 if cfg!(feature = "force_validate") {
239 return Self::new(views, buffers, nulls);
240 }
241
242 Self {
243 data_type: T::DATA_TYPE,
244 phantom: Default::default(),
245 views,
246 buffers,
247 nulls,
248 }
249 }
250
251 /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
252 pub fn new_null(len: usize) -> Self {
253 Self {
254 data_type: T::DATA_TYPE,
255 views: vec![0; len].into(),
256 buffers: vec![],
257 nulls: Some(NullBuffer::new_null(len)),
258 phantom: Default::default(),
259 }
260 }
261
262 /// Create a new [`Scalar`] from `value`
263 pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
264 Scalar::new(Self::from_iter_values(std::iter::once(value)))
265 }
266
267 /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
268 pub fn from_iter_values<Ptr, I>(iter: I) -> Self
269 where
270 Ptr: AsRef<T::Native>,
271 I: IntoIterator<Item = Ptr>,
272 {
273 let iter = iter.into_iter();
274 let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
275 for v in iter {
276 builder.append_value(v);
277 }
278 builder.finish()
279 }
280
281 /// Deconstruct this array into its constituent parts
282 pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
283 (self.views, self.buffers, self.nulls)
284 }
285
286 /// Returns the views buffer
287 #[inline]
288 pub fn views(&self) -> &ScalarBuffer<u128> {
289 &self.views
290 }
291
292 /// Returns the buffers storing string data
293 #[inline]
294 pub fn data_buffers(&self) -> &[Buffer] {
295 &self.buffers
296 }
297
298 /// Returns the element at index `i`
299 ///
300 /// Note: This method does not check for nulls and the value is arbitrary
301 /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
302 ///
303 /// # Panics
304 /// Panics if index `i` is out of bounds.
305 pub fn value(&self, i: usize) -> &T::Native {
306 assert!(
307 i < self.len(),
308 "Trying to access an element at index {} from a {}ViewArray of length {}",
309 i,
310 T::PREFIX,
311 self.len()
312 );
313
314 unsafe { self.value_unchecked(i) }
315 }
316
317 /// Returns the element at index `i` without bounds checking
318 ///
319 /// Note: This method does not check for nulls and the value is arbitrary
320 /// if [`is_null`](Self::is_null) returns true for the index.
321 ///
322 /// # Safety
323 ///
324 /// Caller is responsible for ensuring that the index is within the bounds
325 /// of the array
326 pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
327 let v = unsafe { self.views.get_unchecked(idx) };
328 let len = *v as u32;
329 let b = if len <= MAX_INLINE_VIEW_LEN {
330 unsafe { Self::inline_value(v, len as usize) }
331 } else {
332 let view = ByteView::from(*v);
333 let data = unsafe { self.buffers.get_unchecked(view.buffer_index as usize) };
334 let offset = view.offset as usize;
335 unsafe { data.get_unchecked(offset..offset + len as usize) }
336 };
337 unsafe { T::Native::from_bytes_unchecked(b) }
338 }
339
340 /// Returns the first `len` bytes the inline value of the view.
341 ///
342 /// # Safety
343 /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
344 /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
345 #[inline(always)]
346 pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
347 debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
348 unsafe {
349 std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
350 }
351 }
352
353 /// Constructs a new iterator for iterating over the values of this array
354 pub fn iter(&self) -> ArrayIter<&Self> {
355 ArrayIter::new(self)
356 }
357
358 /// Returns an iterator over the bytes of this array, including null values
359 pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
360 self.views.iter().map(move |v| {
361 let len = *v as u32;
362 if len <= MAX_INLINE_VIEW_LEN {
363 unsafe { Self::inline_value(v, len as usize) }
364 } else {
365 let view = ByteView::from(*v);
366 let data = &self.buffers[view.buffer_index as usize];
367 let offset = view.offset as usize;
368 unsafe { data.get_unchecked(offset..offset + len as usize) }
369 }
370 })
371 }
372
373 /// Returns an iterator over the first `prefix_len` bytes of each array
374 /// element, including null values.
375 ///
376 /// If `prefix_len` is larger than the element's length, the iterator will
377 /// return an empty slice (`&[]`).
378 pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
379 self.views().into_iter().map(move |v| {
380 let len = (*v as u32) as usize;
381
382 if len < prefix_len {
383 return &[] as &[u8];
384 }
385
386 if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
387 unsafe { StringViewArray::inline_value(v, prefix_len) }
388 } else {
389 let view = ByteView::from(*v);
390 let data = unsafe {
391 self.data_buffers()
392 .get_unchecked(view.buffer_index as usize)
393 };
394 let offset = view.offset as usize;
395 unsafe { data.get_unchecked(offset..offset + prefix_len) }
396 }
397 })
398 }
399
400 /// Returns an iterator over the last `suffix_len` bytes of each array
401 /// element, including null values.
402 ///
403 /// Note that for [`StringViewArray`] the last bytes may start in the middle
404 /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
405 ///
406 /// If `suffix_len` is larger than the element's length, the iterator will
407 /// return an empty slice (`&[]`).
408 pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
409 self.views().into_iter().map(move |v| {
410 let len = (*v as u32) as usize;
411
412 if len < suffix_len {
413 return &[] as &[u8];
414 }
415
416 if len as u32 <= MAX_INLINE_VIEW_LEN {
417 unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
418 } else {
419 let view = ByteView::from(*v);
420 let data = unsafe {
421 self.data_buffers()
422 .get_unchecked(view.buffer_index as usize)
423 };
424 let offset = view.offset as usize;
425 unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
426 }
427 })
428 }
429
430 /// Returns a zero-copy slice of this array with the indicated offset and length.
431 pub fn slice(&self, offset: usize, length: usize) -> Self {
432 Self {
433 data_type: T::DATA_TYPE,
434 views: self.views.slice(offset, length),
435 buffers: self.buffers.clone(),
436 nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
437 phantom: Default::default(),
438 }
439 }
440
441 /// Returns a "compacted" version of this array
442 ///
443 /// The original array will *not* be modified
444 ///
445 /// # Garbage Collection
446 ///
447 /// Before GC:
448 /// ```text
449 /// ┌──────┐
450 /// │......│
451 /// │......│
452 /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
453 /// │ View 1 │─ ─ ─ ─ │......│ with data that
454 /// ├────────────────────┤ │......│ is not referred
455 /// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
456 /// └────────────────────┘ │......│ View 2
457 /// │......│
458 /// 2 views, refer to │......│
459 /// small portions of a └──────┘
460 /// large buffer
461 /// ```
462 ///
463 /// After GC:
464 ///
465 /// ```text
466 /// ┌────────────────────┐ ┌─────┐ After gc, only
467 /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
468 /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
469 /// │ View 2 │─ ─ ─ ─ └─────┘ the views is
470 /// └────────────────────┘ left
471 ///
472 ///
473 /// 2 views
474 /// ```
475 /// This method will compact the data buffers by recreating the view array and only include the data
476 /// that is pointed to by the views.
477 ///
478 /// Note that it will copy the array regardless of whether the original array is compact.
479 /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
480 /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
481 ///
482 /// Note: this function does not attempt to canonicalize / deduplicate values. For this
483 /// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
484 pub fn gc(&self) -> Self {
485 // 1) Read basic properties once
486 let len = self.len(); // number of elements
487 let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap
488
489 // 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks
490 if self.data_buffers().is_empty() {
491 return unsafe {
492 GenericByteViewArray::new_unchecked(
493 self.views().clone(),
494 vec![], // empty data blocks
495 nulls,
496 )
497 };
498 }
499
500 // 2) Calculate total size of all non-inline data and detect if any exists
501 let total_large = self.total_buffer_bytes_used();
502
503 // 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing
504 if total_large == 0 {
505 // Views are inline-only or all null; just reuse original views and no data blocks
506 return unsafe {
507 GenericByteViewArray::new_unchecked(
508 self.views().clone(),
509 vec![], // empty data blocks
510 nulls,
511 )
512 };
513 }
514
515 let (views_buf, data_blocks) = if total_large < i32::MAX as usize {
516 // fast path, the entire data fits in a single buffer
517 // 3) Allocate exactly capacity for all non-inline data
518 let mut data_buf = Vec::with_capacity(total_large);
519
520 // 4) Iterate over views and process each inline/non-inline view
521 let views_buf: Vec<u128> = (0..len)
522 .map(|i| unsafe { self.copy_view_to_buffer(i, 0, &mut data_buf) })
523 .collect();
524 let data_block = Buffer::from_vec(data_buf);
525 let data_blocks = vec![data_block];
526 (views_buf, data_blocks)
527 } else {
528 // slow path, need to split into multiple buffers
529
530 struct GcCopyGroup {
531 total_buffer_bytes: usize,
532 total_len: usize,
533 }
534
535 impl GcCopyGroup {
536 fn new(total_buffer_bytes: u32, total_len: usize) -> Self {
537 Self {
538 total_buffer_bytes: total_buffer_bytes as usize,
539 total_len,
540 }
541 }
542 }
543
544 let mut groups = Vec::new();
545 let mut current_length = 0;
546 let mut current_elements = 0;
547
548 for view in self.views() {
549 let len = *view as u32;
550 if len > MAX_INLINE_VIEW_LEN {
551 if current_length + len > i32::MAX as u32 {
552 // Start a new group
553 groups.push(GcCopyGroup::new(current_length, current_elements));
554 current_length = 0;
555 current_elements = 0;
556 }
557 current_length += len;
558 current_elements += 1;
559 }
560 }
561 if current_elements != 0 {
562 groups.push(GcCopyGroup::new(current_length, current_elements));
563 }
564 debug_assert!(groups.len() <= i32::MAX as usize);
565
566 // 3) Copy the buffers group by group
567 let mut views_buf = Vec::with_capacity(len);
568 let mut data_blocks = Vec::with_capacity(groups.len());
569
570 let mut current_view_idx = 0;
571
572 for (group_idx, gc_copy_group) in groups.iter().enumerate() {
573 let mut data_buf = Vec::with_capacity(gc_copy_group.total_buffer_bytes);
574
575 // Directly push views to avoid intermediate Vec allocation
576 let new_views = (current_view_idx..current_view_idx + gc_copy_group.total_len).map(
577 |view_idx| {
578 // safety: the view index came from iterating over valid range
579 unsafe {
580 self.copy_view_to_buffer(view_idx, group_idx as i32, &mut data_buf)
581 }
582 },
583 );
584 views_buf.extend(new_views);
585
586 data_blocks.push(Buffer::from_vec(data_buf));
587 current_view_idx += gc_copy_group.total_len;
588 }
589 (views_buf, data_blocks)
590 };
591
592 // 5) Wrap up views buffer
593 let views_scalar = ScalarBuffer::from(views_buf);
594
595 // SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized
596 unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) }
597 }
598
599 /// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer.
600 ///
601 /// # Safety
602 ///
603 /// - `i < self.len()`.
604 /// - Every element in `self.views()` must currently refer to a valid slice
605 /// inside one of `self.buffers`.
606 /// - `data_buf` must be ready to have additional bytes appended.
607 /// - After this call, the returned view will have its
608 /// `buffer_index` reset to `buffer_idx` and its `offset` updated so that it points
609 /// into the bytes just appended at the end of `data_buf`.
610 #[inline(always)]
611 unsafe fn copy_view_to_buffer(
612 &self,
613 i: usize,
614 buffer_idx: i32,
615 data_buf: &mut Vec<u8>,
616 ) -> u128 {
617 // SAFETY: `i < self.len()` ensures this is in‑bounds.
618 let raw_view = unsafe { *self.views().get_unchecked(i) };
619 let mut bv = ByteView::from(raw_view);
620
621 // Inline‑small views stay as‑is.
622 if bv.length <= MAX_INLINE_VIEW_LEN {
623 raw_view
624 } else {
625 // SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length`
626 // must both lie within valid ranges for `self.buffers`.
627 let buffer = unsafe { self.buffers.get_unchecked(bv.buffer_index as usize) };
628 let start = bv.offset as usize;
629 let end = start + bv.length as usize;
630 let slice = unsafe { buffer.get_unchecked(start..end) };
631
632 // Copy out‑of‑line data into our single “0” buffer.
633 let new_offset = data_buf.len() as u32;
634 data_buf.extend_from_slice(slice);
635
636 bv.buffer_index = buffer_idx as u32;
637 bv.offset = new_offset;
638 bv.into()
639 }
640 }
641
642 /// Returns the total number of bytes used by all non inlined views in all
643 /// buffers.
644 ///
645 /// Note this does not account for views that point at the same underlying
646 /// data in buffers
647 ///
648 /// For example, if the array has three strings views:
649 /// * View with length = 9 (inlined)
650 /// * View with length = 32 (non inlined)
651 /// * View with length = 16 (non inlined)
652 ///
653 /// Then this method would report 48
654 pub fn total_buffer_bytes_used(&self) -> usize {
655 self.views()
656 .iter()
657 .map(|v| {
658 let len = *v as u32;
659 if len > MAX_INLINE_VIEW_LEN {
660 len as usize
661 } else {
662 0
663 }
664 })
665 .sum()
666 }
667
668 /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
669 ///
670 /// Comparing two ByteView types are non-trivial.
671 /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
672 ///
673 /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
674 /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
675 /// Meaning that reading one array element requires only one memory access
676 /// (two memory access required for StringArray, one for offset buffer, the other for value buffer).
677 ///
678 /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
679 /// thanks to the inlined 4 bytes.
680 /// Consider equality check:
681 /// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
682 ///
683 /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
684 /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
685 /// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
686 ///
687 /// # Order check flow
688 /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
689 /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
690 /// (2.1) if the inlined 4 bytes are different, we can return the result immediately.
691 /// (2.2) o.w., we need to compare the full string.
692 ///
693 /// # Safety
694 /// The left/right_idx must within range of each array
695 pub unsafe fn compare_unchecked(
696 left: &GenericByteViewArray<T>,
697 left_idx: usize,
698 right: &GenericByteViewArray<T>,
699 right_idx: usize,
700 ) -> Ordering {
701 let l_view = unsafe { left.views().get_unchecked(left_idx) };
702 let l_byte_view = ByteView::from(*l_view);
703
704 let r_view = unsafe { right.views().get_unchecked(right_idx) };
705 let r_byte_view = ByteView::from(*r_view);
706
707 let l_len = l_byte_view.length;
708 let r_len = r_byte_view.length;
709
710 if l_len <= 12 && r_len <= 12 {
711 return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view));
712 }
713
714 // one of the string is larger than 12 bytes,
715 // we then try to compare the inlined data first
716
717 // Note: In theory, ByteView is only used for string which is larger than 12 bytes,
718 // but we can still use it to get the inlined prefix for shorter strings.
719 // The prefix is always the first 4 bytes of the view, for both short and long strings.
720 let l_inlined_be = l_byte_view.prefix.swap_bytes();
721 let r_inlined_be = r_byte_view.prefix.swap_bytes();
722 if l_inlined_be != r_inlined_be {
723 return l_inlined_be.cmp(&r_inlined_be);
724 }
725
726 // unfortunately, we need to compare the full data
727 let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
728 let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
729
730 l_full_data.cmp(r_full_data)
731 }
732
733 /// Builds a 128-bit composite key for an inline value:
734 ///
735 /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting).
736 /// - Low 32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings
737 /// (or those with fewer meaningful bytes) always numerically sort before longer ones.
738 ///
739 /// This function extracts the length and the 12-byte inline string data from the raw
740 /// little-endian `u128` representation, converts them to big-endian ordering, and packs them
741 /// into a single `u128` value suitable for fast, branchless comparisons.
742 ///
743 /// # Why include length?
744 ///
745 /// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes
746 /// compare equal—either because one is a true prefix of the other or because zero-padding
747 /// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare
748 /// handles both content and length in one go.
749 ///
750 /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes)
751 ///
752 /// | String | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding) |
753 /// |------------|-----------------------|---------------------------------|
754 /// | `"bar"` | `03 00 00 00` | `62 61 72` + 9 × `00` |
755 /// | `"bar\0"`| `04 00 00 00` | `62 61 72 00` + 8 × `00` |
756 ///
757 /// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field
758 /// then differentiates:
759 ///
760 /// ```text
761 /// key("bar") = 0x0000000000000000000062617200000003
762 /// key("bar\0") = 0x0000000000000000000062617200000004
763 /// ⇒ key("bar") < key("bar\0")
764 /// ```
765 /// # Inlining and Endianness
766 ///
767 /// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because Rust’s native in‑memory
768 /// representation is little‑endian on x86/ARM.
769 /// - We extract the low 32 bits numerically (`raw as u32`)—this step is endianness‑free.
770 /// - We copy the 12 bytes of inline data (original order) into `buf[0..12]`.
771 /// - We serialize `length` as big‑endian into `buf[12..16]`.
772 /// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most significant byte
773 /// and `buf[15]` as the least significant, producing a `u128` whose integer value
774 /// directly encodes “inline data then length” in big‑endian form.
775 ///
776 /// This ensures that a simple `u128` comparison is equivalent to the desired
777 /// lexicographical comparison of the inline bytes followed by length.
778 #[inline(always)]
779 pub fn inline_key_fast(raw: u128) -> u128 {
780 // 1. Decompose `raw` into little‑endian bytes:
781 // - raw_bytes[0..4] = length in LE
782 // - raw_bytes[4..16] = inline string data
783 let raw_bytes = raw.to_le_bytes();
784
785 // 2. Numerically truncate to get the low 32‑bit length (endianness‑free).
786 let length = raw as u32;
787
788 // 3. Build a 16‑byte buffer in big‑endian order:
789 // - buf[0..12] = inline string bytes (in original order)
790 // - buf[12..16] = length.to_be_bytes() (BE)
791 let mut buf = [0u8; 16];
792 buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data
793
794 // Why convert length to big-endian for comparison?
795 //
796 // Rust (on most platforms) stores integers in little-endian format,
797 // meaning the least significant byte is at the lowest memory address.
798 // For example, an u32 value like 0x22345677 is stored in memory as:
799 //
800 // [0x77, 0x56, 0x34, 0x22] // little-endian layout
801 // ^ ^ ^ ^
802 // LSB ↑↑↑ MSB
803 //
804 // This layout is efficient for arithmetic but *not* suitable for
805 // lexicographic (dictionary-style) comparison of byte arrays.
806 //
807 // To compare values by byte order—e.g., for sorted keys or binary trees—
808 // we must convert them to **big-endian**, where:
809 //
810 // - The most significant byte (MSB) comes first (index 0)
811 // - The least significant byte (LSB) comes last (index N-1)
812 //
813 // In big-endian, the same u32 = 0x22345677 would be represented as:
814 //
815 // [0x22, 0x34, 0x56, 0x77]
816 //
817 // This ordering aligns with natural string/byte sorting, so calling
818 // `.to_be_bytes()` allows us to construct
819 // keys where standard numeric comparison (e.g., `<`, `>`) behaves
820 // like lexicographic byte comparison.
821 buf[12..16].copy_from_slice(&length.to_be_bytes()); // length in BE
822
823 // 4. Deserialize the buffer as a big‑endian u128:
824 // buf[0] is MSB, buf[15] is LSB.
825 // Details:
826 // Note on endianness and layout:
827 //
828 // Although `buf[0]` is stored at the lowest memory address,
829 // calling `u128::from_be_bytes(buf)` interprets it as the **most significant byte (MSB)**,
830 // and `buf[15]` as the **least significant byte (LSB)**.
831 //
832 // This is the core principle of **big-endian decoding**:
833 // - Byte at index 0 maps to bits 127..120 (highest)
834 // - Byte at index 1 maps to bits 119..112
835 // - ...
836 // - Byte at index 15 maps to bits 7..0 (lowest)
837 //
838 // So even though memory layout goes from low to high (left to right),
839 // big-endian treats the **first byte** as highest in value.
840 //
841 // This guarantees that comparing two `u128` keys is equivalent to lexicographically
842 // comparing the original inline bytes, followed by length.
843 u128::from_be_bytes(buf)
844 }
845}
846
847impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
848 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
849 write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
850 print_long_array(self, f, |array, index, f| {
851 std::fmt::Debug::fmt(&array.value(index), f)
852 })?;
853 write!(f, "]")
854 }
855}
856
857impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
858 fn as_any(&self) -> &dyn Any {
859 self
860 }
861
862 fn to_data(&self) -> ArrayData {
863 self.clone().into()
864 }
865
866 fn into_data(self) -> ArrayData {
867 self.into()
868 }
869
870 fn data_type(&self) -> &DataType {
871 &self.data_type
872 }
873
874 fn slice(&self, offset: usize, length: usize) -> ArrayRef {
875 Arc::new(self.slice(offset, length))
876 }
877
878 fn len(&self) -> usize {
879 self.views.len()
880 }
881
882 fn is_empty(&self) -> bool {
883 self.views.is_empty()
884 }
885
886 fn shrink_to_fit(&mut self) {
887 self.views.shrink_to_fit();
888 self.buffers.iter_mut().for_each(|b| b.shrink_to_fit());
889 self.buffers.shrink_to_fit();
890 if let Some(nulls) = &mut self.nulls {
891 nulls.shrink_to_fit();
892 }
893 }
894
895 fn offset(&self) -> usize {
896 0
897 }
898
899 fn nulls(&self) -> Option<&NullBuffer> {
900 self.nulls.as_ref()
901 }
902
903 fn logical_null_count(&self) -> usize {
904 // More efficient that the default implementation
905 self.null_count()
906 }
907
908 fn get_buffer_memory_size(&self) -> usize {
909 let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
910 sum += self.views.inner().capacity();
911 if let Some(x) = &self.nulls {
912 sum += x.buffer().capacity()
913 }
914 sum
915 }
916
917 fn get_array_memory_size(&self) -> usize {
918 std::mem::size_of::<Self>() + self.get_buffer_memory_size()
919 }
920}
921
922impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
923 type Item = &'a T::Native;
924
925 fn value(&self, index: usize) -> Self::Item {
926 GenericByteViewArray::value(self, index)
927 }
928
929 unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
930 unsafe { GenericByteViewArray::value_unchecked(self, index) }
931 }
932}
933
934impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
935 type Item = Option<&'a T::Native>;
936 type IntoIter = ArrayIter<Self>;
937
938 fn into_iter(self) -> Self::IntoIter {
939 ArrayIter::new(self)
940 }
941}
942
943impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
944 fn from(value: ArrayData) -> Self {
945 let views = value.buffers()[0].clone();
946 let views = ScalarBuffer::new(views, value.offset(), value.len());
947 let buffers = value.buffers()[1..].to_vec();
948 Self {
949 data_type: T::DATA_TYPE,
950 views,
951 buffers,
952 nulls: value.nulls().cloned(),
953 phantom: Default::default(),
954 }
955 }
956}
957
958/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
959///
960/// For example this method can convert a [`StringArray`] to a
961/// [`StringViewArray`].
962///
963/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
964/// is built without copying the underlying string data (views are created
965/// directly into the existing buffer)
966///
967/// [`StringArray`]: crate::StringArray
968impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
969where
970 FROM: ByteArrayType,
971 FROM::Offset: OffsetSizeTrait + ToPrimitive,
972 V: ByteViewType<Native = FROM::Native>,
973{
974 fn from(byte_array: &GenericByteArray<FROM>) -> Self {
975 let offsets = byte_array.offsets();
976
977 let can_reuse_buffer = match offsets.last() {
978 Some(offset) => offset.as_usize() < u32::MAX as usize,
979 None => true,
980 };
981
982 if can_reuse_buffer {
983 // build views directly pointing to the existing buffer
984 let len = byte_array.len();
985 let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
986 let str_values_buf = byte_array.values().clone();
987 let block = views_builder.append_block(str_values_buf);
988 for (i, w) in offsets.windows(2).enumerate() {
989 let offset = w[0].as_usize();
990 let end = w[1].as_usize();
991 let length = end - offset;
992
993 if byte_array.is_null(i) {
994 views_builder.append_null();
995 } else {
996 // Safety: the input was a valid array so it valid UTF8 (if string). And
997 // all offsets were valid
998 unsafe {
999 views_builder.append_view_unchecked(block, offset as u32, length as u32)
1000 }
1001 }
1002 }
1003 assert_eq!(views_builder.len(), len);
1004 views_builder.finish()
1005 } else {
1006 // Otherwise, create a new buffer for large strings
1007 // TODO: the original buffer could still be used
1008 // by making multiple slices of u32::MAX length
1009 GenericByteViewArray::<V>::from_iter(byte_array.iter())
1010 }
1011 }
1012}
1013
1014impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
1015 fn from(mut array: GenericByteViewArray<T>) -> Self {
1016 let len = array.len();
1017 array.buffers.insert(0, array.views.into_inner());
1018 let builder = ArrayDataBuilder::new(T::DATA_TYPE)
1019 .len(len)
1020 .buffers(array.buffers)
1021 .nulls(array.nulls);
1022
1023 unsafe { builder.build_unchecked() }
1024 }
1025}
1026
1027impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
1028where
1029 Ptr: AsRef<T::Native> + 'a,
1030 T: ByteViewType + ?Sized,
1031{
1032 fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
1033 iter.into_iter()
1034 .map(|o| o.as_ref().map(|p| p.as_ref()))
1035 .collect()
1036 }
1037}
1038
1039impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
1040where
1041 Ptr: AsRef<T::Native>,
1042{
1043 fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
1044 let iter = iter.into_iter();
1045 let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
1046 builder.extend(iter);
1047 builder.finish()
1048 }
1049}
1050
1051/// A [`GenericByteViewArray`] of `[u8]`
1052///
1053/// See [`GenericByteViewArray`] for format and layout details.
1054///
1055/// # Example
1056/// ```
1057/// use arrow_array::BinaryViewArray;
1058/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
1059/// assert_eq!(array.value(0), b"hello");
1060/// assert_eq!(array.value(3), b"large payload over 12 bytes");
1061/// ```
1062pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
1063
1064impl BinaryViewArray {
1065 /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1066 /// If items not utf8 data, validate will fail and error returned.
1067 pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
1068 StringViewType::validate(self.views(), self.data_buffers())?;
1069 unsafe { Ok(self.to_string_view_unchecked()) }
1070 }
1071
1072 /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1073 /// # Safety
1074 /// Caller is responsible for ensuring that items in array are utf8 data.
1075 pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
1076 unsafe { StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1077 }
1078}
1079
1080impl From<Vec<&[u8]>> for BinaryViewArray {
1081 fn from(v: Vec<&[u8]>) -> Self {
1082 Self::from_iter_values(v)
1083 }
1084}
1085
1086impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
1087 fn from(v: Vec<Option<&[u8]>>) -> Self {
1088 v.into_iter().collect()
1089 }
1090}
1091
1092/// A [`GenericByteViewArray`] that stores utf8 data
1093///
1094/// See [`GenericByteViewArray`] for format and layout details.
1095///
1096/// # Example
1097/// ```
1098/// use arrow_array::StringViewArray;
1099/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
1100/// assert_eq!(array.value(0), "hello");
1101/// assert_eq!(array.value(3), "large payload over 12 bytes");
1102/// ```
1103pub type StringViewArray = GenericByteViewArray<StringViewType>;
1104
1105impl StringViewArray {
1106 /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
1107 pub fn to_binary_view(self) -> BinaryViewArray {
1108 unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1109 }
1110
1111 /// Returns true if all data within this array is ASCII
1112 pub fn is_ascii(&self) -> bool {
1113 // Alternative (but incorrect): directly check the underlying buffers
1114 // (1) Our string view might be sparse, i.e., a subset of the buffers,
1115 // so even if the buffer is not ascii, we can still be ascii.
1116 // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
1117 // This means that this operation is quite expensive, shall we cache the result?
1118 // i.e. track `is_ascii` in the builder.
1119 self.iter().all(|v| match v {
1120 Some(v) => v.is_ascii(),
1121 None => true,
1122 })
1123 }
1124}
1125
1126impl From<Vec<&str>> for StringViewArray {
1127 fn from(v: Vec<&str>) -> Self {
1128 Self::from_iter_values(v)
1129 }
1130}
1131
1132impl From<Vec<Option<&str>>> for StringViewArray {
1133 fn from(v: Vec<Option<&str>>) -> Self {
1134 v.into_iter().collect()
1135 }
1136}
1137
1138impl From<Vec<String>> for StringViewArray {
1139 fn from(v: Vec<String>) -> Self {
1140 Self::from_iter_values(v)
1141 }
1142}
1143
1144impl From<Vec<Option<String>>> for StringViewArray {
1145 fn from(v: Vec<Option<String>>) -> Self {
1146 v.into_iter().collect()
1147 }
1148}
1149
1150#[cfg(test)]
1151mod tests {
1152 use crate::builder::{BinaryViewBuilder, StringViewBuilder};
1153 use crate::types::BinaryViewType;
1154 use crate::{
1155 Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
1156 };
1157 use arrow_buffer::{Buffer, ScalarBuffer};
1158 use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
1159 use rand::prelude::StdRng;
1160 use rand::{Rng, SeedableRng};
1161
1162 const BLOCK_SIZE: u32 = 8;
1163
1164 #[test]
1165 fn try_new_string() {
1166 let array = StringViewArray::from_iter_values(vec![
1167 "hello",
1168 "world",
1169 "lulu",
1170 "large payload over 12 bytes",
1171 ]);
1172 assert_eq!(array.value(0), "hello");
1173 assert_eq!(array.value(3), "large payload over 12 bytes");
1174 }
1175
1176 #[test]
1177 fn try_new_binary() {
1178 let array = BinaryViewArray::from_iter_values(vec![
1179 b"hello".as_slice(),
1180 b"world".as_slice(),
1181 b"lulu".as_slice(),
1182 b"large payload over 12 bytes".as_slice(),
1183 ]);
1184 assert_eq!(array.value(0), b"hello");
1185 assert_eq!(array.value(3), b"large payload over 12 bytes");
1186 }
1187
1188 #[test]
1189 fn try_new_empty_string() {
1190 // test empty array
1191 let array = {
1192 let mut builder = StringViewBuilder::new();
1193 builder.finish()
1194 };
1195 assert!(array.is_empty());
1196 }
1197
1198 #[test]
1199 fn try_new_empty_binary() {
1200 // test empty array
1201 let array = {
1202 let mut builder = BinaryViewBuilder::new();
1203 builder.finish()
1204 };
1205 assert!(array.is_empty());
1206 }
1207
1208 #[test]
1209 fn test_append_string() {
1210 // test builder append
1211 let array = {
1212 let mut builder = StringViewBuilder::new();
1213 builder.append_value("hello");
1214 builder.append_null();
1215 builder.append_option(Some("large payload over 12 bytes"));
1216 builder.finish()
1217 };
1218 assert_eq!(array.value(0), "hello");
1219 assert!(array.is_null(1));
1220 assert_eq!(array.value(2), "large payload over 12 bytes");
1221 }
1222
1223 #[test]
1224 fn test_append_binary() {
1225 // test builder append
1226 let array = {
1227 let mut builder = BinaryViewBuilder::new();
1228 builder.append_value(b"hello");
1229 builder.append_null();
1230 builder.append_option(Some(b"large payload over 12 bytes"));
1231 builder.finish()
1232 };
1233 assert_eq!(array.value(0), b"hello");
1234 assert!(array.is_null(1));
1235 assert_eq!(array.value(2), b"large payload over 12 bytes");
1236 }
1237
1238 #[test]
1239 fn test_in_progress_recreation() {
1240 let array = {
1241 // make a builder with small block size.
1242 let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
1243 builder.append_value("large payload over 12 bytes");
1244 builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
1245 builder.finish()
1246 };
1247 assert_eq!(array.value(0), "large payload over 12 bytes");
1248 assert_eq!(
1249 array.value(1),
1250 "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"
1251 );
1252 assert_eq!(2, array.buffers.len());
1253 }
1254
1255 #[test]
1256 #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
1257 fn new_with_invalid_view_data() {
1258 let v = "large payload over 12 bytes";
1259 let view = ByteView::new(13, &v.as_bytes()[0..4])
1260 .with_buffer_index(3)
1261 .with_offset(1);
1262 let views = ScalarBuffer::from(vec![view.into()]);
1263 let buffers = vec![Buffer::from_slice_ref(v)];
1264 StringViewArray::new(views, buffers, None);
1265 }
1266
1267 #[test]
1268 #[should_panic(
1269 expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
1270 )]
1271 fn new_with_invalid_utf8_data() {
1272 let v: Vec<u8> = vec![
1273 // invalid UTF8
1274 0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
1275 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1276 ];
1277 let view = ByteView::new(v.len() as u32, &v[0..4]);
1278 let views = ScalarBuffer::from(vec![view.into()]);
1279 let buffers = vec![Buffer::from_slice_ref(v)];
1280 StringViewArray::new(views, buffers, None);
1281 }
1282
1283 #[test]
1284 #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
1285 fn new_with_invalid_zero_padding() {
1286 let mut data = [0; 12];
1287 data[0] = b'H';
1288 data[11] = 1; // no zero padding
1289
1290 let mut view_buffer = [0; 16];
1291 view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
1292 view_buffer[4..].copy_from_slice(&data);
1293
1294 let view = ByteView::from(u128::from_le_bytes(view_buffer));
1295 let views = ScalarBuffer::from(vec![view.into()]);
1296 let buffers = vec![];
1297 StringViewArray::new(views, buffers, None);
1298 }
1299
1300 #[test]
1301 #[should_panic(expected = "Mismatch between embedded prefix and data")]
1302 fn test_mismatch_between_embedded_prefix_and_data() {
1303 let input_str_1 = "Hello, Rustaceans!";
1304 let input_str_2 = "Hallo, Rustaceans!";
1305 let length = input_str_1.len() as u32;
1306 assert!(input_str_1.len() > 12);
1307
1308 let mut view_buffer = [0; 16];
1309 view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
1310 view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
1311 view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
1312 view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
1313 let view = ByteView::from(u128::from_le_bytes(view_buffer));
1314 let views = ScalarBuffer::from(vec![view.into()]);
1315 let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
1316
1317 StringViewArray::new(views, buffers, None);
1318 }
1319
1320 #[test]
1321 fn test_gc() {
1322 let test_data = [
1323 Some("longer than 12 bytes"),
1324 Some("short"),
1325 Some("t"),
1326 Some("longer than 12 bytes"),
1327 None,
1328 Some("short"),
1329 ];
1330
1331 let array = {
1332 let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1333 test_data.into_iter().for_each(|v| builder.append_option(v));
1334 builder.finish()
1335 };
1336 assert!(array.buffers.len() > 1);
1337
1338 fn check_gc(to_test: &StringViewArray) {
1339 let gc = to_test.gc();
1340 assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1341
1342 to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1343 assert_eq!(a, b);
1344 });
1345 assert_eq!(to_test.len(), gc.len());
1346 }
1347
1348 check_gc(&array);
1349 check_gc(&array.slice(1, 3));
1350 check_gc(&array.slice(2, 1));
1351 check_gc(&array.slice(2, 2));
1352 check_gc(&array.slice(3, 1));
1353 }
1354
1355 /// 1) Empty array: no elements, expect gc to return empty with no data buffers
1356 #[test]
1357 fn test_gc_empty_array() {
1358 let array = StringViewBuilder::new()
1359 .with_fixed_block_size(BLOCK_SIZE)
1360 .finish();
1361 let gced = array.gc();
1362 // length and null count remain zero
1363 assert_eq!(gced.len(), 0);
1364 assert_eq!(gced.null_count(), 0);
1365 // no underlying data buffers should be allocated
1366 assert!(
1367 gced.data_buffers().is_empty(),
1368 "Expected no data buffers for empty array"
1369 );
1370 }
1371
1372 /// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values
1373 #[test]
1374 fn test_gc_all_inline() {
1375 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1376 // append many short strings, each exactly INLINE_LEN long
1377 for _ in 0..100 {
1378 let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize);
1379 builder.append_option(Some(&s));
1380 }
1381 let array = builder.finish();
1382 let gced = array.gc();
1383 // Since all views fit inline, data buffer is empty
1384 assert_eq!(
1385 gced.data_buffers().len(),
1386 0,
1387 "Should have no data buffers for inline values"
1388 );
1389 assert_eq!(gced.len(), 100);
1390 // verify element-wise equality
1391 array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1392 assert_eq!(orig, got, "Inline value mismatch after gc");
1393 });
1394 }
1395
1396 /// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer
1397 #[test]
1398 fn test_gc_all_large() {
1399 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1400 let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5);
1401 // append multiple large strings
1402 for _ in 0..50 {
1403 builder.append_option(Some(&large_str));
1404 }
1405 let array = builder.finish();
1406 let gced = array.gc();
1407 // New data buffers should be populated (one or more blocks)
1408 assert!(
1409 !gced.data_buffers().is_empty(),
1410 "Expected data buffers for large values"
1411 );
1412 assert_eq!(gced.len(), 50);
1413 // verify that every large string emerges unchanged
1414 array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1415 assert_eq!(orig, got, "Large view mismatch after gc");
1416 });
1417 }
1418
1419 /// 4) All null elements: ensure null bitmap handling path is correct
1420 #[test]
1421 fn test_gc_all_nulls() {
1422 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1423 for _ in 0..20 {
1424 builder.append_null();
1425 }
1426 let array = builder.finish();
1427 let gced = array.gc();
1428 // length and null count match
1429 assert_eq!(gced.len(), 20);
1430 assert_eq!(gced.null_count(), 20);
1431 // data buffers remain empty for null-only array
1432 assert!(
1433 gced.data_buffers().is_empty(),
1434 "No data should be stored for nulls"
1435 );
1436 }
1437
1438 /// 5) Random mix of inline, large, and null values with slicing tests
1439 #[test]
1440 fn test_gc_random_mixed_and_slices() {
1441 let mut rng = StdRng::seed_from_u64(42);
1442 let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1443 // Keep a Vec of original Option<String> for later comparison
1444 let mut original: Vec<Option<String>> = Vec::new();
1445
1446 for _ in 0..200 {
1447 if rng.random_bool(0.1) {
1448 // 10% nulls
1449 builder.append_null();
1450 original.push(None);
1451 } else {
1452 // random length between 0 and twice the inline limit
1453 let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2));
1454 let s: String = "A".repeat(len as usize);
1455 builder.append_option(Some(&s));
1456 original.push(Some(s));
1457 }
1458 }
1459
1460 let array = builder.finish();
1461 // Test multiple slice ranges to ensure offset logic is correct
1462 for (offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] {
1463 let sliced = array.slice(*offset, *slice_len);
1464 let gced = sliced.gc();
1465 // Build expected slice of Option<&str>
1466 let expected: Vec<Option<&str>> = original[*offset..(*offset + *slice_len)]
1467 .iter()
1468 .map(|opt| opt.as_deref())
1469 .collect();
1470
1471 assert_eq!(gced.len(), *slice_len, "Slice length mismatch");
1472 // Compare element-wise
1473 gced.iter().zip(expected.iter()).for_each(|(got, expect)| {
1474 assert_eq!(got, *expect, "Value mismatch in mixed slice after gc");
1475 });
1476 }
1477 }
1478
1479 #[test]
1480 #[cfg_attr(miri, ignore)] // Takes too long
1481 fn test_gc_huge_array() {
1482 // Construct multiple 128 MiB BinaryView entries so total > 4 GiB
1483 let block_len: usize = 128 * 1024 * 1024; // 128 MiB per view
1484 let num_views: usize = 36;
1485
1486 // Create a single 128 MiB data block with a simple byte pattern
1487 let buffer = Buffer::from_vec(vec![0xAB; block_len]);
1488 let buffer2 = Buffer::from_vec(vec![0xFF; block_len]);
1489
1490 // Append this block and then add many views pointing to it
1491 let mut builder = BinaryViewBuilder::new();
1492 let block_id = builder.append_block(buffer);
1493 for _ in 0..num_views / 2 {
1494 builder
1495 .try_append_view(block_id, 0, block_len as u32)
1496 .expect("append view into 128MiB block");
1497 }
1498 let block_id2 = builder.append_block(buffer2);
1499 for _ in 0..num_views / 2 {
1500 builder
1501 .try_append_view(block_id2, 0, block_len as u32)
1502 .expect("append view into 128MiB block");
1503 }
1504
1505 let array = builder.finish();
1506 let total = array.total_buffer_bytes_used();
1507 assert!(
1508 total > u32::MAX as usize,
1509 "Expected total non-inline bytes to exceed 4 GiB, got {}",
1510 total
1511 );
1512
1513 // Run gc and verify correctness
1514 let gced = array.gc();
1515 assert_eq!(gced.len(), num_views, "Length mismatch after gc");
1516 assert_eq!(gced.null_count(), 0, "Null count mismatch after gc");
1517 assert_ne!(
1518 gced.data_buffers().len(),
1519 1,
1520 "gc with huge buffer should not consolidate data into a single buffer"
1521 );
1522
1523 // Element-wise equality check across the entire array
1524 array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1525 assert_eq!(orig, got, "Value mismatch after gc on huge array");
1526 });
1527 }
1528
1529 #[test]
1530 fn test_eq() {
1531 let test_data = [
1532 Some("longer than 12 bytes"),
1533 None,
1534 Some("short"),
1535 Some("again, this is longer than 12 bytes"),
1536 ];
1537
1538 let array1 = {
1539 let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
1540 test_data.into_iter().for_each(|v| builder.append_option(v));
1541 builder.finish()
1542 };
1543 let array2 = {
1544 // create a new array with the same data but different layout
1545 let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
1546 test_data.into_iter().for_each(|v| builder.append_option(v));
1547 builder.finish()
1548 };
1549 assert_eq!(array1, array1.clone());
1550 assert_eq!(array2, array2.clone());
1551 assert_eq!(array1, array2);
1552 }
1553
1554 /// Integration tests for `inline_key_fast` covering:
1555 ///
1556 /// 1. Monotonic ordering across increasing lengths and lexical variations.
1557 /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence.
1558 ///
1559 /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why
1560 /// the length field is required even when all inline bytes fit in 12 bytes.
1561 ///
1562 /// The test includes strings that verify correct byte order (prevent reversal bugs),
1563 /// and length-based tie-breaking in the composite key.
1564 ///
1565 /// The test confirms that `inline_key_fast` produces keys which sort consistently
1566 /// with the expected lexicographical order of the raw byte arrays.
1567 #[test]
1568 fn test_inline_key_fast_various_lengths_and_lexical() {
1569 /// Helper to create a raw u128 value representing an inline ByteView:
1570 /// - `length`: number of meaningful bytes (must be ≤ 12)
1571 /// - `data`: the actual inline data bytes
1572 ///
1573 /// The first 4 bytes encode length in little-endian,
1574 /// the following 12 bytes contain the inline string data (unpadded).
1575 fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
1576 assert!(length as usize <= 12, "Inline length must be ≤ 12");
1577 assert!(
1578 data.len() == length as usize,
1579 "Data length must match `length`"
1580 );
1581
1582 let mut raw_bytes = [0u8; 16];
1583 raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian
1584 raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data
1585 u128::from_le_bytes(raw_bytes)
1586 }
1587
1588 // Test inputs: various lengths and lexical orders,
1589 // plus special cases for byte order and length tie-breaking
1590 let test_inputs: Vec<&[u8]> = vec![
1591 b"a",
1592 b"aa",
1593 b"aaa",
1594 b"aab",
1595 b"abcd",
1596 b"abcde",
1597 b"abcdef",
1598 b"abcdefg",
1599 b"abcdefgh",
1600 b"abcdefghi",
1601 b"abcdefghij",
1602 b"abcdefghijk",
1603 b"abcdefghijkl",
1604 // Tests for byte-order reversal bug:
1605 // Without the fix, "backend one" would compare as "eno dnekcab",
1606 // causing incorrect sort order relative to "backend two".
1607 b"backend one",
1608 b"backend two",
1609 // Tests length-tiebreaker logic:
1610 // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data,
1611 // so only the length differentiates their ordering.
1612 b"bar",
1613 b"bar\0",
1614 // Additional lexical and length tie-breaking cases with same prefix, in correct lex order:
1615 b"than12Byt",
1616 b"than12Bytes",
1617 b"than12Bytes\0",
1618 b"than12Bytesx",
1619 b"than12Bytex",
1620 b"than12Bytez",
1621 // Additional lexical tests
1622 b"xyy",
1623 b"xyz",
1624 b"xza",
1625 ];
1626
1627 // Create a GenericBinaryArray for cross-comparison of lex order
1628 let array: GenericBinaryArray<i32> =
1629 GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>());
1630
1631 for i in 0..array.len() - 1 {
1632 let v1 = array.value(i);
1633 let v2 = array.value(i + 1);
1634
1635 // Assert the array's natural lexical ordering is correct
1636 assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");
1637
1638 // Assert the keys produced by inline_key_fast reflect the same ordering
1639 let key1 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1640 v1.len() as u32,
1641 v1,
1642 ));
1643 let key2 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1644 v2.len() as u32,
1645 v2,
1646 ));
1647
1648 assert!(
1649 key1 < key2,
1650 "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
1651 );
1652 }
1653 }
1654}