arrow_array/builder/
generic_bytes_view_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::any::Any;
19use std::marker::PhantomData;
20use std::sync::Arc;
21
22use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer};
23use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
24use arrow_schema::ArrowError;
25use hashbrown::HashTable;
26use hashbrown::hash_table::Entry;
27
28use crate::builder::{ArrayBuilder, BinaryLikeArrayBuilder, StringLikeArrayBuilder};
29use crate::types::bytes::ByteArrayNativeType;
30use crate::types::{BinaryViewType, ByteViewType, StringViewType};
31use crate::{Array, ArrayRef, GenericByteViewArray};
32
33const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB
34const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB
35
36enum BlockSizeGrowthStrategy {
37    Fixed { size: u32 },
38    Exponential { current_size: u32 },
39}
40
41impl BlockSizeGrowthStrategy {
42    fn next_size(&mut self) -> u32 {
43        match self {
44            Self::Fixed { size } => *size,
45            Self::Exponential { current_size } => {
46                if *current_size < MAX_BLOCK_SIZE {
47                    // we have fixed start/end block sizes, so we can't overflow
48                    *current_size = current_size.saturating_mul(2);
49                    *current_size
50                } else {
51                    MAX_BLOCK_SIZE
52                }
53            }
54        }
55    }
56}
57
58/// A builder for [`GenericByteViewArray`]
59///
60/// A [`GenericByteViewArray`] consists of a list of data blocks containing string data,
61/// and a list of views into those buffers.
62///
63/// See examples on [`StringViewBuilder`] and [`BinaryViewBuilder`]
64///
65/// This builder can be used in two ways
66///
67/// # Append Values
68///
69/// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable
70/// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`]
71/// writes values larger than [`MAX_INLINE_VIEW_LEN`] bytes to the current in-progress block, with values smaller
72/// than [`MAX_INLINE_VIEW_LEN`] bytes inlined into the views. If a value is appended that will not fit in the
73/// in-progress block, it will be closed, and a new block of sufficient size allocated
74///
75/// # Append Views
76///
77/// Some use-cases may wish to reuse an existing allocation containing string data, for example,
78/// when parsing data from a parquet data page. In such a case entire blocks can be appended
79/// using [`GenericByteViewBuilder::append_block`] and then views into this block appended
80/// using [`GenericByteViewBuilder::try_append_view`]
81pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
82    views_buffer: Vec<u128>,
83    null_buffer_builder: NullBufferBuilder,
84    completed: Vec<Buffer>,
85    in_progress: Vec<u8>,
86    block_size: BlockSizeGrowthStrategy,
87    /// Some if deduplicating strings
88    /// map `<string hash> -> <index to the views>`
89    string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
90    max_deduplication_len: Option<u32>,
91    phantom: PhantomData<T>,
92}
93
94impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
95    /// Creates a new [`GenericByteViewBuilder`].
96    pub fn new() -> Self {
97        Self::with_capacity(1024)
98    }
99
100    /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values.
101    pub fn with_capacity(capacity: usize) -> Self {
102        Self {
103            views_buffer: Vec::with_capacity(capacity),
104            null_buffer_builder: NullBufferBuilder::new(capacity),
105            completed: vec![],
106            in_progress: vec![],
107            block_size: BlockSizeGrowthStrategy::Exponential {
108                current_size: STARTING_BLOCK_SIZE,
109            },
110            string_tracker: None,
111            max_deduplication_len: None,
112            phantom: Default::default(),
113        }
114    }
115
116    /// Configure max deduplication length when deduplicating strings while building the array.
117    /// Default is None.
118    ///
119    /// When [`Self::with_deduplicate_strings`] is enabled, the builder attempts to deduplicate
120    /// any strings longer than 12 bytes. However, since it takes time proportional to the length
121    /// of the string to deduplicate, setting this option limits the CPU overhead for this option.  
122    pub fn with_max_deduplication_len(self, max_deduplication_len: u32) -> Self {
123        debug_assert!(
124            max_deduplication_len > 0,
125            "max_deduplication_len must be greater than 0"
126        );
127        Self {
128            max_deduplication_len: Some(max_deduplication_len),
129            ..self
130        }
131    }
132
133    /// Set a fixed buffer size for variable length strings
134    ///
135    /// The block size is the size of the buffer used to store values greater
136    /// than [`MAX_INLINE_VIEW_LEN`] bytes. The builder allocates new buffers when the current
137    /// buffer is full.
138    ///
139    /// By default the builder balances buffer size and buffer count by
140    /// growing buffer size exponentially from 8KB up to 2MB. The
141    /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB.
142    ///
143    /// If this method is used, any new buffers allocated are
144    /// exactly this size. This can be useful for advanced users
145    /// that want to control the memory usage and buffer count.
146    ///
147    /// See <https://github.com/apache/arrow-rs/issues/6094> for more details on the implications.
148    pub fn with_fixed_block_size(self, block_size: u32) -> Self {
149        debug_assert!(block_size > 0, "Block size must be greater than 0");
150        Self {
151            block_size: BlockSizeGrowthStrategy::Fixed { size: block_size },
152            ..self
153        }
154    }
155
156    /// Deduplicate strings while building the array
157    ///
158    /// This will potentially decrease the memory usage if the array have repeated strings
159    /// It will also increase the time to build the array as it needs to hash the strings
160    pub fn with_deduplicate_strings(self) -> Self {
161        Self {
162            string_tracker: Some((
163                HashTable::with_capacity(self.views_buffer.capacity()),
164                Default::default(),
165            )),
166            ..self
167        }
168    }
169
170    /// Append a new data block returning the new block offset
171    ///
172    /// Note: this will first flush any in-progress block
173    ///
174    /// This allows appending views from blocks added using [`Self::append_block`]. See
175    /// [`Self::append_value`] for appending individual values
176    ///
177    /// ```
178    /// # use arrow_array::builder::StringViewBuilder;
179    /// let mut builder = StringViewBuilder::new();
180    ///
181    /// let block = builder.append_block(b"helloworldbingobongo".into());
182    ///
183    /// builder.try_append_view(block, 0, 5).unwrap();
184    /// builder.try_append_view(block, 5, 5).unwrap();
185    /// builder.try_append_view(block, 10, 5).unwrap();
186    /// builder.try_append_view(block, 15, 5).unwrap();
187    /// builder.try_append_view(block, 0, 15).unwrap();
188    /// let array = builder.finish();
189    ///
190    /// let actual: Vec<_> = array.iter().flatten().collect();
191    /// let expected = &["hello", "world", "bingo", "bongo", "helloworldbingo"];
192    /// assert_eq!(actual, expected);
193    /// ```
194    pub fn append_block(&mut self, buffer: Buffer) -> u32 {
195        assert!(buffer.len() < u32::MAX as usize);
196
197        self.flush_in_progress();
198        let offset = self.completed.len();
199        self.push_completed(buffer);
200        offset as u32
201    }
202
203    /// Append a view of the given `block`, `offset` and `length`
204    ///
205    /// # Safety
206    /// (1) The block must have been added using [`Self::append_block`]
207    /// (2) The range `offset..offset+length` must be within the bounds of the block
208    /// (3) The data in the block must be valid of type `T`
209    pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
210        let b = unsafe { self.completed.get_unchecked(block as usize) };
211        let start = offset as usize;
212        let end = start.saturating_add(len as usize);
213        let b = unsafe { b.get_unchecked(start..end) };
214
215        let view = make_view(b, block, offset);
216        self.views_buffer.push(view);
217        self.null_buffer_builder.append_non_null();
218    }
219
220    /// Appends an array to the builder.
221    /// This will flush any in-progress block and append the data buffers
222    /// and add the (adapted) views.
223    pub fn append_array(&mut self, array: &GenericByteViewArray<T>) {
224        self.flush_in_progress();
225        // keep original views if this array is the first to be added or if there are no data buffers (all inline views)
226        let keep_views = self.completed.is_empty() || array.data_buffers().is_empty();
227        let starting_buffer = self.completed.len() as u32;
228
229        self.completed.extend(array.data_buffers().iter().cloned());
230
231        if keep_views {
232            self.views_buffer.extend_from_slice(array.views());
233        } else {
234            self.views_buffer.extend(array.views().iter().map(|v| {
235                let mut byte_view = ByteView::from(*v);
236                if byte_view.length > MAX_INLINE_VIEW_LEN {
237                    // Small views (<=12 bytes) are inlined, so only need to update large views
238                    byte_view.buffer_index += starting_buffer;
239                };
240
241                byte_view.as_u128()
242            }));
243        }
244
245        if let Some(null_buffer) = array.nulls() {
246            self.null_buffer_builder.append_buffer(null_buffer);
247        } else {
248            self.null_buffer_builder.append_n_non_nulls(array.len());
249        }
250    }
251
252    /// Try to append a view of the given `block`, `offset` and `length`
253    ///
254    /// See [`Self::append_block`]
255    pub fn try_append_view(&mut self, block: u32, offset: u32, len: u32) -> Result<(), ArrowError> {
256        let b = self.completed.get(block as usize).ok_or_else(|| {
257            ArrowError::InvalidArgumentError(format!("No block found with index {block}"))
258        })?;
259        let start = offset as usize;
260        let end = start.saturating_add(len as usize);
261
262        let b = b.get(start..end).ok_or_else(|| {
263            ArrowError::InvalidArgumentError(format!(
264                "Range {start}..{end} out of bounds for block of length {}",
265                b.len()
266            ))
267        })?;
268
269        if T::Native::from_bytes_checked(b).is_none() {
270            return Err(ArrowError::InvalidArgumentError(
271                "Invalid view data".to_string(),
272            ));
273        }
274
275        unsafe {
276            self.append_view_unchecked(block, offset, len);
277        }
278        Ok(())
279    }
280
281    /// Flushes the in progress block if any
282    #[inline]
283    fn flush_in_progress(&mut self) {
284        if !self.in_progress.is_empty() {
285            let f = Buffer::from_vec(std::mem::take(&mut self.in_progress));
286            self.push_completed(f)
287        }
288    }
289
290    /// Append a block to `self.completed`, checking for overflow
291    #[inline]
292    fn push_completed(&mut self, block: Buffer) {
293        assert!(block.len() < u32::MAX as usize, "Block too large");
294        assert!(self.completed.len() < u32::MAX as usize, "Too many blocks");
295        self.completed.push(block);
296    }
297
298    /// Returns the value at the given index
299    /// Useful if we want to know what value has been inserted to the builder
300    /// The index has to be smaller than `self.len()`, otherwise it will panic
301    pub fn get_value(&self, index: usize) -> &[u8] {
302        let view = self.views_buffer.as_slice().get(index).unwrap();
303        let len = *view as u32;
304        if len <= MAX_INLINE_VIEW_LEN {
305            // # Safety
306            // The view is valid from the builder
307            unsafe { GenericByteViewArray::<T>::inline_value(view, len as usize) }
308        } else {
309            let view = ByteView::from(*view);
310            if view.buffer_index < self.completed.len() as u32 {
311                let block = &self.completed[view.buffer_index as usize];
312                &block[view.offset as usize..view.offset as usize + view.length as usize]
313            } else {
314                &self.in_progress[view.offset as usize..view.offset as usize + view.length as usize]
315            }
316        }
317    }
318
319    /// Appends a value into the builder
320    ///
321    /// # Panics
322    ///
323    /// Panics if
324    /// - String buffer count exceeds `u32::MAX`
325    /// - String length exceeds `u32::MAX`
326    #[inline]
327    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
328        self.try_append_value(value).unwrap()
329    }
330
331    /// Appends a value into the builder
332    ///
333    /// # Errors
334    ///
335    /// Returns an error if:
336    /// - String buffer count exceeds `u32::MAX`
337    /// - String length exceeds `u32::MAX`
338    #[inline]
339    pub fn try_append_value(&mut self, value: impl AsRef<T::Native>) -> Result<(), ArrowError> {
340        let v: &[u8] = value.as_ref().as_ref();
341        let length: u32 = v.len().try_into().map_err(|_| {
342            ArrowError::InvalidArgumentError(format!("String length {} exceeds u32::MAX", v.len()))
343        })?;
344
345        if length <= MAX_INLINE_VIEW_LEN {
346            let mut view_buffer = [0; 16];
347            view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
348            view_buffer[4..4 + v.len()].copy_from_slice(v);
349            self.views_buffer.push(u128::from_le_bytes(view_buffer));
350            self.null_buffer_builder.append_non_null();
351            return Ok(());
352        }
353
354        // Deduplication if:
355        // (1) deduplication is enabled.
356        // (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len`
357        let can_deduplicate = self.string_tracker.is_some()
358            && self
359                .max_deduplication_len
360                .map(|max_length| length <= max_length)
361                .unwrap_or(true);
362        if can_deduplicate {
363            if let Some((mut ht, hasher)) = self.string_tracker.take() {
364                let hash_val = hasher.hash_one(v);
365                let hasher_fn = |v: &_| hasher.hash_one(v);
366
367                let entry = ht.entry(
368                    hash_val,
369                    |idx| {
370                        let stored_value = self.get_value(*idx);
371                        v == stored_value
372                    },
373                    hasher_fn,
374                );
375                match entry {
376                    Entry::Occupied(occupied) => {
377                        // If the string already exists, we will directly use the view
378                        let idx = occupied.get();
379                        self.views_buffer.push(self.views_buffer[*idx]);
380                        self.null_buffer_builder.append_non_null();
381                        self.string_tracker = Some((ht, hasher));
382                        return Ok(());
383                    }
384                    Entry::Vacant(vacant) => {
385                        // o.w. we insert the (string hash -> view index)
386                        // the idx is current length of views_builder, as we are inserting a new view
387                        vacant.insert(self.views_buffer.len());
388                    }
389                }
390                self.string_tracker = Some((ht, hasher));
391            }
392        }
393
394        let required_cap = self.in_progress.len() + v.len();
395        if self.in_progress.capacity() < required_cap {
396            self.flush_in_progress();
397            let to_reserve = v.len().max(self.block_size.next_size() as usize);
398            self.in_progress.reserve(to_reserve);
399        };
400
401        let offset = self.in_progress.len() as u32;
402        self.in_progress.extend_from_slice(v);
403
404        let buffer_index: u32 = self.completed.len().try_into().map_err(|_| {
405            ArrowError::InvalidArgumentError(format!(
406                "Buffer count {} exceeds u32::MAX",
407                self.completed.len()
408            ))
409        })?;
410
411        let view = ByteView {
412            length,
413            // This won't panic as we checked the length of prefix earlier.
414            prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
415            buffer_index,
416            offset,
417        };
418        self.views_buffer.push(view.into());
419        self.null_buffer_builder.append_non_null();
420
421        Ok(())
422    }
423
424    /// Append an `Option` value into the builder
425    #[inline]
426    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
427        match value {
428            None => self.append_null(),
429            Some(v) => self.append_value(v),
430        };
431    }
432
433    /// Append the same value `n` times into the builder
434    ///
435    /// This is more efficient than calling [`Self::try_append_value`] `n` times,
436    /// especially when deduplication is enabled, as it only hashes the value once.
437    ///
438    /// # Errors
439    ///
440    /// Returns an error if
441    /// - String buffer count exceeds `u32::MAX`
442    /// - String length exceeds `u32::MAX`
443    ///
444    /// # Example
445    /// ```
446    /// # use arrow_array::builder::StringViewBuilder;
447    /// # use arrow_array::Array;
448    /// let mut builder = StringViewBuilder::new().with_deduplicate_strings();
449    ///
450    /// // Append "hello" 1000 times efficiently
451    /// builder.try_append_value_n("hello", 1000)?;
452    ///
453    /// let array = builder.finish();
454    /// assert_eq!(array.len(), 1000);
455    ///
456    /// // All values are "hello"
457    /// for value in array.iter() {
458    ///     assert_eq!(value, Some("hello"));
459    /// }
460    /// # Ok::<(), arrow_schema::ArrowError>(())
461    /// ```
462    #[inline]
463    pub fn try_append_value_n(
464        &mut self,
465        value: impl AsRef<T::Native>,
466        n: usize,
467    ) -> Result<(), ArrowError> {
468        if n == 0 {
469            return Ok(());
470        }
471        // Process value once (handles deduplication, buffer management, view creation)
472        self.try_append_value(value)?;
473        // Reuse the view (n-1) times
474        let view = *self.views_buffer.last().unwrap();
475        self.views_buffer.extend(std::iter::repeat_n(view, n - 1));
476        self.null_buffer_builder.append_n_non_nulls(n - 1);
477        Ok(())
478    }
479
480    /// Append a null value into the builder
481    #[inline]
482    pub fn append_null(&mut self) {
483        self.null_buffer_builder.append_null();
484        self.views_buffer.push(0);
485    }
486
487    /// Builds the [`GenericByteViewArray`] and reset this builder
488    pub fn finish(&mut self) -> GenericByteViewArray<T> {
489        self.flush_in_progress();
490        let completed = std::mem::take(&mut self.completed);
491        let nulls = self.null_buffer_builder.finish();
492        if let Some((ht, _)) = self.string_tracker.as_mut() {
493            ht.clear();
494        }
495        let views = std::mem::take(&mut self.views_buffer);
496        // SAFETY: valid by construction
497        unsafe { GenericByteViewArray::new_unchecked(views.into(), completed, nulls) }
498    }
499
500    /// Builds the [`GenericByteViewArray`] without resetting the builder
501    pub fn finish_cloned(&self) -> GenericByteViewArray<T> {
502        let mut completed = self.completed.clone();
503        if !self.in_progress.is_empty() {
504            completed.push(Buffer::from_slice_ref(&self.in_progress));
505        }
506        let len = self.views_buffer.len();
507        let views = Buffer::from_slice_ref(self.views_buffer.as_slice());
508        let views = ScalarBuffer::new(views, 0, len);
509        let nulls = self.null_buffer_builder.finish_cloned();
510        // SAFETY: valid by construction
511        unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) }
512    }
513
514    /// Returns the current null buffer as a slice
515    pub fn validity_slice(&self) -> Option<&[u8]> {
516        self.null_buffer_builder.as_slice()
517    }
518
519    /// Return the allocated size of this builder in bytes, useful for memory accounting.
520    pub fn allocated_size(&self) -> usize {
521        let views = self.views_buffer.capacity() * std::mem::size_of::<u128>();
522        let null = self.null_buffer_builder.allocated_size();
523        let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::<usize>();
524        let in_progress = self.in_progress.capacity();
525        let tracker = match &self.string_tracker {
526            Some((ht, _)) => ht.capacity() * std::mem::size_of::<usize>(),
527            None => 0,
528        };
529        buffer_size + in_progress + tracker + views + null
530    }
531}
532
533impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> {
534    fn default() -> Self {
535        Self::new()
536    }
537}
538
539impl<T: ByteViewType + ?Sized> std::fmt::Debug for GenericByteViewBuilder<T> {
540    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
541        write!(f, "{}ViewBuilder", T::PREFIX)?;
542        f.debug_struct("")
543            .field("views_buffer", &self.views_buffer)
544            .field("in_progress", &self.in_progress)
545            .field("completed", &self.completed)
546            .field("null_buffer_builder", &self.null_buffer_builder)
547            .finish()
548    }
549}
550
551impl<T: ByteViewType + ?Sized> ArrayBuilder for GenericByteViewBuilder<T> {
552    fn len(&self) -> usize {
553        self.null_buffer_builder.len()
554    }
555
556    fn finish(&mut self) -> ArrayRef {
557        Arc::new(self.finish())
558    }
559
560    fn finish_cloned(&self) -> ArrayRef {
561        Arc::new(self.finish_cloned())
562    }
563
564    fn as_any(&self) -> &dyn Any {
565        self
566    }
567
568    fn as_any_mut(&mut self) -> &mut dyn Any {
569        self
570    }
571
572    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
573        self
574    }
575}
576
577impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>>
578    for GenericByteViewBuilder<T>
579{
580    #[inline]
581    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
582        for v in iter {
583            self.append_option(v)
584        }
585    }
586}
587
588/// Array builder for [`StringViewArray`][crate::StringViewArray]
589///
590/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
591/// [`GenericByteViewBuilder::append_null`] as normal.
592///
593/// # Example
594/// ```
595/// # use arrow_array::builder::StringViewBuilder;
596/// # use arrow_array::StringViewArray;
597/// let mut builder = StringViewBuilder::new();
598/// builder.append_value("hello");
599/// builder.append_null();
600/// builder.append_value("world");
601/// let array = builder.finish();
602///
603/// let expected = vec![Some("hello"), None, Some("world")];
604/// let actual: Vec<_> = array.iter().collect();
605/// assert_eq!(expected, actual);
606/// ```
607pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>;
608
609impl StringLikeArrayBuilder for StringViewBuilder {
610    fn type_name() -> &'static str {
611        std::any::type_name::<StringViewBuilder>()
612    }
613    fn with_capacity(capacity: usize) -> Self {
614        Self::with_capacity(capacity)
615    }
616    fn append_value(&mut self, value: &str) {
617        Self::append_value(self, value);
618    }
619    fn append_null(&mut self) {
620        Self::append_null(self);
621    }
622}
623
624///  Array builder for [`BinaryViewArray`][crate::BinaryViewArray]
625///
626/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
627/// [`GenericByteViewBuilder::append_null`] as normal.
628///
629/// # Example
630/// ```
631/// # use arrow_array::builder::BinaryViewBuilder;
632/// use arrow_array::BinaryViewArray;
633/// let mut builder = BinaryViewBuilder::new();
634/// builder.append_value("hello");
635/// builder.append_null();
636/// builder.append_value("world");
637/// let array = builder.finish();
638///
639/// let expected: Vec<Option<&[u8]>> = vec![Some(b"hello"), None, Some(b"world")];
640/// let actual: Vec<_> = array.iter().collect();
641/// assert_eq!(expected, actual);
642/// ```
643///
644pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>;
645
646impl BinaryLikeArrayBuilder for BinaryViewBuilder {
647    fn type_name() -> &'static str {
648        std::any::type_name::<BinaryViewBuilder>()
649    }
650    fn with_capacity(capacity: usize) -> Self {
651        Self::with_capacity(capacity)
652    }
653    fn append_value(&mut self, value: &[u8]) {
654        Self::append_value(self, value);
655    }
656    fn append_null(&mut self) {
657        Self::append_null(self);
658    }
659}
660
661/// Creates a view from a fixed length input (the compiler can generate
662/// specialized code for this)
663fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 {
664    let mut view_buffer = [0; 16];
665    view_buffer[0..4].copy_from_slice(&(LEN as u32).to_le_bytes());
666    view_buffer[4..4 + LEN].copy_from_slice(&data[..LEN]);
667    u128::from_le_bytes(view_buffer)
668}
669
670/// Create a view based on the given data, block id and offset.
671///
672/// Note that the code below is carefully examined with x86_64 assembly code: <https://godbolt.org/z/685YPsd5G>
673/// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined),
674/// which slows down things.
675#[inline(never)]
676pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
677    let len = data.len();
678
679    // Generate specialized code for each potential small string length
680    // to improve performance
681    match len {
682        0 => make_inlined_view::<0>(data),
683        1 => make_inlined_view::<1>(data),
684        2 => make_inlined_view::<2>(data),
685        3 => make_inlined_view::<3>(data),
686        4 => make_inlined_view::<4>(data),
687        5 => make_inlined_view::<5>(data),
688        6 => make_inlined_view::<6>(data),
689        7 => make_inlined_view::<7>(data),
690        8 => make_inlined_view::<8>(data),
691        9 => make_inlined_view::<9>(data),
692        10 => make_inlined_view::<10>(data),
693        11 => make_inlined_view::<11>(data),
694        12 => make_inlined_view::<12>(data),
695        // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead.
696        _ => {
697            let view = ByteView {
698                length: len as u32,
699                prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()),
700                buffer_index: block_id,
701                offset,
702            };
703            view.as_u128()
704        }
705    }
706}
707
708#[cfg(test)]
709mod tests {
710    use core::str;
711
712    use arrow_buffer::ArrowNativeType;
713
714    use super::*;
715
716    #[test]
717    fn test_string_max_deduplication_len() {
718        let value_1 = "short";
719        let value_2 = "not so similar string but long";
720        let value_3 = "1234567890123";
721
722        let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2;
723
724        let mut builder = StringViewBuilder::new()
725            .with_deduplicate_strings()
726            .with_max_deduplication_len(max_deduplication_len);
727
728        assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
729        assert!(value_2.len() > max_deduplication_len.as_usize());
730        assert!(
731            value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
732                && value_3.len() < max_deduplication_len.as_usize()
733        );
734
735        // append value1 (short), expect it is inlined and not deduplicated
736        builder.append_value(value_1); // view 0
737        builder.append_value(value_1); // view 1
738        // append value2, expect second copy is not deduplicated as it exceeds max_deduplication_len
739        builder.append_value(value_2); // view 2
740        builder.append_value(value_2); // view 3
741        // append value3, expect second copy is deduplicated
742        builder.append_value(value_3); // view 4
743        builder.append_value(value_3); // view 5
744
745        let array = builder.finish();
746
747        // verify
748        let v2 = ByteView::from(array.views()[2]);
749        let v3 = ByteView::from(array.views()[3]);
750        assert_eq!(v2.buffer_index, v3.buffer_index); // stored in same buffer
751        assert_ne!(v2.offset, v3.offset); // different offsets --> not deduplicated
752
753        let v4 = ByteView::from(array.views()[4]);
754        let v5 = ByteView::from(array.views()[5]);
755        assert_eq!(v4.buffer_index, v5.buffer_index); // stored in same buffer
756        assert_eq!(v4.offset, v5.offset); // same offsets --> deduplicated
757    }
758
759    #[test]
760    fn test_string_view_deduplicate() {
761        let value_1 = "long string to test string view";
762        let value_2 = "not so similar string but long";
763
764        let mut builder = StringViewBuilder::new()
765            .with_deduplicate_strings()
766            .with_fixed_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers
767
768        let values = vec![
769            Some(value_1),
770            Some(value_2),
771            Some("short"),
772            Some(value_1),
773            None,
774            Some(value_2),
775            Some(value_1),
776        ];
777        builder.extend(values.clone());
778
779        let array = builder.finish_cloned();
780        array.to_data().validate_full().unwrap();
781        assert_eq!(array.data_buffers().len(), 1); // without duplication we would need 3 buffers.
782        let actual: Vec<_> = array.iter().collect();
783        assert_eq!(actual, values);
784
785        let view0 = array.views().first().unwrap();
786        let view3 = array.views().get(3).unwrap();
787        let view6 = array.views().get(6).unwrap();
788
789        assert_eq!(view0, view3);
790        assert_eq!(view0, view6);
791
792        assert_eq!(array.views().get(1), array.views().get(5));
793    }
794
795    #[test]
796    fn test_string_view_deduplicate_after_finish() {
797        let mut builder = StringViewBuilder::new().with_deduplicate_strings();
798
799        let value_1 = "long string to test string view";
800        let value_2 = "not so similar string but long";
801        builder.append_value(value_1);
802        let _array = builder.finish();
803        builder.append_value(value_2);
804        let _array = builder.finish();
805        builder.append_value(value_1);
806        let _array = builder.finish();
807    }
808
809    #[test]
810    fn test_string_view() {
811        let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81");
812        let b2 = Buffer::from(b"cupcakes");
813        let b3 = Buffer::from(b"Many strings are here contained of great length and verbosity");
814
815        let mut v = StringViewBuilder::new();
816        assert_eq!(v.append_block(b1), 0);
817
818        v.append_value("This is a very long string that exceeds the inline length");
819        v.append_value("This is another very long string that exceeds the inline length");
820
821        assert_eq!(v.append_block(b2), 2);
822        assert_eq!(v.append_block(b3), 3);
823
824        // Test short strings
825        v.try_append_view(0, 0, 5).unwrap(); // world
826        v.try_append_view(0, 6, 7).unwrap(); // bananas
827        v.try_append_view(2, 3, 5).unwrap(); // cake
828        v.try_append_view(2, 0, 3).unwrap(); // cup
829        v.try_append_view(2, 0, 8).unwrap(); // cupcakes
830        v.try_append_view(0, 13, 4).unwrap(); // 😁
831        v.try_append_view(0, 13, 0).unwrap(); //
832
833        // Test longer strings
834        v.try_append_view(3, 0, 16).unwrap(); // Many strings are
835        v.try_append_view(1, 0, 19).unwrap(); // This is a very long
836        v.try_append_view(3, 13, 27).unwrap(); // here contained of great length
837
838        v.append_value("I do so like long strings");
839
840        let array = v.finish_cloned();
841        array.to_data().validate_full().unwrap();
842        assert_eq!(array.data_buffers().len(), 5);
843        let actual: Vec<_> = array.iter().flatten().collect();
844        assert_eq!(
845            actual,
846            &[
847                "This is a very long string that exceeds the inline length",
848                "This is another very long string that exceeds the inline length",
849                "world",
850                "bananas",
851                "cakes",
852                "cup",
853                "cupcakes",
854                "😁",
855                "",
856                "Many strings are",
857                "This is a very long",
858                "are here contained of great",
859                "I do so like long strings"
860            ]
861        );
862
863        let err = v.try_append_view(0, u32::MAX, 1).unwrap_err();
864        assert_eq!(
865            err.to_string(),
866            "Invalid argument error: Range 4294967295..4294967296 out of bounds for block of length 17"
867        );
868
869        let err = v.try_append_view(0, 1, u32::MAX).unwrap_err();
870        assert_eq!(
871            err.to_string(),
872            "Invalid argument error: Range 1..4294967296 out of bounds for block of length 17"
873        );
874
875        let err = v.try_append_view(0, 13, 2).unwrap_err();
876        assert_eq!(err.to_string(), "Invalid argument error: Invalid view data");
877
878        let err = v.try_append_view(0, 40, 0).unwrap_err();
879        assert_eq!(
880            err.to_string(),
881            "Invalid argument error: Range 40..40 out of bounds for block of length 17"
882        );
883
884        let err = v.try_append_view(5, 0, 0).unwrap_err();
885        assert_eq!(
886            err.to_string(),
887            "Invalid argument error: No block found with index 5"
888        );
889    }
890
891    #[test]
892    fn test_string_view_with_block_size_growth() {
893        let mut exp_builder = StringViewBuilder::new();
894        let mut fixed_builder = StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE);
895
896        let long_string = str::from_utf8(&[b'a'; STARTING_BLOCK_SIZE as usize]).unwrap();
897
898        for i in 0..9 {
899            // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M
900            for _ in 0..(2_u32.pow(i)) {
901                exp_builder.append_value(long_string);
902                fixed_builder.append_value(long_string);
903            }
904            exp_builder.flush_in_progress();
905            fixed_builder.flush_in_progress();
906
907            // Every step only add one buffer, but the buffer size is much larger
908            assert_eq!(exp_builder.completed.len(), i as usize + 1);
909            assert_eq!(
910                exp_builder.completed[i as usize].len(),
911                STARTING_BLOCK_SIZE as usize * 2_usize.pow(i)
912            );
913
914            // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1
915            assert_eq!(fixed_builder.completed.len(), 2_usize.pow(i + 1) - 1);
916
917            // Every buffer is fixed size
918            assert!(
919                fixed_builder
920                    .completed
921                    .iter()
922                    .all(|b| b.len() == STARTING_BLOCK_SIZE as usize)
923            );
924        }
925
926        // Add one more value, and the buffer stop growing.
927        exp_builder.append_value(long_string);
928        exp_builder.flush_in_progress();
929        assert_eq!(
930            exp_builder.completed.last().unwrap().capacity(),
931            MAX_BLOCK_SIZE as usize
932        );
933    }
934
935    #[test]
936    fn test_append_value_n() {
937        // Test with inline strings (<=12 bytes)
938        let mut builder = StringViewBuilder::new();
939
940        builder.try_append_value_n("hello", 100).unwrap();
941        builder.append_value("world");
942        builder.try_append_value_n("foo", 50).unwrap();
943
944        let array = builder.finish();
945        assert_eq!(array.len(), 151);
946        assert_eq!(array.null_count(), 0);
947
948        // Verify the values
949        for i in 0..100 {
950            assert_eq!(array.value(i), "hello");
951        }
952        assert_eq!(array.value(100), "world");
953        for i in 101..151 {
954            assert_eq!(array.value(i), "foo");
955        }
956
957        // All inline strings should have no data buffers
958        assert_eq!(array.data_buffers().len(), 0);
959    }
960
961    #[test]
962    fn test_append_value_n_with_deduplication() {
963        let long_string = "This is a very long string that exceeds the inline length";
964
965        // Test with deduplication enabled
966        let mut builder = StringViewBuilder::new().with_deduplicate_strings();
967
968        // First append the string once to add it to the hash map
969        builder.append_value(long_string);
970
971        // Then append_n the same string - should deduplicate and reuse the existing value
972        builder.try_append_value_n(long_string, 999).unwrap();
973
974        let array = builder.finish();
975        assert_eq!(array.len(), 1000);
976        assert_eq!(array.null_count(), 0);
977
978        // Verify all values are the same
979        for i in 0..1000 {
980            assert_eq!(array.value(i), long_string);
981        }
982
983        // With deduplication, should only have 1 data buffer containing the string once
984        assert_eq!(array.data_buffers().len(), 1);
985
986        // All views should be identical
987        let first_view = array.views()[0];
988        for view in array.views().iter() {
989            assert_eq!(*view, first_view);
990        }
991    }
992
993    #[test]
994    fn test_append_value_n_zero() {
995        let mut builder = StringViewBuilder::new();
996
997        builder.append_value("first");
998        builder.try_append_value_n("should not appear", 0).unwrap();
999        builder.append_value("second");
1000
1001        let array = builder.finish();
1002        assert_eq!(array.len(), 2);
1003        assert_eq!(array.value(0), "first");
1004        assert_eq!(array.value(1), "second");
1005    }
1006}