arrow_array/array/
byte_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::{get_offsets, print_long_array};
19use crate::builder::GenericByteBuilder;
20use crate::iterator::ArrayIter;
21use crate::types::ByteArrayType;
22use crate::types::bytes::ByteArrayNativeType;
23use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
25use arrow_buffer::{NullBuffer, OffsetBuffer};
26use arrow_data::{ArrayData, ArrayDataBuilder};
27use arrow_schema::{ArrowError, DataType};
28use std::any::Any;
29use std::sync::Arc;
30
31/// An array of [variable length byte arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout)
32///
33/// See [`StringArray`] and [`LargeStringArray`] for storing utf8 encoded string data
34///
35/// See [`BinaryArray`] and [`LargeBinaryArray`] for storing arbitrary bytes
36///
37/// # Example: From a Vec
38///
39/// ```
40/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type};
41/// let arr: GenericByteArray<Utf8Type> = vec!["hello", "world", ""].into();
42/// assert_eq!(arr.value_data(), b"helloworld");
43/// assert_eq!(arr.value_offsets(), &[0, 5, 10, 10]);
44/// let values: Vec<_> = arr.iter().collect();
45/// assert_eq!(values, &[Some("hello"), Some("world"), Some("")]);
46/// ```
47///
48/// # Example: From an optional Vec
49///
50/// ```
51/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type};
52/// let arr: GenericByteArray<Utf8Type> = vec![Some("hello"), Some("world"), Some(""), None].into();
53/// assert_eq!(arr.value_data(), b"helloworld");
54/// assert_eq!(arr.value_offsets(), &[0, 5, 10, 10, 10]);
55/// let values: Vec<_> = arr.iter().collect();
56/// assert_eq!(values, &[Some("hello"), Some("world"), Some(""), None]);
57/// ```
58///
59/// # Example: From an iterator of option
60///
61/// ```
62/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type};
63/// let arr: GenericByteArray<Utf8Type> = (0..5).map(|x| (x % 2 == 0).then(|| x.to_string())).collect();
64/// let values: Vec<_> = arr.iter().collect();
65/// assert_eq!(values, &[Some("0"), None, Some("2"), None, Some("4")]);
66/// ```
67///
68/// # Example: Using Builder
69///
70/// ```
71/// # use arrow_array::Array;
72/// # use arrow_array::builder::GenericByteBuilder;
73/// # use arrow_array::types::Utf8Type;
74/// let mut builder = GenericByteBuilder::<Utf8Type>::new();
75/// builder.append_value("hello");
76/// builder.append_null();
77/// builder.append_value("world");
78/// let array = builder.finish();
79/// let values: Vec<_> = array.iter().collect();
80/// assert_eq!(values, &[Some("hello"), None, Some("world")]);
81/// ```
82///
83/// [`StringArray`]: crate::StringArray
84/// [`LargeStringArray`]: crate::LargeStringArray
85/// [`BinaryArray`]: crate::BinaryArray
86/// [`LargeBinaryArray`]: crate::LargeBinaryArray
pub struct GenericByteArray<T: ByteArrayType> {
    // Logical type of the array; every constructor in this file sets it to `T::DATA_TYPE`
    data_type: DataType,
    // Value boundaries: element `i` spans `value_offsets[i]..value_offsets[i + 1]` of `value_data`
    value_offsets: OffsetBuffer<T::Offset>,
    // Backing bytes of all values, addressed through `value_offsets`
    value_data: Buffer,
    // Optional null buffer; constructors for null-free data leave it as `None`
    nulls: Option<NullBuffer>,
}
93
94impl<T: ByteArrayType> Clone for GenericByteArray<T> {
95    fn clone(&self) -> Self {
96        Self {
97            data_type: T::DATA_TYPE,
98            value_offsets: self.value_offsets.clone(),
99            value_data: self.value_data.clone(),
100            nulls: self.nulls.clone(),
101        }
102    }
103}
104
105impl<T: ByteArrayType> GenericByteArray<T> {
106    /// Data type of the array.
107    pub const DATA_TYPE: DataType = T::DATA_TYPE;
108
109    /// Create a new [`GenericByteArray`] from the provided parts, panicking on failure
110    ///
111    /// # Panics
112    ///
113    /// Panics if [`GenericByteArray::try_new`] returns an error
114    pub fn new(
115        offsets: OffsetBuffer<T::Offset>,
116        values: Buffer,
117        nulls: Option<NullBuffer>,
118    ) -> Self {
119        Self::try_new(offsets, values, nulls).unwrap()
120    }
121
122    /// Create a new [`GenericByteArray`] from the provided parts, returning an error on failure
123    ///
124    /// # Errors
125    ///
126    /// * `offsets.len() - 1 != nulls.len()`
127    /// * Any consecutive pair of `offsets` does not denote a valid slice of `values`
128    pub fn try_new(
129        offsets: OffsetBuffer<T::Offset>,
130        values: Buffer,
131        nulls: Option<NullBuffer>,
132    ) -> Result<Self, ArrowError> {
133        let len = offsets.len() - 1;
134
135        // Verify that each pair of offsets is a valid slices of values
136        T::validate(&offsets, &values)?;
137
138        if let Some(n) = nulls.as_ref() {
139            if n.len() != len {
140                return Err(ArrowError::InvalidArgumentError(format!(
141                    "Incorrect length of null buffer for {}{}Array, expected {len} got {}",
142                    T::Offset::PREFIX,
143                    T::PREFIX,
144                    n.len(),
145                )));
146            }
147        }
148
149        Ok(Self {
150            data_type: T::DATA_TYPE,
151            value_offsets: offsets,
152            value_data: values,
153            nulls,
154        })
155    }
156
157    /// Create a new [`GenericByteArray`] from the provided parts, without validation
158    ///
159    /// # Safety
160    ///
161    /// Safe if [`Self::try_new`] would not error
162    pub unsafe fn new_unchecked(
163        offsets: OffsetBuffer<T::Offset>,
164        values: Buffer,
165        nulls: Option<NullBuffer>,
166    ) -> Self {
167        if cfg!(feature = "force_validate") {
168            return Self::new(offsets, values, nulls);
169        }
170        Self {
171            data_type: T::DATA_TYPE,
172            value_offsets: offsets,
173            value_data: values,
174            nulls,
175        }
176    }
177
178    /// Create a new [`GenericByteArray`] of length `len` where all values are null
179    pub fn new_null(len: usize) -> Self {
180        Self {
181            data_type: T::DATA_TYPE,
182            value_offsets: OffsetBuffer::new_zeroed(len),
183            value_data: MutableBuffer::new(0).into(),
184            nulls: Some(NullBuffer::new_null(len)),
185        }
186    }
187
188    /// Create a new [`Scalar`] from `v`
189    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
190        Scalar::new(Self::from_iter_values(std::iter::once(value)))
191    }
192
193    /// Create a new [`GenericByteArray`] where `value` is repeated `repeat_count` times.
194    ///
195    /// # Panics
196    /// This will panic if value's length multiplied by `repeat_count` overflows usize.
197    ///
198    pub fn new_repeated(value: impl AsRef<T::Native>, repeat_count: usize) -> Self {
199        let s: &[u8] = value.as_ref().as_ref();
200        let value_offsets = OffsetBuffer::from_repeated_length(s.len(), repeat_count);
201        let bytes: Buffer = {
202            let mut mutable_buffer = MutableBuffer::with_capacity(0);
203            mutable_buffer.repeat_slice_n_times(s, repeat_count);
204
205            mutable_buffer.into()
206        };
207
208        Self {
209            data_type: T::DATA_TYPE,
210            value_data: bytes,
211            value_offsets,
212            nulls: None,
213        }
214    }
215
216    /// Creates a [`GenericByteArray`] based on an iterator of values without nulls
217    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
218    where
219        Ptr: AsRef<T::Native>,
220        I: IntoIterator<Item = Ptr>,
221    {
222        let iter = iter.into_iter();
223        let (_, data_len) = iter.size_hint();
224        let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
225
226        let mut offsets = MutableBuffer::new((data_len + 1) * std::mem::size_of::<T::Offset>());
227        offsets.push(T::Offset::usize_as(0));
228
229        let mut values = MutableBuffer::new(0);
230        for s in iter {
231            let s: &[u8] = s.as_ref().as_ref();
232            values.extend_from_slice(s);
233            offsets.push(T::Offset::usize_as(values.len()));
234        }
235
236        T::Offset::from_usize(values.len()).expect("offset overflow");
237        let offsets = Buffer::from(offsets);
238
239        // Safety: valid by construction
240        let value_offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
241
242        Self {
243            data_type: T::DATA_TYPE,
244            value_data: values.into(),
245            value_offsets,
246            nulls: None,
247        }
248    }
249
250    /// Deconstruct this array into its constituent parts
251    pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
252        (self.value_offsets, self.value_data, self.nulls)
253    }
254
255    /// Returns the length for value at index `i`.
256    /// # Panics
257    /// Panics if index `i` is out of bounds.
258    #[inline]
259    pub fn value_length(&self, i: usize) -> T::Offset {
260        let offsets = self.value_offsets();
261        offsets[i + 1] - offsets[i]
262    }
263
264    /// Returns a reference to the offsets of this array
265    ///
266    /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`]
267    /// allowing for zero-copy cloning
268    #[inline]
269    pub fn offsets(&self) -> &OffsetBuffer<T::Offset> {
270        &self.value_offsets
271    }
272
273    /// Returns the values of this array
274    ///
275    /// Unlike [`Self::value_data`] this returns the [`Buffer`]
276    /// allowing for zero-copy cloning
277    #[inline]
278    pub fn values(&self) -> &Buffer {
279        &self.value_data
280    }
281
282    /// Returns the raw value data
283    pub fn value_data(&self) -> &[u8] {
284        self.value_data.as_slice()
285    }
286
287    /// Returns true if all data within this array is ASCII
288    pub fn is_ascii(&self) -> bool {
289        let offsets = self.value_offsets();
290        let start = offsets.first().unwrap();
291        let end = offsets.last().unwrap();
292        self.value_data()[start.as_usize()..end.as_usize()].is_ascii()
293    }
294
295    /// Returns the offset values in the offsets buffer
296    #[inline]
297    pub fn value_offsets(&self) -> &[T::Offset] {
298        &self.value_offsets
299    }
300
301    /// Returns the element at index `i`
302    ///
303    /// Note: This method does not check for nulls and the value is arbitrary
304    /// if [`is_null`](Self::is_null) returns true for the index.
305    ///
306    /// # Safety
307    /// Caller is responsible for ensuring that the index is within the bounds of the array
308    pub unsafe fn value_unchecked(&self, i: usize) -> &T::Native {
309        let end = *unsafe { self.value_offsets().get_unchecked(i + 1) };
310        let start = *unsafe { self.value_offsets().get_unchecked(i) };
311
312        // Soundness
313        // pointer alignment & location is ensured by RawPtrBox
314        // buffer bounds/offset is ensured by the value_offset invariants
315
316        // Safety of `to_isize().unwrap()`
317        // `start` and `end` are &OffsetSize, which is a generic type that implements the
318        // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait,
319        // both of which should cleanly cast to isize on an architecture that supports
320        // 32/64-bit offsets
321        let b = unsafe {
322            std::slice::from_raw_parts(
323                self.value_data
324                    .as_ptr()
325                    .offset(start.to_isize().unwrap_unchecked()),
326                (end - start).to_usize().unwrap_unchecked(),
327            )
328        };
329
330        // SAFETY:
331        // ArrayData is valid
332        unsafe { T::Native::from_bytes_unchecked(b) }
333    }
334
335    /// Returns the element at index `i`
336    ///
337    /// Note: This method does not check for nulls and the value is arbitrary
338    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
339    ///
340    /// # Panics
341    /// Panics if index `i` is out of bounds.
342    pub fn value(&self, i: usize) -> &T::Native {
343        assert!(
344            i < self.len(),
345            "Trying to access an element at index {} from a {}{}Array of length {}",
346            i,
347            T::Offset::PREFIX,
348            T::PREFIX,
349            self.len()
350        );
351        // SAFETY:
352        // Verified length above
353        unsafe { self.value_unchecked(i) }
354    }
355
356    /// constructs a new iterator
357    pub fn iter(&self) -> ArrayIter<&Self> {
358        ArrayIter::new(self)
359    }
360
361    /// Returns a zero-copy slice of this array with the indicated offset and length.
362    pub fn slice(&self, offset: usize, length: usize) -> Self {
363        Self {
364            data_type: T::DATA_TYPE,
365            value_offsets: self.value_offsets.slice(offset, length),
366            value_data: self.value_data.clone(),
367            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
368        }
369    }
370
371    /// Returns `GenericByteBuilder` of this byte array for mutating its values if the underlying
372    /// offset and data buffers are not shared by others.
373    pub fn into_builder(self) -> Result<GenericByteBuilder<T>, Self> {
374        let len = self.len();
375        let value_len = T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]);
376
377        let data = self.into_data();
378        let null_bit_buffer = data.nulls().map(|b| b.inner().sliced());
379
380        let element_len = std::mem::size_of::<T::Offset>();
381        let offset_buffer = data.buffers()[0]
382            .slice_with_length(data.offset() * element_len, (len + 1) * element_len);
383
384        let element_len = std::mem::size_of::<u8>();
385        let value_buffer = data.buffers()[1]
386            .slice_with_length(data.offset() * element_len, value_len * element_len);
387
388        drop(data);
389
390        let try_mutable_null_buffer = match null_bit_buffer {
391            None => Ok(None),
392            Some(null_buffer) => {
393                // Null buffer exists, tries to make it mutable
394                null_buffer.into_mutable().map(Some)
395            }
396        };
397
398        let try_mutable_buffers = match try_mutable_null_buffer {
399            Ok(mutable_null_buffer) => {
400                // Got mutable null buffer, tries to get mutable value buffer
401                let try_mutable_offset_buffer = offset_buffer.into_mutable();
402                let try_mutable_value_buffer = value_buffer.into_mutable();
403
404                // try_mutable_offset_buffer.map(...).map_err(...) doesn't work as the compiler complains
405                // mutable_null_buffer is moved into map closure.
406                match (try_mutable_offset_buffer, try_mutable_value_buffer) {
407                    (Ok(mutable_offset_buffer), Ok(mutable_value_buffer)) => unsafe {
408                        Ok(GenericByteBuilder::<T>::new_from_buffer(
409                            mutable_offset_buffer,
410                            mutable_value_buffer,
411                            mutable_null_buffer,
412                        ))
413                    },
414                    (Ok(mutable_offset_buffer), Err(value_buffer)) => Err((
415                        mutable_offset_buffer.into(),
416                        value_buffer,
417                        mutable_null_buffer.map(|b| b.into()),
418                    )),
419                    (Err(offset_buffer), Ok(mutable_value_buffer)) => Err((
420                        offset_buffer,
421                        mutable_value_buffer.into(),
422                        mutable_null_buffer.map(|b| b.into()),
423                    )),
424                    (Err(offset_buffer), Err(value_buffer)) => Err((
425                        offset_buffer,
426                        value_buffer,
427                        mutable_null_buffer.map(|b| b.into()),
428                    )),
429                }
430            }
431            Err(mutable_null_buffer) => {
432                // Unable to get mutable null buffer
433                Err((offset_buffer, value_buffer, Some(mutable_null_buffer)))
434            }
435        };
436
437        match try_mutable_buffers {
438            Ok(builder) => Ok(builder),
439            Err((offset_buffer, value_buffer, null_bit_buffer)) => {
440                let builder = ArrayData::builder(T::DATA_TYPE)
441                    .len(len)
442                    .add_buffer(offset_buffer)
443                    .add_buffer(value_buffer)
444                    .null_bit_buffer(null_bit_buffer);
445
446                let array_data = unsafe { builder.build_unchecked() };
447                let array = GenericByteArray::<T>::from(array_data);
448
449                Err(array)
450            }
451        }
452    }
453}
454
455impl<T: ByteArrayType> std::fmt::Debug for GenericByteArray<T> {
456    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
457        write!(f, "{}{}Array\n[\n", T::Offset::PREFIX, T::PREFIX)?;
458        print_long_array(self, f, |array, index, f| {
459            std::fmt::Debug::fmt(&array.value(index), f)
460        })?;
461        write!(f, "]")
462    }
463}
464
// Marker impl of the `Sealed` trait from the parent module's `private` module.
// NOTE(review): presumably required so that `Array` can be implemented below — confirm.
impl<T: ByteArrayType> super::private::Sealed for GenericByteArray<T> {}
466
467impl<T: ByteArrayType> Array for GenericByteArray<T> {
468    fn as_any(&self) -> &dyn Any {
469        self
470    }
471
472    fn to_data(&self) -> ArrayData {
473        self.clone().into()
474    }
475
476    fn into_data(self) -> ArrayData {
477        self.into()
478    }
479
480    fn data_type(&self) -> &DataType {
481        &self.data_type
482    }
483
484    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
485        Arc::new(self.slice(offset, length))
486    }
487
488    fn len(&self) -> usize {
489        self.value_offsets.len() - 1
490    }
491
492    fn is_empty(&self) -> bool {
493        self.value_offsets.len() <= 1
494    }
495
496    fn shrink_to_fit(&mut self) {
497        self.value_offsets.shrink_to_fit();
498        self.value_data.shrink_to_fit();
499        if let Some(nulls) = &mut self.nulls {
500            nulls.shrink_to_fit();
501        }
502    }
503
504    fn offset(&self) -> usize {
505        0
506    }
507
508    fn nulls(&self) -> Option<&NullBuffer> {
509        self.nulls.as_ref()
510    }
511
512    fn logical_null_count(&self) -> usize {
513        // More efficient that the default implementation
514        self.null_count()
515    }
516
517    fn get_buffer_memory_size(&self) -> usize {
518        let mut sum = self.value_offsets.inner().inner().capacity();
519        sum += self.value_data.capacity();
520        if let Some(x) = &self.nulls {
521            sum += x.buffer().capacity()
522        }
523        sum
524    }
525
526    fn get_array_memory_size(&self) -> usize {
527        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
528    }
529}
530
// Lets a borrowed array be used wherever an `ArrayAccessor` is expected; both methods
// forward to the inherent methods of `GenericByteArray`.
impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> {
    // Elements are handed out as borrows that live as long as the array borrow `'a`
    type Item = &'a T::Native;

    // Bounds-checked access; panics like the inherent `value` when `index` is out of bounds
    fn value(&self, index: usize) -> Self::Item {
        GenericByteArray::value(self, index)
    }

    // Unchecked access; the safety contract is that of the inherent `value_unchecked`
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
        unsafe { GenericByteArray::value_unchecked(self, index) }
    }
}
542
543impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
544    fn from(data: ArrayData) -> Self {
545        assert_eq!(
546            data.data_type(),
547            &Self::DATA_TYPE,
548            "{}{}Array expects DataType::{}",
549            T::Offset::PREFIX,
550            T::PREFIX,
551            Self::DATA_TYPE
552        );
553        assert_eq!(
554            data.buffers().len(),
555            2,
556            "{}{}Array data should contain 2 buffers only (offsets and values)",
557            T::Offset::PREFIX,
558            T::PREFIX,
559        );
560        // SAFETY:
561        // ArrayData is valid, and verified type above
562        let value_offsets = unsafe { get_offsets(&data) };
563        let value_data = data.buffers()[1].clone();
564        Self {
565            value_offsets,
566            value_data,
567            data_type: T::DATA_TYPE,
568            nulls: data.nulls().cloned(),
569        }
570    }
571}
572
573impl<T: ByteArrayType> From<GenericByteArray<T>> for ArrayData {
574    fn from(array: GenericByteArray<T>) -> Self {
575        let len = array.len();
576
577        let offsets = array.value_offsets.into_inner().into_inner();
578        let builder = ArrayDataBuilder::new(array.data_type)
579            .len(len)
580            .buffers(vec![offsets, array.value_data])
581            .nulls(array.nulls);
582
583        unsafe { builder.build_unchecked() }
584    }
585}
586
// Allows `for v in &array`; each item is `Some(value)` or `None`.
impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
    type Item = Option<&'a T::Native>;
    type IntoIter = ArrayIter<Self>;

    // Same iterator as the inherent `iter` method
    fn into_iter(self) -> Self::IntoIter {
        ArrayIter::new(self)
    }
}
595
// Collects from an iterator over *borrowed* options, e.g. `slice.iter()`.
impl<'a, Ptr, T: ByteArrayType> FromIterator<&'a Option<Ptr>> for GenericByteArray<T>
where
    Ptr: AsRef<T::Native> + 'a,
{
    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
        // Turn each `&Option<Ptr>` into `Option<&T::Native>` and reuse the
        // `FromIterator<Option<_>>` implementation through `collect`
        iter.into_iter()
            .map(|o| o.as_ref().map(|p| p.as_ref()))
            .collect()
    }
}
606
607impl<Ptr, T: ByteArrayType> FromIterator<Option<Ptr>> for GenericByteArray<T>
608where
609    Ptr: AsRef<T::Native>,
610{
611    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
612        let iter = iter.into_iter();
613        let mut builder = GenericByteBuilder::with_capacity(iter.size_hint().0, 1024);
614        builder.extend(iter);
615        builder.finish()
616    }
617}
618
#[cfg(test)]
mod tests {
    use crate::{Array, BinaryArray, StringArray};
    use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};

    // Exercises the validation performed by `try_new` / `new` for string and binary arrays.
    #[test]
    fn try_new() {
        // Happy path: offsets [0, 5, 10] split the 10 bytes into "hello" and "world"
        let data = Buffer::from_slice_ref("helloworld");
        let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
        StringArray::new(offsets.clone(), data.clone(), None);

        // A null buffer of length 3 does not match the 2 values described by the offsets
        let nulls = NullBuffer::new_null(3);
        let err =
            StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3"
        );

        let err = BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3"
        );

        // Invalid UTF-8 is rejected for strings ...
        let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld");
        let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2"
        );

        // ... but accepted for binary data
        BinaryArray::new(offsets, non_utf8_data, None);

        // The last offset (11) points past the end of the 10-byte value buffer
        let offsets = OffsetBuffer::new(vec![0, 5, 11].into());
        let err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Offset of 11 exceeds length of values 10"
        );

        let err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Maximum offset of 11 is larger than values of length 10"
        );

        // "ì" takes two bytes in UTF-8, so this buffer is 11 bytes long and the same
        // offsets are now valid
        let non_ascii_data = Buffer::from_slice_ref("heìloworld");
        StringArray::new(offsets.clone(), non_ascii_data.clone(), None);
        BinaryArray::new(offsets, non_ascii_data.clone(), None);

        // Offset 3 falls in the middle of the two-byte "ì" (bytes 2 and 3)
        let offsets = OffsetBuffer::new(vec![0, 3, 10].into());
        let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Split UTF-8 codepoint at offset 3"
        );

        // Binary arrays place no such restriction on the offsets
        BinaryArray::new(offsets, non_ascii_data, None);
    }

    // `new_repeated` yields `repeat_count` identical values for binary and string arrays.
    #[test]
    fn create_repeated() {
        let arr = BinaryArray::new_repeated(b"hello", 3);
        assert_eq!(arr.len(), 3);
        assert_eq!(arr.value(0), b"hello");
        assert_eq!(arr.value(1), b"hello");
        assert_eq!(arr.value(2), b"hello");

        let arr = StringArray::new_repeated("world", 2);
        assert_eq!(arr.len(), 2);
        assert_eq!(arr.value(0), "world");
        assert_eq!(arr.value(1), "world");
    }

    // One repetition more than fits: 5 * (usize::MAX / 5 + 1) exceeds usize::MAX.
    #[test]
    #[should_panic(expected = "usize overflow")]
    fn create_repeated_usize_overflow_1() {
        let _arr = BinaryArray::new_repeated(b"hello", (usize::MAX / "hello".len()) + 1);
    }

    // 5 * usize::MAX overflows usize as well.
    #[test]
    #[should_panic(expected = "usize overflow")]
    fn create_repeated_usize_overflow_2() {
        let _arr = BinaryArray::new_repeated(b"hello", usize::MAX);
    }

    // The total length still fits in usize here, so the offset type is what overflows.
    #[test]
    #[should_panic(expected = "offset overflow")]
    fn create_repeated_i32_offset_overflow_1() {
        let _arr = BinaryArray::new_repeated(b"hello", usize::MAX / "hello".len());
    }

    // Smallest repeat count whose total length (5 * (i32::MAX / 5 + 1)) exceeds i32::MAX.
    #[test]
    #[should_panic(expected = "offset overflow")]
    fn create_repeated_i32_offset_overflow_2() {
        let _arr = BinaryArray::new_repeated(b"hello", ((i32::MAX as usize) / "hello".len()) + 1);
    }
}