1use crate::array::{get_offsets, print_long_array};
19use crate::builder::GenericByteBuilder;
20use crate::iterator::ArrayIter;
21use crate::types::ByteArrayType;
22use crate::types::bytes::ByteArrayNativeType;
23use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
25use arrow_buffer::{NullBuffer, OffsetBuffer};
26use arrow_data::{ArrayData, ArrayDataBuilder};
27use arrow_schema::{ArrowError, DataType};
28use std::any::Any;
29use std::sync::Arc;
30
/// An array of variable-length byte values, generic over the byte array type `T`.
///
/// The array is stored as an offset buffer, one contiguous values buffer and an
/// optional null buffer. Element `i` is the byte range of `value_data` delimited by
/// entries `i` and `i + 1` of `value_offsets` (see `value_unchecked`).
pub struct GenericByteArray<T: ByteArrayType> {
    /// Data type returned by `Array::data_type`; every constructor in this file
    /// initialises it from `T::DATA_TYPE`.
    data_type: DataType,
    /// Offsets into `value_data`; the array length is `value_offsets.len() - 1`.
    value_offsets: OffsetBuffer<T::Offset>,
    /// Bytes of all values, addressed through `value_offsets`.
    value_data: Buffer,
    /// Optional null buffer; `try_new` requires its length to equal the array length.
    nulls: Option<NullBuffer>,
}
93
94impl<T: ByteArrayType> Clone for GenericByteArray<T> {
95 fn clone(&self) -> Self {
96 Self {
97 data_type: T::DATA_TYPE,
98 value_offsets: self.value_offsets.clone(),
99 value_data: self.value_data.clone(),
100 nulls: self.nulls.clone(),
101 }
102 }
103}
104
105impl<T: ByteArrayType> GenericByteArray<T> {
/// Data type of the array, forwarded from `T::DATA_TYPE`.
pub const DATA_TYPE: DataType = T::DATA_TYPE;
108
109 pub fn new(
115 offsets: OffsetBuffer<T::Offset>,
116 values: Buffer,
117 nulls: Option<NullBuffer>,
118 ) -> Self {
119 Self::try_new(offsets, values, nulls).unwrap()
120 }
121
122 pub fn try_new(
129 offsets: OffsetBuffer<T::Offset>,
130 values: Buffer,
131 nulls: Option<NullBuffer>,
132 ) -> Result<Self, ArrowError> {
133 let len = offsets.len() - 1;
134
135 T::validate(&offsets, &values)?;
137
138 if let Some(n) = nulls.as_ref() {
139 if n.len() != len {
140 return Err(ArrowError::InvalidArgumentError(format!(
141 "Incorrect length of null buffer for {}{}Array, expected {len} got {}",
142 T::Offset::PREFIX,
143 T::PREFIX,
144 n.len(),
145 )));
146 }
147 }
148
149 Ok(Self {
150 data_type: T::DATA_TYPE,
151 value_offsets: offsets,
152 value_data: values,
153 nulls,
154 })
155 }
156
157 pub unsafe fn new_unchecked(
163 offsets: OffsetBuffer<T::Offset>,
164 values: Buffer,
165 nulls: Option<NullBuffer>,
166 ) -> Self {
167 if cfg!(feature = "force_validate") {
168 return Self::new(offsets, values, nulls);
169 }
170 Self {
171 data_type: T::DATA_TYPE,
172 value_offsets: offsets,
173 value_data: values,
174 nulls,
175 }
176 }
177
178 pub fn new_null(len: usize) -> Self {
180 Self {
181 data_type: T::DATA_TYPE,
182 value_offsets: OffsetBuffer::new_zeroed(len),
183 value_data: MutableBuffer::new(0).into(),
184 nulls: Some(NullBuffer::new_null(len)),
185 }
186 }
187
188 pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
190 Scalar::new(Self::from_iter_values(std::iter::once(value)))
191 }
192
193 pub fn new_repeated(value: impl AsRef<T::Native>, repeat_count: usize) -> Self {
199 let s: &[u8] = value.as_ref().as_ref();
200 let value_offsets = OffsetBuffer::from_repeated_length(s.len(), repeat_count);
201 let bytes: Buffer = {
202 let mut mutable_buffer = MutableBuffer::with_capacity(0);
203 mutable_buffer.repeat_slice_n_times(s, repeat_count);
204
205 mutable_buffer.into()
206 };
207
208 Self {
209 data_type: T::DATA_TYPE,
210 value_data: bytes,
211 value_offsets,
212 nulls: None,
213 }
214 }
215
216 pub fn from_iter_values<Ptr, I>(iter: I) -> Self
218 where
219 Ptr: AsRef<T::Native>,
220 I: IntoIterator<Item = Ptr>,
221 {
222 let iter = iter.into_iter();
223 let (_, data_len) = iter.size_hint();
224 let data_len = data_len.expect("Iterator must be sized"); let mut offsets = MutableBuffer::new((data_len + 1) * std::mem::size_of::<T::Offset>());
227 offsets.push(T::Offset::usize_as(0));
228
229 let mut values = MutableBuffer::new(0);
230 for s in iter {
231 let s: &[u8] = s.as_ref().as_ref();
232 values.extend_from_slice(s);
233 offsets.push(T::Offset::usize_as(values.len()));
234 }
235
236 T::Offset::from_usize(values.len()).expect("offset overflow");
237 let offsets = Buffer::from(offsets);
238
239 let value_offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
241
242 Self {
243 data_type: T::DATA_TYPE,
244 value_data: values.into(),
245 value_offsets,
246 nulls: None,
247 }
248 }
249
250 pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
252 (self.value_offsets, self.value_data, self.nulls)
253 }
254
255 #[inline]
259 pub fn value_length(&self, i: usize) -> T::Offset {
260 let offsets = self.value_offsets();
261 offsets[i + 1] - offsets[i]
262 }
263
/// Returns a reference to the offset buffer of this array.
#[inline]
pub fn offsets(&self) -> &OffsetBuffer<T::Offset> {
    &self.value_offsets
}
272
/// Returns a reference to the values buffer of this array.
///
/// This is the whole stored buffer; `slice` clones it unchanged, so it is not
/// narrowed to the range covered by the offsets.
#[inline]
pub fn values(&self) -> &Buffer {
    &self.value_data
}
281
/// Returns the values buffer as a raw byte slice.
pub fn value_data(&self) -> &[u8] {
    self.value_data.as_slice()
}
286
287 pub fn is_ascii(&self) -> bool {
289 let offsets = self.value_offsets();
290 let start = offsets.first().unwrap();
291 let end = offsets.last().unwrap();
292 self.value_data()[start.as_usize()..end.as_usize()].is_ascii()
293 }
294
/// Returns the offsets of this array as a slice of `T::Offset`.
#[inline]
pub fn value_offsets(&self) -> &[T::Offset] {
    &self.value_offsets
}
300
301 pub unsafe fn value_unchecked(&self, i: usize) -> &T::Native {
309 let end = *unsafe { self.value_offsets().get_unchecked(i + 1) };
310 let start = *unsafe { self.value_offsets().get_unchecked(i) };
311
312 let b = unsafe {
322 std::slice::from_raw_parts(
323 self.value_data
324 .as_ptr()
325 .offset(start.to_isize().unwrap_unchecked()),
326 (end - start).to_usize().unwrap_unchecked(),
327 )
328 };
329
330 unsafe { T::Native::from_bytes_unchecked(b) }
333 }
334
335 pub fn value(&self, i: usize) -> &T::Native {
343 assert!(
344 i < self.len(),
345 "Trying to access an element at index {} from a {}{}Array of length {}",
346 i,
347 T::Offset::PREFIX,
348 T::PREFIX,
349 self.len()
350 );
351 unsafe { self.value_unchecked(i) }
354 }
355
/// Returns an [`ArrayIter`] over a reference to this array.
pub fn iter(&self) -> ArrayIter<&Self> {
    ArrayIter::new(self)
}
360
361 pub fn slice(&self, offset: usize, length: usize) -> Self {
363 Self {
364 data_type: T::DATA_TYPE,
365 value_offsets: self.value_offsets.slice(offset, length),
366 value_data: self.value_data.clone(),
367 nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
368 }
369 }
370
/// Attempts to convert this array into a [`GenericByteBuilder`] that reuses the
/// array's buffers.
///
/// Returns `Ok(builder)` when the null buffer (if any), the offset buffer and the
/// value buffer can all be turned into mutable buffers. Otherwise the array is
/// reassembled from the buffers and handed back as `Err(array)`.
pub fn into_builder(self) -> Result<GenericByteBuilder<T>, Self> {
    let len = self.len();
    // Number of value bytes spanned by the offsets of this array.
    let value_len = T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]);

    let data = self.into_data();
    let null_bit_buffer = data.nulls().map(|b| b.inner().sliced());

    // Offsets: `len + 1` entries of `size_of::<T::Offset>()` bytes each.
    let element_len = std::mem::size_of::<T::Offset>();
    let offset_buffer = data.buffers()[0]
        .slice_with_length(data.offset() * element_len, (len + 1) * element_len);

    // Values: `value_len` single bytes.
    let element_len = std::mem::size_of::<u8>();
    let value_buffer = data.buffers()[1]
        .slice_with_length(data.offset() * element_len, value_len * element_len);

    // Release the `ArrayData` before the conversions below.
    // NOTE(review): presumably `into_mutable` fails while another handle to the
    // buffer is alive - confirm against `Buffer::into_mutable`.
    drop(data);

    let try_mutable_null_buffer = match null_bit_buffer {
        None => Ok(None),
        Some(null_buffer) => {
            // A null buffer exists: try to make it mutable.
            null_buffer.into_mutable().map(Some)
        }
    };

    let try_mutable_buffers = match try_mutable_null_buffer {
        // The null buffer is mutable (or absent): try the offset and value buffers.
        Ok(mutable_null_buffer) => {
            let try_mutable_offset_buffer = offset_buffer.into_mutable();
            let try_mutable_value_buffer = value_buffer.into_mutable();

            // Matching on the pair keeps `mutable_null_buffer` available to every
            // arm, including the ones that convert it back to an immutable buffer.
            match (try_mutable_offset_buffer, try_mutable_value_buffer) {
                (Ok(mutable_offset_buffer), Ok(mutable_value_buffer)) => unsafe {
                    Ok(GenericByteBuilder::<T>::new_from_buffer(
                        mutable_offset_buffer,
                        mutable_value_buffer,
                        mutable_null_buffer,
                    ))
                },
                (Ok(mutable_offset_buffer), Err(value_buffer)) => Err((
                    mutable_offset_buffer.into(),
                    value_buffer,
                    mutable_null_buffer.map(|b| b.into()),
                )),
                (Err(offset_buffer), Ok(mutable_value_buffer)) => Err((
                    offset_buffer,
                    mutable_value_buffer.into(),
                    mutable_null_buffer.map(|b| b.into()),
                )),
                (Err(offset_buffer), Err(value_buffer)) => Err((
                    offset_buffer,
                    value_buffer,
                    mutable_null_buffer.map(|b| b.into()),
                )),
            }
        }
        // The null buffer could not be made mutable: keep all buffers as they are.
        Err(mutable_null_buffer) => {
            Err((offset_buffer, value_buffer, Some(mutable_null_buffer)))
        }
    };

    match try_mutable_buffers {
        Ok(builder) => Ok(builder),
        Err((offset_buffer, value_buffer, null_bit_buffer)) => {
            // Rebuild the array from the (immutable) buffers and return it.
            let builder = ArrayData::builder(T::DATA_TYPE)
                .len(len)
                .add_buffer(offset_buffer)
                .add_buffer(value_buffer)
                .null_bit_buffer(null_bit_buffer);

            // NOTE(review): built without validation; this assumes the buffers
            // still describe the array they were taken from.
            let array_data = unsafe { builder.build_unchecked() };
            let array = GenericByteArray::<T>::from(array_data);

            Err(array)
        }
    }
}
453}
454
455impl<T: ByteArrayType> std::fmt::Debug for GenericByteArray<T> {
456 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
457 write!(f, "{}{}Array\n[\n", T::Offset::PREFIX, T::PREFIX)?;
458 print_long_array(self, f, |array, index, f| {
459 std::fmt::Debug::fmt(&array.value(index), f)
460 })?;
461 write!(f, "]")
462 }
463}
464
/// Marks [`GenericByteArray`] with the private `Sealed` trait of the parent module.
impl<T: ByteArrayType> super::private::Sealed for GenericByteArray<T> {}
466
467impl<T: ByteArrayType> Array for GenericByteArray<T> {
468 fn as_any(&self) -> &dyn Any {
469 self
470 }
471
472 fn to_data(&self) -> ArrayData {
473 self.clone().into()
474 }
475
476 fn into_data(self) -> ArrayData {
477 self.into()
478 }
479
480 fn data_type(&self) -> &DataType {
481 &self.data_type
482 }
483
484 fn slice(&self, offset: usize, length: usize) -> ArrayRef {
485 Arc::new(self.slice(offset, length))
486 }
487
488 fn len(&self) -> usize {
489 self.value_offsets.len() - 1
490 }
491
492 fn is_empty(&self) -> bool {
493 self.value_offsets.len() <= 1
494 }
495
496 fn shrink_to_fit(&mut self) {
497 self.value_offsets.shrink_to_fit();
498 self.value_data.shrink_to_fit();
499 if let Some(nulls) = &mut self.nulls {
500 nulls.shrink_to_fit();
501 }
502 }
503
504 fn offset(&self) -> usize {
505 0
506 }
507
508 fn nulls(&self) -> Option<&NullBuffer> {
509 self.nulls.as_ref()
510 }
511
512 fn logical_null_count(&self) -> usize {
513 self.null_count()
515 }
516
517 fn get_buffer_memory_size(&self) -> usize {
518 let mut sum = self.value_offsets.inner().inner().capacity();
519 sum += self.value_data.capacity();
520 if let Some(x) = &self.nulls {
521 sum += x.buffer().capacity()
522 }
523 sum
524 }
525
526 fn get_array_memory_size(&self) -> usize {
527 std::mem::size_of::<Self>() + self.get_buffer_memory_size()
528 }
529}
530
/// Element access through a shared reference to a [`GenericByteArray`]; both
/// methods forward to `GenericByteArray::value` / `GenericByteArray::value_unchecked`.
impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> {
    /// Elements are borrowed from the array for the lifetime `'a`.
    type Item = &'a T::Native;

    /// Returns the element at `index`; see `GenericByteArray::value`, which asserts
    /// that the index is in bounds.
    fn value(&self, index: usize) -> Self::Item {
        GenericByteArray::value(self, index)
    }

    /// Returns the element at `index` without bounds checking.
    ///
    /// # Safety
    ///
    /// Same contract as `GenericByteArray::value_unchecked`, to which this forwards.
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
        // SAFETY: the caller upholds the contract of the forwarded method.
        unsafe { GenericByteArray::value_unchecked(self, index) }
    }
}
542
543impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
544 fn from(data: ArrayData) -> Self {
545 assert_eq!(
546 data.data_type(),
547 &Self::DATA_TYPE,
548 "{}{}Array expects DataType::{}",
549 T::Offset::PREFIX,
550 T::PREFIX,
551 Self::DATA_TYPE
552 );
553 assert_eq!(
554 data.buffers().len(),
555 2,
556 "{}{}Array data should contain 2 buffers only (offsets and values)",
557 T::Offset::PREFIX,
558 T::PREFIX,
559 );
560 let value_offsets = unsafe { get_offsets(&data) };
563 let value_data = data.buffers()[1].clone();
564 Self {
565 value_offsets,
566 value_data,
567 data_type: T::DATA_TYPE,
568 nulls: data.nulls().cloned(),
569 }
570 }
571}
572
573impl<T: ByteArrayType> From<GenericByteArray<T>> for ArrayData {
574 fn from(array: GenericByteArray<T>) -> Self {
575 let len = array.len();
576
577 let offsets = array.value_offsets.into_inner().into_inner();
578 let builder = ArrayDataBuilder::new(array.data_type)
579 .len(len)
580 .buffers(vec![offsets, array.value_data])
581 .nulls(array.nulls);
582
583 unsafe { builder.build_unchecked() }
584 }
585}
586
/// Iterating over `&GenericByteArray` yields `Option<&T::Native>` items through an
/// [`ArrayIter`].
impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
    type Item = Option<&'a T::Native>;
    type IntoIter = ArrayIter<Self>;

    /// Creates the [`ArrayIter`] over this array reference.
    fn into_iter(self) -> Self::IntoIter {
        ArrayIter::new(self)
    }
}
595
596impl<'a, Ptr, T: ByteArrayType> FromIterator<&'a Option<Ptr>> for GenericByteArray<T>
597where
598 Ptr: AsRef<T::Native> + 'a,
599{
600 fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
601 iter.into_iter()
602 .map(|o| o.as_ref().map(|p| p.as_ref()))
603 .collect()
604 }
605}
606
607impl<Ptr, T: ByteArrayType> FromIterator<Option<Ptr>> for GenericByteArray<T>
608where
609 Ptr: AsRef<T::Native>,
610{
611 fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
612 let iter = iter.into_iter();
613 let mut builder = GenericByteBuilder::with_capacity(iter.size_hint().0, 1024);
614 builder.extend(iter);
615 builder.finish()
616 }
617}
618
619#[cfg(test)]
620mod tests {
621 use crate::{Array, BinaryArray, StringArray};
622 use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};
623
/// Exercises the validation performed by `new` / `try_new` for both string and
/// binary arrays.
#[test]
fn try_new() {
    // Valid input: two 5-byte values.
    let data = Buffer::from_slice_ref("helloworld");
    let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
    StringArray::new(offsets.clone(), data.clone(), None);

    // Null buffer of length 3 for an array of length 2.
    let nulls = NullBuffer::new_null(3);
    let err =
        StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())).unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3"
    );

    let err = BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3"
    );

    // Invalid UTF-8 is rejected for strings ...
    let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld");
    let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None).unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2"
    );

    // ... but accepted for binary data.
    BinaryArray::new(offsets, non_utf8_data, None);

    // Last offset (11) is past the end of the 10-byte values buffer.
    let offsets = OffsetBuffer::new(vec![0, 5, 11].into());
    let err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Offset of 11 exceeds length of values 10"
    );

    let err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Maximum offset of 11 is larger than values of length 10"
    );

    // Non-ASCII but valid UTF-8 data that the same offsets split on a character boundary.
    let non_ascii_data = Buffer::from_slice_ref("heìloworld");
    StringArray::new(offsets.clone(), non_ascii_data.clone(), None);
    BinaryArray::new(offsets, non_ascii_data.clone(), None);

    // Offset 3 falls inside a multi-byte character: rejected for strings only.
    let offsets = OffsetBuffer::new(vec![0, 3, 10].into());
    let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None).unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Split UTF-8 codepoint at offset 3"
    );

    BinaryArray::new(offsets, non_ascii_data, None);
}
679
/// `new_repeated` yields the requested number of identical elements for both
/// binary and string arrays.
#[test]
fn create_repeated() {
    let binary = BinaryArray::new_repeated(b"hello", 3);
    assert_eq!(binary.len(), 3);
    for index in 0..3 {
        assert_eq!(binary.value(index), b"hello");
    }

    let strings = StringArray::new_repeated("world", 2);
    assert_eq!(strings.len(), 2);
    for index in 0..2 {
        assert_eq!(strings.value(index), "world");
    }
}
693
/// One repetition more than fits into `usize` bytes must panic with "usize overflow".
#[test]
#[should_panic(expected = "usize overflow")]
fn create_repeated_usize_overflow_1() {
    let repeat_count = (usize::MAX / "hello".len()) + 1;
    let _arr = BinaryArray::new_repeated(b"hello", repeat_count);
}
699
/// Repeating a non-empty value `usize::MAX` times must panic with "usize overflow".
#[test]
#[should_panic(expected = "usize overflow")]
fn create_repeated_usize_overflow_2() {
    let repeat_count = usize::MAX;
    let _arr = BinaryArray::new_repeated(b"hello", repeat_count);
}
705
/// A total length that still fits `usize` but not the `BinaryArray` offset type
/// must panic with "offset overflow".
#[test]
#[should_panic(expected = "offset overflow")]
fn create_repeated_i32_offset_overflow_1() {
    let repeat_count = usize::MAX / "hello".len();
    let _arr = BinaryArray::new_repeated(b"hello", repeat_count);
}
711
/// One repetition more than fits into `i32::MAX` bytes must panic with
/// "offset overflow".
#[test]
#[should_panic(expected = "offset overflow")]
fn create_repeated_i32_offset_overflow_2() {
    let repeat_count = ((i32::MAX as usize) / "hello".len()) + 1;
    let _arr = BinaryArray::new_repeated(b"hello", repeat_count);
}
717}