1use crate::array::{get_offsets, print_long_array};
19use crate::builder::GenericByteBuilder;
20use crate::iterator::ArrayIter;
21use crate::types::ByteArrayType;
22use crate::types::bytes::ByteArrayNativeType;
23use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
25use arrow_buffer::{NullBuffer, OffsetBuffer};
26use arrow_data::{ArrayData, ArrayDataBuilder};
27use arrow_schema::{ArrowError, DataType};
28use std::any::Any;
29use std::sync::Arc;
30
/// An array of variable-length byte sequences, generic over the [`ByteArrayType`] `T`.
///
/// Element `i` is stored as the bytes of `value_data` located between
/// `value_offsets[i]` and `value_offsets[i + 1]`, so the array holds
/// `value_offsets.len() - 1` elements.
pub struct GenericByteArray<T: ByteArrayType> {
    /// Logical data type; every constructor in this file sets it to `T::DATA_TYPE`.
    data_type: DataType,
    /// Start/end positions of each element inside `value_data`.
    value_offsets: OffsetBuffer<T::Offset>,
    /// Backing bytes that the offsets index into.
    value_data: Buffer,
    /// Optional null buffer; `None` when the array carries no null information.
    nulls: Option<NullBuffer>,
}
93
94impl<T: ByteArrayType> Clone for GenericByteArray<T> {
95 fn clone(&self) -> Self {
96 Self {
97 data_type: T::DATA_TYPE,
98 value_offsets: self.value_offsets.clone(),
99 value_data: self.value_data.clone(),
100 nulls: self.nulls.clone(),
101 }
102 }
103}
104
105impl<T: ByteArrayType> GenericByteArray<T> {
    /// Data type of this array, taken from [`ByteArrayType::DATA_TYPE`].
    pub const DATA_TYPE: DataType = T::DATA_TYPE;
108
109 pub fn new(
115 offsets: OffsetBuffer<T::Offset>,
116 values: Buffer,
117 nulls: Option<NullBuffer>,
118 ) -> Self {
119 Self::try_new(offsets, values, nulls).unwrap()
120 }
121
122 pub fn try_new(
129 offsets: OffsetBuffer<T::Offset>,
130 values: Buffer,
131 nulls: Option<NullBuffer>,
132 ) -> Result<Self, ArrowError> {
133 let len = offsets.len() - 1;
134
135 T::validate(&offsets, &values)?;
137
138 if let Some(n) = nulls.as_ref() {
139 if n.len() != len {
140 return Err(ArrowError::InvalidArgumentError(format!(
141 "Incorrect length of null buffer for {}{}Array, expected {len} got {}",
142 T::Offset::PREFIX,
143 T::PREFIX,
144 n.len(),
145 )));
146 }
147 }
148
149 Ok(Self {
150 data_type: T::DATA_TYPE,
151 value_offsets: offsets,
152 value_data: values,
153 nulls,
154 })
155 }
156
157 pub unsafe fn new_unchecked(
163 offsets: OffsetBuffer<T::Offset>,
164 values: Buffer,
165 nulls: Option<NullBuffer>,
166 ) -> Self {
167 if cfg!(feature = "force_validate") {
168 return Self::new(offsets, values, nulls);
169 }
170 Self {
171 data_type: T::DATA_TYPE,
172 value_offsets: offsets,
173 value_data: values,
174 nulls,
175 }
176 }
177
178 pub fn new_null(len: usize) -> Self {
180 Self {
181 data_type: T::DATA_TYPE,
182 value_offsets: OffsetBuffer::new_zeroed(len),
183 value_data: MutableBuffer::new(0).into(),
184 nulls: Some(NullBuffer::new_null(len)),
185 }
186 }
187
188 pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
190 Scalar::new(Self::from_iter_values(std::iter::once(value)))
191 }
192
193 pub fn from_iter_values<Ptr, I>(iter: I) -> Self
195 where
196 Ptr: AsRef<T::Native>,
197 I: IntoIterator<Item = Ptr>,
198 {
199 let iter = iter.into_iter();
200 let (_, data_len) = iter.size_hint();
201 let data_len = data_len.expect("Iterator must be sized"); let mut offsets = MutableBuffer::new((data_len + 1) * std::mem::size_of::<T::Offset>());
204 offsets.push(T::Offset::usize_as(0));
205
206 let mut values = MutableBuffer::new(0);
207 for s in iter {
208 let s: &[u8] = s.as_ref().as_ref();
209 values.extend_from_slice(s);
210 offsets.push(T::Offset::usize_as(values.len()));
211 }
212
213 T::Offset::from_usize(values.len()).expect("offset overflow");
214 let offsets = Buffer::from(offsets);
215
216 let value_offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
218
219 Self {
220 data_type: T::DATA_TYPE,
221 value_data: values.into(),
222 value_offsets,
223 nulls: None,
224 }
225 }
226
227 pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
229 (self.value_offsets, self.value_data, self.nulls)
230 }
231
232 #[inline]
236 pub fn value_length(&self, i: usize) -> T::Offset {
237 let offsets = self.value_offsets();
238 offsets[i + 1] - offsets[i]
239 }
240
    /// Returns a reference to the offsets buffer of this array.
    #[inline]
    pub fn offsets(&self) -> &OffsetBuffer<T::Offset> {
        &self.value_offsets
    }
249
    /// Returns a reference to the values buffer of this array.
    ///
    /// This is the whole backing buffer, not only the range covered by the offsets.
    #[inline]
    pub fn values(&self) -> &Buffer {
        &self.value_data
    }
258
    /// Returns the values buffer as a raw byte slice.
    pub fn value_data(&self) -> &[u8] {
        self.value_data.as_slice()
    }
263
264 pub fn is_ascii(&self) -> bool {
266 let offsets = self.value_offsets();
267 let start = offsets.first().unwrap();
268 let end = offsets.last().unwrap();
269 self.value_data()[start.as_usize()..end.as_usize()].is_ascii()
270 }
271
    /// Returns the offsets of this array as a plain slice.
    #[inline]
    pub fn value_offsets(&self) -> &[T::Offset] {
        &self.value_offsets
    }
277
278 pub unsafe fn value_unchecked(&self, i: usize) -> &T::Native {
286 let end = *unsafe { self.value_offsets().get_unchecked(i + 1) };
287 let start = *unsafe { self.value_offsets().get_unchecked(i) };
288
289 let b = unsafe {
299 std::slice::from_raw_parts(
300 self.value_data
301 .as_ptr()
302 .offset(start.to_isize().unwrap_unchecked()),
303 (end - start).to_usize().unwrap_unchecked(),
304 )
305 };
306
307 unsafe { T::Native::from_bytes_unchecked(b) }
310 }
311
312 pub fn value(&self, i: usize) -> &T::Native {
320 assert!(
321 i < self.len(),
322 "Trying to access an element at index {} from a {}{}Array of length {}",
323 i,
324 T::Offset::PREFIX,
325 T::PREFIX,
326 self.len()
327 );
328 unsafe { self.value_unchecked(i) }
331 }
332
    /// Creates an [`ArrayIter`] over this array.
    pub fn iter(&self) -> ArrayIter<&Self> {
        ArrayIter::new(self)
    }
337
338 pub fn slice(&self, offset: usize, length: usize) -> Self {
340 Self {
341 data_type: T::DATA_TYPE,
342 value_offsets: self.value_offsets.slice(offset, length),
343 value_data: self.value_data.clone(),
344 nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
345 }
346 }
347
348 pub fn into_builder(self) -> Result<GenericByteBuilder<T>, Self> {
351 let len = self.len();
352 let value_len = T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]);
353
354 let data = self.into_data();
355 let null_bit_buffer = data.nulls().map(|b| b.inner().sliced());
356
357 let element_len = std::mem::size_of::<T::Offset>();
358 let offset_buffer = data.buffers()[0]
359 .slice_with_length(data.offset() * element_len, (len + 1) * element_len);
360
361 let element_len = std::mem::size_of::<u8>();
362 let value_buffer = data.buffers()[1]
363 .slice_with_length(data.offset() * element_len, value_len * element_len);
364
365 drop(data);
366
367 let try_mutable_null_buffer = match null_bit_buffer {
368 None => Ok(None),
369 Some(null_buffer) => {
370 null_buffer.into_mutable().map(Some)
372 }
373 };
374
375 let try_mutable_buffers = match try_mutable_null_buffer {
376 Ok(mutable_null_buffer) => {
377 let try_mutable_offset_buffer = offset_buffer.into_mutable();
379 let try_mutable_value_buffer = value_buffer.into_mutable();
380
381 match (try_mutable_offset_buffer, try_mutable_value_buffer) {
384 (Ok(mutable_offset_buffer), Ok(mutable_value_buffer)) => unsafe {
385 Ok(GenericByteBuilder::<T>::new_from_buffer(
386 mutable_offset_buffer,
387 mutable_value_buffer,
388 mutable_null_buffer,
389 ))
390 },
391 (Ok(mutable_offset_buffer), Err(value_buffer)) => Err((
392 mutable_offset_buffer.into(),
393 value_buffer,
394 mutable_null_buffer.map(|b| b.into()),
395 )),
396 (Err(offset_buffer), Ok(mutable_value_buffer)) => Err((
397 offset_buffer,
398 mutable_value_buffer.into(),
399 mutable_null_buffer.map(|b| b.into()),
400 )),
401 (Err(offset_buffer), Err(value_buffer)) => Err((
402 offset_buffer,
403 value_buffer,
404 mutable_null_buffer.map(|b| b.into()),
405 )),
406 }
407 }
408 Err(mutable_null_buffer) => {
409 Err((offset_buffer, value_buffer, Some(mutable_null_buffer)))
411 }
412 };
413
414 match try_mutable_buffers {
415 Ok(builder) => Ok(builder),
416 Err((offset_buffer, value_buffer, null_bit_buffer)) => {
417 let builder = ArrayData::builder(T::DATA_TYPE)
418 .len(len)
419 .add_buffer(offset_buffer)
420 .add_buffer(value_buffer)
421 .null_bit_buffer(null_bit_buffer);
422
423 let array_data = unsafe { builder.build_unchecked() };
424 let array = GenericByteArray::<T>::from(array_data);
425
426 Err(array)
427 }
428 }
429 }
430}
431
432impl<T: ByteArrayType> std::fmt::Debug for GenericByteArray<T> {
433 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
434 write!(f, "{}{}Array\n[\n", T::Offset::PREFIX, T::PREFIX)?;
435 print_long_array(self, f, |array, index, f| {
436 std::fmt::Debug::fmt(&array.value(index), f)
437 })?;
438 write!(f, "]")
439 }
440}
441
442impl<T: ByteArrayType> Array for GenericByteArray<T> {
443 fn as_any(&self) -> &dyn Any {
444 self
445 }
446
447 fn to_data(&self) -> ArrayData {
448 self.clone().into()
449 }
450
451 fn into_data(self) -> ArrayData {
452 self.into()
453 }
454
455 fn data_type(&self) -> &DataType {
456 &self.data_type
457 }
458
459 fn slice(&self, offset: usize, length: usize) -> ArrayRef {
460 Arc::new(self.slice(offset, length))
461 }
462
463 fn len(&self) -> usize {
464 self.value_offsets.len() - 1
465 }
466
467 fn is_empty(&self) -> bool {
468 self.value_offsets.len() <= 1
469 }
470
471 fn shrink_to_fit(&mut self) {
472 self.value_offsets.shrink_to_fit();
473 self.value_data.shrink_to_fit();
474 if let Some(nulls) = &mut self.nulls {
475 nulls.shrink_to_fit();
476 }
477 }
478
479 fn offset(&self) -> usize {
480 0
481 }
482
483 fn nulls(&self) -> Option<&NullBuffer> {
484 self.nulls.as_ref()
485 }
486
487 fn logical_null_count(&self) -> usize {
488 self.null_count()
490 }
491
492 fn get_buffer_memory_size(&self) -> usize {
493 let mut sum = self.value_offsets.inner().inner().capacity();
494 sum += self.value_data.capacity();
495 if let Some(x) = &self.nulls {
496 sum += x.buffer().capacity()
497 }
498 sum
499 }
500
501 fn get_array_memory_size(&self) -> usize {
502 std::mem::size_of::<Self>() + self.get_buffer_memory_size()
503 }
504}
505
/// Element access for references to [`GenericByteArray`]; both methods forward
/// to the inherent methods of the same name.
impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> {
    type Item = &'a T::Native;

    /// Forwards to the inherent, bounds-checked `value`.
    fn value(&self, index: usize) -> Self::Item {
        GenericByteArray::value(self, index)
    }

    /// Forwards to the inherent `value_unchecked`; the caller must uphold that
    /// method's safety contract.
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
        unsafe { GenericByteArray::value_unchecked(self, index) }
    }
}
517
518impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
519 fn from(data: ArrayData) -> Self {
520 assert_eq!(
521 data.data_type(),
522 &Self::DATA_TYPE,
523 "{}{}Array expects DataType::{}",
524 T::Offset::PREFIX,
525 T::PREFIX,
526 Self::DATA_TYPE
527 );
528 assert_eq!(
529 data.buffers().len(),
530 2,
531 "{}{}Array data should contain 2 buffers only (offsets and values)",
532 T::Offset::PREFIX,
533 T::PREFIX,
534 );
535 let value_offsets = unsafe { get_offsets(&data) };
538 let value_data = data.buffers()[1].clone();
539 Self {
540 value_offsets,
541 value_data,
542 data_type: T::DATA_TYPE,
543 nulls: data.nulls().cloned(),
544 }
545 }
546}
547
548impl<T: ByteArrayType> From<GenericByteArray<T>> for ArrayData {
549 fn from(array: GenericByteArray<T>) -> Self {
550 let len = array.len();
551
552 let offsets = array.value_offsets.into_inner().into_inner();
553 let builder = ArrayDataBuilder::new(array.data_type)
554 .len(len)
555 .buffers(vec![offsets, array.value_data])
556 .nulls(array.nulls);
557
558 unsafe { builder.build_unchecked() }
559 }
560}
561
562impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
563 type Item = Option<&'a T::Native>;
564 type IntoIter = ArrayIter<Self>;
565
566 fn into_iter(self) -> Self::IntoIter {
567 ArrayIter::new(self)
568 }
569}
570
571impl<'a, Ptr, T: ByteArrayType> FromIterator<&'a Option<Ptr>> for GenericByteArray<T>
572where
573 Ptr: AsRef<T::Native> + 'a,
574{
575 fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
576 iter.into_iter()
577 .map(|o| o.as_ref().map(|p| p.as_ref()))
578 .collect()
579 }
580}
581
582impl<Ptr, T: ByteArrayType> FromIterator<Option<Ptr>> for GenericByteArray<T>
583where
584 Ptr: AsRef<T::Native>,
585{
586 fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
587 let iter = iter.into_iter();
588 let mut builder = GenericByteBuilder::with_capacity(iter.size_hint().0, 1024);
589 builder.extend(iter);
590 builder.finish()
591 }
592}
593
#[cfg(test)]
mod tests {
    use crate::{BinaryArray, StringArray};
    use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};

    /// Exercises the validation performed by `new` / `try_new` for string and
    /// binary arrays.
    #[test]
    fn try_new() {
        // Valid input: two five-byte elements.
        let data = Buffer::from_slice_ref("helloworld");
        let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
        StringArray::new(offsets.clone(), data.clone(), None);

        // Null buffer of length 3 does not match the 2 elements.
        let nulls = NullBuffer::new_null(3);
        let err =
            StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3"
        );

        let err = BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3"
        );

        // Invalid UTF-8 is rejected for strings but accepted for binary.
        let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld");
        let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2"
        );

        BinaryArray::new(offsets, non_utf8_data, None);

        // The last offset (11) lies beyond the 10 bytes of `data`.
        let offsets = OffsetBuffer::new(vec![0, 5, 11].into());
        let err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Offset of 11 exceeds length of values 10"
        );

        let err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Maximum offset of 11 is larger than values of length 10"
        );

        // The same offsets are valid for this 11-byte, non-ASCII UTF-8 input.
        let non_ascii_data = Buffer::from_slice_ref("heìloworld");
        StringArray::new(offsets.clone(), non_ascii_data.clone(), None);
        BinaryArray::new(offsets, non_ascii_data.clone(), None);

        // Offset 3 falls inside the two-byte `ì`: rejected for strings,
        // accepted for binary.
        let offsets = OffsetBuffer::new(vec![0, 3, 10].into());
        let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Split UTF-8 codepoint at offset 3"
        );

        BinaryArray::new(offsets, non_ascii_data, None);
    }
}