1#![allow(clippy::enum_clike_unportable_variant)]
18
19use crate::{make_array, Array, ArrayRef};
20use arrow_buffer::bit_chunk_iterator::{BitChunkIterator, BitChunks};
21use arrow_buffer::buffer::NullBuffer;
22use arrow_buffer::{BooleanBuffer, MutableBuffer, ScalarBuffer};
23use arrow_data::{ArrayData, ArrayDataBuilder};
24use arrow_schema::{ArrowError, DataType, UnionFields, UnionMode};
25use std::any::Any;
28use std::collections::HashSet;
29use std::sync::Arc;
30
/// An array whose elements may each come from a different child array.
///
/// Every element carries a type id selecting one of the children. When
/// `offsets` is present (dense union) the element's value is read from the
/// selected child at the stored offset; when it is absent (sparse union) the
/// value is read from the selected child at the element's own index.
#[derive(Clone)]
pub struct UnionArray {
    /// The `DataType::Union` (fields and mode) describing this array.
    data_type: DataType,
    /// One type id per element; its length is the length of the array.
    type_ids: ScalarBuffer<i8>,
    /// Per-element offsets into the selected child: `Some` for dense unions,
    /// `None` for sparse unions.
    offsets: Option<ScalarBuffer<i32>>,
    /// Child arrays indexed by type id; slots whose id has no field are `None`.
    fields: Vec<Option<ArrayRef>>,
}
129
130impl UnionArray {
131 pub unsafe fn new_unchecked(
150 fields: UnionFields,
151 type_ids: ScalarBuffer<i8>,
152 offsets: Option<ScalarBuffer<i32>>,
153 children: Vec<ArrayRef>,
154 ) -> Self {
155 let mode = if offsets.is_some() {
156 UnionMode::Dense
157 } else {
158 UnionMode::Sparse
159 };
160
161 let len = type_ids.len();
162 let builder = ArrayData::builder(DataType::Union(fields, mode))
163 .add_buffer(type_ids.into_inner())
164 .child_data(children.into_iter().map(Array::into_data).collect())
165 .len(len);
166
167 let data = match offsets {
168 Some(offsets) => builder.add_buffer(offsets.into_inner()).build_unchecked(),
169 None => builder.build_unchecked(),
170 };
171 Self::from(data)
172 }
173
174 pub fn try_new(
178 fields: UnionFields,
179 type_ids: ScalarBuffer<i8>,
180 offsets: Option<ScalarBuffer<i32>>,
181 children: Vec<ArrayRef>,
182 ) -> Result<Self, ArrowError> {
183 if fields.len() != children.len() {
185 return Err(ArrowError::InvalidArgumentError(
186 "Union fields length must match child arrays length".to_string(),
187 ));
188 }
189
190 if let Some(offsets) = &offsets {
191 if offsets.len() != type_ids.len() {
193 return Err(ArrowError::InvalidArgumentError(
194 "Type Ids and Offsets lengths must match".to_string(),
195 ));
196 }
197 } else {
198 for child in &children {
200 if child.len() != type_ids.len() {
201 return Err(ArrowError::InvalidArgumentError(
202 "Sparse union child arrays must be equal in length to the length of the union".to_string(),
203 ));
204 }
205 }
206 }
207
208 let max_id = fields.iter().map(|(i, _)| i).max().unwrap_or_default() as usize;
210 let mut array_lens = vec![i32::MIN; max_id + 1];
211 for (cd, (field_id, _)) in children.iter().zip(fields.iter()) {
212 array_lens[field_id as usize] = cd.len() as i32;
213 }
214
215 for id in &type_ids {
217 match array_lens.get(*id as usize) {
218 Some(x) if *x != i32::MIN => {}
219 _ => {
220 return Err(ArrowError::InvalidArgumentError(
221 "Type Ids values must match one of the field type ids".to_owned(),
222 ))
223 }
224 }
225 }
226
227 if let Some(offsets) = &offsets {
229 let mut iter = type_ids.iter().zip(offsets.iter());
230 if iter.any(|(type_id, &offset)| offset < 0 || offset >= array_lens[*type_id as usize])
231 {
232 return Err(ArrowError::InvalidArgumentError(
233 "Offsets must be positive and within the length of the Array".to_owned(),
234 ));
235 }
236 }
237
238 let union_array = unsafe { Self::new_unchecked(fields, type_ids, offsets, children) };
241 Ok(union_array)
242 }
243
244 pub fn child(&self, type_id: i8) -> &ArrayRef {
251 assert!((type_id as usize) < self.fields.len());
252 let boxed = &self.fields[type_id as usize];
253 boxed.as_ref().expect("invalid type id")
254 }
255
256 pub fn type_id(&self, index: usize) -> i8 {
262 assert!(index < self.type_ids.len());
263 self.type_ids[index]
264 }
265
266 pub fn type_ids(&self) -> &ScalarBuffer<i8> {
268 &self.type_ids
269 }
270
271 pub fn offsets(&self) -> Option<&ScalarBuffer<i32>> {
273 self.offsets.as_ref()
274 }
275
276 pub fn value_offset(&self, index: usize) -> usize {
282 assert!(index < self.len());
283 match &self.offsets {
284 Some(offsets) => offsets[index] as usize,
285 None => self.offset() + index,
286 }
287 }
288
289 pub fn value(&self, i: usize) -> ArrayRef {
293 let type_id = self.type_id(i);
294 let value_offset = self.value_offset(i);
295 let child = self.child(type_id);
296 child.slice(value_offset, 1)
297 }
298
299 pub fn type_names(&self) -> Vec<&str> {
301 match self.data_type() {
302 DataType::Union(fields, _) => fields
303 .iter()
304 .map(|(_, f)| f.name().as_str())
305 .collect::<Vec<&str>>(),
306 _ => unreachable!("Union array's data type is not a union!"),
307 }
308 }
309
310 fn is_dense(&self) -> bool {
312 match self.data_type() {
313 DataType::Union(_, mode) => mode == &UnionMode::Dense,
314 _ => unreachable!("Union array's data type is not a union!"),
315 }
316 }
317
318 pub fn slice(&self, offset: usize, length: usize) -> Self {
320 let (offsets, fields) = match self.offsets.as_ref() {
321 Some(offsets) => (Some(offsets.slice(offset, length)), self.fields.clone()),
323 None => {
325 let fields = self
326 .fields
327 .iter()
328 .map(|x| x.as_ref().map(|x| x.slice(offset, length)))
329 .collect();
330 (None, fields)
331 }
332 };
333
334 Self {
335 data_type: self.data_type.clone(),
336 type_ids: self.type_ids.slice(offset, length),
337 offsets,
338 fields,
339 }
340 }
341
342 #[allow(clippy::type_complexity)]
370 pub fn into_parts(
371 self,
372 ) -> (
373 UnionFields,
374 ScalarBuffer<i8>,
375 Option<ScalarBuffer<i32>>,
376 Vec<ArrayRef>,
377 ) {
378 let Self {
379 data_type,
380 type_ids,
381 offsets,
382 mut fields,
383 } = self;
384 match data_type {
385 DataType::Union(union_fields, _) => {
386 let children = union_fields
387 .iter()
388 .map(|(type_id, _)| fields[type_id as usize].take().unwrap())
389 .collect();
390 (union_fields, type_ids, offsets, children)
391 }
392 _ => unreachable!(),
393 }
394 }
395
396 fn mask_sparse_skip_without_nulls(&self, nulls: Vec<(i8, NullBuffer)>) -> BooleanBuffer {
398 let fold = |(with_nulls_selected, union_nulls), (is_field, field_nulls)| {
404 (
405 with_nulls_selected | is_field,
406 union_nulls | (is_field & field_nulls),
407 )
408 };
409
410 self.mask_sparse_helper(
411 nulls,
412 |type_ids_chunk_array, nulls_masks_iters| {
413 let (with_nulls_selected, union_nulls) = nulls_masks_iters
414 .iter_mut()
415 .map(|(field_type_id, field_nulls)| {
416 let field_nulls = field_nulls.next().unwrap();
417 let is_field = selection_mask(type_ids_chunk_array, *field_type_id);
418
419 (is_field, field_nulls)
420 })
421 .fold((0, 0), fold);
422
423 let without_nulls_selected = !with_nulls_selected;
425
426 without_nulls_selected | union_nulls
429 },
430 |type_ids_remainder, bit_chunks| {
431 let (with_nulls_selected, union_nulls) = bit_chunks
432 .iter()
433 .map(|(field_type_id, field_bit_chunks)| {
434 let field_nulls = field_bit_chunks.remainder_bits();
435 let is_field = selection_mask(type_ids_remainder, *field_type_id);
436
437 (is_field, field_nulls)
438 })
439 .fold((0, 0), fold);
440
441 let without_nulls_selected = !with_nulls_selected;
442
443 without_nulls_selected | union_nulls
444 },
445 )
446 }
447
448 fn mask_sparse_skip_fully_null(&self, mut nulls: Vec<(i8, NullBuffer)>) -> BooleanBuffer {
450 let fields = match self.data_type() {
451 DataType::Union(fields, _) => fields,
452 _ => unreachable!("Union array's data type is not a union!"),
453 };
454
455 let type_ids = fields.iter().map(|(id, _)| id).collect::<HashSet<_>>();
456 let with_nulls = nulls.iter().map(|(id, _)| *id).collect::<HashSet<_>>();
457
458 let without_nulls_ids = type_ids
459 .difference(&with_nulls)
460 .copied()
461 .collect::<Vec<_>>();
462
463 nulls.retain(|(_, nulls)| nulls.null_count() < nulls.len());
464
465 self.mask_sparse_helper(
470 nulls,
471 |type_ids_chunk_array, nulls_masks_iters| {
472 let union_nulls = nulls_masks_iters.iter_mut().fold(
473 0,
474 |union_nulls, (field_type_id, nulls_iter)| {
475 let field_nulls = nulls_iter.next().unwrap();
476
477 if field_nulls == 0 {
478 union_nulls
479 } else {
480 let is_field = selection_mask(type_ids_chunk_array, *field_type_id);
481
482 union_nulls | (is_field & field_nulls)
483 }
484 },
485 );
486
487 let without_nulls_selected =
489 without_nulls_selected(type_ids_chunk_array, &without_nulls_ids);
490
491 union_nulls | without_nulls_selected
494 },
495 |type_ids_remainder, bit_chunks| {
496 let union_nulls =
497 bit_chunks
498 .iter()
499 .fold(0, |union_nulls, (field_type_id, field_bit_chunks)| {
500 let is_field = selection_mask(type_ids_remainder, *field_type_id);
501 let field_nulls = field_bit_chunks.remainder_bits();
502
503 union_nulls | is_field & field_nulls
504 });
505
506 union_nulls | without_nulls_selected(type_ids_remainder, &without_nulls_ids)
507 },
508 )
509 }
510
511 fn mask_sparse_all_with_nulls_skip_one(&self, nulls: Vec<(i8, NullBuffer)>) -> BooleanBuffer {
513 self.mask_sparse_helper(
520 nulls,
521 |type_ids_chunk_array, nulls_masks_iters| {
522 let (is_not_first, union_nulls) = nulls_masks_iters[1..] .iter_mut()
524 .fold(
525 (0, 0),
526 |(is_not_first, union_nulls), (field_type_id, nulls_iter)| {
527 let field_nulls = nulls_iter.next().unwrap();
528 let is_field = selection_mask(type_ids_chunk_array, *field_type_id);
529
530 (
531 is_not_first | is_field,
532 union_nulls | (is_field & field_nulls),
533 )
534 },
535 );
536
537 let is_first = !is_not_first;
538 let first_nulls = nulls_masks_iters[0].1.next().unwrap();
539
540 (is_first & first_nulls) | union_nulls
541 },
542 |type_ids_remainder, bit_chunks| {
543 bit_chunks
544 .iter()
545 .fold(0, |union_nulls, (field_type_id, field_bit_chunks)| {
546 let field_nulls = field_bit_chunks.remainder_bits();
547 let is_field = selection_mask(type_ids_remainder, *field_type_id);
550
551 union_nulls | (is_field & field_nulls)
552 })
553 },
554 )
555 }
556
557 fn mask_sparse_helper(
560 &self,
561 nulls: Vec<(i8, NullBuffer)>,
562 mut mask_chunk: impl FnMut(&[i8; 64], &mut [(i8, BitChunkIterator)]) -> u64,
563 mask_remainder: impl FnOnce(&[i8], &[(i8, BitChunks)]) -> u64,
564 ) -> BooleanBuffer {
565 let bit_chunks = nulls
566 .iter()
567 .map(|(type_id, nulls)| (*type_id, nulls.inner().bit_chunks()))
568 .collect::<Vec<_>>();
569
570 let mut nulls_masks_iter = bit_chunks
571 .iter()
572 .map(|(type_id, bit_chunks)| (*type_id, bit_chunks.iter()))
573 .collect::<Vec<_>>();
574
575 let chunks_exact = self.type_ids.chunks_exact(64);
576 let remainder = chunks_exact.remainder();
577
578 let chunks = chunks_exact.map(|type_ids_chunk| {
579 let type_ids_chunk_array = <&[i8; 64]>::try_from(type_ids_chunk).unwrap();
580
581 mask_chunk(type_ids_chunk_array, &mut nulls_masks_iter)
582 });
583
584 let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) };
587
588 if !remainder.is_empty() {
589 buffer.push(mask_remainder(remainder, &bit_chunks));
590 }
591
592 BooleanBuffer::new(buffer.into(), 0, self.type_ids.len())
593 }
594
    /// Computes the validity mask of the union by looking up, element by
    /// element, the matching bit of the selected child's null buffer.
    ///
    /// `nulls` holds `(type id, null buffer)` pairs. Type ids without an
    /// entry are treated as always valid.
    fn gather_nulls(&self, nulls: Vec<(i8, NullBuffer)>) -> BooleanBuffer {
        let one_null = NullBuffer::new_null(1);
        let one_valid = NullBuffer::new_valid(1);

        // Lookup table indexed by the type id reinterpreted as `u8`, so every
        // possible `i8` has a slot. Each entry pairs a null buffer with a mask
        // applied to the lookup index: `Mask::Zero` forces the index to 0,
        // `Mask::Max` leaves it unchanged. The default entry always reads
        // bit 0 of a one-element all-valid buffer.
        let mut logical_nulls_array = [(&one_valid, Mask::Zero); 256];

        for (type_id, nulls) in &nulls {
            if nulls.null_count() == nulls.len() {
                // Entirely null: always read bit 0 of a one-element all-null buffer.
                logical_nulls_array[*type_id as u8 as usize] = (&one_null, Mask::Zero);
            } else {
                // Mixed: read the real buffer at the unmodified index.
                logical_nulls_array[*type_id as u8 as usize] = (nulls, Mask::Max);
            }
        }

        match &self.offsets {
            Some(offsets) => {
                // Makes the unchecked read of `offsets` below as safe as the
                // one of `type_ids`.
                assert_eq!(self.type_ids.len(), offsets.len());

                // NOTE(review): the unchecked reads assume `collect_bool` only
                // passes indices below the given length, and that every offset
                // is in bounds for its child's null buffer (checked by
                // `try_new`, but not by `new_unchecked`) — confirm.
                BooleanBuffer::collect_bool(self.type_ids.len(), |i| unsafe {
                    let type_id = *self.type_ids.get_unchecked(i);
                    let offset = *offsets.get_unchecked(i);

                    let (nulls, offset_mask) = &logical_nulls_array[type_id as u8 as usize];

                    nulls
                        .inner()
                        .value_unchecked(offset as usize & *offset_mask as usize)
                })
            }
            None => {
                // NOTE(review): as above; here the element index itself is
                // used, which assumes each child's null buffer is at least as
                // long as the type ids — confirm.
                BooleanBuffer::collect_bool(self.type_ids.len(), |index| unsafe {
                    let type_id = *self.type_ids.get_unchecked(index);

                    let (nulls, index_mask) = &logical_nulls_array[type_id as u8 as usize];

                    nulls.inner().value_unchecked(index & *index_mask as usize)
                })
            }
        }
    }
656
657 fn fields_logical_nulls(&self) -> Vec<(i8, NullBuffer)> {
660 self.fields
661 .iter()
662 .enumerate()
663 .filter_map(|(type_id, field)| Some((type_id as i8, field.as_ref()?.logical_nulls()?)))
664 .filter(|(_, nulls)| nulls.null_count() > 0)
665 .collect()
666 }
667}
668
669impl From<ArrayData> for UnionArray {
670 fn from(data: ArrayData) -> Self {
671 let (fields, mode) = match data.data_type() {
672 DataType::Union(fields, mode) => (fields, *mode),
673 d => panic!("UnionArray expected ArrayData with type Union got {d}"),
674 };
675 let (type_ids, offsets) = match mode {
676 UnionMode::Sparse => (
677 ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()),
678 None,
679 ),
680 UnionMode::Dense => (
681 ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()),
682 Some(ScalarBuffer::new(
683 data.buffers()[1].clone(),
684 data.offset(),
685 data.len(),
686 )),
687 ),
688 };
689
690 let max_id = fields.iter().map(|(i, _)| i).max().unwrap_or_default() as usize;
691 let mut boxed_fields = vec![None; max_id + 1];
692 for (cd, (field_id, _)) in data.child_data().iter().zip(fields.iter()) {
693 boxed_fields[field_id as usize] = Some(make_array(cd.clone()));
694 }
695 Self {
696 data_type: data.data_type().clone(),
697 type_ids,
698 offsets,
699 fields: boxed_fields,
700 }
701 }
702}
703
704impl From<UnionArray> for ArrayData {
705 fn from(array: UnionArray) -> Self {
706 let len = array.len();
707 let f = match &array.data_type {
708 DataType::Union(f, _) => f,
709 _ => unreachable!(),
710 };
711 let buffers = match array.offsets {
712 Some(o) => vec![array.type_ids.into_inner(), o.into_inner()],
713 None => vec![array.type_ids.into_inner()],
714 };
715
716 let child = f
717 .iter()
718 .map(|(i, _)| array.fields[i as usize].as_ref().unwrap().to_data())
719 .collect();
720
721 let builder = ArrayDataBuilder::new(array.data_type)
722 .len(len)
723 .buffers(buffers)
724 .child_data(child);
725 unsafe { builder.build_unchecked() }
726 }
727}
728
729impl Array for UnionArray {
730 fn as_any(&self) -> &dyn Any {
731 self
732 }
733
734 fn to_data(&self) -> ArrayData {
735 self.clone().into()
736 }
737
738 fn into_data(self) -> ArrayData {
739 self.into()
740 }
741
742 fn data_type(&self) -> &DataType {
743 &self.data_type
744 }
745
746 fn slice(&self, offset: usize, length: usize) -> ArrayRef {
747 Arc::new(self.slice(offset, length))
748 }
749
750 fn len(&self) -> usize {
751 self.type_ids.len()
752 }
753
754 fn is_empty(&self) -> bool {
755 self.type_ids.is_empty()
756 }
757
758 fn shrink_to_fit(&mut self) {
759 self.type_ids.shrink_to_fit();
760 if let Some(offsets) = &mut self.offsets {
761 offsets.shrink_to_fit();
762 }
763 for array in self.fields.iter_mut().flatten() {
764 array.shrink_to_fit();
765 }
766 self.fields.shrink_to_fit();
767 }
768
769 fn offset(&self) -> usize {
770 0
771 }
772
773 fn nulls(&self) -> Option<&NullBuffer> {
774 None
775 }
776
777 fn logical_nulls(&self) -> Option<NullBuffer> {
778 let fields = match self.data_type() {
779 DataType::Union(fields, _) => fields,
780 _ => unreachable!(),
781 };
782
783 if fields.len() <= 1 {
784 return self.fields.iter().find_map(|field_opt| {
785 field_opt
786 .as_ref()
787 .and_then(|field| field.logical_nulls())
788 .map(|logical_nulls| {
789 if self.is_dense() {
790 self.gather_nulls(vec![(0, logical_nulls)]).into()
791 } else {
792 logical_nulls
793 }
794 })
795 });
796 }
797
798 let logical_nulls = self.fields_logical_nulls();
799
800 if logical_nulls.is_empty() {
801 return None;
802 }
803
804 let fully_null_count = logical_nulls
805 .iter()
806 .filter(|(_, nulls)| nulls.null_count() == nulls.len())
807 .count();
808
809 if fully_null_count == fields.len() {
810 if let Some((_, exactly_sized)) = logical_nulls
811 .iter()
812 .find(|(_, nulls)| nulls.len() == self.len())
813 {
814 return Some(exactly_sized.clone());
815 }
816
817 if let Some((_, bigger)) = logical_nulls
818 .iter()
819 .find(|(_, nulls)| nulls.len() > self.len())
820 {
821 return Some(bigger.slice(0, self.len()));
822 }
823
824 return Some(NullBuffer::new_null(self.len()));
825 }
826
827 let boolean_buffer = match &self.offsets {
828 Some(_) => self.gather_nulls(logical_nulls),
829 None => {
830 let gather_relative_cost = if cfg!(target_feature = "avx2") {
838 10
839 } else if cfg!(target_feature = "sse4.1") {
840 3
841 } else if cfg!(target_arch = "x86") || cfg!(target_arch = "x86_64") {
842 2
844 } else {
845 0
849 };
850
851 let strategies = [
852 (SparseStrategy::Gather, gather_relative_cost, true),
853 (
854 SparseStrategy::MaskAllFieldsWithNullsSkipOne,
855 fields.len() - 1,
856 fields.len() == logical_nulls.len(),
857 ),
858 (
859 SparseStrategy::MaskSkipWithoutNulls,
860 logical_nulls.len(),
861 true,
862 ),
863 (
864 SparseStrategy::MaskSkipFullyNull,
865 fields.len() - fully_null_count,
866 true,
867 ),
868 ];
869
870 let (strategy, _, _) = strategies
871 .iter()
872 .filter(|(_, _, applicable)| *applicable)
873 .min_by_key(|(_, cost, _)| cost)
874 .unwrap();
875
876 match strategy {
877 SparseStrategy::Gather => self.gather_nulls(logical_nulls),
878 SparseStrategy::MaskAllFieldsWithNullsSkipOne => {
879 self.mask_sparse_all_with_nulls_skip_one(logical_nulls)
880 }
881 SparseStrategy::MaskSkipWithoutNulls => {
882 self.mask_sparse_skip_without_nulls(logical_nulls)
883 }
884 SparseStrategy::MaskSkipFullyNull => {
885 self.mask_sparse_skip_fully_null(logical_nulls)
886 }
887 }
888 }
889 };
890
891 let null_buffer = NullBuffer::from(boolean_buffer);
892
893 if null_buffer.null_count() > 0 {
894 Some(null_buffer)
895 } else {
896 None
897 }
898 }
899
900 fn is_nullable(&self) -> bool {
901 self.fields
902 .iter()
903 .flatten()
904 .any(|field| field.is_nullable())
905 }
906
907 fn get_buffer_memory_size(&self) -> usize {
908 let mut sum = self.type_ids.inner().capacity();
909 if let Some(o) = self.offsets.as_ref() {
910 sum += o.inner().capacity()
911 }
912 self.fields
913 .iter()
914 .flat_map(|x| x.as_ref().map(|x| x.get_buffer_memory_size()))
915 .sum::<usize>()
916 + sum
917 }
918
919 fn get_array_memory_size(&self) -> usize {
920 let mut sum = self.type_ids.inner().capacity();
921 if let Some(o) = self.offsets.as_ref() {
922 sum += o.inner().capacity()
923 }
924 std::mem::size_of::<Self>()
925 + self
926 .fields
927 .iter()
928 .flat_map(|x| x.as_ref().map(|x| x.get_array_memory_size()))
929 .sum::<usize>()
930 + sum
931 }
932}
933
934impl std::fmt::Debug for UnionArray {
935 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
936 let header = if self.is_dense() {
937 "UnionArray(Dense)\n["
938 } else {
939 "UnionArray(Sparse)\n["
940 };
941 writeln!(f, "{header}")?;
942
943 writeln!(f, "-- type id buffer:")?;
944 writeln!(f, "{:?}", self.type_ids)?;
945
946 if let Some(offsets) = &self.offsets {
947 writeln!(f, "-- offsets buffer:")?;
948 writeln!(f, "{offsets:?}")?;
949 }
950
951 let fields = match self.data_type() {
952 DataType::Union(fields, _) => fields,
953 _ => unreachable!(),
954 };
955
956 for (type_id, field) in fields.iter() {
957 let child = self.child(type_id);
958 writeln!(
959 f,
960 "-- child {}: \"{}\" ({:?})",
961 type_id,
962 field.name(),
963 field.data_type()
964 )?;
965 std::fmt::Debug::fmt(child, f)?;
966 writeln!(f)?;
967 }
968 writeln!(f, "]")
969 }
970}
971
/// How the logical nulls of a sparse union are computed; the cheapest
/// applicable strategy is selected in `logical_nulls`.
enum SparseStrategy {
    /// Look up the null bit of the selected child element by element
    /// (`gather_nulls`).
    Gather,
    /// Chunked masking that visits every field but derives the selection of
    /// the first one from the others
    /// (`mask_sparse_all_with_nulls_skip_one`). Only considered when every
    /// field has nulls.
    MaskAllFieldsWithNullsSkipOne,
    /// Chunked masking that only visits the fields that have nulls
    /// (`mask_sparse_skip_without_nulls`).
    MaskSkipWithoutNulls,
    /// Chunked masking that does not visit fields that are entirely null
    /// (`mask_sparse_skip_fully_null`).
    MaskSkipFullyNull,
}
986
/// Bit mask applied to an index with `&` (after an `as usize` cast) in
/// `gather_nulls`.
#[derive(Copy, Clone)]
#[repr(usize)]
enum Mask {
    /// All bits clear: masking with it always yields index 0.
    Zero = 0,
    /// All bits set: masking with it leaves the index unchanged.
    // `usize::MAX` as a discriminant is what the lint flags as unportable.
    #[allow(clippy::enum_clike_unportable_variant)]
    Max = usize::MAX,
}
995
/// Packs `type_ids_chunk` into a bit mask: bit `i` of the result is set when
/// `type_ids_chunk[i] == type_id`.
///
/// The slice is expected to hold at most 64 type ids, one per bit of the
/// returned word.
fn selection_mask(type_ids_chunk: &[i8], type_id: i8) -> u64 {
    let mut packed = 0u64;
    for (bit_idx, &candidate) in type_ids_chunk.iter().enumerate() {
        // Branch-free: a non-matching id contributes a shifted zero.
        packed |= u64::from(candidate == type_id) << bit_idx;
    }
    packed
}
1005
1006fn without_nulls_selected(type_ids_chunk: &[i8], without_nulls_ids: &[i8]) -> u64 {
1008 without_nulls_ids
1009 .iter()
1010 .fold(0, |fully_valid_selected, field_type_id| {
1011 fully_valid_selected | selection_mask(type_ids_chunk, *field_type_id)
1012 })
1013}
1014
1015#[cfg(test)]
1016mod tests {
1017 use super::*;
1018 use std::collections::HashSet;
1019
1020 use crate::array::Int8Type;
1021 use crate::builder::UnionBuilder;
1022 use crate::cast::AsArray;
1023 use crate::types::{Float32Type, Float64Type, Int32Type, Int64Type};
1024 use crate::{Float64Array, Int32Array, Int64Array, StringArray};
1025 use crate::{Int8Array, RecordBatch};
1026 use arrow_buffer::Buffer;
1027 use arrow_schema::{Field, Schema};
1028
1029 #[test]
1030 fn test_dense_i32() {
1031 let mut builder = UnionBuilder::new_dense();
1032 builder.append::<Int32Type>("a", 1).unwrap();
1033 builder.append::<Int32Type>("b", 2).unwrap();
1034 builder.append::<Int32Type>("c", 3).unwrap();
1035 builder.append::<Int32Type>("a", 4).unwrap();
1036 builder.append::<Int32Type>("c", 5).unwrap();
1037 builder.append::<Int32Type>("a", 6).unwrap();
1038 builder.append::<Int32Type>("b", 7).unwrap();
1039 let union = builder.build().unwrap();
1040
1041 let expected_type_ids = vec![0_i8, 1, 2, 0, 2, 0, 1];
1042 let expected_offsets = vec![0_i32, 0, 0, 1, 1, 2, 1];
1043 let expected_array_values = [1_i32, 2, 3, 4, 5, 6, 7];
1044
1045 assert_eq!(*union.type_ids(), expected_type_ids);
1047 for (i, id) in expected_type_ids.iter().enumerate() {
1048 assert_eq!(id, &union.type_id(i));
1049 }
1050
1051 assert_eq!(*union.offsets().unwrap(), expected_offsets);
1053 for (i, id) in expected_offsets.iter().enumerate() {
1054 assert_eq!(union.value_offset(i), *id as usize);
1055 }
1056
1057 assert_eq!(
1059 *union.child(0).as_primitive::<Int32Type>().values(),
1060 [1_i32, 4, 6]
1061 );
1062 assert_eq!(
1063 *union.child(1).as_primitive::<Int32Type>().values(),
1064 [2_i32, 7]
1065 );
1066 assert_eq!(
1067 *union.child(2).as_primitive::<Int32Type>().values(),
1068 [3_i32, 5]
1069 );
1070
1071 assert_eq!(expected_array_values.len(), union.len());
1072 for (i, expected_value) in expected_array_values.iter().enumerate() {
1073 assert!(!union.is_null(i));
1074 let slot = union.value(i);
1075 let slot = slot.as_any().downcast_ref::<Int32Array>().unwrap();
1076 assert_eq!(slot.len(), 1);
1077 let value = slot.value(0);
1078 assert_eq!(expected_value, &value);
1079 }
1080 }
1081
1082 #[test]
1083 fn slice_union_array_single_field() {
1084 let union_array = {
1087 let mut builder = UnionBuilder::new_dense();
1088 builder.append::<Int32Type>("a", 1).unwrap();
1089 builder.append_null::<Int32Type>("a").unwrap();
1090 builder.append::<Int32Type>("a", 3).unwrap();
1091 builder.append_null::<Int32Type>("a").unwrap();
1092 builder.append::<Int32Type>("a", 4).unwrap();
1093 builder.build().unwrap()
1094 };
1095
1096 let union_slice = union_array.slice(1, 3);
1098 let logical_nulls = union_slice.logical_nulls().unwrap();
1099
1100 assert_eq!(logical_nulls.len(), 3);
1101 assert!(logical_nulls.is_null(0));
1102 assert!(logical_nulls.is_valid(1));
1103 assert!(logical_nulls.is_null(2));
1104 }
1105
1106 #[test]
1107 #[cfg_attr(miri, ignore)]
1108 fn test_dense_i32_large() {
1109 let mut builder = UnionBuilder::new_dense();
1110
1111 let expected_type_ids = vec![0_i8; 1024];
1112 let expected_offsets: Vec<_> = (0..1024).collect();
1113 let expected_array_values: Vec<_> = (1..=1024).collect();
1114
1115 expected_array_values
1116 .iter()
1117 .for_each(|v| builder.append::<Int32Type>("a", *v).unwrap());
1118
1119 let union = builder.build().unwrap();
1120
1121 assert_eq!(*union.type_ids(), expected_type_ids);
1123 for (i, id) in expected_type_ids.iter().enumerate() {
1124 assert_eq!(id, &union.type_id(i));
1125 }
1126
1127 assert_eq!(*union.offsets().unwrap(), expected_offsets);
1129 for (i, id) in expected_offsets.iter().enumerate() {
1130 assert_eq!(union.value_offset(i), *id as usize);
1131 }
1132
1133 for (i, expected_value) in expected_array_values.iter().enumerate() {
1134 assert!(!union.is_null(i));
1135 let slot = union.value(i);
1136 let slot = slot.as_primitive::<Int32Type>();
1137 assert_eq!(slot.len(), 1);
1138 let value = slot.value(0);
1139 assert_eq!(expected_value, &value);
1140 }
1141 }
1142
1143 #[test]
1144 fn test_dense_mixed() {
1145 let mut builder = UnionBuilder::new_dense();
1146 builder.append::<Int32Type>("a", 1).unwrap();
1147 builder.append::<Int64Type>("c", 3).unwrap();
1148 builder.append::<Int32Type>("a", 4).unwrap();
1149 builder.append::<Int64Type>("c", 5).unwrap();
1150 builder.append::<Int32Type>("a", 6).unwrap();
1151 let union = builder.build().unwrap();
1152
1153 assert_eq!(5, union.len());
1154 for i in 0..union.len() {
1155 let slot = union.value(i);
1156 assert!(!union.is_null(i));
1157 match i {
1158 0 => {
1159 let slot = slot.as_any().downcast_ref::<Int32Array>().unwrap();
1160 assert_eq!(slot.len(), 1);
1161 let value = slot.value(0);
1162 assert_eq!(1_i32, value);
1163 }
1164 1 => {
1165 let slot = slot.as_any().downcast_ref::<Int64Array>().unwrap();
1166 assert_eq!(slot.len(), 1);
1167 let value = slot.value(0);
1168 assert_eq!(3_i64, value);
1169 }
1170 2 => {
1171 let slot = slot.as_any().downcast_ref::<Int32Array>().unwrap();
1172 assert_eq!(slot.len(), 1);
1173 let value = slot.value(0);
1174 assert_eq!(4_i32, value);
1175 }
1176 3 => {
1177 let slot = slot.as_any().downcast_ref::<Int64Array>().unwrap();
1178 assert_eq!(slot.len(), 1);
1179 let value = slot.value(0);
1180 assert_eq!(5_i64, value);
1181 }
1182 4 => {
1183 let slot = slot.as_any().downcast_ref::<Int32Array>().unwrap();
1184 assert_eq!(slot.len(), 1);
1185 let value = slot.value(0);
1186 assert_eq!(6_i32, value);
1187 }
1188 _ => unreachable!(),
1189 }
1190 }
1191 }
1192
1193 #[test]
1194 fn test_dense_mixed_with_nulls() {
1195 let mut builder = UnionBuilder::new_dense();
1196 builder.append::<Int32Type>("a", 1).unwrap();
1197 builder.append::<Int64Type>("c", 3).unwrap();
1198 builder.append::<Int32Type>("a", 10).unwrap();
1199 builder.append_null::<Int32Type>("a").unwrap();
1200 builder.append::<Int32Type>("a", 6).unwrap();
1201 let union = builder.build().unwrap();
1202
1203 assert_eq!(5, union.len());
1204 for i in 0..union.len() {
1205 let slot = union.value(i);
1206 match i {
1207 0 => {
1208 let slot = slot.as_any().downcast_ref::<Int32Array>().unwrap();
1209 assert!(!slot.is_null(0));
1210 assert_eq!(slot.len(), 1);
1211 let value = slot.value(0);
1212 assert_eq!(1_i32, value);
1213 }
1214 1 => {
1215 let slot = slot.as_any().downcast_ref::<Int64Array>().unwrap();
1216 assert!(!slot.is_null(0));
1217 assert_eq!(slot.len(), 1);
1218 let value = slot.value(0);
1219 assert_eq!(3_i64, value);
1220 }
1221 2 => {
1222 let slot = slot.as_any().downcast_ref::<Int32Array>().unwrap();
1223 assert!(!slot.is_null(0));
1224 assert_eq!(slot.len(), 1);
1225 let value = slot.value(0);
1226 assert_eq!(10_i32, value);
1227 }
1228 3 => assert!(slot.is_null(0)),
1229 4 => {
1230 let slot = slot.as_any().downcast_ref::<Int32Array>().unwrap();
1231 assert!(!slot.is_null(0));
1232 assert_eq!(slot.len(), 1);
1233 let value = slot.value(0);
1234 assert_eq!(6_i32, value);
1235 }
1236 _ => unreachable!(),
1237 }
1238 }
1239 }
1240
1241 #[test]
1242 fn test_dense_mixed_with_nulls_and_offset() {
1243 let mut builder = UnionBuilder::new_dense();
1244 builder.append::<Int32Type>("a", 1).unwrap();
1245 builder.append::<Int64Type>("c", 3).unwrap();
1246 builder.append::<Int32Type>("a", 10).unwrap();
1247 builder.append_null::<Int32Type>("a").unwrap();
1248 builder.append::<Int32Type>("a", 6).unwrap();
1249 let union = builder.build().unwrap();
1250
1251 let slice = union.slice(2, 3);
1252 let new_union = slice.as_any().downcast_ref::<UnionArray>().unwrap();
1253
1254 assert_eq!(3, new_union.len());
1255 for i in 0..new_union.len() {
1256 let slot = new_union.value(i);
1257 match i {
1258 0 => {
1259 let slot = slot.as_any().downcast_ref::<Int32Array>().unwrap();
1260 assert!(!slot.is_null(0));
1261 assert_eq!(slot.len(), 1);
1262 let value = slot.value(0);
1263 assert_eq!(10_i32, value);
1264 }
1265 1 => assert!(slot.is_null(0)),
1266 2 => {
1267 let slot = slot.as_any().downcast_ref::<Int32Array>().unwrap();
1268 assert!(!slot.is_null(0));
1269 assert_eq!(slot.len(), 1);
1270 let value = slot.value(0);
1271 assert_eq!(6_i32, value);
1272 }
1273 _ => unreachable!(),
1274 }
1275 }
1276 }
1277
1278 #[test]
1279 fn test_dense_mixed_with_str() {
1280 let string_array = StringArray::from(vec!["foo", "bar", "baz"]);
1281 let int_array = Int32Array::from(vec![5, 6]);
1282 let float_array = Float64Array::from(vec![10.0]);
1283
1284 let type_ids = [1, 0, 0, 2, 0, 1].into_iter().collect::<ScalarBuffer<i8>>();
1285 let offsets = [0, 0, 1, 0, 2, 1]
1286 .into_iter()
1287 .collect::<ScalarBuffer<i32>>();
1288
1289 let fields = [
1290 (0, Arc::new(Field::new("A", DataType::Utf8, false))),
1291 (1, Arc::new(Field::new("B", DataType::Int32, false))),
1292 (2, Arc::new(Field::new("C", DataType::Float64, false))),
1293 ]
1294 .into_iter()
1295 .collect::<UnionFields>();
1296 let children = [
1297 Arc::new(string_array) as Arc<dyn Array>,
1298 Arc::new(int_array),
1299 Arc::new(float_array),
1300 ]
1301 .into_iter()
1302 .collect();
1303 let array =
1304 UnionArray::try_new(fields, type_ids.clone(), Some(offsets.clone()), children).unwrap();
1305
1306 assert_eq!(*array.type_ids(), type_ids);
1308 for (i, id) in type_ids.iter().enumerate() {
1309 assert_eq!(id, &array.type_id(i));
1310 }
1311
1312 assert_eq!(*array.offsets().unwrap(), offsets);
1314 for (i, id) in offsets.iter().enumerate() {
1315 assert_eq!(*id as usize, array.value_offset(i));
1316 }
1317
1318 assert_eq!(6, array.len());
1320
1321 let slot = array.value(0);
1322 let value = slot.as_any().downcast_ref::<Int32Array>().unwrap().value(0);
1323 assert_eq!(5, value);
1324
1325 let slot = array.value(1);
1326 let value = slot
1327 .as_any()
1328 .downcast_ref::<StringArray>()
1329 .unwrap()
1330 .value(0);
1331 assert_eq!("foo", value);
1332
1333 let slot = array.value(2);
1334 let value = slot
1335 .as_any()
1336 .downcast_ref::<StringArray>()
1337 .unwrap()
1338 .value(0);
1339 assert_eq!("bar", value);
1340
1341 let slot = array.value(3);
1342 let value = slot
1343 .as_any()
1344 .downcast_ref::<Float64Array>()
1345 .unwrap()
1346 .value(0);
1347 assert_eq!(10.0, value);
1348
1349 let slot = array.value(4);
1350 let value = slot
1351 .as_any()
1352 .downcast_ref::<StringArray>()
1353 .unwrap()
1354 .value(0);
1355 assert_eq!("baz", value);
1356
1357 let slot = array.value(5);
1358 let value = slot.as_any().downcast_ref::<Int32Array>().unwrap().value(0);
1359 assert_eq!(6, value);
1360 }
1361
#[test]
fn test_sparse_i32() {
    // Interleave values across three Int32 fields of a sparse union.
    let mut sparse = UnionBuilder::new_sparse();
    let appended = [
        ("a", 1_i32),
        ("b", 2),
        ("c", 3),
        ("a", 4),
        ("c", 5),
        ("a", 6),
        ("b", 7),
    ];
    for (field_name, v) in appended {
        sparse.append::<Int32Type>(field_name, v).unwrap();
    }
    let array = sparse.build().unwrap();

    // The expected type ids follow the order in which the field names first appear.
    let want_type_ids = vec![0_i8, 1, 2, 0, 2, 0, 1];
    assert_eq!(*array.type_ids(), want_type_ids);
    for (row, want) in want_type_ids.iter().enumerate() {
        assert_eq!(want, &array.type_id(row));
    }

    // No offsets buffer is expected for the sparse layout.
    assert!(array.offsets().is_none());

    // Every child is as long as the union; rows belonging to another child hold 0.
    let want_children = [
        (0_i8, [1_i32, 0, 0, 4, 0, 6, 0]),
        (1, [0, 2, 0, 0, 0, 0, 7]),
        (2, [0, 0, 3, 0, 5, 0, 0]),
    ];
    for (type_id, want) in want_children {
        assert_eq!(
            *array.child(type_id).as_primitive::<Int32Type>().values(),
            want
        );
    }

    // Reading the union row by row yields the appended values in order.
    let want_values = [1_i32, 2, 3, 4, 5, 6, 7];
    assert_eq!(want_values.len(), array.len());
    for (row, want) in want_values.iter().enumerate() {
        assert!(!array.is_null(row));
        let slot = array.value(row);
        let ints = slot.as_primitive::<Int32Type>();
        assert_eq!(ints.len(), 1);
        assert_eq!(want, &ints.value(0));
    }
}
1410
#[test]
fn test_sparse_mixed() {
    // Alternate Int32 ("a") and Float64 ("c") values in a sparse union.
    let mut sparse = UnionBuilder::new_sparse();
    sparse.append::<Int32Type>("a", 1).unwrap();
    sparse.append::<Float64Type>("c", 3.0).unwrap();
    sparse.append::<Int32Type>("a", 4).unwrap();
    sparse.append::<Float64Type>("c", 5.0).unwrap();
    sparse.append::<Int32Type>("a", 6).unwrap();
    let array = sparse.build().unwrap();

    let want_type_ids = vec![0_i8, 1, 0, 1, 0];
    assert_eq!(*array.type_ids(), want_type_ids);
    for (row, want) in want_type_ids.iter().enumerate() {
        assert_eq!(want, &array.type_id(row));
    }

    // No offsets buffer is expected for the sparse layout.
    assert!(array.offsets().is_none());

    // No row of the union reports null.
    for row in 0..array.len() {
        assert!(!array.is_null(row));
    }

    // Rows 0, 2 and 4 hold the Int32 values.
    for (row, want) in [(0, 1_i32), (2, 4), (4, 6)] {
        let slot = array.value(row);
        let ints = slot.as_any().downcast_ref::<Int32Array>().unwrap();
        assert_eq!(ints.len(), 1);
        assert_eq!(want, ints.value(0));
    }

    // Rows 1 and 3 hold the Float64 values.
    for (row, want) in [(1, 3_f64), (3, 5_f64)] {
        let slot = array.value(row);
        let floats = slot.as_any().downcast_ref::<Float64Array>().unwrap();
        assert_eq!(floats.len(), 1);
        assert_eq!(want, floats.value(0));
    }
}
1470
#[test]
fn test_sparse_mixed_with_nulls() {
    // Row 1 is a null appended to the Int32 field "a".
    let mut sparse = UnionBuilder::new_sparse();
    sparse.append::<Int32Type>("a", 1).unwrap();
    sparse.append_null::<Int32Type>("a").unwrap();
    sparse.append::<Float64Type>("c", 3.0).unwrap();
    sparse.append::<Int32Type>("a", 4).unwrap();
    let array = sparse.build().unwrap();

    let want_type_ids = vec![0_i8, 0, 1, 0];
    assert_eq!(*array.type_ids(), want_type_ids);
    for (row, want) in want_type_ids.iter().enumerate() {
        assert_eq!(want, &array.type_id(row));
    }

    // No offsets buffer is expected for the sparse layout.
    assert!(array.offsets().is_none());

    for row in 0..array.len() {
        let slot = array.value(row);
        match row {
            // The appended null surfaces as a null single-element slot.
            1 => assert!(slot.is_null(0)),
            0 | 3 => {
                let ints = slot.as_any().downcast_ref::<Int32Array>().unwrap();
                assert!(!ints.is_null(0));
                assert_eq!(ints.len(), 1);
                let want = if row == 0 { 1_i32 } else { 4_i32 };
                assert_eq!(want, ints.value(0));
            }
            2 => {
                let floats = slot.as_any().downcast_ref::<Float64Array>().unwrap();
                assert!(!floats.is_null(0));
                assert_eq!(floats.len(), 1);
                assert_eq!(floats.value(0), 3_f64);
            }
            _ => unreachable!(),
        }
    }
}
1520
#[test]
fn test_sparse_mixed_with_nulls_and_offset() {
    let mut sparse = UnionBuilder::new_sparse();
    sparse.append::<Int32Type>("a", 1).unwrap();
    sparse.append_null::<Int32Type>("a").unwrap();
    sparse.append::<Float64Type>("c", 3.0).unwrap();
    sparse.append_null::<Float64Type>("c").unwrap();
    sparse.append::<Int32Type>("a", 4).unwrap();
    let array = sparse.build().unwrap();

    // Drop the first row so the remaining four are read through a non-zero offset.
    let sliced = array.slice(1, 4);
    let sliced_union = sliced.as_any().downcast_ref::<UnionArray>().unwrap();
    assert_eq!(4, sliced_union.len());

    for row in 0..sliced_union.len() {
        let slot = sliced_union.value(row);
        match row {
            // Rows 0 and 2 of the slice are the nulls appended to "a" and "c".
            0 | 2 => assert!(slot.is_null(0)),
            1 => {
                let floats = slot.as_primitive::<Float64Type>();
                assert!(!floats.is_null(0));
                assert_eq!(floats.len(), 1);
                assert_eq!(floats.value(0), 3_f64);
            }
            3 => {
                let ints = slot.as_primitive::<Int32Type>();
                assert!(!ints.is_null(0));
                assert_eq!(ints.len(), 1);
                assert_eq!(4_i32, ints.value(0));
            }
            _ => unreachable!(),
        }
    }
}
1558
1559 fn test_union_validity(union_array: &UnionArray) {
1560 assert_eq!(union_array.null_count(), 0);
1561
1562 for i in 0..union_array.len() {
1563 assert!(!union_array.is_null(i));
1564 assert!(union_array.is_valid(i));
1565 }
1566 }
1567
#[test]
fn test_union_array_validity() {
    // Fills `builder` with the same mix of values and nulls and builds the array.
    fn build_with_nulls(mut builder: UnionBuilder) -> UnionArray {
        builder.append::<Int32Type>("a", 1).unwrap();
        builder.append_null::<Int32Type>("a").unwrap();
        builder.append::<Float64Type>("c", 3.0).unwrap();
        builder.append_null::<Float64Type>("c").unwrap();
        builder.append::<Int32Type>("a", 4).unwrap();
        builder.build().unwrap()
    }

    // The validity expectations are checked for the sparse layout first, then the dense one.
    let sparse = build_with_nulls(UnionBuilder::new_sparse());
    test_union_validity(&sparse);

    let dense = build_with_nulls(UnionBuilder::new_dense());
    test_union_validity(&dense);
}
1590
#[test]
fn test_type_check() {
    // Field "a" is first written as Float32; writing it again as Int32 must fail.
    let mut sparse = UnionBuilder::new_sparse();
    sparse.append::<Float32Type>("a", 1.0).unwrap();

    let message = sparse
        .append::<Int32Type>("a", 1)
        .unwrap_err()
        .to_string();
    let expected_fragment =
        "Attempt to write col \"a\" with type Int32 doesn't match existing type Float32";
    assert!(message.contains(expected_fragment), "{}", message);
}
1604
#[test]
fn slice_union_array() {
    // Fills `builder` with: 1, null (a), 3.0, null (c), 4.
    fn populate(mut builder: UnionBuilder) -> UnionArray {
        builder.append::<Int32Type>("a", 1).unwrap();
        builder.append_null::<Int32Type>("a").unwrap();
        builder.append::<Float64Type>("c", 3.0).unwrap();
        builder.append_null::<Float64Type>("c").unwrap();
        builder.append::<Int32Type>("a", 4).unwrap();
        builder.build().unwrap()
    }

    // Wraps the union as the only column of a record batch.
    fn into_batch(union: UnionArray) -> RecordBatch {
        let column = Field::new("struct_array", union.data_type().clone(), true);
        let schema = Arc::new(Schema::new(vec![column]));
        RecordBatch::try_new(schema, vec![Arc::new(union)]).unwrap()
    }

    // Checks rows 1..4 of the populated union: null Int32, 3.0, null Float64.
    fn check_slice(batch_slice: RecordBatch) {
        let sliced = batch_slice
            .column(0)
            .as_any()
            .downcast_ref::<UnionArray>()
            .unwrap();

        for (row, want) in [0_i8, 1, 1].into_iter().enumerate() {
            assert_eq!(sliced.type_id(row), want);
        }

        let first = sliced.value(0);
        let ints = first.as_primitive::<Int32Type>();
        assert_eq!(ints.len(), 1);
        assert!(ints.is_null(0));

        let second = sliced.value(1);
        let floats = second.as_primitive::<Float64Type>();
        assert_eq!(floats.len(), 1);
        assert!(floats.is_valid(0));
        assert_eq!(floats.value(0), 3.0);

        let third = sliced.value(2);
        let floats = third.as_primitive::<Float64Type>();
        assert_eq!(floats.len(), 1);
        assert!(floats.is_null(0));
    }

    // Sparse first, then dense; both layouts must slice identically.
    for builder in [UnionBuilder::new_sparse(), UnionBuilder::new_dense()] {
        let batch = into_batch(populate(builder));
        check_slice(batch.slice(1, 3));
    }
}
1669
#[test]
fn test_custom_type_ids() {
    // Dense union whose fields carry the type ids 8, 4 and 9 (in child order).
    let union_fields = UnionFields::new(
        [8, 4, 9],
        [
            Field::new("strings", DataType::Utf8, false),
            Field::new("integers", DataType::Int32, false),
            Field::new("floats", DataType::Float64, false),
        ],
    );
    let data_type = DataType::Union(union_fields, UnionMode::Dense);

    let children = vec![
        StringArray::from(vec!["foo", "bar", "baz"]).into_data(),
        Int32Array::from(vec![5, 6, 4]).into_data(),
        Float64Array::from(vec![10.0]).into_data(),
    ];

    // Each row selects a child by type id and a position within it by offset.
    let type_ids = Buffer::from_vec(vec![4_i8, 8, 4, 8, 9, 4, 8]);
    let value_offsets = Buffer::from_vec(vec![0_i32, 0, 1, 1, 0, 2, 2]);

    let data = ArrayData::builder(data_type)
        .len(7)
        .buffers(vec![type_ids, value_offsets])
        .child_data(children)
        .build()
        .unwrap();
    let array = UnionArray::from(data);

    // Rows tagged 4 resolve to the Int32 child.
    for (row, want) in [(0, 5), (2, 6), (5, 4)] {
        let v = array.value(row);
        assert_eq!(v.data_type(), &DataType::Int32);
        assert_eq!(v.len(), 1);
        assert_eq!(v.as_primitive::<Int32Type>().value(0), want);
    }

    // Rows tagged 8 resolve to the Utf8 child.
    for (row, want) in [(1, "foo"), (3, "bar"), (6, "baz")] {
        let v = array.value(row);
        assert_eq!(v.data_type(), &DataType::Utf8);
        assert_eq!(v.len(), 1);
        assert_eq!(v.as_string::<i32>().value(0), want);
    }

    // The single row tagged 9 resolves to the Float64 child.
    let v = array.value(4);
    assert_eq!(v.data_type(), &DataType::Float64);
    assert_eq!(v.len(), 1);
    assert_eq!(v.as_primitive::<Float64Type>().value(0), 10.0);
}
1739
#[test]
fn into_parts() {
    // Dense union: decompose, inspect every part, then rebuild from the parts.
    let mut dense = UnionBuilder::new_dense();
    dense.append::<Int32Type>("a", 1).unwrap();
    dense.append::<Int8Type>("b", 2).unwrap();
    dense.append::<Int32Type>("a", 3).unwrap();
    let (union_fields, type_ids, offsets, children) = dense.build().unwrap().into_parts();

    let field_a = Arc::new(Field::new("a", DataType::Int32, false));
    let field_b = Arc::new(Field::new("b", DataType::Int8, false));
    let actual_fields = union_fields
        .iter()
        .map(|(_, field)| field)
        .collect::<Vec<_>>();
    assert_eq!(actual_fields, [&field_a, &field_b]);
    assert_eq!(type_ids, [0, 1, 0]);
    assert!(offsets.is_some());
    assert_eq!(offsets.as_ref().unwrap(), &[0, 0, 1]);

    let rebuilt = UnionArray::try_new(union_fields, type_ids, offsets, children);
    assert!(rebuilt.is_ok());
    assert_eq!(rebuilt.unwrap().len(), 3);

    // Sparse union: same values, but no offsets come back.
    let mut sparse = UnionBuilder::new_sparse();
    sparse.append::<Int32Type>("a", 1).unwrap();
    sparse.append::<Int8Type>("b", 2).unwrap();
    sparse.append::<Int32Type>("a", 3).unwrap();
    let (union_fields, type_ids, offsets, children) = sparse.build().unwrap().into_parts();

    assert_eq!(type_ids, [0, 1, 0]);
    assert!(offsets.is_none());

    let rebuilt = UnionArray::try_new(union_fields, type_ids, offsets, children);
    assert!(rebuilt.is_ok());
    assert_eq!(rebuilt.unwrap().len(), 3);
}
1782
#[test]
fn into_parts_custom_type_ids() {
    // Dense union whose fields carry the type ids 8, 4 and 9.
    let set_field_type_ids: [i8; 3] = [8, 4, 9];
    let union_fields = UnionFields::new(
        set_field_type_ids,
        [
            Field::new("strings", DataType::Utf8, false),
            Field::new("integers", DataType::Int32, false),
            Field::new("floats", DataType::Float64, false),
        ],
    );

    let child_data = vec![
        StringArray::from(vec!["foo", "bar", "baz"]).into_data(),
        Int32Array::from(vec![5, 6, 4]).into_data(),
        Float64Array::from(vec![10.0]).into_data(),
    ];
    let buffers = vec![
        Buffer::from_vec(vec![4_i8, 8, 4, 8, 9, 4, 8]),
        Buffer::from_vec(vec![0_i32, 0, 1, 1, 0, 2, 2]),
    ];

    let data = ArrayData::builder(DataType::Union(union_fields, UnionMode::Dense))
        .len(7)
        .buffers(buffers)
        .child_data(child_data)
        .build()
        .unwrap();

    let (union_fields, type_ids, offsets, children) = UnionArray::from(data).into_parts();

    // The distinct type ids in the buffer are exactly the ids declared on the fields.
    let seen_ids = type_ids.iter().collect::<HashSet<_>>();
    let declared_ids = set_field_type_ids.iter().collect::<HashSet<_>>();
    assert_eq!(seen_ids, declared_ids);

    // The parts must be accepted again by the validating constructor.
    let rebuilt = UnionArray::try_new(union_fields, type_ids, offsets, children);
    assert!(rebuilt.is_ok());
    let rebuilt = rebuilt.unwrap();
    assert_eq!(rebuilt.len(), 7);
}
1824
#[test]
fn test_invalid() {
    // Exercises the validation errors of `UnionArray::try_new` against two
    // Utf8 fields with type ids 3 and 2.
    let fields = UnionFields::new(
        [3, 2],
        [
            Field::new("a", DataType::Utf8, false),
            Field::new("b", DataType::Utf8, false),
        ],
    );
    let children = vec![
        Arc::new(StringArray::from_iter_values(["a", "b"])) as _,
        Arc::new(StringArray::from_iter_values(["c", "d"])) as _,
    ];

    // Sparse: three type ids, but the children only hold two rows each.
    let type_ids = vec![3, 3, 2].into();
    let err =
        UnionArray::try_new(fields.clone(), type_ids, None, children.clone()).unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Sparse union child arrays must be equal in length to the length of the union"
    );

    // Sparse: type id 1 is not one of the declared field ids.
    let type_ids = vec![1, 2].into();
    let err =
        UnionArray::try_new(fields.clone(), type_ids, None, children.clone()).unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Type Ids values must match one of the field type ids"
    );

    // Sparse: type id 7 is not one of the declared field ids.
    let type_ids = vec![7, 2].into();
    let err = UnionArray::try_new(fields.clone(), type_ids, None, children).unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Type Ids values must match one of the field type ids"
    );

    // Dense: children of length 2 and 1 with in-range offsets are accepted.
    let children = vec![
        Arc::new(StringArray::from_iter_values(["a", "b"])) as _,
        Arc::new(StringArray::from_iter_values(["c"])) as _,
    ];
    let type_ids = ScalarBuffer::from(vec![3_i8, 3, 2]);
    let offsets = Some(vec![0, 1, 0].into());
    UnionArray::try_new(fields.clone(), type_ids.clone(), offsets, children.clone()).unwrap();

    // Dense: the last offset (1) points past the single-element child "b".
    let offsets = Some(vec![0, 1, 1].into());
    let err = UnionArray::try_new(fields.clone(), type_ids.clone(), offsets, children.clone())
        .unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Offsets must be positive and within the length of the Array"
    );

    // Dense: two offsets for three type ids.
    let offsets = Some(vec![0, 1].into());
    let err =
        UnionArray::try_new(fields.clone(), type_ids.clone(), offsets, children).unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Type Ids and Offsets lengths must match"
    );

    // Two fields but no children. This is the last use of `fields`, so it is
    // moved rather than cloned.
    let err = UnionArray::try_new(fields, type_ids, None, vec![]).unwrap_err();
    assert_eq!(
        err.to_string(),
        "Invalid argument error: Union fields length must match child arrays length"
    );
}
1895
#[test]
fn test_logical_nulls_fast_paths() {
    // A union with no fields and no rows reports no logical nulls.
    let array = UnionArray::try_new(UnionFields::empty(), vec![].into(), None, vec![]).unwrap();
    assert_eq!(array.logical_nulls(), None);

    // Non-nullable fields whose children contain no nulls.
    let fields = UnionFields::new(
        [1, 3],
        [
            Field::new("a", DataType::Int8, false),
            Field::new("b", DataType::Int8, false),
        ],
    );
    let array = UnionArray::try_new(
        fields,
        vec![1].into(),
        None,
        vec![
            Arc::new(Int8Array::from_value(5, 1)),
            Arc::new(Int8Array::from_value(5, 1)),
        ],
    )
    .unwrap();
    assert_eq!(array.logical_nulls(), None);

    // Nullable fields, but the children still contain no nulls.
    let nullable_fields = UnionFields::new(
        [1, 3],
        [
            Field::new("a", DataType::Int8, true),
            Field::new("b", DataType::Int8, true),
        ],
    );
    let array = UnionArray::try_new(
        nullable_fields.clone(),
        vec![1, 1].into(),
        None,
        vec![
            Arc::new(Int8Array::from_value(-5, 2)),
            Arc::new(Int8Array::from_value(-5, 2)),
        ],
    )
    .unwrap();
    assert_eq!(array.logical_nulls(), None);

    // Sparse union whose children are entirely null: every row is null.
    let array = UnionArray::try_new(
        nullable_fields.clone(),
        vec![1, 1].into(),
        None,
        vec![
            Arc::new(Int8Array::new_null(2)),
            Arc::new(Int8Array::new_null(2)),
        ],
    )
    .unwrap();
    assert_eq!(array.logical_nulls(), Some(NullBuffer::new_null(2)));

    // Dense union whose children are entirely null: every row is null.
    // This is the last use of `nullable_fields`, so it is moved rather than cloned.
    let array = UnionArray::try_new(
        nullable_fields,
        vec![1, 1].into(),
        Some(vec![0, 1].into()),
        vec![
            Arc::new(Int8Array::new_null(3)),
            Arc::new(Int8Array::new_null(3)),
        ],
    )
    .unwrap();
    assert_eq!(array.logical_nulls(), Some(NullBuffer::new_null(2)));
}
1971
#[test]
fn test_dense_union_logical_nulls_gather() {
    // Children: Int32 without nulls, Float64 with one null, Utf8 entirely null.
    let children = vec![
        Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef,
        Arc::new(Float64Array::from(vec![Some(3.2), None])),
        Arc::new(StringArray::new_null(1)),
    ];

    // Two rows per child; both Utf8 rows point at the same (null) element.
    let type_ids = ScalarBuffer::<i8>::from(vec![1, 1, 3, 3, 4, 4]);
    let offsets = ScalarBuffer::<i32>::from(vec![0, 1, 0, 1, 0, 0]);

    let array = UnionArray::try_new(union_fields(), type_ids, Some(offsets), children).unwrap();

    let expected = BooleanBuffer::from(vec![true, true, true, false, false, false]);

    let logical = array.logical_nulls().unwrap().into_inner();
    assert_eq!(expected, logical);

    let gathered = array.gather_nulls(array.fields_logical_nulls());
    assert_eq!(expected, gathered);
}
1996
#[test]
fn test_sparse_union_logical_nulls_mask_all_nulls_skip_one() {
    // Compares both null computations against the expected validity bits.
    fn assert_nulls(array: &UnionArray, expected: BooleanBuffer) {
        assert_eq!(expected, array.logical_nulls().unwrap().into_inner());
        assert_eq!(
            expected,
            array.mask_sparse_all_with_nulls_skip_one(array.fields_logical_nulls())
        );
    }

    let fields = [
        (1, Arc::new(Field::new("A", DataType::Int32, true))),
        (3, Arc::new(Field::new("B", DataType::Float64, true))),
    ]
    .into_iter()
    .collect::<UnionFields>();

    // Small case: a fully null Int32 child and a Float64 child with one valid row.
    let children = vec![
        Arc::new(Int32Array::new_null(4)) as ArrayRef,
        Arc::new(Float64Array::from(vec![None, None, Some(3.2), None])),
    ];
    let type_ids = ScalarBuffer::<i8>::from(vec![1, 1, 3, 3]);
    let small = UnionArray::try_new(fields.clone(), type_ids, None, children).unwrap();
    assert_nulls(
        &small,
        BooleanBuffer::from(vec![false, false, true, false]),
    );

    // Larger case: 160 rows, i.e. two blocks of 64 plus a remainder of 32.
    let len = 2 * 64 + 32;
    let children = vec![
        Arc::new(Int32Array::new_null(len)) as ArrayRef,
        Arc::new(Float64Array::from_iter(
            [Some(3.2), None].into_iter().cycle().take(len),
        )),
    ];
    let type_ids = ScalarBuffer::from_iter([1, 1, 3, 3].into_iter().cycle().take(len));
    let large = UnionArray::try_new(fields, type_ids, None, children).unwrap();

    assert_eq!(large.len(), len);
    assert_nulls(
        &large,
        BooleanBuffer::from_iter([false, false, true, false].into_iter().cycle().take(len)),
    );
}
2048
#[test]
fn test_sparse_union_logical_mask_mixed_nulls_skip_fully_valid() {
    // Compares both null computations against the expected validity bits.
    fn assert_nulls(array: &UnionArray, expected: BooleanBuffer) {
        assert_eq!(expected, array.logical_nulls().unwrap().into_inner());
        assert_eq!(
            expected,
            array.mask_sparse_skip_without_nulls(array.fields_logical_nulls())
        );
    }

    // Small case: two children without nulls and a fully null Utf8 child.
    let children = vec![
        Arc::new(Int32Array::from_value(2, 6)) as ArrayRef,
        Arc::new(Float64Array::from_value(4.2, 6)),
        Arc::new(StringArray::new_null(6)),
    ];
    let type_ids = ScalarBuffer::<i8>::from(vec![1, 1, 3, 3, 4, 4]);
    let small = UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();
    assert_nulls(
        &small,
        BooleanBuffer::from(vec![true, true, true, true, false, false]),
    );

    // Larger case: 160 rows (two blocks of 64 plus a remainder of 32), with a
    // Utf8 child that alternates null / valid.
    let len = 2 * 64 + 32;
    let children = vec![
        Arc::new(Int32Array::from_value(2, len)) as ArrayRef,
        Arc::new(Float64Array::from_value(4.2, len)),
        Arc::new(StringArray::from_iter(
            [None, Some("a")].into_iter().cycle().take(len),
        )),
    ];
    let type_ids = ScalarBuffer::from_iter([1, 1, 3, 3, 4, 4].into_iter().cycle().take(len));
    let large = UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();

    assert_eq!(large.len(), len);
    assert_nulls(
        &large,
        BooleanBuffer::from_iter(
            [true, true, true, true, false, true]
                .into_iter()
                .cycle()
                .take(len),
        ),
    );
}
2103
#[test]
fn test_sparse_union_logical_mask_mixed_nulls_skip_fully_null() {
    // Compares both null computations against the expected validity bits.
    fn assert_nulls(array: &UnionArray, expected: BooleanBuffer) {
        assert_eq!(expected, array.logical_nulls().unwrap().into_inner());
        assert_eq!(
            expected,
            array.mask_sparse_skip_fully_null(array.fields_logical_nulls())
        );
    }

    // Small case: fully null Int32 and Utf8 children around a Float64 child without nulls.
    let children = vec![
        Arc::new(Int32Array::new_null(6)) as ArrayRef,
        Arc::new(Float64Array::from_value(4.2, 6)),
        Arc::new(StringArray::new_null(6)),
    ];
    let type_ids = ScalarBuffer::<i8>::from(vec![1, 1, 3, 3, 4, 4]);
    let small = UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();
    assert_nulls(
        &small,
        BooleanBuffer::from(vec![false, false, true, true, false, false]),
    );

    // Larger case: the same shape over 160 rows (two blocks of 64 plus a remainder of 32).
    let len = 2 * 64 + 32;
    let children = vec![
        Arc::new(Int32Array::new_null(len)) as ArrayRef,
        Arc::new(Float64Array::from_value(4.2, len)),
        Arc::new(StringArray::new_null(len)),
    ];
    let type_ids = ScalarBuffer::from_iter([1, 1, 3, 3, 4, 4].into_iter().cycle().take(len));
    let large = UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();

    assert_eq!(large.len(), len);
    assert_nulls(
        &large,
        BooleanBuffer::from_iter(
            [false, false, true, true, false, false]
                .into_iter()
                .cycle()
                .take(len),
        ),
    );
}
2158
#[test]
fn test_sparse_union_logical_nulls_gather() {
    let n_fields = 50;

    // Fields take the odd type ids 1, 3, 5, ... and are named after their id.
    let fields = (1..)
        .step_by(2)
        .take(n_fields)
        .map(|id| {
            let field = Field::new(format!("f{id}"), DataType::Int32, true);
            (id, Arc::new(field))
        })
        .collect::<UnionFields>();

    // Children repeat the pattern: no nulls, some nulls, all nulls.
    let children = [
        Arc::new(Int32Array::from_value(2, 4)) as ArrayRef,
        Arc::new(Int32Array::from(vec![None, None, Some(1), None])),
        Arc::new(Int32Array::new_null(4)),
    ]
    .into_iter()
    .cycle()
    .take(n_fields)
    .collect::<Vec<_>>();

    let array = UnionArray::try_new(fields, vec![1, 3, 3, 5].into(), None, children).unwrap();

    let expected = BooleanBuffer::from(vec![true, false, true, false]);

    let logical = array.logical_nulls().unwrap().into_inner();
    assert_eq!(expected, logical);

    let gathered = array.gather_nulls(array.fields_logical_nulls());
    assert_eq!(expected, gathered);
}
2197
2198 fn union_fields() -> UnionFields {
2199 [
2200 (1, Arc::new(Field::new("A", DataType::Int32, true))),
2201 (3, Arc::new(Field::new("B", DataType::Float64, true))),
2202 (4, Arc::new(Field::new("C", DataType::Utf8, true))),
2203 ]
2204 .into_iter()
2205 .collect()
2206 }
2207
#[test]
fn test_is_nullable() {
    // The union is expected to be nullable exactly when at least one child holds nulls.
    let cases = [
        (false, false),
        (true, false),
        (false, true),
        (true, true),
    ];
    for (int_nullable, float_nullable) in cases {
        let expected = int_nullable || float_nullable;
        assert_eq!(
            create_union_array(int_nullable, float_nullable).is_nullable(),
            expected
        );
    }
}
2215
2216 fn create_union_array(int_nullable: bool, float_nullable: bool) -> UnionArray {
2223 let int_array = if int_nullable {
2224 Int32Array::from(vec![Some(1), None, Some(3)])
2225 } else {
2226 Int32Array::from(vec![1, 2, 3])
2227 };
2228 let float_array = if float_nullable {
2229 Float64Array::from(vec![Some(3.2), None, Some(4.2)])
2230 } else {
2231 Float64Array::from(vec![3.2, 4.2, 5.2])
2232 };
2233 let type_ids = [0, 1, 0].into_iter().collect::<ScalarBuffer<i8>>();
2234 let offsets = [0, 0, 0].into_iter().collect::<ScalarBuffer<i32>>();
2235 let union_fields = [
2236 (0, Arc::new(Field::new("A", DataType::Int32, true))),
2237 (1, Arc::new(Field::new("B", DataType::Float64, true))),
2238 ]
2239 .into_iter()
2240 .collect::<UnionFields>();
2241
2242 let children = vec![Arc::new(int_array) as Arc<dyn Array>, Arc::new(float_array)];
2243
2244 UnionArray::try_new(union_fields, type_ids, Some(offsets), children).unwrap()
2245 }
2246}