1use crate::array::*;
21use crate::datatypes::*;
22use crate::util::test_util::seedable_rng;
23use arrow_buffer::{Buffer, IntervalMonthDayNano};
24use half::f16;
25use rand::Rng;
26use rand::SeedableRng;
27use rand::distr::uniform::SampleUniform;
28use rand::rng;
29use rand::{
30 distr::{Alphanumeric, Distribution, StandardUniform},
31 prelude::StdRng,
32};
33use std::ops::Range;
34
35pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
37where
38 T: ArrowPrimitiveType,
39 StandardUniform: Distribution<T::Native>,
40{
41 let mut rng = seedable_rng();
42
43 (0..size)
44 .map(|_| {
45 if rng.random::<f32>() < null_density {
46 None
47 } else {
48 Some(rng.random())
49 }
50 })
51 .collect()
52}
53
54pub fn create_primitive_array_range<T>(
57 size: usize,
58 null_density: f32,
59 value_range: Range<T::Native>,
60) -> PrimitiveArray<T>
61where
62 T: ArrowPrimitiveType,
63 StandardUniform: Distribution<T::Native>,
64 T::Native: SampleUniform,
65{
66 let mut rng = seedable_rng();
67
68 (0..size)
69 .map(|_| {
70 if rng.random::<f32>() < null_density {
71 None
72 } else {
73 Some(rng.random_range(value_range.clone()))
74 }
75 })
76 .collect()
77}
78
79pub fn create_primitive_array_with_seed<T>(
82 size: usize,
83 null_density: f32,
84 seed: u64,
85) -> PrimitiveArray<T>
86where
87 T: ArrowPrimitiveType,
88 StandardUniform: Distribution<T::Native>,
89{
90 let mut rng = StdRng::seed_from_u64(seed);
91
92 (0..size)
93 .map(|_| {
94 if rng.random::<f32>() < null_density {
95 None
96 } else {
97 Some(rng.random())
98 }
99 })
100 .collect()
101}
102
103pub fn create_month_day_nano_array_with_seed(
106 size: usize,
107 null_density: f32,
108 seed: u64,
109) -> IntervalMonthDayNanoArray {
110 let mut rng = StdRng::seed_from_u64(seed);
111
112 (0..size)
113 .map(|_| {
114 if rng.random::<f32>() < null_density {
115 None
116 } else {
117 Some(IntervalMonthDayNano::new(
118 rng.random(),
119 rng.random(),
120 rng.random(),
121 ))
122 }
123 })
124 .collect()
125}
126
127pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray
129where
130 StandardUniform: Distribution<bool>,
131{
132 let mut rng = seedable_rng();
133 (0..size)
134 .map(|_| {
135 if rng.random::<f32>() < null_density {
136 None
137 } else {
138 let value = rng.random::<f32>() < true_density;
139 Some(value)
140 }
141 })
142 .collect()
143}
144
145pub fn create_boolean_array_with_seed(
147 size: usize,
148 null_density: f32,
149 true_density: f32,
150 seed: u64,
151) -> BooleanArray
152where
153 StandardUniform: Distribution<bool>,
154{
155 let mut rng = StdRng::seed_from_u64(seed);
156 (0..size)
157 .map(|_| {
158 if rng.random::<f32>() < null_density {
159 None
160 } else {
161 let value = rng.random::<f32>() < true_density;
162 Some(value)
163 }
164 })
165 .collect()
166}
167
168pub fn create_string_array<Offset: OffsetSizeTrait>(
174 size: usize,
175 null_density: f32,
176) -> GenericStringArray<Offset> {
177 create_string_array_with_max_len(size, null_density, 400)
178}
179
180pub fn create_longer_string_array_with_same_prefix<Offset: OffsetSizeTrait>(
184 size: usize,
185 null_density: f32,
186) -> GenericStringArray<Offset> {
187 create_string_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
188}
189
190pub fn create_longer_string_view_array_with_same_prefix(
194 size: usize,
195 null_density: f32,
196) -> StringViewArray {
197 create_string_view_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
198}
199
200fn create_string_array_with_len_range_and_prefix<Offset: OffsetSizeTrait>(
201 size: usize,
202 null_density: f32,
203 min_str_len: usize,
204 max_str_len: usize,
205 prefix: &str,
206) -> GenericStringArray<Offset> {
207 create_string_array_with_len_range_and_prefix_and_seed(
208 size,
209 null_density,
210 min_str_len,
211 max_str_len,
212 prefix,
213 42,
214 )
215}
216
217pub fn create_string_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
221 size: usize,
222 null_density: f32,
223 min_str_len: usize,
224 max_str_len: usize,
225 prefix: &str,
226 seed: u64,
227) -> GenericStringArray<Offset> {
228 assert!(
229 min_str_len <= max_str_len,
230 "min_str_len must be <= max_str_len"
231 );
232 assert!(
233 prefix.len() <= max_str_len,
234 "Prefix length must be <= max_str_len"
235 );
236
237 let rng = &mut StdRng::seed_from_u64(seed);
238 (0..size)
239 .map(|_| {
240 if rng.random::<f32>() < null_density {
241 None
242 } else {
243 let remaining_len = rng.random_range(
244 min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
245 );
246
247 let mut value = prefix.to_string();
248 value.extend(
249 rng.sample_iter(&Alphanumeric)
250 .take(remaining_len)
251 .map(char::from),
252 );
253
254 Some(value)
255 }
256 })
257 .collect()
258}
259pub fn create_string_view_array_with_len_range_and_seed(
267 size: usize,
268 null_density: f32,
269 range: Range<usize>,
270 seed: u64,
271) -> StringViewArray {
272 let rng = &mut StdRng::seed_from_u64(seed);
273 (0..size)
274 .map(|_| {
275 if rng.random::<f32>() < null_density {
276 None
277 } else {
278 let str_len = rng.random_range(range.clone());
279 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
280 let value = String::from_utf8(value).unwrap();
281 Some(value)
282 }
283 })
284 .collect()
285}
286
287fn create_string_view_array_with_len_range_and_prefix(
288 size: usize,
289 null_density: f32,
290 min_str_len: usize,
291 max_str_len: usize,
292 prefix: &str,
293) -> StringViewArray {
294 assert!(
295 min_str_len <= max_str_len,
296 "min_str_len must be <= max_str_len"
297 );
298 assert!(
299 prefix.len() <= max_str_len,
300 "Prefix length must be <= max_str_len"
301 );
302
303 let rng = &mut seedable_rng();
304 (0..size)
305 .map(|_| {
306 if rng.random::<f32>() < null_density {
307 None
308 } else {
309 let remaining_len = rng.random_range(
310 min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
311 );
312
313 let mut value = prefix.to_string();
314 value.extend(
315 rng.sample_iter(&Alphanumeric)
316 .take(remaining_len)
317 .map(char::from),
318 );
319
320 Some(value)
321 }
322 })
323 .collect()
324}
325
326pub fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
328 size: usize,
329 null_density: f32,
330 max_str_len: usize,
331) -> GenericStringArray<Offset> {
332 let rng = &mut seedable_rng();
333 (0..size)
334 .map(|_| {
335 if rng.random::<f32>() < null_density {
336 None
337 } else {
338 let str_len = rng.random_range(0..max_str_len);
339 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
340 let value = String::from_utf8(value).unwrap();
341 Some(value)
342 }
343 })
344 .collect()
345}
346
347pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
349 size: usize,
350 null_density: f32,
351 str_len: usize,
352) -> GenericStringArray<Offset> {
353 let rng = &mut seedable_rng();
354
355 (0..size)
356 .map(|_| {
357 if rng.random::<f32>() < null_density {
358 None
359 } else {
360 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
361 let value = String::from_utf8(value).unwrap();
362 Some(value)
363 }
364 })
365 .collect()
366}
367
368pub fn create_string_view_array(size: usize, null_density: f32) -> StringViewArray {
372 create_string_view_array_with_max_len(size, null_density, 400)
373}
374
375pub fn create_string_view_array_with_max_len(
377 size: usize,
378 null_density: f32,
379 max_str_len: usize,
380) -> StringViewArray {
381 let rng = &mut seedable_rng();
382 (0..size)
383 .map(|_| {
384 if rng.random::<f32>() < null_density {
385 None
386 } else {
387 let str_len = rng.random_range(0..max_str_len);
388 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
389 let value = String::from_utf8(value).unwrap();
390 Some(value)
391 }
392 })
393 .collect()
394}
395
396pub fn create_string_view_array_with_fixed_len(
398 size: usize,
399 null_density: f32,
400 str_len: usize,
401) -> StringViewArray {
402 let rng = &mut seedable_rng();
403 (0..size)
404 .map(|_| {
405 if rng.random::<f32>() < null_density {
406 None
407 } else {
408 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
409 let value = String::from_utf8(value).unwrap();
410 Some(value)
411 }
412 })
413 .collect()
414}
415
416pub fn create_string_view_array_with_len(
418 size: usize,
419 null_density: f32,
420 str_len: usize,
421 mixed: bool,
422) -> StringViewArray {
423 let rng = &mut seedable_rng();
424
425 let mut lengths = Vec::with_capacity(size);
426
427 if mixed {
429 for _ in 0..size / 2 {
430 lengths.push(rng.random_range(1..12));
431 }
432 for _ in size / 2..size {
433 lengths.push(rng.random_range(12..=std::cmp::max(30, str_len)));
434 }
435 } else {
436 lengths.resize(size, str_len);
437 }
438
439 lengths
440 .into_iter()
441 .map(|len| {
442 if rng.random::<f32>() < null_density {
443 None
444 } else {
445 let value: Vec<u8> = rng.sample_iter(&Alphanumeric).take(len).collect();
446 Some(String::from_utf8(value).unwrap())
447 }
448 })
449 .collect()
450}
451
452pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
455 size: usize,
456 null_density: f32,
457 str_len: usize,
458) -> DictionaryArray<K> {
459 let rng = &mut seedable_rng();
460
461 let data: Vec<_> = (0..size)
462 .map(|_| {
463 if rng.random::<f32>() < null_density {
464 None
465 } else {
466 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
467 let value = String::from_utf8(value).unwrap();
468 Some(value)
469 }
470 })
471 .collect();
472
473 data.iter().map(|x| x.as_deref()).collect()
474}
475
476pub fn create_primitive_list_array_with_seed<O, T>(
485 size: usize,
486 null_density: f32,
487 list_null_density: f32,
488 max_list_size: usize,
489 seed: u64,
490) -> GenericListArray<O>
491where
492 O: OffsetSizeTrait,
493 T: ArrowPrimitiveType,
494 StandardUniform: Distribution<T::Native>,
495{
496 let mut rng = StdRng::seed_from_u64(seed);
497
498 let values = (0..size).map(|_| {
499 if rng.random::<f32>() < null_density {
500 None
501 } else {
502 let list_size = rng.random_range(0..=max_list_size);
503 let list_values: Vec<Option<T::Native>> = (0..list_size)
504 .map(|_| {
505 if rng.random::<f32>() < list_null_density {
506 None
507 } else {
508 Some(rng.random())
509 }
510 })
511 .collect();
512 Some(list_values)
513 }
514 });
515
516 GenericListArray::<O>::from_iter_primitive::<T, _, _>(values)
517}
518
519pub fn create_primitive_list_array<O, T>(
523 size: usize,
524 null_density: f32,
525 list_null_density: f32,
526 max_list_size: usize,
527) -> GenericListArray<O>
528where
529 O: OffsetSizeTrait,
530 T: ArrowPrimitiveType,
531 StandardUniform: Distribution<T::Native>,
532{
533 let mut rng = seedable_rng();
534
535 let values = (0..size).map(|_| {
536 if rng.random::<f32>() < null_density {
537 None
538 } else {
539 let list_size = rng.random_range(0..=max_list_size);
540 let list_values: Vec<Option<T::Native>> = (0..list_size)
541 .map(|_| {
542 if rng.random::<f32>() < list_null_density {
543 None
544 } else {
545 Some(rng.random())
546 }
547 })
548 .collect();
549 Some(list_values)
550 }
551 });
552
553 GenericListArray::<O>::from_iter_primitive::<T, _, _>(values)
554}
555
556pub fn create_primitive_list_view_array<O, T>(
560 size: usize,
561 null_density: f32,
562 list_null_density: f32,
563 max_list_size: usize,
564) -> GenericListViewArray<O>
565where
566 T: ArrowPrimitiveType,
567 StandardUniform: Distribution<T::Native>,
568 O: OffsetSizeTrait,
569{
570 let mut rng = seedable_rng();
571
572 let values = (0..size).map(|_| {
573 if rng.random::<f32>() < null_density {
574 None
575 } else {
576 let list_size = rng.random_range(0..=max_list_size);
577 let list_values: Vec<Option<T::Native>> = (0..list_size)
578 .map(|_| {
579 if rng.random::<f32>() < list_null_density {
580 None
581 } else {
582 Some(rng.random())
583 }
584 })
585 .collect();
586 Some(list_values)
587 }
588 });
589
590 GenericListViewArray::<O>::from_iter_primitive::<T, _, _>(values)
591}
592
593pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
595 logical_array_len: usize,
596 physical_array_len: usize,
597) -> RunArray<R> {
598 assert!(logical_array_len >= physical_array_len);
599 let run_len = logical_array_len / physical_array_len;
601
602 let mut run_len_extra = logical_array_len % physical_array_len;
604
605 let mut values: Vec<V::Native> = (0..physical_array_len)
606 .flat_map(|s| {
607 let mut take_len = run_len;
608 if run_len_extra > 0 {
609 take_len += 1;
610 run_len_extra -= 1;
611 }
612 std::iter::repeat_n(V::Native::from_usize(s).unwrap(), take_len)
613 })
614 .collect();
615 while values.len() < logical_array_len {
616 let last_val = values[values.len() - 1];
617 values.push(last_val);
618 }
619 let mut builder = PrimitiveRunBuilder::<R, V>::with_capacity(physical_array_len);
620 builder.extend(values.into_iter().map(Some));
621
622 builder.finish()
623}
624
625pub fn create_string_array_for_runs(
629 physical_array_len: usize,
630 logical_array_len: usize,
631 string_len: usize,
632) -> Vec<String> {
633 assert!(logical_array_len >= physical_array_len);
634 let mut rng = rng();
635
636 let run_len = logical_array_len / physical_array_len;
638
639 let mut run_len_extra = logical_array_len % physical_array_len;
641
642 let mut values: Vec<String> = (0..physical_array_len)
643 .map(|_| (0..string_len).map(|_| rng.random::<char>()).collect())
644 .flat_map(|s| {
645 let mut take_len = run_len;
646 if run_len_extra > 0 {
647 take_len += 1;
648 run_len_extra -= 1;
649 }
650 std::iter::repeat_n(s, take_len)
651 })
652 .collect();
653 while values.len() < logical_array_len {
654 let last_val = values[values.len() - 1].clone();
655 values.push(last_val);
656 }
657 values
658}
659
660pub fn create_binary_array<Offset: OffsetSizeTrait>(
662 size: usize,
663 null_density: f32,
664) -> GenericBinaryArray<Offset> {
665 create_binary_array_with_seed(
666 size,
667 null_density,
668 42, 42, )
671}
672
673pub fn create_binary_array_with_seed<Offset: OffsetSizeTrait>(
681 size: usize,
682 null_density: f32,
683 bytes_seed: u64,
684 bytes_length_seed: u64,
685) -> GenericBinaryArray<Offset> {
686 let rng = &mut StdRng::seed_from_u64(bytes_seed);
687 let range_rng = &mut StdRng::seed_from_u64(bytes_length_seed);
688
689 (0..size)
690 .map(|_| {
691 if rng.random::<f32>() < null_density {
692 None
693 } else {
694 let value = rng
695 .sample_iter::<u8, _>(StandardUniform)
696 .take(range_rng.random_range(0..8))
697 .collect::<Vec<u8>>();
698 Some(value)
699 }
700 })
701 .collect()
702}
703
704pub fn create_binary_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
709 size: usize,
710 null_density: f32,
711 min_len: usize,
712 max_len: usize,
713 prefix: &[u8],
714 seed: u64,
715) -> GenericBinaryArray<Offset> {
716 assert!(min_len <= max_len, "min_len must be <= max_len");
717 assert!(prefix.len() <= max_len, "Prefix length must be <= max_len");
718
719 let rng = &mut StdRng::seed_from_u64(seed);
720 (0..size)
721 .map(|_| {
722 if rng.random::<f32>() < null_density {
723 None
724 } else {
725 let remaining_len = rng
726 .random_range(min_len.saturating_sub(prefix.len())..=(max_len - prefix.len()));
727
728 let remaining = rng
729 .sample_iter::<u8, _>(StandardUniform)
730 .take(remaining_len);
731
732 let value = prefix.iter().copied().chain(remaining).collect::<Vec<u8>>();
733 Some(value)
734 }
735 })
736 .collect()
737}
738
739pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray {
741 let rng = &mut seedable_rng();
742
743 FixedSizeBinaryArray::try_from_sparse_iter_with_size(
744 (0..size).map(|_| {
745 if rng.random::<f32>() < null_density {
746 None
747 } else {
748 let value = rng
749 .sample_iter::<u8, _>(StandardUniform)
750 .take(value_len)
751 .collect::<Vec<u8>>();
752 Some(value)
753 }
754 }),
755 value_len as i32,
756 )
757 .unwrap()
758}
759
760pub fn create_dict_from_values<K>(
763 size: usize,
764 null_density: f32,
765 values: &dyn Array,
766) -> DictionaryArray<K>
767where
768 K: ArrowDictionaryKeyType,
769 StandardUniform: Distribution<K::Native>,
770 K::Native: SampleUniform,
771{
772 let min_key = K::Native::from_usize(0).unwrap();
773 let max_key = K::Native::from_usize(values.len()).unwrap();
774 create_sparse_dict_from_values(size, null_density, values, min_key..max_key)
775}
776
777pub fn create_sparse_dict_from_values<K>(
780 size: usize,
781 null_density: f32,
782 values: &dyn Array,
783 key_range: Range<K::Native>,
784) -> DictionaryArray<K>
785where
786 K: ArrowDictionaryKeyType,
787 StandardUniform: Distribution<K::Native>,
788 K::Native: SampleUniform,
789{
790 let mut rng = seedable_rng();
791 let data_type =
792 DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
793
794 let keys: Buffer = (0..size)
795 .map(|_| rng.random_range(key_range.clone()))
796 .collect();
797
798 let nulls: Option<Buffer> = (null_density != 0.).then(|| {
799 (0..size)
800 .map(|_| rng.random_bool(null_density as _))
801 .collect()
802 });
803
804 let data = ArrayDataBuilder::new(data_type)
805 .len(size)
806 .null_bit_buffer(nulls)
807 .add_buffer(keys)
808 .add_child_data(values.to_data())
809 .build()
810 .unwrap();
811
812 DictionaryArray::from(data)
813}
814
815pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
817 let mut rng = seedable_rng();
818
819 (0..size)
820 .map(|_| {
821 if rng.random::<f32>() < nan_density {
822 Some(f16::NAN)
823 } else {
824 Some(rng.random())
825 }
826 })
827 .collect()
828}
829
830pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
832 let mut rng = seedable_rng();
833
834 (0..size)
835 .map(|_| {
836 if rng.random::<f32>() < nan_density {
837 Some(f32::NAN)
838 } else {
839 Some(rng.random())
840 }
841 })
842 .collect()
843}
844
845pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
847 let mut rng = seedable_rng();
848
849 (0..size)
850 .map(|_| {
851 if rng.random::<f32>() < nan_density {
852 Some(f64::NAN)
853 } else {
854 Some(rng.random())
855 }
856 })
857 .collect()
858}
859
860pub fn create_f64_array_with_seed(size: usize, nan_density: f32, seed: u64) -> Float64Array {
862 let mut rng = StdRng::seed_from_u64(seed);
863
864 (0..size)
865 .map(|_| {
866 if rng.random::<f32>() < nan_density {
867 Some(f64::NAN)
868 } else {
869 Some(rng.random())
870 }
871 })
872 .collect()
873}