1use crate::array::*;
21use crate::datatypes::*;
22use crate::util::test_util::seedable_rng;
23use arrow_buffer::{Buffer, IntervalMonthDayNano};
24use half::f16;
25use rand::Rng;
26use rand::SeedableRng;
27use rand::distr::uniform::SampleUniform;
28use rand::rng;
29use rand::{
30 distr::{Alphanumeric, Distribution, StandardUniform},
31 prelude::StdRng,
32};
33use std::ops::Range;
34
35pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
37where
38 T: ArrowPrimitiveType,
39 StandardUniform: Distribution<T::Native>,
40{
41 let mut rng = seedable_rng();
42
43 (0..size)
44 .map(|_| {
45 if rng.random::<f32>() < null_density {
46 None
47 } else {
48 Some(rng.random())
49 }
50 })
51 .collect()
52}
53
54pub fn create_primitive_array_with_seed<T>(
57 size: usize,
58 null_density: f32,
59 seed: u64,
60) -> PrimitiveArray<T>
61where
62 T: ArrowPrimitiveType,
63 StandardUniform: Distribution<T::Native>,
64{
65 let mut rng = StdRng::seed_from_u64(seed);
66
67 (0..size)
68 .map(|_| {
69 if rng.random::<f32>() < null_density {
70 None
71 } else {
72 Some(rng.random())
73 }
74 })
75 .collect()
76}
77
78pub fn create_month_day_nano_array_with_seed(
81 size: usize,
82 null_density: f32,
83 seed: u64,
84) -> IntervalMonthDayNanoArray {
85 let mut rng = StdRng::seed_from_u64(seed);
86
87 (0..size)
88 .map(|_| {
89 if rng.random::<f32>() < null_density {
90 None
91 } else {
92 Some(IntervalMonthDayNano::new(
93 rng.random(),
94 rng.random(),
95 rng.random(),
96 ))
97 }
98 })
99 .collect()
100}
101
102pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray
104where
105 StandardUniform: Distribution<bool>,
106{
107 let mut rng = seedable_rng();
108 (0..size)
109 .map(|_| {
110 if rng.random::<f32>() < null_density {
111 None
112 } else {
113 let value = rng.random::<f32>() < true_density;
114 Some(value)
115 }
116 })
117 .collect()
118}
119
120pub fn create_boolean_array_with_seed(
122 size: usize,
123 null_density: f32,
124 true_density: f32,
125 seed: u64,
126) -> BooleanArray
127where
128 StandardUniform: Distribution<bool>,
129{
130 let mut rng = StdRng::seed_from_u64(seed);
131 (0..size)
132 .map(|_| {
133 if rng.random::<f32>() < null_density {
134 None
135 } else {
136 let value = rng.random::<f32>() < true_density;
137 Some(value)
138 }
139 })
140 .collect()
141}
142
143pub fn create_string_array<Offset: OffsetSizeTrait>(
149 size: usize,
150 null_density: f32,
151) -> GenericStringArray<Offset> {
152 create_string_array_with_max_len(size, null_density, 400)
153}
154
155pub fn create_longer_string_array_with_same_prefix<Offset: OffsetSizeTrait>(
159 size: usize,
160 null_density: f32,
161) -> GenericStringArray<Offset> {
162 create_string_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
163}
164
165pub fn create_longer_string_view_array_with_same_prefix(
169 size: usize,
170 null_density: f32,
171) -> StringViewArray {
172 create_string_view_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
173}
174
175fn create_string_array_with_len_range_and_prefix<Offset: OffsetSizeTrait>(
176 size: usize,
177 null_density: f32,
178 min_str_len: usize,
179 max_str_len: usize,
180 prefix: &str,
181) -> GenericStringArray<Offset> {
182 create_string_array_with_len_range_and_prefix_and_seed(
183 size,
184 null_density,
185 min_str_len,
186 max_str_len,
187 prefix,
188 42,
189 )
190}
191
192pub fn create_string_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
196 size: usize,
197 null_density: f32,
198 min_str_len: usize,
199 max_str_len: usize,
200 prefix: &str,
201 seed: u64,
202) -> GenericStringArray<Offset> {
203 assert!(
204 min_str_len <= max_str_len,
205 "min_str_len must be <= max_str_len"
206 );
207 assert!(
208 prefix.len() <= max_str_len,
209 "Prefix length must be <= max_str_len"
210 );
211
212 let rng = &mut StdRng::seed_from_u64(seed);
213 (0..size)
214 .map(|_| {
215 if rng.random::<f32>() < null_density {
216 None
217 } else {
218 let remaining_len = rng.random_range(
219 min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
220 );
221
222 let mut value = prefix.to_string();
223 value.extend(
224 rng.sample_iter(&Alphanumeric)
225 .take(remaining_len)
226 .map(char::from),
227 );
228
229 Some(value)
230 }
231 })
232 .collect()
233}
234pub fn create_string_view_array_with_len_range_and_seed(
242 size: usize,
243 null_density: f32,
244 range: Range<usize>,
245 seed: u64,
246) -> StringViewArray {
247 let rng = &mut StdRng::seed_from_u64(seed);
248 (0..size)
249 .map(|_| {
250 if rng.random::<f32>() < null_density {
251 None
252 } else {
253 let str_len = rng.random_range(range.clone());
254 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
255 let value = String::from_utf8(value).unwrap();
256 Some(value)
257 }
258 })
259 .collect()
260}
261
262fn create_string_view_array_with_len_range_and_prefix(
263 size: usize,
264 null_density: f32,
265 min_str_len: usize,
266 max_str_len: usize,
267 prefix: &str,
268) -> StringViewArray {
269 assert!(
270 min_str_len <= max_str_len,
271 "min_str_len must be <= max_str_len"
272 );
273 assert!(
274 prefix.len() <= max_str_len,
275 "Prefix length must be <= max_str_len"
276 );
277
278 let rng = &mut seedable_rng();
279 (0..size)
280 .map(|_| {
281 if rng.random::<f32>() < null_density {
282 None
283 } else {
284 let remaining_len = rng.random_range(
285 min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
286 );
287
288 let mut value = prefix.to_string();
289 value.extend(
290 rng.sample_iter(&Alphanumeric)
291 .take(remaining_len)
292 .map(char::from),
293 );
294
295 Some(value)
296 }
297 })
298 .collect()
299}
300
301pub fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
303 size: usize,
304 null_density: f32,
305 max_str_len: usize,
306) -> GenericStringArray<Offset> {
307 let rng = &mut seedable_rng();
308 (0..size)
309 .map(|_| {
310 if rng.random::<f32>() < null_density {
311 None
312 } else {
313 let str_len = rng.random_range(0..max_str_len);
314 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
315 let value = String::from_utf8(value).unwrap();
316 Some(value)
317 }
318 })
319 .collect()
320}
321
322pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
324 size: usize,
325 null_density: f32,
326 str_len: usize,
327) -> GenericStringArray<Offset> {
328 let rng = &mut seedable_rng();
329
330 (0..size)
331 .map(|_| {
332 if rng.random::<f32>() < null_density {
333 None
334 } else {
335 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
336 let value = String::from_utf8(value).unwrap();
337 Some(value)
338 }
339 })
340 .collect()
341}
342
343pub fn create_string_view_array(size: usize, null_density: f32) -> StringViewArray {
347 create_string_view_array_with_max_len(size, null_density, 400)
348}
349
350pub fn create_string_view_array_with_max_len(
352 size: usize,
353 null_density: f32,
354 max_str_len: usize,
355) -> StringViewArray {
356 let rng = &mut seedable_rng();
357 (0..size)
358 .map(|_| {
359 if rng.random::<f32>() < null_density {
360 None
361 } else {
362 let str_len = rng.random_range(0..max_str_len);
363 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
364 let value = String::from_utf8(value).unwrap();
365 Some(value)
366 }
367 })
368 .collect()
369}
370
371pub fn create_string_view_array_with_fixed_len(
373 size: usize,
374 null_density: f32,
375 str_len: usize,
376) -> StringViewArray {
377 let rng = &mut seedable_rng();
378 (0..size)
379 .map(|_| {
380 if rng.random::<f32>() < null_density {
381 None
382 } else {
383 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
384 let value = String::from_utf8(value).unwrap();
385 Some(value)
386 }
387 })
388 .collect()
389}
390
391pub fn create_string_view_array_with_len(
393 size: usize,
394 null_density: f32,
395 str_len: usize,
396 mixed: bool,
397) -> StringViewArray {
398 let rng = &mut seedable_rng();
399
400 let mut lengths = Vec::with_capacity(size);
401
402 if mixed {
404 for _ in 0..size / 2 {
405 lengths.push(rng.random_range(1..12));
406 }
407 for _ in size / 2..size {
408 lengths.push(rng.random_range(12..=std::cmp::max(30, str_len)));
409 }
410 } else {
411 lengths.resize(size, str_len);
412 }
413
414 lengths
415 .into_iter()
416 .map(|len| {
417 if rng.random::<f32>() < null_density {
418 None
419 } else {
420 let value: Vec<u8> = rng.sample_iter(&Alphanumeric).take(len).collect();
421 Some(String::from_utf8(value).unwrap())
422 }
423 })
424 .collect()
425}
426
427pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
430 size: usize,
431 null_density: f32,
432 str_len: usize,
433) -> DictionaryArray<K> {
434 let rng = &mut seedable_rng();
435
436 let data: Vec<_> = (0..size)
437 .map(|_| {
438 if rng.random::<f32>() < null_density {
439 None
440 } else {
441 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
442 let value = String::from_utf8(value).unwrap();
443 Some(value)
444 }
445 })
446 .collect();
447
448 data.iter().map(|x| x.as_deref()).collect()
449}
450
451pub fn create_primitive_list_array_with_seed<O, T>(
460 size: usize,
461 null_density: f32,
462 list_null_density: f32,
463 max_list_size: usize,
464 seed: u64,
465) -> GenericListArray<O>
466where
467 O: OffsetSizeTrait,
468 T: ArrowPrimitiveType,
469 StandardUniform: Distribution<T::Native>,
470{
471 let mut rng = StdRng::seed_from_u64(seed);
472
473 let values = (0..size).map(|_| {
474 if rng.random::<f32>() < null_density {
475 None
476 } else {
477 let list_size = rng.random_range(0..=max_list_size);
478 let list_values: Vec<Option<T::Native>> = (0..list_size)
479 .map(|_| {
480 if rng.random::<f32>() < list_null_density {
481 None
482 } else {
483 Some(rng.random())
484 }
485 })
486 .collect();
487 Some(list_values)
488 }
489 });
490
491 GenericListArray::<O>::from_iter_primitive::<T, _, _>(values)
492}
493
494pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
496 logical_array_len: usize,
497 physical_array_len: usize,
498) -> RunArray<R> {
499 assert!(logical_array_len >= physical_array_len);
500 let run_len = logical_array_len / physical_array_len;
502
503 let mut run_len_extra = logical_array_len % physical_array_len;
505
506 let mut values: Vec<V::Native> = (0..physical_array_len)
507 .flat_map(|s| {
508 let mut take_len = run_len;
509 if run_len_extra > 0 {
510 take_len += 1;
511 run_len_extra -= 1;
512 }
513 std::iter::repeat_n(V::Native::from_usize(s).unwrap(), take_len)
514 })
515 .collect();
516 while values.len() < logical_array_len {
517 let last_val = values[values.len() - 1];
518 values.push(last_val);
519 }
520 let mut builder = PrimitiveRunBuilder::<R, V>::with_capacity(physical_array_len);
521 builder.extend(values.into_iter().map(Some));
522
523 builder.finish()
524}
525
526pub fn create_string_array_for_runs(
530 physical_array_len: usize,
531 logical_array_len: usize,
532 string_len: usize,
533) -> Vec<String> {
534 assert!(logical_array_len >= physical_array_len);
535 let mut rng = rng();
536
537 let run_len = logical_array_len / physical_array_len;
539
540 let mut run_len_extra = logical_array_len % physical_array_len;
542
543 let mut values: Vec<String> = (0..physical_array_len)
544 .map(|_| (0..string_len).map(|_| rng.random::<char>()).collect())
545 .flat_map(|s| {
546 let mut take_len = run_len;
547 if run_len_extra > 0 {
548 take_len += 1;
549 run_len_extra -= 1;
550 }
551 std::iter::repeat_n(s, take_len)
552 })
553 .collect();
554 while values.len() < logical_array_len {
555 let last_val = values[values.len() - 1].clone();
556 values.push(last_val);
557 }
558 values
559}
560
561pub fn create_binary_array<Offset: OffsetSizeTrait>(
563 size: usize,
564 null_density: f32,
565) -> GenericBinaryArray<Offset> {
566 create_binary_array_with_seed(
567 size,
568 null_density,
569 42, 42, )
572}
573
574pub fn create_binary_array_with_seed<Offset: OffsetSizeTrait>(
582 size: usize,
583 null_density: f32,
584 bytes_seed: u64,
585 bytes_length_seed: u64,
586) -> GenericBinaryArray<Offset> {
587 let rng = &mut StdRng::seed_from_u64(bytes_seed);
588 let range_rng = &mut StdRng::seed_from_u64(bytes_length_seed);
589
590 (0..size)
591 .map(|_| {
592 if rng.random::<f32>() < null_density {
593 None
594 } else {
595 let value = rng
596 .sample_iter::<u8, _>(StandardUniform)
597 .take(range_rng.random_range(0..8))
598 .collect::<Vec<u8>>();
599 Some(value)
600 }
601 })
602 .collect()
603}
604
605pub fn create_binary_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
610 size: usize,
611 null_density: f32,
612 min_len: usize,
613 max_len: usize,
614 prefix: &[u8],
615 seed: u64,
616) -> GenericBinaryArray<Offset> {
617 assert!(min_len <= max_len, "min_len must be <= max_len");
618 assert!(prefix.len() <= max_len, "Prefix length must be <= max_len");
619
620 let rng = &mut StdRng::seed_from_u64(seed);
621 (0..size)
622 .map(|_| {
623 if rng.random::<f32>() < null_density {
624 None
625 } else {
626 let remaining_len = rng
627 .random_range(min_len.saturating_sub(prefix.len())..=(max_len - prefix.len()));
628
629 let remaining = rng
630 .sample_iter::<u8, _>(StandardUniform)
631 .take(remaining_len);
632
633 let value = prefix.iter().copied().chain(remaining).collect::<Vec<u8>>();
634 Some(value)
635 }
636 })
637 .collect()
638}
639
640pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray {
642 let rng = &mut seedable_rng();
643
644 FixedSizeBinaryArray::try_from_sparse_iter_with_size(
645 (0..size).map(|_| {
646 if rng.random::<f32>() < null_density {
647 None
648 } else {
649 let value = rng
650 .sample_iter::<u8, _>(StandardUniform)
651 .take(value_len)
652 .collect::<Vec<u8>>();
653 Some(value)
654 }
655 }),
656 value_len as i32,
657 )
658 .unwrap()
659}
660
661pub fn create_dict_from_values<K>(
664 size: usize,
665 null_density: f32,
666 values: &dyn Array,
667) -> DictionaryArray<K>
668where
669 K: ArrowDictionaryKeyType,
670 StandardUniform: Distribution<K::Native>,
671 K::Native: SampleUniform,
672{
673 let min_key = K::Native::from_usize(0).unwrap();
674 let max_key = K::Native::from_usize(values.len()).unwrap();
675 create_sparse_dict_from_values(size, null_density, values, min_key..max_key)
676}
677
678pub fn create_sparse_dict_from_values<K>(
681 size: usize,
682 null_density: f32,
683 values: &dyn Array,
684 key_range: Range<K::Native>,
685) -> DictionaryArray<K>
686where
687 K: ArrowDictionaryKeyType,
688 StandardUniform: Distribution<K::Native>,
689 K::Native: SampleUniform,
690{
691 let mut rng = seedable_rng();
692 let data_type =
693 DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
694
695 let keys: Buffer = (0..size)
696 .map(|_| rng.random_range(key_range.clone()))
697 .collect();
698
699 let nulls: Option<Buffer> = (null_density != 0.).then(|| {
700 (0..size)
701 .map(|_| rng.random_bool(null_density as _))
702 .collect()
703 });
704
705 let data = ArrayDataBuilder::new(data_type)
706 .len(size)
707 .null_bit_buffer(nulls)
708 .add_buffer(keys)
709 .add_child_data(values.to_data())
710 .build()
711 .unwrap();
712
713 DictionaryArray::from(data)
714}
715
716pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
718 let mut rng = seedable_rng();
719
720 (0..size)
721 .map(|_| {
722 if rng.random::<f32>() < nan_density {
723 Some(f16::NAN)
724 } else {
725 Some(f16::from_f32(rng.random()))
726 }
727 })
728 .collect()
729}
730
731pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
733 let mut rng = seedable_rng();
734
735 (0..size)
736 .map(|_| {
737 if rng.random::<f32>() < nan_density {
738 Some(f32::NAN)
739 } else {
740 Some(rng.random())
741 }
742 })
743 .collect()
744}
745
746pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
748 let mut rng = seedable_rng();
749
750 (0..size)
751 .map(|_| {
752 if rng.random::<f32>() < nan_density {
753 Some(f64::NAN)
754 } else {
755 Some(rng.random())
756 }
757 })
758 .collect()
759}
760
761pub fn create_f64_array_with_seed(size: usize, nan_density: f32, seed: u64) -> Float64Array {
763 let mut rng = StdRng::seed_from_u64(seed);
764
765 (0..size)
766 .map(|_| {
767 if rng.random::<f32>() < nan_density {
768 Some(f64::NAN)
769 } else {
770 Some(rng.random())
771 }
772 })
773 .collect()
774}