1use crate::array::*;
21use crate::datatypes::*;
22use crate::util::test_util::seedable_rng;
23use arrow_buffer::{Buffer, IntervalMonthDayNano};
24use half::f16;
25use rand::Rng;
26use rand::SeedableRng;
27use rand::distr::uniform::SampleUniform;
28use rand::rng;
29use rand::{
30 distr::{Alphanumeric, Distribution, StandardUniform},
31 prelude::StdRng,
32};
33use std::ops::Range;
34
35pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
37where
38 T: ArrowPrimitiveType,
39 StandardUniform: Distribution<T::Native>,
40{
41 let mut rng = seedable_rng();
42
43 (0..size)
44 .map(|_| {
45 if rng.random::<f32>() < null_density {
46 None
47 } else {
48 Some(rng.random())
49 }
50 })
51 .collect()
52}
53
54pub fn create_primitive_array_with_seed<T>(
57 size: usize,
58 null_density: f32,
59 seed: u64,
60) -> PrimitiveArray<T>
61where
62 T: ArrowPrimitiveType,
63 StandardUniform: Distribution<T::Native>,
64{
65 let mut rng = StdRng::seed_from_u64(seed);
66
67 (0..size)
68 .map(|_| {
69 if rng.random::<f32>() < null_density {
70 None
71 } else {
72 Some(rng.random())
73 }
74 })
75 .collect()
76}
77
78pub fn create_month_day_nano_array_with_seed(
81 size: usize,
82 null_density: f32,
83 seed: u64,
84) -> IntervalMonthDayNanoArray {
85 let mut rng = StdRng::seed_from_u64(seed);
86
87 (0..size)
88 .map(|_| {
89 if rng.random::<f32>() < null_density {
90 None
91 } else {
92 Some(IntervalMonthDayNano::new(
93 rng.random(),
94 rng.random(),
95 rng.random(),
96 ))
97 }
98 })
99 .collect()
100}
101
102pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray
104where
105 StandardUniform: Distribution<bool>,
106{
107 let mut rng = seedable_rng();
108 (0..size)
109 .map(|_| {
110 if rng.random::<f32>() < null_density {
111 None
112 } else {
113 let value = rng.random::<f32>() < true_density;
114 Some(value)
115 }
116 })
117 .collect()
118}
119
120pub fn create_string_array<Offset: OffsetSizeTrait>(
126 size: usize,
127 null_density: f32,
128) -> GenericStringArray<Offset> {
129 create_string_array_with_max_len(size, null_density, 400)
130}
131
132pub fn create_longer_string_array_with_same_prefix<Offset: OffsetSizeTrait>(
136 size: usize,
137 null_density: f32,
138) -> GenericStringArray<Offset> {
139 create_string_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
140}
141
142pub fn create_longer_string_view_array_with_same_prefix(
146 size: usize,
147 null_density: f32,
148) -> StringViewArray {
149 create_string_view_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
150}
151
152fn create_string_array_with_len_range_and_prefix<Offset: OffsetSizeTrait>(
153 size: usize,
154 null_density: f32,
155 min_str_len: usize,
156 max_str_len: usize,
157 prefix: &str,
158) -> GenericStringArray<Offset> {
159 create_string_array_with_len_range_and_prefix_and_seed(
160 size,
161 null_density,
162 min_str_len,
163 max_str_len,
164 prefix,
165 42,
166 )
167}
168
169pub fn create_string_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
173 size: usize,
174 null_density: f32,
175 min_str_len: usize,
176 max_str_len: usize,
177 prefix: &str,
178 seed: u64,
179) -> GenericStringArray<Offset> {
180 assert!(
181 min_str_len <= max_str_len,
182 "min_str_len must be <= max_str_len"
183 );
184 assert!(
185 prefix.len() <= max_str_len,
186 "Prefix length must be <= max_str_len"
187 );
188
189 let rng = &mut StdRng::seed_from_u64(seed);
190 (0..size)
191 .map(|_| {
192 if rng.random::<f32>() < null_density {
193 None
194 } else {
195 let remaining_len = rng.random_range(
196 min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
197 );
198
199 let mut value = prefix.to_string();
200 value.extend(
201 rng.sample_iter(&Alphanumeric)
202 .take(remaining_len)
203 .map(char::from),
204 );
205
206 Some(value)
207 }
208 })
209 .collect()
210}
211
212fn create_string_view_array_with_len_range_and_prefix(
213 size: usize,
214 null_density: f32,
215 min_str_len: usize,
216 max_str_len: usize,
217 prefix: &str,
218) -> StringViewArray {
219 assert!(
220 min_str_len <= max_str_len,
221 "min_str_len must be <= max_str_len"
222 );
223 assert!(
224 prefix.len() <= max_str_len,
225 "Prefix length must be <= max_str_len"
226 );
227
228 let rng = &mut seedable_rng();
229 (0..size)
230 .map(|_| {
231 if rng.random::<f32>() < null_density {
232 None
233 } else {
234 let remaining_len = rng.random_range(
235 min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
236 );
237
238 let mut value = prefix.to_string();
239 value.extend(
240 rng.sample_iter(&Alphanumeric)
241 .take(remaining_len)
242 .map(char::from),
243 );
244
245 Some(value)
246 }
247 })
248 .collect()
249}
250
251pub fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
253 size: usize,
254 null_density: f32,
255 max_str_len: usize,
256) -> GenericStringArray<Offset> {
257 let rng = &mut seedable_rng();
258 (0..size)
259 .map(|_| {
260 if rng.random::<f32>() < null_density {
261 None
262 } else {
263 let str_len = rng.random_range(0..max_str_len);
264 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
265 let value = String::from_utf8(value).unwrap();
266 Some(value)
267 }
268 })
269 .collect()
270}
271
272pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
274 size: usize,
275 null_density: f32,
276 str_len: usize,
277) -> GenericStringArray<Offset> {
278 let rng = &mut seedable_rng();
279
280 (0..size)
281 .map(|_| {
282 if rng.random::<f32>() < null_density {
283 None
284 } else {
285 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
286 let value = String::from_utf8(value).unwrap();
287 Some(value)
288 }
289 })
290 .collect()
291}
292
293pub fn create_string_view_array(size: usize, null_density: f32) -> StringViewArray {
297 create_string_view_array_with_max_len(size, null_density, 400)
298}
299
300pub fn create_string_view_array_with_max_len(
302 size: usize,
303 null_density: f32,
304 max_str_len: usize,
305) -> StringViewArray {
306 let rng = &mut seedable_rng();
307 (0..size)
308 .map(|_| {
309 if rng.random::<f32>() < null_density {
310 None
311 } else {
312 let str_len = rng.random_range(0..max_str_len);
313 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
314 let value = String::from_utf8(value).unwrap();
315 Some(value)
316 }
317 })
318 .collect()
319}
320
321pub fn create_string_view_array_with_fixed_len(
323 size: usize,
324 null_density: f32,
325 str_len: usize,
326) -> StringViewArray {
327 let rng = &mut seedable_rng();
328 (0..size)
329 .map(|_| {
330 if rng.random::<f32>() < null_density {
331 None
332 } else {
333 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
334 let value = String::from_utf8(value).unwrap();
335 Some(value)
336 }
337 })
338 .collect()
339}
340
341pub fn create_string_view_array_with_len(
343 size: usize,
344 null_density: f32,
345 str_len: usize,
346 mixed: bool,
347) -> StringViewArray {
348 let rng = &mut seedable_rng();
349
350 let mut lengths = Vec::with_capacity(size);
351
352 if mixed {
354 for _ in 0..size / 2 {
355 lengths.push(rng.random_range(1..12));
356 }
357 for _ in size / 2..size {
358 lengths.push(rng.random_range(12..=std::cmp::max(30, str_len)));
359 }
360 } else {
361 lengths.resize(size, str_len);
362 }
363
364 lengths
365 .into_iter()
366 .map(|len| {
367 if rng.random::<f32>() < null_density {
368 None
369 } else {
370 let value: Vec<u8> = rng.sample_iter(&Alphanumeric).take(len).collect();
371 Some(String::from_utf8(value).unwrap())
372 }
373 })
374 .collect()
375}
376
377pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
380 size: usize,
381 null_density: f32,
382 str_len: usize,
383) -> DictionaryArray<K> {
384 let rng = &mut seedable_rng();
385
386 let data: Vec<_> = (0..size)
387 .map(|_| {
388 if rng.random::<f32>() < null_density {
389 None
390 } else {
391 let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
392 let value = String::from_utf8(value).unwrap();
393 Some(value)
394 }
395 })
396 .collect();
397
398 data.iter().map(|x| x.as_deref()).collect()
399}
400
401pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
403 logical_array_len: usize,
404 physical_array_len: usize,
405) -> RunArray<R> {
406 assert!(logical_array_len >= physical_array_len);
407 let run_len = logical_array_len / physical_array_len;
409
410 let mut run_len_extra = logical_array_len % physical_array_len;
412
413 let mut values: Vec<V::Native> = (0..physical_array_len)
414 .flat_map(|s| {
415 let mut take_len = run_len;
416 if run_len_extra > 0 {
417 take_len += 1;
418 run_len_extra -= 1;
419 }
420 std::iter::repeat_n(V::Native::from_usize(s).unwrap(), take_len)
421 })
422 .collect();
423 while values.len() < logical_array_len {
424 let last_val = values[values.len() - 1];
425 values.push(last_val);
426 }
427 let mut builder = PrimitiveRunBuilder::<R, V>::with_capacity(physical_array_len);
428 builder.extend(values.into_iter().map(Some));
429
430 builder.finish()
431}
432
433pub fn create_string_array_for_runs(
437 physical_array_len: usize,
438 logical_array_len: usize,
439 string_len: usize,
440) -> Vec<String> {
441 assert!(logical_array_len >= physical_array_len);
442 let mut rng = rng();
443
444 let run_len = logical_array_len / physical_array_len;
446
447 let mut run_len_extra = logical_array_len % physical_array_len;
449
450 let mut values: Vec<String> = (0..physical_array_len)
451 .map(|_| (0..string_len).map(|_| rng.random::<char>()).collect())
452 .flat_map(|s| {
453 let mut take_len = run_len;
454 if run_len_extra > 0 {
455 take_len += 1;
456 run_len_extra -= 1;
457 }
458 std::iter::repeat_n(s, take_len)
459 })
460 .collect();
461 while values.len() < logical_array_len {
462 let last_val = values[values.len() - 1].clone();
463 values.push(last_val);
464 }
465 values
466}
467
468pub fn create_binary_array<Offset: OffsetSizeTrait>(
470 size: usize,
471 null_density: f32,
472) -> GenericBinaryArray<Offset> {
473 create_binary_array_with_seed(
474 size,
475 null_density,
476 42, 42, )
479}
480
481pub fn create_binary_array_with_seed<Offset: OffsetSizeTrait>(
489 size: usize,
490 null_density: f32,
491 bytes_seed: u64,
492 bytes_length_seed: u64,
493) -> GenericBinaryArray<Offset> {
494 let rng = &mut StdRng::seed_from_u64(bytes_seed);
495 let range_rng = &mut StdRng::seed_from_u64(bytes_length_seed);
496
497 (0..size)
498 .map(|_| {
499 if rng.random::<f32>() < null_density {
500 None
501 } else {
502 let value = rng
503 .sample_iter::<u8, _>(StandardUniform)
504 .take(range_rng.random_range(0..8))
505 .collect::<Vec<u8>>();
506 Some(value)
507 }
508 })
509 .collect()
510}
511
512pub fn create_binary_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
517 size: usize,
518 null_density: f32,
519 min_len: usize,
520 max_len: usize,
521 prefix: &[u8],
522 seed: u64,
523) -> GenericBinaryArray<Offset> {
524 assert!(min_len <= max_len, "min_len must be <= max_len");
525 assert!(prefix.len() <= max_len, "Prefix length must be <= max_len");
526
527 let rng = &mut StdRng::seed_from_u64(seed);
528 (0..size)
529 .map(|_| {
530 if rng.random::<f32>() < null_density {
531 None
532 } else {
533 let remaining_len = rng
534 .random_range(min_len.saturating_sub(prefix.len())..=(max_len - prefix.len()));
535
536 let remaining = rng
537 .sample_iter::<u8, _>(StandardUniform)
538 .take(remaining_len);
539
540 let value = prefix.iter().copied().chain(remaining).collect::<Vec<u8>>();
541 Some(value)
542 }
543 })
544 .collect()
545}
546
547pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray {
549 let rng = &mut seedable_rng();
550
551 FixedSizeBinaryArray::try_from_sparse_iter_with_size(
552 (0..size).map(|_| {
553 if rng.random::<f32>() < null_density {
554 None
555 } else {
556 let value = rng
557 .sample_iter::<u8, _>(StandardUniform)
558 .take(value_len)
559 .collect::<Vec<u8>>();
560 Some(value)
561 }
562 }),
563 value_len as i32,
564 )
565 .unwrap()
566}
567
568pub fn create_dict_from_values<K>(
571 size: usize,
572 null_density: f32,
573 values: &dyn Array,
574) -> DictionaryArray<K>
575where
576 K: ArrowDictionaryKeyType,
577 StandardUniform: Distribution<K::Native>,
578 K::Native: SampleUniform,
579{
580 let min_key = K::Native::from_usize(0).unwrap();
581 let max_key = K::Native::from_usize(values.len()).unwrap();
582 create_sparse_dict_from_values(size, null_density, values, min_key..max_key)
583}
584
585pub fn create_sparse_dict_from_values<K>(
588 size: usize,
589 null_density: f32,
590 values: &dyn Array,
591 key_range: Range<K::Native>,
592) -> DictionaryArray<K>
593where
594 K: ArrowDictionaryKeyType,
595 StandardUniform: Distribution<K::Native>,
596 K::Native: SampleUniform,
597{
598 let mut rng = seedable_rng();
599 let data_type =
600 DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
601
602 let keys: Buffer = (0..size)
603 .map(|_| rng.random_range(key_range.clone()))
604 .collect();
605
606 let nulls: Option<Buffer> = (null_density != 0.).then(|| {
607 (0..size)
608 .map(|_| rng.random_bool(null_density as _))
609 .collect()
610 });
611
612 let data = ArrayDataBuilder::new(data_type)
613 .len(size)
614 .null_bit_buffer(nulls)
615 .add_buffer(keys)
616 .add_child_data(values.to_data())
617 .build()
618 .unwrap();
619
620 DictionaryArray::from(data)
621}
622
623pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
625 let mut rng = seedable_rng();
626
627 (0..size)
628 .map(|_| {
629 if rng.random::<f32>() < nan_density {
630 Some(f16::NAN)
631 } else {
632 Some(f16::from_f32(rng.random()))
633 }
634 })
635 .collect()
636}
637
638pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
640 let mut rng = seedable_rng();
641
642 (0..size)
643 .map(|_| {
644 if rng.random::<f32>() < nan_density {
645 Some(f32::NAN)
646 } else {
647 Some(rng.random())
648 }
649 })
650 .collect()
651}
652
653pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
655 let mut rng = seedable_rng();
656
657 (0..size)
658 .map(|_| {
659 if rng.random::<f32>() < nan_density {
660 Some(f64::NAN)
661 } else {
662 Some(rng.random())
663 }
664 })
665 .collect()
666}