arrow/util/
bench_util.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Utils to make benchmarking easier
19
20use crate::array::*;
21use crate::datatypes::*;
22use crate::util::test_util::seedable_rng;
23use arrow_buffer::{Buffer, IntervalMonthDayNano};
24use half::f16;
25use rand::Rng;
26use rand::SeedableRng;
27use rand::distr::uniform::SampleUniform;
28use rand::rng;
29use rand::{
30    distr::{Alphanumeric, Distribution, StandardUniform},
31    prelude::StdRng,
32};
33use std::ops::Range;
34
35/// Creates an random (but fixed-seeded) array of a given size and null density
36pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
37where
38    T: ArrowPrimitiveType,
39    StandardUniform: Distribution<T::Native>,
40{
41    let mut rng = seedable_rng();
42
43    (0..size)
44        .map(|_| {
45            if rng.random::<f32>() < null_density {
46                None
47            } else {
48                Some(rng.random())
49            }
50        })
51        .collect()
52}
53
54/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
55/// filling it with random numbers generated using the provided `seed`.
56pub fn create_primitive_array_with_seed<T>(
57    size: usize,
58    null_density: f32,
59    seed: u64,
60) -> PrimitiveArray<T>
61where
62    T: ArrowPrimitiveType,
63    StandardUniform: Distribution<T::Native>,
64{
65    let mut rng = StdRng::seed_from_u64(seed);
66
67    (0..size)
68        .map(|_| {
69            if rng.random::<f32>() < null_density {
70                None
71            } else {
72                Some(rng.random())
73            }
74        })
75        .collect()
76}
77
78/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
79/// filling it with random [`IntervalMonthDayNano`] generated using the provided `seed`.
80pub fn create_month_day_nano_array_with_seed(
81    size: usize,
82    null_density: f32,
83    seed: u64,
84) -> IntervalMonthDayNanoArray {
85    let mut rng = StdRng::seed_from_u64(seed);
86
87    (0..size)
88        .map(|_| {
89            if rng.random::<f32>() < null_density {
90                None
91            } else {
92                Some(IntervalMonthDayNano::new(
93                    rng.random(),
94                    rng.random(),
95                    rng.random(),
96                ))
97            }
98        })
99        .collect()
100}
101
102/// Creates a random (but fixed-seeded) array of a given size and null density
103pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray
104where
105    StandardUniform: Distribution<bool>,
106{
107    let mut rng = seedable_rng();
108    (0..size)
109        .map(|_| {
110            if rng.random::<f32>() < null_density {
111                None
112            } else {
113                let value = rng.random::<f32>() < true_density;
114                Some(value)
115            }
116        })
117        .collect()
118}
119
120/// Creates a random (but fixed-seeded) string array of a given size and null density.
121///
122/// Strings have a random length
123/// between 0 and 400 alphanumeric characters. `0..400` is chosen to cover a wide range of common string lengths,
124/// which have a dramatic impact on performance of some queries, e.g. LIKE/ILIKE/regex.
125pub fn create_string_array<Offset: OffsetSizeTrait>(
126    size: usize,
127    null_density: f32,
128) -> GenericStringArray<Offset> {
129    create_string_array_with_max_len(size, null_density, 400)
130}
131
132/// Creates longer string array with same prefix, the prefix should be larger than 4 bytes,
133/// and the string length should be larger than 12 bytes
134/// so that we can compare the performance with StringViewArray, because StringViewArray has 4 bytes inline for view
135pub fn create_longer_string_array_with_same_prefix<Offset: OffsetSizeTrait>(
136    size: usize,
137    null_density: f32,
138) -> GenericStringArray<Offset> {
139    create_string_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
140}
141
142/// Creates longer string view array with same prefix, the prefix should be larger than 4 bytes,
143/// and the string length should be larger than 12 bytes
144/// so that we can compare the StringArray performance with StringViewArray, because StringViewArray has 4 bytes inline for view
145pub fn create_longer_string_view_array_with_same_prefix(
146    size: usize,
147    null_density: f32,
148) -> StringViewArray {
149    create_string_view_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
150}
151
152fn create_string_array_with_len_range_and_prefix<Offset: OffsetSizeTrait>(
153    size: usize,
154    null_density: f32,
155    min_str_len: usize,
156    max_str_len: usize,
157    prefix: &str,
158) -> GenericStringArray<Offset> {
159    create_string_array_with_len_range_and_prefix_and_seed(
160        size,
161        null_density,
162        min_str_len,
163        max_str_len,
164        prefix,
165        42,
166    )
167}
168
169/// Creates a random [`GenericStringArray`] of a given `size` and `null_density`
170/// filling it with random strings with lengths in the specified range,
171/// all starting with the provided `prefix`, generated using the provided `seed`.
172pub fn create_string_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
173    size: usize,
174    null_density: f32,
175    min_str_len: usize,
176    max_str_len: usize,
177    prefix: &str,
178    seed: u64,
179) -> GenericStringArray<Offset> {
180    assert!(
181        min_str_len <= max_str_len,
182        "min_str_len must be <= max_str_len"
183    );
184    assert!(
185        prefix.len() <= max_str_len,
186        "Prefix length must be <= max_str_len"
187    );
188
189    let rng = &mut StdRng::seed_from_u64(seed);
190    (0..size)
191        .map(|_| {
192            if rng.random::<f32>() < null_density {
193                None
194            } else {
195                let remaining_len = rng.random_range(
196                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
197                );
198
199                let mut value = prefix.to_string();
200                value.extend(
201                    rng.sample_iter(&Alphanumeric)
202                        .take(remaining_len)
203                        .map(char::from),
204                );
205
206                Some(value)
207            }
208        })
209        .collect()
210}
211/// Creates a string view array of a given range, null density and length
212///
213/// Arguments:
214/// - `size`: number of  string view array
215/// - `null_density`: density of nulls in the string view array
216/// - `range`: range size of each string in the string view array
217/// - `seed`: seed for the random number generator
218pub fn create_string_view_array_with_len_range_and_seed(
219    size: usize,
220    null_density: f32,
221    range: Range<usize>,
222    seed: u64,
223) -> StringViewArray {
224    let rng = &mut StdRng::seed_from_u64(seed);
225    (0..size)
226        .map(|_| {
227            if rng.random::<f32>() < null_density {
228                None
229            } else {
230                let str_len = rng.random_range(range.clone());
231                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
232                let value = String::from_utf8(value).unwrap();
233                Some(value)
234            }
235        })
236        .collect()
237}
238
239fn create_string_view_array_with_len_range_and_prefix(
240    size: usize,
241    null_density: f32,
242    min_str_len: usize,
243    max_str_len: usize,
244    prefix: &str,
245) -> StringViewArray {
246    assert!(
247        min_str_len <= max_str_len,
248        "min_str_len must be <= max_str_len"
249    );
250    assert!(
251        prefix.len() <= max_str_len,
252        "Prefix length must be <= max_str_len"
253    );
254
255    let rng = &mut seedable_rng();
256    (0..size)
257        .map(|_| {
258            if rng.random::<f32>() < null_density {
259                None
260            } else {
261                let remaining_len = rng.random_range(
262                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
263                );
264
265                let mut value = prefix.to_string();
266                value.extend(
267                    rng.sample_iter(&Alphanumeric)
268                        .take(remaining_len)
269                        .map(char::from),
270                );
271
272                Some(value)
273            }
274        })
275        .collect()
276}
277
278/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
279pub fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
280    size: usize,
281    null_density: f32,
282    max_str_len: usize,
283) -> GenericStringArray<Offset> {
284    let rng = &mut seedable_rng();
285    (0..size)
286        .map(|_| {
287            if rng.random::<f32>() < null_density {
288                None
289            } else {
290                let str_len = rng.random_range(0..max_str_len);
291                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
292                let value = String::from_utf8(value).unwrap();
293                Some(value)
294            }
295        })
296        .collect()
297}
298
299/// Creates a random (but fixed-seeded) array of a given size, null density and length
300pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
301    size: usize,
302    null_density: f32,
303    str_len: usize,
304) -> GenericStringArray<Offset> {
305    let rng = &mut seedable_rng();
306
307    (0..size)
308        .map(|_| {
309            if rng.random::<f32>() < null_density {
310                None
311            } else {
312                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
313                let value = String::from_utf8(value).unwrap();
314                Some(value)
315            }
316        })
317        .collect()
318}
319
320/// Creates a random (but fixed-seeded) string view array of a given size and null density.
321///
322/// See `create_string_array` above for more details.
323pub fn create_string_view_array(size: usize, null_density: f32) -> StringViewArray {
324    create_string_view_array_with_max_len(size, null_density, 400)
325}
326
327/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
328pub fn create_string_view_array_with_max_len(
329    size: usize,
330    null_density: f32,
331    max_str_len: usize,
332) -> StringViewArray {
333    let rng = &mut seedable_rng();
334    (0..size)
335        .map(|_| {
336            if rng.random::<f32>() < null_density {
337                None
338            } else {
339                let str_len = rng.random_range(0..max_str_len);
340                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
341                let value = String::from_utf8(value).unwrap();
342                Some(value)
343            }
344        })
345        .collect()
346}
347
348/// Creates a random (but fixed-seeded) array of a given size, null density and length
349pub fn create_string_view_array_with_fixed_len(
350    size: usize,
351    null_density: f32,
352    str_len: usize,
353) -> StringViewArray {
354    let rng = &mut seedable_rng();
355    (0..size)
356        .map(|_| {
357            if rng.random::<f32>() < null_density {
358                None
359            } else {
360                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
361                let value = String::from_utf8(value).unwrap();
362                Some(value)
363            }
364        })
365        .collect()
366}
367
368/// Creates a random (but fixed-seeded) array of a given size, null density and length
369pub fn create_string_view_array_with_len(
370    size: usize,
371    null_density: f32,
372    str_len: usize,
373    mixed: bool,
374) -> StringViewArray {
375    let rng = &mut seedable_rng();
376
377    let mut lengths = Vec::with_capacity(size);
378
379    // if mixed, we creates first half that string length small than 12 bytes and second half large than 12 bytes
380    if mixed {
381        for _ in 0..size / 2 {
382            lengths.push(rng.random_range(1..12));
383        }
384        for _ in size / 2..size {
385            lengths.push(rng.random_range(12..=std::cmp::max(30, str_len)));
386        }
387    } else {
388        lengths.resize(size, str_len);
389    }
390
391    lengths
392        .into_iter()
393        .map(|len| {
394            if rng.random::<f32>() < null_density {
395                None
396            } else {
397                let value: Vec<u8> = rng.sample_iter(&Alphanumeric).take(len).collect();
398                Some(String::from_utf8(value).unwrap())
399            }
400        })
401        .collect()
402}
403
404/// Creates an random (but fixed-seeded) array of a given size and null density
405/// consisting of random 4 character alphanumeric strings
406pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
407    size: usize,
408    null_density: f32,
409    str_len: usize,
410) -> DictionaryArray<K> {
411    let rng = &mut seedable_rng();
412
413    let data: Vec<_> = (0..size)
414        .map(|_| {
415            if rng.random::<f32>() < null_density {
416                None
417            } else {
418                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
419                let value = String::from_utf8(value).unwrap();
420                Some(value)
421            }
422        })
423        .collect();
424
425    data.iter().map(|x| x.as_deref()).collect()
426}
427
428/// Create a List/LargeList Array  of primitive values
429///
430/// Arguments:
431/// - `size`: number of lists in the array
432/// - `null_density`: density of nulls in the list array
433/// - `list_null_density`: density of nulls in the primitive arrays inside the lists
434/// - `max_list_size`: maximum size of each list (actual size is random between 0 and max_list_size)
435/// - `seed`: seed for the random number generator
436pub fn create_primitive_list_array_with_seed<O, T>(
437    size: usize,
438    null_density: f32,
439    list_null_density: f32,
440    max_list_size: usize,
441    seed: u64,
442) -> GenericListArray<O>
443where
444    O: OffsetSizeTrait,
445    T: ArrowPrimitiveType,
446    StandardUniform: Distribution<T::Native>,
447{
448    let mut rng = StdRng::seed_from_u64(seed);
449
450    let values = (0..size).map(|_| {
451        if rng.random::<f32>() < null_density {
452            None
453        } else {
454            let list_size = rng.random_range(0..=max_list_size);
455            let list_values: Vec<Option<T::Native>> = (0..list_size)
456                .map(|_| {
457                    if rng.random::<f32>() < list_null_density {
458                        None
459                    } else {
460                        Some(rng.random())
461                    }
462                })
463                .collect();
464            Some(list_values)
465        }
466    });
467
468    GenericListArray::<O>::from_iter_primitive::<T, _, _>(values)
469}
470
471/// Create primitive run array for given logical and physical array lengths
472pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
473    logical_array_len: usize,
474    physical_array_len: usize,
475) -> RunArray<R> {
476    assert!(logical_array_len >= physical_array_len);
477    // typical length of each run
478    let run_len = logical_array_len / physical_array_len;
479
480    // Some runs should have extra length
481    let mut run_len_extra = logical_array_len % physical_array_len;
482
483    let mut values: Vec<V::Native> = (0..physical_array_len)
484        .flat_map(|s| {
485            let mut take_len = run_len;
486            if run_len_extra > 0 {
487                take_len += 1;
488                run_len_extra -= 1;
489            }
490            std::iter::repeat_n(V::Native::from_usize(s).unwrap(), take_len)
491        })
492        .collect();
493    while values.len() < logical_array_len {
494        let last_val = values[values.len() - 1];
495        values.push(last_val);
496    }
497    let mut builder = PrimitiveRunBuilder::<R, V>::with_capacity(physical_array_len);
498    builder.extend(values.into_iter().map(Some));
499
500    builder.finish()
501}
502
503/// Create string array to be used by run array builder. The string array
504/// will result in run array with physical length of `physical_array_len`
505/// and logical length of `logical_array_len`
506pub fn create_string_array_for_runs(
507    physical_array_len: usize,
508    logical_array_len: usize,
509    string_len: usize,
510) -> Vec<String> {
511    assert!(logical_array_len >= physical_array_len);
512    let mut rng = rng();
513
514    // typical length of each run
515    let run_len = logical_array_len / physical_array_len;
516
517    // Some runs should have extra length
518    let mut run_len_extra = logical_array_len % physical_array_len;
519
520    let mut values: Vec<String> = (0..physical_array_len)
521        .map(|_| (0..string_len).map(|_| rng.random::<char>()).collect())
522        .flat_map(|s| {
523            let mut take_len = run_len;
524            if run_len_extra > 0 {
525                take_len += 1;
526                run_len_extra -= 1;
527            }
528            std::iter::repeat_n(s, take_len)
529        })
530        .collect();
531    while values.len() < logical_array_len {
532        let last_val = values[values.len() - 1].clone();
533        values.push(last_val);
534    }
535    values
536}
537
538/// Creates an random (but fixed-seeded) binary array of a given size and null density
539pub fn create_binary_array<Offset: OffsetSizeTrait>(
540    size: usize,
541    null_density: f32,
542) -> GenericBinaryArray<Offset> {
543    create_binary_array_with_seed(
544        size,
545        null_density,
546        42, // bytes_seed
547        42, // bytes_length_seed
548    )
549}
550
551/// Creates a random [`GenericBinaryArray`] of a given `size` and `null_density`
552/// filling it with random bytes, generated using the provided `seed`s.
553///
554/// the `bytes_seed` is used to seed the RNG for generating the byte values,
555/// while the `bytes_length_seed` is used to seed the RNG for generating the length of an array item
556///
557/// These values can be the same as they are used to seed different RNGs internally.
558pub fn create_binary_array_with_seed<Offset: OffsetSizeTrait>(
559    size: usize,
560    null_density: f32,
561    bytes_seed: u64,
562    bytes_length_seed: u64,
563) -> GenericBinaryArray<Offset> {
564    let rng = &mut StdRng::seed_from_u64(bytes_seed);
565    let range_rng = &mut StdRng::seed_from_u64(bytes_length_seed);
566
567    (0..size)
568        .map(|_| {
569            if rng.random::<f32>() < null_density {
570                None
571            } else {
572                let value = rng
573                    .sample_iter::<u8, _>(StandardUniform)
574                    .take(range_rng.random_range(0..8))
575                    .collect::<Vec<u8>>();
576                Some(value)
577            }
578        })
579        .collect()
580}
581
582/// Creates a random [`GenericBinaryArray`] of a given `size` and `null_density`
583/// filling it with random bytes with lengths in the specified range,
584/// all starting with the provided `prefix`, generated using the provided `seed`.
585///
586pub fn create_binary_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
587    size: usize,
588    null_density: f32,
589    min_len: usize,
590    max_len: usize,
591    prefix: &[u8],
592    seed: u64,
593) -> GenericBinaryArray<Offset> {
594    assert!(min_len <= max_len, "min_len must be <= max_len");
595    assert!(prefix.len() <= max_len, "Prefix length must be <= max_len");
596
597    let rng = &mut StdRng::seed_from_u64(seed);
598    (0..size)
599        .map(|_| {
600            if rng.random::<f32>() < null_density {
601                None
602            } else {
603                let remaining_len = rng
604                    .random_range(min_len.saturating_sub(prefix.len())..=(max_len - prefix.len()));
605
606                let remaining = rng
607                    .sample_iter::<u8, _>(StandardUniform)
608                    .take(remaining_len);
609
610                let value = prefix.iter().copied().chain(remaining).collect::<Vec<u8>>();
611                Some(value)
612            }
613        })
614        .collect()
615}
616
617/// Creates an random (but fixed-seeded) array of a given size and null density
618pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray {
619    let rng = &mut seedable_rng();
620
621    FixedSizeBinaryArray::try_from_sparse_iter_with_size(
622        (0..size).map(|_| {
623            if rng.random::<f32>() < null_density {
624                None
625            } else {
626                let value = rng
627                    .sample_iter::<u8, _>(StandardUniform)
628                    .take(value_len)
629                    .collect::<Vec<u8>>();
630                Some(value)
631            }
632        }),
633        value_len as i32,
634    )
635    .unwrap()
636}
637
638/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
639/// with the provided values array
640pub fn create_dict_from_values<K>(
641    size: usize,
642    null_density: f32,
643    values: &dyn Array,
644) -> DictionaryArray<K>
645where
646    K: ArrowDictionaryKeyType,
647    StandardUniform: Distribution<K::Native>,
648    K::Native: SampleUniform,
649{
650    let min_key = K::Native::from_usize(0).unwrap();
651    let max_key = K::Native::from_usize(values.len()).unwrap();
652    create_sparse_dict_from_values(size, null_density, values, min_key..max_key)
653}
654
655/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
656/// with the provided values array and key range
657pub fn create_sparse_dict_from_values<K>(
658    size: usize,
659    null_density: f32,
660    values: &dyn Array,
661    key_range: Range<K::Native>,
662) -> DictionaryArray<K>
663where
664    K: ArrowDictionaryKeyType,
665    StandardUniform: Distribution<K::Native>,
666    K::Native: SampleUniform,
667{
668    let mut rng = seedable_rng();
669    let data_type =
670        DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
671
672    let keys: Buffer = (0..size)
673        .map(|_| rng.random_range(key_range.clone()))
674        .collect();
675
676    let nulls: Option<Buffer> = (null_density != 0.).then(|| {
677        (0..size)
678            .map(|_| rng.random_bool(null_density as _))
679            .collect()
680    });
681
682    let data = ArrayDataBuilder::new(data_type)
683        .len(size)
684        .null_bit_buffer(nulls)
685        .add_buffer(keys)
686        .add_child_data(values.to_data())
687        .build()
688        .unwrap();
689
690    DictionaryArray::from(data)
691}
692
693/// Creates a random (but fixed-seeded) f16 array of a given size and nan-value density
694pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
695    let mut rng = seedable_rng();
696
697    (0..size)
698        .map(|_| {
699            if rng.random::<f32>() < nan_density {
700                Some(f16::NAN)
701            } else {
702                Some(f16::from_f32(rng.random()))
703            }
704        })
705        .collect()
706}
707
708/// Creates a random (but fixed-seeded) f32 array of a given size and nan-value density
709pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
710    let mut rng = seedable_rng();
711
712    (0..size)
713        .map(|_| {
714            if rng.random::<f32>() < nan_density {
715                Some(f32::NAN)
716            } else {
717                Some(rng.random())
718            }
719        })
720        .collect()
721}
722
723/// Creates a random (but fixed-seeded) f64 array of a given size and nan-value density
724pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
725    let mut rng = seedable_rng();
726
727    (0..size)
728        .map(|_| {
729            if rng.random::<f32>() < nan_density {
730                Some(f64::NAN)
731            } else {
732                Some(rng.random())
733            }
734        })
735        .collect()
736}