arrow/util/
bench_util.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Utils to make benchmarking easier
19
20use crate::array::*;
21use crate::datatypes::*;
22use crate::util::test_util::seedable_rng;
23use arrow_buffer::{Buffer, IntervalMonthDayNano};
24use half::f16;
25use rand::Rng;
26use rand::SeedableRng;
27use rand::distr::uniform::SampleUniform;
28use rand::rng;
29use rand::{
30    distr::{Alphanumeric, Distribution, StandardUniform},
31    prelude::StdRng,
32};
33use std::ops::Range;
34
35/// Creates an random (but fixed-seeded) array of a given size and null density
36pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
37where
38    T: ArrowPrimitiveType,
39    StandardUniform: Distribution<T::Native>,
40{
41    let mut rng = seedable_rng();
42
43    (0..size)
44        .map(|_| {
45            if rng.random::<f32>() < null_density {
46                None
47            } else {
48                Some(rng.random())
49            }
50        })
51        .collect()
52}
53
54/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
55/// filling it with random numbers generated using the provided `seed`.
56pub fn create_primitive_array_with_seed<T>(
57    size: usize,
58    null_density: f32,
59    seed: u64,
60) -> PrimitiveArray<T>
61where
62    T: ArrowPrimitiveType,
63    StandardUniform: Distribution<T::Native>,
64{
65    let mut rng = StdRng::seed_from_u64(seed);
66
67    (0..size)
68        .map(|_| {
69            if rng.random::<f32>() < null_density {
70                None
71            } else {
72                Some(rng.random())
73            }
74        })
75        .collect()
76}
77
78/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
79/// filling it with random [`IntervalMonthDayNano`] generated using the provided `seed`.
80pub fn create_month_day_nano_array_with_seed(
81    size: usize,
82    null_density: f32,
83    seed: u64,
84) -> IntervalMonthDayNanoArray {
85    let mut rng = StdRng::seed_from_u64(seed);
86
87    (0..size)
88        .map(|_| {
89            if rng.random::<f32>() < null_density {
90                None
91            } else {
92                Some(IntervalMonthDayNano::new(
93                    rng.random(),
94                    rng.random(),
95                    rng.random(),
96                ))
97            }
98        })
99        .collect()
100}
101
102/// Creates a random (but fixed-seeded) array of a given size and null density
103pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray
104where
105    StandardUniform: Distribution<bool>,
106{
107    let mut rng = seedable_rng();
108    (0..size)
109        .map(|_| {
110            if rng.random::<f32>() < null_density {
111                None
112            } else {
113                let value = rng.random::<f32>() < true_density;
114                Some(value)
115            }
116        })
117        .collect()
118}
119
120/// Creates a random array of a given size and null density based on the provided seed
121pub fn create_boolean_array_with_seed(
122    size: usize,
123    null_density: f32,
124    true_density: f32,
125    seed: u64,
126) -> BooleanArray
127where
128    StandardUniform: Distribution<bool>,
129{
130    let mut rng = StdRng::seed_from_u64(seed);
131    (0..size)
132        .map(|_| {
133            if rng.random::<f32>() < null_density {
134                None
135            } else {
136                let value = rng.random::<f32>() < true_density;
137                Some(value)
138            }
139        })
140        .collect()
141}
142
143/// Creates a random (but fixed-seeded) string array of a given size and null density.
144///
145/// Strings have a random length
146/// between 0 and 400 alphanumeric characters. `0..400` is chosen to cover a wide range of common string lengths,
147/// which have a dramatic impact on performance of some queries, e.g. LIKE/ILIKE/regex.
148pub fn create_string_array<Offset: OffsetSizeTrait>(
149    size: usize,
150    null_density: f32,
151) -> GenericStringArray<Offset> {
152    create_string_array_with_max_len(size, null_density, 400)
153}
154
155/// Creates longer string array with same prefix, the prefix should be larger than 4 bytes,
156/// and the string length should be larger than 12 bytes
157/// so that we can compare the performance with StringViewArray, because StringViewArray has 4 bytes inline for view
158pub fn create_longer_string_array_with_same_prefix<Offset: OffsetSizeTrait>(
159    size: usize,
160    null_density: f32,
161) -> GenericStringArray<Offset> {
162    create_string_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
163}
164
165/// Creates longer string view array with same prefix, the prefix should be larger than 4 bytes,
166/// and the string length should be larger than 12 bytes
167/// so that we can compare the StringArray performance with StringViewArray, because StringViewArray has 4 bytes inline for view
168pub fn create_longer_string_view_array_with_same_prefix(
169    size: usize,
170    null_density: f32,
171) -> StringViewArray {
172    create_string_view_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
173}
174
175fn create_string_array_with_len_range_and_prefix<Offset: OffsetSizeTrait>(
176    size: usize,
177    null_density: f32,
178    min_str_len: usize,
179    max_str_len: usize,
180    prefix: &str,
181) -> GenericStringArray<Offset> {
182    create_string_array_with_len_range_and_prefix_and_seed(
183        size,
184        null_density,
185        min_str_len,
186        max_str_len,
187        prefix,
188        42,
189    )
190}
191
192/// Creates a random [`GenericStringArray`] of a given `size` and `null_density`
193/// filling it with random strings with lengths in the specified range,
194/// all starting with the provided `prefix`, generated using the provided `seed`.
195pub fn create_string_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
196    size: usize,
197    null_density: f32,
198    min_str_len: usize,
199    max_str_len: usize,
200    prefix: &str,
201    seed: u64,
202) -> GenericStringArray<Offset> {
203    assert!(
204        min_str_len <= max_str_len,
205        "min_str_len must be <= max_str_len"
206    );
207    assert!(
208        prefix.len() <= max_str_len,
209        "Prefix length must be <= max_str_len"
210    );
211
212    let rng = &mut StdRng::seed_from_u64(seed);
213    (0..size)
214        .map(|_| {
215            if rng.random::<f32>() < null_density {
216                None
217            } else {
218                let remaining_len = rng.random_range(
219                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
220                );
221
222                let mut value = prefix.to_string();
223                value.extend(
224                    rng.sample_iter(&Alphanumeric)
225                        .take(remaining_len)
226                        .map(char::from),
227                );
228
229                Some(value)
230            }
231        })
232        .collect()
233}
234/// Creates a string view array of a given range, null density and length
235///
236/// Arguments:
237/// - `size`: number of  string view array
238/// - `null_density`: density of nulls in the string view array
239/// - `range`: range size of each string in the string view array
240/// - `seed`: seed for the random number generator
241pub fn create_string_view_array_with_len_range_and_seed(
242    size: usize,
243    null_density: f32,
244    range: Range<usize>,
245    seed: u64,
246) -> StringViewArray {
247    let rng = &mut StdRng::seed_from_u64(seed);
248    (0..size)
249        .map(|_| {
250            if rng.random::<f32>() < null_density {
251                None
252            } else {
253                let str_len = rng.random_range(range.clone());
254                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
255                let value = String::from_utf8(value).unwrap();
256                Some(value)
257            }
258        })
259        .collect()
260}
261
262fn create_string_view_array_with_len_range_and_prefix(
263    size: usize,
264    null_density: f32,
265    min_str_len: usize,
266    max_str_len: usize,
267    prefix: &str,
268) -> StringViewArray {
269    assert!(
270        min_str_len <= max_str_len,
271        "min_str_len must be <= max_str_len"
272    );
273    assert!(
274        prefix.len() <= max_str_len,
275        "Prefix length must be <= max_str_len"
276    );
277
278    let rng = &mut seedable_rng();
279    (0..size)
280        .map(|_| {
281            if rng.random::<f32>() < null_density {
282                None
283            } else {
284                let remaining_len = rng.random_range(
285                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
286                );
287
288                let mut value = prefix.to_string();
289                value.extend(
290                    rng.sample_iter(&Alphanumeric)
291                        .take(remaining_len)
292                        .map(char::from),
293                );
294
295                Some(value)
296            }
297        })
298        .collect()
299}
300
301/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
302pub fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
303    size: usize,
304    null_density: f32,
305    max_str_len: usize,
306) -> GenericStringArray<Offset> {
307    let rng = &mut seedable_rng();
308    (0..size)
309        .map(|_| {
310            if rng.random::<f32>() < null_density {
311                None
312            } else {
313                let str_len = rng.random_range(0..max_str_len);
314                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
315                let value = String::from_utf8(value).unwrap();
316                Some(value)
317            }
318        })
319        .collect()
320}
321
322/// Creates a random (but fixed-seeded) array of a given size, null density and length
323pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
324    size: usize,
325    null_density: f32,
326    str_len: usize,
327) -> GenericStringArray<Offset> {
328    let rng = &mut seedable_rng();
329
330    (0..size)
331        .map(|_| {
332            if rng.random::<f32>() < null_density {
333                None
334            } else {
335                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
336                let value = String::from_utf8(value).unwrap();
337                Some(value)
338            }
339        })
340        .collect()
341}
342
343/// Creates a random (but fixed-seeded) string view array of a given size and null density.
344///
345/// See `create_string_array` above for more details.
346pub fn create_string_view_array(size: usize, null_density: f32) -> StringViewArray {
347    create_string_view_array_with_max_len(size, null_density, 400)
348}
349
350/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
351pub fn create_string_view_array_with_max_len(
352    size: usize,
353    null_density: f32,
354    max_str_len: usize,
355) -> StringViewArray {
356    let rng = &mut seedable_rng();
357    (0..size)
358        .map(|_| {
359            if rng.random::<f32>() < null_density {
360                None
361            } else {
362                let str_len = rng.random_range(0..max_str_len);
363                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
364                let value = String::from_utf8(value).unwrap();
365                Some(value)
366            }
367        })
368        .collect()
369}
370
371/// Creates a random (but fixed-seeded) array of a given size, null density and length
372pub fn create_string_view_array_with_fixed_len(
373    size: usize,
374    null_density: f32,
375    str_len: usize,
376) -> StringViewArray {
377    let rng = &mut seedable_rng();
378    (0..size)
379        .map(|_| {
380            if rng.random::<f32>() < null_density {
381                None
382            } else {
383                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
384                let value = String::from_utf8(value).unwrap();
385                Some(value)
386            }
387        })
388        .collect()
389}
390
391/// Creates a random (but fixed-seeded) array of a given size, null density and length
392pub fn create_string_view_array_with_len(
393    size: usize,
394    null_density: f32,
395    str_len: usize,
396    mixed: bool,
397) -> StringViewArray {
398    let rng = &mut seedable_rng();
399
400    let mut lengths = Vec::with_capacity(size);
401
402    // if mixed, we creates first half that string length small than 12 bytes and second half large than 12 bytes
403    if mixed {
404        for _ in 0..size / 2 {
405            lengths.push(rng.random_range(1..12));
406        }
407        for _ in size / 2..size {
408            lengths.push(rng.random_range(12..=std::cmp::max(30, str_len)));
409        }
410    } else {
411        lengths.resize(size, str_len);
412    }
413
414    lengths
415        .into_iter()
416        .map(|len| {
417            if rng.random::<f32>() < null_density {
418                None
419            } else {
420                let value: Vec<u8> = rng.sample_iter(&Alphanumeric).take(len).collect();
421                Some(String::from_utf8(value).unwrap())
422            }
423        })
424        .collect()
425}
426
427/// Creates an random (but fixed-seeded) array of a given size and null density
428/// consisting of random 4 character alphanumeric strings
429pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
430    size: usize,
431    null_density: f32,
432    str_len: usize,
433) -> DictionaryArray<K> {
434    let rng = &mut seedable_rng();
435
436    let data: Vec<_> = (0..size)
437        .map(|_| {
438            if rng.random::<f32>() < null_density {
439                None
440            } else {
441                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
442                let value = String::from_utf8(value).unwrap();
443                Some(value)
444            }
445        })
446        .collect();
447
448    data.iter().map(|x| x.as_deref()).collect()
449}
450
451/// Create a List/LargeList Array  of primitive values
452///
453/// Arguments:
454/// - `size`: number of lists in the array
455/// - `null_density`: density of nulls in the list array
456/// - `list_null_density`: density of nulls in the primitive arrays inside the lists
457/// - `max_list_size`: maximum size of each list (actual size is random between 0 and max_list_size)
458/// - `seed`: seed for the random number generator
459pub fn create_primitive_list_array_with_seed<O, T>(
460    size: usize,
461    null_density: f32,
462    list_null_density: f32,
463    max_list_size: usize,
464    seed: u64,
465) -> GenericListArray<O>
466where
467    O: OffsetSizeTrait,
468    T: ArrowPrimitiveType,
469    StandardUniform: Distribution<T::Native>,
470{
471    let mut rng = StdRng::seed_from_u64(seed);
472
473    let values = (0..size).map(|_| {
474        if rng.random::<f32>() < null_density {
475            None
476        } else {
477            let list_size = rng.random_range(0..=max_list_size);
478            let list_values: Vec<Option<T::Native>> = (0..list_size)
479                .map(|_| {
480                    if rng.random::<f32>() < list_null_density {
481                        None
482                    } else {
483                        Some(rng.random())
484                    }
485                })
486                .collect();
487            Some(list_values)
488        }
489    });
490
491    GenericListArray::<O>::from_iter_primitive::<T, _, _>(values)
492}
493
494/// Create primitive run array for given logical and physical array lengths
495pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
496    logical_array_len: usize,
497    physical_array_len: usize,
498) -> RunArray<R> {
499    assert!(logical_array_len >= physical_array_len);
500    // typical length of each run
501    let run_len = logical_array_len / physical_array_len;
502
503    // Some runs should have extra length
504    let mut run_len_extra = logical_array_len % physical_array_len;
505
506    let mut values: Vec<V::Native> = (0..physical_array_len)
507        .flat_map(|s| {
508            let mut take_len = run_len;
509            if run_len_extra > 0 {
510                take_len += 1;
511                run_len_extra -= 1;
512            }
513            std::iter::repeat_n(V::Native::from_usize(s).unwrap(), take_len)
514        })
515        .collect();
516    while values.len() < logical_array_len {
517        let last_val = values[values.len() - 1];
518        values.push(last_val);
519    }
520    let mut builder = PrimitiveRunBuilder::<R, V>::with_capacity(physical_array_len);
521    builder.extend(values.into_iter().map(Some));
522
523    builder.finish()
524}
525
526/// Create string array to be used by run array builder. The string array
527/// will result in run array with physical length of `physical_array_len`
528/// and logical length of `logical_array_len`
529pub fn create_string_array_for_runs(
530    physical_array_len: usize,
531    logical_array_len: usize,
532    string_len: usize,
533) -> Vec<String> {
534    assert!(logical_array_len >= physical_array_len);
535    let mut rng = rng();
536
537    // typical length of each run
538    let run_len = logical_array_len / physical_array_len;
539
540    // Some runs should have extra length
541    let mut run_len_extra = logical_array_len % physical_array_len;
542
543    let mut values: Vec<String> = (0..physical_array_len)
544        .map(|_| (0..string_len).map(|_| rng.random::<char>()).collect())
545        .flat_map(|s| {
546            let mut take_len = run_len;
547            if run_len_extra > 0 {
548                take_len += 1;
549                run_len_extra -= 1;
550            }
551            std::iter::repeat_n(s, take_len)
552        })
553        .collect();
554    while values.len() < logical_array_len {
555        let last_val = values[values.len() - 1].clone();
556        values.push(last_val);
557    }
558    values
559}
560
561/// Creates an random (but fixed-seeded) binary array of a given size and null density
562pub fn create_binary_array<Offset: OffsetSizeTrait>(
563    size: usize,
564    null_density: f32,
565) -> GenericBinaryArray<Offset> {
566    create_binary_array_with_seed(
567        size,
568        null_density,
569        42, // bytes_seed
570        42, // bytes_length_seed
571    )
572}
573
574/// Creates a random [`GenericBinaryArray`] of a given `size` and `null_density`
575/// filling it with random bytes, generated using the provided `seed`s.
576///
577/// the `bytes_seed` is used to seed the RNG for generating the byte values,
578/// while the `bytes_length_seed` is used to seed the RNG for generating the length of an array item
579///
580/// These values can be the same as they are used to seed different RNGs internally.
581pub fn create_binary_array_with_seed<Offset: OffsetSizeTrait>(
582    size: usize,
583    null_density: f32,
584    bytes_seed: u64,
585    bytes_length_seed: u64,
586) -> GenericBinaryArray<Offset> {
587    let rng = &mut StdRng::seed_from_u64(bytes_seed);
588    let range_rng = &mut StdRng::seed_from_u64(bytes_length_seed);
589
590    (0..size)
591        .map(|_| {
592            if rng.random::<f32>() < null_density {
593                None
594            } else {
595                let value = rng
596                    .sample_iter::<u8, _>(StandardUniform)
597                    .take(range_rng.random_range(0..8))
598                    .collect::<Vec<u8>>();
599                Some(value)
600            }
601        })
602        .collect()
603}
604
605/// Creates a random [`GenericBinaryArray`] of a given `size` and `null_density`
606/// filling it with random bytes with lengths in the specified range,
607/// all starting with the provided `prefix`, generated using the provided `seed`.
608///
609pub fn create_binary_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
610    size: usize,
611    null_density: f32,
612    min_len: usize,
613    max_len: usize,
614    prefix: &[u8],
615    seed: u64,
616) -> GenericBinaryArray<Offset> {
617    assert!(min_len <= max_len, "min_len must be <= max_len");
618    assert!(prefix.len() <= max_len, "Prefix length must be <= max_len");
619
620    let rng = &mut StdRng::seed_from_u64(seed);
621    (0..size)
622        .map(|_| {
623            if rng.random::<f32>() < null_density {
624                None
625            } else {
626                let remaining_len = rng
627                    .random_range(min_len.saturating_sub(prefix.len())..=(max_len - prefix.len()));
628
629                let remaining = rng
630                    .sample_iter::<u8, _>(StandardUniform)
631                    .take(remaining_len);
632
633                let value = prefix.iter().copied().chain(remaining).collect::<Vec<u8>>();
634                Some(value)
635            }
636        })
637        .collect()
638}
639
640/// Creates an random (but fixed-seeded) array of a given size and null density
641pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray {
642    let rng = &mut seedable_rng();
643
644    FixedSizeBinaryArray::try_from_sparse_iter_with_size(
645        (0..size).map(|_| {
646            if rng.random::<f32>() < null_density {
647                None
648            } else {
649                let value = rng
650                    .sample_iter::<u8, _>(StandardUniform)
651                    .take(value_len)
652                    .collect::<Vec<u8>>();
653                Some(value)
654            }
655        }),
656        value_len as i32,
657    )
658    .unwrap()
659}
660
661/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
662/// with the provided values array
663pub fn create_dict_from_values<K>(
664    size: usize,
665    null_density: f32,
666    values: &dyn Array,
667) -> DictionaryArray<K>
668where
669    K: ArrowDictionaryKeyType,
670    StandardUniform: Distribution<K::Native>,
671    K::Native: SampleUniform,
672{
673    let min_key = K::Native::from_usize(0).unwrap();
674    let max_key = K::Native::from_usize(values.len()).unwrap();
675    create_sparse_dict_from_values(size, null_density, values, min_key..max_key)
676}
677
678/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
679/// with the provided values array and key range
680pub fn create_sparse_dict_from_values<K>(
681    size: usize,
682    null_density: f32,
683    values: &dyn Array,
684    key_range: Range<K::Native>,
685) -> DictionaryArray<K>
686where
687    K: ArrowDictionaryKeyType,
688    StandardUniform: Distribution<K::Native>,
689    K::Native: SampleUniform,
690{
691    let mut rng = seedable_rng();
692    let data_type =
693        DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
694
695    let keys: Buffer = (0..size)
696        .map(|_| rng.random_range(key_range.clone()))
697        .collect();
698
699    let nulls: Option<Buffer> = (null_density != 0.).then(|| {
700        (0..size)
701            .map(|_| rng.random_bool(null_density as _))
702            .collect()
703    });
704
705    let data = ArrayDataBuilder::new(data_type)
706        .len(size)
707        .null_bit_buffer(nulls)
708        .add_buffer(keys)
709        .add_child_data(values.to_data())
710        .build()
711        .unwrap();
712
713    DictionaryArray::from(data)
714}
715
716/// Creates a random (but fixed-seeded) f16 array of a given size and nan-value density
717pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
718    let mut rng = seedable_rng();
719
720    (0..size)
721        .map(|_| {
722            if rng.random::<f32>() < nan_density {
723                Some(f16::NAN)
724            } else {
725                Some(f16::from_f32(rng.random()))
726            }
727        })
728        .collect()
729}
730
731/// Creates a random (but fixed-seeded) f32 array of a given size and nan-value density
732pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
733    let mut rng = seedable_rng();
734
735    (0..size)
736        .map(|_| {
737            if rng.random::<f32>() < nan_density {
738                Some(f32::NAN)
739            } else {
740                Some(rng.random())
741            }
742        })
743        .collect()
744}
745
746/// Creates a random (but fixed-seeded) f64 array of a given size and nan-value density
747pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
748    let mut rng = seedable_rng();
749
750    (0..size)
751        .map(|_| {
752            if rng.random::<f32>() < nan_density {
753                Some(f64::NAN)
754            } else {
755                Some(rng.random())
756            }
757        })
758        .collect()
759}
760
761/// Creates a random f64 array of a given size and nan-value density based on a given seed
762pub fn create_f64_array_with_seed(size: usize, nan_density: f32, seed: u64) -> Float64Array {
763    let mut rng = StdRng::seed_from_u64(seed);
764
765    (0..size)
766        .map(|_| {
767            if rng.random::<f32>() < nan_density {
768                Some(f64::NAN)
769            } else {
770                Some(rng.random())
771            }
772        })
773        .collect()
774}