arrow/util/
bench_util.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Utils to make benchmarking easier
19
20use crate::array::*;
21use crate::datatypes::*;
22use crate::util::test_util::seedable_rng;
23use arrow_buffer::{Buffer, IntervalMonthDayNano};
24use half::f16;
25use rand::Rng;
26use rand::SeedableRng;
27use rand::distr::uniform::SampleUniform;
28use rand::rng;
29use rand::{
30    distr::{Alphanumeric, Distribution, StandardUniform},
31    prelude::StdRng,
32};
33use std::ops::Range;
34
35/// Creates an random (but fixed-seeded) array of a given size and null density
36pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
37where
38    T: ArrowPrimitiveType,
39    StandardUniform: Distribution<T::Native>,
40{
41    let mut rng = seedable_rng();
42
43    (0..size)
44        .map(|_| {
45            if rng.random::<f32>() < null_density {
46                None
47            } else {
48                Some(rng.random())
49            }
50        })
51        .collect()
52}
53
54/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
55/// filling it with random numbers generated using the provided `seed`.
56pub fn create_primitive_array_with_seed<T>(
57    size: usize,
58    null_density: f32,
59    seed: u64,
60) -> PrimitiveArray<T>
61where
62    T: ArrowPrimitiveType,
63    StandardUniform: Distribution<T::Native>,
64{
65    let mut rng = StdRng::seed_from_u64(seed);
66
67    (0..size)
68        .map(|_| {
69            if rng.random::<f32>() < null_density {
70                None
71            } else {
72                Some(rng.random())
73            }
74        })
75        .collect()
76}
77
78/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
79/// filling it with random [`IntervalMonthDayNano`] generated using the provided `seed`.
80pub fn create_month_day_nano_array_with_seed(
81    size: usize,
82    null_density: f32,
83    seed: u64,
84) -> IntervalMonthDayNanoArray {
85    let mut rng = StdRng::seed_from_u64(seed);
86
87    (0..size)
88        .map(|_| {
89            if rng.random::<f32>() < null_density {
90                None
91            } else {
92                Some(IntervalMonthDayNano::new(
93                    rng.random(),
94                    rng.random(),
95                    rng.random(),
96                ))
97            }
98        })
99        .collect()
100}
101
102/// Creates a random (but fixed-seeded) array of a given size and null density
103pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray
104where
105    StandardUniform: Distribution<bool>,
106{
107    let mut rng = seedable_rng();
108    (0..size)
109        .map(|_| {
110            if rng.random::<f32>() < null_density {
111                None
112            } else {
113                let value = rng.random::<f32>() < true_density;
114                Some(value)
115            }
116        })
117        .collect()
118}
119
120/// Creates a random (but fixed-seeded) string array of a given size and null density.
121///
122/// Strings have a random length
123/// between 0 and 400 alphanumeric characters. `0..400` is chosen to cover a wide range of common string lengths,
124/// which have a dramatic impact on performance of some queries, e.g. LIKE/ILIKE/regex.
125pub fn create_string_array<Offset: OffsetSizeTrait>(
126    size: usize,
127    null_density: f32,
128) -> GenericStringArray<Offset> {
129    create_string_array_with_max_len(size, null_density, 400)
130}
131
132/// Creates longer string array with same prefix, the prefix should be larger than 4 bytes,
133/// and the string length should be larger than 12 bytes
134/// so that we can compare the performance with StringViewArray, because StringViewArray has 4 bytes inline for view
135pub fn create_longer_string_array_with_same_prefix<Offset: OffsetSizeTrait>(
136    size: usize,
137    null_density: f32,
138) -> GenericStringArray<Offset> {
139    create_string_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
140}
141
142/// Creates longer string view array with same prefix, the prefix should be larger than 4 bytes,
143/// and the string length should be larger than 12 bytes
144/// so that we can compare the StringArray performance with StringViewArray, because StringViewArray has 4 bytes inline for view
145pub fn create_longer_string_view_array_with_same_prefix(
146    size: usize,
147    null_density: f32,
148) -> StringViewArray {
149    create_string_view_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
150}
151
152fn create_string_array_with_len_range_and_prefix<Offset: OffsetSizeTrait>(
153    size: usize,
154    null_density: f32,
155    min_str_len: usize,
156    max_str_len: usize,
157    prefix: &str,
158) -> GenericStringArray<Offset> {
159    create_string_array_with_len_range_and_prefix_and_seed(
160        size,
161        null_density,
162        min_str_len,
163        max_str_len,
164        prefix,
165        42,
166    )
167}
168
169/// Creates a random [`GenericStringArray`] of a given `size` and `null_density`
170/// filling it with random strings with lengths in the specified range,
171/// all starting with the provided `prefix`, generated using the provided `seed`.
172pub fn create_string_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
173    size: usize,
174    null_density: f32,
175    min_str_len: usize,
176    max_str_len: usize,
177    prefix: &str,
178    seed: u64,
179) -> GenericStringArray<Offset> {
180    assert!(
181        min_str_len <= max_str_len,
182        "min_str_len must be <= max_str_len"
183    );
184    assert!(
185        prefix.len() <= max_str_len,
186        "Prefix length must be <= max_str_len"
187    );
188
189    let rng = &mut StdRng::seed_from_u64(seed);
190    (0..size)
191        .map(|_| {
192            if rng.random::<f32>() < null_density {
193                None
194            } else {
195                let remaining_len = rng.random_range(
196                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
197                );
198
199                let mut value = prefix.to_string();
200                value.extend(
201                    rng.sample_iter(&Alphanumeric)
202                        .take(remaining_len)
203                        .map(char::from),
204                );
205
206                Some(value)
207            }
208        })
209        .collect()
210}
211
212fn create_string_view_array_with_len_range_and_prefix(
213    size: usize,
214    null_density: f32,
215    min_str_len: usize,
216    max_str_len: usize,
217    prefix: &str,
218) -> StringViewArray {
219    assert!(
220        min_str_len <= max_str_len,
221        "min_str_len must be <= max_str_len"
222    );
223    assert!(
224        prefix.len() <= max_str_len,
225        "Prefix length must be <= max_str_len"
226    );
227
228    let rng = &mut seedable_rng();
229    (0..size)
230        .map(|_| {
231            if rng.random::<f32>() < null_density {
232                None
233            } else {
234                let remaining_len = rng.random_range(
235                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
236                );
237
238                let mut value = prefix.to_string();
239                value.extend(
240                    rng.sample_iter(&Alphanumeric)
241                        .take(remaining_len)
242                        .map(char::from),
243                );
244
245                Some(value)
246            }
247        })
248        .collect()
249}
250
251/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
252pub fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
253    size: usize,
254    null_density: f32,
255    max_str_len: usize,
256) -> GenericStringArray<Offset> {
257    let rng = &mut seedable_rng();
258    (0..size)
259        .map(|_| {
260            if rng.random::<f32>() < null_density {
261                None
262            } else {
263                let str_len = rng.random_range(0..max_str_len);
264                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
265                let value = String::from_utf8(value).unwrap();
266                Some(value)
267            }
268        })
269        .collect()
270}
271
272/// Creates a random (but fixed-seeded) array of a given size, null density and length
273pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
274    size: usize,
275    null_density: f32,
276    str_len: usize,
277) -> GenericStringArray<Offset> {
278    let rng = &mut seedable_rng();
279
280    (0..size)
281        .map(|_| {
282            if rng.random::<f32>() < null_density {
283                None
284            } else {
285                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
286                let value = String::from_utf8(value).unwrap();
287                Some(value)
288            }
289        })
290        .collect()
291}
292
293/// Creates a random (but fixed-seeded) string view array of a given size and null density.
294///
295/// See `create_string_array` above for more details.
296pub fn create_string_view_array(size: usize, null_density: f32) -> StringViewArray {
297    create_string_view_array_with_max_len(size, null_density, 400)
298}
299
300/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
301pub fn create_string_view_array_with_max_len(
302    size: usize,
303    null_density: f32,
304    max_str_len: usize,
305) -> StringViewArray {
306    let rng = &mut seedable_rng();
307    (0..size)
308        .map(|_| {
309            if rng.random::<f32>() < null_density {
310                None
311            } else {
312                let str_len = rng.random_range(0..max_str_len);
313                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
314                let value = String::from_utf8(value).unwrap();
315                Some(value)
316            }
317        })
318        .collect()
319}
320
321/// Creates a random (but fixed-seeded) array of a given size, null density and length
322pub fn create_string_view_array_with_fixed_len(
323    size: usize,
324    null_density: f32,
325    str_len: usize,
326) -> StringViewArray {
327    let rng = &mut seedable_rng();
328    (0..size)
329        .map(|_| {
330            if rng.random::<f32>() < null_density {
331                None
332            } else {
333                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
334                let value = String::from_utf8(value).unwrap();
335                Some(value)
336            }
337        })
338        .collect()
339}
340
341/// Creates a random (but fixed-seeded) array of a given size, null density and length
342pub fn create_string_view_array_with_len(
343    size: usize,
344    null_density: f32,
345    str_len: usize,
346    mixed: bool,
347) -> StringViewArray {
348    let rng = &mut seedable_rng();
349
350    let mut lengths = Vec::with_capacity(size);
351
352    // if mixed, we creates first half that string length small than 12 bytes and second half large than 12 bytes
353    if mixed {
354        for _ in 0..size / 2 {
355            lengths.push(rng.random_range(1..12));
356        }
357        for _ in size / 2..size {
358            lengths.push(rng.random_range(12..=std::cmp::max(30, str_len)));
359        }
360    } else {
361        lengths.resize(size, str_len);
362    }
363
364    lengths
365        .into_iter()
366        .map(|len| {
367            if rng.random::<f32>() < null_density {
368                None
369            } else {
370                let value: Vec<u8> = rng.sample_iter(&Alphanumeric).take(len).collect();
371                Some(String::from_utf8(value).unwrap())
372            }
373        })
374        .collect()
375}
376
377/// Creates an random (but fixed-seeded) array of a given size and null density
378/// consisting of random 4 character alphanumeric strings
379pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
380    size: usize,
381    null_density: f32,
382    str_len: usize,
383) -> DictionaryArray<K> {
384    let rng = &mut seedable_rng();
385
386    let data: Vec<_> = (0..size)
387        .map(|_| {
388            if rng.random::<f32>() < null_density {
389                None
390            } else {
391                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
392                let value = String::from_utf8(value).unwrap();
393                Some(value)
394            }
395        })
396        .collect();
397
398    data.iter().map(|x| x.as_deref()).collect()
399}
400
401/// Create primitive run array for given logical and physical array lengths
402pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
403    logical_array_len: usize,
404    physical_array_len: usize,
405) -> RunArray<R> {
406    assert!(logical_array_len >= physical_array_len);
407    // typical length of each run
408    let run_len = logical_array_len / physical_array_len;
409
410    // Some runs should have extra length
411    let mut run_len_extra = logical_array_len % physical_array_len;
412
413    let mut values: Vec<V::Native> = (0..physical_array_len)
414        .flat_map(|s| {
415            let mut take_len = run_len;
416            if run_len_extra > 0 {
417                take_len += 1;
418                run_len_extra -= 1;
419            }
420            std::iter::repeat_n(V::Native::from_usize(s).unwrap(), take_len)
421        })
422        .collect();
423    while values.len() < logical_array_len {
424        let last_val = values[values.len() - 1];
425        values.push(last_val);
426    }
427    let mut builder = PrimitiveRunBuilder::<R, V>::with_capacity(physical_array_len);
428    builder.extend(values.into_iter().map(Some));
429
430    builder.finish()
431}
432
433/// Create string array to be used by run array builder. The string array
434/// will result in run array with physical length of `physical_array_len`
435/// and logical length of `logical_array_len`
436pub fn create_string_array_for_runs(
437    physical_array_len: usize,
438    logical_array_len: usize,
439    string_len: usize,
440) -> Vec<String> {
441    assert!(logical_array_len >= physical_array_len);
442    let mut rng = rng();
443
444    // typical length of each run
445    let run_len = logical_array_len / physical_array_len;
446
447    // Some runs should have extra length
448    let mut run_len_extra = logical_array_len % physical_array_len;
449
450    let mut values: Vec<String> = (0..physical_array_len)
451        .map(|_| (0..string_len).map(|_| rng.random::<char>()).collect())
452        .flat_map(|s| {
453            let mut take_len = run_len;
454            if run_len_extra > 0 {
455                take_len += 1;
456                run_len_extra -= 1;
457            }
458            std::iter::repeat_n(s, take_len)
459        })
460        .collect();
461    while values.len() < logical_array_len {
462        let last_val = values[values.len() - 1].clone();
463        values.push(last_val);
464    }
465    values
466}
467
468/// Creates an random (but fixed-seeded) binary array of a given size and null density
469pub fn create_binary_array<Offset: OffsetSizeTrait>(
470    size: usize,
471    null_density: f32,
472) -> GenericBinaryArray<Offset> {
473    create_binary_array_with_seed(
474        size,
475        null_density,
476        42, // bytes_seed
477        42, // bytes_length_seed
478    )
479}
480
481/// Creates a random [`GenericBinaryArray`] of a given `size` and `null_density`
482/// filling it with random bytes, generated using the provided `seed`s.
483///
484/// the `bytes_seed` is used to seed the RNG for generating the byte values,
485/// while the `bytes_length_seed` is used to seed the RNG for generating the length of an array item
486///
487/// These values can be the same as they are used to seed different RNGs internally.
488pub fn create_binary_array_with_seed<Offset: OffsetSizeTrait>(
489    size: usize,
490    null_density: f32,
491    bytes_seed: u64,
492    bytes_length_seed: u64,
493) -> GenericBinaryArray<Offset> {
494    let rng = &mut StdRng::seed_from_u64(bytes_seed);
495    let range_rng = &mut StdRng::seed_from_u64(bytes_length_seed);
496
497    (0..size)
498        .map(|_| {
499            if rng.random::<f32>() < null_density {
500                None
501            } else {
502                let value = rng
503                    .sample_iter::<u8, _>(StandardUniform)
504                    .take(range_rng.random_range(0..8))
505                    .collect::<Vec<u8>>();
506                Some(value)
507            }
508        })
509        .collect()
510}
511
512/// Creates a random [`GenericBinaryArray`] of a given `size` and `null_density`
513/// filling it with random bytes with lengths in the specified range,
514/// all starting with the provided `prefix`, generated using the provided `seed`.
515///
516pub fn create_binary_array_with_len_range_and_prefix_and_seed<Offset: OffsetSizeTrait>(
517    size: usize,
518    null_density: f32,
519    min_len: usize,
520    max_len: usize,
521    prefix: &[u8],
522    seed: u64,
523) -> GenericBinaryArray<Offset> {
524    assert!(min_len <= max_len, "min_len must be <= max_len");
525    assert!(prefix.len() <= max_len, "Prefix length must be <= max_len");
526
527    let rng = &mut StdRng::seed_from_u64(seed);
528    (0..size)
529        .map(|_| {
530            if rng.random::<f32>() < null_density {
531                None
532            } else {
533                let remaining_len = rng
534                    .random_range(min_len.saturating_sub(prefix.len())..=(max_len - prefix.len()));
535
536                let remaining = rng
537                    .sample_iter::<u8, _>(StandardUniform)
538                    .take(remaining_len);
539
540                let value = prefix.iter().copied().chain(remaining).collect::<Vec<u8>>();
541                Some(value)
542            }
543        })
544        .collect()
545}
546
547/// Creates an random (but fixed-seeded) array of a given size and null density
548pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray {
549    let rng = &mut seedable_rng();
550
551    FixedSizeBinaryArray::try_from_sparse_iter_with_size(
552        (0..size).map(|_| {
553            if rng.random::<f32>() < null_density {
554                None
555            } else {
556                let value = rng
557                    .sample_iter::<u8, _>(StandardUniform)
558                    .take(value_len)
559                    .collect::<Vec<u8>>();
560                Some(value)
561            }
562        }),
563        value_len as i32,
564    )
565    .unwrap()
566}
567
568/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
569/// with the provided values array
570pub fn create_dict_from_values<K>(
571    size: usize,
572    null_density: f32,
573    values: &dyn Array,
574) -> DictionaryArray<K>
575where
576    K: ArrowDictionaryKeyType,
577    StandardUniform: Distribution<K::Native>,
578    K::Native: SampleUniform,
579{
580    let min_key = K::Native::from_usize(0).unwrap();
581    let max_key = K::Native::from_usize(values.len()).unwrap();
582    create_sparse_dict_from_values(size, null_density, values, min_key..max_key)
583}
584
585/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
586/// with the provided values array and key range
587pub fn create_sparse_dict_from_values<K>(
588    size: usize,
589    null_density: f32,
590    values: &dyn Array,
591    key_range: Range<K::Native>,
592) -> DictionaryArray<K>
593where
594    K: ArrowDictionaryKeyType,
595    StandardUniform: Distribution<K::Native>,
596    K::Native: SampleUniform,
597{
598    let mut rng = seedable_rng();
599    let data_type =
600        DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
601
602    let keys: Buffer = (0..size)
603        .map(|_| rng.random_range(key_range.clone()))
604        .collect();
605
606    let nulls: Option<Buffer> = (null_density != 0.).then(|| {
607        (0..size)
608            .map(|_| rng.random_bool(null_density as _))
609            .collect()
610    });
611
612    let data = ArrayDataBuilder::new(data_type)
613        .len(size)
614        .null_bit_buffer(nulls)
615        .add_buffer(keys)
616        .add_child_data(values.to_data())
617        .build()
618        .unwrap();
619
620    DictionaryArray::from(data)
621}
622
623/// Creates a random (but fixed-seeded) f16 array of a given size and nan-value density
624pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
625    let mut rng = seedable_rng();
626
627    (0..size)
628        .map(|_| {
629            if rng.random::<f32>() < nan_density {
630                Some(f16::NAN)
631            } else {
632                Some(f16::from_f32(rng.random()))
633            }
634        })
635        .collect()
636}
637
638/// Creates a random (but fixed-seeded) f32 array of a given size and nan-value density
639pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
640    let mut rng = seedable_rng();
641
642    (0..size)
643        .map(|_| {
644            if rng.random::<f32>() < nan_density {
645                Some(f32::NAN)
646            } else {
647                Some(rng.random())
648            }
649        })
650        .collect()
651}
652
653/// Creates a random (but fixed-seeded) f64 array of a given size and nan-value density
654pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
655    let mut rng = seedable_rng();
656
657    (0..size)
658        .map(|_| {
659            if rng.random::<f32>() < nan_density {
660                Some(f64::NAN)
661            } else {
662                Some(rng.random())
663            }
664        })
665        .collect()
666}