arrow/util/
bench_util.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Utils to make benchmarking easier
19
20use crate::array::*;
21use crate::datatypes::*;
22use crate::util::test_util::seedable_rng;
23use arrow_buffer::{Buffer, IntervalMonthDayNano};
24use half::f16;
25use rand::distr::uniform::SampleUniform;
26use rand::rng;
27use rand::Rng;
28use rand::SeedableRng;
29use rand::{
30    distr::{Alphanumeric, Distribution, StandardUniform},
31    prelude::StdRng,
32};
33use std::ops::Range;
34
35/// Creates an random (but fixed-seeded) array of a given size and null density
36pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
37where
38    T: ArrowPrimitiveType,
39    StandardUniform: Distribution<T::Native>,
40{
41    let mut rng = seedable_rng();
42
43    (0..size)
44        .map(|_| {
45            if rng.random::<f32>() < null_density {
46                None
47            } else {
48                Some(rng.random())
49            }
50        })
51        .collect()
52}
53
54/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
55/// filling it with random numbers generated using the provided `seed`.
56pub fn create_primitive_array_with_seed<T>(
57    size: usize,
58    null_density: f32,
59    seed: u64,
60) -> PrimitiveArray<T>
61where
62    T: ArrowPrimitiveType,
63    StandardUniform: Distribution<T::Native>,
64{
65    let mut rng = StdRng::seed_from_u64(seed);
66
67    (0..size)
68        .map(|_| {
69            if rng.random::<f32>() < null_density {
70                None
71            } else {
72                Some(rng.random())
73            }
74        })
75        .collect()
76}
77
78/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
79/// filling it with random [`IntervalMonthDayNano`] generated using the provided `seed`.
80pub fn create_month_day_nano_array_with_seed(
81    size: usize,
82    null_density: f32,
83    seed: u64,
84) -> IntervalMonthDayNanoArray {
85    let mut rng = StdRng::seed_from_u64(seed);
86
87    (0..size)
88        .map(|_| {
89            if rng.random::<f32>() < null_density {
90                None
91            } else {
92                Some(IntervalMonthDayNano::new(
93                    rng.random(),
94                    rng.random(),
95                    rng.random(),
96                ))
97            }
98        })
99        .collect()
100}
101
102/// Creates a random (but fixed-seeded) array of a given size and null density
103pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray
104where
105    StandardUniform: Distribution<bool>,
106{
107    let mut rng = seedable_rng();
108    (0..size)
109        .map(|_| {
110            if rng.random::<f32>() < null_density {
111                None
112            } else {
113                let value = rng.random::<f32>() < true_density;
114                Some(value)
115            }
116        })
117        .collect()
118}
119
120/// Creates a random (but fixed-seeded) string array of a given size and null density.
121///
122/// Strings have a random length
123/// between 0 and 400 alphanumeric characters. `0..400` is chosen to cover a wide range of common string lengths,
124/// which have a dramatic impact on performance of some queries, e.g. LIKE/ILIKE/regex.
125pub fn create_string_array<Offset: OffsetSizeTrait>(
126    size: usize,
127    null_density: f32,
128) -> GenericStringArray<Offset> {
129    create_string_array_with_max_len(size, null_density, 400)
130}
131
132/// Creates longer string array with same prefix, the prefix should be larger than 4 bytes,
133/// and the string length should be larger than 12 bytes
134/// so that we can compare the performance with StringViewArray, because StringViewArray has 4 bytes inline for view
135pub fn create_longer_string_array_with_same_prefix<Offset: OffsetSizeTrait>(
136    size: usize,
137    null_density: f32,
138) -> GenericStringArray<Offset> {
139    create_string_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
140}
141
142/// Creates longer string view array with same prefix, the prefix should be larger than 4 bytes,
143/// and the string length should be larger than 12 bytes
144/// so that we can compare the StringArray performance with StringViewArray, because StringViewArray has 4 bytes inline for view
145pub fn create_longer_string_view_array_with_same_prefix(
146    size: usize,
147    null_density: f32,
148) -> StringViewArray {
149    create_string_view_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
150}
151
152fn create_string_array_with_len_range_and_prefix<Offset: OffsetSizeTrait>(
153    size: usize,
154    null_density: f32,
155    min_str_len: usize,
156    max_str_len: usize,
157    prefix: &str,
158) -> GenericStringArray<Offset> {
159    assert!(
160        min_str_len <= max_str_len,
161        "min_str_len must be <= max_str_len"
162    );
163    assert!(
164        prefix.len() <= max_str_len,
165        "Prefix length must be <= max_str_len"
166    );
167
168    let rng = &mut seedable_rng();
169    (0..size)
170        .map(|_| {
171            if rng.random::<f32>() < null_density {
172                None
173            } else {
174                let remaining_len = rng.random_range(
175                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
176                );
177
178                let mut value = prefix.to_string();
179                value.extend(
180                    rng.sample_iter(&Alphanumeric)
181                        .take(remaining_len)
182                        .map(char::from),
183                );
184
185                Some(value)
186            }
187        })
188        .collect()
189}
190
191fn create_string_view_array_with_len_range_and_prefix(
192    size: usize,
193    null_density: f32,
194    min_str_len: usize,
195    max_str_len: usize,
196    prefix: &str,
197) -> StringViewArray {
198    assert!(
199        min_str_len <= max_str_len,
200        "min_str_len must be <= max_str_len"
201    );
202    assert!(
203        prefix.len() <= max_str_len,
204        "Prefix length must be <= max_str_len"
205    );
206
207    let rng = &mut seedable_rng();
208    (0..size)
209        .map(|_| {
210            if rng.random::<f32>() < null_density {
211                None
212            } else {
213                let remaining_len = rng.random_range(
214                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
215                );
216
217                let mut value = prefix.to_string();
218                value.extend(
219                    rng.sample_iter(&Alphanumeric)
220                        .take(remaining_len)
221                        .map(char::from),
222                );
223
224                Some(value)
225            }
226        })
227        .collect()
228}
229
230/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
231fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
232    size: usize,
233    null_density: f32,
234    max_str_len: usize,
235) -> GenericStringArray<Offset> {
236    let rng = &mut seedable_rng();
237    (0..size)
238        .map(|_| {
239            if rng.random::<f32>() < null_density {
240                None
241            } else {
242                let str_len = rng.random_range(0..max_str_len);
243                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
244                let value = String::from_utf8(value).unwrap();
245                Some(value)
246            }
247        })
248        .collect()
249}
250
251/// Creates a random (but fixed-seeded) array of a given size, null density and length
252pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
253    size: usize,
254    null_density: f32,
255    str_len: usize,
256) -> GenericStringArray<Offset> {
257    let rng = &mut seedable_rng();
258
259    (0..size)
260        .map(|_| {
261            if rng.random::<f32>() < null_density {
262                None
263            } else {
264                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
265                let value = String::from_utf8(value).unwrap();
266                Some(value)
267            }
268        })
269        .collect()
270}
271
272/// Creates a random (but fixed-seeded) string view array of a given size and null density.
273///
274/// See `create_string_array` above for more details.
275pub fn create_string_view_array(size: usize, null_density: f32) -> StringViewArray {
276    create_string_view_array_with_max_len(size, null_density, 400)
277}
278
279/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
280pub fn create_string_view_array_with_max_len(
281    size: usize,
282    null_density: f32,
283    max_str_len: usize,
284) -> StringViewArray {
285    let rng = &mut seedable_rng();
286    (0..size)
287        .map(|_| {
288            if rng.random::<f32>() < null_density {
289                None
290            } else {
291                let str_len = rng.random_range(0..max_str_len);
292                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
293                let value = String::from_utf8(value).unwrap();
294                Some(value)
295            }
296        })
297        .collect()
298}
299
300/// Creates a random (but fixed-seeded) array of a given size, null density and length
301pub fn create_string_view_array_with_fixed_len(
302    size: usize,
303    null_density: f32,
304    str_len: usize,
305) -> StringViewArray {
306    let rng = &mut seedable_rng();
307    (0..size)
308        .map(|_| {
309            if rng.random::<f32>() < null_density {
310                None
311            } else {
312                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
313                let value = String::from_utf8(value).unwrap();
314                Some(value)
315            }
316        })
317        .collect()
318}
319
320/// Creates a random (but fixed-seeded) array of a given size, null density and length
321pub fn create_string_view_array_with_len(
322    size: usize,
323    null_density: f32,
324    str_len: usize,
325    mixed: bool,
326) -> StringViewArray {
327    let rng = &mut seedable_rng();
328
329    let mut lengths = Vec::with_capacity(size);
330
331    // if mixed, we creates first half that string length small than 12 bytes and second half large than 12 bytes
332    if mixed {
333        for _ in 0..size / 2 {
334            lengths.push(rng.random_range(1..12));
335        }
336        for _ in size / 2..size {
337            lengths.push(rng.random_range(12..=std::cmp::max(30, str_len)));
338        }
339    } else {
340        lengths.resize(size, str_len);
341    }
342
343    lengths
344        .into_iter()
345        .map(|len| {
346            if rng.random::<f32>() < null_density {
347                None
348            } else {
349                let value: Vec<u8> = rng.sample_iter(&Alphanumeric).take(len).collect();
350                Some(String::from_utf8(value).unwrap())
351            }
352        })
353        .collect()
354}
355
356/// Creates an random (but fixed-seeded) array of a given size and null density
357/// consisting of random 4 character alphanumeric strings
358pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
359    size: usize,
360    null_density: f32,
361    str_len: usize,
362) -> DictionaryArray<K> {
363    let rng = &mut seedable_rng();
364
365    let data: Vec<_> = (0..size)
366        .map(|_| {
367            if rng.random::<f32>() < null_density {
368                None
369            } else {
370                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
371                let value = String::from_utf8(value).unwrap();
372                Some(value)
373            }
374        })
375        .collect();
376
377    data.iter().map(|x| x.as_deref()).collect()
378}
379
380/// Create primitive run array for given logical and physical array lengths
381pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
382    logical_array_len: usize,
383    physical_array_len: usize,
384) -> RunArray<R> {
385    assert!(logical_array_len >= physical_array_len);
386    // typical length of each run
387    let run_len = logical_array_len / physical_array_len;
388
389    // Some runs should have extra length
390    let mut run_len_extra = logical_array_len % physical_array_len;
391
392    let mut values: Vec<V::Native> = (0..physical_array_len)
393        .flat_map(|s| {
394            let mut take_len = run_len;
395            if run_len_extra > 0 {
396                take_len += 1;
397                run_len_extra -= 1;
398            }
399            std::iter::repeat(V::Native::from_usize(s).unwrap()).take(take_len)
400        })
401        .collect();
402    while values.len() < logical_array_len {
403        let last_val = values[values.len() - 1];
404        values.push(last_val);
405    }
406    let mut builder = PrimitiveRunBuilder::<R, V>::with_capacity(physical_array_len);
407    builder.extend(values.into_iter().map(Some));
408
409    builder.finish()
410}
411
412/// Create string array to be used by run array builder. The string array
413/// will result in run array with physical length of `physical_array_len`
414/// and logical length of `logical_array_len`
415pub fn create_string_array_for_runs(
416    physical_array_len: usize,
417    logical_array_len: usize,
418    string_len: usize,
419) -> Vec<String> {
420    assert!(logical_array_len >= physical_array_len);
421    let mut rng = rng();
422
423    // typical length of each run
424    let run_len = logical_array_len / physical_array_len;
425
426    // Some runs should have extra length
427    let mut run_len_extra = logical_array_len % physical_array_len;
428
429    let mut values: Vec<String> = (0..physical_array_len)
430        .map(|_| (0..string_len).map(|_| rng.random::<char>()).collect())
431        .flat_map(|s| {
432            let mut take_len = run_len;
433            if run_len_extra > 0 {
434                take_len += 1;
435                run_len_extra -= 1;
436            }
437            std::iter::repeat(s).take(take_len)
438        })
439        .collect();
440    while values.len() < logical_array_len {
441        let last_val = values[values.len() - 1].clone();
442        values.push(last_val);
443    }
444    values
445}
446
447/// Creates an random (but fixed-seeded) binary array of a given size and null density
448pub fn create_binary_array<Offset: OffsetSizeTrait>(
449    size: usize,
450    null_density: f32,
451) -> GenericBinaryArray<Offset> {
452    let rng = &mut seedable_rng();
453    let range_rng = &mut seedable_rng();
454
455    (0..size)
456        .map(|_| {
457            if rng.random::<f32>() < null_density {
458                None
459            } else {
460                let value = rng
461                    .sample_iter::<u8, _>(StandardUniform)
462                    .take(range_rng.random_range(0..8))
463                    .collect::<Vec<u8>>();
464                Some(value)
465            }
466        })
467        .collect()
468}
469
470/// Creates an random (but fixed-seeded) array of a given size and null density
471pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray {
472    let rng = &mut seedable_rng();
473
474    FixedSizeBinaryArray::try_from_sparse_iter_with_size(
475        (0..size).map(|_| {
476            if rng.random::<f32>() < null_density {
477                None
478            } else {
479                let value = rng
480                    .sample_iter::<u8, _>(StandardUniform)
481                    .take(value_len)
482                    .collect::<Vec<u8>>();
483                Some(value)
484            }
485        }),
486        value_len as i32,
487    )
488    .unwrap()
489}
490
491/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
492/// with the provided values array
493pub fn create_dict_from_values<K>(
494    size: usize,
495    null_density: f32,
496    values: &dyn Array,
497) -> DictionaryArray<K>
498where
499    K: ArrowDictionaryKeyType,
500    StandardUniform: Distribution<K::Native>,
501    K::Native: SampleUniform,
502{
503    let min_key = K::Native::from_usize(0).unwrap();
504    let max_key = K::Native::from_usize(values.len()).unwrap();
505    create_sparse_dict_from_values(size, null_density, values, min_key..max_key)
506}
507
508/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
509/// with the provided values array and key range
510pub fn create_sparse_dict_from_values<K>(
511    size: usize,
512    null_density: f32,
513    values: &dyn Array,
514    key_range: Range<K::Native>,
515) -> DictionaryArray<K>
516where
517    K: ArrowDictionaryKeyType,
518    StandardUniform: Distribution<K::Native>,
519    K::Native: SampleUniform,
520{
521    let mut rng = seedable_rng();
522    let data_type =
523        DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
524
525    let keys: Buffer = (0..size)
526        .map(|_| rng.random_range(key_range.clone()))
527        .collect();
528
529    let nulls: Option<Buffer> = (null_density != 0.).then(|| {
530        (0..size)
531            .map(|_| rng.random_bool(null_density as _))
532            .collect()
533    });
534
535    let data = ArrayDataBuilder::new(data_type)
536        .len(size)
537        .null_bit_buffer(nulls)
538        .add_buffer(keys)
539        .add_child_data(values.to_data())
540        .build()
541        .unwrap();
542
543    DictionaryArray::from(data)
544}
545
546/// Creates a random (but fixed-seeded) f16 array of a given size and nan-value density
547pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
548    let mut rng = seedable_rng();
549
550    (0..size)
551        .map(|_| {
552            if rng.random::<f32>() < nan_density {
553                Some(f16::NAN)
554            } else {
555                Some(f16::from_f32(rng.random()))
556            }
557        })
558        .collect()
559}
560
561/// Creates a random (but fixed-seeded) f32 array of a given size and nan-value density
562pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
563    let mut rng = seedable_rng();
564
565    (0..size)
566        .map(|_| {
567            if rng.random::<f32>() < nan_density {
568                Some(f32::NAN)
569            } else {
570                Some(rng.random())
571            }
572        })
573        .collect()
574}
575
576/// Creates a random (but fixed-seeded) f64 array of a given size and nan-value density
577pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
578    let mut rng = seedable_rng();
579
580    (0..size)
581        .map(|_| {
582            if rng.random::<f32>() < nan_density {
583                Some(f64::NAN)
584            } else {
585                Some(rng.random())
586            }
587        })
588        .collect()
589}