arrow/util/
bench_util.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Utils to make benchmarking easier
19
20use crate::array::*;
21use crate::datatypes::*;
22use crate::util::test_util::seedable_rng;
23use arrow_buffer::{Buffer, IntervalMonthDayNano};
24use half::f16;
25use rand::distr::uniform::SampleUniform;
26use rand::rng;
27use rand::Rng;
28use rand::SeedableRng;
29use rand::{
30    distr::{Alphanumeric, Distribution, StandardUniform},
31    prelude::StdRng,
32};
33use std::ops::Range;
34
35/// Creates an random (but fixed-seeded) array of a given size and null density
36pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
37where
38    T: ArrowPrimitiveType,
39    StandardUniform: Distribution<T::Native>,
40{
41    let mut rng = seedable_rng();
42
43    (0..size)
44        .map(|_| {
45            if rng.random::<f32>() < null_density {
46                None
47            } else {
48                Some(rng.random())
49            }
50        })
51        .collect()
52}
53
54/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
55/// filling it with random numbers generated using the provided `seed`.
56pub fn create_primitive_array_with_seed<T>(
57    size: usize,
58    null_density: f32,
59    seed: u64,
60) -> PrimitiveArray<T>
61where
62    T: ArrowPrimitiveType,
63    StandardUniform: Distribution<T::Native>,
64{
65    let mut rng = StdRng::seed_from_u64(seed);
66
67    (0..size)
68        .map(|_| {
69            if rng.random::<f32>() < null_density {
70                None
71            } else {
72                Some(rng.random())
73            }
74        })
75        .collect()
76}
77
78/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
79/// filling it with random [`IntervalMonthDayNano`] generated using the provided `seed`.
80pub fn create_month_day_nano_array_with_seed(
81    size: usize,
82    null_density: f32,
83    seed: u64,
84) -> IntervalMonthDayNanoArray {
85    let mut rng = StdRng::seed_from_u64(seed);
86
87    (0..size)
88        .map(|_| {
89            if rng.random::<f32>() < null_density {
90                None
91            } else {
92                Some(IntervalMonthDayNano::new(
93                    rng.random(),
94                    rng.random(),
95                    rng.random(),
96                ))
97            }
98        })
99        .collect()
100}
101
102/// Creates a random (but fixed-seeded) array of a given size and null density
103pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray
104where
105    StandardUniform: Distribution<bool>,
106{
107    let mut rng = seedable_rng();
108    (0..size)
109        .map(|_| {
110            if rng.random::<f32>() < null_density {
111                None
112            } else {
113                let value = rng.random::<f32>() < true_density;
114                Some(value)
115            }
116        })
117        .collect()
118}
119
120/// Creates a random (but fixed-seeded) string array of a given size and null density.
121///
122/// Strings have a random length
123/// between 0 and 400 alphanumeric characters. `0..400` is chosen to cover a wide range of common string lengths,
124/// which have a dramatic impact on performance of some queries, e.g. LIKE/ILIKE/regex.
125pub fn create_string_array<Offset: OffsetSizeTrait>(
126    size: usize,
127    null_density: f32,
128) -> GenericStringArray<Offset> {
129    create_string_array_with_max_len(size, null_density, 400)
130}
131
132/// Creates longer string array with same prefix, the prefix should be larger than 4 bytes,
133/// and the string length should be larger than 12 bytes
134/// so that we can compare the performance with StringViewArray, because StringViewArray has 4 bytes inline for view
135pub fn create_longer_string_array_with_same_prefix<Offset: OffsetSizeTrait>(
136    size: usize,
137    null_density: f32,
138) -> GenericStringArray<Offset> {
139    create_string_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
140}
141
142/// Creates longer string view array with same prefix, the prefix should be larger than 4 bytes,
143/// and the string length should be larger than 12 bytes
144/// so that we can compare the StringArray performance with StringViewArray, because StringViewArray has 4 bytes inline for view
145pub fn create_longer_string_view_array_with_same_prefix(
146    size: usize,
147    null_density: f32,
148) -> StringViewArray {
149    create_string_view_array_with_len_range_and_prefix(size, null_density, 13, 100, "prefix_")
150}
151
152fn create_string_array_with_len_range_and_prefix<Offset: OffsetSizeTrait>(
153    size: usize,
154    null_density: f32,
155    min_str_len: usize,
156    max_str_len: usize,
157    prefix: &str,
158) -> GenericStringArray<Offset> {
159    assert!(
160        min_str_len <= max_str_len,
161        "min_str_len must be <= max_str_len"
162    );
163    assert!(
164        prefix.len() <= max_str_len,
165        "Prefix length must be <= max_str_len"
166    );
167
168    let rng = &mut seedable_rng();
169    (0..size)
170        .map(|_| {
171            if rng.random::<f32>() < null_density {
172                None
173            } else {
174                let remaining_len = rng.random_range(
175                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
176                );
177
178                let mut value = prefix.to_string();
179                value.extend(
180                    rng.sample_iter(&Alphanumeric)
181                        .take(remaining_len)
182                        .map(char::from),
183                );
184
185                Some(value)
186            }
187        })
188        .collect()
189}
190
191fn create_string_view_array_with_len_range_and_prefix(
192    size: usize,
193    null_density: f32,
194    min_str_len: usize,
195    max_str_len: usize,
196    prefix: &str,
197) -> StringViewArray {
198    assert!(
199        min_str_len <= max_str_len,
200        "min_str_len must be <= max_str_len"
201    );
202    assert!(
203        prefix.len() <= max_str_len,
204        "Prefix length must be <= max_str_len"
205    );
206
207    let rng = &mut seedable_rng();
208    (0..size)
209        .map(|_| {
210            if rng.random::<f32>() < null_density {
211                None
212            } else {
213                let remaining_len = rng.random_range(
214                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len()),
215                );
216
217                let mut value = prefix.to_string();
218                value.extend(
219                    rng.sample_iter(&Alphanumeric)
220                        .take(remaining_len)
221                        .map(char::from),
222                );
223
224                Some(value)
225            }
226        })
227        .collect()
228}
229
230/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
231fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
232    size: usize,
233    null_density: f32,
234    max_str_len: usize,
235) -> GenericStringArray<Offset> {
236    let rng = &mut seedable_rng();
237    (0..size)
238        .map(|_| {
239            if rng.random::<f32>() < null_density {
240                None
241            } else {
242                let str_len = rng.random_range(0..max_str_len);
243                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
244                let value = String::from_utf8(value).unwrap();
245                Some(value)
246            }
247        })
248        .collect()
249}
250
251/// Creates a random (but fixed-seeded) array of a given size, null density and length
252pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
253    size: usize,
254    null_density: f32,
255    str_len: usize,
256) -> GenericStringArray<Offset> {
257    let rng = &mut seedable_rng();
258
259    (0..size)
260        .map(|_| {
261            if rng.random::<f32>() < null_density {
262                None
263            } else {
264                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
265                let value = String::from_utf8(value).unwrap();
266                Some(value)
267            }
268        })
269        .collect()
270}
271
272/// Creates a random (but fixed-seeded) string view array of a given size and null density.
273///
274/// See `create_string_array` above for more details.
275pub fn create_string_view_array(size: usize, null_density: f32) -> StringViewArray {
276    create_string_view_array_with_max_len(size, null_density, 400)
277}
278
279/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
280fn create_string_view_array_with_max_len(
281    size: usize,
282    null_density: f32,
283    max_str_len: usize,
284) -> StringViewArray {
285    let rng = &mut seedable_rng();
286    (0..size)
287        .map(|_| {
288            if rng.random::<f32>() < null_density {
289                None
290            } else {
291                let str_len = rng.random_range(0..max_str_len);
292                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
293                let value = String::from_utf8(value).unwrap();
294                Some(value)
295            }
296        })
297        .collect()
298}
299
300/// Creates a random (but fixed-seeded) array of a given size, null density and length
301pub fn create_string_view_array_with_len(
302    size: usize,
303    null_density: f32,
304    str_len: usize,
305    mixed: bool,
306) -> StringViewArray {
307    let rng = &mut seedable_rng();
308
309    let mut lengths = Vec::with_capacity(size);
310
311    // if mixed, we creates first half that string length small than 12 bytes and second half large than 12 bytes
312    if mixed {
313        for _ in 0..size / 2 {
314            lengths.push(rng.random_range(1..12));
315        }
316        for _ in size / 2..size {
317            lengths.push(rng.random_range(12..=std::cmp::max(30, str_len)));
318        }
319    } else {
320        lengths.resize(size, str_len);
321    }
322
323    lengths
324        .into_iter()
325        .map(|len| {
326            if rng.random::<f32>() < null_density {
327                None
328            } else {
329                let value: Vec<u8> = rng.sample_iter(&Alphanumeric).take(len).collect();
330                Some(String::from_utf8(value).unwrap())
331            }
332        })
333        .collect()
334}
335
336/// Creates an random (but fixed-seeded) array of a given size and null density
337/// consisting of random 4 character alphanumeric strings
338pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
339    size: usize,
340    null_density: f32,
341    str_len: usize,
342) -> DictionaryArray<K> {
343    let rng = &mut seedable_rng();
344
345    let data: Vec<_> = (0..size)
346        .map(|_| {
347            if rng.random::<f32>() < null_density {
348                None
349            } else {
350                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
351                let value = String::from_utf8(value).unwrap();
352                Some(value)
353            }
354        })
355        .collect();
356
357    data.iter().map(|x| x.as_deref()).collect()
358}
359
360/// Create primitive run array for given logical and physical array lengths
361pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
362    logical_array_len: usize,
363    physical_array_len: usize,
364) -> RunArray<R> {
365    assert!(logical_array_len >= physical_array_len);
366    // typical length of each run
367    let run_len = logical_array_len / physical_array_len;
368
369    // Some runs should have extra length
370    let mut run_len_extra = logical_array_len % physical_array_len;
371
372    let mut values: Vec<V::Native> = (0..physical_array_len)
373        .flat_map(|s| {
374            let mut take_len = run_len;
375            if run_len_extra > 0 {
376                take_len += 1;
377                run_len_extra -= 1;
378            }
379            std::iter::repeat(V::Native::from_usize(s).unwrap()).take(take_len)
380        })
381        .collect();
382    while values.len() < logical_array_len {
383        let last_val = values[values.len() - 1];
384        values.push(last_val);
385    }
386    let mut builder = PrimitiveRunBuilder::<R, V>::with_capacity(physical_array_len);
387    builder.extend(values.into_iter().map(Some));
388
389    builder.finish()
390}
391
392/// Create string array to be used by run array builder. The string array
393/// will result in run array with physical length of `physical_array_len`
394/// and logical length of `logical_array_len`
395pub fn create_string_array_for_runs(
396    physical_array_len: usize,
397    logical_array_len: usize,
398    string_len: usize,
399) -> Vec<String> {
400    assert!(logical_array_len >= physical_array_len);
401    let mut rng = rng();
402
403    // typical length of each run
404    let run_len = logical_array_len / physical_array_len;
405
406    // Some runs should have extra length
407    let mut run_len_extra = logical_array_len % physical_array_len;
408
409    let mut values: Vec<String> = (0..physical_array_len)
410        .map(|_| (0..string_len).map(|_| rng.random::<char>()).collect())
411        .flat_map(|s| {
412            let mut take_len = run_len;
413            if run_len_extra > 0 {
414                take_len += 1;
415                run_len_extra -= 1;
416            }
417            std::iter::repeat(s).take(take_len)
418        })
419        .collect();
420    while values.len() < logical_array_len {
421        let last_val = values[values.len() - 1].clone();
422        values.push(last_val);
423    }
424    values
425}
426
427/// Creates an random (but fixed-seeded) binary array of a given size and null density
428pub fn create_binary_array<Offset: OffsetSizeTrait>(
429    size: usize,
430    null_density: f32,
431) -> GenericBinaryArray<Offset> {
432    let rng = &mut seedable_rng();
433    let range_rng = &mut seedable_rng();
434
435    (0..size)
436        .map(|_| {
437            if rng.random::<f32>() < null_density {
438                None
439            } else {
440                let value = rng
441                    .sample_iter::<u8, _>(StandardUniform)
442                    .take(range_rng.random_range(0..8))
443                    .collect::<Vec<u8>>();
444                Some(value)
445            }
446        })
447        .collect()
448}
449
450/// Creates an random (but fixed-seeded) array of a given size and null density
451pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray {
452    let rng = &mut seedable_rng();
453
454    FixedSizeBinaryArray::try_from_sparse_iter_with_size(
455        (0..size).map(|_| {
456            if rng.random::<f32>() < null_density {
457                None
458            } else {
459                let value = rng
460                    .sample_iter::<u8, _>(StandardUniform)
461                    .take(value_len)
462                    .collect::<Vec<u8>>();
463                Some(value)
464            }
465        }),
466        value_len as i32,
467    )
468    .unwrap()
469}
470
471/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
472/// with the provided values array
473pub fn create_dict_from_values<K>(
474    size: usize,
475    null_density: f32,
476    values: &dyn Array,
477) -> DictionaryArray<K>
478where
479    K: ArrowDictionaryKeyType,
480    StandardUniform: Distribution<K::Native>,
481    K::Native: SampleUniform,
482{
483    let min_key = K::Native::from_usize(0).unwrap();
484    let max_key = K::Native::from_usize(values.len()).unwrap();
485    create_sparse_dict_from_values(size, null_density, values, min_key..max_key)
486}
487
488/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
489/// with the provided values array and key range
490pub fn create_sparse_dict_from_values<K>(
491    size: usize,
492    null_density: f32,
493    values: &dyn Array,
494    key_range: Range<K::Native>,
495) -> DictionaryArray<K>
496where
497    K: ArrowDictionaryKeyType,
498    StandardUniform: Distribution<K::Native>,
499    K::Native: SampleUniform,
500{
501    let mut rng = seedable_rng();
502    let data_type =
503        DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
504
505    let keys: Buffer = (0..size)
506        .map(|_| rng.random_range(key_range.clone()))
507        .collect();
508
509    let nulls: Option<Buffer> = (null_density != 0.).then(|| {
510        (0..size)
511            .map(|_| rng.random_bool(null_density as _))
512            .collect()
513    });
514
515    let data = ArrayDataBuilder::new(data_type)
516        .len(size)
517        .null_bit_buffer(nulls)
518        .add_buffer(keys)
519        .add_child_data(values.to_data())
520        .build()
521        .unwrap();
522
523    DictionaryArray::from(data)
524}
525
526/// Creates a random (but fixed-seeded) f16 array of a given size and nan-value density
527pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
528    let mut rng = seedable_rng();
529
530    (0..size)
531        .map(|_| {
532            if rng.random::<f32>() < nan_density {
533                Some(f16::NAN)
534            } else {
535                Some(f16::from_f32(rng.random()))
536            }
537        })
538        .collect()
539}
540
541/// Creates a random (but fixed-seeded) f32 array of a given size and nan-value density
542pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
543    let mut rng = seedable_rng();
544
545    (0..size)
546        .map(|_| {
547            if rng.random::<f32>() < nan_density {
548                Some(f32::NAN)
549            } else {
550                Some(rng.random())
551            }
552        })
553        .collect()
554}
555
556/// Creates a random (but fixed-seeded) f64 array of a given size and nan-value density
557pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
558    let mut rng = seedable_rng();
559
560    (0..size)
561        .map(|_| {
562            if rng.random::<f32>() < nan_density {
563                Some(f64::NAN)
564            } else {
565                Some(rng.random())
566            }
567        })
568        .collect()
569}