arrow/util/
bench_util.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Utils to make benchmarking easier
19
20use crate::array::*;
21use crate::datatypes::*;
22use crate::util::test_util::seedable_rng;
23use arrow_buffer::{Buffer, IntervalMonthDayNano};
24use half::f16;
25use rand::distributions::uniform::SampleUniform;
26use rand::thread_rng;
27use rand::Rng;
28use rand::SeedableRng;
29use rand::{
30    distributions::{Alphanumeric, Distribution, Standard},
31    prelude::StdRng,
32};
33use std::ops::Range;
34
35/// Creates an random (but fixed-seeded) array of a given size and null density
36pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
37where
38    T: ArrowPrimitiveType,
39    Standard: Distribution<T::Native>,
40{
41    let mut rng = seedable_rng();
42
43    (0..size)
44        .map(|_| {
45            if rng.gen::<f32>() < null_density {
46                None
47            } else {
48                Some(rng.gen())
49            }
50        })
51        .collect()
52}
53
54/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
55/// filling it with random numbers generated using the provided `seed`.
56pub fn create_primitive_array_with_seed<T>(
57    size: usize,
58    null_density: f32,
59    seed: u64,
60) -> PrimitiveArray<T>
61where
62    T: ArrowPrimitiveType,
63    Standard: Distribution<T::Native>,
64{
65    let mut rng = StdRng::seed_from_u64(seed);
66
67    (0..size)
68        .map(|_| {
69            if rng.gen::<f32>() < null_density {
70                None
71            } else {
72                Some(rng.gen())
73            }
74        })
75        .collect()
76}
77
78/// Creates a [`PrimitiveArray`] of a given `size` and `null_density`
79/// filling it with random [`IntervalMonthDayNano`] generated using the provided `seed`.
80pub fn create_month_day_nano_array_with_seed(
81    size: usize,
82    null_density: f32,
83    seed: u64,
84) -> IntervalMonthDayNanoArray {
85    let mut rng = StdRng::seed_from_u64(seed);
86
87    (0..size)
88        .map(|_| {
89            if rng.gen::<f32>() < null_density {
90                None
91            } else {
92                Some(IntervalMonthDayNano::new(rng.gen(), rng.gen(), rng.gen()))
93            }
94        })
95        .collect()
96}
97
98/// Creates a random (but fixed-seeded) array of a given size and null density
99pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray
100where
101    Standard: Distribution<bool>,
102{
103    let mut rng = seedable_rng();
104    (0..size)
105        .map(|_| {
106            if rng.gen::<f32>() < null_density {
107                None
108            } else {
109                let value = rng.gen::<f32>() < true_density;
110                Some(value)
111            }
112        })
113        .collect()
114}
115
116/// Creates a random (but fixed-seeded) string array of a given size and null density.
117///
118/// Strings have a random length
119/// between 0 and 400 alphanumeric characters. `0..400` is chosen to cover a wide range of common string lengths,
120/// which have a dramatic impact on performance of some queries, e.g. LIKE/ILIKE/regex.
121pub fn create_string_array<Offset: OffsetSizeTrait>(
122    size: usize,
123    null_density: f32,
124) -> GenericStringArray<Offset> {
125    create_string_array_with_max_len(size, null_density, 400)
126}
127
128/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
129fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
130    size: usize,
131    null_density: f32,
132    max_str_len: usize,
133) -> GenericStringArray<Offset> {
134    let rng = &mut seedable_rng();
135    (0..size)
136        .map(|_| {
137            if rng.gen::<f32>() < null_density {
138                None
139            } else {
140                let str_len = rng.gen_range(0..max_str_len);
141                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
142                let value = String::from_utf8(value).unwrap();
143                Some(value)
144            }
145        })
146        .collect()
147}
148
149/// Creates a random (but fixed-seeded) array of a given size, null density and length
150pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
151    size: usize,
152    null_density: f32,
153    str_len: usize,
154) -> GenericStringArray<Offset> {
155    let rng = &mut seedable_rng();
156
157    (0..size)
158        .map(|_| {
159            if rng.gen::<f32>() < null_density {
160                None
161            } else {
162                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
163                let value = String::from_utf8(value).unwrap();
164                Some(value)
165            }
166        })
167        .collect()
168}
169
170/// Creates a random (but fixed-seeded) string view array of a given size and null density.
171///
172/// See `create_string_array` above for more details.
173pub fn create_string_view_array(size: usize, null_density: f32) -> StringViewArray {
174    create_string_view_array_with_max_len(size, null_density, 400)
175}
176
177/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length
178fn create_string_view_array_with_max_len(
179    size: usize,
180    null_density: f32,
181    max_str_len: usize,
182) -> StringViewArray {
183    let rng = &mut seedable_rng();
184    (0..size)
185        .map(|_| {
186            if rng.gen::<f32>() < null_density {
187                None
188            } else {
189                let str_len = rng.gen_range(0..max_str_len);
190                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
191                let value = String::from_utf8(value).unwrap();
192                Some(value)
193            }
194        })
195        .collect()
196}
197
198/// Creates a random (but fixed-seeded) array of a given size, null density and length
199pub fn create_string_view_array_with_len(
200    size: usize,
201    null_density: f32,
202    str_len: usize,
203    mixed: bool,
204) -> StringViewArray {
205    let rng = &mut seedable_rng();
206
207    let mut lengths = Vec::with_capacity(size);
208
209    // if mixed, we creates first half that string length small than 12 bytes and second half large than 12 bytes
210    if mixed {
211        for _ in 0..size / 2 {
212            lengths.push(rng.gen_range(1..12));
213        }
214        for _ in size / 2..size {
215            lengths.push(rng.gen_range(12..=std::cmp::max(30, str_len)));
216        }
217    } else {
218        lengths.resize(size, str_len);
219    }
220
221    lengths
222        .into_iter()
223        .map(|len| {
224            if rng.gen::<f32>() < null_density {
225                None
226            } else {
227                let value: Vec<u8> = rng.sample_iter(&Alphanumeric).take(len).collect();
228                Some(String::from_utf8(value).unwrap())
229            }
230        })
231        .collect()
232}
233
234/// Creates an random (but fixed-seeded) array of a given size and null density
235/// consisting of random 4 character alphanumeric strings
236pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
237    size: usize,
238    null_density: f32,
239    str_len: usize,
240) -> DictionaryArray<K> {
241    let rng = &mut seedable_rng();
242
243    let data: Vec<_> = (0..size)
244        .map(|_| {
245            if rng.gen::<f32>() < null_density {
246                None
247            } else {
248                let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
249                let value = String::from_utf8(value).unwrap();
250                Some(value)
251            }
252        })
253        .collect();
254
255    data.iter().map(|x| x.as_deref()).collect()
256}
257
258/// Create primitive run array for given logical and physical array lengths
259pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
260    logical_array_len: usize,
261    physical_array_len: usize,
262) -> RunArray<R> {
263    assert!(logical_array_len >= physical_array_len);
264    // typical length of each run
265    let run_len = logical_array_len / physical_array_len;
266
267    // Some runs should have extra length
268    let mut run_len_extra = logical_array_len % physical_array_len;
269
270    let mut values: Vec<V::Native> = (0..physical_array_len)
271        .flat_map(|s| {
272            let mut take_len = run_len;
273            if run_len_extra > 0 {
274                take_len += 1;
275                run_len_extra -= 1;
276            }
277            std::iter::repeat(V::Native::from_usize(s).unwrap()).take(take_len)
278        })
279        .collect();
280    while values.len() < logical_array_len {
281        let last_val = values[values.len() - 1];
282        values.push(last_val);
283    }
284    let mut builder = PrimitiveRunBuilder::<R, V>::with_capacity(physical_array_len);
285    builder.extend(values.into_iter().map(Some));
286
287    builder.finish()
288}
289
290/// Create string array to be used by run array builder. The string array
291/// will result in run array with physical length of `physical_array_len`
292/// and logical length of `logical_array_len`
293pub fn create_string_array_for_runs(
294    physical_array_len: usize,
295    logical_array_len: usize,
296    string_len: usize,
297) -> Vec<String> {
298    assert!(logical_array_len >= physical_array_len);
299    let mut rng = thread_rng();
300
301    // typical length of each run
302    let run_len = logical_array_len / physical_array_len;
303
304    // Some runs should have extra length
305    let mut run_len_extra = logical_array_len % physical_array_len;
306
307    let mut values: Vec<String> = (0..physical_array_len)
308        .map(|_| (0..string_len).map(|_| rng.gen::<char>()).collect())
309        .flat_map(|s| {
310            let mut take_len = run_len;
311            if run_len_extra > 0 {
312                take_len += 1;
313                run_len_extra -= 1;
314            }
315            std::iter::repeat(s).take(take_len)
316        })
317        .collect();
318    while values.len() < logical_array_len {
319        let last_val = values[values.len() - 1].clone();
320        values.push(last_val);
321    }
322    values
323}
324
325/// Creates an random (but fixed-seeded) binary array of a given size and null density
326pub fn create_binary_array<Offset: OffsetSizeTrait>(
327    size: usize,
328    null_density: f32,
329) -> GenericBinaryArray<Offset> {
330    let rng = &mut seedable_rng();
331    let range_rng = &mut seedable_rng();
332
333    (0..size)
334        .map(|_| {
335            if rng.gen::<f32>() < null_density {
336                None
337            } else {
338                let value = rng
339                    .sample_iter::<u8, _>(Standard)
340                    .take(range_rng.gen_range(0..8))
341                    .collect::<Vec<u8>>();
342                Some(value)
343            }
344        })
345        .collect()
346}
347
348/// Creates an random (but fixed-seeded) array of a given size and null density
349pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray {
350    let rng = &mut seedable_rng();
351
352    FixedSizeBinaryArray::try_from_sparse_iter_with_size(
353        (0..size).map(|_| {
354            if rng.gen::<f32>() < null_density {
355                None
356            } else {
357                let value = rng
358                    .sample_iter::<u8, _>(Standard)
359                    .take(value_len)
360                    .collect::<Vec<u8>>();
361                Some(value)
362            }
363        }),
364        value_len as i32,
365    )
366    .unwrap()
367}
368
369/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
370/// with the provided values array
371pub fn create_dict_from_values<K>(
372    size: usize,
373    null_density: f32,
374    values: &dyn Array,
375) -> DictionaryArray<K>
376where
377    K: ArrowDictionaryKeyType,
378    Standard: Distribution<K::Native>,
379    K::Native: SampleUniform,
380{
381    let min_key = K::Native::from_usize(0).unwrap();
382    let max_key = K::Native::from_usize(values.len()).unwrap();
383    create_sparse_dict_from_values(size, null_density, values, min_key..max_key)
384}
385
386/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
387/// with the provided values array and key range
388pub fn create_sparse_dict_from_values<K>(
389    size: usize,
390    null_density: f32,
391    values: &dyn Array,
392    key_range: Range<K::Native>,
393) -> DictionaryArray<K>
394where
395    K: ArrowDictionaryKeyType,
396    Standard: Distribution<K::Native>,
397    K::Native: SampleUniform,
398{
399    let mut rng = seedable_rng();
400    let data_type =
401        DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
402
403    let keys: Buffer = (0..size)
404        .map(|_| rng.gen_range(key_range.clone()))
405        .collect();
406
407    let nulls: Option<Buffer> =
408        (null_density != 0.).then(|| (0..size).map(|_| rng.gen_bool(null_density as _)).collect());
409
410    let data = ArrayDataBuilder::new(data_type)
411        .len(size)
412        .null_bit_buffer(nulls)
413        .add_buffer(keys)
414        .add_child_data(values.to_data())
415        .build()
416        .unwrap();
417
418    DictionaryArray::from(data)
419}
420
421/// Creates a random (but fixed-seeded) f16 array of a given size and nan-value density
422pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
423    let mut rng = seedable_rng();
424
425    (0..size)
426        .map(|_| {
427            if rng.gen::<f32>() < nan_density {
428                Some(f16::NAN)
429            } else {
430                Some(f16::from_f32(rng.gen()))
431            }
432        })
433        .collect()
434}
435
436/// Creates a random (but fixed-seeded) f32 array of a given size and nan-value density
437pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
438    let mut rng = seedable_rng();
439
440    (0..size)
441        .map(|_| {
442            if rng.gen::<f32>() < nan_density {
443                Some(f32::NAN)
444            } else {
445                Some(rng.gen())
446            }
447        })
448        .collect()
449}
450
451/// Creates a random (but fixed-seeded) f64 array of a given size and nan-value density
452pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
453    let mut rng = seedable_rng();
454
455    (0..size)
456        .map(|_| {
457            if rng.gen::<f32>() < nan_density {
458                Some(f64::NAN)
459            } else {
460                Some(rng.gen())
461            }
462        })
463        .collect()
464}