Skip to main content

arrow/util/
data_gen.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Utilities to generate random arrays and batches
19
20use std::sync::Arc;
21
22use rand::{
23    Rng,
24    distr::uniform::{SampleRange, SampleUniform},
25};
26
27use crate::array::*;
28use crate::error::{ArrowError, Result};
29use crate::{
30    buffer::{Buffer, MutableBuffer},
31    datatypes::*,
32};
33
34use super::{bench_util::*, bit_util, test_util::seedable_rng};
35
36/// Create a random [RecordBatch] from a schema
37pub fn create_random_batch(
38    schema: SchemaRef,
39    size: usize,
40    null_density: f32,
41    true_density: f32,
42) -> Result<RecordBatch> {
43    let columns = schema
44        .fields()
45        .iter()
46        .map(|field| create_random_array(field, size, null_density, true_density))
47        .collect::<Result<Vec<ArrayRef>>>()?;
48
49    RecordBatch::try_new_with_options(
50        schema,
51        columns,
52        &RecordBatchOptions::new().with_match_field_names(false),
53    )
54}
55
/// Create a random [ArrayRef] from a [DataType] with a length,
/// null density and true density (for [BooleanArray]).
///
/// # Arguments
///
/// * `field` - The field containing the data type for which to create a random array
/// * `size` - The number of elements in the generated array
/// * `null_density` - The approximate fraction of null values in the resulting array (0.0 to 1.0)
/// * `true_density` - The approximate fraction of true values in boolean arrays (0.0 to 1.0)
///
/// # Errors
///
/// * [ArrowError::InvalidArgumentError] for a `Time32`/`Time64` unit that is not handled
///   below, or a `FixedSizeBinary` length that cannot be converted to `usize`
/// * [ArrowError::NotYetImplemented] for any data type without a dedicated match arm
/// * Any error propagated from the nested / decimal / run-end encoded helpers or from casting
///
/// # Panics
///
/// Panics if `field` is non-nullable but the generated array reports a non-zero null count.
pub fn create_random_array(
    field: &Field,
    size: usize,
    mut null_density: f32,
    true_density: f32,
) -> Result<ArrayRef> {
    // Override nullability in case of not nested and not dictionary
    // For nested types we don't want to override, so the null density is kept for the children
    // For dictionaries the nullability is handled by the recursive call in the match arm below
    if !field.data_type().is_nested() && !matches!(field.data_type(), Dictionary(_, _)) {
        // Override null density with 0.0 if the array is non-nullable
        null_density = match field.is_nullable() {
            true => null_density,
            false => 0.0,
        };
    }

    use DataType::*;
    // The first arm of each match carries the `as ArrayRef` cast that fixes the arm type;
    // the remaining arms coerce to it.
    let array = match field.data_type() {
        Null => Arc::new(NullArray::new(size)) as ArrayRef,
        Boolean => Arc::new(create_boolean_array(size, null_density, true_density)),
        Int8 => Arc::new(create_primitive_array::<Int8Type>(size, null_density)),
        Int16 => Arc::new(create_primitive_array::<Int16Type>(size, null_density)),
        Int32 => Arc::new(create_primitive_array::<Int32Type>(size, null_density)),
        Int64 => Arc::new(create_primitive_array::<Int64Type>(size, null_density)),
        UInt8 => Arc::new(create_primitive_array::<UInt8Type>(size, null_density)),
        UInt16 => Arc::new(create_primitive_array::<UInt16Type>(size, null_density)),
        UInt32 => Arc::new(create_primitive_array::<UInt32Type>(size, null_density)),
        UInt64 => Arc::new(create_primitive_array::<UInt64Type>(size, null_density)),
        Float16 => Arc::new(create_primitive_array::<Float16Type>(size, null_density)),
        Float32 => Arc::new(create_primitive_array::<Float32Type>(size, null_density)),
        Float64 => Arc::new(create_primitive_array::<Float64Type>(size, null_density)),
        // Timestamps keep the timezone (if any) declared on the field's data type
        Timestamp(unit, tz) => match unit {
            TimeUnit::Second => Arc::new(
                create_random_temporal_array::<TimestampSecondType>(size, null_density)
                    .with_timezone_opt(tz.clone()),
            ) as ArrayRef,
            TimeUnit::Millisecond => Arc::new(
                create_random_temporal_array::<TimestampMillisecondType>(size, null_density)
                    .with_timezone_opt(tz.clone()),
            ),
            TimeUnit::Microsecond => Arc::new(
                create_random_temporal_array::<TimestampMicrosecondType>(size, null_density)
                    .with_timezone_opt(tz.clone()),
            ),
            TimeUnit::Nanosecond => Arc::new(
                create_random_temporal_array::<TimestampNanosecondType>(size, null_density)
                    .with_timezone_opt(tz.clone()),
            ),
        },
        Date32 => Arc::new(create_random_temporal_array::<Date32Type>(
            size,
            null_density,
        )),
        Date64 => Arc::new(create_random_temporal_array::<Date64Type>(
            size,
            null_density,
        )),
        // Time32 is only generated for second and millisecond units
        Time32(unit) => match unit {
            TimeUnit::Second => Arc::new(create_random_temporal_array::<Time32SecondType>(
                size,
                null_density,
            )) as ArrayRef,
            TimeUnit::Millisecond => Arc::new(
                create_random_temporal_array::<Time32MillisecondType>(size, null_density),
            ),
            _ => {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "Unsupported unit {unit:?} for Time32"
                )));
            }
        },
        // Time64 is only generated for microsecond and nanosecond units
        Time64(unit) => match unit {
            TimeUnit::Microsecond => Arc::new(
                create_random_temporal_array::<Time64MicrosecondType>(size, null_density),
            ) as ArrayRef,
            TimeUnit::Nanosecond => Arc::new(create_random_temporal_array::<Time64NanosecondType>(
                size,
                null_density,
            )),
            _ => {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "Unsupported unit {unit:?} for Time64"
                )));
            }
        },
        Utf8 => Arc::new(create_string_array::<i32>(size, null_density)),
        LargeUtf8 => Arc::new(create_string_array::<i64>(size, null_density)),
        Utf8View => Arc::new(create_string_view_array_with_len(
            size,
            null_density,
            4,
            false,
        )),
        Binary => Arc::new(create_binary_array::<i32>(size, null_density)),
        LargeBinary => Arc::new(create_binary_array::<i64>(size, null_density)),
        FixedSizeBinary(len) => {
            // The declared width is converted to `usize`; a failed conversion is reported
            // as an invalid argument
            let len = TryInto::<usize>::try_into(*len).map_err(|_| {
                ArrowError::InvalidArgumentError(format!("cannot use FixedSizeBinary({len})"))
            })?;
            Arc::new(create_fsb_array(size, null_density, len))
        }
        // Binary views are produced by converting a random string view array
        BinaryView => Arc::new(
            create_string_view_array_with_len(size, null_density, 4, false).to_binary_view(),
        ),
        List(_) => create_random_list_array(field, size, null_density, true_density)?,
        LargeList(_) => create_random_list_array(field, size, null_density, true_density)?,
        Struct(_) => create_random_struct_array(field, size, null_density, true_density)?,
        // Dictionaries are built by generating an array of the value type (with the same
        // name and nullability as `field`) and casting it to the dictionary type. Types
        // that fail the `can_cast_types` guard fall through to the catch-all arm.
        d @ Dictionary(_, value_type) if crate::compute::can_cast_types(value_type, d) => {
            let f = Field::new(
                field.name(),
                value_type.as_ref().clone(),
                field.is_nullable(),
            );
            let v = create_random_array(&f, size, null_density, true_density)?;
            crate::compute::cast(&v, d)?
        }
        Map(_, _) => create_random_map_array(field, size, null_density, true_density)?,
        Decimal128(_, _) => create_random_decimal_array(field, size, null_density)?,
        Decimal256(_, _) => create_random_decimal_array(field, size, null_density)?,
        RunEndEncoded(index, value) => {
            create_random_run_end_encoded_array(index, value, size, null_density, true_density)?
        }
        other => {
            return Err(ArrowError::NotYetImplemented(format!(
                "Generating random arrays not yet implemented for {other:?}"
            )));
        }
    };

    // Sanity check: a non-nullable field must never yield an array containing nulls
    if !field.is_nullable() {
        assert_eq!(array.null_count(), 0);
    }

    Ok(array)
}
202
203#[inline]
204fn create_random_decimal_array(field: &Field, size: usize, null_density: f32) -> Result<ArrayRef> {
205    let mut rng = seedable_rng();
206
207    match field.data_type() {
208        DataType::Decimal128(precision, scale) => {
209            let values = (0..size)
210                .map(|_| {
211                    if rng.random::<f32>() < null_density {
212                        None
213                    } else {
214                        Some(rng.random::<i128>())
215                    }
216                })
217                .collect::<Vec<_>>();
218            Ok(Arc::new(
219                Decimal128Array::from(values).with_precision_and_scale(*precision, *scale)?,
220            ))
221        }
222        DataType::Decimal256(precision, scale) => {
223            let values = (0..size)
224                .map(|_| {
225                    if rng.random::<f32>() < null_density {
226                        None
227                    } else {
228                        Some(i256::from_parts(rng.random::<u128>(), rng.random::<i128>()))
229                    }
230                })
231                .collect::<Vec<_>>();
232            Ok(Arc::new(
233                Decimal256Array::from(values).with_precision_and_scale(*precision, *scale)?,
234            ))
235        }
236        _ => Err(ArrowError::InvalidArgumentError(format!(
237            "Cannot create decimal array for field {field}"
238        ))),
239    }
240}
241#[inline]
242fn create_random_run_end_encoded_array(
243    index: &Field,
244    value: &Field,
245    size: usize,
246    null_density: f32,
247    true_density: f32,
248) -> Result<ArrayRef> {
249    const MIN_RUN: usize = 8;
250    const MAX_RUN: usize = 32;
251
252    let mut rng = seedable_rng();
253    let mut run_lengths: Vec<usize> = Vec::new();
254    let mut remaining = size;
255    while remaining > 0 {
256        let len = rng.random_range(MIN_RUN..=MAX_RUN).min(remaining);
257        run_lengths.push(len);
258        remaining -= len;
259    }
260    let num_runs = run_lengths.len();
261
262    let mut cumulative: i64 = 0;
263    let run_ends_i64: Vec<i64> = run_lengths
264        .iter()
265        .map(|&l| {
266            cumulative += l as i64;
267            cumulative
268        })
269        .collect();
270
271    let values = create_random_array(value, num_runs, null_density, true_density)?;
272
273    match index.data_type() {
274        DataType::Int16 => {
275            let run_ends: Int16Array = run_ends_i64.iter().map(|&v| v as i16).collect();
276            Ok(Arc::new(RunArray::<Int16Type>::try_new(
277                &run_ends, &values,
278            )?))
279        }
280        DataType::Int32 => {
281            let run_ends: Int32Array = run_ends_i64.iter().map(|&v| v as i32).collect();
282            Ok(Arc::new(RunArray::<Int32Type>::try_new(
283                &run_ends, &values,
284            )?))
285        }
286        DataType::Int64 => {
287            let run_ends: Int64Array = run_ends_i64.iter().copied().collect();
288            Ok(Arc::new(RunArray::<Int64Type>::try_new(
289                &run_ends, &values,
290            )?))
291        }
292        other => Err(ArrowError::InvalidArgumentError(format!(
293            "Unsupported run-ends type for REE: {other:?}"
294        ))),
295    }
296}
297
298#[inline]
299fn create_random_list_array(
300    field: &Field,
301    size: usize,
302    null_density: f32,
303    true_density: f32,
304) -> Result<ArrayRef> {
305    // Override null density with 0.0 if the array is non-nullable
306    let list_null_density = match field.is_nullable() {
307        true => null_density,
308        false => 0.0,
309    };
310    let list_field;
311    let (offsets, child_len) = match field.data_type() {
312        DataType::List(f) => {
313            let (offsets, child_len) = create_random_offsets::<i32>(size, 0, 5);
314            list_field = f;
315            (Buffer::from(offsets.to_byte_slice()), child_len as usize)
316        }
317        DataType::LargeList(f) => {
318            let (offsets, child_len) = create_random_offsets::<i64>(size, 0, 5);
319            list_field = f;
320            (Buffer::from(offsets.to_byte_slice()), child_len as usize)
321        }
322        _ => {
323            return Err(ArrowError::InvalidArgumentError(format!(
324                "Cannot create list array for field {field}"
325            )));
326        }
327    };
328
329    // Create list's child data
330    let child_array = create_random_array(list_field, child_len, null_density, true_density)?;
331    let child_data = child_array.to_data();
332    // Create list's null buffers, if it is nullable
333    let null_buffer = match field.is_nullable() {
334        true => Some(create_random_null_buffer(size, list_null_density)),
335        false => None,
336    };
337    let list_data = unsafe {
338        ArrayData::new_unchecked(
339            field.data_type().clone(),
340            size,
341            None,
342            null_buffer,
343            0,
344            vec![offsets],
345            vec![child_data],
346        )
347    };
348    Ok(make_array(list_data))
349}
350
351#[inline]
352fn create_random_struct_array(
353    field: &Field,
354    size: usize,
355    null_density: f32,
356    true_density: f32,
357) -> Result<ArrayRef> {
358    let struct_fields = match field.data_type() {
359        DataType::Struct(fields) => fields,
360        _ => {
361            return Err(ArrowError::InvalidArgumentError(format!(
362                "Cannot create struct array for field {field}"
363            )));
364        }
365    };
366
367    let child_arrays = struct_fields
368        .iter()
369        .map(|struct_field| create_random_array(struct_field, size, null_density, true_density))
370        .collect::<Result<Vec<_>>>()?;
371
372    let null_buffer = match field.is_nullable() {
373        true => {
374            let nulls = arrow_buffer::BooleanBuffer::new(
375                create_random_null_buffer(size, null_density),
376                0,
377                size,
378            );
379            Some(nulls.into())
380        }
381        false => None,
382    };
383
384    Ok(Arc::new(StructArray::try_new(
385        struct_fields.clone(),
386        child_arrays,
387        null_buffer,
388    )?))
389}
390
391#[inline]
392fn create_random_map_array(
393    field: &Field,
394    size: usize,
395    null_density: f32,
396    true_density: f32,
397) -> Result<ArrayRef> {
398    // Override null density with 0.0 if the array is non-nullable
399    let map_null_density = match field.is_nullable() {
400        true => null_density,
401        false => 0.0,
402    };
403
404    let entries_field = match field.data_type() {
405        DataType::Map(f, _) => f,
406        _ => {
407            return Err(ArrowError::InvalidArgumentError(format!(
408                "Cannot create map array for field {field:?}"
409            )));
410        }
411    };
412
413    let (offsets, child_len) = create_random_offsets::<i32>(size, 0, 5);
414    let offsets = Buffer::from(offsets.to_byte_slice());
415
416    let entries = create_random_array(
417        entries_field,
418        child_len as usize,
419        null_density,
420        true_density,
421    )?
422    .to_data();
423
424    let null_buffer = match field.is_nullable() {
425        true => Some(create_random_null_buffer(size, map_null_density)),
426        false => None,
427    };
428
429    let map_data = unsafe {
430        ArrayData::new_unchecked(
431            field.data_type().clone(),
432            size,
433            None,
434            null_buffer,
435            0,
436            vec![offsets],
437            vec![entries],
438        )
439    };
440    Ok(make_array(map_data))
441}
442
443/// Generate random offsets for list arrays
444fn create_random_offsets<T: OffsetSizeTrait + SampleUniform>(
445    size: usize,
446    min: T,
447    max: T,
448) -> (Vec<T>, T) {
449    let rng = &mut seedable_rng();
450
451    let mut current_offset = T::zero();
452
453    let mut offsets = Vec::with_capacity(size + 1);
454    offsets.push(current_offset);
455
456    (0..size).for_each(|_| {
457        current_offset += rng.random_range(min..max);
458        offsets.push(current_offset);
459    });
460
461    (offsets, current_offset)
462}
463
464fn create_random_null_buffer(size: usize, null_density: f32) -> Buffer {
465    let mut rng = seedable_rng();
466    let mut mut_buf = MutableBuffer::new_null(size);
467    {
468        let mut_slice = mut_buf.as_slice_mut();
469        (0..size).for_each(|i| {
470            if rng.random::<f32>() >= null_density {
471                bit_util::set_bit(mut_slice, i)
472            }
473        })
474    };
475    mut_buf.into()
476}
477
/// Source of random values for temporal types. Useful for testing.
///
/// The range of values is not likely to be representative of the actual bounds
/// of the type; each implementation picks a range that is convenient for tests.
pub trait RandomTemporalValue: ArrowTemporalType {
    /// Returns the range of values to sample from for the implementing type
    fn value_range() -> impl SampleRange<Self::Native>;

    /// Generate a random value drawn from [`Self::value_range`] using `rng`
    fn gen_range<R: Rng>(rng: &mut R) -> Self::Native
    where
        Self::Native: SampleUniform,
    {
        rng.random_range(Self::value_range())
    }

    /// Generate a random value of the type
    ///
    /// The default implementation delegates to [`Self::gen_range`].
    fn random<R: Rng>(rng: &mut R) -> Self::Native
    where
        Self::Native: SampleUniform,
    {
        Self::gen_range(rng)
    }
}
500
501impl RandomTemporalValue for TimestampSecondType {
502    /// Range of values for a timestamp in seconds. The range begins at the start
503    /// of the unix epoch and continues for 100 years.
504    fn value_range() -> impl SampleRange<Self::Native> {
505        0..60 * 60 * 24 * 365 * 100
506    }
507}
508
509impl RandomTemporalValue for TimestampMillisecondType {
510    /// Range of values for a timestamp in milliseconds. The range begins at the start
511    /// of the unix epoch and continues for 100 years.
512    fn value_range() -> impl SampleRange<Self::Native> {
513        0..1_000 * 60 * 60 * 24 * 365 * 100
514    }
515}
516
517impl RandomTemporalValue for TimestampMicrosecondType {
518    /// Range of values for a timestamp in microseconds. The range begins at the start
519    /// of the unix epoch and continues for 100 years.
520    fn value_range() -> impl SampleRange<Self::Native> {
521        0..1_000 * 1_000 * 60 * 60 * 24 * 365 * 100
522    }
523}
524
525impl RandomTemporalValue for TimestampNanosecondType {
526    /// Range of values for a timestamp in nanoseconds. The range begins at the start
527    /// of the unix epoch and continues for 100 years.
528    fn value_range() -> impl SampleRange<Self::Native> {
529        0..1_000 * 1_000 * 1_000 * 60 * 60 * 24 * 365 * 100
530    }
531}
532
533impl RandomTemporalValue for Date32Type {
534    /// Range of values representing the elapsed time since UNIX epoch in days. The
535    /// range begins at the start of the unix epoch and continues for 100 years.
536    fn value_range() -> impl SampleRange<Self::Native> {
537        0..365 * 100
538    }
539}
540
541impl RandomTemporalValue for Date64Type {
542    /// Range of values  representing the elapsed time since UNIX epoch in milliseconds.
543    /// The range begins at the start of the unix epoch and continues for 100 years.
544    fn value_range() -> impl SampleRange<Self::Native> {
545        0..1_000 * 60 * 60 * 24 * 365 * 100
546    }
547}
548
549impl RandomTemporalValue for Time32SecondType {
550    /// Range of values representing the elapsed time since midnight in seconds. The
551    /// range is from 0 to 24 hours.
552    fn value_range() -> impl SampleRange<Self::Native> {
553        0..60 * 60 * 24
554    }
555}
556
557impl RandomTemporalValue for Time32MillisecondType {
558    /// Range of values representing the elapsed time since midnight in milliseconds. The
559    /// range is from 0 to 24 hours.
560    fn value_range() -> impl SampleRange<Self::Native> {
561        0..1_000 * 60 * 60 * 24
562    }
563}
564
565impl RandomTemporalValue for Time64MicrosecondType {
566    /// Range of values representing the elapsed time since midnight in microseconds. The
567    /// range is from 0 to 24 hours.
568    fn value_range() -> impl SampleRange<Self::Native> {
569        0..1_000 * 1_000 * 60 * 60 * 24
570    }
571}
572
573impl RandomTemporalValue for Time64NanosecondType {
574    /// Range of values representing the elapsed time since midnight in nanoseconds. The
575    /// range is from 0 to 24 hours.
576    fn value_range() -> impl SampleRange<Self::Native> {
577        0..1_000 * 1_000 * 1_000 * 60 * 60 * 24
578    }
579}
580
581fn create_random_temporal_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
582where
583    T: RandomTemporalValue,
584    <T as ArrowPrimitiveType>::Native: SampleUniform,
585{
586    let mut rng = seedable_rng();
587
588    (0..size)
589        .map(|_| {
590            if rng.random::<f32>() < null_density {
591                None
592            } else {
593                Some(T::random(&mut rng))
594            }
595        })
596        .collect()
597}
598
#[cfg(test)]
mod tests {
    use super::*;

    /// A batch of nullable primitive and timestamp columns matches its schema
    /// and every column has the requested length.
    #[test]
    fn test_create_batch() {
        let size = 32;
        let fields = vec![
            Field::new("a", DataType::Int32, true),
            Field::new("f16", DataType::Float16, true),
            Field::new(
                "timestamp_without_timezone",
                DataType::Timestamp(TimeUnit::Nanosecond, None),
                true,
            ),
            Field::new(
                "timestamp_with_timezone",
                DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())),
                true,
            ),
        ];
        let schema = Schema::new(fields);
        let schema_ref = Arc::new(schema);
        let batch = create_random_batch(schema_ref.clone(), size, 0.35, 0.7).unwrap();

        assert_eq!(batch.schema(), schema_ref);
        assert_eq!(batch.num_columns(), schema_ref.fields().len());
        for array in batch.columns() {
            assert_eq!(array.len(), size);
        }
    }

    /// Non-nullable fields never produce nulls, even with a non-zero null density.
    #[test]
    fn test_create_batch_non_null() {
        let size = 32;
        let fields = vec![
            Field::new("a", DataType::Int32, false),
            Field::new(
                "b",
                DataType::List(Arc::new(Field::new_list_field(DataType::LargeUtf8, false))),
                false,
            ),
            Field::new("a", DataType::Int32, false),
        ];
        let schema = Schema::new(fields);
        let schema_ref = Arc::new(schema);
        let batch = create_random_batch(schema_ref.clone(), size, 0.35, 0.7).unwrap();

        assert_eq!(batch.schema(), schema_ref);
        assert_eq!(batch.num_columns(), schema_ref.fields().len());
        for array in batch.columns() {
            assert_eq!(array.null_count(), 0);
            assert_eq!(array.logical_null_count(), 0);
        }
        // Test that the list's child values are non-null
        let b_array = batch.column(1);
        let list_array = b_array.as_list::<i32>();
        let child_array = list_array.values();
        assert_eq!(child_array.null_count(), 0);
        // There should be more values than the list, to show that it's a list
        assert!(child_array.len() > list_array.len());
    }

    /// A struct containing a boolean, a nested list of lists and a nested struct
    /// is generated with the expected shape.
    #[test]
    fn test_create_struct_array() {
        let size = 32;
        let struct_fields = Fields::from(vec![
            Field::new("b", DataType::Boolean, true),
            Field::new(
                "c",
                DataType::LargeList(Arc::new(Field::new_list_field(
                    DataType::List(Arc::new(Field::new_list_field(
                        DataType::FixedSizeBinary(6),
                        true,
                    ))),
                    false,
                ))),
                true,
            ),
            Field::new(
                "d",
                DataType::Struct(Fields::from(vec![
                    Field::new("d_x", DataType::Int32, true),
                    Field::new("d_y", DataType::Float32, false),
                    Field::new("d_z", DataType::Binary, true),
                ])),
                true,
            ),
        ]);
        let field = Field::new("struct", DataType::Struct(struct_fields), true);
        let array = create_random_array(&field, size, 0.2, 0.5).unwrap();

        assert_eq!(array.len(), 32);
        let struct_array = array.as_any().downcast_ref::<StructArray>().unwrap();
        assert_eq!(struct_array.columns().len(), 3);

        // Test that the nested list makes sense,
        // i.e. its children's values are more than the parent, to show repetition
        let col_c = struct_array.column_by_name("c").unwrap();
        let col_c = col_c.as_any().downcast_ref::<LargeListArray>().unwrap();
        assert_eq!(col_c.len(), size);
        let col_c_list = col_c.values().as_list::<i32>();
        assert!(col_c_list.len() > size);
        // Its values should be FixedSizeBinary(6)
        let fsb = col_c_list.values();
        assert_eq!(fsb.data_type(), &DataType::FixedSizeBinary(6));
        assert!(fsb.len() > col_c_list.len());

        // Test nested struct
        let col_d = struct_array.column_by_name("d").unwrap();
        let col_d = col_d.as_any().downcast_ref::<StructArray>().unwrap();
        let col_d_y = col_d.column_by_name("d_y").unwrap();
        assert_eq!(col_d_y.data_type(), &DataType::Float32);
        // `d_y` is declared non-nullable, so it must not contain nulls
        assert_eq!(col_d_y.null_count(), 0);
    }

    /// A run-end encoded array has the requested logical length, its last run
    /// end equals that length, and it holds one value per run.
    #[test]
    fn test_create_run_end_encoded_array() {
        let size = 1000;
        let ree_field = Field::new(
            "ree",
            DataType::RunEndEncoded(
                Arc::new(Field::new("run_ends", DataType::Int32, false)),
                Arc::new(Field::new("values", DataType::Utf8, true)),
            ),
            false,
        );

        let array = create_random_array(&ree_field, size, 0.25, 0.0).unwrap();
        assert_eq!(array.len(), size);

        let ree = array.as_run::<Int32Type>();
        let run_ends = ree.run_ends().values();
        let num_runs = run_ends.len();

        // The final run end is the logical length of the array
        assert_eq!(*run_ends.last().unwrap() as usize, size);

        assert_eq!(ree.values().len(), num_runs);
    }

    /// A non-nullable list with a nullable item field has no null lists but
    /// does have null child values.
    #[test]
    fn test_create_list_array_nested_nullability() {
        let list_field = Field::new_list(
            "not_null_list",
            Field::new_list_field(DataType::Boolean, true),
            false,
        );

        let list_array = create_random_array(&list_field, 100, 0.95, 0.5).unwrap();

        assert_eq!(list_array.null_count(), 0);
        assert!(list_array.as_list::<i32>().values().null_count() > 0);
    }

    /// A non-nullable struct has no null rows, while each child follows its own
    /// declared nullability.
    #[test]
    fn test_create_struct_array_nested_nullability() {
        let struct_child_fields = vec![
            Field::new("null_int", DataType::Int32, true),
            Field::new("int", DataType::Int32, false),
        ];
        let struct_field = Field::new_struct("not_null_struct", struct_child_fields, false);

        let struct_array = create_random_array(&struct_field, 100, 0.95, 0.5).unwrap();

        assert_eq!(struct_array.null_count(), 0);
        assert!(
            struct_array
                .as_struct()
                .column_by_name("null_int")
                .unwrap()
                .null_count()
                > 0
        );
        assert_eq!(
            struct_array
                .as_struct()
                .column_by_name("int")
                .unwrap()
                .null_count(),
            0
        );
    }

    /// Nullability is respected at every level of a non-nullable list of
    /// nullable structs.
    #[test]
    fn test_create_list_array_nested_struct_nullability() {
        let struct_child_fields = vec![
            Field::new("null_int", DataType::Int32, true),
            Field::new("int", DataType::Int32, false),
        ];
        let list_item_field =
            Field::new_list_field(DataType::Struct(struct_child_fields.into()), true);
        let list_field = Field::new_list("not_null_list", list_item_field, false);

        let list_array = create_random_array(&list_field, 100, 0.95, 0.5).unwrap();

        // The list itself is non-nullable, its struct items are nullable
        assert_eq!(list_array.null_count(), 0);
        assert!(list_array.as_list::<i32>().values().null_count() > 0);
        assert!(
            list_array
                .as_list::<i32>()
                .values()
                .as_struct()
                .column_by_name("null_int")
                .unwrap()
                .null_count()
                > 0
        );
        assert_eq!(
            list_array
                .as_list::<i32>()
                .values()
                .as_struct()
                .column_by_name("int")
                .unwrap()
                .null_count(),
            0
        );
    }

    /// A non-nullable map with non-nullable keys and nullable values is
    /// generated with the expected lengths, null counts and data types.
    #[test]
    fn test_create_map_array() {
        let map_field = Field::new_map(
            "map",
            "entries",
            Field::new("key", DataType::Utf8, false),
            Field::new("value", DataType::Utf8, true),
            false,
            false,
        );
        let array = create_random_array(&map_field, 100, 0.8, 0.5).unwrap();

        assert_eq!(array.len(), 100);
        // Map field is not null
        assert_eq!(array.null_count(), 0);
        assert_eq!(array.logical_null_count(), 0);
        // Maps have multiple values like a list, so internal arrays are longer
        assert!(array.as_map().keys().len() > array.len());
        assert!(array.as_map().values().len() > array.len());
        // Keys are not nullable
        assert_eq!(array.as_map().keys().null_count(), 0);
        // Values are nullable
        assert!(array.as_map().values().null_count() > 0);

        assert_eq!(array.as_map().keys().data_type(), &DataType::Utf8);
        assert_eq!(array.as_map().values().data_type(), &DataType::Utf8);
    }

    /// Nullable decimal columns with a negative scale are generated with the
    /// requested length.
    #[test]
    fn test_create_decimal_array() {
        let size = 10;
        let fields = vec![
            Field::new("a", DataType::Decimal128(10, -2), true),
            Field::new("b", DataType::Decimal256(10, -2), true),
        ];
        let schema = Schema::new(fields);
        let schema_ref = Arc::new(schema);
        let batch = create_random_batch(schema_ref.clone(), size, 0.35, 0.7).unwrap();

        assert_eq!(batch.schema(), schema_ref);
        assert_eq!(batch.num_columns(), schema_ref.fields().len());
        for array in batch.columns() {
            assert_eq!(array.len(), size);
        }
    }

    /// Non-nullable decimal columns contain no nulls even though a non-zero
    /// null density is requested.
    #[test]
    fn create_non_nullable_decimal_array_with_null_density() {
        let size = 10;
        let fields = vec![
            Field::new("a", DataType::Decimal128(10, -2), false),
            Field::new("b", DataType::Decimal256(10, -2), false),
        ];
        let schema = Schema::new(fields);
        let schema_ref = Arc::new(schema);
        let batch = create_random_batch(schema_ref.clone(), size, 0.35, 0.7).unwrap();

        assert_eq!(batch.schema(), schema_ref);
        assert_eq!(batch.num_columns(), schema_ref.fields().len());
        for array in batch.columns() {
            assert_eq!(array.len(), size);
            assert_eq!(array.null_count(), 0);
        }
    }
}