arrow_array/array/
run_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::any::Any;
19use std::sync::Arc;
20
21use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, RunEndBuffer};
22use arrow_data::{ArrayData, ArrayDataBuilder};
23use arrow_schema::{ArrowError, DataType, Field};
24
25use crate::{
26    Array, ArrayAccessor, ArrayRef, PrimitiveArray,
27    builder::StringRunBuilder,
28    make_array,
29    run_iterator::RunArrayIter,
30    types::{Int16Type, Int32Type, Int64Type, RunEndIndexType},
31};
32
33/// An array of [run-end encoded values].
34///
35/// This encoding is variation on [run-length encoding (RLE)] and is good for representing
36/// data containing the same values repeated consecutively.
37///
38/// A [`RunArray`] consists of a `run_ends` buffer and a `values` array of equivalent
39/// lengths. The `run_ends` buffer stores the indexes at which the run ends. The
40/// `values` array stores the corresponding value of each run. The below example
41/// illustrates how a logical array is represented by a [`RunArray`]:
42///
43/// ```text
44/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┐
45///   ┌─────────────────┐  ┌─────────┐       ┌─────────────────┐
46/// │ │        A        │  │    2    │ │     │        A        │
47///   ├─────────────────┤  ├─────────┤       ├─────────────────┤
48/// │ │        D        │  │    3    │ │     │        A        │    run length of 'A' = runs_ends[0] - 0 = 2
49///   ├─────────────────┤  ├─────────┤       ├─────────────────┤
50/// │ │        B        │  │    6    │ │     │        D        │    run length of 'D' = run_ends[1] - run_ends[0] = 1
51///   └─────────────────┘  └─────────┘       ├─────────────────┤
52/// │        values          run_ends  │     │        B        │
53///                                          ├─────────────────┤
54/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┘     │        B        │
55///                                          ├─────────────────┤
56///                RunArray                  │        B        │    run length of 'B' = run_ends[2] - run_ends[1] = 3
57///               length = 3                 └─────────────────┘
58///
59///                                             Logical array
60///                                                Contents
61/// ```
62///
63/// [run-end encoded values]: https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout
64/// [run-length encoding (RLE)]: https://en.wikipedia.org/wiki/Run-length_encoding
65pub struct RunArray<R: RunEndIndexType> {
66    data_type: DataType,
67    run_ends: RunEndBuffer<R::Native>,
68    values: ArrayRef,
69}
70
71impl<R: RunEndIndexType> Clone for RunArray<R> {
72    fn clone(&self) -> Self {
73        Self {
74            data_type: self.data_type.clone(),
75            run_ends: self.run_ends.clone(),
76            values: self.values.clone(),
77        }
78    }
79}
80
81impl<R: RunEndIndexType> RunArray<R> {
82    /// Calculates the logical length of the array encoded by treating the `run_ends`
83    /// array as if it were a [`RunEndBuffer`].
84    pub fn logical_len(run_ends: &PrimitiveArray<R>) -> usize {
85        let len = run_ends.len();
86        if len == 0 {
87            return 0;
88        }
89        run_ends.value(len - 1).as_usize()
90    }
91
92    /// Attempts to create a [`RunArray`] using the given `run_ends` and `values`.
93    ///
94    /// # Errors
95    ///
96    /// - If `run_ends` and `values` have different lengths
97    /// - If `run_ends` has any null values
98    /// - If `run_ends` doesn't consist of strictly increasing positive integers
99    pub fn try_new(run_ends: &PrimitiveArray<R>, values: &dyn Array) -> Result<Self, ArrowError> {
100        let run_ends_type = run_ends.data_type().clone();
101        let values_type = values.data_type().clone();
102        let ree_array_type = DataType::RunEndEncoded(
103            Arc::new(Field::new("run_ends", run_ends_type, false)),
104            Arc::new(Field::new("values", values_type, true)),
105        );
106        let len = RunArray::logical_len(run_ends);
107        let builder = ArrayDataBuilder::new(ree_array_type)
108            .len(len)
109            .add_child_data(run_ends.to_data())
110            .add_child_data(values.to_data());
111
112        // `build_unchecked` is used to avoid recursive validation of child arrays.
113        let array_data = unsafe { builder.build_unchecked() };
114
115        // Safety: `validate_data` checks below
116        //    1. The given array data has exactly two child arrays.
117        //    2. The first child array (run_ends) has valid data type.
118        //    3. run_ends array does not have null values
119        //    4. run_ends array has non-zero and strictly increasing values.
120        //    5. The length of run_ends array and values array are the same.
121        array_data.validate_data()?;
122
123        Ok(array_data.into())
124    }
125
126    /// Returns a reference to the [`RunEndBuffer`].
127    pub fn run_ends(&self) -> &RunEndBuffer<R::Native> {
128        &self.run_ends
129    }
130
131    /// Returns a reference to the values array.
132    ///
133    /// Any slicing of this [`RunArray`] array is **not** applied to the returned
134    /// values here and must be handled separately.
135    pub fn values(&self) -> &ArrayRef {
136        &self.values
137    }
138
139    /// Returns the physical index at which the array slice starts.
140    ///
141    /// See [`RunEndBuffer::get_start_physical_index`].
142    pub fn get_start_physical_index(&self) -> usize {
143        self.run_ends.get_start_physical_index()
144    }
145
146    /// Returns the physical index at which the array slice ends.
147    ///
148    /// See [`RunEndBuffer::get_end_physical_index`].
149    pub fn get_end_physical_index(&self) -> usize {
150        self.run_ends.get_end_physical_index()
151    }
152
153    /// Downcast this [`RunArray`] to a [`TypedRunArray`]
154    ///
155    /// ```
156    /// use arrow_array::{Array, ArrayAccessor, RunArray, StringArray, types::Int32Type};
157    ///
158    /// let orig = [Some("a"), Some("b"), None];
159    /// let run_array = RunArray::<Int32Type>::from_iter(orig);
160    /// let typed = run_array.downcast::<StringArray>().unwrap();
161    /// assert_eq!(typed.value(0), "a");
162    /// assert_eq!(typed.value(1), "b");
163    /// assert!(typed.values().is_null(2));
164    /// ```
165    pub fn downcast<V: 'static>(&self) -> Option<TypedRunArray<'_, R, V>> {
166        let values = self.values.as_any().downcast_ref()?;
167        Some(TypedRunArray {
168            run_array: self,
169            values,
170        })
171    }
172
173    /// Calls [`RunEndBuffer::get_physical_index`].
174    ///
175    /// The result is arbitrary if `logical_index >= self.len()`
176    pub fn get_physical_index(&self, logical_index: usize) -> usize {
177        self.run_ends.get_physical_index(logical_index)
178    }
179
180    /// Returns the physical indices corresponding to the provided logical indices.
181    ///
182    /// See [`RunEndBuffer::get_physical_indices`] for more details.
183    #[inline]
184    pub fn get_physical_indices<I>(&self, logical_indices: &[I]) -> Result<Vec<usize>, ArrowError>
185    where
186        I: ArrowNativeType,
187    {
188        self.run_ends()
189            .get_physical_indices(logical_indices)
190            .map_err(|index| {
191                ArrowError::InvalidArgumentError(format!(
192                    "Logical index {} is out of bounds for RunArray of length {}",
193                    index.as_usize(),
194                    self.len()
195                ))
196            })
197    }
198
199    /// Returns a zero-copy slice of this array with the indicated offset and length.
200    ///
201    /// # Panics
202    ///
203    /// - Specified slice (`offset` + `length`) exceeds existing length
204    pub fn slice(&self, offset: usize, length: usize) -> Self {
205        Self {
206            data_type: self.data_type.clone(),
207            run_ends: self.run_ends.slice(offset, length),
208            values: self.values.clone(),
209        }
210    }
211}
212
213impl<R: RunEndIndexType> From<ArrayData> for RunArray<R> {
214    // The method assumes the caller already validated the data using `ArrayData::validate_data()`
215    fn from(data: ArrayData) -> Self {
216        match data.data_type() {
217            DataType::RunEndEncoded(_, _) => {}
218            _ => {
219                panic!(
220                    "Invalid data type for RunArray. The data type should be DataType::RunEndEncoded"
221                );
222            }
223        }
224
225        // Safety
226        // ArrayData is valid
227        let child = &data.child_data()[0];
228        assert_eq!(child.data_type(), &R::DATA_TYPE, "Incorrect run ends type");
229        let run_ends = unsafe {
230            let scalar = child.buffers()[0].clone().into();
231            RunEndBuffer::new_unchecked(scalar, data.offset(), data.len())
232        };
233
234        let values = make_array(data.child_data()[1].clone());
235        Self {
236            data_type: data.data_type().clone(),
237            run_ends,
238            values,
239        }
240    }
241}
242
243impl<R: RunEndIndexType> From<RunArray<R>> for ArrayData {
244    fn from(array: RunArray<R>) -> Self {
245        let len = array.run_ends.len();
246        let offset = array.run_ends.offset();
247
248        let run_ends = ArrayDataBuilder::new(R::DATA_TYPE)
249            .len(array.run_ends.values().len())
250            .buffers(vec![array.run_ends.into_inner().into_inner()]);
251
252        let run_ends = unsafe { run_ends.build_unchecked() };
253
254        let builder = ArrayDataBuilder::new(array.data_type)
255            .len(len)
256            .offset(offset)
257            .child_data(vec![run_ends, array.values.to_data()]);
258
259        unsafe { builder.build_unchecked() }
260    }
261}
262
263impl<T: RunEndIndexType> Array for RunArray<T> {
264    fn as_any(&self) -> &dyn Any {
265        self
266    }
267
268    fn to_data(&self) -> ArrayData {
269        self.clone().into()
270    }
271
272    fn into_data(self) -> ArrayData {
273        self.into()
274    }
275
276    fn data_type(&self) -> &DataType {
277        &self.data_type
278    }
279
280    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
281        Arc::new(self.slice(offset, length))
282    }
283
284    fn len(&self) -> usize {
285        self.run_ends.len()
286    }
287
288    fn is_empty(&self) -> bool {
289        self.run_ends.is_empty()
290    }
291
292    fn shrink_to_fit(&mut self) {
293        self.run_ends.shrink_to_fit();
294        self.values.shrink_to_fit();
295    }
296
297    fn offset(&self) -> usize {
298        self.run_ends.offset()
299    }
300
301    fn nulls(&self) -> Option<&NullBuffer> {
302        None
303    }
304
305    fn logical_nulls(&self) -> Option<NullBuffer> {
306        let len = self.len();
307        let nulls = self.values.logical_nulls()?;
308        let mut out = BooleanBufferBuilder::new(len);
309        let offset = self.run_ends.offset();
310        let mut valid_start = 0;
311        let mut last_end = 0;
312        for (idx, end) in self.run_ends.values().iter().enumerate() {
313            let end = end.as_usize();
314            if end < offset {
315                continue;
316            }
317            let end = (end - offset).min(len);
318            if nulls.is_null(idx) {
319                if valid_start < last_end {
320                    out.append_n(last_end - valid_start, true);
321                }
322                out.append_n(end - last_end, false);
323                valid_start = end;
324            }
325            last_end = end;
326            if end == len {
327                break;
328            }
329        }
330        if valid_start < len {
331            out.append_n(len - valid_start, true)
332        }
333        // Sanity check
334        assert_eq!(out.len(), len);
335        Some(out.finish().into())
336    }
337
338    fn is_nullable(&self) -> bool {
339        !self.is_empty() && self.values.is_nullable()
340    }
341
342    fn get_buffer_memory_size(&self) -> usize {
343        self.run_ends.inner().inner().capacity() + self.values.get_buffer_memory_size()
344    }
345
346    fn get_array_memory_size(&self) -> usize {
347        std::mem::size_of::<Self>()
348            + self.run_ends.inner().inner().capacity()
349            + self.values.get_array_memory_size()
350    }
351}
352
353impl<R: RunEndIndexType> std::fmt::Debug for RunArray<R> {
354    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
355        writeln!(
356            f,
357            "RunArray {{run_ends: {:?}, values: {:?}}}",
358            self.run_ends.values(),
359            self.values
360        )
361    }
362}
363
364/// Constructs a `RunArray` from an iterator of optional strings.
365///
366/// # Example:
367/// ```
368/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type};
369///
370/// let test = vec!["a", "a", "b", "c", "c"];
371/// let array: RunArray<Int16Type> = test
372///     .iter()
373///     .map(|&x| if x == "b" { None } else { Some(x) })
374///     .collect();
375/// assert_eq!(
376///     "RunArray {run_ends: [2, 3, 5], values: StringArray\n[\n  \"a\",\n  null,\n  \"c\",\n]}\n",
377///     format!("{:?}", array)
378/// );
379/// ```
380impl<'a, T: RunEndIndexType> FromIterator<Option<&'a str>> for RunArray<T> {
381    fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self {
382        let it = iter.into_iter();
383        let (lower, _) = it.size_hint();
384        let mut builder = StringRunBuilder::with_capacity(lower, 256);
385        it.for_each(|i| {
386            builder.append_option(i);
387        });
388
389        builder.finish()
390    }
391}
392
393/// Constructs a `RunArray` from an iterator of strings.
394///
395/// # Example:
396///
397/// ```
398/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type};
399///
400/// let test = vec!["a", "a", "b", "c"];
401/// let array: RunArray<Int16Type> = test.into_iter().collect();
402/// assert_eq!(
403///     "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
404///     format!("{:?}", array)
405/// );
406/// ```
407impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray<T> {
408    fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
409        let it = iter.into_iter();
410        let (lower, _) = it.size_hint();
411        let mut builder = StringRunBuilder::with_capacity(lower, 256);
412        it.for_each(|i| {
413            builder.append_value(i);
414        });
415
416        builder.finish()
417    }
418}
419
420///
421/// A [`RunArray`] with `i16` run ends
422///
423/// # Example: Using `collect`
424/// ```
425/// # use arrow_array::{Array, Int16RunArray, Int16Array, StringArray};
426/// # use std::sync::Arc;
427///
428/// let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
429/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
430/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
431/// assert_eq!(array.values(), &values);
432/// ```
433pub type Int16RunArray = RunArray<Int16Type>;
434
435///
436/// A [`RunArray`] with `i32` run ends
437///
438/// # Example: Using `collect`
439/// ```
440/// # use arrow_array::{Array, Int32RunArray, Int32Array, StringArray};
441/// # use std::sync::Arc;
442///
443/// let array: Int32RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
444/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
445/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
446/// assert_eq!(array.values(), &values);
447/// ```
448pub type Int32RunArray = RunArray<Int32Type>;
449
450///
451/// A [`RunArray`] with `i64` run ends
452///
453/// # Example: Using `collect`
454/// ```
455/// # use arrow_array::{Array, Int64RunArray, Int64Array, StringArray};
456/// # use std::sync::Arc;
457///
458/// let array: Int64RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
459/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
460/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
461/// assert_eq!(array.values(), &values);
462/// ```
463pub type Int64RunArray = RunArray<Int64Type>;
464
465/// A [`RunArray`] typed typed on its child values array
466///
467/// Implements [`ArrayAccessor`] and [`IntoIterator`] allowing fast access to its elements
468///
469/// ```
470/// use arrow_array::{RunArray, StringArray, types::Int32Type};
471///
472/// let orig = ["a", "b", "a", "b"];
473/// let ree_array = RunArray::<Int32Type>::from_iter(orig);
474///
475/// // `TypedRunArray` allows you to access the values directly
476/// let typed = ree_array.downcast::<StringArray>().unwrap();
477///
478/// for (maybe_val, orig) in typed.into_iter().zip(orig) {
479///     assert_eq!(maybe_val.unwrap(), orig)
480/// }
481/// ```
482pub struct TypedRunArray<'a, R: RunEndIndexType, V> {
483    /// The run array
484    run_array: &'a RunArray<R>,
485
486    /// The values of the run_array
487    values: &'a V,
488}
489
490// Manually implement `Clone` to avoid `V: Clone` type constraint
491impl<R: RunEndIndexType, V> Clone for TypedRunArray<'_, R, V> {
492    fn clone(&self) -> Self {
493        *self
494    }
495}
496
497impl<R: RunEndIndexType, V> Copy for TypedRunArray<'_, R, V> {}
498
499impl<R: RunEndIndexType, V> std::fmt::Debug for TypedRunArray<'_, R, V> {
500    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
501        writeln!(f, "TypedRunArray({:?})", self.run_array)
502    }
503}
504
505impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> {
506    /// Returns the run_ends of this [`TypedRunArray`]
507    pub fn run_ends(&self) -> &'a RunEndBuffer<R::Native> {
508        self.run_array.run_ends()
509    }
510
511    /// Returns the values of this [`TypedRunArray`]
512    pub fn values(&self) -> &'a V {
513        self.values
514    }
515
516    /// Returns the run array of this [`TypedRunArray`]
517    pub fn run_array(&self) -> &'a RunArray<R> {
518        self.run_array
519    }
520}
521
522impl<R: RunEndIndexType, V: Sync> Array for TypedRunArray<'_, R, V> {
523    fn as_any(&self) -> &dyn Any {
524        self.run_array
525    }
526
527    fn to_data(&self) -> ArrayData {
528        self.run_array.to_data()
529    }
530
531    fn into_data(self) -> ArrayData {
532        self.run_array.into_data()
533    }
534
535    fn data_type(&self) -> &DataType {
536        self.run_array.data_type()
537    }
538
539    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
540        Arc::new(self.run_array.slice(offset, length))
541    }
542
543    fn len(&self) -> usize {
544        self.run_array.len()
545    }
546
547    fn is_empty(&self) -> bool {
548        self.run_array.is_empty()
549    }
550
551    fn offset(&self) -> usize {
552        self.run_array.offset()
553    }
554
555    fn nulls(&self) -> Option<&NullBuffer> {
556        self.run_array.nulls()
557    }
558
559    fn logical_nulls(&self) -> Option<NullBuffer> {
560        self.run_array.logical_nulls()
561    }
562
563    fn logical_null_count(&self) -> usize {
564        self.run_array.logical_null_count()
565    }
566
567    fn is_nullable(&self) -> bool {
568        self.run_array.is_nullable()
569    }
570
571    fn get_buffer_memory_size(&self) -> usize {
572        self.run_array.get_buffer_memory_size()
573    }
574
575    fn get_array_memory_size(&self) -> usize {
576        self.run_array.get_array_memory_size()
577    }
578}
579
580// Array accessor converts the index of logical array to the index of the physical array
581// using binary search. The time complexity is O(log N) where N is number of runs.
582impl<'a, R, V> ArrayAccessor for TypedRunArray<'a, R, V>
583where
584    R: RunEndIndexType,
585    V: Sync + Send,
586    &'a V: ArrayAccessor,
587    <&'a V as ArrayAccessor>::Item: Default,
588{
589    type Item = <&'a V as ArrayAccessor>::Item;
590
591    fn value(&self, logical_index: usize) -> Self::Item {
592        assert!(
593            logical_index < self.len(),
594            "Trying to access an element at index {} from a TypedRunArray of length {}",
595            logical_index,
596            self.len()
597        );
598        unsafe { self.value_unchecked(logical_index) }
599    }
600
601    unsafe fn value_unchecked(&self, logical_index: usize) -> Self::Item {
602        let physical_index = self.run_array.get_physical_index(logical_index);
603        unsafe { self.values().value_unchecked(physical_index) }
604    }
605}
606
607impl<'a, R, V> IntoIterator for TypedRunArray<'a, R, V>
608where
609    R: RunEndIndexType,
610    V: Sync + Send,
611    &'a V: ArrayAccessor,
612    <&'a V as ArrayAccessor>::Item: Default,
613{
614    type Item = Option<<&'a V as ArrayAccessor>::Item>;
615    type IntoIter = RunArrayIter<'a, R, V>;
616
617    fn into_iter(self) -> Self::IntoIter {
618        RunArrayIter::new(self)
619    }
620}
621
622#[cfg(test)]
623mod tests {
624    use rand::Rng;
625    use rand::rng;
626    use rand::seq::SliceRandom;
627
628    use super::*;
629    use crate::builder::PrimitiveRunBuilder;
630    use crate::cast::AsArray;
631    use crate::types::{Int8Type, UInt32Type};
632    use crate::{Int16Array, Int32Array, StringArray};
633
634    fn build_input_array(size: usize) -> Vec<Option<i32>> {
635        // The input array is created by shuffling and repeating
636        // the seed values random number of times.
637        let mut seed: Vec<Option<i32>> = vec![
638            None,
639            None,
640            None,
641            Some(1),
642            Some(2),
643            Some(3),
644            Some(4),
645            Some(5),
646            Some(6),
647            Some(7),
648            Some(8),
649            Some(9),
650        ];
651        let mut result: Vec<Option<i32>> = Vec::with_capacity(size);
652        let mut ix = 0;
653        let mut rng = rng();
654        // run length can go up to 8. Cap the max run length for smaller arrays to size / 2.
655        let max_run_length = 8_usize.min(1_usize.max(size / 2));
656        while result.len() < size {
657            // shuffle the seed array if all the values are iterated.
658            if ix == 0 {
659                seed.shuffle(&mut rng);
660            }
661            // repeat the items between 1 and 8 times. Cap the length for smaller sized arrays
662            let num = max_run_length.min(rng.random_range(1..=max_run_length));
663            for _ in 0..num {
664                result.push(seed[ix]);
665            }
666            ix += 1;
667            if ix == seed.len() {
668                ix = 0
669            }
670        }
671        result.resize(size, None);
672        result
673    }
674
675    // Asserts that `logical_array[logical_indices[*]] == physical_array[physical_indices[*]]`
676    fn compare_logical_and_physical_indices(
677        logical_indices: &[u32],
678        logical_array: &[Option<i32>],
679        physical_indices: &[usize],
680        physical_array: &PrimitiveArray<Int32Type>,
681    ) {
682        assert_eq!(logical_indices.len(), physical_indices.len());
683
684        // check value in logical index in the logical_array matches physical index in physical_array
685        logical_indices
686            .iter()
687            .map(|f| f.as_usize())
688            .zip(physical_indices.iter())
689            .for_each(|(logical_ix, physical_ix)| {
690                let expected = logical_array[logical_ix];
691                match expected {
692                    Some(val) => {
693                        assert!(physical_array.is_valid(*physical_ix));
694                        let actual = physical_array.value(*physical_ix);
695                        assert_eq!(val, actual);
696                    }
697                    None => {
698                        assert!(physical_array.is_null(*physical_ix))
699                    }
700                };
701            });
702    }
703    #[test]
704    fn test_run_array() {
705        // Construct a value array
706        let value_data =
707            PrimitiveArray::<Int8Type>::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]);
708
709        // Construct a run_ends array:
710        let run_ends_values = [4_i16, 6, 7, 9, 13, 18, 20, 22];
711        let run_ends_data =
712            PrimitiveArray::<Int16Type>::from_iter_values(run_ends_values.iter().copied());
713
714        // Construct a run ends encoded array from the above two
715        let ree_array = RunArray::<Int16Type>::try_new(&run_ends_data, &value_data).unwrap();
716
717        assert_eq!(ree_array.len(), 22);
718        assert_eq!(ree_array.null_count(), 0);
719
720        let values = ree_array.values();
721        assert_eq!(value_data.into_data(), values.to_data());
722        assert_eq!(&DataType::Int8, values.data_type());
723
724        let run_ends = ree_array.run_ends();
725        assert_eq!(run_ends.values(), &run_ends_values);
726    }
727
728    #[test]
729    fn test_run_array_fmt_debug() {
730        let mut builder = PrimitiveRunBuilder::<Int16Type, UInt32Type>::with_capacity(3);
731        builder.append_value(12345678);
732        builder.append_null();
733        builder.append_value(22345678);
734        let array = builder.finish();
735        assert_eq!(
736            "RunArray {run_ends: [1, 2, 3], values: PrimitiveArray<UInt32>\n[\n  12345678,\n  null,\n  22345678,\n]}\n",
737            format!("{array:?}")
738        );
739
740        let mut builder = PrimitiveRunBuilder::<Int16Type, UInt32Type>::with_capacity(20);
741        for _ in 0..20 {
742            builder.append_value(1);
743        }
744        let array = builder.finish();
745
746        assert_eq!(array.len(), 20);
747        assert_eq!(array.null_count(), 0);
748        assert_eq!(array.logical_null_count(), 0);
749
750        assert_eq!(
751            "RunArray {run_ends: [20], values: PrimitiveArray<UInt32>\n[\n  1,\n]}\n",
752            format!("{array:?}")
753        );
754    }
755
756    #[test]
757    fn test_run_array_from_iter() {
758        let test = vec!["a", "a", "b", "c"];
759        let array: RunArray<Int16Type> = test
760            .iter()
761            .map(|&x| if x == "b" { None } else { Some(x) })
762            .collect();
763        assert_eq!(
764            "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  null,\n  \"c\",\n]}\n",
765            format!("{array:?}")
766        );
767
768        assert_eq!(array.len(), 4);
769        assert_eq!(array.null_count(), 0);
770        assert_eq!(array.logical_null_count(), 1);
771
772        let array: RunArray<Int16Type> = test.into_iter().collect();
773        assert_eq!(
774            "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
775            format!("{array:?}")
776        );
777    }
778
779    #[test]
780    fn test_run_array_run_ends_as_primitive_array() {
781        let test = vec!["a", "b", "c", "a"];
782        let array: RunArray<Int16Type> = test.into_iter().collect();
783
784        assert_eq!(array.len(), 4);
785        assert_eq!(array.null_count(), 0);
786        assert_eq!(array.logical_null_count(), 0);
787
788        let run_ends = array.run_ends();
789        assert_eq!(&[1, 2, 3, 4], run_ends.values());
790    }
791
792    #[test]
793    fn test_run_array_as_primitive_array_with_null() {
794        let test = vec![Some("a"), None, Some("b"), None, None, Some("a")];
795        let array: RunArray<Int32Type> = test.into_iter().collect();
796
797        assert_eq!(array.len(), 6);
798        assert_eq!(array.null_count(), 0);
799        assert_eq!(array.logical_null_count(), 3);
800
801        let run_ends = array.run_ends();
802        assert_eq!(&[1, 2, 3, 5, 6], run_ends.values());
803
804        let values_data = array.values();
805        assert_eq!(2, values_data.null_count());
806        assert_eq!(5, values_data.len());
807    }
808
809    #[test]
810    fn test_run_array_all_nulls() {
811        let test = vec![None, None, None];
812        let array: RunArray<Int32Type> = test.into_iter().collect();
813
814        assert_eq!(array.len(), 3);
815        assert_eq!(array.null_count(), 0);
816        assert_eq!(array.logical_null_count(), 3);
817
818        let run_ends = array.run_ends();
819        assert_eq!(3, run_ends.len());
820        assert_eq!(&[3], run_ends.values());
821
822        let values_data = array.values();
823        assert_eq!(1, values_data.null_count());
824    }
825
826    #[test]
827    fn test_run_array_try_new() {
828        let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")]
829            .into_iter()
830            .collect();
831        let run_ends: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect();
832
833        let array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
834        assert_eq!(array.values().data_type(), &DataType::Utf8);
835
836        assert_eq!(array.null_count(), 0);
837        assert_eq!(array.logical_null_count(), 1);
838        assert_eq!(array.len(), 4);
839        assert_eq!(array.values().null_count(), 1);
840
841        assert_eq!(
842            "RunArray {run_ends: [1, 2, 3, 4], values: StringArray\n[\n  \"foo\",\n  \"bar\",\n  null,\n  \"baz\",\n]}\n",
843            format!("{array:?}")
844        );
845    }
846
847    #[test]
848    fn test_run_array_int16_type_definition() {
849        let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
850        let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
851        assert_eq!(array.run_ends().values(), &[2, 3, 5]);
852        assert_eq!(array.values(), &values);
853    }
854
855    #[test]
856    fn test_run_array_empty_string() {
857        let array: Int16RunArray = vec!["a", "a", "", "", "c"].into_iter().collect();
858        let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "", "c"]));
859        assert_eq!(array.run_ends().values(), &[2, 4, 5]);
860        assert_eq!(array.values(), &values);
861    }
862
863    #[test]
864    fn test_run_array_length_mismatch() {
865        let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")]
866            .into_iter()
867            .collect();
868        let run_ends: Int32Array = [Some(1), Some(2), Some(3)].into_iter().collect();
869
870        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
871        let expected = ArrowError::InvalidArgumentError("The run_ends array length should be the same as values array length. Run_ends array length is 3, values array length is 4".to_string());
872        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
873    }
874
875    #[test]
876    fn test_run_array_run_ends_with_null() {
877        let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
878            .into_iter()
879            .collect();
880        let run_ends: Int32Array = [Some(1), None, Some(3)].into_iter().collect();
881
882        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
883        let expected = ArrowError::InvalidArgumentError(
884            "Found null values in run_ends array. The run_ends array should not have null values."
885                .to_string(),
886        );
887        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
888    }
889
890    #[test]
891    fn test_run_array_run_ends_with_zeroes() {
892        let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
893            .into_iter()
894            .collect();
895        let run_ends: Int32Array = [Some(0), Some(1), Some(3)].into_iter().collect();
896
897        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
898        let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly positive. Found value 0 at index 0 that does not match the criteria.".to_string());
899        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
900    }
901
902    #[test]
903    fn test_run_array_run_ends_non_increasing() {
904        let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
905            .into_iter()
906            .collect();
907        let run_ends: Int32Array = [Some(1), Some(4), Some(4)].into_iter().collect();
908
909        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
910        let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly increasing. Found value 4 at index 2 with previous value 4 that does not match the criteria.".to_string());
911        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
912    }
913
914    #[test]
915    #[should_panic(expected = "Incorrect run ends type")]
916    fn test_run_array_run_ends_data_type_mismatch() {
917        let a = RunArray::<Int32Type>::from_iter(["32"]);
918        let _ = RunArray::<Int64Type>::from(a.into_data());
919    }
920
921    #[test]
922    fn test_ree_array_accessor() {
923        let input_array = build_input_array(256);
924
925        // Encode the input_array to ree_array
926        let mut builder =
927            PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len());
928        builder.extend(input_array.iter().copied());
929        let run_array = builder.finish();
930        let typed = run_array.downcast::<PrimitiveArray<Int32Type>>().unwrap();
931
932        // Access every index and check if the value in the input array matches returned value.
933        for (i, inp_val) in input_array.iter().enumerate() {
934            if let Some(val) = inp_val {
935                let actual = typed.value(i);
936                assert_eq!(*val, actual)
937            } else {
938                let physical_ix = run_array.get_physical_index(i);
939                assert!(typed.values().is_null(physical_ix));
940            };
941        }
942    }
943
944    #[test]
945    #[cfg_attr(miri, ignore)] // Takes too long
946    fn test_get_physical_indices() {
947        // Test for logical lengths starting from 10 to 250 increasing by 10
948        for logical_len in (0..250).step_by(10) {
949            let input_array = build_input_array(logical_len);
950
951            // create run array using input_array
952            let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
953            builder.extend(input_array.clone().into_iter());
954
955            let run_array = builder.finish();
956            let physical_values_array = run_array.values().as_primitive::<Int32Type>();
957
958            // create an array consisting of all the indices repeated twice and shuffled.
959            let mut logical_indices: Vec<u32> = (0_u32..(logical_len as u32)).collect();
960            // add same indices once more
961            logical_indices.append(&mut logical_indices.clone());
962            let mut rng = rng();
963            logical_indices.shuffle(&mut rng);
964
965            let physical_indices = run_array.get_physical_indices(&logical_indices).unwrap();
966
967            assert_eq!(logical_indices.len(), physical_indices.len());
968
969            // check value in logical index in the input_array matches physical index in typed_run_array
970            compare_logical_and_physical_indices(
971                &logical_indices,
972                &input_array,
973                &physical_indices,
974                physical_values_array,
975            );
976        }
977    }
978
979    #[test]
980    #[cfg_attr(miri, ignore)] // Takes too long
981    fn test_get_physical_indices_sliced() {
982        let total_len = 80;
983        let input_array = build_input_array(total_len);
984
985        // Encode the input_array to run array
986        let mut builder =
987            PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len());
988        builder.extend(input_array.iter().copied());
989        let run_array = builder.finish();
990        let physical_values_array = run_array.values().as_primitive::<Int32Type>();
991
992        // test for all slice lengths.
993        for slice_len in 1..=total_len {
994            // create an array consisting of all the indices repeated twice and shuffled.
995            let mut logical_indices: Vec<u32> = (0_u32..(slice_len as u32)).collect();
996            // add same indices once more
997            logical_indices.append(&mut logical_indices.clone());
998            let mut rng = rng();
999            logical_indices.shuffle(&mut rng);
1000
1001            // test for offset = 0 and slice length = slice_len
1002            // slice the input array using which the run array was built.
1003            let sliced_input_array = &input_array[0..slice_len];
1004
1005            // slice the run array
1006            let sliced_run_array: RunArray<Int16Type> =
1007                run_array.slice(0, slice_len).into_data().into();
1008
1009            // Get physical indices.
1010            let physical_indices = sliced_run_array
1011                .get_physical_indices(&logical_indices)
1012                .unwrap();
1013
1014            compare_logical_and_physical_indices(
1015                &logical_indices,
1016                sliced_input_array,
1017                &physical_indices,
1018                physical_values_array,
1019            );
1020
1021            // test for offset = total_len - slice_len and slice length = slice_len
1022            // slice the input array using which the run array was built.
1023            let sliced_input_array = &input_array[total_len - slice_len..total_len];
1024
1025            // slice the run array
1026            let sliced_run_array: RunArray<Int16Type> = run_array
1027                .slice(total_len - slice_len, slice_len)
1028                .into_data()
1029                .into();
1030
1031            // Get physical indices
1032            let physical_indices = sliced_run_array
1033                .get_physical_indices(&logical_indices)
1034                .unwrap();
1035
1036            compare_logical_and_physical_indices(
1037                &logical_indices,
1038                sliced_input_array,
1039                &physical_indices,
1040                physical_values_array,
1041            );
1042        }
1043    }
1044
1045    #[test]
1046    fn test_logical_nulls() {
1047        let run = Int32Array::from(vec![3, 6, 9, 12]);
1048        let values = Int32Array::from(vec![Some(0), None, Some(1), None]);
1049        let array = RunArray::try_new(&run, &values).unwrap();
1050
1051        let expected = [
1052            true, true, true, false, false, false, true, true, true, false, false, false,
1053        ];
1054
1055        let n = array.logical_nulls().unwrap();
1056        assert_eq!(n.null_count(), 6);
1057
1058        let slices = [(0, 12), (0, 2), (2, 5), (3, 0), (3, 3), (3, 4), (4, 8)];
1059        for (offset, length) in slices {
1060            let a = array.slice(offset, length);
1061            let n = a.logical_nulls().unwrap();
1062            let n = n.into_iter().collect::<Vec<_>>();
1063            assert_eq!(&n, &expected[offset..offset + length], "{offset} {length}");
1064        }
1065    }
1066
1067    #[test]
1068    fn test_run_array_eq_identical() {
1069        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1070        let values1 = StringArray::from(vec!["a", "b", "c"]);
1071        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1072
1073        let run_ends2 = Int32Array::from(vec![2, 4, 6]);
1074        let values2 = StringArray::from(vec!["a", "b", "c"]);
1075        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1076
1077        assert_eq!(array1, array2);
1078    }
1079
1080    #[test]
1081    fn test_run_array_ne_different_run_ends() {
1082        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1083        let values1 = StringArray::from(vec!["a", "b", "c"]);
1084        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1085
1086        let run_ends2 = Int32Array::from(vec![1, 4, 6]);
1087        let values2 = StringArray::from(vec!["a", "b", "c"]);
1088        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1089
1090        assert_ne!(array1, array2);
1091    }
1092
1093    #[test]
1094    fn test_run_array_ne_different_values() {
1095        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1096        let values1 = StringArray::from(vec!["a", "b", "c"]);
1097        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1098
1099        let run_ends2 = Int32Array::from(vec![2, 4, 6]);
1100        let values2 = StringArray::from(vec!["a", "b", "d"]);
1101        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1102
1103        assert_ne!(array1, array2);
1104    }
1105
1106    #[test]
1107    fn test_run_array_eq_with_nulls() {
1108        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1109        let values1 = StringArray::from(vec![Some("a"), None, Some("c")]);
1110        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1111
1112        let run_ends2 = Int32Array::from(vec![2, 4, 6]);
1113        let values2 = StringArray::from(vec![Some("a"), None, Some("c")]);
1114        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1115
1116        assert_eq!(array1, array2);
1117    }
1118
1119    #[test]
1120    fn test_run_array_eq_different_run_end_types() {
1121        let run_ends_i16_1 = Int16Array::from(vec![2_i16, 4, 6]);
1122        let values_i16_1 = StringArray::from(vec!["a", "b", "c"]);
1123        let array_i16_1 = RunArray::<Int16Type>::try_new(&run_ends_i16_1, &values_i16_1).unwrap();
1124
1125        let run_ends_i16_2 = Int16Array::from(vec![2_i16, 4, 6]);
1126        let values_i16_2 = StringArray::from(vec!["a", "b", "c"]);
1127        let array_i16_2 = RunArray::<Int16Type>::try_new(&run_ends_i16_2, &values_i16_2).unwrap();
1128
1129        assert_eq!(array_i16_1, array_i16_2);
1130    }
1131}