arrow_array/array/
run_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::any::Any;
19use std::sync::Arc;
20
21use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, RunEndBuffer};
22use arrow_data::{ArrayData, ArrayDataBuilder};
23use arrow_schema::{ArrowError, DataType, Field};
24
25use crate::{
26    Array, ArrayAccessor, ArrayRef, PrimitiveArray,
27    builder::StringRunBuilder,
28    make_array,
29    run_iterator::RunArrayIter,
30    types::{Int16Type, Int32Type, Int64Type, RunEndIndexType},
31};
32
33/// An array of [run-end encoded values].
34///
35/// This encoding is variation on [run-length encoding (RLE)] and is good for representing
36/// data containing the same values repeated consecutively.
37///
38/// A [`RunArray`] consists of a `run_ends` buffer and a `values` array of equivalent
39/// lengths. The `run_ends` buffer stores the indexes at which the run ends. The
40/// `values` array stores the corresponding value of each run. The below example
41/// illustrates how a logical array is represented by a [`RunArray`]:
42///
43/// ```text
44/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┐
45///   ┌─────────────────┐  ┌─────────┐       ┌─────────────────┐
46/// │ │        A        │  │    2    │ │     │        A        │
47///   ├─────────────────┤  ├─────────┤       ├─────────────────┤
48/// │ │        D        │  │    3    │ │     │        A        │    run length of 'A' = runs_ends[0] - 0 = 2
49///   ├─────────────────┤  ├─────────┤       ├─────────────────┤
50/// │ │        B        │  │    6    │ │     │        D        │    run length of 'D' = run_ends[1] - run_ends[0] = 1
51///   └─────────────────┘  └─────────┘       ├─────────────────┤
52/// │        values          run_ends  │     │        B        │
53///                                          ├─────────────────┤
54/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┘     │        B        │
55///                                          ├─────────────────┤
56///                RunArray                  │        B        │    run length of 'B' = run_ends[2] - run_ends[1] = 3
57///               length = 3                 └─────────────────┘
58///
59///                                             Logical array
60///                                                Contents
61/// ```
62///
63/// [run-end encoded values]: https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout
64/// [run-length encoding (RLE)]: https://en.wikipedia.org/wiki/Run-length_encoding
65pub struct RunArray<R: RunEndIndexType> {
66    data_type: DataType,
67    run_ends: RunEndBuffer<R::Native>,
68    values: ArrayRef,
69}
70
71impl<R: RunEndIndexType> Clone for RunArray<R> {
72    fn clone(&self) -> Self {
73        Self {
74            data_type: self.data_type.clone(),
75            run_ends: self.run_ends.clone(),
76            values: self.values.clone(),
77        }
78    }
79}
80
81impl<R: RunEndIndexType> RunArray<R> {
82    /// Calculates the logical length of the array encoded by treating the `run_ends`
83    /// array as if it were a [`RunEndBuffer`].
84    pub fn logical_len(run_ends: &PrimitiveArray<R>) -> usize {
85        let len = run_ends.len();
86        if len == 0 {
87            return 0;
88        }
89        run_ends.value(len - 1).as_usize()
90    }
91
92    /// Attempts to create a [`RunArray`] using the given `run_ends` and `values`.
93    ///
94    /// # Errors
95    ///
96    /// - If `run_ends` and `values` have different lengths
97    /// - If `run_ends` has any null values
98    /// - If `run_ends` doesn't consist of strictly increasing positive integers
99    pub fn try_new(run_ends: &PrimitiveArray<R>, values: &dyn Array) -> Result<Self, ArrowError> {
100        let run_ends_type = run_ends.data_type().clone();
101        let values_type = values.data_type().clone();
102        let ree_array_type = DataType::RunEndEncoded(
103            Arc::new(Field::new("run_ends", run_ends_type, false)),
104            Arc::new(Field::new("values", values_type, true)),
105        );
106        let len = RunArray::logical_len(run_ends);
107        let builder = ArrayDataBuilder::new(ree_array_type)
108            .len(len)
109            .add_child_data(run_ends.to_data())
110            .add_child_data(values.to_data());
111
112        // `build_unchecked` is used to avoid recursive validation of child arrays.
113        let array_data = unsafe { builder.build_unchecked() };
114
115        // Safety: `validate_data` checks below
116        //    1. The given array data has exactly two child arrays.
117        //    2. The first child array (run_ends) has valid data type.
118        //    3. run_ends array does not have null values
119        //    4. run_ends array has non-zero and strictly increasing values.
120        //    5. The length of run_ends array and values array are the same.
121        array_data.validate_data()?;
122
123        Ok(array_data.into())
124    }
125
126    /// Returns a reference to the [`RunEndBuffer`].
127    pub fn run_ends(&self) -> &RunEndBuffer<R::Native> {
128        &self.run_ends
129    }
130
131    /// Returns a reference to the values array.
132    ///
133    /// Any slicing of this [`RunArray`] array is **not** applied to the returned
134    /// values here and must be handled separately.
135    pub fn values(&self) -> &ArrayRef {
136        &self.values
137    }
138
139    /// Returns the physical index at which the array slice starts.
140    ///
141    /// See [`RunEndBuffer::get_start_physical_index`].
142    pub fn get_start_physical_index(&self) -> usize {
143        self.run_ends.get_start_physical_index()
144    }
145
146    /// Returns the physical index at which the array slice ends.
147    ///
148    /// See [`RunEndBuffer::get_end_physical_index`].
149    pub fn get_end_physical_index(&self) -> usize {
150        self.run_ends.get_end_physical_index()
151    }
152
153    /// Downcast this [`RunArray`] to a [`TypedRunArray`]
154    ///
155    /// ```
156    /// use arrow_array::{Array, ArrayAccessor, RunArray, StringArray, types::Int32Type};
157    ///
158    /// let orig = [Some("a"), Some("b"), None];
159    /// let run_array = RunArray::<Int32Type>::from_iter(orig);
160    /// let typed = run_array.downcast::<StringArray>().unwrap();
161    /// assert_eq!(typed.value(0), "a");
162    /// assert_eq!(typed.value(1), "b");
163    /// assert!(typed.values().is_null(2));
164    /// ```
165    pub fn downcast<V: 'static>(&self) -> Option<TypedRunArray<'_, R, V>> {
166        let values = self.values.as_any().downcast_ref()?;
167        Some(TypedRunArray {
168            run_array: self,
169            values,
170        })
171    }
172
173    /// Calls [`RunEndBuffer::get_physical_index`].
174    ///
175    /// The result is arbitrary if `logical_index >= self.len()`
176    pub fn get_physical_index(&self, logical_index: usize) -> usize {
177        self.run_ends.get_physical_index(logical_index)
178    }
179
180    /// Returns the physical indices corresponding to the provided logical indices.
181    ///
182    /// See [`RunEndBuffer::get_physical_indices`] for more details.
183    #[inline]
184    pub fn get_physical_indices<I>(&self, logical_indices: &[I]) -> Result<Vec<usize>, ArrowError>
185    where
186        I: ArrowNativeType,
187    {
188        self.run_ends()
189            .get_physical_indices(logical_indices)
190            .map_err(|index| {
191                ArrowError::InvalidArgumentError(format!(
192                    "Logical index {} is out of bounds for RunArray of length {}",
193                    index.as_usize(),
194                    self.len()
195                ))
196            })
197    }
198
199    /// Returns a zero-copy slice of this array with the indicated offset and length.
200    ///
201    /// # Panics
202    ///
203    /// - Specified slice (`offset` + `length`) exceeds existing length
204    pub fn slice(&self, offset: usize, length: usize) -> Self {
205        Self {
206            data_type: self.data_type.clone(),
207            run_ends: self.run_ends.slice(offset, length),
208            values: self.values.clone(),
209        }
210    }
211}
212
213impl<R: RunEndIndexType> From<ArrayData> for RunArray<R> {
214    // The method assumes the caller already validated the data using `ArrayData::validate_data()`
215    fn from(data: ArrayData) -> Self {
216        match data.data_type() {
217            DataType::RunEndEncoded(_, _) => {}
218            _ => {
219                panic!(
220                    "Invalid data type for RunArray. The data type should be DataType::RunEndEncoded"
221                );
222            }
223        }
224
225        // Safety
226        // ArrayData is valid
227        let child = &data.child_data()[0];
228        assert_eq!(child.data_type(), &R::DATA_TYPE, "Incorrect run ends type");
229        let run_ends = unsafe {
230            let scalar = child.buffers()[0].clone().into();
231            RunEndBuffer::new_unchecked(scalar, data.offset(), data.len())
232        };
233
234        let values = make_array(data.child_data()[1].clone());
235        Self {
236            data_type: data.data_type().clone(),
237            run_ends,
238            values,
239        }
240    }
241}
242
243impl<R: RunEndIndexType> From<RunArray<R>> for ArrayData {
244    fn from(array: RunArray<R>) -> Self {
245        let len = array.run_ends.len();
246        let offset = array.run_ends.offset();
247
248        let run_ends = ArrayDataBuilder::new(R::DATA_TYPE)
249            .len(array.run_ends.values().len())
250            .buffers(vec![array.run_ends.into_inner().into_inner()]);
251
252        let run_ends = unsafe { run_ends.build_unchecked() };
253
254        let builder = ArrayDataBuilder::new(array.data_type)
255            .len(len)
256            .offset(offset)
257            .child_data(vec![run_ends, array.values.to_data()]);
258
259        unsafe { builder.build_unchecked() }
260    }
261}
262
263impl<T: RunEndIndexType> super::private::Sealed for RunArray<T> {}
264
265impl<T: RunEndIndexType> Array for RunArray<T> {
266    fn as_any(&self) -> &dyn Any {
267        self
268    }
269
270    fn to_data(&self) -> ArrayData {
271        self.clone().into()
272    }
273
274    fn into_data(self) -> ArrayData {
275        self.into()
276    }
277
278    fn data_type(&self) -> &DataType {
279        &self.data_type
280    }
281
282    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
283        Arc::new(self.slice(offset, length))
284    }
285
286    fn len(&self) -> usize {
287        self.run_ends.len()
288    }
289
290    fn is_empty(&self) -> bool {
291        self.run_ends.is_empty()
292    }
293
294    fn shrink_to_fit(&mut self) {
295        self.run_ends.shrink_to_fit();
296        self.values.shrink_to_fit();
297    }
298
299    fn offset(&self) -> usize {
300        self.run_ends.offset()
301    }
302
303    fn nulls(&self) -> Option<&NullBuffer> {
304        None
305    }
306
307    fn logical_nulls(&self) -> Option<NullBuffer> {
308        let len = self.len();
309        let nulls = self.values.logical_nulls()?;
310        let mut out = BooleanBufferBuilder::new(len);
311        let offset = self.run_ends.offset();
312        let mut valid_start = 0;
313        let mut last_end = 0;
314        for (idx, end) in self.run_ends.values().iter().enumerate() {
315            let end = end.as_usize();
316            if end < offset {
317                continue;
318            }
319            let end = (end - offset).min(len);
320            if nulls.is_null(idx) {
321                if valid_start < last_end {
322                    out.append_n(last_end - valid_start, true);
323                }
324                out.append_n(end - last_end, false);
325                valid_start = end;
326            }
327            last_end = end;
328            if end == len {
329                break;
330            }
331        }
332        if valid_start < len {
333            out.append_n(len - valid_start, true)
334        }
335        // Sanity check
336        assert_eq!(out.len(), len);
337        Some(out.finish().into())
338    }
339
340    fn is_nullable(&self) -> bool {
341        !self.is_empty() && self.values.is_nullable()
342    }
343
344    fn get_buffer_memory_size(&self) -> usize {
345        self.run_ends.inner().inner().capacity() + self.values.get_buffer_memory_size()
346    }
347
348    fn get_array_memory_size(&self) -> usize {
349        std::mem::size_of::<Self>()
350            + self.run_ends.inner().inner().capacity()
351            + self.values.get_array_memory_size()
352    }
353}
354
355impl<R: RunEndIndexType> std::fmt::Debug for RunArray<R> {
356    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
357        writeln!(
358            f,
359            "RunArray {{run_ends: {:?}, values: {:?}}}",
360            self.run_ends.values(),
361            self.values
362        )
363    }
364}
365
366/// Constructs a `RunArray` from an iterator of optional strings.
367///
368/// # Example:
369/// ```
370/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type};
371///
372/// let test = vec!["a", "a", "b", "c", "c"];
373/// let array: RunArray<Int16Type> = test
374///     .iter()
375///     .map(|&x| if x == "b" { None } else { Some(x) })
376///     .collect();
377/// assert_eq!(
378///     "RunArray {run_ends: [2, 3, 5], values: StringArray\n[\n  \"a\",\n  null,\n  \"c\",\n]}\n",
379///     format!("{:?}", array)
380/// );
381/// ```
382impl<'a, T: RunEndIndexType> FromIterator<Option<&'a str>> for RunArray<T> {
383    fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self {
384        let it = iter.into_iter();
385        let (lower, _) = it.size_hint();
386        let mut builder = StringRunBuilder::with_capacity(lower, 256);
387        it.for_each(|i| {
388            builder.append_option(i);
389        });
390
391        builder.finish()
392    }
393}
394
395/// Constructs a `RunArray` from an iterator of strings.
396///
397/// # Example:
398///
399/// ```
400/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type};
401///
402/// let test = vec!["a", "a", "b", "c"];
403/// let array: RunArray<Int16Type> = test.into_iter().collect();
404/// assert_eq!(
405///     "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
406///     format!("{:?}", array)
407/// );
408/// ```
409impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray<T> {
410    fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
411        let it = iter.into_iter();
412        let (lower, _) = it.size_hint();
413        let mut builder = StringRunBuilder::with_capacity(lower, 256);
414        it.for_each(|i| {
415            builder.append_value(i);
416        });
417
418        builder.finish()
419    }
420}
421
422///
423/// A [`RunArray`] with `i16` run ends
424///
425/// # Example: Using `collect`
426/// ```
427/// # use arrow_array::{Array, Int16RunArray, Int16Array, StringArray};
428/// # use std::sync::Arc;
429///
430/// let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
431/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
432/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
433/// assert_eq!(array.values(), &values);
434/// ```
435pub type Int16RunArray = RunArray<Int16Type>;
436
437///
438/// A [`RunArray`] with `i32` run ends
439///
440/// # Example: Using `collect`
441/// ```
442/// # use arrow_array::{Array, Int32RunArray, Int32Array, StringArray};
443/// # use std::sync::Arc;
444///
445/// let array: Int32RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
446/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
447/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
448/// assert_eq!(array.values(), &values);
449/// ```
450pub type Int32RunArray = RunArray<Int32Type>;
451
452///
453/// A [`RunArray`] with `i64` run ends
454///
455/// # Example: Using `collect`
456/// ```
457/// # use arrow_array::{Array, Int64RunArray, Int64Array, StringArray};
458/// # use std::sync::Arc;
459///
460/// let array: Int64RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
461/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
462/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
463/// assert_eq!(array.values(), &values);
464/// ```
465pub type Int64RunArray = RunArray<Int64Type>;
466
467/// A [`RunArray`] typed typed on its child values array
468///
469/// Implements [`ArrayAccessor`] and [`IntoIterator`] allowing fast access to its elements
470///
471/// ```
472/// use arrow_array::{RunArray, StringArray, types::Int32Type};
473///
474/// let orig = ["a", "b", "a", "b"];
475/// let ree_array = RunArray::<Int32Type>::from_iter(orig);
476///
477/// // `TypedRunArray` allows you to access the values directly
478/// let typed = ree_array.downcast::<StringArray>().unwrap();
479///
480/// for (maybe_val, orig) in typed.into_iter().zip(orig) {
481///     assert_eq!(maybe_val.unwrap(), orig)
482/// }
483/// ```
484pub struct TypedRunArray<'a, R: RunEndIndexType, V> {
485    /// The run array
486    run_array: &'a RunArray<R>,
487
488    /// The values of the run_array
489    values: &'a V,
490}
491
492// Manually implement `Clone` to avoid `V: Clone` type constraint
493impl<R: RunEndIndexType, V> Clone for TypedRunArray<'_, R, V> {
494    fn clone(&self) -> Self {
495        *self
496    }
497}
498
499impl<R: RunEndIndexType, V> Copy for TypedRunArray<'_, R, V> {}
500
501impl<R: RunEndIndexType, V> std::fmt::Debug for TypedRunArray<'_, R, V> {
502    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
503        writeln!(f, "TypedRunArray({:?})", self.run_array)
504    }
505}
506
507impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> {
508    /// Returns the run_ends of this [`TypedRunArray`]
509    pub fn run_ends(&self) -> &'a RunEndBuffer<R::Native> {
510        self.run_array.run_ends()
511    }
512
513    /// Returns the values of this [`TypedRunArray`]
514    pub fn values(&self) -> &'a V {
515        self.values
516    }
517
518    /// Returns the run array of this [`TypedRunArray`]
519    pub fn run_array(&self) -> &'a RunArray<R> {
520        self.run_array
521    }
522}
523
524impl<R: RunEndIndexType, V: Sync> super::private::Sealed for TypedRunArray<'_, R, V> {}
525
526impl<R: RunEndIndexType, V: Sync> Array for TypedRunArray<'_, R, V> {
527    fn as_any(&self) -> &dyn Any {
528        self.run_array
529    }
530
531    fn to_data(&self) -> ArrayData {
532        self.run_array.to_data()
533    }
534
535    fn into_data(self) -> ArrayData {
536        self.run_array.into_data()
537    }
538
539    fn data_type(&self) -> &DataType {
540        self.run_array.data_type()
541    }
542
543    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
544        Arc::new(self.run_array.slice(offset, length))
545    }
546
547    fn len(&self) -> usize {
548        self.run_array.len()
549    }
550
551    fn is_empty(&self) -> bool {
552        self.run_array.is_empty()
553    }
554
555    fn offset(&self) -> usize {
556        self.run_array.offset()
557    }
558
559    fn nulls(&self) -> Option<&NullBuffer> {
560        self.run_array.nulls()
561    }
562
563    fn logical_nulls(&self) -> Option<NullBuffer> {
564        self.run_array.logical_nulls()
565    }
566
567    fn logical_null_count(&self) -> usize {
568        self.run_array.logical_null_count()
569    }
570
571    fn is_nullable(&self) -> bool {
572        self.run_array.is_nullable()
573    }
574
575    fn get_buffer_memory_size(&self) -> usize {
576        self.run_array.get_buffer_memory_size()
577    }
578
579    fn get_array_memory_size(&self) -> usize {
580        self.run_array.get_array_memory_size()
581    }
582}
583
584// Array accessor converts the index of logical array to the index of the physical array
585// using binary search. The time complexity is O(log N) where N is number of runs.
586impl<'a, R, V> ArrayAccessor for TypedRunArray<'a, R, V>
587where
588    R: RunEndIndexType,
589    V: Sync + Send,
590    &'a V: ArrayAccessor,
591    <&'a V as ArrayAccessor>::Item: Default,
592{
593    type Item = <&'a V as ArrayAccessor>::Item;
594
595    fn value(&self, logical_index: usize) -> Self::Item {
596        assert!(
597            logical_index < self.len(),
598            "Trying to access an element at index {} from a TypedRunArray of length {}",
599            logical_index,
600            self.len()
601        );
602        unsafe { self.value_unchecked(logical_index) }
603    }
604
605    unsafe fn value_unchecked(&self, logical_index: usize) -> Self::Item {
606        let physical_index = self.run_array.get_physical_index(logical_index);
607        unsafe { self.values().value_unchecked(physical_index) }
608    }
609}
610
611impl<'a, R, V> IntoIterator for TypedRunArray<'a, R, V>
612where
613    R: RunEndIndexType,
614    V: Sync + Send,
615    &'a V: ArrayAccessor,
616    <&'a V as ArrayAccessor>::Item: Default,
617{
618    type Item = Option<<&'a V as ArrayAccessor>::Item>;
619    type IntoIter = RunArrayIter<'a, R, V>;
620
621    fn into_iter(self) -> Self::IntoIter {
622        RunArrayIter::new(self)
623    }
624}
625
626#[cfg(test)]
627mod tests {
628    use rand::Rng;
629    use rand::rng;
630    use rand::seq::SliceRandom;
631
632    use super::*;
633    use crate::builder::PrimitiveRunBuilder;
634    use crate::cast::AsArray;
635    use crate::types::{Int8Type, UInt32Type};
636    use crate::{Int16Array, Int32Array, StringArray};
637
638    fn build_input_array(size: usize) -> Vec<Option<i32>> {
639        // The input array is created by shuffling and repeating
640        // the seed values random number of times.
641        let mut seed: Vec<Option<i32>> = vec![
642            None,
643            None,
644            None,
645            Some(1),
646            Some(2),
647            Some(3),
648            Some(4),
649            Some(5),
650            Some(6),
651            Some(7),
652            Some(8),
653            Some(9),
654        ];
655        let mut result: Vec<Option<i32>> = Vec::with_capacity(size);
656        let mut ix = 0;
657        let mut rng = rng();
658        // run length can go up to 8. Cap the max run length for smaller arrays to size / 2.
659        let max_run_length = 8_usize.min(1_usize.max(size / 2));
660        while result.len() < size {
661            // shuffle the seed array if all the values are iterated.
662            if ix == 0 {
663                seed.shuffle(&mut rng);
664            }
665            // repeat the items between 1 and 8 times. Cap the length for smaller sized arrays
666            let num = max_run_length.min(rng.random_range(1..=max_run_length));
667            for _ in 0..num {
668                result.push(seed[ix]);
669            }
670            ix += 1;
671            if ix == seed.len() {
672                ix = 0
673            }
674        }
675        result.resize(size, None);
676        result
677    }
678
679    // Asserts that `logical_array[logical_indices[*]] == physical_array[physical_indices[*]]`
680    fn compare_logical_and_physical_indices(
681        logical_indices: &[u32],
682        logical_array: &[Option<i32>],
683        physical_indices: &[usize],
684        physical_array: &PrimitiveArray<Int32Type>,
685    ) {
686        assert_eq!(logical_indices.len(), physical_indices.len());
687
688        // check value in logical index in the logical_array matches physical index in physical_array
689        logical_indices
690            .iter()
691            .map(|f| f.as_usize())
692            .zip(physical_indices.iter())
693            .for_each(|(logical_ix, physical_ix)| {
694                let expected = logical_array[logical_ix];
695                match expected {
696                    Some(val) => {
697                        assert!(physical_array.is_valid(*physical_ix));
698                        let actual = physical_array.value(*physical_ix);
699                        assert_eq!(val, actual);
700                    }
701                    None => {
702                        assert!(physical_array.is_null(*physical_ix))
703                    }
704                };
705            });
706    }
707    #[test]
708    fn test_run_array() {
709        // Construct a value array
710        let value_data =
711            PrimitiveArray::<Int8Type>::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]);
712
713        // Construct a run_ends array:
714        let run_ends_values = [4_i16, 6, 7, 9, 13, 18, 20, 22];
715        let run_ends_data =
716            PrimitiveArray::<Int16Type>::from_iter_values(run_ends_values.iter().copied());
717
718        // Construct a run ends encoded array from the above two
719        let ree_array = RunArray::<Int16Type>::try_new(&run_ends_data, &value_data).unwrap();
720
721        assert_eq!(ree_array.len(), 22);
722        assert_eq!(ree_array.null_count(), 0);
723
724        let values = ree_array.values();
725        assert_eq!(value_data.into_data(), values.to_data());
726        assert_eq!(&DataType::Int8, values.data_type());
727
728        let run_ends = ree_array.run_ends();
729        assert_eq!(run_ends.values(), &run_ends_values);
730    }
731
732    #[test]
733    fn test_run_array_fmt_debug() {
734        let mut builder = PrimitiveRunBuilder::<Int16Type, UInt32Type>::with_capacity(3);
735        builder.append_value(12345678);
736        builder.append_null();
737        builder.append_value(22345678);
738        let array = builder.finish();
739        assert_eq!(
740            "RunArray {run_ends: [1, 2, 3], values: PrimitiveArray<UInt32>\n[\n  12345678,\n  null,\n  22345678,\n]}\n",
741            format!("{array:?}")
742        );
743
744        let mut builder = PrimitiveRunBuilder::<Int16Type, UInt32Type>::with_capacity(20);
745        for _ in 0..20 {
746            builder.append_value(1);
747        }
748        let array = builder.finish();
749
750        assert_eq!(array.len(), 20);
751        assert_eq!(array.null_count(), 0);
752        assert_eq!(array.logical_null_count(), 0);
753
754        assert_eq!(
755            "RunArray {run_ends: [20], values: PrimitiveArray<UInt32>\n[\n  1,\n]}\n",
756            format!("{array:?}")
757        );
758    }
759
760    #[test]
761    fn test_run_array_from_iter() {
762        let test = vec!["a", "a", "b", "c"];
763        let array: RunArray<Int16Type> = test
764            .iter()
765            .map(|&x| if x == "b" { None } else { Some(x) })
766            .collect();
767        assert_eq!(
768            "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  null,\n  \"c\",\n]}\n",
769            format!("{array:?}")
770        );
771
772        assert_eq!(array.len(), 4);
773        assert_eq!(array.null_count(), 0);
774        assert_eq!(array.logical_null_count(), 1);
775
776        let array: RunArray<Int16Type> = test.into_iter().collect();
777        assert_eq!(
778            "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
779            format!("{array:?}")
780        );
781    }
782
783    #[test]
784    fn test_run_array_run_ends_as_primitive_array() {
785        let test = vec!["a", "b", "c", "a"];
786        let array: RunArray<Int16Type> = test.into_iter().collect();
787
788        assert_eq!(array.len(), 4);
789        assert_eq!(array.null_count(), 0);
790        assert_eq!(array.logical_null_count(), 0);
791
792        let run_ends = array.run_ends();
793        assert_eq!(&[1, 2, 3, 4], run_ends.values());
794    }
795
796    #[test]
797    fn test_run_array_as_primitive_array_with_null() {
798        let test = vec![Some("a"), None, Some("b"), None, None, Some("a")];
799        let array: RunArray<Int32Type> = test.into_iter().collect();
800
801        assert_eq!(array.len(), 6);
802        assert_eq!(array.null_count(), 0);
803        assert_eq!(array.logical_null_count(), 3);
804
805        let run_ends = array.run_ends();
806        assert_eq!(&[1, 2, 3, 5, 6], run_ends.values());
807
808        let values_data = array.values();
809        assert_eq!(2, values_data.null_count());
810        assert_eq!(5, values_data.len());
811    }
812
813    #[test]
814    fn test_run_array_all_nulls() {
815        let test = vec![None, None, None];
816        let array: RunArray<Int32Type> = test.into_iter().collect();
817
818        assert_eq!(array.len(), 3);
819        assert_eq!(array.null_count(), 0);
820        assert_eq!(array.logical_null_count(), 3);
821
822        let run_ends = array.run_ends();
823        assert_eq!(3, run_ends.len());
824        assert_eq!(&[3], run_ends.values());
825
826        let values_data = array.values();
827        assert_eq!(1, values_data.null_count());
828    }
829
830    #[test]
831    fn test_run_array_try_new() {
832        let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")]
833            .into_iter()
834            .collect();
835        let run_ends: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect();
836
837        let array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
838        assert_eq!(array.values().data_type(), &DataType::Utf8);
839
840        assert_eq!(array.null_count(), 0);
841        assert_eq!(array.logical_null_count(), 1);
842        assert_eq!(array.len(), 4);
843        assert_eq!(array.values().null_count(), 1);
844
845        assert_eq!(
846            "RunArray {run_ends: [1, 2, 3, 4], values: StringArray\n[\n  \"foo\",\n  \"bar\",\n  null,\n  \"baz\",\n]}\n",
847            format!("{array:?}")
848        );
849    }
850
851    #[test]
852    fn test_run_array_int16_type_definition() {
853        let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
854        let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
855        assert_eq!(array.run_ends().values(), &[2, 3, 5]);
856        assert_eq!(array.values(), &values);
857    }
858
859    #[test]
860    fn test_run_array_empty_string() {
861        let array: Int16RunArray = vec!["a", "a", "", "", "c"].into_iter().collect();
862        let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "", "c"]));
863        assert_eq!(array.run_ends().values(), &[2, 4, 5]);
864        assert_eq!(array.values(), &values);
865    }
866
867    #[test]
868    fn test_run_array_length_mismatch() {
869        let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")]
870            .into_iter()
871            .collect();
872        let run_ends: Int32Array = [Some(1), Some(2), Some(3)].into_iter().collect();
873
874        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
875        let expected = ArrowError::InvalidArgumentError("The run_ends array length should be the same as values array length. Run_ends array length is 3, values array length is 4".to_string());
876        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
877    }
878
879    #[test]
880    fn test_run_array_run_ends_with_null() {
881        let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
882            .into_iter()
883            .collect();
884        let run_ends: Int32Array = [Some(1), None, Some(3)].into_iter().collect();
885
886        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
887        let expected = ArrowError::InvalidArgumentError(
888            "Found null values in run_ends array. The run_ends array should not have null values."
889                .to_string(),
890        );
891        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
892    }
893
894    #[test]
895    fn test_run_array_run_ends_with_zeroes() {
896        let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
897            .into_iter()
898            .collect();
899        let run_ends: Int32Array = [Some(0), Some(1), Some(3)].into_iter().collect();
900
901        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
902        let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly positive. Found value 0 at index 0 that does not match the criteria.".to_string());
903        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
904    }
905
906    #[test]
907    fn test_run_array_run_ends_non_increasing() {
908        let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
909            .into_iter()
910            .collect();
911        let run_ends: Int32Array = [Some(1), Some(4), Some(4)].into_iter().collect();
912
913        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
914        let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly increasing. Found value 4 at index 2 with previous value 4 that does not match the criteria.".to_string());
915        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
916    }
917
918    #[test]
919    #[should_panic(expected = "Incorrect run ends type")]
920    fn test_run_array_run_ends_data_type_mismatch() {
921        let a = RunArray::<Int32Type>::from_iter(["32"]);
922        let _ = RunArray::<Int64Type>::from(a.into_data());
923    }
924
925    #[test]
926    fn test_ree_array_accessor() {
927        let input_array = build_input_array(256);
928
929        // Encode the input_array to ree_array
930        let mut builder =
931            PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len());
932        builder.extend(input_array.iter().copied());
933        let run_array = builder.finish();
934        let typed = run_array.downcast::<PrimitiveArray<Int32Type>>().unwrap();
935
936        // Access every index and check if the value in the input array matches returned value.
937        for (i, inp_val) in input_array.iter().enumerate() {
938            if let Some(val) = inp_val {
939                let actual = typed.value(i);
940                assert_eq!(*val, actual)
941            } else {
942                let physical_ix = run_array.get_physical_index(i);
943                assert!(typed.values().is_null(physical_ix));
944            };
945        }
946    }
947
948    #[test]
949    #[cfg_attr(miri, ignore)] // Takes too long
950    fn test_get_physical_indices() {
951        // Test for logical lengths starting from 10 to 250 increasing by 10
952        for logical_len in (0..250).step_by(10) {
953            let input_array = build_input_array(logical_len);
954
955            // create run array using input_array
956            let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
957            builder.extend(input_array.clone().into_iter());
958
959            let run_array = builder.finish();
960            let physical_values_array = run_array.values().as_primitive::<Int32Type>();
961
962            // create an array consisting of all the indices repeated twice and shuffled.
963            let mut logical_indices: Vec<u32> = (0_u32..(logical_len as u32)).collect();
964            // add same indices once more
965            logical_indices.append(&mut logical_indices.clone());
966            let mut rng = rng();
967            logical_indices.shuffle(&mut rng);
968
969            let physical_indices = run_array.get_physical_indices(&logical_indices).unwrap();
970
971            assert_eq!(logical_indices.len(), physical_indices.len());
972
973            // check value in logical index in the input_array matches physical index in typed_run_array
974            compare_logical_and_physical_indices(
975                &logical_indices,
976                &input_array,
977                &physical_indices,
978                physical_values_array,
979            );
980        }
981    }
982
983    #[test]
984    #[cfg_attr(miri, ignore)] // Takes too long
985    fn test_get_physical_indices_sliced() {
986        let total_len = 80;
987        let input_array = build_input_array(total_len);
988
989        // Encode the input_array to run array
990        let mut builder =
991            PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len());
992        builder.extend(input_array.iter().copied());
993        let run_array = builder.finish();
994        let physical_values_array = run_array.values().as_primitive::<Int32Type>();
995
996        // test for all slice lengths.
997        for slice_len in 1..=total_len {
998            // create an array consisting of all the indices repeated twice and shuffled.
999            let mut logical_indices: Vec<u32> = (0_u32..(slice_len as u32)).collect();
1000            // add same indices once more
1001            logical_indices.append(&mut logical_indices.clone());
1002            let mut rng = rng();
1003            logical_indices.shuffle(&mut rng);
1004
1005            // test for offset = 0 and slice length = slice_len
1006            // slice the input array using which the run array was built.
1007            let sliced_input_array = &input_array[0..slice_len];
1008
1009            // slice the run array
1010            let sliced_run_array: RunArray<Int16Type> =
1011                run_array.slice(0, slice_len).into_data().into();
1012
1013            // Get physical indices.
1014            let physical_indices = sliced_run_array
1015                .get_physical_indices(&logical_indices)
1016                .unwrap();
1017
1018            compare_logical_and_physical_indices(
1019                &logical_indices,
1020                sliced_input_array,
1021                &physical_indices,
1022                physical_values_array,
1023            );
1024
1025            // test for offset = total_len - slice_len and slice length = slice_len
1026            // slice the input array using which the run array was built.
1027            let sliced_input_array = &input_array[total_len - slice_len..total_len];
1028
1029            // slice the run array
1030            let sliced_run_array: RunArray<Int16Type> = run_array
1031                .slice(total_len - slice_len, slice_len)
1032                .into_data()
1033                .into();
1034
1035            // Get physical indices
1036            let physical_indices = sliced_run_array
1037                .get_physical_indices(&logical_indices)
1038                .unwrap();
1039
1040            compare_logical_and_physical_indices(
1041                &logical_indices,
1042                sliced_input_array,
1043                &physical_indices,
1044                physical_values_array,
1045            );
1046        }
1047    }
1048
1049    #[test]
1050    fn test_logical_nulls() {
1051        let run = Int32Array::from(vec![3, 6, 9, 12]);
1052        let values = Int32Array::from(vec![Some(0), None, Some(1), None]);
1053        let array = RunArray::try_new(&run, &values).unwrap();
1054
1055        let expected = [
1056            true, true, true, false, false, false, true, true, true, false, false, false,
1057        ];
1058
1059        let n = array.logical_nulls().unwrap();
1060        assert_eq!(n.null_count(), 6);
1061
1062        let slices = [(0, 12), (0, 2), (2, 5), (3, 0), (3, 3), (3, 4), (4, 8)];
1063        for (offset, length) in slices {
1064            let a = array.slice(offset, length);
1065            let n = a.logical_nulls().unwrap();
1066            let n = n.into_iter().collect::<Vec<_>>();
1067            assert_eq!(&n, &expected[offset..offset + length], "{offset} {length}");
1068        }
1069    }
1070
1071    #[test]
1072    fn test_run_array_eq_identical() {
1073        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1074        let values1 = StringArray::from(vec!["a", "b", "c"]);
1075        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1076
1077        let run_ends2 = Int32Array::from(vec![2, 4, 6]);
1078        let values2 = StringArray::from(vec!["a", "b", "c"]);
1079        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1080
1081        assert_eq!(array1, array2);
1082    }
1083
1084    #[test]
1085    fn test_run_array_ne_different_run_ends() {
1086        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1087        let values1 = StringArray::from(vec!["a", "b", "c"]);
1088        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1089
1090        let run_ends2 = Int32Array::from(vec![1, 4, 6]);
1091        let values2 = StringArray::from(vec!["a", "b", "c"]);
1092        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1093
1094        assert_ne!(array1, array2);
1095    }
1096
1097    #[test]
1098    fn test_run_array_ne_different_values() {
1099        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1100        let values1 = StringArray::from(vec!["a", "b", "c"]);
1101        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1102
1103        let run_ends2 = Int32Array::from(vec![2, 4, 6]);
1104        let values2 = StringArray::from(vec!["a", "b", "d"]);
1105        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1106
1107        assert_ne!(array1, array2);
1108    }
1109
1110    #[test]
1111    fn test_run_array_eq_with_nulls() {
1112        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1113        let values1 = StringArray::from(vec![Some("a"), None, Some("c")]);
1114        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1115
1116        let run_ends2 = Int32Array::from(vec![2, 4, 6]);
1117        let values2 = StringArray::from(vec![Some("a"), None, Some("c")]);
1118        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1119
1120        assert_eq!(array1, array2);
1121    }
1122
1123    #[test]
1124    fn test_run_array_eq_different_run_end_types() {
1125        let run_ends_i16_1 = Int16Array::from(vec![2_i16, 4, 6]);
1126        let values_i16_1 = StringArray::from(vec!["a", "b", "c"]);
1127        let array_i16_1 = RunArray::<Int16Type>::try_new(&run_ends_i16_1, &values_i16_1).unwrap();
1128
1129        let run_ends_i16_2 = Int16Array::from(vec![2_i16, 4, 6]);
1130        let values_i16_2 = StringArray::from(vec!["a", "b", "c"]);
1131        let array_i16_2 = RunArray::<Int16Type>::try_new(&run_ends_i16_2, &values_i16_2).unwrap();
1132
1133        assert_eq!(array_i16_1, array_i16_2);
1134    }
1135}
arrow_array/array/run_array.rs

arrow_array/array/
run_array.rs