Skip to main content

arrow_array/array/
dictionary_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::builder::{PrimitiveDictionaryBuilder, StringDictionaryBuilder};
19use crate::cast::AsArray;
20use crate::iterator::ArrayIter;
21use crate::types::*;
22use crate::{
23    Array, ArrayAccessor, ArrayRef, ArrowNativeTypeOp, PrimitiveArray, Scalar, StringArray,
24    make_array,
25};
26use arrow_buffer::bit_util::set_bit;
27use arrow_buffer::buffer::NullBuffer;
28use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder, ScalarBuffer};
29use arrow_data::ArrayData;
30use arrow_schema::{ArrowError, DataType};
31use std::any::Any;
32use std::sync::Arc;
33
34/// A [`DictionaryArray`] indexed by `i8`
35///
36/// # Example: Using `collect`
37/// ```
38/// # use arrow_array::{Array, Int8DictionaryArray, Int8Array, StringArray};
39/// # use std::sync::Arc;
40///
41/// let array: Int8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
42/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
43/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
44/// assert_eq!(array.values(), &values);
45/// ```
46///
47/// See [`DictionaryArray`] for more information and examples
48pub type Int8DictionaryArray = DictionaryArray<Int8Type>;
49
50/// A [`DictionaryArray`] indexed by `i16`
51///
52/// # Example: Using `collect`
53/// ```
54/// # use arrow_array::{Array, Int16DictionaryArray, Int16Array, StringArray};
55/// # use std::sync::Arc;
56///
57/// let array: Int16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
58/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
59/// assert_eq!(array.keys(), &Int16Array::from(vec![0, 0, 1, 2]));
60/// assert_eq!(array.values(), &values);
61/// ```
62///
63/// See [`DictionaryArray`] for more information and examples
64pub type Int16DictionaryArray = DictionaryArray<Int16Type>;
65
66/// A [`DictionaryArray`] indexed by `i32`
67///
68/// # Example: Using `collect`
69/// ```
70/// # use arrow_array::{Array, Int32DictionaryArray, Int32Array, StringArray};
71/// # use std::sync::Arc;
72///
73/// let array: Int32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
74/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
75/// assert_eq!(array.keys(), &Int32Array::from(vec![0, 0, 1, 2]));
76/// assert_eq!(array.values(), &values);
77/// ```
78///
79/// See [`DictionaryArray`] for more information and examples
80pub type Int32DictionaryArray = DictionaryArray<Int32Type>;
81
82/// A [`DictionaryArray`] indexed by `i64`
83///
84/// # Example: Using `collect`
85/// ```
86/// # use arrow_array::{Array, Int64DictionaryArray, Int64Array, StringArray};
87/// # use std::sync::Arc;
88///
89/// let array: Int64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
90/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
91/// assert_eq!(array.keys(), &Int64Array::from(vec![0, 0, 1, 2]));
92/// assert_eq!(array.values(), &values);
93/// ```
94///
95/// See [`DictionaryArray`] for more information and examples
96pub type Int64DictionaryArray = DictionaryArray<Int64Type>;
97
98/// A [`DictionaryArray`] indexed by `u8`
99///
100/// # Example: Using `collect`
101/// ```
102/// # use arrow_array::{Array, UInt8DictionaryArray, UInt8Array, StringArray};
103/// # use std::sync::Arc;
104///
105/// let array: UInt8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
106/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
107/// assert_eq!(array.keys(), &UInt8Array::from(vec![0, 0, 1, 2]));
108/// assert_eq!(array.values(), &values);
109/// ```
110///
111/// See [`DictionaryArray`] for more information and examples
112pub type UInt8DictionaryArray = DictionaryArray<UInt8Type>;
113
114/// A [`DictionaryArray`] indexed by `u16`
115///
116/// # Example: Using `collect`
117/// ```
118/// # use arrow_array::{Array, UInt16DictionaryArray, UInt16Array, StringArray};
119/// # use std::sync::Arc;
120///
121/// let array: UInt16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
122/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
123/// assert_eq!(array.keys(), &UInt16Array::from(vec![0, 0, 1, 2]));
124/// assert_eq!(array.values(), &values);
125/// ```
126///
127/// See [`DictionaryArray`] for more information and examples
128pub type UInt16DictionaryArray = DictionaryArray<UInt16Type>;
129
130/// A [`DictionaryArray`] indexed by `u32`
131///
132/// # Example: Using `collect`
133/// ```
134/// # use arrow_array::{Array, UInt32DictionaryArray, UInt32Array, StringArray};
135/// # use std::sync::Arc;
136///
137/// let array: UInt32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
138/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
139/// assert_eq!(array.keys(), &UInt32Array::from(vec![0, 0, 1, 2]));
140/// assert_eq!(array.values(), &values);
141/// ```
142///
143/// See [`DictionaryArray`] for more information and examples
144pub type UInt32DictionaryArray = DictionaryArray<UInt32Type>;
145
146/// A [`DictionaryArray`] indexed by `u64`
147///
148/// # Example: Using `collect`
149/// ```
150/// # use arrow_array::{Array, UInt64DictionaryArray, UInt64Array, StringArray};
151/// # use std::sync::Arc;
152///
153/// let array: UInt64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
154/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
155/// assert_eq!(array.keys(), &UInt64Array::from(vec![0, 0, 1, 2]));
156/// assert_eq!(array.values(), &values);
157/// ```
158///
159/// See [`DictionaryArray`] for more information and examples
160pub type UInt64DictionaryArray = DictionaryArray<UInt64Type>;
161
162/// An array of [dictionary encoded values](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout)
163///
164/// This is mostly used to represent strings or a limited set of primitive types as integers,
165/// for example when doing NLP analysis or representing chromosomes by name.
166///
167/// [`DictionaryArray`] are represented using a `keys` array and a
168/// `values` array, which may be different lengths. The `keys` array
169/// stores indexes in the `values` array which holds
170/// the corresponding logical value, as shown here:
171///
172/// ```text
173/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
174///   ┌─────────────────┐  ┌─────────┐ │     ┌─────────────────┐
175/// │ │        A        │  │    0    │       │        A        │     values[keys[0]]
176///   ├─────────────────┤  ├─────────┤ │     ├─────────────────┤
177/// │ │        D        │  │    2    │       │        B        │     values[keys[1]]
178///   ├─────────────────┤  ├─────────┤ │     ├─────────────────┤
179/// │ │        B        │  │    2    │       │        B        │     values[keys[2]]
180///   └─────────────────┘  ├─────────┤ │     ├─────────────────┤
181/// │                      │    1    │       │        D        │     values[keys[3]]
182///                        ├─────────┤ │     ├─────────────────┤
183/// │                      │    1    │       │        D        │     values[keys[4]]
184///                        ├─────────┤ │     ├─────────────────┤
185/// │                      │    0    │       │        A        │     values[keys[5]]
186///                        └─────────┘ │     └─────────────────┘
187/// │       values            keys
188///  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
189///                                             Logical array
190///                                                Contents
191///           DictionaryArray
192///              length = 6
193/// ```
194///
195/// # Example: From Nullable Data
196///
197/// ```
198/// # use arrow_array::{DictionaryArray, Int8Array, types::Int8Type};
199/// let test = vec!["a", "a", "b", "c"];
200/// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect();
201/// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)]));
202/// ```
203///
204/// # Example: From Non-Nullable Data
205///
206/// ```
207/// # use arrow_array::{DictionaryArray, Int8Array, types::Int8Type};
208/// let test = vec!["a", "a", "b", "c"];
209/// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
210/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
211/// ```
212///
213/// # Example: From Existing Arrays
214///
215/// ```
216/// # use std::sync::Arc;
217/// # use arrow_array::{DictionaryArray, Int8Array, StringArray, types::Int8Type};
218/// // You can form your own DictionaryArray by providing the
219/// // values (dictionary) and keys (indexes into the dictionary):
220/// let values = StringArray::from_iter_values(["a", "b", "c"]);
221/// let keys = Int8Array::from_iter_values([0, 0, 1, 2]);
222/// let array = DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
223/// let expected: DictionaryArray::<Int8Type> = vec!["a", "a", "b", "c"].into_iter().collect();
224/// assert_eq!(&array, &expected);
225/// ```
226///
227/// # Example: Using Builder
228///
229/// ```
230/// # use arrow_array::{Array, StringArray};
231/// # use arrow_array::builder::StringDictionaryBuilder;
232/// # use arrow_array::types::Int32Type;
233/// let mut builder = StringDictionaryBuilder::<Int32Type>::new();
234/// builder.append_value("a");
235/// builder.append_null();
236/// builder.append_value("a");
237/// builder.append_value("b");
238/// let array = builder.finish();
239///
240/// let values: Vec<_> = array.downcast_dict::<StringArray>().unwrap().into_iter().collect();
241/// assert_eq!(&values, &[Some("a"), None, Some("a"), Some("b")]);
242/// ```
243pub struct DictionaryArray<K: ArrowDictionaryKeyType> {
244    data_type: DataType,
245
246    /// The keys of this dictionary. These are constructed from the
247    /// buffer and null bitmap of `data`.  Also, note that these do
248    /// not correspond to the true values of this array. Rather, they
249    /// map to the real values.
250    keys: PrimitiveArray<K>,
251
252    /// Array of dictionary values (can be any DataType).
253    values: ArrayRef,
254
255    /// Values are ordered.
256    is_ordered: bool,
257}
258
259impl<K: ArrowDictionaryKeyType> Clone for DictionaryArray<K> {
260    fn clone(&self) -> Self {
261        Self {
262            data_type: self.data_type.clone(),
263            keys: self.keys.clone(),
264            values: self.values.clone(),
265            is_ordered: self.is_ordered,
266        }
267    }
268}
269
270impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
271    /// Attempt to create a new DictionaryArray with a specified keys
272    /// (indexes into the dictionary) and values (dictionary)
273    /// array.
274    ///
275    /// # Panics
276    ///
277    /// Panics if [`Self::try_new`] returns an error
278    pub fn new(keys: PrimitiveArray<K>, values: ArrayRef) -> Self {
279        Self::try_new(keys, values).unwrap()
280    }
281
282    /// Attempt to create a new DictionaryArray with a specified keys
283    /// (indexes into the dictionary) and values (dictionary)
284    /// array.
285    ///
286    /// # Errors
287    ///
288    /// Returns an error if any `keys[i] >= values.len() || keys[i] < 0`
289    pub fn try_new(keys: PrimitiveArray<K>, values: ArrayRef) -> Result<Self, ArrowError> {
290        let data_type = DataType::Dictionary(
291            Box::new(keys.data_type().clone()),
292            Box::new(values.data_type().clone()),
293        );
294
295        // // we can skip the validating the keys if they are all null
296        let all_null = keys.null_count() == keys.len();
297
298        if !all_null {
299            let zero = K::Native::usize_as(0);
300            let values_len = values.len();
301
302            if let Some((idx, v)) = keys.values().iter().enumerate().find(|(idx, v)| {
303                (v.is_lt(zero) || v.as_usize() >= values_len) && keys.is_valid(*idx)
304            }) {
305                return Err(ArrowError::InvalidArgumentError(format!(
306                    "Invalid dictionary key {v:?} at index {idx}, expected 0 <= key < {values_len}",
307                )));
308            }
309        }
310
311        Ok(Self {
312            data_type,
313            keys,
314            values,
315            is_ordered: false,
316        })
317    }
318
319    /// Create a new [`Scalar`] from `value`
320    pub fn new_scalar<T: Array + 'static>(value: Scalar<T>) -> Scalar<Self> {
321        Scalar::new(Self::new(
322            PrimitiveArray::new(vec![K::Native::usize_as(0)].into(), None),
323            Arc::new(value.into_inner()),
324        ))
325    }
326
327    /// Create a new [`DictionaryArray`] without performing validation
328    ///
329    /// # Safety
330    ///
331    /// Safe provided [`Self::try_new`] would not return an error
332    pub unsafe fn new_unchecked(keys: PrimitiveArray<K>, values: ArrayRef) -> Self {
333        if cfg!(feature = "force_validate") {
334            return Self::new(keys, values);
335        }
336
337        let data_type = DataType::Dictionary(
338            Box::new(keys.data_type().clone()),
339            Box::new(values.data_type().clone()),
340        );
341
342        Self {
343            data_type,
344            keys,
345            values,
346            is_ordered: false,
347        }
348    }
349
350    /// Deconstruct this array into its constituent parts
351    pub fn into_parts(self) -> (PrimitiveArray<K>, ArrayRef) {
352        (self.keys, self.values)
353    }
354
355    /// Return an array view of the keys of this dictionary as a PrimitiveArray.
356    pub fn keys(&self) -> &PrimitiveArray<K> {
357        &self.keys
358    }
359
360    /// If `value` is present in `values` (aka the dictionary),
361    /// returns the corresponding key (index into the `values`
362    /// array). Otherwise returns `None`.
363    ///
364    /// Panics if `values` is not a [`StringArray`].
365    pub fn lookup_key(&self, value: &str) -> Option<K::Native> {
366        let rd_buf: &StringArray = self.values.as_any().downcast_ref::<StringArray>().unwrap();
367
368        (0..rd_buf.len())
369            .position(|i| rd_buf.value(i) == value)
370            .and_then(K::Native::from_usize)
371    }
372
373    /// Returns a reference to the dictionary values array
374    pub fn values(&self) -> &ArrayRef {
375        &self.values
376    }
377
378    /// Returns a clone of the value type of this list.
379    pub fn value_type(&self) -> DataType {
380        self.values.data_type().clone()
381    }
382
383    /// The length of the dictionary is the length of the keys array.
384    pub fn len(&self) -> usize {
385        self.keys.len()
386    }
387
388    /// Whether this dictionary is empty
389    pub fn is_empty(&self) -> bool {
390        self.keys.is_empty()
391    }
392
393    /// Currently exists for compatibility purposes with Arrow IPC.
394    pub fn is_ordered(&self) -> bool {
395        self.is_ordered
396    }
397
398    /// Return an iterator over the keys (indexes into the dictionary)
399    pub fn keys_iter(&self) -> impl Iterator<Item = Option<usize>> + '_ {
400        self.keys.iter().map(|key| key.map(|k| k.as_usize()))
401    }
402
403    /// Return the value of `keys` (the dictionary key) at index `i`,
404    /// cast to `usize`, `None` if the value at `i` is `NULL`.
405    pub fn key(&self, i: usize) -> Option<usize> {
406        self.keys.is_valid(i).then(|| self.keys.value(i).as_usize())
407    }
408
409    /// Returns a zero-copy slice of this array with the indicated offset and length.
410    pub fn slice(&self, offset: usize, length: usize) -> Self {
411        Self {
412            data_type: self.data_type.clone(),
413            keys: self.keys.slice(offset, length),
414            values: self.values.clone(),
415            is_ordered: self.is_ordered,
416        }
417    }
418
419    /// Downcast this dictionary to a [`TypedDictionaryArray`]
420    ///
421    /// ```
422    /// use arrow_array::{Array, ArrayAccessor, DictionaryArray, StringArray, types::Int32Type};
423    ///
424    /// let orig = [Some("a"), Some("b"), None];
425    /// let dictionary = DictionaryArray::<Int32Type>::from_iter(orig);
426    /// let typed = dictionary.downcast_dict::<StringArray>().unwrap();
427    /// assert_eq!(typed.value(0), "a");
428    /// assert_eq!(typed.value(1), "b");
429    /// assert!(typed.is_null(2));
430    /// ```
431    ///
432    pub fn downcast_dict<V: 'static>(&self) -> Option<TypedDictionaryArray<'_, K, V>> {
433        let values = self.values.as_any().downcast_ref()?;
434        Some(TypedDictionaryArray {
435            dictionary: self,
436            values,
437        })
438    }
439
440    /// Returns a new dictionary with the same keys as the current instance
441    /// but with a different set of dictionary values
442    ///
443    /// This can be used to perform an operation on the values of a dictionary
444    ///
445    /// # Panics
446    ///
447    /// Panics if `values` has a length less than the current values
448    ///
449    /// ```
450    /// # use std::sync::Arc;
451    /// # use arrow_array::builder::PrimitiveDictionaryBuilder;
452    /// # use arrow_array::{Int8Array, Int64Array, ArrayAccessor};
453    /// # use arrow_array::types::{Int32Type, Int8Type};
454    ///
455    /// // Construct a Dict(Int32, Int8)
456    /// let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int8Type>::with_capacity(2, 200);
457    /// for i in 0..100 {
458    ///     builder.append(i % 2).unwrap();
459    /// }
460    ///
461    /// let dictionary = builder.finish();
462    ///
463    /// // Perform a widening cast of dictionary values
464    /// let typed_dictionary = dictionary.downcast_dict::<Int8Array>().unwrap();
465    /// let values: Int64Array = typed_dictionary.values().unary(|x| x as i64);
466    ///
467    /// // Create a Dict(Int32,
468    /// let new = dictionary.with_values(Arc::new(values));
469    ///
470    /// // Verify values are as expected
471    /// let new_typed = new.downcast_dict::<Int64Array>().unwrap();
472    /// for i in 0..100 {
473    ///     assert_eq!(new_typed.value(i), (i % 2) as i64)
474    /// }
475    /// ```
476    ///
477    pub fn with_values(&self, values: ArrayRef) -> Self {
478        assert!(values.len() >= self.values.len());
479        let data_type =
480            DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
481        Self {
482            data_type,
483            keys: self.keys.clone(),
484            values,
485            is_ordered: false,
486        }
487    }
488
489    /// Returns `PrimitiveDictionaryBuilder` of this dictionary array for mutating
490    /// its keys and values if the underlying data buffer is not shared by others.
491    #[allow(clippy::result_large_err)]
492    pub fn into_primitive_dict_builder<V>(self) -> Result<PrimitiveDictionaryBuilder<K, V>, Self>
493    where
494        V: ArrowPrimitiveType,
495    {
496        if !self.value_type().is_primitive() {
497            return Err(self);
498        }
499
500        let key_array = self.keys().clone();
501        let value_array = self.values().as_primitive::<V>().clone();
502
503        drop(self.keys);
504        drop(self.values);
505
506        let key_builder = key_array.into_builder();
507        let value_builder = value_array.into_builder();
508
509        match (key_builder, value_builder) {
510            (Ok(key_builder), Ok(value_builder)) => Ok(unsafe {
511                PrimitiveDictionaryBuilder::new_from_builders(key_builder, value_builder)
512            }),
513            (Err(key_array), Ok(mut value_builder)) => {
514                Err(Self::try_new(key_array, Arc::new(value_builder.finish())).unwrap())
515            }
516            (Ok(mut key_builder), Err(value_array)) => {
517                Err(Self::try_new(key_builder.finish(), Arc::new(value_array)).unwrap())
518            }
519            (Err(key_array), Err(value_array)) => {
520                Err(Self::try_new(key_array, Arc::new(value_array)).unwrap())
521            }
522        }
523    }
524
525    /// Applies an unary and infallible function to a mutable dictionary array.
526    /// Mutable dictionary array means that the buffers are not shared with other arrays.
527    /// As a result, this mutates the buffers directly without allocating new buffers.
528    ///
529    /// # Implementation
530    ///
531    /// This will apply the function for all dictionary values, including those on null slots.
532    /// This implies that the operation must be infallible for any value of the corresponding type
533    /// or this function may panic.
534    /// # Example
535    /// ```
536    /// # use std::sync::Arc;
537    /// # use arrow_array::{Array, ArrayAccessor, DictionaryArray, StringArray, types::{Int8Type, Int32Type}};
538    /// # use arrow_array::{Int8Array, Int32Array};
539    /// let values = Int32Array::from(vec![Some(10), Some(20), None]);
540    /// let keys = Int8Array::from_iter_values([0, 0, 1, 2]);
541    /// let dictionary = DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
542    /// let c = dictionary.unary_mut::<_, Int32Type>(|x| x + 1).unwrap();
543    /// let typed = c.downcast_dict::<Int32Array>().unwrap();
544    /// assert_eq!(typed.value(0), 11);
545    /// assert_eq!(typed.value(1), 11);
546    /// assert_eq!(typed.value(2), 21);
547    /// ```
548    #[allow(clippy::result_large_err)]
549    pub fn unary_mut<F, V>(self, op: F) -> Result<DictionaryArray<K>, DictionaryArray<K>>
550    where
551        V: ArrowPrimitiveType,
552        F: Fn(V::Native) -> V::Native,
553    {
554        let mut builder: PrimitiveDictionaryBuilder<K, V> = self.into_primitive_dict_builder()?;
555        builder
556            .values_slice_mut()
557            .iter_mut()
558            .for_each(|v| *v = op(*v));
559        Ok(builder.finish())
560    }
561
562    /// Computes an occupancy mask for this dictionary's values
563    ///
564    /// For each value in [`Self::values`] the corresponding bit will be set in the
565    /// returned mask if it is referenced by a key in this [`DictionaryArray`]
566    pub fn occupancy(&self) -> BooleanBuffer {
567        let len = self.values.len();
568        let mut builder = BooleanBufferBuilder::new(len);
569        builder.resize(len);
570        let slice = builder.as_slice_mut();
571        match self.keys.nulls().filter(|n| n.null_count() > 0) {
572            Some(n) => {
573                let v = self.keys.values();
574                n.valid_indices()
575                    .for_each(|idx| set_bit(slice, v[idx].as_usize()))
576            }
577            None => {
578                let v = self.keys.values();
579                v.iter().for_each(|v| set_bit(slice, v.as_usize()))
580            }
581        }
582        builder.finish()
583    }
584}
585
586/// Constructs a `DictionaryArray` from an `ArrayData`
587impl<T: ArrowDictionaryKeyType> From<ArrayData> for DictionaryArray<T> {
588    fn from(data: ArrayData) -> Self {
589        let (data_type, len, nulls, offset, mut buffers, mut child_data) = data.into_parts();
590
591        assert_eq!(
592            buffers.len(),
593            1,
594            "DictionaryArray data should contain a single buffer only (keys)."
595        );
596        let buffer = buffers.pop().expect("checked above");
597        assert_eq!(
598            child_data.len(),
599            1,
600            "DictionaryArray should contain a single child array (values)."
601        );
602        let cd = child_data.pop().expect("checked above");
603
604        if let DataType::Dictionary(key_data_type, _) = &data_type {
605            assert_eq!(
606                &T::DATA_TYPE,
607                key_data_type.as_ref(),
608                "DictionaryArray's data type must match, expected {} got {}",
609                T::DATA_TYPE,
610                key_data_type
611            );
612
613            let values = make_array(cd);
614
615            // create a zero-copy of the keys' data
616            let keys = PrimitiveArray::<T>::new(ScalarBuffer::new(buffer, offset, len), nulls);
617
618            Self {
619                data_type,
620                keys,
621                values,
622                is_ordered: false,
623            }
624        } else {
625            panic!("DictionaryArray must have Dictionary data type.")
626        }
627    }
628}
629
630impl<T: ArrowDictionaryKeyType> From<DictionaryArray<T>> for ArrayData {
631    fn from(array: DictionaryArray<T>) -> Self {
632        let builder = array
633            .keys
634            .into_data()
635            .into_builder()
636            .data_type(array.data_type)
637            .child_data(vec![array.values.to_data()]);
638
639        unsafe { builder.build_unchecked() }
640    }
641}
642
643/// Constructs a `DictionaryArray` from an iterator of optional strings.
644///
645/// # Example:
646/// ```
647/// use arrow_array::{DictionaryArray, PrimitiveArray, StringArray, types::Int8Type};
648///
649/// let test = vec!["a", "a", "b", "c"];
650/// let array: DictionaryArray<Int8Type> = test
651///     .iter()
652///     .map(|&x| if x == "b" { None } else { Some(x) })
653///     .collect();
654/// assert_eq!(
655///     "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n  0,\n  0,\n  null,\n  1,\n] values: StringArray\n[\n  \"a\",\n  \"c\",\n]}\n",
656///     format!("{:?}", array)
657/// );
658/// ```
659impl<'a, T: ArrowDictionaryKeyType> FromIterator<Option<&'a str>> for DictionaryArray<T> {
660    fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self {
661        let it = iter.into_iter();
662        let (lower, _) = it.size_hint();
663        let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024);
664        builder.extend(it);
665        builder.finish()
666    }
667}
668
669/// Constructs a `DictionaryArray` from an iterator of strings.
670///
671/// # Example:
672///
673/// ```
674/// use arrow_array::{DictionaryArray, PrimitiveArray, StringArray, types::Int8Type};
675///
676/// let test = vec!["a", "a", "b", "c"];
677/// let array: DictionaryArray<Int8Type> = test.into_iter().collect();
678/// assert_eq!(
679///     "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n  0,\n  0,\n  1,\n  2,\n] values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
680///     format!("{:?}", array)
681/// );
682/// ```
683impl<'a, T: ArrowDictionaryKeyType> FromIterator<&'a str> for DictionaryArray<T> {
684    fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
685        let it = iter.into_iter();
686        let (lower, _) = it.size_hint();
687        let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024);
688        it.for_each(|i| {
689            builder
690                .append(i)
691                .expect("Unable to append a value to a dictionary array.");
692        });
693
694        builder.finish()
695    }
696}
697
698/// SAFETY: Correctly implements the contract of Arrow Arrays
699unsafe impl<T: ArrowDictionaryKeyType> Array for DictionaryArray<T> {
700    fn as_any(&self) -> &dyn Any {
701        self
702    }
703
704    fn to_data(&self) -> ArrayData {
705        self.clone().into()
706    }
707
708    fn into_data(self) -> ArrayData {
709        self.into()
710    }
711
712    fn data_type(&self) -> &DataType {
713        &self.data_type
714    }
715
716    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
717        Arc::new(self.slice(offset, length))
718    }
719
720    fn len(&self) -> usize {
721        self.keys.len()
722    }
723
724    fn is_empty(&self) -> bool {
725        self.keys.is_empty()
726    }
727
728    fn shrink_to_fit(&mut self) {
729        self.keys.shrink_to_fit();
730        self.values.shrink_to_fit();
731    }
732
733    fn offset(&self) -> usize {
734        self.keys.offset()
735    }
736
737    fn nulls(&self) -> Option<&NullBuffer> {
738        self.keys.nulls()
739    }
740
741    fn logical_nulls(&self) -> Option<NullBuffer> {
742        match self.values.logical_nulls() {
743            None => self.nulls().cloned(),
744            Some(value_nulls) => {
745                let mut builder = BooleanBufferBuilder::new(self.len());
746                match self.keys.nulls() {
747                    Some(n) => builder.append_buffer(n.inner()),
748                    None => builder.append_n(self.len(), true),
749                }
750                for (idx, k) in self.keys.values().iter().enumerate() {
751                    let k = k.as_usize();
752                    // Check range to allow for nulls
753                    if k < value_nulls.len() && value_nulls.is_null(k) {
754                        builder.set_bit(idx, false);
755                    }
756                }
757                Some(builder.finish().into())
758            }
759        }
760    }
761
762    fn logical_null_count(&self) -> usize {
763        match (self.keys.nulls(), self.values.logical_nulls()) {
764            (None, None) => 0,
765            (Some(key_nulls), None) => key_nulls.null_count(),
766            (None, Some(value_nulls)) => self
767                .keys
768                .values()
769                .iter()
770                .filter(|k| value_nulls.is_null(k.as_usize()))
771                .count(),
772            (Some(key_nulls), Some(value_nulls)) => self
773                .keys
774                .values()
775                .iter()
776                .enumerate()
777                .filter(|(idx, k)| key_nulls.is_null(*idx) || value_nulls.is_null(k.as_usize()))
778                .count(),
779        }
780    }
781
782    fn is_nullable(&self) -> bool {
783        !self.is_empty() && (self.nulls().is_some() || self.values.is_nullable())
784    }
785
786    fn get_buffer_memory_size(&self) -> usize {
787        self.keys.get_buffer_memory_size() + self.values.get_buffer_memory_size()
788    }
789
790    fn get_array_memory_size(&self) -> usize {
791        std::mem::size_of::<Self>()
792            + self.keys.get_buffer_memory_size()
793            + self.values.get_array_memory_size()
794    }
795
796    #[cfg(feature = "pool")]
797    fn claim(&self, pool: &dyn arrow_buffer::MemoryPool) {
798        self.keys.claim(pool);
799        self.values.claim(pool);
800    }
801}
802
803impl<T: ArrowDictionaryKeyType> std::fmt::Debug for DictionaryArray<T> {
804    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
805        writeln!(
806            f,
807            "DictionaryArray {{keys: {:?} values: {:?}}}",
808            self.keys, self.values
809        )
810    }
811}
812
813/// A [`DictionaryArray`] typed on its child values array
814///
815/// Implements [`ArrayAccessor`] allowing fast access to its elements
816///
817/// ```
818/// use arrow_array::{DictionaryArray, StringArray, types::Int32Type};
819///
820/// let orig = ["a", "b", "a", "b"];
821/// let dictionary = DictionaryArray::<Int32Type>::from_iter(orig);
822///
823/// // `TypedDictionaryArray` allows you to access the values directly
824/// let typed = dictionary.downcast_dict::<StringArray>().unwrap();
825///
826/// for (maybe_val, orig) in typed.into_iter().zip(orig) {
827///     assert_eq!(maybe_val.unwrap(), orig)
828/// }
829/// ```
830pub struct TypedDictionaryArray<'a, K: ArrowDictionaryKeyType, V> {
831    /// The dictionary array
832    dictionary: &'a DictionaryArray<K>,
833    /// The values of the dictionary
834    values: &'a V,
835}
836
837// Manually implement `Clone` to avoid `V: Clone` type constraint
838impl<K: ArrowDictionaryKeyType, V> Clone for TypedDictionaryArray<'_, K, V> {
839    fn clone(&self) -> Self {
840        *self
841    }
842}
843
844impl<K: ArrowDictionaryKeyType, V> Copy for TypedDictionaryArray<'_, K, V> {}
845
846impl<K: ArrowDictionaryKeyType, V> std::fmt::Debug for TypedDictionaryArray<'_, K, V> {
847    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
848        writeln!(f, "TypedDictionaryArray({:?})", self.dictionary)
849    }
850}
851
852impl<'a, K: ArrowDictionaryKeyType, V> TypedDictionaryArray<'a, K, V> {
853    /// Returns the keys of this [`TypedDictionaryArray`]
854    pub fn keys(&self) -> &'a PrimitiveArray<K> {
855        self.dictionary.keys()
856    }
857
858    /// Returns the values of this [`TypedDictionaryArray`]
859    pub fn values(&self) -> &'a V {
860        self.values
861    }
862}
863
864unsafe impl<K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'_, K, V> {
865    fn as_any(&self) -> &dyn Any {
866        self.dictionary
867    }
868
869    fn to_data(&self) -> ArrayData {
870        self.dictionary.to_data()
871    }
872
873    fn into_data(self) -> ArrayData {
874        self.dictionary.into_data()
875    }
876
877    fn data_type(&self) -> &DataType {
878        self.dictionary.data_type()
879    }
880
881    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
882        Arc::new(self.dictionary.slice(offset, length))
883    }
884
885    fn len(&self) -> usize {
886        self.dictionary.len()
887    }
888
889    fn is_empty(&self) -> bool {
890        self.dictionary.is_empty()
891    }
892
893    fn offset(&self) -> usize {
894        self.dictionary.offset()
895    }
896
897    fn nulls(&self) -> Option<&NullBuffer> {
898        self.dictionary.nulls()
899    }
900
901    fn logical_nulls(&self) -> Option<NullBuffer> {
902        self.dictionary.logical_nulls()
903    }
904
905    fn logical_null_count(&self) -> usize {
906        self.dictionary.logical_null_count()
907    }
908
909    fn is_nullable(&self) -> bool {
910        self.dictionary.is_nullable()
911    }
912
913    fn get_buffer_memory_size(&self) -> usize {
914        self.dictionary.get_buffer_memory_size()
915    }
916
917    fn get_array_memory_size(&self) -> usize {
918        self.dictionary.get_array_memory_size()
919    }
920
921    #[cfg(feature = "pool")]
922    fn claim(&self, pool: &dyn arrow_buffer::MemoryPool) {
923        self.dictionary.claim(pool);
924    }
925}
926
927impl<K, V> IntoIterator for TypedDictionaryArray<'_, K, V>
928where
929    K: ArrowDictionaryKeyType,
930    Self: ArrayAccessor,
931{
932    type Item = Option<<Self as ArrayAccessor>::Item>;
933    type IntoIter = ArrayIter<Self>;
934
935    fn into_iter(self) -> Self::IntoIter {
936        ArrayIter::new(self)
937    }
938}
939
940impl<'a, K, V> ArrayAccessor for TypedDictionaryArray<'a, K, V>
941where
942    K: ArrowDictionaryKeyType,
943    V: Sync + Send,
944    &'a V: ArrayAccessor,
945    <&'a V as ArrayAccessor>::Item: Default,
946{
947    type Item = <&'a V as ArrayAccessor>::Item;
948
949    fn value(&self, index: usize) -> Self::Item {
950        assert!(
951            index < self.len(),
952            "Trying to access an element at index {} from a TypedDictionaryArray of length {}",
953            index,
954            self.len()
955        );
956        unsafe { self.value_unchecked(index) }
957    }
958
959    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
960        let val = unsafe { self.dictionary.keys.value_unchecked(index) };
961        let value_idx = val.as_usize();
962
963        // As dictionary keys are only verified for non-null indexes
964        // we must check the value is within bounds
965        match value_idx < self.values.len() {
966            true => unsafe { self.values.value_unchecked(value_idx) },
967            false => Default::default(),
968        }
969    }
970}
971
972/// A [`DictionaryArray`] with the key type erased
973///
974/// This can be used to efficiently implement kernels for all possible dictionary
975/// keys without needing to create specialized implementations for each key type
976///
977/// For example
978///
979/// ```
980/// # use arrow_array::*;
981/// # use arrow_array::cast::AsArray;
982/// # use arrow_array::builder::PrimitiveDictionaryBuilder;
983/// # use arrow_array::types::*;
984/// # use arrow_schema::ArrowError;
985/// # use std::sync::Arc;
986///
987/// fn to_string(a: &dyn Array) -> Result<ArrayRef, ArrowError> {
988///     if let Some(d) = a.as_any_dictionary_opt() {
989///         // Recursively handle dictionary input
990///         let r = to_string(d.values().as_ref())?;
991///         return Ok(d.with_values(r));
992///     }
993///     downcast_primitive_array! {
994///         a => Ok(Arc::new(a.iter().map(|x| x.map(|x| format!("{x:?}"))).collect::<StringArray>())),
995///         d => Err(ArrowError::InvalidArgumentError(format!("{d:?} not supported")))
996///     }
997/// }
998///
999/// let result = to_string(&Int32Array::from(vec![1, 2, 3])).unwrap();
1000/// let actual = result.as_string::<i32>().iter().map(Option::unwrap).collect::<Vec<_>>();
1001/// assert_eq!(actual, &["1", "2", "3"]);
1002///
1003/// let mut dict = PrimitiveDictionaryBuilder::<Int32Type, UInt16Type>::new();
1004/// dict.extend([Some(1), Some(1), Some(2), Some(3), Some(2)]);
1005/// let dict = dict.finish();
1006///
1007/// let r = to_string(&dict).unwrap();
1008/// let r = r.as_dictionary::<Int32Type>().downcast_dict::<StringArray>().unwrap();
1009/// assert_eq!(r.keys(), dict.keys()); // Keys are the same
1010///
1011/// let actual = r.into_iter().map(Option::unwrap).collect::<Vec<_>>();
1012/// assert_eq!(actual, &["1", "1", "2", "3", "2"]);
1013/// ```
1014///
1015/// See [`AsArray::as_any_dictionary_opt`] and [`AsArray::as_any_dictionary`]
1016pub trait AnyDictionaryArray: Array {
1017    /// Returns the primitive keys of this dictionary as an [`Array`]
1018    fn keys(&self) -> &dyn Array;
1019
1020    /// Returns the values of this dictionary
1021    fn values(&self) -> &ArrayRef;
1022
1023    /// Returns the keys of this dictionary as usize
1024    ///
1025    /// The values for nulls will be arbitrary, but are guaranteed
1026    /// to be in the range `0..self.values.len()`
1027    ///
1028    /// # Panic
1029    ///
1030    /// Panics if `values.len() == 0`
1031    fn normalized_keys(&self) -> Vec<usize>;
1032
1033    /// Create a new [`DictionaryArray`] replacing `values` with the new values
1034    ///
1035    /// See [`DictionaryArray::with_values`]
1036    fn with_values(&self, values: ArrayRef) -> ArrayRef;
1037}
1038
1039impl<K: ArrowDictionaryKeyType> AnyDictionaryArray for DictionaryArray<K> {
1040    fn keys(&self) -> &dyn Array {
1041        &self.keys
1042    }
1043
1044    fn values(&self) -> &ArrayRef {
1045        self.values()
1046    }
1047
1048    fn normalized_keys(&self) -> Vec<usize> {
1049        let v_len = self.values().len();
1050        assert_ne!(v_len, 0);
1051        let iter = self.keys().values().iter();
1052        iter.map(|x| x.as_usize().min(v_len - 1)).collect()
1053    }
1054
1055    fn with_values(&self, values: ArrayRef) -> ArrayRef {
1056        Arc::new(self.with_values(values))
1057    }
1058}
1059
1060#[cfg(test)]
1061mod tests {
1062    use super::*;
1063    use crate::cast::as_dictionary_array;
1064    use crate::{Int8Array, Int16Array, Int32Array, RunArray, UInt8Array};
1065    use arrow_buffer::{Buffer, ToByteSlice};
1066
1067    #[test]
1068    fn test_dictionary_array() {
1069        // Construct a value array
1070        let value_data = ArrayData::builder(DataType::Int8)
1071            .len(8)
1072            .add_buffer(Buffer::from(
1073                [10_i8, 11, 12, 13, 14, 15, 16, 17].to_byte_slice(),
1074            ))
1075            .build()
1076            .unwrap();
1077
1078        // Construct a buffer for value offsets, for the nested array:
1079        let keys = Buffer::from([2_i16, 3, 4].to_byte_slice());
1080
1081        // Construct a dictionary array from the above two
1082        let key_type = DataType::Int16;
1083        let value_type = DataType::Int8;
1084        let dict_data_type = DataType::Dictionary(Box::new(key_type), Box::new(value_type));
1085        let dict_data = ArrayData::builder(dict_data_type.clone())
1086            .len(3)
1087            .add_buffer(keys.clone())
1088            .add_child_data(value_data.clone())
1089            .build()
1090            .unwrap();
1091        let dict_array = Int16DictionaryArray::from(dict_data);
1092
1093        let values = dict_array.values();
1094        assert_eq!(value_data, values.to_data());
1095        assert_eq!(DataType::Int8, dict_array.value_type());
1096        assert_eq!(3, dict_array.len());
1097
1098        // Null count only makes sense in terms of the component arrays.
1099        assert_eq!(0, dict_array.null_count());
1100        assert_eq!(0, dict_array.values().null_count());
1101        assert_eq!(dict_array.keys(), &Int16Array::from(vec![2_i16, 3, 4]));
1102
1103        // Now test with a non-zero offset
1104        let dict_data = ArrayData::builder(dict_data_type)
1105            .len(2)
1106            .offset(1)
1107            .add_buffer(keys)
1108            .add_child_data(value_data.clone())
1109            .build()
1110            .unwrap();
1111        let dict_array = Int16DictionaryArray::from(dict_data);
1112
1113        let values = dict_array.values();
1114        assert_eq!(value_data, values.to_data());
1115        assert_eq!(DataType::Int8, dict_array.value_type());
1116        assert_eq!(2, dict_array.len());
1117        assert_eq!(dict_array.keys(), &Int16Array::from(vec![3_i16, 4]));
1118    }
1119
1120    #[test]
1121    fn test_dictionary_builder_append_many() {
1122        let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::new();
1123
1124        builder.append(1).unwrap();
1125        builder.append_n(2, 2).unwrap();
1126        builder.append_options(None, 2);
1127        builder.append_options(Some(3), 3);
1128
1129        let array = builder.finish();
1130
1131        let values = array
1132            .values()
1133            .as_primitive::<UInt32Type>()
1134            .iter()
1135            .map(Option::unwrap)
1136            .collect::<Vec<_>>();
1137        assert_eq!(values, &[1, 2, 3]);
1138        let keys = array.keys().iter().collect::<Vec<_>>();
1139        assert_eq!(
1140            keys,
1141            &[
1142                Some(0),
1143                Some(1),
1144                Some(1),
1145                None,
1146                None,
1147                Some(2),
1148                Some(2),
1149                Some(2)
1150            ]
1151        );
1152    }
1153
1154    #[test]
1155    fn test_string_dictionary_builder_append_many() {
1156        let mut builder = StringDictionaryBuilder::<Int8Type>::new();
1157
1158        builder.append("a").unwrap();
1159        builder.append_n("b", 2).unwrap();
1160        builder.append_options(None::<&str>, 2);
1161        builder.append_options(Some("c"), 3);
1162
1163        let array = builder.finish();
1164
1165        let values = array
1166            .values()
1167            .as_string::<i32>()
1168            .iter()
1169            .map(Option::unwrap)
1170            .collect::<Vec<_>>();
1171        assert_eq!(values, &["a", "b", "c"]);
1172        let keys = array.keys().iter().collect::<Vec<_>>();
1173        assert_eq!(
1174            keys,
1175            &[
1176                Some(0),
1177                Some(1),
1178                Some(1),
1179                None,
1180                None,
1181                Some(2),
1182                Some(2),
1183                Some(2)
1184            ]
1185        );
1186    }
1187
1188    #[test]
1189    fn test_dictionary_array_fmt_debug() {
1190        let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
1191        builder.append(12345678).unwrap();
1192        builder.append_null();
1193        builder.append(22345678).unwrap();
1194        let array = builder.finish();
1195        assert_eq!(
1196            "DictionaryArray {keys: PrimitiveArray<UInt8>\n[\n  0,\n  null,\n  1,\n] values: PrimitiveArray<UInt32>\n[\n  12345678,\n  22345678,\n]}\n",
1197            format!("{array:?}")
1198        );
1199
1200        let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(20, 2);
1201        for _ in 0..20 {
1202            builder.append(1).unwrap();
1203        }
1204        let array = builder.finish();
1205        assert_eq!(
1206            "DictionaryArray {keys: PrimitiveArray<UInt8>\n[\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n] values: PrimitiveArray<UInt32>\n[\n  1,\n]}\n",
1207            format!("{array:?}")
1208        );
1209    }
1210
1211    #[test]
1212    fn test_dictionary_array_from_iter() {
1213        let test = vec!["a", "a", "b", "c"];
1214        let array: DictionaryArray<Int8Type> = test
1215            .iter()
1216            .map(|&x| if x == "b" { None } else { Some(x) })
1217            .collect();
1218        assert_eq!(
1219            "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n  0,\n  0,\n  null,\n  1,\n] values: StringArray\n[\n  \"a\",\n  \"c\",\n]}\n",
1220            format!("{array:?}")
1221        );
1222
1223        let array: DictionaryArray<Int8Type> = test.into_iter().collect();
1224        assert_eq!(
1225            "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n  0,\n  0,\n  1,\n  2,\n] values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
1226            format!("{array:?}")
1227        );
1228    }
1229
1230    #[test]
1231    fn test_dictionary_array_reverse_lookup_key() {
1232        let test = vec!["a", "a", "b", "c"];
1233        let array: DictionaryArray<Int8Type> = test.into_iter().collect();
1234
1235        assert_eq!(array.lookup_key("c"), Some(2));
1236
1237        // Direction of building a dictionary is the iterator direction
1238        let test = vec!["t3", "t3", "t2", "t2", "t1", "t3", "t4", "t1", "t0"];
1239        let array: DictionaryArray<Int8Type> = test.into_iter().collect();
1240
1241        assert_eq!(array.lookup_key("t1"), Some(2));
1242        assert_eq!(array.lookup_key("non-existent"), None);
1243    }
1244
1245    #[test]
1246    fn test_dictionary_keys_as_primitive_array() {
1247        let test = vec!["a", "b", "c", "a"];
1248        let array: DictionaryArray<Int8Type> = test.into_iter().collect();
1249
1250        let keys = array.keys();
1251        assert_eq!(&DataType::Int8, keys.data_type());
1252        assert_eq!(0, keys.null_count());
1253        assert_eq!(&[0, 1, 2, 0], keys.values());
1254    }
1255
1256    #[test]
1257    fn test_dictionary_keys_as_primitive_array_with_null() {
1258        let test = vec![Some("a"), None, Some("b"), None, None, Some("a")];
1259        let array: DictionaryArray<Int32Type> = test.into_iter().collect();
1260
1261        let keys = array.keys();
1262        assert_eq!(&DataType::Int32, keys.data_type());
1263        assert_eq!(3, keys.null_count());
1264
1265        assert!(keys.is_valid(0));
1266        assert!(!keys.is_valid(1));
1267        assert!(keys.is_valid(2));
1268        assert!(!keys.is_valid(3));
1269        assert!(!keys.is_valid(4));
1270        assert!(keys.is_valid(5));
1271
1272        assert_eq!(0, keys.value(0));
1273        assert_eq!(1, keys.value(2));
1274        assert_eq!(0, keys.value(5));
1275    }
1276
1277    #[test]
1278    fn test_dictionary_all_nulls() {
1279        let test = vec![None, None, None];
1280        let array: DictionaryArray<Int32Type> = test.into_iter().collect();
1281        array
1282            .into_data()
1283            .validate_full()
1284            .expect("All null array has valid array data");
1285    }
1286
1287    #[test]
1288    fn test_dictionary_iter() {
1289        // Construct a value array
1290        let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]);
1291        let keys = Int16Array::from_iter_values([2_i16, 3, 4]);
1292
1293        // Construct a dictionary array from the above two
1294        let dict_array = DictionaryArray::new(keys, Arc::new(values));
1295
1296        let mut key_iter = dict_array.keys_iter();
1297        assert_eq!(2, key_iter.next().unwrap().unwrap());
1298        assert_eq!(3, key_iter.next().unwrap().unwrap());
1299        assert_eq!(4, key_iter.next().unwrap().unwrap());
1300        assert!(key_iter.next().is_none());
1301
1302        let mut iter = dict_array
1303            .values()
1304            .as_any()
1305            .downcast_ref::<Int8Array>()
1306            .unwrap()
1307            .take_iter(dict_array.keys_iter());
1308
1309        assert_eq!(12, iter.next().unwrap().unwrap());
1310        assert_eq!(13, iter.next().unwrap().unwrap());
1311        assert_eq!(14, iter.next().unwrap().unwrap());
1312        assert!(iter.next().is_none());
1313    }
1314
1315    #[test]
1316    fn test_dictionary_iter_with_null() {
1317        let test = vec![Some("a"), None, Some("b"), None, None, Some("a")];
1318        let array: DictionaryArray<Int32Type> = test.into_iter().collect();
1319
1320        let mut iter = array
1321            .values()
1322            .as_any()
1323            .downcast_ref::<StringArray>()
1324            .unwrap()
1325            .take_iter(array.keys_iter());
1326
1327        assert_eq!("a", iter.next().unwrap().unwrap());
1328        assert!(iter.next().unwrap().is_none());
1329        assert_eq!("b", iter.next().unwrap().unwrap());
1330        assert!(iter.next().unwrap().is_none());
1331        assert!(iter.next().unwrap().is_none());
1332        assert_eq!("a", iter.next().unwrap().unwrap());
1333        assert!(iter.next().is_none());
1334    }
1335
1336    #[test]
1337    fn test_dictionary_key() {
1338        let keys = Int8Array::from(vec![Some(2), None, Some(1)]);
1339        let values = StringArray::from(vec!["foo", "bar", "baz", "blarg"]);
1340
1341        let array = DictionaryArray::new(keys, Arc::new(values));
1342        assert_eq!(array.key(0), Some(2));
1343        assert_eq!(array.key(1), None);
1344        assert_eq!(array.key(2), Some(1));
1345    }
1346
1347    #[test]
1348    fn test_try_new() {
1349        let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
1350            .into_iter()
1351            .collect();
1352        let keys: Int32Array = [Some(0), Some(2), None, Some(1)].into_iter().collect();
1353
1354        let array = DictionaryArray::new(keys, Arc::new(values));
1355        assert_eq!(array.keys().data_type(), &DataType::Int32);
1356        assert_eq!(array.values().data_type(), &DataType::Utf8);
1357
1358        assert_eq!(array.null_count(), 1);
1359        assert_eq!(array.logical_null_count(), 1);
1360
1361        assert!(array.keys().is_valid(0));
1362        assert!(array.keys().is_valid(1));
1363        assert!(array.keys().is_null(2));
1364        assert!(array.keys().is_valid(3));
1365
1366        assert_eq!(array.keys().value(0), 0);
1367        assert_eq!(array.keys().value(1), 2);
1368        assert_eq!(array.keys().value(3), 1);
1369
1370        assert_eq!(
1371            "DictionaryArray {keys: PrimitiveArray<Int32>\n[\n  0,\n  2,\n  null,\n  1,\n] values: StringArray\n[\n  \"foo\",\n  \"bar\",\n  \"baz\",\n]}\n",
1372            format!("{array:?}")
1373        );
1374    }
1375
1376    #[test]
1377    #[should_panic(expected = "Invalid dictionary key 3 at index 1, expected 0 <= key < 2")]
1378    fn test_try_new_index_too_large() {
1379        let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
1380        // dictionary only has 2 values, so offset 3 is out of bounds
1381        let keys: Int32Array = [Some(0), Some(3)].into_iter().collect();
1382        DictionaryArray::new(keys, Arc::new(values));
1383    }
1384
1385    #[test]
1386    #[should_panic(expected = "Invalid dictionary key -100 at index 0, expected 0 <= key < 2")]
1387    fn test_try_new_index_too_small() {
1388        let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
1389        let keys: Int32Array = [Some(-100)].into_iter().collect();
1390        DictionaryArray::new(keys, Arc::new(values));
1391    }
1392
1393    #[test]
1394    #[should_panic(expected = "DictionaryArray's data type must match, expected Int64 got Int32")]
1395    fn test_from_array_data_validation() {
1396        let a = DictionaryArray::<Int32Type>::from_iter(["32"]);
1397        let _ = DictionaryArray::<Int64Type>::from(a.into_data());
1398    }
1399
1400    #[test]
1401    fn test_into_primitive_dict_builder() {
1402        let values = Int32Array::from_iter_values([10_i32, 12, 15]);
1403        let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]);
1404
1405        let dict_array = DictionaryArray::new(keys, Arc::new(values));
1406
1407        let boxed: ArrayRef = Arc::new(dict_array);
1408        let col: DictionaryArray<Int8Type> = as_dictionary_array(&boxed).clone();
1409
1410        drop(boxed);
1411
1412        let mut builder = col.into_primitive_dict_builder::<Int32Type>().unwrap();
1413
1414        let slice = builder.values_slice_mut();
1415        assert_eq!(slice, &[10, 12, 15]);
1416
1417        slice[0] = 4;
1418        slice[1] = 2;
1419        slice[2] = 1;
1420
1421        let values = Int32Array::from_iter_values([4_i32, 2, 1]);
1422        let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]);
1423
1424        let expected = DictionaryArray::new(keys, Arc::new(values));
1425
1426        let new_array = builder.finish();
1427        assert_eq!(expected, new_array);
1428    }
1429
1430    #[test]
1431    fn test_into_primitive_dict_builder_cloned_array() {
1432        let values = Int32Array::from_iter_values([10_i32, 12, 15]);
1433        let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]);
1434
1435        let dict_array = DictionaryArray::new(keys, Arc::new(values));
1436
1437        let boxed: ArrayRef = Arc::new(dict_array);
1438
1439        let col: DictionaryArray<Int8Type> = DictionaryArray::<Int8Type>::from(boxed.to_data());
1440        let err = col.into_primitive_dict_builder::<Int32Type>();
1441
1442        let returned = err.unwrap_err();
1443
1444        let values = Int32Array::from_iter_values([10_i32, 12, 15]);
1445        let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]);
1446
1447        let expected = DictionaryArray::new(keys, Arc::new(values));
1448        assert_eq!(expected, returned);
1449    }
1450
1451    #[test]
1452    fn test_occupancy() {
1453        let keys = Int32Array::new((100..200).collect(), None);
1454        let values = Int32Array::from(vec![0; 1024]);
1455        let dict = DictionaryArray::new(keys, Arc::new(values));
1456        for (idx, v) in dict.occupancy().iter().enumerate() {
1457            let expected = (100..200).contains(&idx);
1458            assert_eq!(v, expected, "{idx}");
1459        }
1460
1461        let keys = Int32Array::new(
1462            (0..100).collect(),
1463            Some((0..100).map(|x| x % 4 == 0).collect()),
1464        );
1465        let values = Int32Array::from(vec![0; 1024]);
1466        let dict = DictionaryArray::new(keys, Arc::new(values));
1467        for (idx, v) in dict.occupancy().iter().enumerate() {
1468            let expected = idx % 4 == 0 && idx < 100;
1469            assert_eq!(v, expected, "{idx}");
1470        }
1471    }
1472
1473    #[test]
1474    fn test_iterator_nulls() {
1475        let keys = Int32Array::new(
1476            vec![0, 700, 1, 2].into(),
1477            Some(NullBuffer::from(vec![true, false, true, true])),
1478        );
1479        let values = Int32Array::from(vec![Some(50), None, Some(2)]);
1480        let dict = DictionaryArray::new(keys, Arc::new(values));
1481        let values: Vec<_> = dict
1482            .downcast_dict::<Int32Array>()
1483            .unwrap()
1484            .into_iter()
1485            .collect();
1486        assert_eq!(values, &[Some(50), None, None, Some(2)])
1487    }
1488
1489    #[test]
1490    fn test_logical_nulls() -> Result<(), ArrowError> {
1491        let values = Arc::new(RunArray::try_new(
1492            &Int32Array::from(vec![1, 3, 7]),
1493            &Int32Array::from(vec![Some(1), None, Some(3)]),
1494        )?) as ArrayRef;
1495
1496        // For this test to be meaningful, the values array need to have different nulls and logical nulls
1497        assert_eq!(values.null_count(), 0);
1498        assert_eq!(values.logical_null_count(), 2);
1499
1500        // Construct a trivial dictionary with 1-1 mapping to underlying array
1501        let dictionary = DictionaryArray::<Int8Type>::try_new(
1502            Int8Array::from((0..values.len()).map(|i| i as i8).collect::<Vec<_>>()),
1503            Arc::clone(&values),
1504        )?;
1505
1506        // No keys are null
1507        assert_eq!(dictionary.null_count(), 0);
1508        // Dictionary array values are logically nullable
1509        assert_eq!(dictionary.logical_null_count(), values.logical_null_count());
1510        assert_eq!(dictionary.logical_nulls(), values.logical_nulls());
1511        assert!(dictionary.is_nullable());
1512
1513        // Construct a trivial dictionary with 1-1 mapping to underlying array except that key 0 is nulled out
1514        let dictionary = DictionaryArray::<Int8Type>::try_new(
1515            Int8Array::from(
1516                (0..values.len())
1517                    .map(|i| i as i8)
1518                    .map(|i| if i == 0 { None } else { Some(i) })
1519                    .collect::<Vec<_>>(),
1520            ),
1521            Arc::clone(&values),
1522        )?;
1523
1524        // One key is null
1525        assert_eq!(dictionary.null_count(), 1);
1526
1527        // Dictionary array values are logically nullable
1528        assert_eq!(
1529            dictionary.logical_null_count(),
1530            values.logical_null_count() + 1
1531        );
1532        assert!(dictionary.is_nullable());
1533
1534        Ok(())
1535    }
1536
1537    #[test]
1538    fn test_normalized_keys() {
1539        let values = vec![132, 0, 1].into();
1540        let nulls = NullBuffer::from(vec![false, true, true]);
1541        let keys = Int32Array::new(values, Some(nulls));
1542        let dictionary = DictionaryArray::new(keys, Arc::new(Int32Array::new_null(2)));
1543        assert_eq!(&dictionary.normalized_keys(), &[1, 0, 1])
1544    }
1545
1546    #[test]
1547    fn test_all_null_dict() {
1548        let all_null_dict_arr = DictionaryArray::try_new(
1549            UInt8Array::new_null(10),
1550            Arc::new(StringArray::from_iter_values(["a"])),
1551        );
1552        assert!(all_null_dict_arr.is_ok())
1553    }
1554}