Skip to main content

arrow_array/builder/
primitive_dictionary_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::builder::{ArrayBuilder, PrimitiveBuilder};
19use crate::types::ArrowDictionaryKeyType;
20use crate::{
21    Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, PrimitiveArray, TypedDictionaryArray,
22};
23use arrow_buffer::{ArrowNativeType, ToByteSlice};
24use arrow_schema::{ArrowError, DataType};
25use num_traits::NumCast;
26use std::any::Any;
27use std::collections::HashMap;
28use std::sync::Arc;
29
30/// Wraps a type implementing `ToByteSlice` implementing `Hash` and `Eq` for it
31///
32/// This is necessary to handle types such as f32, which don't natively implement these
33#[derive(Debug)]
34struct Value<T>(T);
35
36impl<T: ToByteSlice> std::hash::Hash for Value<T> {
37    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
38        self.0.to_byte_slice().hash(state)
39    }
40}
41
42impl<T: ToByteSlice> PartialEq for Value<T> {
43    fn eq(&self, other: &Self) -> bool {
44        self.0.to_byte_slice().eq(other.0.to_byte_slice())
45    }
46}
47
48impl<T: ToByteSlice> Eq for Value<T> {}
49
50/// Builder for [`DictionaryArray`] of [`PrimitiveArray`]
51///
52/// # Example:
53///
54/// ```
55///
56/// # use arrow_array::builder::PrimitiveDictionaryBuilder;
57/// # use arrow_array::types::{UInt32Type, UInt8Type};
58/// # use arrow_array::{Array, UInt32Array, UInt8Array};
59///
60/// let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::new();
61///  builder.append(12345678).unwrap();
62///  builder.append_null();
63///  builder.append(22345678).unwrap();
64///  let array = builder.finish();
65///
66///  assert_eq!(
67///      array.keys(),
68///      &UInt8Array::from(vec![Some(0), None, Some(1)])
69///  );
70///
71///  // Values are polymorphic and so require a downcast.
72///  let av = array.values();
73///  let ava: &UInt32Array = av.as_any().downcast_ref::<UInt32Array>().unwrap();
74///  let avs: &[u32] = ava.values();
75///
76///  assert!(!array.is_null(0));
77///  assert!(array.is_null(1));
78///  assert!(!array.is_null(2));
79///
80///  assert_eq!(avs, &[12345678, 22345678]);
81/// ```
82#[derive(Debug)]
83pub struct PrimitiveDictionaryBuilder<K, V>
84where
85    K: ArrowPrimitiveType,
86    V: ArrowPrimitiveType,
87{
88    keys_builder: PrimitiveBuilder<K>,
89    values_builder: PrimitiveBuilder<V>,
90    map: HashMap<Value<V::Native>, usize>,
91}
92
93impl<K, V> Default for PrimitiveDictionaryBuilder<K, V>
94where
95    K: ArrowPrimitiveType,
96    V: ArrowPrimitiveType,
97{
98    fn default() -> Self {
99        Self::new()
100    }
101}
102
103impl<K, V> PrimitiveDictionaryBuilder<K, V>
104where
105    K: ArrowPrimitiveType,
106    V: ArrowPrimitiveType,
107{
108    /// Creates a new `PrimitiveDictionaryBuilder`.
109    pub fn new() -> Self {
110        Self {
111            keys_builder: PrimitiveBuilder::new(),
112            values_builder: PrimitiveBuilder::new(),
113            map: HashMap::new(),
114        }
115    }
116
117    /// Creates a new `PrimitiveDictionaryBuilder` from the provided keys and values builders.
118    ///
119    /// # Panics
120    ///
121    /// This method panics if `keys_builder` or `values_builder` is not empty.
122    pub fn new_from_empty_builders(
123        keys_builder: PrimitiveBuilder<K>,
124        values_builder: PrimitiveBuilder<V>,
125    ) -> Self {
126        assert!(
127            keys_builder.is_empty() && values_builder.is_empty(),
128            "keys and values builders must be empty"
129        );
130        let values_capacity = values_builder.capacity();
131        Self {
132            keys_builder,
133            values_builder,
134            map: HashMap::with_capacity(values_capacity),
135        }
136    }
137
138    /// Creates a new `PrimitiveDictionaryBuilder` from existing `PrimitiveBuilder`s of keys and values.
139    ///
140    /// # Safety
141    ///
142    /// caller must ensure that the passed in builders are valid for DictionaryArray.
143    pub unsafe fn new_from_builders(
144        keys_builder: PrimitiveBuilder<K>,
145        values_builder: PrimitiveBuilder<V>,
146    ) -> Self {
147        let keys = keys_builder.values_slice();
148        let values = values_builder.values_slice();
149        let mut map = HashMap::with_capacity(values.len());
150
151        keys.iter().zip(values.iter()).for_each(|(key, value)| {
152            map.insert(Value(*value), K::Native::to_usize(*key).unwrap());
153        });
154
155        Self {
156            keys_builder,
157            values_builder,
158            map,
159        }
160    }
161
162    /// Creates a new `PrimitiveDictionaryBuilder` with the provided capacities
163    ///
164    /// `keys_capacity`: the number of keys, i.e. length of array to build
165    /// `values_capacity`: the number of distinct dictionary values, i.e. size of dictionary
166    pub fn with_capacity(keys_capacity: usize, values_capacity: usize) -> Self {
167        Self {
168            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
169            values_builder: PrimitiveBuilder::with_capacity(values_capacity),
170            map: HashMap::with_capacity(values_capacity),
171        }
172    }
173
174    /// Creates a new `PrimitiveDictionaryBuilder` from the existing builder with the same
175    /// keys and values, but with a new data type for the keys.
176    ///
177    /// # Example
178    /// ```
179    /// #
180    /// # use arrow_array::builder::PrimitiveDictionaryBuilder;
181    /// # use arrow_array::types::{UInt8Type, UInt16Type, UInt64Type};
182    /// # use arrow_array::UInt16Array;
183    /// # use arrow_schema::ArrowError;
184    ///
185    /// let mut u8_keyed_builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt64Type>::new();
186    ///
187    /// // appending too many values causes the dictionary to overflow
188    /// for i in 0..256 {
189    ///     u8_keyed_builder.append_value(i);
190    /// }
191    /// let result = u8_keyed_builder.append(256);
192    /// assert!(matches!(result, Err(ArrowError::DictionaryKeyOverflowError{})));
193    ///
194    /// // we need to upgrade to a larger key type
195    /// let mut u16_keyed_builder = PrimitiveDictionaryBuilder::<UInt16Type, UInt64Type>::try_new_from_builder(u8_keyed_builder).unwrap();
196    /// let dictionary_array = u16_keyed_builder.finish();
197    /// let keys = dictionary_array.keys();
198    ///
199    /// assert_eq!(keys, &UInt16Array::from_iter(0..256));
200    pub fn try_new_from_builder<K2>(
201        mut source: PrimitiveDictionaryBuilder<K2, V>,
202    ) -> Result<Self, ArrowError>
203    where
204        K::Native: NumCast,
205        K2: ArrowDictionaryKeyType,
206        K2::Native: NumCast,
207    {
208        let map = source.map;
209        let values_builder = source.values_builder;
210
211        let source_keys = source.keys_builder.finish();
212        let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
213            num_traits::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
214                ArrowError::CastError(format!(
215                    "Can't cast dictionary keys from source type {:?} to type {:?}",
216                    K2::DATA_TYPE,
217                    K::DATA_TYPE
218                ))
219            })
220        })?;
221
222        // drop source key here because currently source_keys and new_keys are holding reference to
223        // the same underlying null_buffer. Below we want to call new_keys.into_builder() it must
224        // be the only reference holder.
225        drop(source_keys);
226
227        Ok(Self {
228            map,
229            keys_builder: new_keys
230                .into_builder()
231                .expect("underlying buffer has no references"),
232            values_builder,
233        })
234    }
235}
236
237impl<K, V> ArrayBuilder for PrimitiveDictionaryBuilder<K, V>
238where
239    K: ArrowDictionaryKeyType,
240    V: ArrowPrimitiveType,
241{
242    /// Returns the builder as an non-mutable `Any` reference.
243    fn as_any(&self) -> &dyn Any {
244        self
245    }
246
247    /// Returns the builder as an mutable `Any` reference.
248    fn as_any_mut(&mut self) -> &mut dyn Any {
249        self
250    }
251
252    /// Returns the boxed builder as a box of `Any`.
253    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
254        self
255    }
256
257    /// Returns the number of array slots in the builder
258    fn len(&self) -> usize {
259        self.keys_builder.len()
260    }
261
262    /// Builds the array and reset this builder.
263    fn finish(&mut self) -> ArrayRef {
264        Arc::new(self.finish())
265    }
266
267    /// Builds the array without resetting the builder.
268    fn finish_cloned(&self) -> ArrayRef {
269        Arc::new(self.finish_cloned())
270    }
271
272    fn finish_preserve_values(&mut self) -> ArrayRef {
273        Arc::new(self.finish_preserve_values())
274    }
275}
276
277impl<K, V> PrimitiveDictionaryBuilder<K, V>
278where
279    K: ArrowDictionaryKeyType,
280    V: ArrowPrimitiveType,
281{
282    #[inline]
283    fn get_or_insert_key(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
284        match self.map.get(&Value(value)) {
285            Some(&key) => {
286                Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
287            }
288            None => {
289                let key = self.values_builder.len();
290                self.values_builder.append_value(value);
291                self.map.insert(Value(value), key);
292                Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
293            }
294        }
295    }
296
297    /// Append a primitive value to the array. Return an existing index
298    /// if already present in the values array or a new index if the
299    /// value is appended to the values array.
300    #[inline]
301    pub fn append(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
302        let key = self.get_or_insert_key(value)?;
303        self.keys_builder.append_value(key);
304        Ok(key)
305    }
306
307    /// Append a value multiple times to the array.
308    /// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups.
309    ///
310    /// Returns an error if the new index would overflow the key type.
311    pub fn append_n(&mut self, value: V::Native, count: usize) -> Result<K::Native, ArrowError> {
312        let key = self.get_or_insert_key(value)?;
313        self.keys_builder.append_value_n(key, count);
314        Ok(key)
315    }
316
317    /// Infallibly append a value to this builder
318    ///
319    /// # Panics
320    ///
321    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
322    #[inline]
323    pub fn append_value(&mut self, value: V::Native) {
324        self.append(value).expect("dictionary key overflow");
325    }
326
327    /// Infallibly append a value to this builder repeatedly `count` times.
328    /// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups.
329    ///
330    /// # Panics
331    ///
332    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
333    pub fn append_values(&mut self, value: V::Native, count: usize) {
334        self.append_n(value, count)
335            .expect("dictionary key overflow");
336    }
337
338    /// Appends a null slot into the builder
339    #[inline]
340    pub fn append_null(&mut self) {
341        self.keys_builder.append_null()
342    }
343
344    /// Append `n` null slots into the builder
345    #[inline]
346    pub fn append_nulls(&mut self, n: usize) {
347        self.keys_builder.append_nulls(n)
348    }
349
350    /// Append an `Option` value into the builder
351    ///
352    /// # Panics
353    ///
354    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
355    #[inline]
356    pub fn append_option(&mut self, value: Option<V::Native>) {
357        match value {
358            None => self.append_null(),
359            Some(v) => self.append_value(v),
360        };
361    }
362
363    /// Append an `Option` value into the builder repeatedly `count` times.
364    /// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups.
365    ///
366    /// # Panics
367    ///
368    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
369    pub fn append_options(&mut self, value: Option<V::Native>, count: usize) {
370        match value {
371            None => self.keys_builder.append_nulls(count),
372            Some(v) => self.append_values(v, count),
373        };
374    }
375
376    /// Extends builder with dictionary
377    ///
378    /// This is the same as [`Self::extend`] but is faster as it translates
379    /// the dictionary values once rather than doing a lookup for each item in the iterator
380    ///
381    /// when dictionary values are null (the actual mapped values) the keys are null
382    ///
383    pub fn extend_dictionary(
384        &mut self,
385        dictionary: &TypedDictionaryArray<K, PrimitiveArray<V>>,
386    ) -> Result<(), ArrowError> {
387        let values = dictionary.values();
388
389        let v_len = values.len();
390        let k_len = dictionary.keys().len();
391        if v_len == 0 && k_len == 0 {
392            return Ok(());
393        }
394
395        // All nulls
396        if v_len == 0 {
397            self.append_nulls(k_len);
398            return Ok(());
399        }
400
401        if k_len == 0 {
402            return Err(ArrowError::InvalidArgumentError(
403                "Dictionary keys should not be empty when values are not empty".to_string(),
404            ));
405        }
406
407        // Orphan values will be carried over to the new dictionary
408        let mapped_values = values
409            .iter()
410            // Dictionary values can technically be null, so we need to handle that
411            .map(|dict_value| {
412                dict_value
413                    .map(|dict_value| self.get_or_insert_key(dict_value))
414                    .transpose()
415            })
416            .collect::<Result<Vec<_>, _>>()?;
417
418        // Just insert the keys without additional lookups
419        dictionary.keys().iter().for_each(|key| match key {
420            None => self.append_null(),
421            Some(original_dict_index) => {
422                let index = original_dict_index.as_usize().min(v_len - 1);
423                match mapped_values[index] {
424                    None => self.append_null(),
425                    Some(mapped_value) => self.keys_builder.append_value(mapped_value),
426                }
427            }
428        });
429
430        Ok(())
431    }
432
433    /// Builds the `DictionaryArray` and reset this builder.
434    pub fn finish(&mut self) -> DictionaryArray<K> {
435        self.map.clear();
436        let values = self.values_builder.finish();
437        let keys = self.keys_builder.finish();
438
439        let data_type =
440            DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
441
442        let builder = keys
443            .into_data()
444            .into_builder()
445            .data_type(data_type)
446            .child_data(vec![values.into_data()]);
447
448        DictionaryArray::from(unsafe { builder.build_unchecked() })
449    }
450
451    /// Builds the `DictionaryArray` without resetting the builder.
452    pub fn finish_cloned(&self) -> DictionaryArray<K> {
453        let values = self.values_builder.finish_cloned();
454        let keys = self.keys_builder.finish_cloned();
455
456        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE));
457
458        let builder = keys
459            .into_data()
460            .into_builder()
461            .data_type(data_type)
462            .child_data(vec![values.into_data()]);
463
464        DictionaryArray::from(unsafe { builder.build_unchecked() })
465    }
466
467    /// Builds the `DictionaryArray` without resetting the values builder or
468    /// the internal de-duplication map.
469    ///
470    /// The advantage of doing this is that the values will represent the entire
471    /// set of what has been built so-far by this builder and ensures
472    /// consistency in the assignment of keys to values across multiple calls
473    /// to `finish_preserve_values`. This enables ipc writers to efficiently
474    /// emit delta dictionaries.
475    ///
476    /// The downside to this is that building the record requires creating a
477    /// copy of the values, which can become slowly more expensive if the
478    /// dictionary grows.
479    ///
480    /// Additionally, if record batches from multiple different dictionary
481    /// builders for the same column are fed into a single ipc writer, beware
482    /// that entire dictionaries are likely to be re-sent frequently even when
483    /// the majority of the values are not used by the current record batch.
484    pub fn finish_preserve_values(&mut self) -> DictionaryArray<K> {
485        let values = self.values_builder.finish_cloned();
486        let keys = self.keys_builder.finish();
487
488        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE));
489
490        let builder = keys
491            .into_data()
492            .into_builder()
493            .data_type(data_type)
494            .child_data(vec![values.into_data()]);
495
496        DictionaryArray::from(unsafe { builder.build_unchecked() })
497    }
498
499    /// Returns the current dictionary values buffer as a slice
500    pub fn values_slice(&self) -> &[V::Native] {
501        self.values_builder.values_slice()
502    }
503
504    /// Returns the current dictionary values buffer as a mutable slice
505    pub fn values_slice_mut(&mut self) -> &mut [V::Native] {
506        self.values_builder.values_slice_mut()
507    }
508
509    /// Returns the current null buffer as a slice
510    pub fn validity_slice(&self) -> Option<&[u8]> {
511        self.keys_builder.validity_slice()
512    }
513}
514
515impl<K: ArrowDictionaryKeyType, P: ArrowPrimitiveType> Extend<Option<P::Native>>
516    for PrimitiveDictionaryBuilder<K, P>
517{
518    #[inline]
519    fn extend<T: IntoIterator<Item = Option<P::Native>>>(&mut self, iter: T) {
520        for v in iter {
521            self.append_option(v)
522        }
523    }
524}
525
526#[cfg(test)]
527mod tests {
528    use super::*;
529
530    use crate::array::{Int32Array, UInt8Array, UInt32Array};
531    use crate::builder::Decimal128Builder;
532    use crate::cast::AsArray;
533    use crate::types::{
534        Date32Type, Decimal128Type, DurationNanosecondType, Float32Type, Float64Type, Int8Type,
535        Int16Type, Int32Type, Int64Type, TimestampNanosecondType, UInt8Type, UInt16Type,
536        UInt32Type, UInt64Type,
537    };
538
539    #[test]
540    fn test_primitive_dictionary_builder() {
541        let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
542        builder.append(12345678).unwrap();
543        builder.append_null();
544        builder.append(22345678).unwrap();
545        let array = builder.finish();
546
547        assert_eq!(
548            array.keys(),
549            &UInt8Array::from(vec![Some(0), None, Some(1)])
550        );
551
552        // Values are polymorphic and so require a downcast.
553        let av = array.values();
554        let ava: &UInt32Array = av.as_any().downcast_ref::<UInt32Array>().unwrap();
555        let avs: &[u32] = ava.values();
556
557        assert!(!array.is_null(0));
558        assert!(array.is_null(1));
559        assert!(!array.is_null(2));
560
561        assert_eq!(avs, &[12345678, 22345678]);
562    }
563
564    #[test]
565    fn test_extend() {
566        let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
567        builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some));
568        builder.extend([4, 5, 1, 3, 1].into_iter().map(Some));
569        let dict = builder.finish();
570        assert_eq!(
571            dict.keys().values(),
572            &[0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 0, 2, 0]
573        );
574        assert_eq!(dict.values().len(), 5);
575    }
576
577    #[test]
578    #[should_panic(expected = "DictionaryKeyOverflowError")]
579    fn test_primitive_dictionary_overflow() {
580        let mut builder =
581            PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(257, 257);
582        // 256 unique keys.
583        for i in 0..256 {
584            builder.append(i + 1000).unwrap();
585        }
586        // Special error if the key overflows (256th entry)
587        builder.append(1257).unwrap();
588    }
589
590    #[test]
591    fn test_primitive_dictionary_with_builders() {
592        let keys_builder = PrimitiveBuilder::<Int32Type>::new();
593        let values_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2));
594        let mut builder =
595            PrimitiveDictionaryBuilder::<Int32Type, Decimal128Type>::new_from_empty_builders(
596                keys_builder,
597                values_builder,
598            );
599        let dict_array = builder.finish();
600        assert_eq!(dict_array.value_type(), DataType::Decimal128(1, 2));
601        assert_eq!(
602            dict_array.data_type(),
603            &DataType::Dictionary(
604                Box::new(DataType::Int32),
605                Box::new(DataType::Decimal128(1, 2)),
606            )
607        );
608    }
609
610    #[test]
611    fn test_extend_dictionary() {
612        let some_dict = {
613            let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
614            builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some));
615            builder.extend([None::<i32>]);
616            builder.extend([4, 5, 1, 3, 1].into_iter().map(Some));
617            builder.append_null();
618            builder.finish()
619        };
620
621        let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
622        builder.extend([6, 6, 7, 6, 5].into_iter().map(Some));
623        builder
624            .extend_dictionary(&some_dict.downcast_dict().unwrap())
625            .unwrap();
626        let dict = builder.finish();
627
628        assert_eq!(dict.values().len(), 7);
629
630        let values = dict
631            .downcast_dict::<Int32Array>()
632            .unwrap()
633            .into_iter()
634            .collect::<Vec<_>>();
635
636        assert_eq!(
637            values,
638            [
639                Some(6),
640                Some(6),
641                Some(7),
642                Some(6),
643                Some(5),
644                Some(1),
645                Some(2),
646                Some(3),
647                Some(1),
648                Some(2),
649                Some(3),
650                Some(1),
651                Some(2),
652                Some(3),
653                None,
654                Some(4),
655                Some(5),
656                Some(1),
657                Some(3),
658                Some(1),
659                None
660            ]
661        );
662    }
663
664    #[test]
665    fn test_extend_dictionary_with_null_in_mapped_value() {
666        let some_dict = {
667            let mut values_builder = PrimitiveBuilder::<Int32Type>::new();
668            let mut keys_builder = PrimitiveBuilder::<Int32Type>::new();
669
670            // Manually build a dictionary values that the mapped values have null
671            values_builder.append_null();
672            keys_builder.append_value(0);
673            values_builder.append_value(42);
674            keys_builder.append_value(1);
675
676            let values = values_builder.finish();
677            let keys = keys_builder.finish();
678
679            let data_type = DataType::Dictionary(
680                Box::new(Int32Type::DATA_TYPE),
681                Box::new(values.data_type().clone()),
682            );
683
684            let builder = keys
685                .into_data()
686                .into_builder()
687                .data_type(data_type)
688                .child_data(vec![values.into_data()]);
689
690            DictionaryArray::from(unsafe { builder.build_unchecked() })
691        };
692
693        let some_dict_values = some_dict.values().as_primitive::<Int32Type>();
694        assert_eq!(
695            some_dict_values.into_iter().collect::<Vec<_>>(),
696            &[None, Some(42)]
697        );
698
699        let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
700        builder
701            .extend_dictionary(&some_dict.downcast_dict().unwrap())
702            .unwrap();
703        let dict = builder.finish();
704
705        assert_eq!(dict.values().len(), 1);
706
707        let values = dict
708            .downcast_dict::<Int32Array>()
709            .unwrap()
710            .into_iter()
711            .collect::<Vec<_>>();
712
713        assert_eq!(values, [None, Some(42)]);
714    }
715
716    #[test]
717    fn test_extend_all_null_dictionary() {
718        let some_dict = {
719            let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
720            builder.append_nulls(2);
721            builder.finish()
722        };
723
724        let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
725        builder
726            .extend_dictionary(&some_dict.downcast_dict().unwrap())
727            .unwrap();
728        let dict = builder.finish();
729
730        assert_eq!(dict.values().len(), 0);
731
732        let values = dict
733            .downcast_dict::<Int32Array>()
734            .unwrap()
735            .into_iter()
736            .collect::<Vec<_>>();
737
738        assert_eq!(values, [None, None]);
739    }
740
741    #[test]
742    fn creating_dictionary_from_builders_should_use_values_capacity_for_the_map() {
743        let builder = PrimitiveDictionaryBuilder::<Int32Type, crate::types::TimestampMicrosecondType>::new_from_empty_builders(
744                  PrimitiveBuilder::with_capacity(1).with_data_type(DataType::Int32),
745                  PrimitiveBuilder::with_capacity(2).with_data_type(DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some("+08:00".into()))),
746              );
747
748        assert!(
749            builder.map.capacity() >= builder.values_builder.capacity(),
750            "map capacity {} should be at least the values capacity {}",
751            builder.map.capacity(),
752            builder.values_builder.capacity()
753        )
754    }
755
756    fn _test_try_new_from_builder_generic_for_key_types<K1, K2, V>(values: Vec<V::Native>)
757    where
758        K1: ArrowDictionaryKeyType,
759        K1::Native: NumCast,
760        K2: ArrowDictionaryKeyType,
761        K2::Native: NumCast + From<u8>,
762        V: ArrowPrimitiveType,
763    {
764        let mut source = PrimitiveDictionaryBuilder::<K1, V>::new();
765        source.append(values[0]).unwrap();
766        source.append_null();
767        source.append(values[1]).unwrap();
768        source.append(values[2]).unwrap();
769
770        let mut result = PrimitiveDictionaryBuilder::<K2, V>::try_new_from_builder(source).unwrap();
771        let array = result.finish();
772
773        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
774        expected_keys_builder
775            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(0u8));
776        expected_keys_builder.append_null();
777        expected_keys_builder
778            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(1u8));
779        expected_keys_builder
780            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(2u8));
781        let expected_keys = expected_keys_builder.finish();
782        assert_eq!(array.keys(), &expected_keys);
783
784        let av = array.values();
785        let ava = av.as_any().downcast_ref::<PrimitiveArray<V>>().unwrap();
786        assert_eq!(ava.value(0), values[0]);
787        assert_eq!(ava.value(1), values[1]);
788        assert_eq!(ava.value(2), values[2]);
789    }
790
791    fn _test_try_new_from_builder_generic_for_value<T>(values: Vec<T::Native>)
792    where
793        T: ArrowPrimitiveType,
794    {
795        // test cast to bigger size unsigned
796        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type, T>(
797            values.clone(),
798        );
799        // test cast going to smaller size unsigned
800        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type, T>(
801            values.clone(),
802        );
803        // test cast going to bigger size signed
804        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type, T>(values.clone());
805        // test cast going to smaller size signed
806        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type, T>(values.clone());
807        // test going from signed to signed for different size changes
808        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type, T>(values.clone());
809        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type, T>(values.clone());
810        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type, T>(values.clone());
811        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type, T>(values.clone());
812    }
813
814    #[test]
815    fn test_try_new_from_builder() {
816        // test unsigned types
817        _test_try_new_from_builder_generic_for_value::<UInt8Type>(vec![1, 2, 3]);
818        _test_try_new_from_builder_generic_for_value::<UInt16Type>(vec![1, 2, 3]);
819        _test_try_new_from_builder_generic_for_value::<UInt32Type>(vec![1, 2, 3]);
820        _test_try_new_from_builder_generic_for_value::<UInt64Type>(vec![1, 2, 3]);
821        // test signed types
822        _test_try_new_from_builder_generic_for_value::<Int8Type>(vec![-1, 0, 1]);
823        _test_try_new_from_builder_generic_for_value::<Int16Type>(vec![-1, 0, 1]);
824        _test_try_new_from_builder_generic_for_value::<Int32Type>(vec![-1, 0, 1]);
825        _test_try_new_from_builder_generic_for_value::<Int64Type>(vec![-1, 0, 1]);
826        // test some date types
827        _test_try_new_from_builder_generic_for_value::<Date32Type>(vec![5, 6, 7]);
828        _test_try_new_from_builder_generic_for_value::<DurationNanosecondType>(vec![1, 2, 3]);
829        _test_try_new_from_builder_generic_for_value::<TimestampNanosecondType>(vec![1, 2, 3]);
830        // test some floating point types
831        _test_try_new_from_builder_generic_for_value::<Float32Type>(vec![0.1, 0.2, 0.3]);
832        _test_try_new_from_builder_generic_for_value::<Float64Type>(vec![-0.1, 0.2, 0.3]);
833    }
834
835    #[test]
836    fn test_try_new_from_builder_cast_fails() {
837        let mut source_builder = PrimitiveDictionaryBuilder::<UInt16Type, UInt64Type>::new();
838        for i in 0..257 {
839            source_builder.append_value(i);
840        }
841
842        // there should be too many values that we can't downcast to the underlying type
843        // we have keys that wouldn't fit into UInt8Type
844        let result = PrimitiveDictionaryBuilder::<UInt8Type, UInt64Type>::try_new_from_builder(
845            source_builder,
846        );
847        assert!(result.is_err());
848        if let Err(e) = result {
849            assert!(matches!(e, ArrowError::CastError(_)));
850            assert_eq!(
851                e.to_string(),
852                "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
853            );
854        }
855    }
856
857    #[test]
858    fn test_finish_preserve_values() {
859        // Create the first dictionary
860        let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::new();
861        builder.append(10).unwrap();
862        builder.append(20).unwrap();
863        let array = builder.finish_preserve_values();
864        assert_eq!(array.keys(), &UInt8Array::from(vec![Some(0), Some(1)]));
865        let values: &[u32] = array
866            .values()
867            .as_any()
868            .downcast_ref::<UInt32Array>()
869            .unwrap()
870            .values();
871        assert_eq!(values, &[10, 20]);
872
873        // Create a new dictionary
874        builder.append(30).unwrap();
875        builder.append(40).unwrap();
876        let array2 = builder.finish_preserve_values();
877
878        // Make sure the keys are assigned after the old ones
879        // and that we have the right values
880        assert_eq!(array2.keys(), &UInt8Array::from(vec![Some(2), Some(3)]));
881        let values = array2
882            .downcast_dict::<UInt32Array>()
883            .unwrap()
884            .into_iter()
885            .collect::<Vec<_>>();
886        assert_eq!(values, vec![Some(30), Some(40)]);
887
888        // Check that we have all of the expected values
889        let all_values: &[u32] = array2
890            .values()
891            .as_any()
892            .downcast_ref::<UInt32Array>()
893            .unwrap()
894            .values();
895        assert_eq!(all_values, &[10, 20, 30, 40]);
896    }
897}