Skip to main content

arrow_array/builder/
generic_bytes_dictionary_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
19use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
20use crate::{
21    Array, ArrayRef, DictionaryArray, GenericByteArray, PrimitiveArray, TypedDictionaryArray,
22};
23use arrow_buffer::ArrowNativeType;
24use arrow_schema::{ArrowError, DataType};
25use hashbrown::HashTable;
26use num_traits::NumCast;
27use std::any::Any;
28use std::sync::Arc;
29
30/// Builder for [`DictionaryArray`] of [`GenericByteArray`]
31///
32/// For example to map a set of byte indices to String values. Note that
33/// the use of a `HashMap` here will not scale to very large arrays or
34/// result in an ordered dictionary.
35#[derive(Debug)]
36pub struct GenericByteDictionaryBuilder<K, T>
37where
38    K: ArrowDictionaryKeyType,
39    T: ByteArrayType,
40{
41    state: ahash::RandomState,
42    dedup: HashTable<usize>,
43
44    keys_builder: PrimitiveBuilder<K>,
45    values_builder: GenericByteBuilder<T>,
46}
47
48impl<K, T> Default for GenericByteDictionaryBuilder<K, T>
49where
50    K: ArrowDictionaryKeyType,
51    T: ByteArrayType,
52{
53    fn default() -> Self {
54        Self::new()
55    }
56}
57
58impl<K, T> GenericByteDictionaryBuilder<K, T>
59where
60    K: ArrowDictionaryKeyType,
61    T: ByteArrayType,
62{
63    /// Creates a new `GenericByteDictionaryBuilder`
64    pub fn new() -> Self {
65        let keys_builder = PrimitiveBuilder::new();
66        let values_builder = GenericByteBuilder::<T>::new();
67        Self {
68            state: Default::default(),
69            dedup: HashTable::with_capacity(keys_builder.capacity()),
70            keys_builder,
71            values_builder,
72        }
73    }
74
75    /// Creates a new `GenericByteDictionaryBuilder` with the provided capacities
76    ///
77    /// `keys_capacity`: the number of keys, i.e. length of array to build
78    /// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
79    /// `data_capacity`: the total number of bytes of all distinct bytes in the dictionary
80    pub fn with_capacity(
81        keys_capacity: usize,
82        value_capacity: usize,
83        data_capacity: usize,
84    ) -> Self {
85        Self {
86            state: Default::default(),
87            dedup: HashTable::with_capacity(value_capacity),
88            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
89            values_builder: GenericByteBuilder::<T>::with_capacity(value_capacity, data_capacity),
90        }
91    }
92
93    /// Creates a new `GenericByteDictionaryBuilder` from a keys capacity and a dictionary
94    /// which is initialized with the given values.
95    /// The indices of those dictionary values are used as keys.
96    ///
97    /// # Example
98    ///
99    /// ```
100    /// # use arrow_array::builder::StringDictionaryBuilder;
101    /// # use arrow_array::{Int16Array, StringArray};
102    ///
103    /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
104    ///
105    /// let mut builder = StringDictionaryBuilder::new_with_dictionary(3, &dictionary_values).unwrap();
106    /// builder.append("def").unwrap();
107    /// builder.append_null();
108    /// builder.append("abc").unwrap();
109    ///
110    /// let dictionary_array = builder.finish();
111    ///
112    /// let keys = dictionary_array.keys();
113    ///
114    /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)]));
115    /// ```
116    pub fn new_with_dictionary(
117        keys_capacity: usize,
118        dictionary_values: &GenericByteArray<T>,
119    ) -> Result<Self, ArrowError> {
120        let state = ahash::RandomState::default();
121        let dict_len = dictionary_values.len();
122
123        let mut dedup = HashTable::with_capacity(dict_len);
124
125        let values_len = dictionary_values.value_data().len();
126        let mut values_builder = GenericByteBuilder::<T>::with_capacity(dict_len, values_len);
127
128        K::Native::from_usize(dictionary_values.len())
129            .ok_or(ArrowError::DictionaryKeyOverflowError)?;
130
131        for (idx, maybe_value) in dictionary_values.iter().enumerate() {
132            match maybe_value {
133                Some(value) => {
134                    let value_bytes: &[u8] = value.as_ref();
135                    let hash = state.hash_one(value_bytes);
136
137                    dedup
138                        .entry(
139                            hash,
140                            |idx: &usize| value_bytes == get_bytes(&values_builder, *idx),
141                            |idx: &usize| state.hash_one(get_bytes(&values_builder, *idx)),
142                        )
143                        .or_insert(idx);
144
145                    values_builder.append_value(value);
146                }
147                None => values_builder.append_null(),
148            }
149        }
150
151        Ok(Self {
152            state,
153            dedup,
154            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
155            values_builder,
156        })
157    }
158
159    /// Creates a new `GenericByteDictionaryBuilder` from the existing builder with the same
160    /// keys and values, but with a new data type for the keys.
161    ///
162    /// # Example
163    /// ```
164    /// #
165    /// # use arrow_array::builder::StringDictionaryBuilder;
166    /// # use arrow_array::types::{UInt8Type, UInt16Type};
167    /// # use arrow_array::UInt16Array;
168    /// # use arrow_schema::ArrowError;
169    ///
170    /// let mut u8_keyed_builder = StringDictionaryBuilder::<UInt8Type>::new();
171    ///
172    /// // appending too many values causes the dictionary to overflow
173    /// for i in 0..256 {
174    ///     u8_keyed_builder.append_value(format!("{}", i));
175    /// }
176    /// let result = u8_keyed_builder.append("256");
177    /// assert!(matches!(result, Err(ArrowError::DictionaryKeyOverflowError{})));
178    ///
179    /// // we need to upgrade to a larger key type
180    /// let mut u16_keyed_builder = StringDictionaryBuilder::<UInt16Type>::try_new_from_builder(u8_keyed_builder).unwrap();
181    /// let dictionary_array = u16_keyed_builder.finish();
182    /// let keys = dictionary_array.keys();
183    ///
184    /// assert_eq!(keys, &UInt16Array::from_iter(0..256));
185    /// ```
186    pub fn try_new_from_builder<K2>(
187        mut source: GenericByteDictionaryBuilder<K2, T>,
188    ) -> Result<Self, ArrowError>
189    where
190        K::Native: NumCast,
191        K2: ArrowDictionaryKeyType,
192        K2::Native: NumCast,
193    {
194        let state = source.state;
195        let dedup = source.dedup;
196        let values_builder = source.values_builder;
197
198        let source_keys = source.keys_builder.finish();
199        let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
200            num_traits::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
201                ArrowError::CastError(format!(
202                    "Can't cast dictionary keys from source type {:?} to type {:?}",
203                    K2::DATA_TYPE,
204                    K::DATA_TYPE
205                ))
206            })
207        })?;
208
209        // drop source key here because currently source_keys and new_keys are holding reference to
210        // the same underlying null_buffer. Below we want to call new_keys.into_builder() it must
211        // be the only reference holder.
212        drop(source_keys);
213
214        Ok(Self {
215            state,
216            dedup,
217            keys_builder: new_keys
218                .into_builder()
219                .expect("underlying buffer has no references"),
220            values_builder,
221        })
222    }
223}
224
225impl<K, T> ArrayBuilder for GenericByteDictionaryBuilder<K, T>
226where
227    K: ArrowDictionaryKeyType,
228    T: ByteArrayType,
229{
230    /// Returns the builder as an non-mutable `Any` reference.
231    fn as_any(&self) -> &dyn Any {
232        self
233    }
234
235    /// Returns the builder as an mutable `Any` reference.
236    fn as_any_mut(&mut self) -> &mut dyn Any {
237        self
238    }
239
240    /// Returns the boxed builder as a box of `Any`.
241    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
242        self
243    }
244
245    /// Returns the number of array slots in the builder
246    fn len(&self) -> usize {
247        self.keys_builder.len()
248    }
249
250    /// Builds the array and reset this builder.
251    fn finish(&mut self) -> ArrayRef {
252        Arc::new(self.finish())
253    }
254
255    /// Builds the array without resetting the builder.
256    fn finish_cloned(&self) -> ArrayRef {
257        Arc::new(self.finish_cloned())
258    }
259
260    fn finish_preserve_values(&mut self) -> ArrayRef {
261        Arc::new(self.finish_preserve_values())
262    }
263}
264
265impl<K, T> GenericByteDictionaryBuilder<K, T>
266where
267    K: ArrowDictionaryKeyType,
268    T: ByteArrayType,
269{
270    fn get_or_insert_key(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
271        let value_native: &T::Native = value.as_ref();
272        let value_bytes: &[u8] = value_native.as_ref();
273
274        let state = &self.state;
275        let storage = &mut self.values_builder;
276        let hash = state.hash_one(value_bytes);
277
278        let idx = *self
279            .dedup
280            .entry(
281                hash,
282                |idx| value_bytes == get_bytes(storage, *idx),
283                |idx| state.hash_one(get_bytes(storage, *idx)),
284            )
285            .or_insert_with(|| {
286                let idx = storage.len();
287                storage.append_value(value);
288                idx
289            })
290            .get();
291
292        let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
293
294        Ok(key)
295    }
296
297    /// Append a value to the array. Return an existing index
298    /// if already present in the values array or a new index if the
299    /// value is appended to the values array.
300    ///
301    /// Returns an error if the new index would overflow the key type.
302    pub fn append(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
303        let key = self.get_or_insert_key(value)?;
304        self.keys_builder.append_value(key);
305        Ok(key)
306    }
307
308    /// Append a value multiple times to the array.
309    /// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups.
310    ///
311    /// Returns an error if the new index would overflow the key type.
312    pub fn append_n(
313        &mut self,
314        value: impl AsRef<T::Native>,
315        count: usize,
316    ) -> Result<K::Native, ArrowError> {
317        let key = self.get_or_insert_key(value)?;
318        self.keys_builder.append_value_n(key, count);
319        Ok(key)
320    }
321
322    /// Infallibly append a value to this builder
323    ///
324    /// # Panics
325    ///
326    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
327    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
328        self.append(value).expect("dictionary key overflow");
329    }
330
331    /// Infallibly append a value to this builder repeatedly `count` times.
332    /// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups.
333    ///
334    /// # Panics
335    ///
336    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
337    pub fn append_values(&mut self, value: impl AsRef<T::Native>, count: usize) {
338        self.append_n(value, count)
339            .expect("dictionary key overflow");
340    }
341
342    /// Appends a null slot into the builder
343    #[inline]
344    pub fn append_null(&mut self) {
345        self.keys_builder.append_null()
346    }
347
348    /// Infallibly append `n` null slots into the builder
349    #[inline]
350    pub fn append_nulls(&mut self, n: usize) {
351        self.keys_builder.append_nulls(n)
352    }
353
354    /// Append an `Option` value into the builder
355    ///
356    /// # Panics
357    ///
358    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
359    #[inline]
360    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
361        match value {
362            None => self.append_null(),
363            Some(v) => self.append_value(v),
364        };
365    }
366
367    /// Append an `Option` value into the builder repeatedly `count` times.
368    /// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups.
369    ///
370    /// # Panics
371    ///
372    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
373    pub fn append_options(&mut self, value: Option<impl AsRef<T::Native>>, count: usize) {
374        match value {
375            None => self.keys_builder.append_nulls(count),
376            Some(v) => self.append_values(v, count),
377        };
378    }
379
380    /// Extends builder with an existing dictionary array.
381    ///
382    /// This is the same as [`Self::extend`] but is faster as it translates
383    /// the dictionary values once rather than doing a lookup for each item in the iterator
384    ///
385    /// when dictionary values are null (the actual mapped values) the keys are null
386    ///
387    pub fn extend_dictionary(
388        &mut self,
389        dictionary: &TypedDictionaryArray<K, GenericByteArray<T>>,
390    ) -> Result<(), ArrowError> {
391        let values = dictionary.values();
392
393        let v_len = values.len();
394        let k_len = dictionary.keys().len();
395        if v_len == 0 && k_len == 0 {
396            return Ok(());
397        }
398
399        // All nulls
400        if v_len == 0 {
401            self.append_nulls(k_len);
402            return Ok(());
403        }
404
405        if k_len == 0 {
406            return Err(ArrowError::InvalidArgumentError(
407                "Dictionary keys should not be empty when values are not empty".to_string(),
408            ));
409        }
410
411        // Orphan values will be carried over to the new dictionary
412        let mapped_values = values
413            .iter()
414            // Dictionary values can technically be null, so we need to handle that
415            .map(|dict_value| {
416                dict_value
417                    .map(|dict_value| self.get_or_insert_key(dict_value))
418                    .transpose()
419            })
420            .collect::<Result<Vec<_>, _>>()?;
421
422        // Just insert the keys without additional lookups
423        dictionary.keys().iter().for_each(|key| match key {
424            None => self.append_null(),
425            Some(original_dict_index) => {
426                let index = original_dict_index.as_usize().min(v_len - 1);
427                match mapped_values[index] {
428                    None => self.append_null(),
429                    Some(mapped_value) => self.keys_builder.append_value(mapped_value),
430                }
431            }
432        });
433
434        Ok(())
435    }
436
437    /// Builds the `DictionaryArray` and reset this builder.
438    pub fn finish(&mut self) -> DictionaryArray<K> {
439        self.dedup.clear();
440        let values = self.values_builder.finish();
441        let keys = self.keys_builder.finish();
442
443        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
444
445        let builder = keys
446            .into_data()
447            .into_builder()
448            .data_type(data_type)
449            .child_data(vec![values.into_data()]);
450
451        DictionaryArray::from(unsafe { builder.build_unchecked() })
452    }
453
454    /// Builds the `DictionaryArray` without resetting the builder.
455    pub fn finish_cloned(&self) -> DictionaryArray<K> {
456        let values = self.values_builder.finish_cloned();
457        let keys = self.keys_builder.finish_cloned();
458
459        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
460
461        let builder = keys
462            .into_data()
463            .into_builder()
464            .data_type(data_type)
465            .child_data(vec![values.into_data()]);
466
467        DictionaryArray::from(unsafe { builder.build_unchecked() })
468    }
469
470    /// Builds the `DictionaryArray` without resetting the values builder or
471    /// the internal de-duplication map.
472    ///
473    /// The advantage of doing this is that the values will represent the entire
474    /// set of what has been built so-far by this builder and ensures
475    /// consistency in the assignment of keys to values across multiple calls
476    /// to `finish_preserve_values`. This enables ipc writers to efficiently
477    /// emit delta dictionaries.
478    ///
479    /// The downside to this is that building the record requires creating a
480    /// copy of the values, which can become slowly more expensive if the
481    /// dictionary grows.
482    ///
483    /// Additionally, if record batches from multiple different dictionary
484    /// builders for the same column are fed into a single ipc writer, beware
485    /// that entire dictionaries are likely to be re-sent frequently even when
486    /// the majority of the values are not used by the current record batch.
487    pub fn finish_preserve_values(&mut self) -> DictionaryArray<K> {
488        let values = self.values_builder.finish_cloned();
489        let keys = self.keys_builder.finish();
490
491        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
492
493        let builder = keys
494            .into_data()
495            .into_builder()
496            .data_type(data_type)
497            .child_data(vec![values.into_data()]);
498
499        DictionaryArray::from(unsafe { builder.build_unchecked() })
500    }
501
502    /// Returns the current null buffer as a slice
503    pub fn validity_slice(&self) -> Option<&[u8]> {
504        self.keys_builder.validity_slice()
505    }
506}
507
508impl<K: ArrowDictionaryKeyType, T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>>
509    for GenericByteDictionaryBuilder<K, T>
510{
511    #[inline]
512    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
513        for v in iter {
514            self.append_option(v)
515        }
516    }
517}
518
519fn get_bytes<T: ByteArrayType>(values: &GenericByteBuilder<T>, idx: usize) -> &[u8] {
520    let offsets = values.offsets_slice();
521    let values = values.values_slice();
522
523    let end_offset = offsets[idx + 1].as_usize();
524    let start_offset = offsets[idx].as_usize();
525
526    &values[start_offset..end_offset]
527}
528
529/// Builder for [`DictionaryArray`] of [`StringArray`](crate::array::StringArray)
530///
531/// ```
532/// // Create a dictionary array indexed by bytes whose values are Strings.
533/// // It can thus hold up to 256 distinct string values.
534///
535/// # use arrow_array::builder::StringDictionaryBuilder;
536/// # use arrow_array::{Int8Array, StringArray};
537/// # use arrow_array::types::Int8Type;
538///
539/// let mut builder = StringDictionaryBuilder::<Int8Type>::new();
540///
541/// // The builder builds the dictionary value by value
542/// builder.append("abc").unwrap();
543/// builder.append_null();
544/// builder.append_n("def", 2).unwrap();  // appends "def" twice with a single lookup
545/// builder.append("abc").unwrap();
546/// let array = builder.finish();
547///
548/// assert_eq!(
549///   array.keys(),
550///   &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
551/// );
552///
553/// // Values are polymorphic and so require a downcast.
554/// let av = array.values();
555/// let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
556///
557/// assert_eq!(ava.value(0), "abc");
558/// assert_eq!(ava.value(1), "def");
559///
560/// ```
561pub type StringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i32>>;
562
563/// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray)
564pub type LargeStringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i64>>;
565
566/// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray)
567///
568/// ```
569/// // Create a dictionary array indexed by bytes whose values are binary.
570/// // It can thus hold up to 256 distinct binary values.
571///
572/// # use arrow_array::builder::BinaryDictionaryBuilder;
573/// # use arrow_array::{BinaryArray, Int8Array};
574/// # use arrow_array::types::Int8Type;
575///
576/// let mut builder = BinaryDictionaryBuilder::<Int8Type>::new();
577///
578/// // The builder builds the dictionary value by value
579/// builder.append(b"abc").unwrap();
580/// builder.append_null();
581/// builder.append(b"def").unwrap();
582/// builder.append(b"def").unwrap();
583/// builder.append(b"abc").unwrap();
584/// let array = builder.finish();
585///
586/// assert_eq!(
587///   array.keys(),
588///   &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
589/// );
590///
591/// // Values are polymorphic and so require a downcast.
592/// let av = array.values();
593/// let ava: &BinaryArray = av.as_any().downcast_ref::<BinaryArray>().unwrap();
594///
595/// assert_eq!(ava.value(0), b"abc");
596/// assert_eq!(ava.value(1), b"def");
597///
598/// ```
599pub type BinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i32>>;
600
601/// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray)
602pub type LargeBinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i64>>;
603
604#[cfg(test)]
605mod tests {
606    use super::*;
607
608    use crate::array::Int8Array;
609    use crate::cast::AsArray;
610    use crate::types::{Int8Type, Int16Type, Int32Type, UInt8Type, UInt16Type, Utf8Type};
611    use crate::{ArrowPrimitiveType, BinaryArray, StringArray};
612
613    fn test_bytes_dictionary_builder<T>(values: Vec<&T::Native>)
614    where
615        T: ByteArrayType,
616        <T as ByteArrayType>::Native: PartialEq,
617        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
618    {
619        let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
620        builder.append(values[0]).unwrap();
621        builder.append_null();
622        builder.append(values[1]).unwrap();
623        builder.append(values[1]).unwrap();
624        builder.append(values[0]).unwrap();
625        let array = builder.finish();
626
627        assert_eq!(
628            array.keys(),
629            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
630        );
631
632        // Values are polymorphic and so require a downcast.
633        let av = array.values();
634        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
635
636        assert_eq!(*ava.value(0), *values[0]);
637        assert_eq!(*ava.value(1), *values[1]);
638    }
639
640    #[test]
641    fn test_string_dictionary_builder() {
642        test_bytes_dictionary_builder::<GenericStringType<i32>>(vec!["abc", "def"]);
643    }
644
645    #[test]
646    fn test_binary_dictionary_builder() {
647        test_bytes_dictionary_builder::<GenericBinaryType<i32>>(vec![b"abc", b"def"]);
648    }
649
650    #[test]
651    fn test_with_capacity_presizes_dedup() {
652        // `with_capacity` must size the dedup `HashTable` from `value_capacity`,
653        // otherwise the first inserts force a chain of resize+rehash cycles.
654        let value_capacity = 128;
655        let builder =
656            GenericByteDictionaryBuilder::<Int32Type, GenericStringType<i32>>::with_capacity(
657                256,
658                value_capacity,
659                value_capacity * 32,
660            );
661        assert!(
662            builder.dedup.capacity() >= value_capacity,
663            "dedup HashTable not pre-sized: got capacity {}, expected >= {}",
664            builder.dedup.capacity(),
665            value_capacity,
666        );
667    }
668
669    fn test_bytes_dictionary_builder_finish_cloned<T>(values: Vec<&T::Native>)
670    where
671        T: ByteArrayType,
672        <T as ByteArrayType>::Native: PartialEq,
673        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
674    {
675        let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
676
677        builder.append(values[0]).unwrap();
678        builder.append_null();
679        builder.append(values[1]).unwrap();
680        builder.append(values[1]).unwrap();
681        builder.append(values[0]).unwrap();
682        let mut array = builder.finish_cloned();
683
684        assert_eq!(
685            array.keys(),
686            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
687        );
688
689        // Values are polymorphic and so require a downcast.
690        let av = array.values();
691        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
692
693        assert_eq!(ava.value(0), values[0]);
694        assert_eq!(ava.value(1), values[1]);
695
696        builder.append(values[0]).unwrap();
697        builder.append(values[2]).unwrap();
698        builder.append(values[1]).unwrap();
699
700        array = builder.finish();
701
702        assert_eq!(
703            array.keys(),
704            &Int8Array::from(vec![
705                Some(0),
706                None,
707                Some(1),
708                Some(1),
709                Some(0),
710                Some(0),
711                Some(2),
712                Some(1)
713            ])
714        );
715
716        // Values are polymorphic and so require a downcast.
717        let av2 = array.values();
718        let ava2: &GenericByteArray<T> =
719            av2.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
720
721        assert_eq!(ava2.value(0), values[0]);
722        assert_eq!(ava2.value(1), values[1]);
723        assert_eq!(ava2.value(2), values[2]);
724    }
725
726    #[test]
727    fn test_string_dictionary_builder_finish_cloned() {
728        test_bytes_dictionary_builder_finish_cloned::<GenericStringType<i32>>(vec![
729            "abc", "def", "ghi",
730        ]);
731    }
732
733    #[test]
734    fn test_binary_dictionary_builder_finish_cloned() {
735        test_bytes_dictionary_builder_finish_cloned::<GenericBinaryType<i32>>(vec![
736            b"abc", b"def", b"ghi",
737        ]);
738    }
739
740    fn _test_try_new_from_builder_generic_for_key_types<K1, K2, T>(values: Vec<&T::Native>)
741    where
742        K1: ArrowDictionaryKeyType,
743        K1::Native: NumCast,
744        K2: ArrowDictionaryKeyType,
745        K2::Native: NumCast + From<u8>,
746        T: ByteArrayType,
747        <T as ByteArrayType>::Native: PartialEq + AsRef<<T as ByteArrayType>::Native>,
748    {
749        let mut source = GenericByteDictionaryBuilder::<K1, T>::new();
750        source.append(values[0]).unwrap();
751        source.append(values[1]).unwrap();
752        source.append_null();
753        source.append(values[2]).unwrap();
754
755        let mut result =
756            GenericByteDictionaryBuilder::<K2, T>::try_new_from_builder(source).unwrap();
757        let array = result.finish();
758
759        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
760        expected_keys_builder
761            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(0u8));
762        expected_keys_builder
763            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(1u8));
764        expected_keys_builder.append_null();
765        expected_keys_builder
766            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(2u8));
767        let expected_keys = expected_keys_builder.finish();
768        assert_eq!(array.keys(), &expected_keys);
769
770        let av = array.values();
771        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
772        assert_eq!(ava.value(0), values[0]);
773        assert_eq!(ava.value(1), values[1]);
774        assert_eq!(ava.value(2), values[2]);
775    }
776
777    fn test_try_new_from_builder<T>(values: Vec<&T::Native>)
778    where
779        T: ByteArrayType,
780        <T as ByteArrayType>::Native: PartialEq + AsRef<<T as ByteArrayType>::Native>,
781    {
782        // test cast to bigger size unsigned
783        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type, T>(
784            values.clone(),
785        );
786        // test cast going to smaller size unsigned
787        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type, T>(
788            values.clone(),
789        );
790        // test cast going to bigger size signed
791        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type, T>(values.clone());
792        // test cast going to smaller size signed
793        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type, T>(values.clone());
794        // test going from signed to signed for different size changes
795        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type, T>(values.clone());
796        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type, T>(values.clone());
797        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type, T>(values.clone());
798        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type, T>(values.clone());
799    }
800
801    #[test]
802    fn test_string_dictionary_builder_try_new_from_builder() {
803        test_try_new_from_builder::<GenericStringType<i32>>(vec!["abc", "def", "ghi"]);
804    }
805
806    #[test]
807    fn test_binary_dictionary_builder_try_new_from_builder() {
808        test_try_new_from_builder::<GenericBinaryType<i32>>(vec![b"abc", b"def", b"ghi"]);
809    }
810
811    #[test]
812    fn test_try_new_from_builder_cast_fails() {
813        let mut source_builder = StringDictionaryBuilder::<UInt16Type>::new();
814        for i in 0..257 {
815            source_builder.append_value(format!("val{i}"));
816        }
817
818        // there should be too many values that we can't downcast to the underlying type
819        // we have keys that wouldn't fit into UInt8Type
820        let result = StringDictionaryBuilder::<UInt8Type>::try_new_from_builder(source_builder);
821        assert!(result.is_err());
822        if let Err(e) = result {
823            assert!(matches!(e, ArrowError::CastError(_)));
824            assert_eq!(
825                e.to_string(),
826                "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
827            );
828        }
829    }
830
831    fn test_bytes_dictionary_builder_with_existing_dictionary<T>(
832        dictionary: GenericByteArray<T>,
833        values: Vec<&T::Native>,
834    ) where
835        T: ByteArrayType,
836        <T as ByteArrayType>::Native: PartialEq,
837        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
838    {
839        let mut builder =
840            GenericByteDictionaryBuilder::<Int8Type, T>::new_with_dictionary(6, &dictionary)
841                .unwrap();
842        builder.append(values[0]).unwrap();
843        builder.append_null();
844        builder.append(values[1]).unwrap();
845        builder.append(values[1]).unwrap();
846        builder.append(values[0]).unwrap();
847        builder.append(values[2]).unwrap();
848        let array = builder.finish();
849
850        assert_eq!(
851            array.keys(),
852            &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)])
853        );
854
855        // Values are polymorphic and so require a downcast.
856        let av = array.values();
857        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
858
859        assert!(!ava.is_valid(0));
860        assert_eq!(ava.value(1), values[1]);
861        assert_eq!(ava.value(2), values[0]);
862        assert_eq!(ava.value(3), values[2]);
863    }
864
865    #[test]
866    fn test_string_dictionary_builder_with_existing_dictionary() {
867        test_bytes_dictionary_builder_with_existing_dictionary::<GenericStringType<i32>>(
868            StringArray::from(vec![None, Some("def"), Some("abc")]),
869            vec!["abc", "def", "ghi"],
870        );
871    }
872
873    #[test]
874    fn test_binary_dictionary_builder_with_existing_dictionary() {
875        let values: Vec<Option<&[u8]>> = vec![None, Some(b"def"), Some(b"abc")];
876        test_bytes_dictionary_builder_with_existing_dictionary::<GenericBinaryType<i32>>(
877            BinaryArray::from(values),
878            vec![b"abc", b"def", b"ghi"],
879        );
880    }
881
882    fn test_bytes_dictionary_builder_with_reserved_null_value<T>(
883        dictionary: GenericByteArray<T>,
884        values: Vec<&T::Native>,
885    ) where
886        T: ByteArrayType,
887        <T as ByteArrayType>::Native: PartialEq,
888        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
889    {
890        let mut builder =
891            GenericByteDictionaryBuilder::<Int16Type, T>::new_with_dictionary(4, &dictionary)
892                .unwrap();
893        builder.append(values[0]).unwrap();
894        builder.append_null();
895        builder.append(values[1]).unwrap();
896        builder.append(values[0]).unwrap();
897        let array = builder.finish();
898
899        assert!(array.is_null(1));
900        assert!(!array.is_valid(1));
901
902        let keys = array.keys();
903
904        assert_eq!(keys.value(0), 1);
905        assert!(keys.is_null(1));
906        // zero initialization is currently guaranteed by Buffer allocation and resizing
907        assert_eq!(keys.value(1), 0);
908        assert_eq!(keys.value(2), 2);
909        assert_eq!(keys.value(3), 1);
910    }
911
912    #[test]
913    fn test_string_dictionary_builder_with_reserved_null_value() {
914        let v: Vec<Option<&str>> = vec![None];
915        test_bytes_dictionary_builder_with_reserved_null_value::<GenericStringType<i32>>(
916            StringArray::from(v),
917            vec!["abc", "def"],
918        );
919    }
920
921    #[test]
922    fn test_binary_dictionary_builder_with_reserved_null_value() {
923        let values: Vec<Option<&[u8]>> = vec![None];
924        test_bytes_dictionary_builder_with_reserved_null_value::<GenericBinaryType<i32>>(
925            BinaryArray::from(values),
926            vec![b"abc", b"def"],
927        );
928    }
929
930    #[test]
931    fn test_extend() {
932        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
933        builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
934        builder.extend(["c", "d", "a"].into_iter().map(Some));
935        let dict = builder.finish();
936        assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]);
937        assert_eq!(dict.values().len(), 4);
938    }
939
940    #[test]
941    fn test_extend_dictionary() {
942        let some_dict = {
943            let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
944            builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
945            builder.extend([None::<&str>]);
946            builder.extend(["c", "d", "a"].into_iter().map(Some));
947            builder.append_null();
948            builder.finish()
949        };
950
951        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
952        builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some));
953        builder
954            .extend_dictionary(&some_dict.downcast_dict().unwrap())
955            .unwrap();
956        let dict = builder.finish();
957
958        assert_eq!(dict.values().len(), 6);
959
960        let values = dict
961            .downcast_dict::<GenericByteArray<Utf8Type>>()
962            .unwrap()
963            .into_iter()
964            .collect::<Vec<_>>();
965
966        assert_eq!(
967            values,
968            [
969                Some("e"),
970                Some("e"),
971                Some("f"),
972                Some("e"),
973                Some("d"),
974                Some("a"),
975                Some("b"),
976                Some("c"),
977                Some("a"),
978                Some("b"),
979                Some("c"),
980                None,
981                Some("c"),
982                Some("d"),
983                Some("a"),
984                None
985            ]
986        );
987    }
988    #[test]
989    fn test_extend_dictionary_with_null_in_mapped_value() {
990        let some_dict = {
991            let mut values_builder = GenericByteBuilder::<Utf8Type>::new();
992            let mut keys_builder = PrimitiveBuilder::<Int32Type>::new();
993
994            // Manually build a dictionary values that the mapped values have null
995            values_builder.append_null();
996            keys_builder.append_value(0);
997            values_builder.append_value("I like worm hugs");
998            keys_builder.append_value(1);
999
1000            let values = values_builder.finish();
1001            let keys = keys_builder.finish();
1002
1003            let data_type = DataType::Dictionary(
1004                Box::new(Int32Type::DATA_TYPE),
1005                Box::new(Utf8Type::DATA_TYPE),
1006            );
1007
1008            let builder = keys
1009                .into_data()
1010                .into_builder()
1011                .data_type(data_type)
1012                .child_data(vec![values.into_data()]);
1013
1014            DictionaryArray::from(unsafe { builder.build_unchecked() })
1015        };
1016
1017        let some_dict_values = some_dict.values().as_string::<i32>();
1018        assert_eq!(
1019            some_dict_values.into_iter().collect::<Vec<_>>(),
1020            &[None, Some("I like worm hugs")]
1021        );
1022
1023        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1024        builder
1025            .extend_dictionary(&some_dict.downcast_dict().unwrap())
1026            .unwrap();
1027        let dict = builder.finish();
1028
1029        assert_eq!(dict.values().len(), 1);
1030
1031        let values = dict
1032            .downcast_dict::<GenericByteArray<Utf8Type>>()
1033            .unwrap()
1034            .into_iter()
1035            .collect::<Vec<_>>();
1036
1037        assert_eq!(values, [None, Some("I like worm hugs")]);
1038    }
1039
1040    #[test]
1041    fn test_extend_all_null_dictionary() {
1042        let some_dict = {
1043            let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1044            builder.append_nulls(2);
1045            builder.finish()
1046        };
1047
1048        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1049        builder
1050            .extend_dictionary(&some_dict.downcast_dict().unwrap())
1051            .unwrap();
1052        let dict = builder.finish();
1053
1054        assert_eq!(dict.values().len(), 0);
1055
1056        let values = dict
1057            .downcast_dict::<GenericByteArray<Utf8Type>>()
1058            .unwrap()
1059            .into_iter()
1060            .collect::<Vec<_>>();
1061
1062        assert_eq!(values, [None, None]);
1063    }
1064
1065    #[test]
1066    fn test_finish_preserve_values() {
1067        // Create the first dictionary
1068        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1069        builder.append("a").unwrap();
1070        builder.append("b").unwrap();
1071        builder.append("c").unwrap();
1072        let dict = builder.finish_preserve_values();
1073        assert_eq!(dict.keys().values(), &[0, 1, 2]);
1074        assert_eq!(dict.values().len(), 3);
1075        let values = dict
1076            .downcast_dict::<GenericByteArray<Utf8Type>>()
1077            .unwrap()
1078            .into_iter()
1079            .collect::<Vec<_>>();
1080        assert_eq!(values, [Some("a"), Some("b"), Some("c")]);
1081
1082        // Create a new dictionary
1083        builder.append("d").unwrap();
1084        builder.append("e").unwrap();
1085        let dict2 = builder.finish_preserve_values();
1086
1087        // Make sure the keys are assigned after the old ones and we have the
1088        // right values
1089        assert_eq!(dict2.keys().values(), &[3, 4]);
1090        let values = dict2
1091            .downcast_dict::<GenericByteArray<Utf8Type>>()
1092            .unwrap()
1093            .into_iter()
1094            .collect::<Vec<_>>();
1095        assert_eq!(values, [Some("d"), Some("e")]);
1096
1097        // Check that we have all of the expected values
1098        assert_eq!(dict2.values().len(), 5);
1099        let all_values = dict2
1100            .values()
1101            .as_any()
1102            .downcast_ref::<StringArray>()
1103            .unwrap()
1104            .into_iter()
1105            .collect::<Vec<_>>();
1106        assert_eq!(
1107            all_values,
1108            [Some("a"), Some("b"), Some("c"), Some("d"), Some("e"),]
1109        );
1110    }
1111}