arrow_array/builder/
generic_bytes_dictionary_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
19use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
20use crate::{
21    Array, ArrayRef, DictionaryArray, GenericByteArray, PrimitiveArray, TypedDictionaryArray,
22};
23use arrow_buffer::ArrowNativeType;
24use arrow_schema::{ArrowError, DataType};
25use hashbrown::HashTable;
26use num::NumCast;
27use std::any::Any;
28use std::sync::Arc;
29
30/// Builder for [`DictionaryArray`] of [`GenericByteArray`]
31///
32/// For example to map a set of byte indices to String values. Note that
33/// the use of a `HashMap` here will not scale to very large arrays or
34/// result in an ordered dictionary.
35#[derive(Debug)]
36pub struct GenericByteDictionaryBuilder<K, T>
37where
38    K: ArrowDictionaryKeyType,
39    T: ByteArrayType,
40{
41    state: ahash::RandomState,
42    dedup: HashTable<usize>,
43
44    keys_builder: PrimitiveBuilder<K>,
45    values_builder: GenericByteBuilder<T>,
46}
47
48impl<K, T> Default for GenericByteDictionaryBuilder<K, T>
49where
50    K: ArrowDictionaryKeyType,
51    T: ByteArrayType,
52{
53    fn default() -> Self {
54        Self::new()
55    }
56}
57
58impl<K, T> GenericByteDictionaryBuilder<K, T>
59where
60    K: ArrowDictionaryKeyType,
61    T: ByteArrayType,
62{
63    /// Creates a new `GenericByteDictionaryBuilder`
64    pub fn new() -> Self {
65        let keys_builder = PrimitiveBuilder::new();
66        let values_builder = GenericByteBuilder::<T>::new();
67        Self {
68            state: Default::default(),
69            dedup: HashTable::with_capacity(keys_builder.capacity()),
70            keys_builder,
71            values_builder,
72        }
73    }
74
75    /// Creates a new `GenericByteDictionaryBuilder` with the provided capacities
76    ///
77    /// `keys_capacity`: the number of keys, i.e. length of array to build
78    /// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
79    /// `data_capacity`: the total number of bytes of all distinct bytes in the dictionary
80    pub fn with_capacity(
81        keys_capacity: usize,
82        value_capacity: usize,
83        data_capacity: usize,
84    ) -> Self {
85        Self {
86            state: Default::default(),
87            dedup: Default::default(),
88            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
89            values_builder: GenericByteBuilder::<T>::with_capacity(value_capacity, data_capacity),
90        }
91    }
92
93    /// Creates a new `GenericByteDictionaryBuilder` from a keys capacity and a dictionary
94    /// which is initialized with the given values.
95    /// The indices of those dictionary values are used as keys.
96    ///
97    /// # Example
98    ///
99    /// ```
100    /// # use arrow_array::builder::StringDictionaryBuilder;
101    /// # use arrow_array::{Int16Array, StringArray};
102    ///
103    /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
104    ///
105    /// let mut builder = StringDictionaryBuilder::new_with_dictionary(3, &dictionary_values).unwrap();
106    /// builder.append("def").unwrap();
107    /// builder.append_null();
108    /// builder.append("abc").unwrap();
109    ///
110    /// let dictionary_array = builder.finish();
111    ///
112    /// let keys = dictionary_array.keys();
113    ///
114    /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)]));
115    /// ```
116    pub fn new_with_dictionary(
117        keys_capacity: usize,
118        dictionary_values: &GenericByteArray<T>,
119    ) -> Result<Self, ArrowError> {
120        let state = ahash::RandomState::default();
121        let dict_len = dictionary_values.len();
122
123        let mut dedup = HashTable::with_capacity(dict_len);
124
125        let values_len = dictionary_values.value_data().len();
126        let mut values_builder = GenericByteBuilder::<T>::with_capacity(dict_len, values_len);
127
128        K::Native::from_usize(dictionary_values.len())
129            .ok_or(ArrowError::DictionaryKeyOverflowError)?;
130
131        for (idx, maybe_value) in dictionary_values.iter().enumerate() {
132            match maybe_value {
133                Some(value) => {
134                    let value_bytes: &[u8] = value.as_ref();
135                    let hash = state.hash_one(value_bytes);
136
137                    dedup
138                        .entry(
139                            hash,
140                            |idx: &usize| value_bytes == get_bytes(&values_builder, *idx),
141                            |idx: &usize| state.hash_one(get_bytes(&values_builder, *idx)),
142                        )
143                        .or_insert(idx);
144
145                    values_builder.append_value(value);
146                }
147                None => values_builder.append_null(),
148            }
149        }
150
151        Ok(Self {
152            state,
153            dedup,
154            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
155            values_builder,
156        })
157    }
158
159    /// Creates a new `GenericByteDictionaryBuilder` from the existing builder with the same
160    /// keys and values, but with a new data type for the keys.
161    ///
162    /// # Example
163    /// ```
164    /// #
165    /// # use arrow_array::builder::StringDictionaryBuilder;
166    /// # use arrow_array::types::{UInt8Type, UInt16Type};
167    /// # use arrow_array::UInt16Array;
168    /// # use arrow_schema::ArrowError;
169    ///
170    /// let mut u8_keyed_builder = StringDictionaryBuilder::<UInt8Type>::new();
171    ///
172    /// // appending too many values causes the dictionary to overflow
173    /// for i in 0..256 {
174    ///     u8_keyed_builder.append_value(format!("{}", i));
175    /// }
176    /// let result = u8_keyed_builder.append("256");
177    /// assert!(matches!(result, Err(ArrowError::DictionaryKeyOverflowError{})));
178    ///
179    /// // we need to upgrade to a larger key type
180    /// let mut u16_keyed_builder = StringDictionaryBuilder::<UInt16Type>::try_new_from_builder(u8_keyed_builder).unwrap();
181    /// let dictionary_array = u16_keyed_builder.finish();
182    /// let keys = dictionary_array.keys();
183    ///
184    /// assert_eq!(keys, &UInt16Array::from_iter(0..256));
185    /// ```
186    pub fn try_new_from_builder<K2>(
187        mut source: GenericByteDictionaryBuilder<K2, T>,
188    ) -> Result<Self, ArrowError>
189    where
190        K::Native: NumCast,
191        K2: ArrowDictionaryKeyType,
192        K2::Native: NumCast,
193    {
194        let state = source.state;
195        let dedup = source.dedup;
196        let values_builder = source.values_builder;
197
198        let source_keys = source.keys_builder.finish();
199        let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
200            num::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
201                ArrowError::CastError(format!(
202                    "Can't cast dictionary keys from source type {:?} to type {:?}",
203                    K2::DATA_TYPE,
204                    K::DATA_TYPE
205                ))
206            })
207        })?;
208
209        // drop source key here because currently source_keys and new_keys are holding reference to
210        // the same underlying null_buffer. Below we want to call new_keys.into_builder() it must
211        // be the only reference holder.
212        drop(source_keys);
213
214        Ok(Self {
215            state,
216            dedup,
217            keys_builder: new_keys
218                .into_builder()
219                .expect("underlying buffer has no references"),
220            values_builder,
221        })
222    }
223}
224
225impl<K, T> ArrayBuilder for GenericByteDictionaryBuilder<K, T>
226where
227    K: ArrowDictionaryKeyType,
228    T: ByteArrayType,
229{
230    /// Returns the builder as an non-mutable `Any` reference.
231    fn as_any(&self) -> &dyn Any {
232        self
233    }
234
235    /// Returns the builder as an mutable `Any` reference.
236    fn as_any_mut(&mut self) -> &mut dyn Any {
237        self
238    }
239
240    /// Returns the boxed builder as a box of `Any`.
241    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
242        self
243    }
244
245    /// Returns the number of array slots in the builder
246    fn len(&self) -> usize {
247        self.keys_builder.len()
248    }
249
250    /// Builds the array and reset this builder.
251    fn finish(&mut self) -> ArrayRef {
252        Arc::new(self.finish())
253    }
254
255    /// Builds the array without resetting the builder.
256    fn finish_cloned(&self) -> ArrayRef {
257        Arc::new(self.finish_cloned())
258    }
259}
260
261impl<K, T> GenericByteDictionaryBuilder<K, T>
262where
263    K: ArrowDictionaryKeyType,
264    T: ByteArrayType,
265{
266    fn get_or_insert_key(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
267        let value_native: &T::Native = value.as_ref();
268        let value_bytes: &[u8] = value_native.as_ref();
269
270        let state = &self.state;
271        let storage = &mut self.values_builder;
272        let hash = state.hash_one(value_bytes);
273
274        let idx = *self
275            .dedup
276            .entry(
277                hash,
278                |idx| value_bytes == get_bytes(storage, *idx),
279                |idx| state.hash_one(get_bytes(storage, *idx)),
280            )
281            .or_insert_with(|| {
282                let idx = storage.len();
283                storage.append_value(value);
284                idx
285            })
286            .get();
287
288        let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
289
290        Ok(key)
291    }
292
293    /// Append a value to the array. Return an existing index
294    /// if already present in the values array or a new index if the
295    /// value is appended to the values array.
296    ///
297    /// Returns an error if the new index would overflow the key type.
298    pub fn append(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
299        let key = self.get_or_insert_key(value)?;
300        self.keys_builder.append_value(key);
301        Ok(key)
302    }
303
304    /// Append a value multiple times to the array.
305    /// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups.
306    ///
307    /// Returns an error if the new index would overflow the key type.
308    pub fn append_n(
309        &mut self,
310        value: impl AsRef<T::Native>,
311        count: usize,
312    ) -> Result<K::Native, ArrowError> {
313        let key = self.get_or_insert_key(value)?;
314        self.keys_builder.append_value_n(key, count);
315        Ok(key)
316    }
317
318    /// Infallibly append a value to this builder
319    ///
320    /// # Panics
321    ///
322    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
323    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
324        self.append(value).expect("dictionary key overflow");
325    }
326
327    /// Infallibly append a value to this builder repeatedly `count` times.
328    /// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups.
329    ///
330    /// # Panics
331    ///
332    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
333    pub fn append_values(&mut self, value: impl AsRef<T::Native>, count: usize) {
334        self.append_n(value, count)
335            .expect("dictionary key overflow");
336    }
337
338    /// Appends a null slot into the builder
339    #[inline]
340    pub fn append_null(&mut self) {
341        self.keys_builder.append_null()
342    }
343
344    /// Infallibly append `n` null slots into the builder
345    #[inline]
346    pub fn append_nulls(&mut self, n: usize) {
347        self.keys_builder.append_nulls(n)
348    }
349
350    /// Append an `Option` value into the builder
351    ///
352    /// # Panics
353    ///
354    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
355    #[inline]
356    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
357        match value {
358            None => self.append_null(),
359            Some(v) => self.append_value(v),
360        };
361    }
362
363    /// Append an `Option` value into the builder repeatedly `count` times.
364    /// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups.
365    ///
366    /// # Panics
367    ///
368    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
369    pub fn append_options(&mut self, value: Option<impl AsRef<T::Native>>, count: usize) {
370        match value {
371            None => self.keys_builder.append_nulls(count),
372            Some(v) => self.append_values(v, count),
373        };
374    }
375
376    /// Extends builder with an existing dictionary array.
377    ///
378    /// This is the same as [`Self::extend`] but is faster as it translates
379    /// the dictionary values once rather than doing a lookup for each item in the iterator
380    ///
381    /// when dictionary values are null (the actual mapped values) the keys are null
382    ///
383    pub fn extend_dictionary(
384        &mut self,
385        dictionary: &TypedDictionaryArray<K, GenericByteArray<T>>,
386    ) -> Result<(), ArrowError> {
387        let values = dictionary.values();
388
389        let v_len = values.len();
390        let k_len = dictionary.keys().len();
391        if v_len == 0 && k_len == 0 {
392            return Ok(());
393        }
394
395        // All nulls
396        if v_len == 0 {
397            self.append_nulls(k_len);
398            return Ok(());
399        }
400
401        if k_len == 0 {
402            return Err(ArrowError::InvalidArgumentError(
403                "Dictionary keys should not be empty when values are not empty".to_string(),
404            ));
405        }
406
407        // Orphan values will be carried over to the new dictionary
408        let mapped_values = values
409            .iter()
410            // Dictionary values can technically be null, so we need to handle that
411            .map(|dict_value| {
412                dict_value
413                    .map(|dict_value| self.get_or_insert_key(dict_value))
414                    .transpose()
415            })
416            .collect::<Result<Vec<_>, _>>()?;
417
418        // Just insert the keys without additional lookups
419        dictionary.keys().iter().for_each(|key| match key {
420            None => self.append_null(),
421            Some(original_dict_index) => {
422                let index = original_dict_index.as_usize().min(v_len - 1);
423                match mapped_values[index] {
424                    None => self.append_null(),
425                    Some(mapped_value) => self.keys_builder.append_value(mapped_value),
426                }
427            }
428        });
429
430        Ok(())
431    }
432
433    /// Builds the `DictionaryArray` and reset this builder.
434    pub fn finish(&mut self) -> DictionaryArray<K> {
435        self.dedup.clear();
436        let values = self.values_builder.finish();
437        let keys = self.keys_builder.finish();
438
439        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
440
441        let builder = keys
442            .into_data()
443            .into_builder()
444            .data_type(data_type)
445            .child_data(vec![values.into_data()]);
446
447        DictionaryArray::from(unsafe { builder.build_unchecked() })
448    }
449
450    /// Builds the `DictionaryArray` without resetting the builder.
451    pub fn finish_cloned(&self) -> DictionaryArray<K> {
452        let values = self.values_builder.finish_cloned();
453        let keys = self.keys_builder.finish_cloned();
454
455        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
456
457        let builder = keys
458            .into_data()
459            .into_builder()
460            .data_type(data_type)
461            .child_data(vec![values.into_data()]);
462
463        DictionaryArray::from(unsafe { builder.build_unchecked() })
464    }
465
466    /// Returns the current null buffer as a slice
467    pub fn validity_slice(&self) -> Option<&[u8]> {
468        self.keys_builder.validity_slice()
469    }
470}
471
472impl<K: ArrowDictionaryKeyType, T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>>
473    for GenericByteDictionaryBuilder<K, T>
474{
475    #[inline]
476    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
477        for v in iter {
478            self.append_option(v)
479        }
480    }
481}
482
483fn get_bytes<T: ByteArrayType>(values: &GenericByteBuilder<T>, idx: usize) -> &[u8] {
484    let offsets = values.offsets_slice();
485    let values = values.values_slice();
486
487    let end_offset = offsets[idx + 1].as_usize();
488    let start_offset = offsets[idx].as_usize();
489
490    &values[start_offset..end_offset]
491}
492
493/// Builder for [`DictionaryArray`] of [`StringArray`](crate::array::StringArray)
494///
495/// ```
496/// // Create a dictionary array indexed by bytes whose values are Strings.
497/// // It can thus hold up to 256 distinct string values.
498///
499/// # use arrow_array::builder::StringDictionaryBuilder;
500/// # use arrow_array::{Int8Array, StringArray};
501/// # use arrow_array::types::Int8Type;
502///
503/// let mut builder = StringDictionaryBuilder::<Int8Type>::new();
504///
505/// // The builder builds the dictionary value by value
506/// builder.append("abc").unwrap();
507/// builder.append_null();
508/// builder.append_n("def", 2).unwrap();  // appends "def" twice with a single lookup
509/// builder.append("abc").unwrap();
510/// let array = builder.finish();
511///
512/// assert_eq!(
513///   array.keys(),
514///   &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
515/// );
516///
517/// // Values are polymorphic and so require a downcast.
518/// let av = array.values();
519/// let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
520///
521/// assert_eq!(ava.value(0), "abc");
522/// assert_eq!(ava.value(1), "def");
523///
524/// ```
525pub type StringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i32>>;
526
527/// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray)
528pub type LargeStringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i64>>;
529
530/// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray)
531///
532/// ```
533/// // Create a dictionary array indexed by bytes whose values are binary.
534/// // It can thus hold up to 256 distinct binary values.
535///
536/// # use arrow_array::builder::BinaryDictionaryBuilder;
537/// # use arrow_array::{BinaryArray, Int8Array};
538/// # use arrow_array::types::Int8Type;
539///
540/// let mut builder = BinaryDictionaryBuilder::<Int8Type>::new();
541///
542/// // The builder builds the dictionary value by value
543/// builder.append(b"abc").unwrap();
544/// builder.append_null();
545/// builder.append(b"def").unwrap();
546/// builder.append(b"def").unwrap();
547/// builder.append(b"abc").unwrap();
548/// let array = builder.finish();
549///
550/// assert_eq!(
551///   array.keys(),
552///   &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
553/// );
554///
555/// // Values are polymorphic and so require a downcast.
556/// let av = array.values();
557/// let ava: &BinaryArray = av.as_any().downcast_ref::<BinaryArray>().unwrap();
558///
559/// assert_eq!(ava.value(0), b"abc");
560/// assert_eq!(ava.value(1), b"def");
561///
562/// ```
563pub type BinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i32>>;
564
565/// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray)
566pub type LargeBinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i64>>;
567
568#[cfg(test)]
569mod tests {
570    use super::*;
571
572    use crate::array::Int8Array;
573    use crate::cast::AsArray;
574    use crate::types::{Int16Type, Int32Type, Int8Type, UInt16Type, UInt8Type, Utf8Type};
575    use crate::{ArrowPrimitiveType, BinaryArray, StringArray};
576
577    fn test_bytes_dictionary_builder<T>(values: Vec<&T::Native>)
578    where
579        T: ByteArrayType,
580        <T as ByteArrayType>::Native: PartialEq,
581        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
582    {
583        let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
584        builder.append(values[0]).unwrap();
585        builder.append_null();
586        builder.append(values[1]).unwrap();
587        builder.append(values[1]).unwrap();
588        builder.append(values[0]).unwrap();
589        let array = builder.finish();
590
591        assert_eq!(
592            array.keys(),
593            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
594        );
595
596        // Values are polymorphic and so require a downcast.
597        let av = array.values();
598        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
599
600        assert_eq!(*ava.value(0), *values[0]);
601        assert_eq!(*ava.value(1), *values[1]);
602    }
603
604    #[test]
605    fn test_string_dictionary_builder() {
606        test_bytes_dictionary_builder::<GenericStringType<i32>>(vec!["abc", "def"]);
607    }
608
609    #[test]
610    fn test_binary_dictionary_builder() {
611        test_bytes_dictionary_builder::<GenericBinaryType<i32>>(vec![b"abc", b"def"]);
612    }
613
614    fn test_bytes_dictionary_builder_finish_cloned<T>(values: Vec<&T::Native>)
615    where
616        T: ByteArrayType,
617        <T as ByteArrayType>::Native: PartialEq,
618        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
619    {
620        let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
621
622        builder.append(values[0]).unwrap();
623        builder.append_null();
624        builder.append(values[1]).unwrap();
625        builder.append(values[1]).unwrap();
626        builder.append(values[0]).unwrap();
627        let mut array = builder.finish_cloned();
628
629        assert_eq!(
630            array.keys(),
631            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
632        );
633
634        // Values are polymorphic and so require a downcast.
635        let av = array.values();
636        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
637
638        assert_eq!(ava.value(0), values[0]);
639        assert_eq!(ava.value(1), values[1]);
640
641        builder.append(values[0]).unwrap();
642        builder.append(values[2]).unwrap();
643        builder.append(values[1]).unwrap();
644
645        array = builder.finish();
646
647        assert_eq!(
648            array.keys(),
649            &Int8Array::from(vec![
650                Some(0),
651                None,
652                Some(1),
653                Some(1),
654                Some(0),
655                Some(0),
656                Some(2),
657                Some(1)
658            ])
659        );
660
661        // Values are polymorphic and so require a downcast.
662        let av2 = array.values();
663        let ava2: &GenericByteArray<T> =
664            av2.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
665
666        assert_eq!(ava2.value(0), values[0]);
667        assert_eq!(ava2.value(1), values[1]);
668        assert_eq!(ava2.value(2), values[2]);
669    }
670
671    #[test]
672    fn test_string_dictionary_builder_finish_cloned() {
673        test_bytes_dictionary_builder_finish_cloned::<GenericStringType<i32>>(vec![
674            "abc", "def", "ghi",
675        ]);
676    }
677
678    #[test]
679    fn test_binary_dictionary_builder_finish_cloned() {
680        test_bytes_dictionary_builder_finish_cloned::<GenericBinaryType<i32>>(vec![
681            b"abc", b"def", b"ghi",
682        ]);
683    }
684
685    fn _test_try_new_from_builder_generic_for_key_types<K1, K2, T>(values: Vec<&T::Native>)
686    where
687        K1: ArrowDictionaryKeyType,
688        K1::Native: NumCast,
689        K2: ArrowDictionaryKeyType,
690        K2::Native: NumCast + From<u8>,
691        T: ByteArrayType,
692        <T as ByteArrayType>::Native: PartialEq + AsRef<<T as ByteArrayType>::Native>,
693    {
694        let mut source = GenericByteDictionaryBuilder::<K1, T>::new();
695        source.append(values[0]).unwrap();
696        source.append(values[1]).unwrap();
697        source.append_null();
698        source.append(values[2]).unwrap();
699
700        let mut result =
701            GenericByteDictionaryBuilder::<K2, T>::try_new_from_builder(source).unwrap();
702        let array = result.finish();
703
704        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
705        expected_keys_builder
706            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(0u8));
707        expected_keys_builder
708            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(1u8));
709        expected_keys_builder.append_null();
710        expected_keys_builder
711            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(2u8));
712        let expected_keys = expected_keys_builder.finish();
713        assert_eq!(array.keys(), &expected_keys);
714
715        let av = array.values();
716        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
717        assert_eq!(ava.value(0), values[0]);
718        assert_eq!(ava.value(1), values[1]);
719        assert_eq!(ava.value(2), values[2]);
720    }
721
722    fn test_try_new_from_builder<T>(values: Vec<&T::Native>)
723    where
724        T: ByteArrayType,
725        <T as ByteArrayType>::Native: PartialEq + AsRef<<T as ByteArrayType>::Native>,
726    {
727        // test cast to bigger size unsigned
728        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type, T>(
729            values.clone(),
730        );
731        // test cast going to smaller size unsigned
732        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type, T>(
733            values.clone(),
734        );
735        // test cast going to bigger size signed
736        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type, T>(values.clone());
737        // test cast going to smaller size signed
738        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type, T>(values.clone());
739        // test going from signed to signed for different size changes
740        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type, T>(values.clone());
741        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type, T>(values.clone());
742        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type, T>(values.clone());
743        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type, T>(values.clone());
744    }
745
746    #[test]
747    fn test_string_dictionary_builder_try_new_from_builder() {
748        test_try_new_from_builder::<GenericStringType<i32>>(vec!["abc", "def", "ghi"]);
749    }
750
751    #[test]
752    fn test_binary_dictionary_builder_try_new_from_builder() {
753        test_try_new_from_builder::<GenericBinaryType<i32>>(vec![b"abc", b"def", b"ghi"]);
754    }
755
756    #[test]
757    fn test_try_new_from_builder_cast_fails() {
758        let mut source_builder = StringDictionaryBuilder::<UInt16Type>::new();
759        for i in 0..257 {
760            source_builder.append_value(format!("val{i}"));
761        }
762
763        // there should be too many values that we can't downcast to the underlying type
764        // we have keys that wouldn't fit into UInt8Type
765        let result = StringDictionaryBuilder::<UInt8Type>::try_new_from_builder(source_builder);
766        assert!(result.is_err());
767        if let Err(e) = result {
768            assert!(matches!(e, ArrowError::CastError(_)));
769            assert_eq!(
770                e.to_string(),
771                "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
772            );
773        }
774    }
775
776    fn test_bytes_dictionary_builder_with_existing_dictionary<T>(
777        dictionary: GenericByteArray<T>,
778        values: Vec<&T::Native>,
779    ) where
780        T: ByteArrayType,
781        <T as ByteArrayType>::Native: PartialEq,
782        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
783    {
784        let mut builder =
785            GenericByteDictionaryBuilder::<Int8Type, T>::new_with_dictionary(6, &dictionary)
786                .unwrap();
787        builder.append(values[0]).unwrap();
788        builder.append_null();
789        builder.append(values[1]).unwrap();
790        builder.append(values[1]).unwrap();
791        builder.append(values[0]).unwrap();
792        builder.append(values[2]).unwrap();
793        let array = builder.finish();
794
795        assert_eq!(
796            array.keys(),
797            &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)])
798        );
799
800        // Values are polymorphic and so require a downcast.
801        let av = array.values();
802        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
803
804        assert!(!ava.is_valid(0));
805        assert_eq!(ava.value(1), values[1]);
806        assert_eq!(ava.value(2), values[0]);
807        assert_eq!(ava.value(3), values[2]);
808    }
809
810    #[test]
811    fn test_string_dictionary_builder_with_existing_dictionary() {
812        test_bytes_dictionary_builder_with_existing_dictionary::<GenericStringType<i32>>(
813            StringArray::from(vec![None, Some("def"), Some("abc")]),
814            vec!["abc", "def", "ghi"],
815        );
816    }
817
818    #[test]
819    fn test_binary_dictionary_builder_with_existing_dictionary() {
820        let values: Vec<Option<&[u8]>> = vec![None, Some(b"def"), Some(b"abc")];
821        test_bytes_dictionary_builder_with_existing_dictionary::<GenericBinaryType<i32>>(
822            BinaryArray::from(values),
823            vec![b"abc", b"def", b"ghi"],
824        );
825    }
826
827    fn test_bytes_dictionary_builder_with_reserved_null_value<T>(
828        dictionary: GenericByteArray<T>,
829        values: Vec<&T::Native>,
830    ) where
831        T: ByteArrayType,
832        <T as ByteArrayType>::Native: PartialEq,
833        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
834    {
835        let mut builder =
836            GenericByteDictionaryBuilder::<Int16Type, T>::new_with_dictionary(4, &dictionary)
837                .unwrap();
838        builder.append(values[0]).unwrap();
839        builder.append_null();
840        builder.append(values[1]).unwrap();
841        builder.append(values[0]).unwrap();
842        let array = builder.finish();
843
844        assert!(array.is_null(1));
845        assert!(!array.is_valid(1));
846
847        let keys = array.keys();
848
849        assert_eq!(keys.value(0), 1);
850        assert!(keys.is_null(1));
851        // zero initialization is currently guaranteed by Buffer allocation and resizing
852        assert_eq!(keys.value(1), 0);
853        assert_eq!(keys.value(2), 2);
854        assert_eq!(keys.value(3), 1);
855    }
856
857    #[test]
858    fn test_string_dictionary_builder_with_reserved_null_value() {
859        let v: Vec<Option<&str>> = vec![None];
860        test_bytes_dictionary_builder_with_reserved_null_value::<GenericStringType<i32>>(
861            StringArray::from(v),
862            vec!["abc", "def"],
863        );
864    }
865
866    #[test]
867    fn test_binary_dictionary_builder_with_reserved_null_value() {
868        let values: Vec<Option<&[u8]>> = vec![None];
869        test_bytes_dictionary_builder_with_reserved_null_value::<GenericBinaryType<i32>>(
870            BinaryArray::from(values),
871            vec![b"abc", b"def"],
872        );
873    }
874
875    #[test]
876    fn test_extend() {
877        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
878        builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
879        builder.extend(["c", "d", "a"].into_iter().map(Some));
880        let dict = builder.finish();
881        assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]);
882        assert_eq!(dict.values().len(), 4);
883    }
884
885    #[test]
886    fn test_extend_dictionary() {
887        let some_dict = {
888            let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
889            builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
890            builder.extend([None::<&str>]);
891            builder.extend(["c", "d", "a"].into_iter().map(Some));
892            builder.append_null();
893            builder.finish()
894        };
895
896        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
897        builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some));
898        builder
899            .extend_dictionary(&some_dict.downcast_dict().unwrap())
900            .unwrap();
901        let dict = builder.finish();
902
903        assert_eq!(dict.values().len(), 6);
904
905        let values = dict
906            .downcast_dict::<GenericByteArray<Utf8Type>>()
907            .unwrap()
908            .into_iter()
909            .collect::<Vec<_>>();
910
911        assert_eq!(
912            values,
913            [
914                Some("e"),
915                Some("e"),
916                Some("f"),
917                Some("e"),
918                Some("d"),
919                Some("a"),
920                Some("b"),
921                Some("c"),
922                Some("a"),
923                Some("b"),
924                Some("c"),
925                None,
926                Some("c"),
927                Some("d"),
928                Some("a"),
929                None
930            ]
931        );
932    }
933    #[test]
934    fn test_extend_dictionary_with_null_in_mapped_value() {
935        let some_dict = {
936            let mut values_builder = GenericByteBuilder::<Utf8Type>::new();
937            let mut keys_builder = PrimitiveBuilder::<Int32Type>::new();
938
939            // Manually build a dictionary values that the mapped values have null
940            values_builder.append_null();
941            keys_builder.append_value(0);
942            values_builder.append_value("I like worm hugs");
943            keys_builder.append_value(1);
944
945            let values = values_builder.finish();
946            let keys = keys_builder.finish();
947
948            let data_type = DataType::Dictionary(
949                Box::new(Int32Type::DATA_TYPE),
950                Box::new(Utf8Type::DATA_TYPE),
951            );
952
953            let builder = keys
954                .into_data()
955                .into_builder()
956                .data_type(data_type)
957                .child_data(vec![values.into_data()]);
958
959            DictionaryArray::from(unsafe { builder.build_unchecked() })
960        };
961
962        let some_dict_values = some_dict.values().as_string::<i32>();
963        assert_eq!(
964            some_dict_values.into_iter().collect::<Vec<_>>(),
965            &[None, Some("I like worm hugs")]
966        );
967
968        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
969        builder
970            .extend_dictionary(&some_dict.downcast_dict().unwrap())
971            .unwrap();
972        let dict = builder.finish();
973
974        assert_eq!(dict.values().len(), 1);
975
976        let values = dict
977            .downcast_dict::<GenericByteArray<Utf8Type>>()
978            .unwrap()
979            .into_iter()
980            .collect::<Vec<_>>();
981
982        assert_eq!(values, [None, Some("I like worm hugs")]);
983    }
984
985    #[test]
986    fn test_extend_all_null_dictionary() {
987        let some_dict = {
988            let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
989            builder.append_nulls(2);
990            builder.finish()
991        };
992
993        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
994        builder
995            .extend_dictionary(&some_dict.downcast_dict().unwrap())
996            .unwrap();
997        let dict = builder.finish();
998
999        assert_eq!(dict.values().len(), 0);
1000
1001        let values = dict
1002            .downcast_dict::<GenericByteArray<Utf8Type>>()
1003            .unwrap()
1004            .into_iter()
1005            .collect::<Vec<_>>();
1006
1007        assert_eq!(values, [None, None]);
1008    }
1009}