arrow_array/builder/
fixed_size_binary_dictionary_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder, PrimitiveBuilder};
19use crate::types::ArrowDictionaryKeyType;
20use crate::{Array, ArrayRef, DictionaryArray, PrimitiveArray};
21use arrow_buffer::ArrowNativeType;
22use arrow_schema::DataType::FixedSizeBinary;
23use arrow_schema::{ArrowError, DataType};
24use hashbrown::HashTable;
25use num::NumCast;
26use std::any::Any;
27use std::sync::Arc;
28
/// Builder for [`DictionaryArray`] of [`FixedSizeBinaryArray`]
///
/// The output array has a dictionary of unique, fixed-size binary values. The
/// builder handles deduplication: appending a value that is already in the
/// dictionary reuses the existing key instead of storing the bytes again.
///
/// # Example
/// ```
/// # use arrow_array::builder::{FixedSizeBinaryDictionaryBuilder};
/// # use arrow_array::array::{Array, FixedSizeBinaryArray};
/// # use arrow_array::DictionaryArray;
/// # use arrow_array::types::Int8Type;
/// // Build a dictionary whose values are all 3 bytes wide
/// let byte_width = 3;
/// let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(byte_width);
/// builder.append("abc").unwrap();
/// builder.append_null();
/// builder.append(b"def").unwrap();
/// builder.append(b"def").unwrap(); // duplicate value
/// // Result is a Dictionary Array
/// let array = builder.finish();
/// let dict_array = array.as_any().downcast_ref::<DictionaryArray<Int8Type>>().unwrap();
/// // The array represents "abc", null, "def", "def"
/// assert_eq!(array.keys().len(), 4);
/// // but there are only 2 unique values
/// assert_eq!(array.values().len(), 2);
/// let values = dict_array.values().as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap();
/// assert_eq!(values.value(0), "abc".as_bytes());
/// assert_eq!(values.value(1), "def".as_bytes());
/// ```
///
/// [`FixedSizeBinaryArray`]: crate::FixedSizeBinaryArray
#[derive(Debug)]
pub struct FixedSizeBinaryDictionaryBuilder<K>
where
    K: ArrowDictionaryKeyType,
{
    // Hasher state used to hash value bytes for lookups in `dedup`.
    state: ahash::RandomState,
    // De-duplication table. Each entry is the index of a value stored in
    // `values_builder`; the bytes themselves are not duplicated here.
    dedup: HashTable<usize>,

    // One key (or null) per appended slot.
    keys_builder: PrimitiveBuilder<K>,
    // The distinct dictionary values, in first-seen order.
    values_builder: FixedSizeBinaryBuilder,
    // Width in bytes that every dictionary value must have.
    byte_width: i32,
}
72
73impl<K> FixedSizeBinaryDictionaryBuilder<K>
74where
75    K: ArrowDictionaryKeyType,
76{
77    /// Creates a new `FixedSizeBinaryDictionaryBuilder`
78    pub fn new(byte_width: i32) -> Self {
79        let keys_builder = PrimitiveBuilder::new();
80        let values_builder = FixedSizeBinaryBuilder::new(byte_width);
81        Self {
82            state: Default::default(),
83            dedup: HashTable::with_capacity(keys_builder.capacity()),
84            keys_builder,
85            values_builder,
86            byte_width,
87        }
88    }
89
90    /// Creates a new `FixedSizeBinaryDictionaryBuilder` with the provided capacities
91    ///
92    /// `keys_capacity`: the number of keys, i.e. length of array to build
93    /// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
94    /// `byte_width`: the byte width for individual values in the values array
95    pub fn with_capacity(keys_capacity: usize, value_capacity: usize, byte_width: i32) -> Self {
96        Self {
97            state: Default::default(),
98            dedup: Default::default(),
99            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
100            values_builder: FixedSizeBinaryBuilder::with_capacity(value_capacity, byte_width),
101            byte_width,
102        }
103    }
104
105    /// Creates a new `FixedSizeBinaryDictionaryBuilder` from the existing builder with the same
106    /// keys and values, but with a new data type for the keys.
107    ///
108    /// # Example
109    /// ```
110    /// # use arrow_array::builder::FixedSizeBinaryDictionaryBuilder;
111    /// # use arrow_array::types::{UInt8Type, UInt16Type, UInt64Type};
112    /// # use arrow_array::UInt16Array;
113    /// # use arrow_schema::ArrowError;
114    ///
115    /// let mut u8_keyed_builder = FixedSizeBinaryDictionaryBuilder::<UInt8Type>::new(2);
116    /// // appending too many values causes the dictionary to overflow
117    /// for i in 0..=255 {
118    ///     u8_keyed_builder.append_value(vec![0, i]);
119    /// }
120    /// let result = u8_keyed_builder.append(vec![1, 0]);
121    /// assert!(matches!(result, Err(ArrowError::DictionaryKeyOverflowError{})));
122    ///
123    /// // we need to upgrade to a larger key type
124    /// let mut u16_keyed_builder = FixedSizeBinaryDictionaryBuilder::<UInt16Type>::try_new_from_builder(u8_keyed_builder).unwrap();
125    /// let dictionary_array = u16_keyed_builder.finish();
126    /// let keys = dictionary_array.keys();
127    ///
128    /// assert_eq!(keys, &UInt16Array::from_iter(0..256));
129    /// ```
130    pub fn try_new_from_builder<K2>(
131        mut source: FixedSizeBinaryDictionaryBuilder<K2>,
132    ) -> Result<Self, ArrowError>
133    where
134        K::Native: NumCast,
135        K2: ArrowDictionaryKeyType,
136        K2::Native: NumCast,
137    {
138        let state = source.state;
139        let dedup = source.dedup;
140        let values_builder = source.values_builder;
141        let byte_width = source.byte_width;
142
143        let source_keys = source.keys_builder.finish();
144        let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
145            num::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
146                ArrowError::CastError(format!(
147                    "Can't cast dictionary keys from source type {:?} to type {:?}",
148                    K2::DATA_TYPE,
149                    K::DATA_TYPE
150                ))
151            })
152        })?;
153
154        // drop source key here because currently source_keys and new_keys are holding reference to
155        // the same underlying null_buffer. Below we want to call new_keys.into_builder() it must
156        // be the only reference holder.
157        drop(source_keys);
158
159        Ok(Self {
160            state,
161            dedup,
162            keys_builder: new_keys
163                .into_builder()
164                .expect("underlying buffer has no references"),
165            values_builder,
166            byte_width,
167        })
168    }
169}
170
// Type-erased `ArrayBuilder` interface; every method forwards to the builder's
// own state or to its inherent methods.
impl<K> ArrayBuilder for FixedSizeBinaryDictionaryBuilder<K>
where
    K: ArrowDictionaryKeyType,
{
    /// Returns the builder as a non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Returns the builder as a mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }

    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }

    /// Returns the number of array slots in the builder, i.e. the number of
    /// keys (including nulls) appended so far.
    fn len(&self) -> usize {
        self.keys_builder.len()
    }

    /// Builds the array and reset this builder.
    fn finish(&mut self) -> ArrayRef {
        // Inherent methods take priority over trait methods, so this calls the
        // inherent `finish` returning `DictionaryArray<K>` rather than recursing.
        Arc::new(self.finish())
    }

    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        // Calls the inherent `finish_cloned`, for the same reason as `finish`.
        Arc::new(self.finish_cloned())
    }
}
205
206impl<K> FixedSizeBinaryDictionaryBuilder<K>
207where
208    K: ArrowDictionaryKeyType,
209{
210    fn get_or_insert_key(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
211        let value_bytes: &[u8] = value.as_ref();
212
213        let state = &self.state;
214        let storage = &mut self.values_builder;
215        let hash = state.hash_one(value_bytes);
216
217        let idx = *self
218            .dedup
219            .entry(
220                hash,
221                |idx| value_bytes == get_bytes(storage, self.byte_width, *idx),
222                |idx| state.hash_one(get_bytes(storage, self.byte_width, *idx)),
223            )
224            .or_insert_with(|| {
225                let idx = storage.len();
226                let _ = storage.append_value(value);
227                idx
228            })
229            .get();
230
231        let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
232
233        Ok(key)
234    }
235
236    /// Append a value to the array. Return an existing index
237    /// if already present in the values array or a new index if the
238    /// value is appended to the values array.
239    ///
240    /// Returns an error if the new index would overflow the key type.
241    pub fn append(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
242        if self.byte_width != value.as_ref().len() as i32 {
243            Err(ArrowError::InvalidArgumentError(format!(
244                "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
245                self.byte_width,
246                value.as_ref().len()
247            )))
248        } else {
249            let key = self.get_or_insert_key(value)?;
250            self.keys_builder.append_value(key);
251            Ok(key)
252        }
253    }
254
255    /// Appends a null slot into the builder
256    #[inline]
257    pub fn append_null(&mut self) {
258        self.keys_builder.append_null()
259    }
260
261    /// Appends `n` `null`s into the builder.
262    #[inline]
263    pub fn append_nulls(&mut self, n: usize) {
264        self.keys_builder.append_nulls(n);
265    }
266
267    /// Infallibly append a value to this builder
268    ///
269    /// # Panics
270    ///
271    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
272    pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
273        self.append(value).expect("dictionary key overflow");
274    }
275
276    /// Builds the `DictionaryArray` and reset this builder.
277    pub fn finish(&mut self) -> DictionaryArray<K> {
278        self.dedup.clear();
279        let values = self.values_builder.finish();
280        let keys = self.keys_builder.finish();
281
282        let data_type = DataType::Dictionary(
283            Box::new(K::DATA_TYPE),
284            Box::new(FixedSizeBinary(self.byte_width)),
285        );
286
287        let builder = keys
288            .into_data()
289            .into_builder()
290            .data_type(data_type)
291            .child_data(vec![values.into_data()]);
292
293        DictionaryArray::from(unsafe { builder.build_unchecked() })
294    }
295
296    /// Builds the `DictionaryArray` without resetting the builder.
297    pub fn finish_cloned(&self) -> DictionaryArray<K> {
298        let values = self.values_builder.finish_cloned();
299        let keys = self.keys_builder.finish_cloned();
300
301        let data_type = DataType::Dictionary(
302            Box::new(K::DATA_TYPE),
303            Box::new(FixedSizeBinary(self.byte_width)),
304        );
305
306        let builder = keys
307            .into_data()
308            .into_builder()
309            .data_type(data_type)
310            .child_data(vec![values.into_data()]);
311
312        DictionaryArray::from(unsafe { builder.build_unchecked() })
313    }
314
315    /// Builds the `DictionaryArray` without resetting the values builder or
316    /// the internal de-duplication map.
317    ///
318    /// The advantage of doing this is that the values will represent the entire
319    /// set of what has been built so-far by this builder and ensures
320    /// consistency in the assignment of keys to values across multiple calls
321    /// to `finish_preserve_values`. This enables ipc writers to efficiently
322    /// emit delta dictionaries.
323    ///
324    /// The downside to this is that building the record requires creating a
325    /// copy of the values, which can become slowly more expensive if the
326    /// dictionary grows.
327    ///
328    /// Additionally, if record batches from multiple different dictionary
329    /// builders for the same column are fed into a single ipc writer, beware
330    /// that entire dictionaries are likely to be re-sent frequently even when
331    /// the majority of the values are not used by the current record batch.
332    pub fn finish_preserve_values(&mut self) -> DictionaryArray<K> {
333        let values = self.values_builder.finish_cloned();
334        let keys = self.keys_builder.finish();
335
336        let data_type = DataType::Dictionary(
337            Box::new(K::DATA_TYPE),
338            Box::new(FixedSizeBinary(self.byte_width)),
339        );
340
341        let builder = keys
342            .into_data()
343            .into_builder()
344            .data_type(data_type)
345            .child_data(vec![values.into_data()]);
346
347        DictionaryArray::from(unsafe { builder.build_unchecked() })
348    }
349}
350
351fn get_bytes(values: &FixedSizeBinaryBuilder, byte_width: i32, idx: usize) -> &[u8] {
352    let values = values.values_slice();
353    let start = idx * byte_width.as_usize();
354    let end = idx * byte_width.as_usize() + byte_width.as_usize();
355    &values[start..end]
356}
357
#[cfg(test)]
mod tests {
    use super::*;

    use crate::types::{Int16Type, Int32Type, Int8Type, UInt16Type, UInt8Type};
    use crate::{ArrowPrimitiveType, FixedSizeBinaryArray, Int8Array};

    /// Appending values and nulls yields de-duplicated keys in first-seen order.
    #[test]
    fn test_fixed_size_dictionary_builder() {
        let values = ["abc", "def"];

        let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        assert_eq!(b.append(values[0]).unwrap(), 0);
        b.append_null();
        assert_eq!(b.append(values[1]).unwrap(), 1);
        assert_eq!(b.append(values[1]).unwrap(), 1);
        assert_eq!(b.append(values[0]).unwrap(), 0);
        b.append_nulls(2);
        assert_eq!(b.append(values[0]).unwrap(), 0);
        let array = b.finish();

        assert_eq!(
            array.keys(),
            &Int8Array::from(vec![
                Some(0),
                None,
                Some(1),
                Some(1),
                Some(0),
                None,
                None,
                Some(0)
            ]),
        );

        // Values are polymorphic and so require a downcast.
        let ava = array
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        assert_eq!(ava.value(0), values[0].as_bytes());
        assert_eq!(ava.value(1), values[1].as_bytes());
    }

    /// Values whose length differs from the byte width are rejected.
    #[test]
    fn test_fixed_size_dictionary_builder_wrong_size() {
        let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        let err = b.append(b"too long").unwrap_err().to_string();
        assert_eq!(err, "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 8");
        let err = b.append("").unwrap_err().to_string();
        assert_eq!(err, "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 0");
    }

    /// `finish_cloned` snapshots the builder without resetting it.
    #[test]
    fn test_fixed_size_dictionary_builder_finish_cloned() {
        let values = ["abc", "def", "ghi"];

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        builder.append(values[0]).unwrap();
        builder.append_null();
        builder.append(values[1]).unwrap();
        builder.append(values[1]).unwrap();
        builder.append(values[0]).unwrap();
        let mut array = builder.finish_cloned();

        assert_eq!(
            array.keys(),
            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
        );

        // Values are polymorphic and so require a downcast.
        let ava = array
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        assert_eq!(ava.value(0), values[0].as_bytes());
        assert_eq!(ava.value(1), values[1].as_bytes());

        // The builder still holds everything appended before `finish_cloned`.
        builder.append(values[0]).unwrap();
        builder.append(values[2]).unwrap();
        builder.append(values[1]).unwrap();

        array = builder.finish();

        assert_eq!(
            array.keys(),
            &Int8Array::from(vec![
                Some(0),
                None,
                Some(1),
                Some(1),
                Some(0),
                Some(0),
                Some(2),
                Some(1)
            ])
        );

        // Values are polymorphic and so require a downcast.
        let ava2 = array
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        assert_eq!(ava2.value(0), values[0].as_bytes());
        assert_eq!(ava2.value(1), values[1].as_bytes());
        assert_eq!(ava2.value(2), values[2].as_bytes());
    }

    /// Builds a `K1`-keyed builder from the first three `values` (plus a null),
    /// converts it to `K2` keys and checks that keys and values survive.
    fn _test_try_new_from_builder_generic_for_key_types<K1, K2>(values: Vec<[u8; 3]>)
    where
        K1: ArrowDictionaryKeyType,
        K1::Native: NumCast,
        K2: ArrowDictionaryKeyType,
        K2::Native: NumCast + From<u8>,
    {
        let mut source = FixedSizeBinaryDictionaryBuilder::<K1>::new(3);
        source.append_value(values[0]);
        source.append_null();
        source.append_value(values[1]);
        source.append_value(values[2]);

        let mut result =
            FixedSizeBinaryDictionaryBuilder::<K2>::try_new_from_builder(source).unwrap();
        let array = result.finish();

        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
        expected_keys_builder
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(0u8));
        expected_keys_builder.append_null();
        expected_keys_builder
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(1u8));
        expected_keys_builder
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(2u8));
        let expected_keys = expected_keys_builder.finish();
        assert_eq!(array.keys(), &expected_keys);

        let av = array.values();
        let ava = av.as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap();
        assert_eq!(ava.value(0), values[0]);
        assert_eq!(ava.value(1), values[1]);
        assert_eq!(ava.value(2), values[2]);
    }

    /// Key conversion works across widening, narrowing and signedness changes.
    #[test]
    fn test_try_new_from_builder() {
        let values = vec![[1, 2, 3], [5, 6, 7], [6, 7, 8]];
        // test cast to bigger size unsigned
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type>(values.clone());
        // test cast going to smaller size unsigned
        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type>(values.clone());
        // test cast going to bigger size signed
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type>(values.clone());
        // test cast going to smaller size signed
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type>(values.clone());
        // test casts between signed and unsigned key types for different size changes
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, UInt16Type>(values.clone());
    }

    /// Converting to a key type that cannot hold an existing key is an error.
    #[test]
    fn test_try_new_from_builder_cast_fails() {
        let mut source_builder = FixedSizeBinaryDictionaryBuilder::<UInt16Type>::new(2);
        for i in 0u16..257u16 {
            source_builder.append_value(vec![(i >> 8) as u8, i as u8]);
        }

        // there should be too many values that we can't downcast to the underlying type
        // we have keys that wouldn't fit into UInt8Type
        let result =
            FixedSizeBinaryDictionaryBuilder::<UInt8Type>::try_new_from_builder(source_builder);
        let err = match result {
            Ok(_) => panic!("expected the narrowing key cast to fail"),
            Err(e) => e,
        };
        assert!(matches!(err, ArrowError::CastError(_)));
        assert_eq!(
            err.to_string(),
            "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
        );
    }

    /// `finish_preserve_values` resets the keys but keeps the dictionary, so
    /// later batches continue the key numbering.
    #[test]
    fn test_finish_preserve_values() {
        // Create the first dictionary
        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int32Type>::new(3);
        builder.append_value("aaa");
        builder.append_value("bbb");
        builder.append_value("ccc");
        let dict = builder.finish_preserve_values();
        assert_eq!(dict.keys().values(), &[0, 1, 2]);
        let values = dict
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            values,
            vec![
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes())
            ]
        );

        // Create a new dictionary
        builder.append_value("ddd");
        builder.append_value("eee");
        let dict2 = builder.finish_preserve_values();

        // Make sure the keys are assigned after the old ones and we have the
        // right values
        assert_eq!(dict2.keys().values(), &[3, 4]);
        let values = dict2
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(values, [Some("ddd".as_bytes()), Some("eee".as_bytes())]);

        // Check that we have all of the expected values
        let all_values = dict2
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            all_values,
            [
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes()),
                Some("ddd".as_bytes()),
                Some("eee".as_bytes())
            ]
        );
    }
}