1use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder, PrimitiveBuilder};
19use crate::types::ArrowDictionaryKeyType;
20use crate::{Array, ArrayRef, DictionaryArray, PrimitiveArray};
21use arrow_buffer::ArrowNativeType;
22use arrow_schema::DataType::FixedSizeBinary;
23use arrow_schema::{ArrowError, DataType};
24use hashbrown::HashTable;
25use num_traits::NumCast;
26use std::any::Any;
27use std::sync::Arc;
28
/// Builder for a [`DictionaryArray`] whose dictionary values are fixed-width
/// byte strings.
///
/// Every distinct value is stored once in `values_builder`; `keys_builder`
/// holds, for each appended slot, either the index of its value or a null.
#[derive(Debug)]
pub struct FixedSizeBinaryDictionaryBuilder<K>
where
    K: ArrowDictionaryKeyType,
{
    /// Hasher state used to hash value bytes for lookups in `dedup`.
    state: ahash::RandomState,
    /// Deduplication table; each entry is an index into `values_builder`.
    dedup: HashTable<usize>,

    /// Keys of the dictionary array under construction.
    keys_builder: PrimitiveBuilder<K>,
    /// Distinct dictionary values, in insertion order.
    values_builder: FixedSizeBinaryBuilder,
    /// Length in bytes that every appended value must have.
    byte_width: i32,
}
72
73impl<K> FixedSizeBinaryDictionaryBuilder<K>
74where
75 K: ArrowDictionaryKeyType,
76{
77 pub fn new(byte_width: i32) -> Self {
79 let keys_builder = PrimitiveBuilder::new();
80 let values_builder = FixedSizeBinaryBuilder::new(byte_width);
81 Self {
82 state: Default::default(),
83 dedup: HashTable::with_capacity(keys_builder.capacity()),
84 keys_builder,
85 values_builder,
86 byte_width,
87 }
88 }
89
90 pub fn with_capacity(keys_capacity: usize, value_capacity: usize, byte_width: i32) -> Self {
96 Self {
97 state: Default::default(),
98 dedup: Default::default(),
99 keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
100 values_builder: FixedSizeBinaryBuilder::with_capacity(value_capacity, byte_width),
101 byte_width,
102 }
103 }
104
105 pub fn try_new_from_builder<K2>(
131 mut source: FixedSizeBinaryDictionaryBuilder<K2>,
132 ) -> Result<Self, ArrowError>
133 where
134 K::Native: NumCast,
135 K2: ArrowDictionaryKeyType,
136 K2::Native: NumCast,
137 {
138 let state = source.state;
139 let dedup = source.dedup;
140 let values_builder = source.values_builder;
141 let byte_width = source.byte_width;
142
143 let source_keys = source.keys_builder.finish();
144 let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
145 num_traits::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
146 ArrowError::CastError(format!(
147 "Can't cast dictionary keys from source type {:?} to type {:?}",
148 K2::DATA_TYPE,
149 K::DATA_TYPE
150 ))
151 })
152 })?;
153
154 drop(source_keys);
158
159 Ok(Self {
160 state,
161 dedup,
162 keys_builder: new_keys
163 .into_builder()
164 .expect("underlying buffer has no references"),
165 values_builder,
166 byte_width,
167 })
168 }
169}
170
171impl<K> ArrayBuilder for FixedSizeBinaryDictionaryBuilder<K>
172where
173 K: ArrowDictionaryKeyType,
174{
175 fn as_any(&self) -> &dyn Any {
177 self
178 }
179
180 fn as_any_mut(&mut self) -> &mut dyn Any {
182 self
183 }
184
185 fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
187 self
188 }
189
190 fn len(&self) -> usize {
192 self.keys_builder.len()
193 }
194
195 fn finish(&mut self) -> ArrayRef {
197 Arc::new(self.finish())
198 }
199
200 fn finish_cloned(&self) -> ArrayRef {
202 Arc::new(self.finish_cloned())
203 }
204}
205
206impl<K> FixedSizeBinaryDictionaryBuilder<K>
207where
208 K: ArrowDictionaryKeyType,
209{
210 fn get_or_insert_key(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
211 let value_bytes: &[u8] = value.as_ref();
212
213 let state = &self.state;
214 let storage = &mut self.values_builder;
215 let hash = state.hash_one(value_bytes);
216
217 let idx = *self
218 .dedup
219 .entry(
220 hash,
221 |idx| value_bytes == get_bytes(storage, self.byte_width, *idx),
222 |idx| state.hash_one(get_bytes(storage, self.byte_width, *idx)),
223 )
224 .or_insert_with(|| {
225 let idx = storage.len();
226 let _ = storage.append_value(value);
227 idx
228 })
229 .get();
230
231 let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
232
233 Ok(key)
234 }
235
236 pub fn append(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
242 if self.byte_width != value.as_ref().len() as i32 {
243 Err(ArrowError::InvalidArgumentError(format!(
244 "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
245 self.byte_width,
246 value.as_ref().len()
247 )))
248 } else {
249 let key = self.get_or_insert_key(value)?;
250 self.keys_builder.append_value(key);
251 Ok(key)
252 }
253 }
254
255 pub fn append_n(
260 &mut self,
261 value: impl AsRef<[u8]>,
262 count: usize,
263 ) -> Result<K::Native, ArrowError> {
264 if self.byte_width != value.as_ref().len() as i32 {
265 Err(ArrowError::InvalidArgumentError(format!(
266 "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
267 self.byte_width,
268 value.as_ref().len()
269 )))
270 } else {
271 let key = self.get_or_insert_key(value)?;
272 self.keys_builder.append_value_n(key, count);
273 Ok(key)
274 }
275 }
276
277 #[inline]
279 pub fn append_null(&mut self) {
280 self.keys_builder.append_null()
281 }
282
283 #[inline]
285 pub fn append_nulls(&mut self, n: usize) {
286 self.keys_builder.append_nulls(n);
287 }
288
289 pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
295 self.append(value).expect("dictionary key overflow");
296 }
297
298 pub fn finish(&mut self) -> DictionaryArray<K> {
300 self.dedup.clear();
301 let values = self.values_builder.finish();
302 let keys = self.keys_builder.finish();
303
304 let data_type = DataType::Dictionary(
305 Box::new(K::DATA_TYPE),
306 Box::new(FixedSizeBinary(self.byte_width)),
307 );
308
309 let builder = keys
310 .into_data()
311 .into_builder()
312 .data_type(data_type)
313 .child_data(vec![values.into_data()]);
314
315 DictionaryArray::from(unsafe { builder.build_unchecked() })
316 }
317
318 pub fn finish_cloned(&self) -> DictionaryArray<K> {
320 let values = self.values_builder.finish_cloned();
321 let keys = self.keys_builder.finish_cloned();
322
323 let data_type = DataType::Dictionary(
324 Box::new(K::DATA_TYPE),
325 Box::new(FixedSizeBinary(self.byte_width)),
326 );
327
328 let builder = keys
329 .into_data()
330 .into_builder()
331 .data_type(data_type)
332 .child_data(vec![values.into_data()]);
333
334 DictionaryArray::from(unsafe { builder.build_unchecked() })
335 }
336
337 pub fn finish_preserve_values(&mut self) -> DictionaryArray<K> {
355 let values = self.values_builder.finish_cloned();
356 let keys = self.keys_builder.finish();
357
358 let data_type = DataType::Dictionary(
359 Box::new(K::DATA_TYPE),
360 Box::new(FixedSizeBinary(self.byte_width)),
361 );
362
363 let builder = keys
364 .into_data()
365 .into_builder()
366 .data_type(data_type)
367 .child_data(vec![values.into_data()]);
368
369 DictionaryArray::from(unsafe { builder.build_unchecked() })
370 }
371}
372
373fn get_bytes(values: &FixedSizeBinaryBuilder, byte_width: i32, idx: usize) -> &[u8] {
374 let values = values.values_slice();
375 let start = idx * byte_width.as_usize();
376 let end = idx * byte_width.as_usize() + byte_width.as_usize();
377 &values[start..end]
378}
379
#[cfg(test)]
mod tests {
    use super::*;

    use crate::types::{Int8Type, Int16Type, Int32Type, UInt8Type, UInt16Type};
    use crate::{ArrowPrimitiveType, FixedSizeBinaryArray, Int8Array};

    /// Downcasts the dictionary values of `array` to a [`FixedSizeBinaryArray`].
    fn dictionary_values<K: ArrowDictionaryKeyType>(
        array: &DictionaryArray<K>,
    ) -> &FixedSizeBinaryArray {
        array
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap()
    }

    #[test]
    fn test_fixed_size_dictionary_builder() {
        let values = ["abc", "def"];

        let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        assert_eq!(b.append(values[0]).unwrap(), 0);
        b.append_null();
        assert_eq!(b.append(values[1]).unwrap(), 1);
        assert_eq!(b.append(values[1]).unwrap(), 1);
        assert_eq!(b.append(values[0]).unwrap(), 0);
        b.append_nulls(2);
        assert_eq!(b.append(values[0]).unwrap(), 0);
        let array = b.finish();

        assert_eq!(
            array.keys(),
            &Int8Array::from(vec![
                Some(0),
                None,
                Some(1),
                Some(1),
                Some(0),
                None,
                None,
                Some(0)
            ]),
        );

        // Values are logically equal
        let ava = dictionary_values(&array);
        assert_eq!(ava.value(0), values[0].as_bytes());
        assert_eq!(ava.value(1), values[1].as_bytes());
    }

    #[test]
    fn test_fixed_size_dictionary_builder_with_capacity() {
        // The value capacity is only a hint: a third distinct value must still work.
        let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::with_capacity(4, 2, 3);
        assert_eq!(b.append("abc").unwrap(), 0);
        assert_eq!(b.append("def").unwrap(), 1);
        assert_eq!(b.append("abc").unwrap(), 0);
        assert_eq!(b.append("ghi").unwrap(), 2);
        b.append_null();
        let array = b.finish();

        assert_eq!(
            array.keys(),
            &Int8Array::from(vec![Some(0), Some(1), Some(0), Some(2), None]),
        );

        let ava = dictionary_values(&array);
        assert_eq!(ava.value(0), "abc".as_bytes());
        assert_eq!(ava.value(1), "def".as_bytes());
        assert_eq!(ava.value(2), "ghi".as_bytes());
    }

    #[test]
    fn test_fixed_size_dictionary_builder_append_n() {
        let values = ["abc", "def"];
        let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        assert_eq!(b.append_n(values[0], 2).unwrap(), 0);
        assert_eq!(b.append_n(values[1], 3).unwrap(), 1);
        assert_eq!(b.append_n(values[0], 2).unwrap(), 0);
        let array = b.finish();

        assert_eq!(
            array.keys(),
            &Int8Array::from(vec![
                Some(0),
                Some(0),
                Some(1),
                Some(1),
                Some(1),
                Some(0),
                Some(0),
            ]),
        );

        // Values are logically equal
        let ava = dictionary_values(&array);
        assert_eq!(ava.value(0), values[0].as_bytes());
        assert_eq!(ava.value(1), values[1].as_bytes());
    }

    #[test]
    fn test_fixed_size_dictionary_builder_wrong_size() {
        let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        let err = b.append(b"too long").unwrap_err().to_string();
        assert_eq!(
            err,
            "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 8"
        );
        let err = b.append("").unwrap_err().to_string();
        assert_eq!(
            err,
            "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 0"
        );
        let err = b.append_n("a", 3).unwrap_err().to_string();
        assert_eq!(
            err,
            "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 1"
        );
    }

    #[test]
    fn test_fixed_size_dictionary_builder_key_overflow() {
        // Int8 keys can address indices 0..=127; the 129th distinct value must fail.
        let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(1);
        for byte in 0..=127u8 {
            assert_eq!(b.append([byte]).unwrap(), byte as i8);
        }
        let err = b.append([128u8]).unwrap_err();
        assert!(matches!(err, ArrowError::DictionaryKeyOverflowError));
    }

    #[test]
    fn test_fixed_size_dictionary_builder_finish_cloned() {
        let values = ["abc", "def", "ghi"];

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        builder.append(values[0]).unwrap();
        builder.append_null();
        builder.append(values[1]).unwrap();
        builder.append(values[1]).unwrap();
        builder.append(values[0]).unwrap();
        let mut array = builder.finish_cloned();

        assert_eq!(
            array.keys(),
            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
        );

        // Values are logically equal
        let ava = dictionary_values(&array);
        assert_eq!(ava.value(0), values[0].as_bytes());
        assert_eq!(ava.value(1), values[1].as_bytes());

        // The builder keeps its state after `finish_cloned`, so appends continue
        // from the existing keys and values.
        builder.append(values[0]).unwrap();
        builder.append(values[2]).unwrap();
        builder.append(values[1]).unwrap();

        array = builder.finish();

        assert_eq!(
            array.keys(),
            &Int8Array::from(vec![
                Some(0),
                None,
                Some(1),
                Some(1),
                Some(0),
                Some(0),
                Some(2),
                Some(1)
            ])
        );

        // Values are logically equal
        let ava2 = dictionary_values(&array);
        assert_eq!(ava2.value(0), values[0].as_bytes());
        assert_eq!(ava2.value(1), values[1].as_bytes());
        assert_eq!(ava2.value(2), values[2].as_bytes());
    }

    /// Builds a `K1`-keyed builder, converts it to a `K2`-keyed one and checks
    /// that keys and values survive the conversion.
    fn _test_try_new_from_builder_generic_for_key_types<K1, K2>(values: Vec<[u8; 3]>)
    where
        K1: ArrowDictionaryKeyType,
        K1::Native: NumCast,
        K2: ArrowDictionaryKeyType,
        K2::Native: NumCast + From<u8>,
    {
        let mut source = FixedSizeBinaryDictionaryBuilder::<K1>::new(3);
        source.append_value(values[0]);
        source.append_null();
        source.append_value(values[1]);
        source.append_value(values[2]);

        let mut result =
            FixedSizeBinaryDictionaryBuilder::<K2>::try_new_from_builder(source).unwrap();
        let array = result.finish();

        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
        expected_keys_builder
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(0u8));
        expected_keys_builder.append_null();
        expected_keys_builder
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(1u8));
        expected_keys_builder
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(2u8));
        let expected_keys = expected_keys_builder.finish();
        assert_eq!(array.keys(), &expected_keys);

        let ava = dictionary_values(&array);
        assert_eq!(ava.value(0), values[0]);
        assert_eq!(ava.value(1), values[1]);
        assert_eq!(ava.value(2), values[2]);
    }

    #[test]
    fn test_try_new_from_builder() {
        let values = vec![[1, 2, 3], [5, 6, 7], [6, 7, 8]];
        // unsigned, widening
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type>(values.clone());
        // unsigned, narrowing
        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type>(values.clone());
        // signed, widening
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type>(values.clone());
        // signed, narrowing
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type>(values.clone());
        // conversions between signed and unsigned key types of various widths
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, Int16Type>(values.clone());
    }

    #[test]
    fn test_try_new_from_builder_cast_fails() {
        // 257 distinct values produce keys 0..=256; the last one does not fit in a u8.
        let mut source_builder = FixedSizeBinaryDictionaryBuilder::<UInt16Type>::new(2);
        for i in 0u16..257u16 {
            source_builder.append_value(vec![(i >> 8) as u8, i as u8]);
        }

        let result =
            FixedSizeBinaryDictionaryBuilder::<UInt8Type>::try_new_from_builder(source_builder);
        let Err(e) = result else {
            panic!("expected the key cast to fail");
        };
        assert!(matches!(e, ArrowError::CastError(_)));
        assert_eq!(
            e.to_string(),
            "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
        );
    }

    #[test]
    fn test_finish_preserve_values() {
        // Create the first dictionary
        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int32Type>::new(3);
        builder.append_value("aaa");
        builder.append_value("bbb");
        builder.append_value("ccc");
        let dict = builder.finish_preserve_values();
        assert_eq!(dict.keys().values(), &[0, 1, 2]);
        let values = dict
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            values,
            vec![
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes())
            ]
        );

        // Create a second dictionary with the same builder
        builder.append_value("ddd");
        builder.append_value("eee");
        let dict2 = builder.finish_preserve_values();

        // Keys continue after the ones handed out for the first dictionary
        assert_eq!(dict2.keys().values(), &[3, 4]);
        let values = dict2
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(values, [Some("ddd".as_bytes()), Some("eee".as_bytes())]);

        // The second dictionary still carries every value seen so far
        let all_values = dictionary_values(&dict2).into_iter().collect::<Vec<_>>();
        assert_eq!(
            all_values,
            [
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes()),
                Some("ddd".as_bytes()),
                Some("eee".as_bytes())
            ]
        );
    }
}