1use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder, PrimitiveBuilder};
19use crate::types::ArrowDictionaryKeyType;
20use crate::{Array, ArrayRef, DictionaryArray, PrimitiveArray};
21use arrow_buffer::ArrowNativeType;
22use arrow_schema::DataType::FixedSizeBinary;
23use arrow_schema::{ArrowError, DataType};
24use hashbrown::HashTable;
25use num::NumCast;
26use std::any::Any;
27use std::sync::Arc;
28
29#[derive(Debug)]
61pub struct FixedSizeBinaryDictionaryBuilder<K>
62where
63 K: ArrowDictionaryKeyType,
64{
65 state: ahash::RandomState,
66 dedup: HashTable<usize>,
67
68 keys_builder: PrimitiveBuilder<K>,
69 values_builder: FixedSizeBinaryBuilder,
70 byte_width: i32,
71}
72
73impl<K> FixedSizeBinaryDictionaryBuilder<K>
74where
75 K: ArrowDictionaryKeyType,
76{
77 pub fn new(byte_width: i32) -> Self {
79 let keys_builder = PrimitiveBuilder::new();
80 let values_builder = FixedSizeBinaryBuilder::new(byte_width);
81 Self {
82 state: Default::default(),
83 dedup: HashTable::with_capacity(keys_builder.capacity()),
84 keys_builder,
85 values_builder,
86 byte_width,
87 }
88 }
89
90 pub fn with_capacity(keys_capacity: usize, value_capacity: usize, byte_width: i32) -> Self {
96 Self {
97 state: Default::default(),
98 dedup: Default::default(),
99 keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
100 values_builder: FixedSizeBinaryBuilder::with_capacity(value_capacity, byte_width),
101 byte_width,
102 }
103 }
104
105 pub fn try_new_from_builder<K2>(
131 mut source: FixedSizeBinaryDictionaryBuilder<K2>,
132 ) -> Result<Self, ArrowError>
133 where
134 K::Native: NumCast,
135 K2: ArrowDictionaryKeyType,
136 K2::Native: NumCast,
137 {
138 let state = source.state;
139 let dedup = source.dedup;
140 let values_builder = source.values_builder;
141 let byte_width = source.byte_width;
142
143 let source_keys = source.keys_builder.finish();
144 let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
145 num::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
146 ArrowError::CastError(format!(
147 "Can't cast dictionary keys from source type {:?} to type {:?}",
148 K2::DATA_TYPE,
149 K::DATA_TYPE
150 ))
151 })
152 })?;
153
154 drop(source_keys);
158
159 Ok(Self {
160 state,
161 dedup,
162 keys_builder: new_keys
163 .into_builder()
164 .expect("underlying buffer has no references"),
165 values_builder,
166 byte_width,
167 })
168 }
169}
170
171impl<K> ArrayBuilder for FixedSizeBinaryDictionaryBuilder<K>
172where
173 K: ArrowDictionaryKeyType,
174{
175 fn as_any(&self) -> &dyn Any {
177 self
178 }
179
180 fn as_any_mut(&mut self) -> &mut dyn Any {
182 self
183 }
184
185 fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
187 self
188 }
189
190 fn len(&self) -> usize {
192 self.keys_builder.len()
193 }
194
195 fn finish(&mut self) -> ArrayRef {
197 Arc::new(self.finish())
198 }
199
200 fn finish_cloned(&self) -> ArrayRef {
202 Arc::new(self.finish_cloned())
203 }
204}
205
206impl<K> FixedSizeBinaryDictionaryBuilder<K>
207where
208 K: ArrowDictionaryKeyType,
209{
210 fn get_or_insert_key(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
211 let value_bytes: &[u8] = value.as_ref();
212
213 let state = &self.state;
214 let storage = &mut self.values_builder;
215 let hash = state.hash_one(value_bytes);
216
217 let idx = *self
218 .dedup
219 .entry(
220 hash,
221 |idx| value_bytes == get_bytes(storage, self.byte_width, *idx),
222 |idx| state.hash_one(get_bytes(storage, self.byte_width, *idx)),
223 )
224 .or_insert_with(|| {
225 let idx = storage.len();
226 let _ = storage.append_value(value);
227 idx
228 })
229 .get();
230
231 let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
232
233 Ok(key)
234 }
235
236 pub fn append(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
242 if self.byte_width != value.as_ref().len() as i32 {
243 Err(ArrowError::InvalidArgumentError(format!(
244 "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
245 self.byte_width,
246 value.as_ref().len()
247 )))
248 } else {
249 let key = self.get_or_insert_key(value)?;
250 self.keys_builder.append_value(key);
251 Ok(key)
252 }
253 }
254
255 #[inline]
257 pub fn append_null(&mut self) {
258 self.keys_builder.append_null()
259 }
260
261 #[inline]
263 pub fn append_nulls(&mut self, n: usize) {
264 self.keys_builder.append_nulls(n);
265 }
266
267 pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
273 self.append(value).expect("dictionary key overflow");
274 }
275
276 pub fn finish(&mut self) -> DictionaryArray<K> {
278 self.dedup.clear();
279 let values = self.values_builder.finish();
280 let keys = self.keys_builder.finish();
281
282 let data_type = DataType::Dictionary(
283 Box::new(K::DATA_TYPE),
284 Box::new(FixedSizeBinary(self.byte_width)),
285 );
286
287 let builder = keys
288 .into_data()
289 .into_builder()
290 .data_type(data_type)
291 .child_data(vec![values.into_data()]);
292
293 DictionaryArray::from(unsafe { builder.build_unchecked() })
294 }
295
296 pub fn finish_cloned(&self) -> DictionaryArray<K> {
298 let values = self.values_builder.finish_cloned();
299 let keys = self.keys_builder.finish_cloned();
300
301 let data_type = DataType::Dictionary(
302 Box::new(K::DATA_TYPE),
303 Box::new(FixedSizeBinary(self.byte_width)),
304 );
305
306 let builder = keys
307 .into_data()
308 .into_builder()
309 .data_type(data_type)
310 .child_data(vec![values.into_data()]);
311
312 DictionaryArray::from(unsafe { builder.build_unchecked() })
313 }
314
315 pub fn finish_preserve_values(&mut self) -> DictionaryArray<K> {
333 let values = self.values_builder.finish_cloned();
334 let keys = self.keys_builder.finish();
335
336 let data_type = DataType::Dictionary(
337 Box::new(K::DATA_TYPE),
338 Box::new(FixedSizeBinary(self.byte_width)),
339 );
340
341 let builder = keys
342 .into_data()
343 .into_builder()
344 .data_type(data_type)
345 .child_data(vec![values.into_data()]);
346
347 DictionaryArray::from(unsafe { builder.build_unchecked() })
348 }
349}
350
351fn get_bytes(values: &FixedSizeBinaryBuilder, byte_width: i32, idx: usize) -> &[u8] {
352 let values = values.values_slice();
353 let start = idx * byte_width.as_usize();
354 let end = idx * byte_width.as_usize() + byte_width.as_usize();
355 &values[start..end]
356}
357
#[cfg(test)]
mod tests {
    use super::*;

    use crate::types::{Int16Type, Int32Type, Int8Type, UInt16Type, UInt8Type};
    use crate::{ArrowPrimitiveType, FixedSizeBinaryArray, Int8Array};

    /// Repeated values share a key and nulls are recorded in the keys only.
    #[test]
    fn test_fixed_size_dictionary_builder() {
        let [abc, def] = ["abc", "def"];

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        assert_eq!(builder.append(abc).unwrap(), 0);
        builder.append_null();
        assert_eq!(builder.append(def).unwrap(), 1);
        assert_eq!(builder.append(def).unwrap(), 1);
        assert_eq!(builder.append(abc).unwrap(), 0);
        builder.append_nulls(2);
        assert_eq!(builder.append(abc).unwrap(), 0);
        let array = builder.finish();

        let expected_keys = Int8Array::from(vec![
            Some(0),
            None,
            Some(1),
            Some(1),
            Some(0),
            None,
            None,
            Some(0),
        ]);
        assert_eq!(array.keys(), &expected_keys);

        let dictionary = array.values();
        let typed = dictionary
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        assert_eq!(typed.value(0), abc.as_bytes());
        assert_eq!(typed.value(1), def.as_bytes());
    }

    /// Values whose length differs from the byte width are rejected.
    #[test]
    fn test_fixed_size_dictionary_builder_wrong_size() {
        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        let too_long = builder.append(b"too long").unwrap_err();
        assert_eq!(too_long.to_string(), "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 8");

        let empty = builder.append("").unwrap_err();
        assert_eq!(empty.to_string(), "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 0");
    }

    /// `finish_cloned` snapshots the builder without resetting it.
    #[test]
    fn test_fixed_size_dictionary_builder_finish_cloned() {
        let [abc, def, ghi] = ["abc", "def", "ghi"];

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        builder.append(abc).unwrap();
        builder.append_null();
        builder.append(def).unwrap();
        builder.append(def).unwrap();
        builder.append(abc).unwrap();
        let snapshot = builder.finish_cloned();

        assert_eq!(
            snapshot.keys(),
            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
        );

        let snapshot_values = snapshot
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        assert_eq!(snapshot_values.value(0), abc.as_bytes());
        assert_eq!(snapshot_values.value(1), def.as_bytes());

        // The builder still holds everything appended before the snapshot.
        builder.append(abc).unwrap();
        builder.append(ghi).unwrap();
        builder.append(def).unwrap();

        let finished = builder.finish();

        let expected_keys = Int8Array::from(vec![
            Some(0),
            None,
            Some(1),
            Some(1),
            Some(0),
            Some(0),
            Some(2),
            Some(1),
        ]);
        assert_eq!(finished.keys(), &expected_keys);

        let finished_values = finished
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        assert_eq!(finished_values.value(0), abc.as_bytes());
        assert_eq!(finished_values.value(1), def.as_bytes());
        assert_eq!(finished_values.value(2), ghi.as_bytes());
    }

    /// Builds a `K1`-keyed builder from the first three `values` plus one
    /// null, converts it to `K2` keys and checks keys and values afterwards.
    fn _test_try_new_from_builder_generic_for_key_types<K1, K2>(values: Vec<[u8; 3]>)
    where
        K1: ArrowDictionaryKeyType,
        K1::Native: NumCast,
        K2: ArrowDictionaryKeyType,
        K2::Native: NumCast + From<u8>,
    {
        let mut source = FixedSizeBinaryDictionaryBuilder::<K1>::new(3);
        source.append_value(values[0]);
        source.append_null();
        source.append_value(values[1]);
        source.append_value(values[2]);

        let mut converted =
            FixedSizeBinaryDictionaryBuilder::<K2>::try_new_from_builder(source).unwrap();
        let array = converted.finish();

        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
        for expected in [Some(0u8), None, Some(1u8), Some(2u8)] {
            match expected {
                Some(raw) => expected_keys_builder
                    .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(raw)),
                None => expected_keys_builder.append_null(),
            }
        }
        let expected_keys = expected_keys_builder.finish();
        assert_eq!(array.keys(), &expected_keys);

        let dictionary = array.values();
        let typed = dictionary
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();
        for (idx, expected) in values.iter().enumerate().take(3) {
            assert_eq!(typed.value(idx), &expected[..]);
        }
    }

    /// Key conversion works across widening, narrowing and signedness changes.
    #[test]
    fn test_try_new_from_builder() {
        let values = vec![[1, 2, 3], [5, 6, 7], [6, 7, 8]];
        // Unsigned keys: widening, then narrowing.
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type>(values.clone());
        // Signed keys: widening, then narrowing.
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type>(values.clone());
        // Remaining key-type pairs.
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type>(values.clone());
    }

    /// Conversion fails when an existing key does not fit the target type.
    #[test]
    fn test_try_new_from_builder_cast_fails() {
        let mut source_builder = FixedSizeBinaryDictionaryBuilder::<UInt16Type>::new(2);
        // 257 distinct two-byte values yield keys 0..=256; 256 does not fit in u8.
        (0u16..257u16).for_each(|i| source_builder.append_value(i.to_be_bytes()));

        let result =
            FixedSizeBinaryDictionaryBuilder::<UInt8Type>::try_new_from_builder(source_builder);
        match result {
            Ok(_) => panic!("casting keys from UInt16 to UInt8 should have failed"),
            Err(e) => {
                assert!(matches!(e, ArrowError::CastError(_)));
                assert_eq!(
                    e.to_string(),
                    "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
                );
            }
        }
    }

    /// `finish_preserve_values` resets the keys but keeps the dictionary.
    #[test]
    fn test_finish_preserve_values() {
        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int32Type>::new(3);
        for value in ["aaa", "bbb", "ccc"] {
            builder.append_value(value);
        }
        let first = builder.finish_preserve_values();
        assert_eq!(first.keys().values(), &[0, 1, 2]);
        let first_decoded = first
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            first_decoded,
            vec![
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes())
            ]
        );

        for value in ["ddd", "eee"] {
            builder.append_value(value);
        }
        let second = builder.finish_preserve_values();

        // Keys continue after the three values that were preserved.
        assert_eq!(second.keys().values(), &[3, 4]);
        let second_decoded = second
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            second_decoded,
            [Some("ddd".as_bytes()), Some("eee".as_bytes())]
        );

        // The dictionary itself holds the values of both batches.
        let all_values = second
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            all_values,
            [
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes()),
                Some("ddd".as_bytes()),
                Some("eee".as_bytes())
            ]
        );
    }
}