1use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder, PrimitiveBuilder};
19use crate::types::ArrowDictionaryKeyType;
20use crate::{Array, ArrayRef, DictionaryArray, PrimitiveArray};
21use arrow_buffer::ArrowNativeType;
22use arrow_schema::DataType::FixedSizeBinary;
23use arrow_schema::{ArrowError, DataType};
24use hashbrown::HashTable;
25use num::NumCast;
26use std::any::Any;
27use std::sync::Arc;
28
29#[derive(Debug)]
61pub struct FixedSizeBinaryDictionaryBuilder<K>
62where
63 K: ArrowDictionaryKeyType,
64{
65 state: ahash::RandomState,
66 dedup: HashTable<usize>,
67
68 keys_builder: PrimitiveBuilder<K>,
69 values_builder: FixedSizeBinaryBuilder,
70 byte_width: i32,
71}
72
73impl<K> FixedSizeBinaryDictionaryBuilder<K>
74where
75 K: ArrowDictionaryKeyType,
76{
77 pub fn new(byte_width: i32) -> Self {
79 let keys_builder = PrimitiveBuilder::new();
80 let values_builder = FixedSizeBinaryBuilder::new(byte_width);
81 Self {
82 state: Default::default(),
83 dedup: HashTable::with_capacity(keys_builder.capacity()),
84 keys_builder,
85 values_builder,
86 byte_width,
87 }
88 }
89
90 pub fn with_capacity(keys_capacity: usize, value_capacity: usize, byte_width: i32) -> Self {
96 Self {
97 state: Default::default(),
98 dedup: Default::default(),
99 keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
100 values_builder: FixedSizeBinaryBuilder::with_capacity(value_capacity, byte_width),
101 byte_width,
102 }
103 }
104
105 pub fn try_new_from_builder<K2>(
131 mut source: FixedSizeBinaryDictionaryBuilder<K2>,
132 ) -> Result<Self, ArrowError>
133 where
134 K::Native: NumCast,
135 K2: ArrowDictionaryKeyType,
136 K2::Native: NumCast,
137 {
138 let state = source.state;
139 let dedup = source.dedup;
140 let values_builder = source.values_builder;
141 let byte_width = source.byte_width;
142
143 let source_keys = source.keys_builder.finish();
144 let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
145 num::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
146 ArrowError::CastError(format!(
147 "Can't cast dictionary keys from source type {:?} to type {:?}",
148 K2::DATA_TYPE,
149 K::DATA_TYPE
150 ))
151 })
152 })?;
153
154 drop(source_keys);
158
159 Ok(Self {
160 state,
161 dedup,
162 keys_builder: new_keys
163 .into_builder()
164 .expect("underlying buffer has no references"),
165 values_builder,
166 byte_width,
167 })
168 }
169}
170
171impl<K> ArrayBuilder for FixedSizeBinaryDictionaryBuilder<K>
172where
173 K: ArrowDictionaryKeyType,
174{
175 fn as_any(&self) -> &dyn Any {
177 self
178 }
179
180 fn as_any_mut(&mut self) -> &mut dyn Any {
182 self
183 }
184
185 fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
187 self
188 }
189
190 fn len(&self) -> usize {
192 self.keys_builder.len()
193 }
194
195 fn finish(&mut self) -> ArrayRef {
197 Arc::new(self.finish())
198 }
199
200 fn finish_cloned(&self) -> ArrayRef {
202 Arc::new(self.finish_cloned())
203 }
204}
205
206impl<K> FixedSizeBinaryDictionaryBuilder<K>
207where
208 K: ArrowDictionaryKeyType,
209{
210 fn get_or_insert_key(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
211 let value_bytes: &[u8] = value.as_ref();
212
213 let state = &self.state;
214 let storage = &mut self.values_builder;
215 let hash = state.hash_one(value_bytes);
216
217 let idx = *self
218 .dedup
219 .entry(
220 hash,
221 |idx| value_bytes == get_bytes(storage, self.byte_width, *idx),
222 |idx| state.hash_one(get_bytes(storage, self.byte_width, *idx)),
223 )
224 .or_insert_with(|| {
225 let idx = storage.len();
226 let _ = storage.append_value(value);
227 idx
228 })
229 .get();
230
231 let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
232
233 Ok(key)
234 }
235
236 pub fn append(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
242 if self.byte_width != value.as_ref().len() as i32 {
243 Err(ArrowError::InvalidArgumentError(format!(
244 "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
245 self.byte_width,
246 value.as_ref().len()
247 )))
248 } else {
249 let key = self.get_or_insert_key(value)?;
250 self.keys_builder.append_value(key);
251 Ok(key)
252 }
253 }
254
255 #[inline]
257 pub fn append_null(&mut self) {
258 self.keys_builder.append_null()
259 }
260
261 #[inline]
263 pub fn append_nulls(&mut self, n: usize) {
264 self.keys_builder.append_nulls(n);
265 }
266
267 pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
273 self.append(value).expect("dictionary key overflow");
274 }
275
276 pub fn finish(&mut self) -> DictionaryArray<K> {
278 self.dedup.clear();
279 let values = self.values_builder.finish();
280 let keys = self.keys_builder.finish();
281
282 let data_type = DataType::Dictionary(
283 Box::new(K::DATA_TYPE),
284 Box::new(FixedSizeBinary(self.byte_width)),
285 );
286
287 let builder = keys
288 .into_data()
289 .into_builder()
290 .data_type(data_type)
291 .child_data(vec![values.into_data()]);
292
293 DictionaryArray::from(unsafe { builder.build_unchecked() })
294 }
295
296 pub fn finish_cloned(&self) -> DictionaryArray<K> {
298 let values = self.values_builder.finish_cloned();
299 let keys = self.keys_builder.finish_cloned();
300
301 let data_type = DataType::Dictionary(
302 Box::new(K::DATA_TYPE),
303 Box::new(FixedSizeBinary(self.byte_width)),
304 );
305
306 let builder = keys
307 .into_data()
308 .into_builder()
309 .data_type(data_type)
310 .child_data(vec![values.into_data()]);
311
312 DictionaryArray::from(unsafe { builder.build_unchecked() })
313 }
314}
315
316fn get_bytes(values: &FixedSizeBinaryBuilder, byte_width: i32, idx: usize) -> &[u8] {
317 let values = values.values_slice();
318 let start = idx * byte_width.as_usize();
319 let end = idx * byte_width.as_usize() + byte_width.as_usize();
320 &values[start..end]
321}
322
#[cfg(test)]
mod tests {
    use super::*;

    use crate::types::{Int16Type, Int32Type, Int8Type, UInt16Type, UInt8Type};
    use crate::{ArrowPrimitiveType, FixedSizeBinaryArray, Int8Array};

    /// Downcasts the dictionary values of `array` to a [`FixedSizeBinaryArray`].
    fn fixed_size_values<K: ArrowDictionaryKeyType>(
        array: &DictionaryArray<K>,
    ) -> &FixedSizeBinaryArray {
        array
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap()
    }

    #[test]
    fn test_fixed_size_dictionary_builder() {
        let values = ["abc", "def"];

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        assert_eq!(builder.append(values[0]).unwrap(), 0);
        builder.append_null();
        assert_eq!(builder.append(values[1]).unwrap(), 1);
        assert_eq!(builder.append(values[1]).unwrap(), 1);
        assert_eq!(builder.append(values[0]).unwrap(), 0);
        builder.append_nulls(2);
        assert_eq!(builder.append(values[0]).unwrap(), 0);
        let dictionary = builder.finish();

        let expected_keys = Int8Array::from(vec![
            Some(0),
            None,
            Some(1),
            Some(1),
            Some(0),
            None,
            None,
            Some(0),
        ]);
        assert_eq!(dictionary.keys(), &expected_keys);

        // Values are stored once each, in first-seen order.
        let stored = fixed_size_values(&dictionary);
        for (i, expected) in values.iter().enumerate() {
            assert_eq!(stored.value(i), expected.as_bytes());
        }
    }

    #[test]
    fn test_fixed_size_dictionary_builder_wrong_size() {
        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        let too_long = builder.append(b"too long").unwrap_err().to_string();
        assert_eq!(too_long, "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 8");

        let empty = builder.append("").unwrap_err().to_string();
        assert_eq!(empty, "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 0");
    }

    #[test]
    fn test_fixed_size_dictionary_builder_finish_cloned() {
        let values = ["abc", "def", "ghi"];

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        builder.append(values[0]).unwrap();
        builder.append_null();
        builder.append(values[1]).unwrap();
        builder.append(values[1]).unwrap();
        builder.append(values[0]).unwrap();

        // A snapshot must not disturb the builder's state.
        let snapshot = builder.finish_cloned();

        assert_eq!(
            snapshot.keys(),
            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
        );

        let snapshot_values = fixed_size_values(&snapshot);
        for (i, expected) in values.iter().enumerate().take(2) {
            assert_eq!(snapshot_values.value(i), expected.as_bytes());
        }

        // Keep appending after the snapshot: old keys stay valid, new values
        // get the next free key.
        builder.append(values[0]).unwrap();
        builder.append(values[2]).unwrap();
        builder.append(values[1]).unwrap();

        let finished = builder.finish();

        let expected_keys = Int8Array::from(vec![
            Some(0),
            None,
            Some(1),
            Some(1),
            Some(0),
            Some(0),
            Some(2),
            Some(1),
        ]);
        assert_eq!(finished.keys(), &expected_keys);

        let finished_values = fixed_size_values(&finished);
        for (i, expected) in values.iter().enumerate() {
            assert_eq!(finished_values.value(i), expected.as_bytes());
        }
    }

    /// Fills a builder keyed by `K1`, converts it to a builder keyed by `K2`
    /// and checks that keys and values survive the conversion.
    fn _test_try_new_from_builder_generic_for_key_types<K1, K2>(values: Vec<[u8; 3]>)
    where
        K1: ArrowDictionaryKeyType,
        K1::Native: NumCast,
        K2: ArrowDictionaryKeyType,
        K2::Native: NumCast + From<u8>,
    {
        let mut source = FixedSizeBinaryDictionaryBuilder::<K1>::new(3);
        source.append_value(values[0]);
        source.append_null();
        source.append_value(values[1]);
        source.append_value(values[2]);

        let mut converted =
            FixedSizeBinaryDictionaryBuilder::<K2>::try_new_from_builder(source).unwrap();
        let dictionary = converted.finish();

        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
        for key in [Some(0u8), None, Some(1u8), Some(2u8)] {
            match key {
                Some(k) => expected_keys_builder
                    .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(k)),
                None => expected_keys_builder.append_null(),
            }
        }
        let expected_keys = expected_keys_builder.finish();
        assert_eq!(dictionary.keys(), &expected_keys);

        let stored = fixed_size_values(&dictionary);
        for (i, expected) in values.iter().enumerate().take(3) {
            assert_eq!(stored.value(i), *expected);
        }
    }

    #[test]
    fn test_try_new_from_builder() {
        let values = vec![[1, 2, 3], [5, 6, 7], [6, 7, 8]];
        // Unsigned, widening.
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type>(values.clone());
        // Unsigned, narrowing.
        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type>(values.clone());
        // Signed, widening.
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type>(values.clone());
        // Signed, narrowing.
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type>(values.clone());
        // Mixed signedness and sizes.
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type>(values.clone());
    }

    #[test]
    fn test_try_new_from_builder_cast_fails() {
        let mut source_builder = FixedSizeBinaryDictionaryBuilder::<UInt16Type>::new(2);
        // 257 distinct two-byte values: the last key (256) cannot fit in a u8.
        for i in 0u16..257 {
            source_builder.append_value(i.to_be_bytes());
        }

        let result =
            FixedSizeBinaryDictionaryBuilder::<UInt8Type>::try_new_from_builder(source_builder);
        assert!(result.is_err());
        if let Err(err) = result {
            assert!(matches!(err, ArrowError::CastError(_)));
            assert_eq!(
                err.to_string(),
                "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
            );
        }
    }
}