Skip to main content

arrow_cast/cast/
dictionary.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::cast::*;
19
20/// Attempts to cast an `ArrayDictionary` with index type K into
21/// `to_type` for supported types.
22///
23/// K is the key type
24pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
25    array: &dyn Array,
26    to_type: &DataType,
27    cast_options: &CastOptions,
28) -> Result<ArrayRef, ArrowError> {
29    use DataType::*;
30
31    let array = array.as_dictionary::<K>();
32    let from_child_type = array.values().data_type();
33    match (from_child_type, to_type) {
34        (_, Dictionary(to_index_type, to_value_type)) => {
35            dictionary_to_dictionary_cast(array, to_index_type, to_value_type, cast_options)
36        }
37        // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data
38        // copy of the value buffer. Fast path which avoids copying underlying values buffer.
39        // TODO: handle LargeUtf8/LargeBinary -> View (need to check offsets can fit)
40        // TODO: handle cross types (String -> BinaryView, Binary -> StringView)
41        //       (need to validate utf8?)
42        (Utf8, Utf8View) => view_from_dict_values::<K, Utf8Type, StringViewType>(
43            array.keys(),
44            array.values().as_string::<i32>(),
45        ),
46        (Binary, BinaryView) => view_from_dict_values::<K, BinaryType, BinaryViewType>(
47            array.keys(),
48            array.values().as_binary::<i32>(),
49        ),
50        _ => unpack_dictionary(array, to_type, cast_options),
51    }
52}
53
54fn dictionary_to_dictionary_cast<K: ArrowDictionaryKeyType>(
55    array: &DictionaryArray<K>,
56    to_index_type: &DataType,
57    to_value_type: &DataType,
58    cast_options: &CastOptions,
59) -> Result<ArrayRef, ArrowError> {
60    use DataType::*;
61
62    let keys_array: ArrayRef = Arc::new(PrimitiveArray::<K>::from(array.keys().to_data()));
63    let values_array = array.values();
64    let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
65    let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
66
67    // Failure to cast keys (because they don't fit in the
68    // target type) results in NULL values;
69    if cast_keys.null_count() > keys_array.null_count() {
70        return Err(ArrowError::ComputeError(format!(
71            "Could not convert {} dictionary indexes from {:?} to {:?}",
72            cast_keys.null_count() - keys_array.null_count(),
73            keys_array.data_type(),
74            to_index_type
75        )));
76    }
77
78    let data = cast_keys.into_data();
79    let builder = data
80        .into_builder()
81        .data_type(Dictionary(
82            Box::new(to_index_type.clone()),
83            Box::new(to_value_type.clone()),
84        ))
85        .child_data(vec![cast_values.into_data()]);
86
87    // Safety
88    // Cast keys are still valid
89    let data = unsafe { builder.build_unchecked() };
90
91    // create the appropriate array type
92    let new_array: ArrayRef = match to_index_type {
93        Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
94        Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
95        Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
96        Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
97        UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
98        UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
99        UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
100        UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
101        _ => {
102            return Err(ArrowError::CastError(format!(
103                "Unsupported type {to_index_type} for dictionary index"
104            )));
105        }
106    };
107
108    Ok(new_array)
109}
110
111fn view_from_dict_values<K: ArrowDictionaryKeyType, V: ByteArrayType, T: ByteViewType>(
112    keys: &PrimitiveArray<K>,
113    values: &GenericByteArray<V>,
114) -> Result<ArrayRef, ArrowError> {
115    let value_buffer = values.values();
116    let value_offsets = values.value_offsets();
117    let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
118    builder.append_block(value_buffer.clone());
119    for i in keys.iter() {
120        match i {
121            Some(v) => {
122                let idx = v.to_usize().ok_or_else(|| {
123                    ArrowError::ComputeError("Invalid dictionary index".to_string())
124                })?;
125
126                // Safety
127                // (1) The index is within bounds as they are offsets
128                // (2) The append_view is safe
129                unsafe {
130                    let offset = value_offsets.get_unchecked(idx).as_usize();
131                    let end = value_offsets.get_unchecked(idx + 1).as_usize();
132                    let length = end - offset;
133                    builder.append_view_unchecked(0, offset as u32, length as u32)
134                }
135            }
136            None => {
137                builder.append_null();
138            }
139        }
140    }
141    Ok(Arc::new(builder.finish()))
142}
143
144// Unpack a dictionary into a flattened array of type to_type
145pub(crate) fn unpack_dictionary<K: ArrowDictionaryKeyType>(
146    array: &DictionaryArray<K>,
147    to_type: &DataType,
148    cast_options: &CastOptions,
149) -> Result<ArrayRef, ArrowError> {
150    let cast_dict_values = cast_with_options(array.values(), to_type, cast_options)?;
151    take(cast_dict_values.as_ref(), array.keys(), None)
152}
153
154/// Pack a data type into a dictionary array passing the values through a primitive array
155pub(crate) fn pack_array_to_dictionary_via_primitive<K: ArrowDictionaryKeyType>(
156    array: &dyn Array,
157    primitive_type: DataType,
158    dict_value_type: &DataType,
159    cast_options: &CastOptions,
160) -> Result<ArrayRef, ArrowError> {
161    let primitive = cast_with_options(array, &primitive_type, cast_options)?;
162    let dict = cast_with_options(
163        primitive.as_ref(),
164        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(primitive_type)),
165        cast_options,
166    )?;
167    cast_with_options(
168        dict.as_ref(),
169        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(dict_value_type.clone())),
170        cast_options,
171    )
172}
173
174/// Attempts to encode an array into an `ArrayDictionary` with index
175/// type K and value (dictionary) type value_type
176///
177/// K is the key type
178pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
179    array: &dyn Array,
180    dict_value_type: &DataType,
181    cast_options: &CastOptions,
182) -> Result<ArrayRef, ArrowError> {
183    use DataType::*;
184
185    match *dict_value_type {
186        Int8 => pack_numeric_to_dictionary::<K, Int8Type>(array, dict_value_type, cast_options),
187        Int16 => pack_numeric_to_dictionary::<K, Int16Type>(array, dict_value_type, cast_options),
188        Int32 => pack_numeric_to_dictionary::<K, Int32Type>(array, dict_value_type, cast_options),
189        Int64 => pack_numeric_to_dictionary::<K, Int64Type>(array, dict_value_type, cast_options),
190        UInt8 => pack_numeric_to_dictionary::<K, UInt8Type>(array, dict_value_type, cast_options),
191        UInt16 => pack_numeric_to_dictionary::<K, UInt16Type>(array, dict_value_type, cast_options),
192        UInt32 => pack_numeric_to_dictionary::<K, UInt32Type>(array, dict_value_type, cast_options),
193        UInt64 => pack_numeric_to_dictionary::<K, UInt64Type>(array, dict_value_type, cast_options),
194        Decimal32(p, s) => pack_decimal_to_dictionary::<K, Decimal32Type>(
195            array,
196            dict_value_type,
197            p,
198            s,
199            cast_options,
200        ),
201        Decimal64(p, s) => pack_decimal_to_dictionary::<K, Decimal64Type>(
202            array,
203            dict_value_type,
204            p,
205            s,
206            cast_options,
207        ),
208        Decimal128(p, s) => pack_decimal_to_dictionary::<K, Decimal128Type>(
209            array,
210            dict_value_type,
211            p,
212            s,
213            cast_options,
214        ),
215        Decimal256(p, s) => pack_decimal_to_dictionary::<K, Decimal256Type>(
216            array,
217            dict_value_type,
218            p,
219            s,
220            cast_options,
221        ),
222        Float16 => {
223            pack_numeric_to_dictionary::<K, Float16Type>(array, dict_value_type, cast_options)
224        }
225        Float32 => {
226            pack_numeric_to_dictionary::<K, Float32Type>(array, dict_value_type, cast_options)
227        }
228        Float64 => {
229            pack_numeric_to_dictionary::<K, Float64Type>(array, dict_value_type, cast_options)
230        }
231        Date32 => pack_array_to_dictionary_via_primitive::<K>(
232            array,
233            DataType::Int32,
234            dict_value_type,
235            cast_options,
236        ),
237        Date64 => pack_array_to_dictionary_via_primitive::<K>(
238            array,
239            DataType::Int64,
240            dict_value_type,
241            cast_options,
242        ),
243        Time32(_) => pack_array_to_dictionary_via_primitive::<K>(
244            array,
245            DataType::Int32,
246            dict_value_type,
247            cast_options,
248        ),
249        Time64(_) => pack_array_to_dictionary_via_primitive::<K>(
250            array,
251            DataType::Int64,
252            dict_value_type,
253            cast_options,
254        ),
255        Timestamp(_, _) => pack_array_to_dictionary_via_primitive::<K>(
256            array,
257            DataType::Int64,
258            dict_value_type,
259            cast_options,
260        ),
261        Utf8 => {
262            // If the input is a view type, we can avoid casting (thus copying) the data
263            if array.data_type() == &DataType::Utf8View {
264                return string_view_to_dictionary::<K, i32>(array);
265            }
266            pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
267        }
268        LargeUtf8 => {
269            // If the input is a view type, we can avoid casting (thus copying) the data
270            if array.data_type() == &DataType::Utf8View {
271                return string_view_to_dictionary::<K, i64>(array);
272            }
273            pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
274        }
275        Utf8View => {
276            let base_value_type = match array.data_type() {
277                DataType::LargeUtf8 | DataType::Utf8View => DataType::LargeUtf8,
278                _ => DataType::Utf8,
279            };
280
281            let dict_base = cast_to_dictionary::<K>(array, &base_value_type, cast_options)?;
282            dictionary_cast::<K>(
283                dict_base.as_ref(),
284                &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::Utf8View)),
285                cast_options,
286            )
287        }
288        Binary => {
289            // If the input is a view type, we can avoid casting (thus copying) the data
290            if array.data_type() == &DataType::BinaryView {
291                return binary_view_to_dictionary::<K, i32>(array);
292            }
293            pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
294        }
295        LargeBinary => {
296            // If the input is a view type, we can avoid casting (thus copying) the data
297            if array.data_type() == &DataType::BinaryView {
298                return binary_view_to_dictionary::<K, i64>(array);
299            }
300            pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
301        }
302        BinaryView => {
303            let base_value_type = match array.data_type() {
304                DataType::LargeBinary | DataType::BinaryView => DataType::LargeBinary,
305                _ => DataType::Binary,
306            };
307
308            let dict_base = cast_to_dictionary::<K>(array, &base_value_type, cast_options)?;
309            dictionary_cast::<K>(
310                dict_base.as_ref(),
311                &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::BinaryView)),
312                cast_options,
313            )
314        }
315        FixedSizeBinary(byte_size) => {
316            pack_byte_to_fixed_size_dictionary::<K>(array, cast_options, byte_size)
317        }
318        Struct(_) => pack_struct_to_dictionary::<K>(array, dict_value_type, cast_options),
319        _ => Err(ArrowError::CastError(format!(
320            "Unsupported output type for dictionary packing: {dict_value_type}"
321        ))),
322    }
323}
324
325/// Wrap a struct-valued array as a `DictionaryArray<K, Struct>` with identity
326/// keys `[0, 1, ..., len-1]`. Unlike the primitive / byte packers above, no
327/// deduplication is performed, since struct values have no general hash/equality
328/// builder in arrow-rs.
329///
330/// Each child field of the source is recursively cast to the matching field of
331/// `dict_value_type` via `cast_with_options` before keys are emitted. If any
332/// child cast fails, the whole pack fails, the same contract as the primitive
333/// packers above.
334fn pack_struct_to_dictionary<K: ArrowDictionaryKeyType>(
335    array: &dyn Array,
336    dict_value_type: &DataType,
337    cast_options: &CastOptions,
338) -> Result<ArrayRef, ArrowError> {
339    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
340    let len = cast_values.len();
341
342    // Identity keys `[0, 1, ..., len-1]`, with null entries wherever the
343    // source row is null so the dictionary's logical null mask matches.
344    let mut builder = PrimitiveBuilder::<K>::with_capacity(len);
345    for i in 0..len {
346        if cast_values.is_null(i) {
347            builder.append_null();
348        } else {
349            let key = K::Native::from_usize(i).ok_or_else(|| {
350                ArrowError::CastError(format!(
351                    "Cannot fit {len} dictionary keys in {:?}",
352                    K::DATA_TYPE,
353                ))
354            })?;
355            builder.append_value(key);
356        }
357    }
358    let keys = builder.finish();
359
360    Ok(Arc::new(DictionaryArray::<K>::try_new(keys, cast_values)?))
361}
362
363// Packs the data from the primitive array of type <V> to a
364// DictionaryArray with keys of type K and values of value_type V
365pub(crate) fn pack_numeric_to_dictionary<K, V>(
366    array: &dyn Array,
367    dict_value_type: &DataType,
368    cast_options: &CastOptions,
369) -> Result<ArrayRef, ArrowError>
370where
371    K: ArrowDictionaryKeyType,
372    V: ArrowPrimitiveType,
373{
374    // attempt to cast the source array values to the target value type (the dictionary values type)
375    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
376    let values = cast_values.as_primitive::<V>();
377
378    let mut b = PrimitiveDictionaryBuilder::<K, V>::with_capacity(values.len(), values.len());
379
380    // copy each element one at a time
381    for i in 0..values.len() {
382        if values.is_null(i) {
383            b.append_null();
384        } else {
385            b.append(values.value(i))?;
386        }
387    }
388    Ok(Arc::new(b.finish()))
389}
390
391pub(crate) fn pack_decimal_to_dictionary<K, D>(
392    array: &dyn Array,
393    dict_value_type: &DataType,
394    precision: u8,
395    scale: i8,
396    cast_options: &CastOptions,
397) -> Result<ArrayRef, ArrowError>
398where
399    K: ArrowDictionaryKeyType,
400    D: DecimalType + ArrowPrimitiveType,
401{
402    let dict = pack_numeric_to_dictionary::<K, D>(array, dict_value_type, cast_options)?;
403    let dict = dict.as_dictionary::<K>();
404    let typed = dict.downcast_dict::<PrimitiveArray<D>>().ok_or_else(|| {
405        ArrowError::ComputeError(format!(
406            "Internal Error: Cannot cast dict to {}Array",
407            D::PREFIX
408        ))
409    })?;
410    let value = typed
411        .values()
412        .clone()
413        .with_precision_and_scale(precision, scale)?;
414    Ok(Arc::new(dict.with_values(Arc::new(value))))
415}
416
417pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
418    array: &dyn Array,
419) -> Result<ArrayRef, ArrowError>
420where
421    K: ArrowDictionaryKeyType,
422{
423    let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
424        array.len(),
425        1024,
426        1024,
427    );
428    let string_view = array
429        .as_any()
430        .downcast_ref::<StringViewArray>()
431        .ok_or_else(|| {
432            ArrowError::ComputeError("Internal Error: Cannot cast to StringViewArray".to_string())
433        })?;
434    for v in string_view.iter() {
435        match v {
436            Some(v) => {
437                b.append(v)?;
438            }
439            None => {
440                b.append_null();
441            }
442        }
443    }
444
445    Ok(Arc::new(b.finish()))
446}
447
448pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
449    array: &dyn Array,
450) -> Result<ArrayRef, ArrowError>
451where
452    K: ArrowDictionaryKeyType,
453{
454    let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
455        array.len(),
456        1024,
457        1024,
458    );
459    let binary_view = array
460        .as_any()
461        .downcast_ref::<BinaryViewArray>()
462        .ok_or_else(|| {
463            ArrowError::ComputeError("Internal Error: Cannot cast to BinaryViewArray".to_string())
464        })?;
465    for v in binary_view.iter() {
466        match v {
467            Some(v) => {
468                b.append(v)?;
469            }
470            None => {
471                b.append_null();
472            }
473        }
474    }
475
476    Ok(Arc::new(b.finish()))
477}
478
479// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
480// key types of K
481pub(crate) fn pack_byte_to_dictionary<K, T>(
482    array: &dyn Array,
483    cast_options: &CastOptions,
484) -> Result<ArrayRef, ArrowError>
485where
486    K: ArrowDictionaryKeyType,
487    T: ByteArrayType,
488{
489    let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?;
490    let values = cast_values
491        .as_any()
492        .downcast_ref::<GenericByteArray<T>>()
493        .ok_or_else(|| {
494            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
495        })?;
496    let mut b = GenericByteDictionaryBuilder::<K, T>::with_capacity(values.len(), 1024, 1024);
497
498    // copy each element one at a time
499    for i in 0..values.len() {
500        if values.is_null(i) {
501            b.append_null();
502        } else {
503            b.append(values.value(i))?;
504        }
505    }
506    Ok(Arc::new(b.finish()))
507}
508
509// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
510// key types of K
511pub(crate) fn pack_byte_to_fixed_size_dictionary<K>(
512    array: &dyn Array,
513    cast_options: &CastOptions,
514    byte_width: i32,
515) -> Result<ArrayRef, ArrowError>
516where
517    K: ArrowDictionaryKeyType,
518{
519    let cast_values =
520        cast_with_options(array, &DataType::FixedSizeBinary(byte_width), cast_options)?;
521    let values = cast_values
522        .as_any()
523        .downcast_ref::<FixedSizeBinaryArray>()
524        .ok_or_else(|| {
525            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
526        })?;
527    let mut b = FixedSizeBinaryDictionaryBuilder::<K>::with_capacity(1024, 1024, byte_width);
528
529    // copy each element one at a time
530    for i in 0..values.len() {
531        if values.is_null(i) {
532            b.append_null();
533        } else {
534            b.append(values.value(i))?;
535        }
536    }
537    Ok(Arc::new(b.finish()))
538}