Skip to main content

arrow_cast/cast/
dictionary.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::cast::*;
19
20/// Attempts to cast an `ArrayDictionary` with index type K into
21/// `to_type` for supported types.
22///
23/// K is the key type
24pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
25    array: &dyn Array,
26    to_type: &DataType,
27    cast_options: &CastOptions,
28) -> Result<ArrayRef, ArrowError> {
29    use DataType::*;
30
31    let array = array.as_dictionary::<K>();
32    let from_child_type = array.values().data_type();
33    match (from_child_type, to_type) {
34        (_, Dictionary(to_index_type, to_value_type)) => {
35            dictionary_to_dictionary_cast(array, to_index_type, to_value_type, cast_options)
36        }
37        // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data
38        // copy of the value buffer. Fast path which avoids copying underlying values buffer.
39        // TODO: handle LargeUtf8/LargeBinary -> View (need to check offsets can fit)
40        // TODO: handle cross types (String -> BinaryView, Binary -> StringView)
41        //       (need to validate utf8?)
42        (Utf8, Utf8View) => view_from_dict_values::<K, Utf8Type, StringViewType>(
43            array.keys(),
44            array.values().as_string::<i32>(),
45        ),
46        (Binary, BinaryView) => view_from_dict_values::<K, BinaryType, BinaryViewType>(
47            array.keys(),
48            array.values().as_binary::<i32>(),
49        ),
50        _ => unpack_dictionary(array, to_type, cast_options),
51    }
52}
53
54fn dictionary_to_dictionary_cast<K: ArrowDictionaryKeyType>(
55    array: &DictionaryArray<K>,
56    to_index_type: &DataType,
57    to_value_type: &DataType,
58    cast_options: &CastOptions,
59) -> Result<ArrayRef, ArrowError> {
60    use DataType::*;
61
62    let keys_array: ArrayRef = Arc::new(PrimitiveArray::<K>::from(array.keys().to_data()));
63    let values_array = array.values();
64    let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
65    let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
66
67    // Failure to cast keys (because they don't fit in the
68    // target type) results in NULL values;
69    if cast_keys.null_count() > keys_array.null_count() {
70        return Err(ArrowError::ComputeError(format!(
71            "Could not convert {} dictionary indexes from {:?} to {:?}",
72            cast_keys.null_count() - keys_array.null_count(),
73            keys_array.data_type(),
74            to_index_type
75        )));
76    }
77
78    let data = cast_keys.into_data();
79    let builder = data
80        .into_builder()
81        .data_type(Dictionary(
82            Box::new(to_index_type.clone()),
83            Box::new(to_value_type.clone()),
84        ))
85        .child_data(vec![cast_values.into_data()]);
86
87    // Safety
88    // Cast keys are still valid
89    let data = unsafe { builder.build_unchecked() };
90
91    // create the appropriate array type
92    let new_array: ArrayRef = match to_index_type {
93        Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
94        Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
95        Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
96        Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
97        UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
98        UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
99        UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
100        UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
101        _ => {
102            return Err(ArrowError::CastError(format!(
103                "Unsupported type {to_index_type} for dictionary index"
104            )));
105        }
106    };
107
108    Ok(new_array)
109}
110
111fn view_from_dict_values<K: ArrowDictionaryKeyType, V: ByteArrayType, T: ByteViewType>(
112    keys: &PrimitiveArray<K>,
113    values: &GenericByteArray<V>,
114) -> Result<ArrayRef, ArrowError> {
115    let value_buffer = values.values();
116    let value_offsets = values.value_offsets();
117    let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
118    builder.append_block(value_buffer.clone());
119    for i in keys.iter() {
120        match i {
121            Some(v) => {
122                let idx = v.to_usize().ok_or_else(|| {
123                    ArrowError::ComputeError("Invalid dictionary index".to_string())
124                })?;
125
126                // Safety
127                // (1) The index is within bounds as they are offsets
128                // (2) The append_view is safe
129                unsafe {
130                    let offset = value_offsets.get_unchecked(idx).as_usize();
131                    let end = value_offsets.get_unchecked(idx + 1).as_usize();
132                    let length = end - offset;
133                    builder.append_view_unchecked(0, offset as u32, length as u32)
134                }
135            }
136            None => {
137                builder.append_null();
138            }
139        }
140    }
141    Ok(Arc::new(builder.finish()))
142}
143
144// Unpack a dictionary into a flattened array of type to_type
145pub(crate) fn unpack_dictionary<K: ArrowDictionaryKeyType>(
146    array: &DictionaryArray<K>,
147    to_type: &DataType,
148    cast_options: &CastOptions,
149) -> Result<ArrayRef, ArrowError> {
150    let cast_dict_values = cast_with_options(array.values(), to_type, cast_options)?;
151    take(cast_dict_values.as_ref(), array.keys(), None)
152}
153
154/// Pack a data type into a dictionary array passing the values through a primitive array
155pub(crate) fn pack_array_to_dictionary_via_primitive<K: ArrowDictionaryKeyType>(
156    array: &dyn Array,
157    primitive_type: DataType,
158    dict_value_type: &DataType,
159    cast_options: &CastOptions,
160) -> Result<ArrayRef, ArrowError> {
161    let primitive = cast_with_options(array, &primitive_type, cast_options)?;
162    let dict = cast_with_options(
163        primitive.as_ref(),
164        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(primitive_type)),
165        cast_options,
166    )?;
167    cast_with_options(
168        dict.as_ref(),
169        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(dict_value_type.clone())),
170        cast_options,
171    )
172}
173
174/// Attempts to encode an array into an `ArrayDictionary` with index
175/// type K and value (dictionary) type value_type
176///
177/// K is the key type
178pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
179    array: &dyn Array,
180    dict_value_type: &DataType,
181    cast_options: &CastOptions,
182) -> Result<ArrayRef, ArrowError> {
183    use DataType::*;
184
185    match *dict_value_type {
186        Int8 => pack_numeric_to_dictionary::<K, Int8Type>(array, dict_value_type, cast_options),
187        Int16 => pack_numeric_to_dictionary::<K, Int16Type>(array, dict_value_type, cast_options),
188        Int32 => pack_numeric_to_dictionary::<K, Int32Type>(array, dict_value_type, cast_options),
189        Int64 => pack_numeric_to_dictionary::<K, Int64Type>(array, dict_value_type, cast_options),
190        UInt8 => pack_numeric_to_dictionary::<K, UInt8Type>(array, dict_value_type, cast_options),
191        UInt16 => pack_numeric_to_dictionary::<K, UInt16Type>(array, dict_value_type, cast_options),
192        UInt32 => pack_numeric_to_dictionary::<K, UInt32Type>(array, dict_value_type, cast_options),
193        UInt64 => pack_numeric_to_dictionary::<K, UInt64Type>(array, dict_value_type, cast_options),
194        Decimal32(p, s) => pack_decimal_to_dictionary::<K, Decimal32Type>(
195            array,
196            dict_value_type,
197            p,
198            s,
199            cast_options,
200        ),
201        Decimal64(p, s) => pack_decimal_to_dictionary::<K, Decimal64Type>(
202            array,
203            dict_value_type,
204            p,
205            s,
206            cast_options,
207        ),
208        Decimal128(p, s) => pack_decimal_to_dictionary::<K, Decimal128Type>(
209            array,
210            dict_value_type,
211            p,
212            s,
213            cast_options,
214        ),
215        Decimal256(p, s) => pack_decimal_to_dictionary::<K, Decimal256Type>(
216            array,
217            dict_value_type,
218            p,
219            s,
220            cast_options,
221        ),
222        Float16 => {
223            pack_numeric_to_dictionary::<K, Float16Type>(array, dict_value_type, cast_options)
224        }
225        Float32 => {
226            pack_numeric_to_dictionary::<K, Float32Type>(array, dict_value_type, cast_options)
227        }
228        Float64 => {
229            pack_numeric_to_dictionary::<K, Float64Type>(array, dict_value_type, cast_options)
230        }
231        Date32 => pack_array_to_dictionary_via_primitive::<K>(
232            array,
233            DataType::Int32,
234            dict_value_type,
235            cast_options,
236        ),
237        Date64 => pack_array_to_dictionary_via_primitive::<K>(
238            array,
239            DataType::Int64,
240            dict_value_type,
241            cast_options,
242        ),
243        Time32(_) => pack_array_to_dictionary_via_primitive::<K>(
244            array,
245            DataType::Int32,
246            dict_value_type,
247            cast_options,
248        ),
249        Time64(_) => pack_array_to_dictionary_via_primitive::<K>(
250            array,
251            DataType::Int64,
252            dict_value_type,
253            cast_options,
254        ),
255        Timestamp(_, _) => pack_array_to_dictionary_via_primitive::<K>(
256            array,
257            DataType::Int64,
258            dict_value_type,
259            cast_options,
260        ),
261        Utf8 => {
262            // If the input is a view type, we can avoid casting (thus copying) the data
263            if array.data_type() == &DataType::Utf8View {
264                return string_view_to_dictionary::<K, i32>(array);
265            }
266            pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
267        }
268        LargeUtf8 => {
269            // If the input is a view type, we can avoid casting (thus copying) the data
270            if array.data_type() == &DataType::Utf8View {
271                return string_view_to_dictionary::<K, i64>(array);
272            }
273            pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
274        }
275        Utf8View => {
276            let base_value_type = match array.data_type() {
277                DataType::LargeUtf8 | DataType::Utf8View => DataType::LargeUtf8,
278                _ => DataType::Utf8,
279            };
280
281            let dict_base = cast_to_dictionary::<K>(array, &base_value_type, cast_options)?;
282            dictionary_cast::<K>(
283                dict_base.as_ref(),
284                &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::Utf8View)),
285                cast_options,
286            )
287        }
288        Binary => {
289            // If the input is a view type, we can avoid casting (thus copying) the data
290            if array.data_type() == &DataType::BinaryView {
291                return binary_view_to_dictionary::<K, i32>(array);
292            }
293            pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
294        }
295        LargeBinary => {
296            // If the input is a view type, we can avoid casting (thus copying) the data
297            if array.data_type() == &DataType::BinaryView {
298                return binary_view_to_dictionary::<K, i64>(array);
299            }
300            pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
301        }
302        BinaryView => {
303            let base_value_type = match array.data_type() {
304                DataType::LargeBinary | DataType::BinaryView => DataType::LargeBinary,
305                _ => DataType::Binary,
306            };
307
308            let dict_base = cast_to_dictionary::<K>(array, &base_value_type, cast_options)?;
309            dictionary_cast::<K>(
310                dict_base.as_ref(),
311                &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::BinaryView)),
312                cast_options,
313            )
314        }
315        FixedSizeBinary(byte_size) => {
316            pack_byte_to_fixed_size_dictionary::<K>(array, cast_options, byte_size)
317        }
318        _ => Err(ArrowError::CastError(format!(
319            "Unsupported output type for dictionary packing: {dict_value_type}"
320        ))),
321    }
322}
323
324// Packs the data from the primitive array of type <V> to a
325// DictionaryArray with keys of type K and values of value_type V
326pub(crate) fn pack_numeric_to_dictionary<K, V>(
327    array: &dyn Array,
328    dict_value_type: &DataType,
329    cast_options: &CastOptions,
330) -> Result<ArrayRef, ArrowError>
331where
332    K: ArrowDictionaryKeyType,
333    V: ArrowPrimitiveType,
334{
335    // attempt to cast the source array values to the target value type (the dictionary values type)
336    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
337    let values = cast_values.as_primitive::<V>();
338
339    let mut b = PrimitiveDictionaryBuilder::<K, V>::with_capacity(values.len(), values.len());
340
341    // copy each element one at a time
342    for i in 0..values.len() {
343        if values.is_null(i) {
344            b.append_null();
345        } else {
346            b.append(values.value(i))?;
347        }
348    }
349    Ok(Arc::new(b.finish()))
350}
351
352pub(crate) fn pack_decimal_to_dictionary<K, D>(
353    array: &dyn Array,
354    dict_value_type: &DataType,
355    precision: u8,
356    scale: i8,
357    cast_options: &CastOptions,
358) -> Result<ArrayRef, ArrowError>
359where
360    K: ArrowDictionaryKeyType,
361    D: DecimalType + ArrowPrimitiveType,
362{
363    let dict = pack_numeric_to_dictionary::<K, D>(array, dict_value_type, cast_options)?;
364    let dict = dict
365        .as_dictionary::<K>()
366        .downcast_dict::<PrimitiveArray<D>>()
367        .ok_or_else(|| {
368            ArrowError::ComputeError(format!(
369                "Internal Error: Cannot cast dict to {}Array",
370                D::PREFIX
371            ))
372        })?;
373    let value = dict.values().clone();
374    // Set correct precision/scale
375    let value = value.with_precision_and_scale(precision, scale)?;
376    Ok(Arc::new(DictionaryArray::<K>::try_new(
377        dict.keys().clone(),
378        Arc::new(value),
379    )?))
380}
381
382pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
383    array: &dyn Array,
384) -> Result<ArrayRef, ArrowError>
385where
386    K: ArrowDictionaryKeyType,
387{
388    let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
389        array.len(),
390        1024,
391        1024,
392    );
393    let string_view = array
394        .as_any()
395        .downcast_ref::<StringViewArray>()
396        .ok_or_else(|| {
397            ArrowError::ComputeError("Internal Error: Cannot cast to StringViewArray".to_string())
398        })?;
399    for v in string_view.iter() {
400        match v {
401            Some(v) => {
402                b.append(v)?;
403            }
404            None => {
405                b.append_null();
406            }
407        }
408    }
409
410    Ok(Arc::new(b.finish()))
411}
412
413pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
414    array: &dyn Array,
415) -> Result<ArrayRef, ArrowError>
416where
417    K: ArrowDictionaryKeyType,
418{
419    let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
420        array.len(),
421        1024,
422        1024,
423    );
424    let binary_view = array
425        .as_any()
426        .downcast_ref::<BinaryViewArray>()
427        .ok_or_else(|| {
428            ArrowError::ComputeError("Internal Error: Cannot cast to BinaryViewArray".to_string())
429        })?;
430    for v in binary_view.iter() {
431        match v {
432            Some(v) => {
433                b.append(v)?;
434            }
435            None => {
436                b.append_null();
437            }
438        }
439    }
440
441    Ok(Arc::new(b.finish()))
442}
443
444// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
445// key types of K
446pub(crate) fn pack_byte_to_dictionary<K, T>(
447    array: &dyn Array,
448    cast_options: &CastOptions,
449) -> Result<ArrayRef, ArrowError>
450where
451    K: ArrowDictionaryKeyType,
452    T: ByteArrayType,
453{
454    let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?;
455    let values = cast_values
456        .as_any()
457        .downcast_ref::<GenericByteArray<T>>()
458        .ok_or_else(|| {
459            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
460        })?;
461    let mut b = GenericByteDictionaryBuilder::<K, T>::with_capacity(values.len(), 1024, 1024);
462
463    // copy each element one at a time
464    for i in 0..values.len() {
465        if values.is_null(i) {
466            b.append_null();
467        } else {
468            b.append(values.value(i))?;
469        }
470    }
471    Ok(Arc::new(b.finish()))
472}
473
474// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
475// key types of K
476pub(crate) fn pack_byte_to_fixed_size_dictionary<K>(
477    array: &dyn Array,
478    cast_options: &CastOptions,
479    byte_width: i32,
480) -> Result<ArrayRef, ArrowError>
481where
482    K: ArrowDictionaryKeyType,
483{
484    let cast_values =
485        cast_with_options(array, &DataType::FixedSizeBinary(byte_width), cast_options)?;
486    let values = cast_values
487        .as_any()
488        .downcast_ref::<FixedSizeBinaryArray>()
489        .ok_or_else(|| {
490            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
491        })?;
492    let mut b = FixedSizeBinaryDictionaryBuilder::<K>::with_capacity(1024, 1024, byte_width);
493
494    // copy each element one at a time
495    for i in 0..values.len() {
496        if values.is_null(i) {
497            b.append_null();
498        } else {
499            b.append(values.value(i))?;
500        }
501    }
502    Ok(Arc::new(b.finish()))
503}