arrow_cast/cast/
dictionary.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::cast::*;
19
20/// Attempts to cast an `ArrayDictionary` with index type K into
21/// `to_type` for supported types.
22///
23/// K is the key type
24pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
25    array: &dyn Array,
26    to_type: &DataType,
27    cast_options: &CastOptions,
28) -> Result<ArrayRef, ArrowError> {
29    use DataType::*;
30
31    match to_type {
32        Dictionary(to_index_type, to_value_type) => {
33            let dict_array = array
34                .as_any()
35                .downcast_ref::<DictionaryArray<K>>()
36                .ok_or_else(|| {
37                    ArrowError::ComputeError(
38                        "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(),
39                    )
40                })?;
41
42            let keys_array: ArrayRef =
43                Arc::new(PrimitiveArray::<K>::from(dict_array.keys().to_data()));
44            let values_array = dict_array.values();
45            let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
46            let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
47
48            // Failure to cast keys (because they don't fit in the
49            // target type) results in NULL values;
50            if cast_keys.null_count() > keys_array.null_count() {
51                return Err(ArrowError::ComputeError(format!(
52                    "Could not convert {} dictionary indexes from {:?} to {:?}",
53                    cast_keys.null_count() - keys_array.null_count(),
54                    keys_array.data_type(),
55                    to_index_type
56                )));
57            }
58
59            let data = cast_keys.into_data();
60            let builder = data
61                .into_builder()
62                .data_type(to_type.clone())
63                .child_data(vec![cast_values.into_data()]);
64
65            // Safety
66            // Cast keys are still valid
67            let data = unsafe { builder.build_unchecked() };
68
69            // create the appropriate array type
70            let new_array: ArrayRef = match **to_index_type {
71                Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
72                Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
73                Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
74                Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
75                UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
76                UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
77                UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
78                UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
79                _ => {
80                    return Err(ArrowError::CastError(format!(
81                        "Unsupported type {to_index_type:?} for dictionary index"
82                    )));
83                }
84            };
85
86            Ok(new_array)
87        }
88        Utf8View => {
89            // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
90            // we handle it here to avoid the copy.
91            let dict_array = array
92                .as_dictionary::<K>()
93                .downcast_dict::<StringArray>()
94                .ok_or_else(|| {
95                    ArrowError::ComputeError(
96                        "Internal Error: Cannot cast Utf8View to StringArray of expected type"
97                            .to_string(),
98                    )
99                })?;
100
101            let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>(
102                dict_array.values(),
103                dict_array.keys(),
104            )?;
105            Ok(Arc::new(string_view))
106        }
107        BinaryView => {
108            // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
109            // we handle it here to avoid the copy.
110            let dict_array = array
111                .as_dictionary::<K>()
112                .downcast_dict::<BinaryArray>()
113                .ok_or_else(|| {
114                    ArrowError::ComputeError(
115                        "Internal Error: Cannot cast BinaryView to BinaryArray of expected type"
116                            .to_string(),
117                    )
118                })?;
119
120            let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>(
121                dict_array.values(),
122                dict_array.keys(),
123            )?;
124            Ok(Arc::new(binary_view))
125        }
126        _ => unpack_dictionary::<K>(array, to_type, cast_options),
127    }
128}
129
130fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>(
131    array: &GenericByteArray<V>,
132    keys: &PrimitiveArray<K>,
133) -> Result<GenericByteViewArray<T>, ArrowError> {
134    let value_buffer = array.values();
135    let value_offsets = array.value_offsets();
136    let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
137    builder.append_block(value_buffer.clone());
138    for i in keys.iter() {
139        match i {
140            Some(v) => {
141                let idx = v.to_usize().ok_or_else(|| {
142                    ArrowError::ComputeError("Invalid dictionary index".to_string())
143                })?;
144
145                // Safety
146                // (1) The index is within bounds as they are offsets
147                // (2) The append_view is safe
148                unsafe {
149                    let offset = value_offsets.get_unchecked(idx).as_usize();
150                    let end = value_offsets.get_unchecked(idx + 1).as_usize();
151                    let length = end - offset;
152                    builder.append_view_unchecked(0, offset as u32, length as u32)
153                }
154            }
155            None => {
156                builder.append_null();
157            }
158        }
159    }
160    Ok(builder.finish())
161}
162
163// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
164pub(crate) fn unpack_dictionary<K>(
165    array: &dyn Array,
166    to_type: &DataType,
167    cast_options: &CastOptions,
168) -> Result<ArrayRef, ArrowError>
169where
170    K: ArrowDictionaryKeyType,
171{
172    let dict_array = array.as_dictionary::<K>();
173    let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?;
174    take(cast_dict_values.as_ref(), dict_array.keys(), None)
175}
176
177/// Pack a data type into a dictionary array passing the values through a primitive array
178pub(crate) fn pack_array_to_dictionary_via_primitive<K: ArrowDictionaryKeyType>(
179    array: &dyn Array,
180    primitive_type: DataType,
181    dict_value_type: &DataType,
182    cast_options: &CastOptions,
183) -> Result<ArrayRef, ArrowError> {
184    let primitive = cast_with_options(array, &primitive_type, cast_options)?;
185    let dict = cast_with_options(
186        primitive.as_ref(),
187        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(primitive_type)),
188        cast_options,
189    )?;
190    cast_with_options(
191        dict.as_ref(),
192        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(dict_value_type.clone())),
193        cast_options,
194    )
195}
196
197/// Attempts to encode an array into an `ArrayDictionary` with index
198/// type K and value (dictionary) type value_type
199///
200/// K is the key type
201pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
202    array: &dyn Array,
203    dict_value_type: &DataType,
204    cast_options: &CastOptions,
205) -> Result<ArrayRef, ArrowError> {
206    use DataType::*;
207
208    match *dict_value_type {
209        Int8 => pack_numeric_to_dictionary::<K, Int8Type>(array, dict_value_type, cast_options),
210        Int16 => pack_numeric_to_dictionary::<K, Int16Type>(array, dict_value_type, cast_options),
211        Int32 => pack_numeric_to_dictionary::<K, Int32Type>(array, dict_value_type, cast_options),
212        Int64 => pack_numeric_to_dictionary::<K, Int64Type>(array, dict_value_type, cast_options),
213        UInt8 => pack_numeric_to_dictionary::<K, UInt8Type>(array, dict_value_type, cast_options),
214        UInt16 => pack_numeric_to_dictionary::<K, UInt16Type>(array, dict_value_type, cast_options),
215        UInt32 => pack_numeric_to_dictionary::<K, UInt32Type>(array, dict_value_type, cast_options),
216        UInt64 => pack_numeric_to_dictionary::<K, UInt64Type>(array, dict_value_type, cast_options),
217        Decimal32(p, s) => pack_decimal_to_dictionary::<K, Decimal32Type>(
218            array,
219            dict_value_type,
220            p,
221            s,
222            cast_options,
223        ),
224        Decimal64(p, s) => pack_decimal_to_dictionary::<K, Decimal64Type>(
225            array,
226            dict_value_type,
227            p,
228            s,
229            cast_options,
230        ),
231        Decimal128(p, s) => pack_decimal_to_dictionary::<K, Decimal128Type>(
232            array,
233            dict_value_type,
234            p,
235            s,
236            cast_options,
237        ),
238        Decimal256(p, s) => pack_decimal_to_dictionary::<K, Decimal256Type>(
239            array,
240            dict_value_type,
241            p,
242            s,
243            cast_options,
244        ),
245        Float16 => {
246            pack_numeric_to_dictionary::<K, Float16Type>(array, dict_value_type, cast_options)
247        }
248        Float32 => {
249            pack_numeric_to_dictionary::<K, Float32Type>(array, dict_value_type, cast_options)
250        }
251        Float64 => {
252            pack_numeric_to_dictionary::<K, Float64Type>(array, dict_value_type, cast_options)
253        }
254        Date32 => pack_array_to_dictionary_via_primitive::<K>(
255            array,
256            DataType::Int32,
257            dict_value_type,
258            cast_options,
259        ),
260        Date64 => pack_array_to_dictionary_via_primitive::<K>(
261            array,
262            DataType::Int64,
263            dict_value_type,
264            cast_options,
265        ),
266        Time32(_) => pack_array_to_dictionary_via_primitive::<K>(
267            array,
268            DataType::Int32,
269            dict_value_type,
270            cast_options,
271        ),
272        Time64(_) => pack_array_to_dictionary_via_primitive::<K>(
273            array,
274            DataType::Int64,
275            dict_value_type,
276            cast_options,
277        ),
278        Timestamp(_, _) => pack_array_to_dictionary_via_primitive::<K>(
279            array,
280            DataType::Int64,
281            dict_value_type,
282            cast_options,
283        ),
284        Utf8 => {
285            // If the input is a view type, we can avoid casting (thus copying) the data
286            if array.data_type() == &DataType::Utf8View {
287                return string_view_to_dictionary::<K, i32>(array);
288            }
289            pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
290        }
291        LargeUtf8 => {
292            // If the input is a view type, we can avoid casting (thus copying) the data
293            if array.data_type() == &DataType::Utf8View {
294                return string_view_to_dictionary::<K, i64>(array);
295            }
296            pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
297        }
298        Binary => {
299            // If the input is a view type, we can avoid casting (thus copying) the data
300            if array.data_type() == &DataType::BinaryView {
301                return binary_view_to_dictionary::<K, i32>(array);
302            }
303            pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
304        }
305        LargeBinary => {
306            // If the input is a view type, we can avoid casting (thus copying) the data
307            if array.data_type() == &DataType::BinaryView {
308                return binary_view_to_dictionary::<K, i64>(array);
309            }
310            pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
311        }
312        FixedSizeBinary(byte_size) => {
313            pack_byte_to_fixed_size_dictionary::<K>(array, cast_options, byte_size)
314        }
315        _ => Err(ArrowError::CastError(format!(
316            "Unsupported output type for dictionary packing: {dict_value_type:?}"
317        ))),
318    }
319}
320
321// Packs the data from the primitive array of type <V> to a
322// DictionaryArray with keys of type K and values of value_type V
323pub(crate) fn pack_numeric_to_dictionary<K, V>(
324    array: &dyn Array,
325    dict_value_type: &DataType,
326    cast_options: &CastOptions,
327) -> Result<ArrayRef, ArrowError>
328where
329    K: ArrowDictionaryKeyType,
330    V: ArrowPrimitiveType,
331{
332    // attempt to cast the source array values to the target value type (the dictionary values type)
333    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
334    let values = cast_values.as_primitive::<V>();
335
336    let mut b = PrimitiveDictionaryBuilder::<K, V>::with_capacity(values.len(), values.len());
337
338    // copy each element one at a time
339    for i in 0..values.len() {
340        if values.is_null(i) {
341            b.append_null();
342        } else {
343            b.append(values.value(i))?;
344        }
345    }
346    Ok(Arc::new(b.finish()))
347}
348
349pub(crate) fn pack_decimal_to_dictionary<K, D>(
350    array: &dyn Array,
351    dict_value_type: &DataType,
352    precision: u8,
353    scale: i8,
354    cast_options: &CastOptions,
355) -> Result<ArrayRef, ArrowError>
356where
357    K: ArrowDictionaryKeyType,
358    D: DecimalType + ArrowPrimitiveType,
359{
360    let dict = pack_numeric_to_dictionary::<K, D>(array, dict_value_type, cast_options)?;
361    let dict = dict
362        .as_dictionary::<K>()
363        .downcast_dict::<PrimitiveArray<D>>()
364        .ok_or_else(|| {
365            ArrowError::ComputeError(format!(
366                "Internal Error: Cannot cast dict to {}Array",
367                D::PREFIX
368            ))
369        })?;
370    let value = dict.values().clone();
371    // Set correct precision/scale
372    let value = value.with_precision_and_scale(precision, scale)?;
373    Ok(Arc::new(DictionaryArray::<K>::try_new(
374        dict.keys().clone(),
375        Arc::new(value),
376    )?))
377}
378
379pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
380    array: &dyn Array,
381) -> Result<ArrayRef, ArrowError>
382where
383    K: ArrowDictionaryKeyType,
384{
385    let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
386        array.len(),
387        1024,
388        1024,
389    );
390    let string_view = array
391        .as_any()
392        .downcast_ref::<StringViewArray>()
393        .ok_or_else(|| {
394            ArrowError::ComputeError("Internal Error: Cannot cast to StringViewArray".to_string())
395        })?;
396    for v in string_view.iter() {
397        match v {
398            Some(v) => {
399                b.append(v)?;
400            }
401            None => {
402                b.append_null();
403            }
404        }
405    }
406
407    Ok(Arc::new(b.finish()))
408}
409
410pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
411    array: &dyn Array,
412) -> Result<ArrayRef, ArrowError>
413where
414    K: ArrowDictionaryKeyType,
415{
416    let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
417        array.len(),
418        1024,
419        1024,
420    );
421    let binary_view = array
422        .as_any()
423        .downcast_ref::<BinaryViewArray>()
424        .ok_or_else(|| {
425            ArrowError::ComputeError("Internal Error: Cannot cast to BinaryViewArray".to_string())
426        })?;
427    for v in binary_view.iter() {
428        match v {
429            Some(v) => {
430                b.append(v)?;
431            }
432            None => {
433                b.append_null();
434            }
435        }
436    }
437
438    Ok(Arc::new(b.finish()))
439}
440
441// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
442// key types of K
443pub(crate) fn pack_byte_to_dictionary<K, T>(
444    array: &dyn Array,
445    cast_options: &CastOptions,
446) -> Result<ArrayRef, ArrowError>
447where
448    K: ArrowDictionaryKeyType,
449    T: ByteArrayType,
450{
451    let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?;
452    let values = cast_values
453        .as_any()
454        .downcast_ref::<GenericByteArray<T>>()
455        .ok_or_else(|| {
456            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
457        })?;
458    let mut b = GenericByteDictionaryBuilder::<K, T>::with_capacity(values.len(), 1024, 1024);
459
460    // copy each element one at a time
461    for i in 0..values.len() {
462        if values.is_null(i) {
463            b.append_null();
464        } else {
465            b.append(values.value(i))?;
466        }
467    }
468    Ok(Arc::new(b.finish()))
469}
470
471// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
472// key types of K
473pub(crate) fn pack_byte_to_fixed_size_dictionary<K>(
474    array: &dyn Array,
475    cast_options: &CastOptions,
476    byte_width: i32,
477) -> Result<ArrayRef, ArrowError>
478where
479    K: ArrowDictionaryKeyType,
480{
481    let cast_values =
482        cast_with_options(array, &DataType::FixedSizeBinary(byte_width), cast_options)?;
483    let values = cast_values
484        .as_any()
485        .downcast_ref::<FixedSizeBinaryArray>()
486        .ok_or_else(|| {
487            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
488        })?;
489    let mut b = FixedSizeBinaryDictionaryBuilder::<K>::with_capacity(1024, 1024, byte_width);
490
491    // copy each element one at a time
492    for i in 0..values.len() {
493        if values.is_null(i) {
494            b.append_null();
495        } else {
496            b.append(values.value(i))?;
497        }
498    }
499    Ok(Arc::new(b.finish()))
500}