arrow_cast/cast/
dictionary.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::cast::*;
19
20/// Attempts to cast an `ArrayDictionary` with index type K into
21/// `to_type` for supported types.
22///
23/// K is the key type
24pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
25    array: &dyn Array,
26    to_type: &DataType,
27    cast_options: &CastOptions,
28) -> Result<ArrayRef, ArrowError> {
29    use DataType::*;
30
31    match to_type {
32        Dictionary(to_index_type, to_value_type) => {
33            let dict_array = array
34                .as_any()
35                .downcast_ref::<DictionaryArray<K>>()
36                .ok_or_else(|| {
37                    ArrowError::ComputeError(
38                        "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(),
39                    )
40                })?;
41
42            let keys_array: ArrayRef =
43                Arc::new(PrimitiveArray::<K>::from(dict_array.keys().to_data()));
44            let values_array = dict_array.values();
45            let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
46            let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
47
48            // Failure to cast keys (because they don't fit in the
49            // target type) results in NULL values;
50            if cast_keys.null_count() > keys_array.null_count() {
51                return Err(ArrowError::ComputeError(format!(
52                    "Could not convert {} dictionary indexes from {:?} to {:?}",
53                    cast_keys.null_count() - keys_array.null_count(),
54                    keys_array.data_type(),
55                    to_index_type
56                )));
57            }
58
59            let data = cast_keys.into_data();
60            let builder = data
61                .into_builder()
62                .data_type(to_type.clone())
63                .child_data(vec![cast_values.into_data()]);
64
65            // Safety
66            // Cast keys are still valid
67            let data = unsafe { builder.build_unchecked() };
68
69            // create the appropriate array type
70            let new_array: ArrayRef = match **to_index_type {
71                Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
72                Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
73                Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
74                Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
75                UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
76                UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
77                UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
78                UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
79                _ => {
80                    return Err(ArrowError::CastError(format!(
81                        "Unsupported type {to_index_type:?} for dictionary index"
82                    )));
83                }
84            };
85
86            Ok(new_array)
87        }
88        Utf8View => {
89            // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
90            // we handle it here to avoid the copy.
91            let dict_array = array
92                .as_dictionary::<K>()
93                .downcast_dict::<StringArray>()
94                .ok_or_else(|| {
95                    ArrowError::ComputeError(
96                        "Internal Error: Cannot cast Utf8View to StringArray of expected type"
97                            .to_string(),
98                    )
99                })?;
100
101            let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>(
102                dict_array.values(),
103                dict_array.keys(),
104            )?;
105            Ok(Arc::new(string_view))
106        }
107        BinaryView => {
108            // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
109            // we handle it here to avoid the copy.
110            let dict_array = array
111                .as_dictionary::<K>()
112                .downcast_dict::<BinaryArray>()
113                .ok_or_else(|| {
114                    ArrowError::ComputeError(
115                        "Internal Error: Cannot cast BinaryView to BinaryArray of expected type"
116                            .to_string(),
117                    )
118                })?;
119
120            let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>(
121                dict_array.values(),
122                dict_array.keys(),
123            )?;
124            Ok(Arc::new(binary_view))
125        }
126        _ => unpack_dictionary::<K>(array, to_type, cast_options),
127    }
128}
129
130fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>(
131    array: &GenericByteArray<V>,
132    keys: &PrimitiveArray<K>,
133) -> Result<GenericByteViewArray<T>, ArrowError> {
134    let value_buffer = array.values();
135    let value_offsets = array.value_offsets();
136    let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
137    builder.append_block(value_buffer.clone());
138    for i in keys.iter() {
139        match i {
140            Some(v) => {
141                let idx = v.to_usize().ok_or_else(|| {
142                    ArrowError::ComputeError("Invalid dictionary index".to_string())
143                })?;
144
145                // Safety
146                // (1) The index is within bounds as they are offsets
147                // (2) The append_view is safe
148                unsafe {
149                    let offset = value_offsets.get_unchecked(idx).as_usize();
150                    let end = value_offsets.get_unchecked(idx + 1).as_usize();
151                    let length = end - offset;
152                    builder.append_view_unchecked(0, offset as u32, length as u32)
153                }
154            }
155            None => {
156                builder.append_null();
157            }
158        }
159    }
160    Ok(builder.finish())
161}
162
163// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
164pub(crate) fn unpack_dictionary<K>(
165    array: &dyn Array,
166    to_type: &DataType,
167    cast_options: &CastOptions,
168) -> Result<ArrayRef, ArrowError>
169where
170    K: ArrowDictionaryKeyType,
171{
172    let dict_array = array.as_dictionary::<K>();
173    let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?;
174    take(cast_dict_values.as_ref(), dict_array.keys(), None)
175}
176
177/// Pack a data type into a dictionary array passing the values through a primitive array
178pub(crate) fn pack_array_to_dictionary_via_primitive<K: ArrowDictionaryKeyType>(
179    array: &dyn Array,
180    primitive_type: DataType,
181    dict_value_type: &DataType,
182    cast_options: &CastOptions,
183) -> Result<ArrayRef, ArrowError> {
184    let primitive = cast_with_options(array, &primitive_type, cast_options)?;
185    let dict = cast_with_options(
186        primitive.as_ref(),
187        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(primitive_type)),
188        cast_options,
189    )?;
190    cast_with_options(
191        dict.as_ref(),
192        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(dict_value_type.clone())),
193        cast_options,
194    )
195}
196
197/// Attempts to encode an array into an `ArrayDictionary` with index
198/// type K and value (dictionary) type value_type
199///
200/// K is the key type
201pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
202    array: &dyn Array,
203    dict_value_type: &DataType,
204    cast_options: &CastOptions,
205) -> Result<ArrayRef, ArrowError> {
206    use DataType::*;
207
208    match *dict_value_type {
209        Int8 => pack_numeric_to_dictionary::<K, Int8Type>(array, dict_value_type, cast_options),
210        Int16 => pack_numeric_to_dictionary::<K, Int16Type>(array, dict_value_type, cast_options),
211        Int32 => pack_numeric_to_dictionary::<K, Int32Type>(array, dict_value_type, cast_options),
212        Int64 => pack_numeric_to_dictionary::<K, Int64Type>(array, dict_value_type, cast_options),
213        UInt8 => pack_numeric_to_dictionary::<K, UInt8Type>(array, dict_value_type, cast_options),
214        UInt16 => pack_numeric_to_dictionary::<K, UInt16Type>(array, dict_value_type, cast_options),
215        UInt32 => pack_numeric_to_dictionary::<K, UInt32Type>(array, dict_value_type, cast_options),
216        UInt64 => pack_numeric_to_dictionary::<K, UInt64Type>(array, dict_value_type, cast_options),
217        Decimal128(p, s) => pack_decimal_to_dictionary::<K, Decimal128Type>(
218            array,
219            dict_value_type,
220            p,
221            s,
222            cast_options,
223        ),
224        Decimal256(p, s) => pack_decimal_to_dictionary::<K, Decimal256Type>(
225            array,
226            dict_value_type,
227            p,
228            s,
229            cast_options,
230        ),
231        Float16 => {
232            pack_numeric_to_dictionary::<K, Float16Type>(array, dict_value_type, cast_options)
233        }
234        Float32 => {
235            pack_numeric_to_dictionary::<K, Float32Type>(array, dict_value_type, cast_options)
236        }
237        Float64 => {
238            pack_numeric_to_dictionary::<K, Float64Type>(array, dict_value_type, cast_options)
239        }
240        Date32 => pack_array_to_dictionary_via_primitive::<K>(
241            array,
242            DataType::Int32,
243            dict_value_type,
244            cast_options,
245        ),
246        Date64 => pack_array_to_dictionary_via_primitive::<K>(
247            array,
248            DataType::Int64,
249            dict_value_type,
250            cast_options,
251        ),
252        Time32(_) => pack_array_to_dictionary_via_primitive::<K>(
253            array,
254            DataType::Int32,
255            dict_value_type,
256            cast_options,
257        ),
258        Time64(_) => pack_array_to_dictionary_via_primitive::<K>(
259            array,
260            DataType::Int64,
261            dict_value_type,
262            cast_options,
263        ),
264        Timestamp(_, _) => pack_array_to_dictionary_via_primitive::<K>(
265            array,
266            DataType::Int64,
267            dict_value_type,
268            cast_options,
269        ),
270        Utf8 => {
271            // If the input is a view type, we can avoid casting (thus copying) the data
272            if array.data_type() == &DataType::Utf8View {
273                return string_view_to_dictionary::<K, i32>(array);
274            }
275            pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
276        }
277        LargeUtf8 => {
278            // If the input is a view type, we can avoid casting (thus copying) the data
279            if array.data_type() == &DataType::Utf8View {
280                return string_view_to_dictionary::<K, i64>(array);
281            }
282            pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
283        }
284        Binary => {
285            // If the input is a view type, we can avoid casting (thus copying) the data
286            if array.data_type() == &DataType::BinaryView {
287                return binary_view_to_dictionary::<K, i32>(array);
288            }
289            pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
290        }
291        LargeBinary => {
292            // If the input is a view type, we can avoid casting (thus copying) the data
293            if array.data_type() == &DataType::BinaryView {
294                return binary_view_to_dictionary::<K, i64>(array);
295            }
296            pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
297        }
298        FixedSizeBinary(byte_size) => {
299            pack_byte_to_fixed_size_dictionary::<K>(array, cast_options, byte_size)
300        }
301        _ => Err(ArrowError::CastError(format!(
302            "Unsupported output type for dictionary packing: {dict_value_type:?}"
303        ))),
304    }
305}
306
307// Packs the data from the primitive array of type <V> to a
308// DictionaryArray with keys of type K and values of value_type V
309pub(crate) fn pack_numeric_to_dictionary<K, V>(
310    array: &dyn Array,
311    dict_value_type: &DataType,
312    cast_options: &CastOptions,
313) -> Result<ArrayRef, ArrowError>
314where
315    K: ArrowDictionaryKeyType,
316    V: ArrowPrimitiveType,
317{
318    // attempt to cast the source array values to the target value type (the dictionary values type)
319    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
320    let values = cast_values.as_primitive::<V>();
321
322    let mut b = PrimitiveDictionaryBuilder::<K, V>::with_capacity(values.len(), values.len());
323
324    // copy each element one at a time
325    for i in 0..values.len() {
326        if values.is_null(i) {
327            b.append_null();
328        } else {
329            b.append(values.value(i))?;
330        }
331    }
332    Ok(Arc::new(b.finish()))
333}
334
335pub(crate) fn pack_decimal_to_dictionary<K, D>(
336    array: &dyn Array,
337    dict_value_type: &DataType,
338    precision: u8,
339    scale: i8,
340    cast_options: &CastOptions,
341) -> Result<ArrayRef, ArrowError>
342where
343    K: ArrowDictionaryKeyType,
344    D: DecimalType + ArrowPrimitiveType,
345{
346    let dict = pack_numeric_to_dictionary::<K, D>(array, dict_value_type, cast_options)?;
347    let dict = dict
348        .as_dictionary::<K>()
349        .downcast_dict::<PrimitiveArray<D>>()
350        .ok_or_else(|| {
351            ArrowError::ComputeError(format!(
352                "Internal Error: Cannot cast dict to {}Array",
353                D::PREFIX
354            ))
355        })?;
356    let value = dict.values().clone();
357    // Set correct precision/scale
358    let value = value.with_precision_and_scale(precision, scale)?;
359    Ok(Arc::new(DictionaryArray::<K>::try_new(
360        dict.keys().clone(),
361        Arc::new(value),
362    )?))
363}
364
365pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
366    array: &dyn Array,
367) -> Result<ArrayRef, ArrowError>
368where
369    K: ArrowDictionaryKeyType,
370{
371    let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
372        array.len(),
373        1024,
374        1024,
375    );
376    let string_view = array
377        .as_any()
378        .downcast_ref::<StringViewArray>()
379        .ok_or_else(|| {
380            ArrowError::ComputeError("Internal Error: Cannot cast to StringViewArray".to_string())
381        })?;
382    for v in string_view.iter() {
383        match v {
384            Some(v) => {
385                b.append(v)?;
386            }
387            None => {
388                b.append_null();
389            }
390        }
391    }
392
393    Ok(Arc::new(b.finish()))
394}
395
396pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
397    array: &dyn Array,
398) -> Result<ArrayRef, ArrowError>
399where
400    K: ArrowDictionaryKeyType,
401{
402    let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
403        array.len(),
404        1024,
405        1024,
406    );
407    let binary_view = array
408        .as_any()
409        .downcast_ref::<BinaryViewArray>()
410        .ok_or_else(|| {
411            ArrowError::ComputeError("Internal Error: Cannot cast to BinaryViewArray".to_string())
412        })?;
413    for v in binary_view.iter() {
414        match v {
415            Some(v) => {
416                b.append(v)?;
417            }
418            None => {
419                b.append_null();
420            }
421        }
422    }
423
424    Ok(Arc::new(b.finish()))
425}
426
427// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
428// key types of K
429pub(crate) fn pack_byte_to_dictionary<K, T>(
430    array: &dyn Array,
431    cast_options: &CastOptions,
432) -> Result<ArrayRef, ArrowError>
433where
434    K: ArrowDictionaryKeyType,
435    T: ByteArrayType,
436{
437    let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?;
438    let values = cast_values
439        .as_any()
440        .downcast_ref::<GenericByteArray<T>>()
441        .ok_or_else(|| {
442            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
443        })?;
444    let mut b = GenericByteDictionaryBuilder::<K, T>::with_capacity(values.len(), 1024, 1024);
445
446    // copy each element one at a time
447    for i in 0..values.len() {
448        if values.is_null(i) {
449            b.append_null();
450        } else {
451            b.append(values.value(i))?;
452        }
453    }
454    Ok(Arc::new(b.finish()))
455}
456
457// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
458// key types of K
459pub(crate) fn pack_byte_to_fixed_size_dictionary<K>(
460    array: &dyn Array,
461    cast_options: &CastOptions,
462    byte_width: i32,
463) -> Result<ArrayRef, ArrowError>
464where
465    K: ArrowDictionaryKeyType,
466{
467    let cast_values =
468        cast_with_options(array, &DataType::FixedSizeBinary(byte_width), cast_options)?;
469    let values = cast_values
470        .as_any()
471        .downcast_ref::<FixedSizeBinaryArray>()
472        .ok_or_else(|| {
473            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
474        })?;
475    let mut b = FixedSizeBinaryDictionaryBuilder::<K>::with_capacity(1024, 1024, byte_width);
476
477    // copy each element one at a time
478    for i in 0..values.len() {
479        if values.is_null(i) {
480            b.append_null();
481        } else {
482            b.append(values.value(i))?;
483        }
484    }
485    Ok(Arc::new(b.finish()))
486}