arrow_cast/cast/
dictionary.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::cast::*;
19
20/// Attempts to cast an `ArrayDictionary` with index type K into
21/// `to_type` for supported types.
22///
23/// K is the key type
24pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
25    array: &dyn Array,
26    to_type: &DataType,
27    cast_options: &CastOptions,
28) -> Result<ArrayRef, ArrowError> {
29    use DataType::*;
30
31    let array = array.as_dictionary::<K>();
32    let from_child_type = array.values().data_type();
33    match (from_child_type, to_type) {
34        (_, Dictionary(to_index_type, to_value_type)) => {
35            dictionary_to_dictionary_cast(array, to_index_type, to_value_type, cast_options)
36        }
37        // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data
38        // copy of the value buffer. Fast path which avoids copying underlying values buffer.
39        // TODO: handle LargeUtf8/LargeBinary -> View (need to check offsets can fit)
40        // TODO: handle cross types (String -> BinaryView, Binary -> StringView)
41        //       (need to validate utf8?)
42        (Utf8, Utf8View) => view_from_dict_values::<K, Utf8Type, StringViewType>(
43            array.keys(),
44            array.values().as_string::<i32>(),
45        ),
46        (Binary, BinaryView) => view_from_dict_values::<K, BinaryType, BinaryViewType>(
47            array.keys(),
48            array.values().as_binary::<i32>(),
49        ),
50        _ => unpack_dictionary(array, to_type, cast_options),
51    }
52}
53
54fn dictionary_to_dictionary_cast<K: ArrowDictionaryKeyType>(
55    array: &DictionaryArray<K>,
56    to_index_type: &DataType,
57    to_value_type: &DataType,
58    cast_options: &CastOptions,
59) -> Result<ArrayRef, ArrowError> {
60    use DataType::*;
61
62    let keys_array: ArrayRef = Arc::new(PrimitiveArray::<K>::from(array.keys().to_data()));
63    let values_array = array.values();
64    let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
65    let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
66
67    // Failure to cast keys (because they don't fit in the
68    // target type) results in NULL values;
69    if cast_keys.null_count() > keys_array.null_count() {
70        return Err(ArrowError::ComputeError(format!(
71            "Could not convert {} dictionary indexes from {:?} to {:?}",
72            cast_keys.null_count() - keys_array.null_count(),
73            keys_array.data_type(),
74            to_index_type
75        )));
76    }
77
78    let data = cast_keys.into_data();
79    let builder = data
80        .into_builder()
81        .data_type(Dictionary(
82            Box::new(to_index_type.clone()),
83            Box::new(to_value_type.clone()),
84        ))
85        .child_data(vec![cast_values.into_data()]);
86
87    // Safety
88    // Cast keys are still valid
89    let data = unsafe { builder.build_unchecked() };
90
91    // create the appropriate array type
92    let new_array: ArrayRef = match to_index_type {
93        Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
94        Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
95        Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
96        Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
97        UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
98        UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
99        UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
100        UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
101        _ => {
102            return Err(ArrowError::CastError(format!(
103                "Unsupported type {to_index_type} for dictionary index"
104            )));
105        }
106    };
107
108    Ok(new_array)
109}
110
111fn view_from_dict_values<K: ArrowDictionaryKeyType, V: ByteArrayType, T: ByteViewType>(
112    keys: &PrimitiveArray<K>,
113    values: &GenericByteArray<V>,
114) -> Result<ArrayRef, ArrowError> {
115    let value_buffer = values.values();
116    let value_offsets = values.value_offsets();
117    let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
118    builder.append_block(value_buffer.clone());
119    for i in keys.iter() {
120        match i {
121            Some(v) => {
122                let idx = v.to_usize().ok_or_else(|| {
123                    ArrowError::ComputeError("Invalid dictionary index".to_string())
124                })?;
125
126                // Safety
127                // (1) The index is within bounds as they are offsets
128                // (2) The append_view is safe
129                unsafe {
130                    let offset = value_offsets.get_unchecked(idx).as_usize();
131                    let end = value_offsets.get_unchecked(idx + 1).as_usize();
132                    let length = end - offset;
133                    builder.append_view_unchecked(0, offset as u32, length as u32)
134                }
135            }
136            None => {
137                builder.append_null();
138            }
139        }
140    }
141    Ok(Arc::new(builder.finish()))
142}
143
144// Unpack a dictionary into a flattened array of type to_type
145pub(crate) fn unpack_dictionary<K: ArrowDictionaryKeyType>(
146    array: &DictionaryArray<K>,
147    to_type: &DataType,
148    cast_options: &CastOptions,
149) -> Result<ArrayRef, ArrowError> {
150    let cast_dict_values = cast_with_options(array.values(), to_type, cast_options)?;
151    take(cast_dict_values.as_ref(), array.keys(), None)
152}
153
154/// Pack a data type into a dictionary array passing the values through a primitive array
155pub(crate) fn pack_array_to_dictionary_via_primitive<K: ArrowDictionaryKeyType>(
156    array: &dyn Array,
157    primitive_type: DataType,
158    dict_value_type: &DataType,
159    cast_options: &CastOptions,
160) -> Result<ArrayRef, ArrowError> {
161    let primitive = cast_with_options(array, &primitive_type, cast_options)?;
162    let dict = cast_with_options(
163        primitive.as_ref(),
164        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(primitive_type)),
165        cast_options,
166    )?;
167    cast_with_options(
168        dict.as_ref(),
169        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(dict_value_type.clone())),
170        cast_options,
171    )
172}
173
174/// Attempts to encode an array into an `ArrayDictionary` with index
175/// type K and value (dictionary) type value_type
176///
177/// K is the key type
178pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
179    array: &dyn Array,
180    dict_value_type: &DataType,
181    cast_options: &CastOptions,
182) -> Result<ArrayRef, ArrowError> {
183    use DataType::*;
184
185    match *dict_value_type {
186        Int8 => pack_numeric_to_dictionary::<K, Int8Type>(array, dict_value_type, cast_options),
187        Int16 => pack_numeric_to_dictionary::<K, Int16Type>(array, dict_value_type, cast_options),
188        Int32 => pack_numeric_to_dictionary::<K, Int32Type>(array, dict_value_type, cast_options),
189        Int64 => pack_numeric_to_dictionary::<K, Int64Type>(array, dict_value_type, cast_options),
190        UInt8 => pack_numeric_to_dictionary::<K, UInt8Type>(array, dict_value_type, cast_options),
191        UInt16 => pack_numeric_to_dictionary::<K, UInt16Type>(array, dict_value_type, cast_options),
192        UInt32 => pack_numeric_to_dictionary::<K, UInt32Type>(array, dict_value_type, cast_options),
193        UInt64 => pack_numeric_to_dictionary::<K, UInt64Type>(array, dict_value_type, cast_options),
194        Decimal32(p, s) => pack_decimal_to_dictionary::<K, Decimal32Type>(
195            array,
196            dict_value_type,
197            p,
198            s,
199            cast_options,
200        ),
201        Decimal64(p, s) => pack_decimal_to_dictionary::<K, Decimal64Type>(
202            array,
203            dict_value_type,
204            p,
205            s,
206            cast_options,
207        ),
208        Decimal128(p, s) => pack_decimal_to_dictionary::<K, Decimal128Type>(
209            array,
210            dict_value_type,
211            p,
212            s,
213            cast_options,
214        ),
215        Decimal256(p, s) => pack_decimal_to_dictionary::<K, Decimal256Type>(
216            array,
217            dict_value_type,
218            p,
219            s,
220            cast_options,
221        ),
222        Float16 => {
223            pack_numeric_to_dictionary::<K, Float16Type>(array, dict_value_type, cast_options)
224        }
225        Float32 => {
226            pack_numeric_to_dictionary::<K, Float32Type>(array, dict_value_type, cast_options)
227        }
228        Float64 => {
229            pack_numeric_to_dictionary::<K, Float64Type>(array, dict_value_type, cast_options)
230        }
231        Date32 => pack_array_to_dictionary_via_primitive::<K>(
232            array,
233            DataType::Int32,
234            dict_value_type,
235            cast_options,
236        ),
237        Date64 => pack_array_to_dictionary_via_primitive::<K>(
238            array,
239            DataType::Int64,
240            dict_value_type,
241            cast_options,
242        ),
243        Time32(_) => pack_array_to_dictionary_via_primitive::<K>(
244            array,
245            DataType::Int32,
246            dict_value_type,
247            cast_options,
248        ),
249        Time64(_) => pack_array_to_dictionary_via_primitive::<K>(
250            array,
251            DataType::Int64,
252            dict_value_type,
253            cast_options,
254        ),
255        Timestamp(_, _) => pack_array_to_dictionary_via_primitive::<K>(
256            array,
257            DataType::Int64,
258            dict_value_type,
259            cast_options,
260        ),
261        Utf8 => {
262            // If the input is a view type, we can avoid casting (thus copying) the data
263            if array.data_type() == &DataType::Utf8View {
264                return string_view_to_dictionary::<K, i32>(array);
265            }
266            pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
267        }
268        LargeUtf8 => {
269            // If the input is a view type, we can avoid casting (thus copying) the data
270            if array.data_type() == &DataType::Utf8View {
271                return string_view_to_dictionary::<K, i64>(array);
272            }
273            pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
274        }
275        Binary => {
276            // If the input is a view type, we can avoid casting (thus copying) the data
277            if array.data_type() == &DataType::BinaryView {
278                return binary_view_to_dictionary::<K, i32>(array);
279            }
280            pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
281        }
282        LargeBinary => {
283            // If the input is a view type, we can avoid casting (thus copying) the data
284            if array.data_type() == &DataType::BinaryView {
285                return binary_view_to_dictionary::<K, i64>(array);
286            }
287            pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
288        }
289        FixedSizeBinary(byte_size) => {
290            pack_byte_to_fixed_size_dictionary::<K>(array, cast_options, byte_size)
291        }
292        _ => Err(ArrowError::CastError(format!(
293            "Unsupported output type for dictionary packing: {dict_value_type}"
294        ))),
295    }
296}
297
298// Packs the data from the primitive array of type <V> to a
299// DictionaryArray with keys of type K and values of value_type V
300pub(crate) fn pack_numeric_to_dictionary<K, V>(
301    array: &dyn Array,
302    dict_value_type: &DataType,
303    cast_options: &CastOptions,
304) -> Result<ArrayRef, ArrowError>
305where
306    K: ArrowDictionaryKeyType,
307    V: ArrowPrimitiveType,
308{
309    // attempt to cast the source array values to the target value type (the dictionary values type)
310    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
311    let values = cast_values.as_primitive::<V>();
312
313    let mut b = PrimitiveDictionaryBuilder::<K, V>::with_capacity(values.len(), values.len());
314
315    // copy each element one at a time
316    for i in 0..values.len() {
317        if values.is_null(i) {
318            b.append_null();
319        } else {
320            b.append(values.value(i))?;
321        }
322    }
323    Ok(Arc::new(b.finish()))
324}
325
326pub(crate) fn pack_decimal_to_dictionary<K, D>(
327    array: &dyn Array,
328    dict_value_type: &DataType,
329    precision: u8,
330    scale: i8,
331    cast_options: &CastOptions,
332) -> Result<ArrayRef, ArrowError>
333where
334    K: ArrowDictionaryKeyType,
335    D: DecimalType + ArrowPrimitiveType,
336{
337    let dict = pack_numeric_to_dictionary::<K, D>(array, dict_value_type, cast_options)?;
338    let dict = dict
339        .as_dictionary::<K>()
340        .downcast_dict::<PrimitiveArray<D>>()
341        .ok_or_else(|| {
342            ArrowError::ComputeError(format!(
343                "Internal Error: Cannot cast dict to {}Array",
344                D::PREFIX
345            ))
346        })?;
347    let value = dict.values().clone();
348    // Set correct precision/scale
349    let value = value.with_precision_and_scale(precision, scale)?;
350    Ok(Arc::new(DictionaryArray::<K>::try_new(
351        dict.keys().clone(),
352        Arc::new(value),
353    )?))
354}
355
356pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
357    array: &dyn Array,
358) -> Result<ArrayRef, ArrowError>
359where
360    K: ArrowDictionaryKeyType,
361{
362    let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
363        array.len(),
364        1024,
365        1024,
366    );
367    let string_view = array
368        .as_any()
369        .downcast_ref::<StringViewArray>()
370        .ok_or_else(|| {
371            ArrowError::ComputeError("Internal Error: Cannot cast to StringViewArray".to_string())
372        })?;
373    for v in string_view.iter() {
374        match v {
375            Some(v) => {
376                b.append(v)?;
377            }
378            None => {
379                b.append_null();
380            }
381        }
382    }
383
384    Ok(Arc::new(b.finish()))
385}
386
387pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
388    array: &dyn Array,
389) -> Result<ArrayRef, ArrowError>
390where
391    K: ArrowDictionaryKeyType,
392{
393    let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
394        array.len(),
395        1024,
396        1024,
397    );
398    let binary_view = array
399        .as_any()
400        .downcast_ref::<BinaryViewArray>()
401        .ok_or_else(|| {
402            ArrowError::ComputeError("Internal Error: Cannot cast to BinaryViewArray".to_string())
403        })?;
404    for v in binary_view.iter() {
405        match v {
406            Some(v) => {
407                b.append(v)?;
408            }
409            None => {
410                b.append_null();
411            }
412        }
413    }
414
415    Ok(Arc::new(b.finish()))
416}
417
418// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
419// key types of K
420pub(crate) fn pack_byte_to_dictionary<K, T>(
421    array: &dyn Array,
422    cast_options: &CastOptions,
423) -> Result<ArrayRef, ArrowError>
424where
425    K: ArrowDictionaryKeyType,
426    T: ByteArrayType,
427{
428    let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?;
429    let values = cast_values
430        .as_any()
431        .downcast_ref::<GenericByteArray<T>>()
432        .ok_or_else(|| {
433            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
434        })?;
435    let mut b = GenericByteDictionaryBuilder::<K, T>::with_capacity(values.len(), 1024, 1024);
436
437    // copy each element one at a time
438    for i in 0..values.len() {
439        if values.is_null(i) {
440            b.append_null();
441        } else {
442            b.append(values.value(i))?;
443        }
444    }
445    Ok(Arc::new(b.finish()))
446}
447
448// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
449// key types of K
450pub(crate) fn pack_byte_to_fixed_size_dictionary<K>(
451    array: &dyn Array,
452    cast_options: &CastOptions,
453    byte_width: i32,
454) -> Result<ArrayRef, ArrowError>
455where
456    K: ArrowDictionaryKeyType,
457{
458    let cast_values =
459        cast_with_options(array, &DataType::FixedSizeBinary(byte_width), cast_options)?;
460    let values = cast_values
461        .as_any()
462        .downcast_ref::<FixedSizeBinaryArray>()
463        .ok_or_else(|| {
464            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
465        })?;
466    let mut b = FixedSizeBinaryDictionaryBuilder::<K>::with_capacity(1024, 1024, byte_width);
467
468    // copy each element one at a time
469    for i in 0..values.len() {
470        if values.is_null(i) {
471            b.append_null();
472        } else {
473            b.append(values.value(i))?;
474        }
475    }
476    Ok(Arc::new(b.finish()))
477}