Skip to main content

arrow_cast/cast/
dictionary.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::cast::*;
19
20/// Attempts to cast an `ArrayDictionary` with index type K into
21/// `to_type` for supported types.
22///
23/// K is the key type
24pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
25    array: &dyn Array,
26    to_type: &DataType,
27    cast_options: &CastOptions,
28) -> Result<ArrayRef, ArrowError> {
29    use DataType::*;
30
31    let array = array.as_dictionary::<K>();
32    let from_child_type = array.values().data_type();
33    match (from_child_type, to_type) {
34        (_, Dictionary(to_index_type, to_value_type)) => {
35            dictionary_to_dictionary_cast(array, to_index_type, to_value_type, cast_options)
36        }
37        // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data
38        // copy of the value buffer. Fast path which avoids copying underlying values buffer.
39        // TODO: handle LargeUtf8/LargeBinary -> View (need to check offsets can fit)
40        // TODO: handle cross types (String -> BinaryView, Binary -> StringView)
41        //       (need to validate utf8?)
42        (Utf8, Utf8View) => view_from_dict_values::<K, Utf8Type, StringViewType>(
43            array.keys(),
44            array.values().as_string::<i32>(),
45        ),
46        (Binary, BinaryView) => view_from_dict_values::<K, BinaryType, BinaryViewType>(
47            array.keys(),
48            array.values().as_binary::<i32>(),
49        ),
50        _ => unpack_dictionary(array, to_type, cast_options),
51    }
52}
53
54fn dictionary_to_dictionary_cast<K: ArrowDictionaryKeyType>(
55    array: &DictionaryArray<K>,
56    to_index_type: &DataType,
57    to_value_type: &DataType,
58    cast_options: &CastOptions,
59) -> Result<ArrayRef, ArrowError> {
60    use DataType::*;
61
62    // Fast path for a nested dictionary source (`Dictionary<K, Dictionary<K2, V>>`).
63    // Both layers index into the same inner values, so the two index layers can
64    // be composed into one rather than materializing the values: `take` gathers
65    // the inner keys through the outer keys and reuses the inner values buffer
66    // untouched, so no value data is rewritten. The flattened single-level
67    // dictionary is then cast to the requested index/value types.
68    if matches!(array.values().data_type(), Dictionary(_, _)) {
69        let flattened = take(array.values().as_ref(), array.keys(), None)?;
70        return cast_with_options(
71            &flattened,
72            &Dictionary(
73                Box::new(to_index_type.clone()),
74                Box::new(to_value_type.clone()),
75            ),
76            cast_options,
77        );
78    }
79
80    let keys_array: ArrayRef = Arc::new(PrimitiveArray::<K>::from(array.keys().to_data()));
81    let values_array = array.values();
82    let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
83    let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
84
85    // Failure to cast keys (because they don't fit in the
86    // target type) results in NULL values;
87    if cast_keys.null_count() > keys_array.null_count() {
88        return Err(ArrowError::ComputeError(format!(
89            "Could not convert {} dictionary indexes from {:?} to {:?}",
90            cast_keys.null_count() - keys_array.null_count(),
91            keys_array.data_type(),
92            to_index_type
93        )));
94    }
95
96    let data = cast_keys.into_data();
97    let builder = data
98        .into_builder()
99        .data_type(Dictionary(
100            Box::new(to_index_type.clone()),
101            Box::new(to_value_type.clone()),
102        ))
103        .child_data(vec![cast_values.into_data()]);
104
105    // Safety
106    // Cast keys are still valid
107    let data = unsafe { builder.build_unchecked() };
108
109    // create the appropriate array type
110    let new_array: ArrayRef = match to_index_type {
111        Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
112        Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
113        Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
114        Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
115        UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
116        UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
117        UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
118        UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
119        _ => {
120            return Err(ArrowError::CastError(format!(
121                "Unsupported type {to_index_type} for dictionary index"
122            )));
123        }
124    };
125
126    Ok(new_array)
127}
128
129fn view_from_dict_values<K: ArrowDictionaryKeyType, V: ByteArrayType, T: ByteViewType>(
130    keys: &PrimitiveArray<K>,
131    values: &GenericByteArray<V>,
132) -> Result<ArrayRef, ArrowError> {
133    let value_buffer = values.values();
134    let value_offsets = values.value_offsets();
135    let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
136    builder.append_block(value_buffer.clone());
137    for i in keys.iter() {
138        match i {
139            Some(v) => {
140                let idx = v.to_usize().ok_or_else(|| {
141                    ArrowError::ComputeError("Invalid dictionary index".to_string())
142                })?;
143
144                // Safety
145                // (1) The index is within bounds as they are offsets
146                // (2) The append_view is safe
147                unsafe {
148                    let offset = value_offsets.get_unchecked(idx).as_usize();
149                    let end = value_offsets.get_unchecked(idx + 1).as_usize();
150                    let length = end - offset;
151                    builder.append_view_unchecked(0, offset as u32, length as u32)
152                }
153            }
154            None => {
155                builder.append_null();
156            }
157        }
158    }
159    Ok(Arc::new(builder.finish()))
160}
161
162// Unpack a dictionary into a flattened array of type to_type
163pub(crate) fn unpack_dictionary<K: ArrowDictionaryKeyType>(
164    array: &DictionaryArray<K>,
165    to_type: &DataType,
166    cast_options: &CastOptions,
167) -> Result<ArrayRef, ArrowError> {
168    let cast_dict_values = cast_with_options(array.values(), to_type, cast_options)?;
169    take(cast_dict_values.as_ref(), array.keys(), None)
170}
171
172/// Pack a data type into a dictionary array passing the values through a primitive array
173pub(crate) fn pack_array_to_dictionary_via_primitive<K: ArrowDictionaryKeyType>(
174    array: &dyn Array,
175    primitive_type: DataType,
176    dict_value_type: &DataType,
177    cast_options: &CastOptions,
178) -> Result<ArrayRef, ArrowError> {
179    let primitive = cast_with_options(array, &primitive_type, cast_options)?;
180    let dict = cast_with_options(
181        primitive.as_ref(),
182        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(primitive_type)),
183        cast_options,
184    )?;
185    cast_with_options(
186        dict.as_ref(),
187        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(dict_value_type.clone())),
188        cast_options,
189    )
190}
191
192/// Attempts to encode an array into an `ArrayDictionary` with index
193/// type K and value (dictionary) type value_type
194///
195/// K is the key type
196pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
197    array: &dyn Array,
198    dict_value_type: &DataType,
199    cast_options: &CastOptions,
200) -> Result<ArrayRef, ArrowError> {
201    use DataType::*;
202
203    match *dict_value_type {
204        Int8 => pack_numeric_to_dictionary::<K, Int8Type>(array, dict_value_type, cast_options),
205        Int16 => pack_numeric_to_dictionary::<K, Int16Type>(array, dict_value_type, cast_options),
206        Int32 => pack_numeric_to_dictionary::<K, Int32Type>(array, dict_value_type, cast_options),
207        Int64 => pack_numeric_to_dictionary::<K, Int64Type>(array, dict_value_type, cast_options),
208        UInt8 => pack_numeric_to_dictionary::<K, UInt8Type>(array, dict_value_type, cast_options),
209        UInt16 => pack_numeric_to_dictionary::<K, UInt16Type>(array, dict_value_type, cast_options),
210        UInt32 => pack_numeric_to_dictionary::<K, UInt32Type>(array, dict_value_type, cast_options),
211        UInt64 => pack_numeric_to_dictionary::<K, UInt64Type>(array, dict_value_type, cast_options),
212        Decimal32(p, s) => pack_decimal_to_dictionary::<K, Decimal32Type>(
213            array,
214            dict_value_type,
215            p,
216            s,
217            cast_options,
218        ),
219        Decimal64(p, s) => pack_decimal_to_dictionary::<K, Decimal64Type>(
220            array,
221            dict_value_type,
222            p,
223            s,
224            cast_options,
225        ),
226        Decimal128(p, s) => pack_decimal_to_dictionary::<K, Decimal128Type>(
227            array,
228            dict_value_type,
229            p,
230            s,
231            cast_options,
232        ),
233        Decimal256(p, s) => pack_decimal_to_dictionary::<K, Decimal256Type>(
234            array,
235            dict_value_type,
236            p,
237            s,
238            cast_options,
239        ),
240        Float16 => {
241            pack_numeric_to_dictionary::<K, Float16Type>(array, dict_value_type, cast_options)
242        }
243        Float32 => {
244            pack_numeric_to_dictionary::<K, Float32Type>(array, dict_value_type, cast_options)
245        }
246        Float64 => {
247            pack_numeric_to_dictionary::<K, Float64Type>(array, dict_value_type, cast_options)
248        }
249        Date32 => pack_array_to_dictionary_via_primitive::<K>(
250            array,
251            DataType::Int32,
252            dict_value_type,
253            cast_options,
254        ),
255        Date64 => pack_array_to_dictionary_via_primitive::<K>(
256            array,
257            DataType::Int64,
258            dict_value_type,
259            cast_options,
260        ),
261        Time32(_) => pack_array_to_dictionary_via_primitive::<K>(
262            array,
263            DataType::Int32,
264            dict_value_type,
265            cast_options,
266        ),
267        Time64(_) => pack_array_to_dictionary_via_primitive::<K>(
268            array,
269            DataType::Int64,
270            dict_value_type,
271            cast_options,
272        ),
273        Timestamp(_, _) => pack_array_to_dictionary_via_primitive::<K>(
274            array,
275            DataType::Int64,
276            dict_value_type,
277            cast_options,
278        ),
279        Utf8 => {
280            // If the input is a view type, we can avoid casting (thus copying) the data
281            if array.data_type() == &DataType::Utf8View {
282                return string_view_to_dictionary::<K, i32>(array);
283            }
284            pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
285        }
286        LargeUtf8 => {
287            // If the input is a view type, we can avoid casting (thus copying) the data
288            if array.data_type() == &DataType::Utf8View {
289                return string_view_to_dictionary::<K, i64>(array);
290            }
291            pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
292        }
293        Utf8View => {
294            let base_value_type = match array.data_type() {
295                DataType::LargeUtf8 | DataType::Utf8View => DataType::LargeUtf8,
296                _ => DataType::Utf8,
297            };
298
299            let dict_base = cast_to_dictionary::<K>(array, &base_value_type, cast_options)?;
300            dictionary_cast::<K>(
301                dict_base.as_ref(),
302                &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::Utf8View)),
303                cast_options,
304            )
305        }
306        Binary => {
307            // If the input is a view type, we can avoid casting (thus copying) the data
308            if array.data_type() == &DataType::BinaryView {
309                return binary_view_to_dictionary::<K, i32>(array);
310            }
311            pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
312        }
313        LargeBinary => {
314            // If the input is a view type, we can avoid casting (thus copying) the data
315            if array.data_type() == &DataType::BinaryView {
316                return binary_view_to_dictionary::<K, i64>(array);
317            }
318            pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
319        }
320        BinaryView => {
321            let base_value_type = match array.data_type() {
322                DataType::LargeBinary | DataType::BinaryView => DataType::LargeBinary,
323                _ => DataType::Binary,
324            };
325
326            let dict_base = cast_to_dictionary::<K>(array, &base_value_type, cast_options)?;
327            dictionary_cast::<K>(
328                dict_base.as_ref(),
329                &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::BinaryView)),
330                cast_options,
331            )
332        }
333        FixedSizeBinary(byte_size) => {
334            pack_byte_to_fixed_size_dictionary::<K>(array, cast_options, byte_size)
335        }
336        Struct(_) => pack_struct_to_dictionary::<K>(array, dict_value_type, cast_options),
337        _ => Err(ArrowError::CastError(format!(
338            "Unsupported output type for dictionary packing: {dict_value_type}"
339        ))),
340    }
341}
342
343/// Wrap a struct-valued array as a `DictionaryArray<K, Struct>` with identity
344/// keys `[0, 1, ..., len-1]`. Unlike the primitive / byte packers above, no
345/// deduplication is performed, since struct values have no general hash/equality
346/// builder in arrow-rs.
347///
348/// Each child field of the source is recursively cast to the matching field of
349/// `dict_value_type` via `cast_with_options` before keys are emitted. If any
350/// child cast fails, the whole pack fails, the same contract as the primitive
351/// packers above.
352fn pack_struct_to_dictionary<K: ArrowDictionaryKeyType>(
353    array: &dyn Array,
354    dict_value_type: &DataType,
355    cast_options: &CastOptions,
356) -> Result<ArrayRef, ArrowError> {
357    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
358    let len = cast_values.len();
359
360    // Identity keys `[0, 1, ..., len-1]`, with null entries wherever the
361    // source row is null so the dictionary's logical null mask matches.
362    let mut builder = PrimitiveBuilder::<K>::with_capacity(len);
363    for i in 0..len {
364        if cast_values.is_null(i) {
365            builder.append_null();
366        } else {
367            let key = K::Native::from_usize(i).ok_or_else(|| {
368                ArrowError::CastError(format!(
369                    "Cannot fit {len} dictionary keys in {:?}",
370                    K::DATA_TYPE,
371                ))
372            })?;
373            builder.append_value(key);
374        }
375    }
376    let keys = builder.finish();
377
378    Ok(Arc::new(DictionaryArray::<K>::try_new(keys, cast_values)?))
379}
380
381// Packs the data from the primitive array of type <V> to a
382// DictionaryArray with keys of type K and values of value_type V
383pub(crate) fn pack_numeric_to_dictionary<K, V>(
384    array: &dyn Array,
385    dict_value_type: &DataType,
386    cast_options: &CastOptions,
387) -> Result<ArrayRef, ArrowError>
388where
389    K: ArrowDictionaryKeyType,
390    V: ArrowPrimitiveType,
391{
392    // attempt to cast the source array values to the target value type (the dictionary values type)
393    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
394    let values = cast_values.as_primitive::<V>();
395
396    let mut b = PrimitiveDictionaryBuilder::<K, V>::with_capacity(values.len(), values.len());
397
398    // copy each element one at a time
399    for i in 0..values.len() {
400        if values.is_null(i) {
401            b.append_null();
402        } else {
403            b.append(values.value(i))?;
404        }
405    }
406    Ok(Arc::new(b.finish()))
407}
408
409pub(crate) fn pack_decimal_to_dictionary<K, D>(
410    array: &dyn Array,
411    dict_value_type: &DataType,
412    precision: u8,
413    scale: i8,
414    cast_options: &CastOptions,
415) -> Result<ArrayRef, ArrowError>
416where
417    K: ArrowDictionaryKeyType,
418    D: DecimalType + ArrowPrimitiveType,
419{
420    let dict = pack_numeric_to_dictionary::<K, D>(array, dict_value_type, cast_options)?;
421    let dict = dict.as_dictionary::<K>();
422    let typed = dict.downcast_dict::<PrimitiveArray<D>>().ok_or_else(|| {
423        ArrowError::ComputeError(format!(
424            "Internal Error: Cannot cast dict to {}Array",
425            D::PREFIX
426        ))
427    })?;
428    let value = typed
429        .values()
430        .clone()
431        .with_precision_and_scale(precision, scale)?;
432    Ok(Arc::new(dict.with_values(Arc::new(value))))
433}
434
435pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
436    array: &dyn Array,
437) -> Result<ArrayRef, ArrowError>
438where
439    K: ArrowDictionaryKeyType,
440{
441    let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
442        array.len(),
443        1024,
444        1024,
445    );
446    let string_view = array
447        .as_any()
448        .downcast_ref::<StringViewArray>()
449        .ok_or_else(|| {
450            ArrowError::ComputeError("Internal Error: Cannot cast to StringViewArray".to_string())
451        })?;
452    for v in string_view.iter() {
453        match v {
454            Some(v) => {
455                b.append(v)?;
456            }
457            None => {
458                b.append_null();
459            }
460        }
461    }
462
463    Ok(Arc::new(b.finish()))
464}
465
466pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
467    array: &dyn Array,
468) -> Result<ArrayRef, ArrowError>
469where
470    K: ArrowDictionaryKeyType,
471{
472    let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
473        array.len(),
474        1024,
475        1024,
476    );
477    let binary_view = array
478        .as_any()
479        .downcast_ref::<BinaryViewArray>()
480        .ok_or_else(|| {
481            ArrowError::ComputeError("Internal Error: Cannot cast to BinaryViewArray".to_string())
482        })?;
483    for v in binary_view.iter() {
484        match v {
485            Some(v) => {
486                b.append(v)?;
487            }
488            None => {
489                b.append_null();
490            }
491        }
492    }
493
494    Ok(Arc::new(b.finish()))
495}
496
497// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
498// key types of K
499pub(crate) fn pack_byte_to_dictionary<K, T>(
500    array: &dyn Array,
501    cast_options: &CastOptions,
502) -> Result<ArrayRef, ArrowError>
503where
504    K: ArrowDictionaryKeyType,
505    T: ByteArrayType,
506{
507    let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?;
508    let values = cast_values
509        .as_any()
510        .downcast_ref::<GenericByteArray<T>>()
511        .ok_or_else(|| {
512            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
513        })?;
514    let mut b = GenericByteDictionaryBuilder::<K, T>::with_capacity(values.len(), 1024, 1024);
515
516    // copy each element one at a time
517    for i in 0..values.len() {
518        if values.is_null(i) {
519            b.append_null();
520        } else {
521            b.append(values.value(i))?;
522        }
523    }
524    Ok(Arc::new(b.finish()))
525}
526
527// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
528// key types of K
529pub(crate) fn pack_byte_to_fixed_size_dictionary<K>(
530    array: &dyn Array,
531    cast_options: &CastOptions,
532    byte_width: i32,
533) -> Result<ArrayRef, ArrowError>
534where
535    K: ArrowDictionaryKeyType,
536{
537    let cast_values =
538        cast_with_options(array, &DataType::FixedSizeBinary(byte_width), cast_options)?;
539    let values = cast_values
540        .as_any()
541        .downcast_ref::<FixedSizeBinaryArray>()
542        .ok_or_else(|| {
543            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
544        })?;
545    let mut b = FixedSizeBinaryDictionaryBuilder::<K>::with_capacity(1024, 1024, byte_width);
546
547    // copy each element one at a time
548    for i in 0..values.len() {
549        if values.is_null(i) {
550            b.append_null();
551        } else {
552            b.append(values.value(i))?;
553        }
554    }
555    Ok(Arc::new(b.finish()))
556}