arrow_cast/cast/
string.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::cast::*;
19use arrow_buffer::NullBuffer;
20
21pub(crate) fn value_to_string<O: OffsetSizeTrait>(
22    array: &dyn Array,
23    options: &CastOptions,
24) -> Result<ArrayRef, ArrowError> {
25    let mut builder = GenericStringBuilder::<O>::new();
26    let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
27    let nulls = array.nulls();
28    for i in 0..array.len() {
29        match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
30            true => builder.append_null(),
31            false => {
32                formatter.value(i).write(&mut builder)?;
33                // tell the builder the row is finished
34                builder.append_value("");
35            }
36        }
37    }
38    Ok(Arc::new(builder.finish()))
39}
40
41pub(crate) fn value_to_string_view(
42    array: &dyn Array,
43    options: &CastOptions,
44) -> Result<ArrayRef, ArrowError> {
45    let mut builder = StringViewBuilder::with_capacity(array.len());
46    let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
47    let nulls = array.nulls();
48    // buffer to avoid reallocating on each value
49    // TODO: replace with write to builder after https://github.com/apache/arrow-rs/issues/6373
50    let mut buffer = String::new();
51    for i in 0..array.len() {
52        match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
53            true => builder.append_null(),
54            false => {
55                // write to buffer first and then copy into target array
56                buffer.clear();
57                formatter.value(i).write(&mut buffer)?;
58                builder.append_value(&buffer)
59            }
60        }
61    }
62    Ok(Arc::new(builder.finish()))
63}
64
65/// Parse UTF-8
66pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
67    array: &dyn Array,
68    cast_options: &CastOptions,
69) -> Result<ArrayRef, ArrowError> {
70    let string_array = array.as_string::<O>();
71    parse_string_iter::<P, _, _>(string_array.iter(), cast_options, || {
72        string_array.nulls().cloned()
73    })
74}
75
76/// Parse UTF-8 View
77pub(crate) fn parse_string_view<P: Parser>(
78    array: &dyn Array,
79    cast_options: &CastOptions,
80) -> Result<ArrayRef, ArrowError> {
81    let string_view_array = array.as_string_view();
82    parse_string_iter::<P, _, _>(string_view_array.iter(), cast_options, || {
83        string_view_array.nulls().cloned()
84    })
85}
86
87fn parse_string_iter<
88    'a,
89    P: Parser,
90    I: Iterator<Item = Option<&'a str>>,
91    F: FnOnce() -> Option<NullBuffer>,
92>(
93    iter: I,
94    cast_options: &CastOptions,
95    nulls: F,
96) -> Result<ArrayRef, ArrowError> {
97    let array = if cast_options.safe {
98        let iter = iter.map(|x| x.and_then(P::parse));
99
100        // Benefit:
101        //     20% performance improvement
102        // Soundness:
103        //     The iterator is trustedLen because it comes from an `StringArray`.
104        unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
105    } else {
106        let v = iter
107            .map(|x| match x {
108                Some(v) => P::parse(v).ok_or_else(|| {
109                    ArrowError::CastError(format!(
110                        "Cannot cast string '{}' to value of {:?} type",
111                        v,
112                        P::DATA_TYPE
113                    ))
114                }),
115                None => Ok(P::Native::default()),
116            })
117            .collect::<Result<Vec<_>, ArrowError>>()?;
118        PrimitiveArray::new(v.into(), nulls())
119    };
120
121    Ok(Arc::new(array) as ArrayRef)
122}
123
124/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
125pub(crate) fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>(
126    array: &dyn Array,
127    to_tz: &Option<Arc<str>>,
128    cast_options: &CastOptions,
129) -> Result<ArrayRef, ArrowError> {
130    let array = array.as_string::<O>();
131    let out: PrimitiveArray<T> = match to_tz {
132        Some(tz) => {
133            let tz: Tz = tz.as_ref().parse()?;
134            cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
135        }
136        None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?,
137    };
138    Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
139}
140
141/// Casts string view arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
142pub(crate) fn cast_view_to_timestamp<T: ArrowTimestampType>(
143    array: &dyn Array,
144    to_tz: &Option<Arc<str>>,
145    cast_options: &CastOptions,
146) -> Result<ArrayRef, ArrowError> {
147    let array = array.as_string_view();
148    let out: PrimitiveArray<T> = match to_tz {
149        Some(tz) => {
150            let tz: Tz = tz.as_ref().parse()?;
151            cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
152        }
153        None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?,
154    };
155    Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
156}
157
158fn cast_string_to_timestamp_impl<
159    'a,
160    I: Iterator<Item = Option<&'a str>>,
161    T: ArrowTimestampType,
162    Tz: TimeZone,
163>(
164    iter: I,
165    tz: &Tz,
166    cast_options: &CastOptions,
167) -> Result<PrimitiveArray<T>, ArrowError> {
168    if cast_options.safe {
169        let iter = iter.map(|v| {
170            v.and_then(|v| {
171                let naive = string_to_datetime(tz, v).ok()?.naive_utc();
172                T::make_value(naive)
173            })
174        });
175        // Benefit:
176        //     20% performance improvement
177        // Soundness:
178        //     The iterator is trustedLen because it comes from an `StringArray`.
179
180        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
181    } else {
182        let vec = iter
183            .map(|v| {
184                v.map(|v| {
185                    let naive = string_to_datetime(tz, v)?.naive_utc();
186                    T::make_value(naive).ok_or_else(|| match T::UNIT {
187                        TimeUnit::Nanosecond => ArrowError::CastError(format!(
188                            "Overflow converting {naive} to Nanosecond. The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"
189                        )),
190                        _ => ArrowError::CastError(format!(
191                            "Overflow converting {naive} to {:?}",
192                            T::UNIT
193                        ))
194                    })
195                })
196                    .transpose()
197            })
198            .collect::<Result<Vec<Option<i64>>, _>>()?;
199
200        // Benefit:
201        //     20% performance improvement
202        // Soundness:
203        //     The iterator is trustedLen because it comes from an `StringArray`.
204        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) })
205    }
206}
207
208pub(crate) fn cast_string_to_interval<Offset, F, ArrowType>(
209    array: &dyn Array,
210    cast_options: &CastOptions,
211    parse_function: F,
212) -> Result<ArrayRef, ArrowError>
213where
214    Offset: OffsetSizeTrait,
215    ArrowType: ArrowPrimitiveType,
216    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
217{
218    let string_array = array
219        .as_any()
220        .downcast_ref::<GenericStringArray<Offset>>()
221        .unwrap();
222    cast_string_to_interval_impl::<_, ArrowType, F>(
223        string_array.iter(),
224        cast_options,
225        parse_function,
226    )
227}
228
229pub(crate) fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
230    array: &dyn Array,
231    cast_options: &CastOptions,
232) -> Result<ArrayRef, ArrowError> {
233    cast_string_to_interval::<Offset, _, IntervalYearMonthType>(
234        array,
235        cast_options,
236        parse_interval_year_month,
237    )
238}
239
240pub(crate) fn cast_string_to_day_time_interval<Offset: OffsetSizeTrait>(
241    array: &dyn Array,
242    cast_options: &CastOptions,
243) -> Result<ArrayRef, ArrowError> {
244    cast_string_to_interval::<Offset, _, IntervalDayTimeType>(
245        array,
246        cast_options,
247        parse_interval_day_time,
248    )
249}
250
251pub(crate) fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
252    array: &dyn Array,
253    cast_options: &CastOptions,
254) -> Result<ArrayRef, ArrowError> {
255    cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>(
256        array,
257        cast_options,
258        parse_interval_month_day_nano,
259    )
260}
261
262pub(crate) fn cast_view_to_interval<F, ArrowType>(
263    array: &dyn Array,
264    cast_options: &CastOptions,
265    parse_function: F,
266) -> Result<ArrayRef, ArrowError>
267where
268    ArrowType: ArrowPrimitiveType,
269    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
270{
271    let string_view_array = array.as_any().downcast_ref::<StringViewArray>().unwrap();
272    cast_string_to_interval_impl::<_, ArrowType, F>(
273        string_view_array.iter(),
274        cast_options,
275        parse_function,
276    )
277}
278
279pub(crate) fn cast_view_to_year_month_interval(
280    array: &dyn Array,
281    cast_options: &CastOptions,
282) -> Result<ArrayRef, ArrowError> {
283    cast_view_to_interval::<_, IntervalYearMonthType>(
284        array,
285        cast_options,
286        parse_interval_year_month,
287    )
288}
289
290pub(crate) fn cast_view_to_day_time_interval(
291    array: &dyn Array,
292    cast_options: &CastOptions,
293) -> Result<ArrayRef, ArrowError> {
294    cast_view_to_interval::<_, IntervalDayTimeType>(array, cast_options, parse_interval_day_time)
295}
296
297pub(crate) fn cast_view_to_month_day_nano_interval(
298    array: &dyn Array,
299    cast_options: &CastOptions,
300) -> Result<ArrayRef, ArrowError> {
301    cast_view_to_interval::<_, IntervalMonthDayNanoType>(
302        array,
303        cast_options,
304        parse_interval_month_day_nano,
305    )
306}
307
308fn cast_string_to_interval_impl<'a, I, ArrowType, F>(
309    iter: I,
310    cast_options: &CastOptions,
311    parse_function: F,
312) -> Result<ArrayRef, ArrowError>
313where
314    I: Iterator<Item = Option<&'a str>>,
315    ArrowType: ArrowPrimitiveType,
316    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
317{
318    let interval_array = if cast_options.safe {
319        let iter = iter.map(|v| v.and_then(|v| parse_function(v).ok()));
320
321        // Benefit:
322        //     20% performance improvement
323        // Soundness:
324        //     The iterator is trustedLen because it comes from an `StringArray`.
325        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
326    } else {
327        let vec = iter
328            .map(|v| v.map(parse_function).transpose())
329            .collect::<Result<Vec<_>, ArrowError>>()?;
330
331        // Benefit:
332        //     20% performance improvement
333        // Soundness:
334        //     The iterator is trustedLen because it comes from an `StringArray`.
335        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
336    };
337    Ok(Arc::new(interval_array) as ArrayRef)
338}
339
340/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same
341/// offset size so re-encoding offset is unnecessary.
342pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
343    array: &dyn Array,
344    cast_options: &CastOptions,
345) -> Result<ArrayRef, ArrowError> {
346    let array = array
347        .as_any()
348        .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
349        .unwrap();
350
351    match GenericStringArray::<O>::try_from_binary(array.clone()) {
352        Ok(a) => Ok(Arc::new(a)),
353        Err(e) => match cast_options.safe {
354            true => {
355                // Fallback to slow method to convert invalid sequences to nulls
356                let mut builder =
357                    GenericStringBuilder::<O>::with_capacity(array.len(), array.value_data().len());
358
359                let iter = array
360                    .iter()
361                    .map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
362
363                builder.extend(iter);
364                Ok(Arc::new(builder.finish()))
365            }
366            false => Err(e),
367        },
368    }
369}
370
371/// Casts string to boolean
372fn cast_string_to_boolean<'a, StrArray>(
373    array: &StrArray,
374    cast_options: &CastOptions,
375) -> Result<ArrayRef, ArrowError>
376where
377    StrArray: StringArrayType<'a>,
378{
379    let output_array = array
380        .iter()
381        .map(|value| match value {
382            Some(value) => match value.to_ascii_lowercase().trim() {
383                "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => Ok(Some(true)),
384                "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" | "0" => {
385                    Ok(Some(false))
386                }
387                invalid_value => match cast_options.safe {
388                    true => Ok(None),
389                    false => Err(ArrowError::CastError(format!(
390                        "Cannot cast value '{invalid_value}' to value of Boolean type",
391                    ))),
392                },
393            },
394            None => Ok(None),
395        })
396        .collect::<Result<BooleanArray, _>>()?;
397
398    Ok(Arc::new(output_array))
399}
400
401pub(crate) fn cast_utf8_to_boolean<OffsetSize>(
402    from: &dyn Array,
403    cast_options: &CastOptions,
404) -> Result<ArrayRef, ArrowError>
405where
406    OffsetSize: OffsetSizeTrait,
407{
408    let array = from
409        .as_any()
410        .downcast_ref::<GenericStringArray<OffsetSize>>()
411        .unwrap();
412
413    cast_string_to_boolean(&array, cast_options)
414}
415
416pub(crate) fn cast_utf8view_to_boolean(
417    from: &dyn Array,
418    cast_options: &CastOptions,
419) -> Result<ArrayRef, ArrowError> {
420    let array = from.as_any().downcast_ref::<StringViewArray>().unwrap();
421
422    cast_string_to_boolean(&array, cast_options)
423}