arrow_cast/cast/
string.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::cast::*;
19use arrow_buffer::NullBuffer;
20
21pub(crate) fn value_to_string<O: OffsetSizeTrait>(
22    array: &dyn Array,
23    options: &CastOptions,
24) -> Result<ArrayRef, ArrowError> {
25    let mut builder = GenericStringBuilder::<O>::new();
26    let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
27    let nulls = array.nulls();
28    for i in 0..array.len() {
29        match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
30            true => builder.append_null(),
31            false => {
32                formatter.value(i).write(&mut builder)?;
33                // tell the builder the row is finished
34                builder.append_value("");
35            }
36        }
37    }
38    Ok(Arc::new(builder.finish()))
39}
40
41pub(crate) fn value_to_string_view(
42    array: &dyn Array,
43    options: &CastOptions,
44) -> Result<ArrayRef, ArrowError> {
45    let mut builder = StringViewBuilder::with_capacity(array.len());
46    let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
47    let nulls = array.nulls();
48    // buffer to avoid reallocating on each value
49    // TODO: replace with write to builder after https://github.com/apache/arrow-rs/issues/6373
50    let mut buffer = String::new();
51    for i in 0..array.len() {
52        match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
53            true => builder.append_null(),
54            false => {
55                // write to buffer first and then copy into target array
56                buffer.clear();
57                formatter.value(i).write(&mut buffer)?;
58                builder.append_value(&buffer)
59            }
60        }
61    }
62    Ok(Arc::new(builder.finish()))
63}
64
65/// Parse UTF-8
66pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
67    array: &dyn Array,
68    cast_options: &CastOptions,
69) -> Result<ArrayRef, ArrowError> {
70    let string_array = array.as_string::<O>();
71    parse_string_iter::<P, _, _>(string_array.iter(), cast_options, || {
72        string_array.nulls().cloned()
73    })
74}
75
76/// Parse UTF-8 View
77pub(crate) fn parse_string_view<P: Parser>(
78    array: &dyn Array,
79    cast_options: &CastOptions,
80) -> Result<ArrayRef, ArrowError> {
81    let string_view_array = array.as_string_view();
82    parse_string_iter::<P, _, _>(string_view_array.iter(), cast_options, || {
83        string_view_array.nulls().cloned()
84    })
85}
86
87fn parse_string_iter<
88    'a,
89    P: Parser,
90    I: Iterator<Item = Option<&'a str>>,
91    F: FnOnce() -> Option<NullBuffer>,
92>(
93    iter: I,
94    cast_options: &CastOptions,
95    nulls: F,
96) -> Result<ArrayRef, ArrowError> {
97    let array = if cast_options.safe {
98        let iter = iter.map(|x| x.and_then(P::parse));
99
100        // Benefit:
101        //     20% performance improvement
102        // Soundness:
103        //     The iterator is trustedLen because it comes from an `StringArray`.
104        unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
105    } else {
106        let v = iter
107            .map(|x| match x {
108                Some(v) => P::parse(v).ok_or_else(|| {
109                    ArrowError::CastError(format!(
110                        "Cannot cast string '{v}' to value of {} type",
111                        P::DATA_TYPE
112                    ))
113                }),
114                None => Ok(P::Native::default()),
115            })
116            .collect::<Result<Vec<_>, ArrowError>>()?;
117        PrimitiveArray::try_new(v.into(), nulls())?
118    };
119
120    Ok(Arc::new(array) as ArrayRef)
121}
122
123/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
124pub(crate) fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>(
125    array: &dyn Array,
126    to_tz: &Option<Arc<str>>,
127    cast_options: &CastOptions,
128) -> Result<ArrayRef, ArrowError> {
129    let array = array.as_string::<O>();
130    let out: PrimitiveArray<T> = match to_tz {
131        Some(tz) => {
132            let tz: Tz = tz.as_ref().parse()?;
133            cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
134        }
135        None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?,
136    };
137    Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
138}
139
140/// Casts string view arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
141pub(crate) fn cast_view_to_timestamp<T: ArrowTimestampType>(
142    array: &dyn Array,
143    to_tz: &Option<Arc<str>>,
144    cast_options: &CastOptions,
145) -> Result<ArrayRef, ArrowError> {
146    let array = array.as_string_view();
147    let out: PrimitiveArray<T> = match to_tz {
148        Some(tz) => {
149            let tz: Tz = tz.as_ref().parse()?;
150            cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
151        }
152        None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?,
153    };
154    Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
155}
156
157fn cast_string_to_timestamp_impl<
158    'a,
159    I: Iterator<Item = Option<&'a str>>,
160    T: ArrowTimestampType,
161    Tz: TimeZone,
162>(
163    iter: I,
164    tz: &Tz,
165    cast_options: &CastOptions,
166) -> Result<PrimitiveArray<T>, ArrowError> {
167    if cast_options.safe {
168        let iter = iter.map(|v| {
169            v.and_then(|v| {
170                let naive = string_to_datetime(tz, v).ok()?.naive_utc();
171                T::make_value(naive)
172            })
173        });
174        // Benefit:
175        //     20% performance improvement
176        // Soundness:
177        //     The iterator is trustedLen because it comes from an `StringArray`.
178
179        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
180    } else {
181        let vec = iter
182            .map(|v| {
183                v.map(|v| {
184                    let naive = string_to_datetime(tz, v)?.naive_utc();
185                    T::make_value(naive).ok_or_else(|| match T::UNIT {
186                        TimeUnit::Nanosecond => ArrowError::CastError(format!(
187                            "Overflow converting {naive} to Nanosecond. The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"
188                        )),
189                        _ => ArrowError::CastError(format!(
190                            "Overflow converting {naive} to {:?}",
191                            T::UNIT
192                        ))
193                    })
194                })
195                    .transpose()
196            })
197            .collect::<Result<Vec<Option<i64>>, _>>()?;
198
199        // Benefit:
200        //     20% performance improvement
201        // Soundness:
202        //     The iterator is trustedLen because it comes from an `StringArray`.
203        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) })
204    }
205}
206
207pub(crate) fn cast_string_to_interval<Offset, F, ArrowType>(
208    array: &dyn Array,
209    cast_options: &CastOptions,
210    parse_function: F,
211) -> Result<ArrayRef, ArrowError>
212where
213    Offset: OffsetSizeTrait,
214    ArrowType: ArrowPrimitiveType,
215    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
216{
217    let string_array = array
218        .as_any()
219        .downcast_ref::<GenericStringArray<Offset>>()
220        .unwrap();
221    cast_string_to_interval_impl::<_, ArrowType, F>(
222        string_array.iter(),
223        cast_options,
224        parse_function,
225    )
226}
227
228pub(crate) fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
229    array: &dyn Array,
230    cast_options: &CastOptions,
231) -> Result<ArrayRef, ArrowError> {
232    cast_string_to_interval::<Offset, _, IntervalYearMonthType>(
233        array,
234        cast_options,
235        parse_interval_year_month,
236    )
237}
238
239pub(crate) fn cast_string_to_day_time_interval<Offset: OffsetSizeTrait>(
240    array: &dyn Array,
241    cast_options: &CastOptions,
242) -> Result<ArrayRef, ArrowError> {
243    cast_string_to_interval::<Offset, _, IntervalDayTimeType>(
244        array,
245        cast_options,
246        parse_interval_day_time,
247    )
248}
249
250pub(crate) fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
251    array: &dyn Array,
252    cast_options: &CastOptions,
253) -> Result<ArrayRef, ArrowError> {
254    cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>(
255        array,
256        cast_options,
257        parse_interval_month_day_nano,
258    )
259}
260
261pub(crate) fn cast_view_to_interval<F, ArrowType>(
262    array: &dyn Array,
263    cast_options: &CastOptions,
264    parse_function: F,
265) -> Result<ArrayRef, ArrowError>
266where
267    ArrowType: ArrowPrimitiveType,
268    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
269{
270    let string_view_array = array.as_any().downcast_ref::<StringViewArray>().unwrap();
271    cast_string_to_interval_impl::<_, ArrowType, F>(
272        string_view_array.iter(),
273        cast_options,
274        parse_function,
275    )
276}
277
278pub(crate) fn cast_view_to_year_month_interval(
279    array: &dyn Array,
280    cast_options: &CastOptions,
281) -> Result<ArrayRef, ArrowError> {
282    cast_view_to_interval::<_, IntervalYearMonthType>(
283        array,
284        cast_options,
285        parse_interval_year_month,
286    )
287}
288
289pub(crate) fn cast_view_to_day_time_interval(
290    array: &dyn Array,
291    cast_options: &CastOptions,
292) -> Result<ArrayRef, ArrowError> {
293    cast_view_to_interval::<_, IntervalDayTimeType>(array, cast_options, parse_interval_day_time)
294}
295
296pub(crate) fn cast_view_to_month_day_nano_interval(
297    array: &dyn Array,
298    cast_options: &CastOptions,
299) -> Result<ArrayRef, ArrowError> {
300    cast_view_to_interval::<_, IntervalMonthDayNanoType>(
301        array,
302        cast_options,
303        parse_interval_month_day_nano,
304    )
305}
306
307fn cast_string_to_interval_impl<'a, I, ArrowType, F>(
308    iter: I,
309    cast_options: &CastOptions,
310    parse_function: F,
311) -> Result<ArrayRef, ArrowError>
312where
313    I: Iterator<Item = Option<&'a str>>,
314    ArrowType: ArrowPrimitiveType,
315    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
316{
317    let interval_array = if cast_options.safe {
318        let iter = iter.map(|v| v.and_then(|v| parse_function(v).ok()));
319
320        // Benefit:
321        //     20% performance improvement
322        // Soundness:
323        //     The iterator is trustedLen because it comes from an `StringArray`.
324        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
325    } else {
326        let vec = iter
327            .map(|v| v.map(parse_function).transpose())
328            .collect::<Result<Vec<_>, ArrowError>>()?;
329
330        // Benefit:
331        //     20% performance improvement
332        // Soundness:
333        //     The iterator is trustedLen because it comes from an `StringArray`.
334        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
335    };
336    Ok(Arc::new(interval_array) as ArrayRef)
337}
338
/// Appends the items of `iter` to `builder`, decoding each byte slice as UTF-8.
///
/// `None` items, and byte slices that are not valid UTF-8, are appended as
/// `None`; everything else is appended as `Some(&str)`.
fn extend_valid_utf8<'a, B, I>(builder: &mut B, iter: I)
where
    B: Extend<Option<&'a str>>,
    I: Iterator<Item = Option<&'a [u8]>>,
{
    let decoded = iter.map(|item| -> Option<&'a str> { std::str::from_utf8(item?).ok() });
    builder.extend(decoded);
}
348
349pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
350    array: &dyn Array,
351    cast_options: &CastOptions,
352) -> Result<ArrayRef, ArrowError> {
353    let array = array
354        .as_any()
355        .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
356        .unwrap();
357
358    match GenericStringArray::<O>::try_from_binary(array.clone()) {
359        Ok(a) => Ok(Arc::new(a)),
360        Err(e) => match cast_options.safe {
361            true => {
362                // Fallback to slow method to convert invalid sequences to nulls
363                let mut builder =
364                    GenericStringBuilder::<O>::with_capacity(array.len(), array.value_data().len());
365
366                extend_valid_utf8(&mut builder, array.iter());
367                Ok(Arc::new(builder.finish()))
368            }
369            false => Err(e),
370        },
371    }
372}
373
374pub(crate) fn cast_binary_view_to_string_view(
375    array: &dyn Array,
376    cast_options: &CastOptions,
377) -> Result<ArrayRef, ArrowError> {
378    let array = array.as_binary_view();
379
380    match array.clone().to_string_view() {
381        Ok(result) => Ok(Arc::new(result)),
382        Err(error) => match cast_options.safe {
383            true => {
384                let mut builder = StringViewBuilder::with_capacity(array.len());
385                extend_valid_utf8(&mut builder, array.iter());
386                Ok(Arc::new(builder.finish()))
387            }
388            false => Err(error),
389        },
390    }
391}
392
393/// Casts string to boolean
394fn cast_string_to_boolean<'a, StrArray>(
395    array: &StrArray,
396    cast_options: &CastOptions,
397) -> Result<ArrayRef, ArrowError>
398where
399    StrArray: StringArrayType<'a>,
400{
401    let output_array = array
402        .iter()
403        .map(|value| match value {
404            Some(value) => match value.to_ascii_lowercase().trim() {
405                "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => Ok(Some(true)),
406                "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" | "0" => {
407                    Ok(Some(false))
408                }
409                invalid_value => match cast_options.safe {
410                    true => Ok(None),
411                    false => Err(ArrowError::CastError(format!(
412                        "Cannot cast value '{invalid_value}' to value of Boolean type",
413                    ))),
414                },
415            },
416            None => Ok(None),
417        })
418        .collect::<Result<BooleanArray, _>>()?;
419
420    Ok(Arc::new(output_array))
421}
422
423pub(crate) fn cast_utf8_to_boolean<OffsetSize>(
424    from: &dyn Array,
425    cast_options: &CastOptions,
426) -> Result<ArrayRef, ArrowError>
427where
428    OffsetSize: OffsetSizeTrait,
429{
430    let array = from
431        .as_any()
432        .downcast_ref::<GenericStringArray<OffsetSize>>()
433        .unwrap();
434
435    cast_string_to_boolean(&array, cast_options)
436}
437
438pub(crate) fn cast_utf8view_to_boolean(
439    from: &dyn Array,
440    cast_options: &CastOptions,
441) -> Result<ArrayRef, ArrowError> {
442    let array = from.as_any().downcast_ref::<StringViewArray>().unwrap();
443
444    cast_string_to_boolean(&array, cast_options)
445}