Skip to main content

arrow_json/reader/
string_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::marker::PhantomData;
19use std::sync::Arc;
20
21use arrow_array::builder::GenericStringBuilder;
22use arrow_array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
23use arrow_schema::ArrowError;
24use itoa;
25use ryu;
26
27use crate::reader::tape::{Tape, TapeElement};
28use crate::reader::{ArrayDecoder, DecoderContext};
29
/// Text appended for a JSON `true` element when primitive coercion is enabled.
const TRUE: &str = "true";
/// Text appended for a JSON `false` element when primitive coercion is enabled.
const FALSE: &str = "false";
32
33pub struct StringArrayDecoder<O: OffsetSizeTrait> {
34    coerce_primitive: bool,
35    ignore_type_conflicts: bool,
36    phantom: PhantomData<O>,
37}
38
39impl<O: OffsetSizeTrait> StringArrayDecoder<O> {
40    pub fn new(ctx: &DecoderContext) -> Self {
41        Self {
42            coerce_primitive: ctx.coerce_primitive(),
43            ignore_type_conflicts: ctx.ignore_type_conflicts(),
44            phantom: Default::default(),
45        }
46    }
47}
48
49impl<O: OffsetSizeTrait> ArrayDecoder for StringArrayDecoder<O> {
50    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayRef, ArrowError> {
51        let coerce_primitive = self.coerce_primitive;
52
53        let mut data_capacity = 0;
54        for p in pos {
55            match tape.get(*p) {
56                TapeElement::String(idx) => {
57                    data_capacity += tape.get_string(idx).len();
58                }
59                TapeElement::Null => {}
60                TapeElement::True if coerce_primitive => {
61                    data_capacity += TRUE.len();
62                }
63                TapeElement::False if coerce_primitive => {
64                    data_capacity += FALSE.len();
65                }
66                TapeElement::Number(idx) if coerce_primitive => {
67                    data_capacity += tape.get_string(idx).len();
68                }
69                TapeElement::I64(_)
70                | TapeElement::I32(_)
71                | TapeElement::F64(_)
72                | TapeElement::F32(_)
73                    if coerce_primitive =>
74                {
75                    // An arbitrary estimate
76                    data_capacity += 10;
77                }
78                _ if self.ignore_type_conflicts => {}
79                _ => {
80                    return Err(tape.error(*p, "string"));
81                }
82            }
83        }
84
85        if O::from_usize(data_capacity).is_none() {
86            return Err(ArrowError::JsonError(format!(
87                "offset overflow decoding {}",
88                GenericStringArray::<O>::DATA_TYPE
89            )));
90        }
91
92        let mut builder = GenericStringBuilder::<O>::with_capacity(pos.len(), data_capacity);
93
94        let mut float_formatter = ryu::Buffer::new();
95        let mut int_formatter = itoa::Buffer::new();
96
97        for p in pos {
98            match tape.get(*p) {
99                TapeElement::String(idx) => {
100                    builder.append_value(tape.get_string(idx));
101                }
102                TapeElement::Null => builder.append_null(),
103                TapeElement::True if coerce_primitive => {
104                    builder.append_value(TRUE);
105                }
106                TapeElement::False if coerce_primitive => {
107                    builder.append_value(FALSE);
108                }
109                TapeElement::Number(idx) if coerce_primitive => {
110                    builder.append_value(tape.get_string(idx));
111                }
112                TapeElement::I64(high) if coerce_primitive => match tape.get(p + 1) {
113                    TapeElement::I32(low) => {
114                        let val = ((high as i64) << 32) | (low as u32) as i64;
115                        builder.append_value(int_formatter.format(val));
116                    }
117                    _ => unreachable!(),
118                },
119                TapeElement::I32(n) if coerce_primitive => {
120                    builder.append_value(int_formatter.format(n));
121                }
122                TapeElement::F32(n) if coerce_primitive => {
123                    builder.append_value(int_formatter.format(n));
124                }
125                TapeElement::F64(high) if coerce_primitive => match tape.get(p + 1) {
126                    TapeElement::F32(low) => {
127                        let val = f64::from_bits(((high as u64) << 32) | low as u64);
128                        builder.append_value(float_formatter.format_finite(val));
129                    }
130                    _ => unreachable!(),
131                },
132                _ if self.ignore_type_conflicts => builder.append_null(),
133                _ => unreachable!(),
134            }
135        }
136
137        Ok(Arc::new(builder.finish()))
138    }
139}