arrow_json/reader/
primitive_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use num::NumCast;
19use std::marker::PhantomData;
20
21use arrow_array::builder::PrimitiveBuilder;
22use arrow_array::{Array, ArrowPrimitiveType};
23use arrow_cast::parse::Parser;
24use arrow_data::ArrayData;
25use arrow_schema::{ArrowError, DataType};
26use half::f16;
27
28use crate::reader::tape::{Tape, TapeElement};
29use crate::reader::ArrayDecoder;
30
/// JSON-specific parsing of a primitive number from its raw textual bytes.
///
/// The JSON specification says that an unquoted number should be read as a
/// double-precision floating point value, which includes scientific notation
/// such as `2e3`.
///
/// In practice, producers commonly serialize numbers that fall outside the range
/// of an `f64` and still expect them to round-trip correctly. For that reason the
/// integer implementations first try to parse the text as the integer type itself,
/// and only fall back to parsing it as a floating point number if that fails.
trait ParseJsonNumber
where
    Self: Sized,
{
    /// Parses `s` into `Self`, returning `None` if the bytes cannot be
    /// interpreted as a value of this type.
    fn parse(s: &[u8]) -> Option<Self>;
}
42
43macro_rules! primitive_parse {
44    ($($t:ty),+) => {
45        $(impl ParseJsonNumber for $t {
46            fn parse(s: &[u8]) -> Option<Self> {
47                match lexical_core::parse::<Self>(s) {
48                    Ok(f) => Some(f),
49                    Err(_) => lexical_core::parse::<f64>(s).ok().and_then(NumCast::from),
50                }
51            }
52        })+
53    };
54}
55
56primitive_parse!(i8, i16, i32, i64, u8, u16, u32, u64);
57
58impl ParseJsonNumber for f16 {
59    fn parse(s: &[u8]) -> Option<Self> {
60        lexical_core::parse::<f32>(s).ok().map(f16::from_f32)
61    }
62}
63
64impl ParseJsonNumber for f32 {
65    fn parse(s: &[u8]) -> Option<Self> {
66        lexical_core::parse::<Self>(s).ok()
67    }
68}
69
70impl ParseJsonNumber for f64 {
71    fn parse(s: &[u8]) -> Option<Self> {
72        lexical_core::parse::<Self>(s).ok()
73    }
74}
75
76pub struct PrimitiveArrayDecoder<P: ArrowPrimitiveType> {
77    data_type: DataType,
78    // Invariant and Send
79    phantom: PhantomData<fn(P) -> P>,
80}
81
82impl<P: ArrowPrimitiveType> PrimitiveArrayDecoder<P> {
83    pub fn new(data_type: DataType) -> Self {
84        Self {
85            data_type,
86            phantom: Default::default(),
87        }
88    }
89}
90
91impl<P> ArrayDecoder for PrimitiveArrayDecoder<P>
92where
93    P: ArrowPrimitiveType + Parser,
94    P::Native: ParseJsonNumber + NumCast,
95{
96    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
97        let mut builder =
98            PrimitiveBuilder::<P>::with_capacity(pos.len()).with_data_type(self.data_type.clone());
99        let d = &self.data_type;
100
101        for p in pos {
102            match tape.get(*p) {
103                TapeElement::Null => builder.append_null(),
104                TapeElement::String(idx) => {
105                    let s = tape.get_string(idx);
106                    let value = P::parse(s).ok_or_else(|| {
107                        ArrowError::JsonError(format!("failed to parse \"{s}\" as {d}",))
108                    })?;
109
110                    builder.append_value(value)
111                }
112                TapeElement::Number(idx) => {
113                    let s = tape.get_string(idx);
114                    let value = ParseJsonNumber::parse(s.as_bytes()).ok_or_else(|| {
115                        ArrowError::JsonError(format!("failed to parse {s} as {d}",))
116                    })?;
117
118                    builder.append_value(value)
119                }
120                TapeElement::F32(v) => {
121                    let v = f32::from_bits(v);
122                    let value = NumCast::from(v).ok_or_else(|| {
123                        ArrowError::JsonError(format!("failed to parse {v} as {d}",))
124                    })?;
125                    builder.append_value(value)
126                }
127                TapeElement::I32(v) => {
128                    let value = NumCast::from(v).ok_or_else(|| {
129                        ArrowError::JsonError(format!("failed to parse {v} as {d}",))
130                    })?;
131                    builder.append_value(value)
132                }
133                TapeElement::F64(high) => match tape.get(p + 1) {
134                    TapeElement::F32(low) => {
135                        let v = f64::from_bits(((high as u64) << 32) | low as u64);
136                        let value = NumCast::from(v).ok_or_else(|| {
137                            ArrowError::JsonError(format!("failed to parse {v} as {d}",))
138                        })?;
139                        builder.append_value(value)
140                    }
141                    _ => unreachable!(),
142                },
143                TapeElement::I64(high) => match tape.get(p + 1) {
144                    TapeElement::I32(low) => {
145                        let v = ((high as i64) << 32) | (low as u32) as i64;
146                        let value = NumCast::from(v).ok_or_else(|| {
147                            ArrowError::JsonError(format!("failed to parse {v} as {d}",))
148                        })?;
149                        builder.append_value(value)
150                    }
151                    _ => unreachable!(),
152                },
153                _ => return Err(tape.error(*p, "primitive")),
154            }
155        }
156
157        Ok(builder.finish().into_data())
158    }
159}