Skip to main content

arrow_json/reader/
primitive_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::marker::PhantomData;
19use std::sync::Arc;
20
21use arrow_array::builder::PrimitiveBuilder;
22use arrow_array::{ArrayRef, ArrowPrimitiveType};
23use arrow_cast::parse::Parser;
24use arrow_schema::{ArrowError, DataType};
25use half::f16;
26use num_traits::NumCast;
27
28use crate::reader::tape::{Tape, TapeElement};
29use crate::reader::{ArrayDecoder, DecoderContext};
30
/// JSON-specific parsing of an unquoted numeric token into a primitive value.
///
/// The JSON specification treats every unquoted number as a double-precision
/// floating point value, scientific notation such as `2e3` included.
///
/// In practice, producers commonly serialize numbers outside the range of an
/// `f64` and expect them to round-trip correctly. Integer implementations
/// therefore attempt an exact integer parse first and only fall back to
/// parsing the token as a floating point value when that attempt fails.
trait ParseJsonNumber
where
    Self: Sized,
{
    /// Parses the raw bytes of a JSON number, returning `None` on failure.
    fn parse(s: &[u8]) -> Option<Self>;
}
42
43macro_rules! primitive_parse {
44    ($($t:ty),+) => {
45        $(impl ParseJsonNumber for $t {
46            fn parse(s: &[u8]) -> Option<Self> {
47                match lexical_core::parse::<Self>(s) {
48                    Ok(f) => Some(f),
49                    Err(_) => lexical_core::parse::<f64>(s).ok().and_then(NumCast::from),
50                }
51            }
52        })+
53    };
54}
55
56primitive_parse!(i8, i16, i32, i64, u8, u16, u32, u64);
57
58impl ParseJsonNumber for f16 {
59    fn parse(s: &[u8]) -> Option<Self> {
60        lexical_core::parse::<f32>(s).ok().map(f16::from_f32)
61    }
62}
63
64impl ParseJsonNumber for f32 {
65    fn parse(s: &[u8]) -> Option<Self> {
66        lexical_core::parse::<Self>(s).ok()
67    }
68}
69
70impl ParseJsonNumber for f64 {
71    fn parse(s: &[u8]) -> Option<Self> {
72        lexical_core::parse::<Self>(s).ok()
73    }
74}
75
76pub struct PrimitiveArrayDecoder<P: ArrowPrimitiveType> {
77    data_type: DataType,
78    ignore_type_conflicts: bool,
79    // Invariant and Send
80    phantom: PhantomData<fn(P) -> P>,
81}
82
83impl<P: ArrowPrimitiveType> PrimitiveArrayDecoder<P> {
84    pub fn new(ctx: &DecoderContext, data_type: &DataType) -> Self {
85        Self {
86            data_type: data_type.clone(),
87            ignore_type_conflicts: ctx.ignore_type_conflicts(),
88            phantom: Default::default(),
89        }
90    }
91}
92
93impl<P> ArrayDecoder for PrimitiveArrayDecoder<P>
94where
95    P: ArrowPrimitiveType + Parser,
96    P::Native: ParseJsonNumber + NumCast,
97{
98    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayRef, ArrowError> {
99        let mut builder =
100            PrimitiveBuilder::<P>::with_capacity(pos.len()).with_data_type(self.data_type.clone());
101        let d = &self.data_type;
102
103        for p in pos {
104            let value = match tape.get(*p) {
105                TapeElement::Null => {
106                    builder.append_null();
107                    continue;
108                }
109                TapeElement::String(idx) => {
110                    let s = tape.get_string(idx);
111                    P::parse(s).ok_or_else(|| {
112                        ArrowError::JsonError(format!("failed to parse \"{s}\" as {d}",))
113                    })
114                }
115                TapeElement::Number(idx) => {
116                    let s = tape.get_string(idx);
117                    ParseJsonNumber::parse(s.as_bytes()).ok_or_else(|| {
118                        ArrowError::JsonError(format!("failed to parse {s} as {d}",))
119                    })
120                }
121                TapeElement::F32(v) => {
122                    let v = f32::from_bits(v);
123                    NumCast::from(v).ok_or_else(|| {
124                        ArrowError::JsonError(format!("failed to parse {v} as {d}",))
125                    })
126                }
127                TapeElement::I32(v) => NumCast::from(v)
128                    .ok_or_else(|| ArrowError::JsonError(format!("failed to parse {v} as {d}",))),
129                TapeElement::F64(high) => match tape.get(p + 1) {
130                    TapeElement::F32(low) => {
131                        let v = f64::from_bits(((high as u64) << 32) | low as u64);
132                        NumCast::from(v).ok_or_else(|| {
133                            ArrowError::JsonError(format!("failed to parse {v} as {d}",))
134                        })
135                    }
136                    _ => unreachable!(),
137                },
138                TapeElement::I64(high) => match tape.get(p + 1) {
139                    TapeElement::I32(low) => {
140                        let v = ((high as i64) << 32) | (low as u32) as i64;
141                        NumCast::from(v).ok_or_else(|| {
142                            ArrowError::JsonError(format!("failed to parse {v} as {d}",))
143                        })
144                    }
145                    _ => unreachable!(),
146                },
147                _ => Err(tape.error(*p, "primitive")),
148            };
149
150            match value {
151                Ok(value) => builder.append_value(value),
152                Err(_) if self.ignore_type_conflicts => builder.append_null(),
153                Err(e) => return Err(e),
154            }
155        }
156
157        Ok(Arc::new(builder.finish()))
158    }
159}