arrow_json/
lib.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Transfer data between the Arrow memory format and JSON line-delimited records.
19//!
20//! See the module level documentation for the
21//! [`reader`] and [`writer`] for usage examples.
22//!
23//! # Binary Data
24//!
//! As per [RFC7159] JSON cannot encode arbitrary binary data. A common approach to work around
26//! this is to use a [binary-to-text encoding] scheme, such as base64, to encode the
27//! input data and then decode it on output.
28//!
29//! ```
30//! # use std::io::Cursor;
31//! # use std::sync::Arc;
32//! # use arrow_array::{BinaryArray, RecordBatch, StringArray};
33//! # use arrow_array::cast::AsArray;
34//! # use arrow_cast::base64::{b64_decode, b64_encode, BASE64_STANDARD};
35//! # use arrow_json::{LineDelimitedWriter, ReaderBuilder};
36//! #
37//! // The data we want to write
38//! let input = BinaryArray::from(vec![b"\xDE\x00\xFF".as_ref()]);
39//!
40//! // Base64 encode it to a string
41//! let encoded: StringArray = b64_encode(&BASE64_STANDARD, &input);
42//!
43//! // Write the StringArray to JSON
44//! let batch = RecordBatch::try_from_iter([("col", Arc::new(encoded) as _)]).unwrap();
45//! let mut buf = Vec::with_capacity(1024);
46//! let mut writer = LineDelimitedWriter::new(&mut buf);
47//! writer.write(&batch).unwrap();
48//! writer.finish().unwrap();
49//!
50//! // Read the JSON data
51//! let cursor = Cursor::new(buf);
52//! let mut reader = ReaderBuilder::new(batch.schema()).build(cursor).unwrap();
53//! let batch = reader.next().unwrap().unwrap();
54//!
55//! // Reverse the base64 encoding
56//! let col: BinaryArray = batch.column(0).as_string::<i32>().clone().into();
57//! let output = b64_decode(&BASE64_STANDARD, &col).unwrap();
58//!
59//! assert_eq!(input, output);
60//! ```
61//!
62//! [RFC7159]: https://datatracker.ietf.org/doc/html/rfc7159#section-8.1
63//! [binary-to-text encoding]: https://en.wikipedia.org/wiki/Binary-to-text_encoding
64//!
65
66#![doc(
67    html_logo_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_white-bg.svg",
68    html_favicon_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_transparent-bg.svg"
69)]
70#![cfg_attr(docsrs, feature(doc_auto_cfg))]
71#![deny(rustdoc::broken_intra_doc_links)]
72#![warn(missing_docs)]
73
74pub mod reader;
75pub mod writer;
76
77pub use self::reader::{Reader, ReaderBuilder};
78pub use self::writer::{
79    ArrayWriter, Encoder, EncoderFactory, EncoderOptions, LineDelimitedWriter, Writer,
80    WriterBuilder,
81};
82use half::f16;
83use serde_json::{Number, Value};
84
/// Selects the JSON shape that counts as valid when reading or writing
/// RecordBatches or StructArrays.
///
/// The chosen mode determines which form(s) the Reader accepts and which form
/// the Writer produces. As an example, take a RecordBatch whose Schema is
/// `[("a", Int32), ("r", Struct([("b", Boolean), ("c", Utf8)]))]`:
///
/// * with [`StructMode::ObjectOnly`] a Reader reads rows such as
///   `{"a": 1, "r": {"b": true, "c": "cat"}}`
/// * with [`StructMode::ListOnly`] a Reader reads rows such as
///   `[1, [true, "cat"]]`
///
/// A Writer produces rows formatted in the corresponding way.
///
/// When the schema is already known the list encoding is the more compact of
/// the two; it is the encoding used by tools such as [Presto] and [Trino].
///
/// The order of the keys is irrelevant when reading objects. When reading
/// lists, the entries must match the struct fields in both number and order.
/// Map columns are not affected by this option.
///
/// [Presto]: https://prestodb.io/docs/current/develop/client-protocol.html#important-queryresults-attributes
/// [Trino]: https://trino.io/docs/current/develop/client-protocol.html#important-queryresults-attributes
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum StructMode {
    /// Encode/decode structs as objects (e.g., {"a": 1, "b": "c"})
    ///
    /// This is the default mode.
    #[default]
    ObjectOnly,
    /// Encode/decode structs as lists (e.g., [1, "c"])
    ListOnly,
}
113
114/// Trait declaring any type that is serializable to JSON. This includes all primitive types (bool, i32, etc.).
115pub trait JsonSerializable: 'static {
116    /// Converts self into json value if its possible
117    fn into_json_value(self) -> Option<Value>;
118}
119
120macro_rules! json_serializable {
121    ($t:ty) => {
122        impl JsonSerializable for $t {
123            fn into_json_value(self) -> Option<Value> {
124                Some(self.into())
125            }
126        }
127    };
128}
129
130json_serializable!(bool);
131json_serializable!(u8);
132json_serializable!(u16);
133json_serializable!(u32);
134json_serializable!(u64);
135json_serializable!(i8);
136json_serializable!(i16);
137json_serializable!(i32);
138json_serializable!(i64);
139
140impl JsonSerializable for i128 {
141    fn into_json_value(self) -> Option<Value> {
142        // Serialize as string to avoid issues with arbitrary_precision serde_json feature
143        // - https://github.com/serde-rs/json/issues/559
144        // - https://github.com/serde-rs/json/issues/845
145        // - https://github.com/serde-rs/json/issues/846
146        Some(self.to_string().into())
147    }
148}
149
150impl JsonSerializable for f16 {
151    fn into_json_value(self) -> Option<Value> {
152        Number::from_f64(f64::round(f64::from(self) * 1000.0) / 1000.0).map(Value::Number)
153    }
154}
155
156impl JsonSerializable for f32 {
157    fn into_json_value(self) -> Option<Value> {
158        Number::from_f64(f64::round(self as f64 * 1000.0) / 1000.0).map(Value::Number)
159    }
160}
161
162impl JsonSerializable for f64 {
163    fn into_json_value(self) -> Option<Value> {
164        Number::from_f64(self).map(Value::Number)
165    }
166}
167
#[cfg(test)]
mod tests {
    use super::*;

    use serde_json::Value::{Bool, Number as VNumber, String as VString};

    /// Checks the JSON value produced by `into_json_value` for native types.
    #[test]
    fn test_arrow_native_type_to_json() {
        // The JSON number 1, expected from every 8- to 64-bit integer below.
        let json_one = || Some(VNumber(Number::from(1)));
        // The JSON number 0.01, expected from the float conversions below.
        let json_hundredth = || Some(VNumber(Number::from_f64(0.01f64).unwrap()));

        assert_eq!(Some(Bool(true)), true.into_json_value());

        assert_eq!(json_one(), 1i8.into_json_value());
        assert_eq!(json_one(), 1i16.into_json_value());
        assert_eq!(json_one(), 1i32.into_json_value());
        assert_eq!(json_one(), 1i64.into_json_value());
        // i128 is the odd one out: it is emitted as a string.
        assert_eq!(Some(VString("1".to_string())), 1i128.into_json_value());
        assert_eq!(json_one(), 1u8.into_json_value());
        assert_eq!(json_one(), 1u16.into_json_value());
        assert_eq!(json_one(), 1u32.into_json_value());
        assert_eq!(json_one(), 1u64.into_json_value());

        assert_eq!(json_hundredth(), 0.01.into_json_value());
        assert_eq!(json_hundredth(), 0.01f64.into_json_value());
        assert_eq!(None, f32::NAN.into_json_value());
    }

    /// Reads line-delimited JSON with nested structs and writes it back,
    /// expecting byte-identical output in both struct modes.
    #[test]
    fn test_json_roundtrip_structs() {
        use crate::writer::LineDelimited;
        use arrow_schema::DataType;
        use arrow_schema::Field;
        use arrow_schema::Fields;
        use arrow_schema::Schema;
        use std::sync::Arc;

        // c1: struct { c11: nullable Int32, c12: struct { c121: Utf8 } }, c2: Utf8
        let innermost = DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)].into());
        let c1_type = DataType::Struct(Fields::from(vec![
            Field::new("c11", DataType::Int32, true),
            Field::new("c12", innermost, false),
        ]));
        let schema = Arc::new(Schema::new(vec![
            Field::new("c1", c1_type, false),
            Field::new("c2", DataType::Utf8, false),
        ]));

        // One fixture per struct mode; the second row of each leaves the
        // nullable `c11` unset (absent key in objects, `null` in lists).
        let cases: [(StructMode, &str); 2] = [
            (
                StructMode::ObjectOnly,
                r#"{"c1":{"c11":1,"c12":{"c121":"e"}},"c2":"a"}
{"c1":{"c12":{"c121":"f"}},"c2":"b"}
{"c1":{"c11":5,"c12":{"c121":"g"}},"c2":"c"}
"#,
            ),
            (
                StructMode::ListOnly,
                r#"[[1,["e"]],"a"]
[[null,["f"]],"b"]
[[5,["g"]],"c"]
"#,
            ),
        ];

        for (mode, text) in cases {
            let input = text.as_bytes();
            let reader = ReaderBuilder::new(Arc::clone(&schema))
                .with_struct_mode(mode)
                .build(input)
                .unwrap();

            let mut output: Vec<u8> = Vec::new();
            let mut writer = WriterBuilder::new()
                .with_struct_mode(mode)
                .build::<_, LineDelimited>(&mut output);
            for batch in reader {
                writer.write(&batch.unwrap()).unwrap();
            }
            assert_eq!(input, &output);
        }
    }
}