Skip to main content

arrow_json/
lib.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Transfer data between the Arrow memory format and JSON line-delimited records.
19//!
20//! See the module level documentation for the
21//! [`reader`] and [`writer`] for usage examples.
22//!
23//! # Binary Data uses `Base16` Encoding
24//!
25//! As per [RFC7159] JSON cannot encode arbitrary binary data. This crate works around that
26//! limitation by encoding/decoding binary data as a [hexadecimal] string (i.e.
27//! [`Base16` encoding]).
28//!
//! Note that `Base16` only has 50% space efficiency (i.e., the encoded data is twice as large
//! as the original). If that is an issue, we recommend converting binary data to/from a different
//! encoding format such as `Base64` instead. See the following example for details.
32//!
33//! ## `Base64` Encoding Example
34//!
35//! [`Base64`] is a common [binary-to-text encoding] scheme with a space efficiency of 75%. The
36//! following example shows how to use the [`arrow_cast`] crate to encode binary data to `Base64`
37//! before converting it to JSON and how to decode it back.
38//!
39//! ```
40//! # use std::io::Cursor;
41//! # use std::sync::Arc;
42//! # use arrow_array::{BinaryArray, RecordBatch, StringArray};
43//! # use arrow_array::cast::AsArray;
44//! use arrow_cast::base64::{b64_decode, b64_encode, BASE64_STANDARD};
45//! # use arrow_json::{LineDelimitedWriter, ReaderBuilder};
46//! #
47//! // The data we want to write
48//! let input = BinaryArray::from(vec![b"\xDE\x00\xFF".as_ref()]);
49//!
50//! // Base64 encode it to a string
51//! let encoded: StringArray = b64_encode(&BASE64_STANDARD, &input);
52//!
53//! // Write the StringArray to JSON
54//! let batch = RecordBatch::try_from_iter([("col", Arc::new(encoded) as _)]).unwrap();
55//! let mut buf = Vec::with_capacity(1024);
56//! let mut writer = LineDelimitedWriter::new(&mut buf);
57//! writer.write(&batch).unwrap();
58//! writer.finish().unwrap();
59//!
60//! // Read the JSON data
61//! let cursor = Cursor::new(buf);
62//! let mut reader = ReaderBuilder::new(batch.schema()).build(cursor).unwrap();
63//! let batch = reader.next().unwrap().unwrap();
64//!
65//! // Reverse the base64 encoding
66//! let col: BinaryArray = batch.column(0).as_string::<i32>().clone().into();
67//! let output = b64_decode(&BASE64_STANDARD, &col).unwrap();
68//!
69//! assert_eq!(input, output);
70//! ```
71//!
72//! [RFC7159]: https://datatracker.ietf.org/doc/html/rfc7159#section-8.1
73//! [binary-to-text encoding]: https://en.wikipedia.org/wiki/Binary-to-text_encoding
74//! [hexadecimal]: https://en.wikipedia.org/wiki/Hexadecimal
75//! [`Base16` encoding]: https://en.wikipedia.org/wiki/Base16#Base16
76//! [`Base64`]: https://en.wikipedia.org/wiki/Base64
77
78#![doc(
79    html_logo_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_white-bg.svg",
80    html_favicon_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_transparent-bg.svg"
81)]
82#![cfg_attr(docsrs, feature(doc_cfg))]
83#![deny(rustdoc::broken_intra_doc_links)]
84#![warn(missing_docs)]
85
86pub mod reader;
87pub mod writer;
88
89pub use self::reader::{Reader, ReaderBuilder};
90pub use self::writer::{
91    ArrayWriter, Encoder, EncoderFactory, EncoderOptions, LineDelimitedWriter, Writer,
92    WriterBuilder,
93};
94use half::f16;
95use serde_json::{Number, Value};
96
/// Selects the JSON shape that is considered valid for the rows of a
/// RecordBatch or StructArray, both when reading and when writing.
///
/// The chosen mode decides which form the Reader accepts and which form the
/// Writer produces. Taking a RecordBatch Schema of
/// `[("a", Int32), ("r", Struct("b": Boolean, "c": Utf8))]` as an example:
///
/// * with [`StructMode::ObjectOnly`] a Reader reads rows of the form
///   `{"a": 1, "r": {"b": true, "c": "cat"}}`
/// * with [`StructMode::ListOnly`] a Reader reads rows of the form
///   `[1, [true, "cat"]]`
///
/// A Writer produces rows formatted in the same way.
///
/// When the schema is known the list encoding is the more compact of the two;
/// it is the form used by tools such as [Presto] and [Trino].
///
/// Key order is irrelevant when reading objects. When reading lists, the
/// entries must match the struct fields in both count and order. Map columns
/// are not affected by this option.
///
/// [Presto]: https://prestodb.io/docs/current/develop/client-protocol.html#important-queryresults-attributes
/// [Trino]: https://trino.io/docs/current/develop/client-protocol.html#important-queryresults-attributes
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum StructMode {
    /// Structs are encoded/decoded as JSON objects (e.g., `{"a": 1, "b": "c"}`).
    ///
    /// This is the default mode.
    #[default]
    ObjectOnly,
    /// Structs are encoded/decoded as JSON lists (e.g., `[1, "c"]`).
    ListOnly,
}
125
/// Trait declaring any type that is serializable to JSON. This includes all primitive types (bool, i32, etc.).
///
/// Implementors must be `'static`. The conversion consumes the value.
pub trait JsonSerializable: 'static {
    /// Converts `self` into a JSON value if that is possible, and returns
    /// `None` otherwise.
    fn into_json_value(self) -> Option<Value>;
}
131
132macro_rules! json_serializable {
133    ($t:ty) => {
134        impl JsonSerializable for $t {
135            fn into_json_value(self) -> Option<Value> {
136                Some(self.into())
137            }
138        }
139    };
140}
141
142json_serializable!(bool);
143json_serializable!(u8);
144json_serializable!(u16);
145json_serializable!(u32);
146json_serializable!(u64);
147json_serializable!(i8);
148json_serializable!(i16);
149json_serializable!(i32);
150json_serializable!(i64);
151
152impl JsonSerializable for i128 {
153    fn into_json_value(self) -> Option<Value> {
154        // Serialize as string to avoid issues with arbitrary_precision serde_json feature
155        // - https://github.com/serde-rs/json/issues/559
156        // - https://github.com/serde-rs/json/issues/845
157        // - https://github.com/serde-rs/json/issues/846
158        Some(self.to_string().into())
159    }
160}
161
162impl JsonSerializable for f16 {
163    fn into_json_value(self) -> Option<Value> {
164        Number::from_f64(f64::round(f64::from(self) * 1000.0) / 1000.0).map(Value::Number)
165    }
166}
167
168impl JsonSerializable for f32 {
169    fn into_json_value(self) -> Option<Value> {
170        Number::from_f64(f64::round(self as f64 * 1000.0) / 1000.0).map(Value::Number)
171    }
172}
173
174impl JsonSerializable for f64 {
175    fn into_json_value(self) -> Option<Value> {
176        Number::from_f64(self).map(Value::Number)
177    }
178}
179
180#[cfg(test)]
181mod tests {
182    use super::*;
183    use crate::writer::JsonArray;
184    use crate::writer::LineDelimited;
185    use arrow_array::{
186        ArrayRef, GenericBinaryArray, GenericByteViewArray, GenericListViewArray, RecordBatch,
187        RecordBatchWriter, builder::FixedSizeBinaryBuilder, types::BinaryViewType,
188    };
189    use arrow_schema::{DataType, Field, Fields, Schema};
190    use serde_json::Value::{Bool, Number as VNumber, String as VString};
191    use std::io::Cursor;
192    use std::sync::Arc;
193
194    #[test]
195    fn test_arrow_native_type_to_json() {
196        assert_eq!(Some(Bool(true)), true.into_json_value());
197        assert_eq!(Some(VNumber(Number::from(1))), 1i8.into_json_value());
198        assert_eq!(Some(VNumber(Number::from(1))), 1i16.into_json_value());
199        assert_eq!(Some(VNumber(Number::from(1))), 1i32.into_json_value());
200        assert_eq!(Some(VNumber(Number::from(1))), 1i64.into_json_value());
201        assert_eq!(Some(VString("1".to_string())), 1i128.into_json_value());
202        assert_eq!(Some(VNumber(Number::from(1))), 1u8.into_json_value());
203        assert_eq!(Some(VNumber(Number::from(1))), 1u16.into_json_value());
204        assert_eq!(Some(VNumber(Number::from(1))), 1u32.into_json_value());
205        assert_eq!(Some(VNumber(Number::from(1))), 1u64.into_json_value());
206        assert_eq!(
207            Some(VNumber(Number::from_f64(0.01f64).unwrap())),
208            0.01.into_json_value()
209        );
210        assert_eq!(
211            Some(VNumber(Number::from_f64(0.01f64).unwrap())),
212            0.01f64.into_json_value()
213        );
214        assert_eq!(None, f32::NAN.into_json_value());
215    }
216
217    #[test]
218    fn test_json_roundtrip_structs() {
219        let schema = Arc::new(Schema::new(vec![
220            Field::new(
221                "c1",
222                DataType::Struct(Fields::from(vec![
223                    Field::new("c11", DataType::Int32, true),
224                    Field::new(
225                        "c12",
226                        DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)].into()),
227                        false,
228                    ),
229                ])),
230                false,
231            ),
232            Field::new("c2", DataType::Utf8, false),
233        ]));
234
235        {
236            let object_input = r#"{"c1":{"c11":1,"c12":{"c121":"e"}},"c2":"a"}
237{"c1":{"c12":{"c121":"f"}},"c2":"b"}
238{"c1":{"c11":5,"c12":{"c121":"g"}},"c2":"c"}
239"#
240            .as_bytes();
241            let object_reader = ReaderBuilder::new(schema.clone())
242                .with_struct_mode(StructMode::ObjectOnly)
243                .build(object_input)
244                .unwrap();
245
246            let mut object_output: Vec<u8> = Vec::new();
247            let mut object_writer = WriterBuilder::new()
248                .with_struct_mode(StructMode::ObjectOnly)
249                .build::<_, LineDelimited>(&mut object_output);
250            for batch_res in object_reader {
251                object_writer.write(&batch_res.unwrap()).unwrap();
252            }
253            assert_eq!(object_input, &object_output);
254        }
255
256        {
257            let list_input = r#"[[1,["e"]],"a"]
258[[null,["f"]],"b"]
259[[5,["g"]],"c"]
260"#
261            .as_bytes();
262            let list_reader = ReaderBuilder::new(schema.clone())
263                .with_struct_mode(StructMode::ListOnly)
264                .build(list_input)
265                .unwrap();
266
267            let mut list_output: Vec<u8> = Vec::new();
268            let mut list_writer = WriterBuilder::new()
269                .with_struct_mode(StructMode::ListOnly)
270                .build::<_, LineDelimited>(&mut list_output);
271            for batch_res in list_reader {
272                list_writer.write(&batch_res.unwrap()).unwrap();
273            }
274            assert_eq!(list_input, &list_output);
275        }
276    }
277
278    #[test]
279    #[allow(invalid_from_utf8)]
280    fn test_json_roundtrip_binary() {
281        let not_utf8: &[u8] = b"Not UTF8 \xa0\xa1!";
282        assert!(str::from_utf8(not_utf8).is_err());
283
284        let values: &[Option<&[u8]>] = &[
285            Some(b"Ned Flanders" as &[u8]),
286            None,
287            Some(b"Troy McClure" as &[u8]),
288            Some(not_utf8),
289        ];
290        // Binary:
291        assert_binary_json(Arc::new(GenericBinaryArray::<i32>::from_iter(values)));
292
293        // LargeBinary:
294        assert_binary_json(Arc::new(GenericBinaryArray::<i64>::from_iter(values)));
295
296        // FixedSizeBinary:
297        assert_binary_json(build_array_fixed_size_binary(12, values));
298
299        // BinaryView:
300        assert_binary_json(Arc::new(GenericByteViewArray::<BinaryViewType>::from_iter(
301            values,
302        )));
303    }
304
305    fn build_array_fixed_size_binary(byte_width: i32, values: &[Option<&[u8]>]) -> ArrayRef {
306        let mut builder = FixedSizeBinaryBuilder::new(byte_width);
307        for value in values {
308            match value {
309                Some(v) => builder.append_value(v).unwrap(),
310                None => builder.append_null(),
311            }
312        }
313        Arc::new(builder.finish())
314    }
315
316    fn assert_binary_json(array: ArrayRef) {
317        // encode and check JSON with and without explicit nulls
318        assert_binary_json_with_writer(
319            array.clone(),
320            WriterBuilder::new().with_explicit_nulls(true),
321        );
322        assert_binary_json_with_writer(array, WriterBuilder::new().with_explicit_nulls(false));
323    }
324
325    fn assert_binary_json_with_writer(array: ArrayRef, builder: WriterBuilder) {
326        let batch = RecordBatch::try_from_iter([("bytes", array)]).unwrap();
327
328        let mut buf = Vec::new();
329        let json_value: Value = {
330            let mut writer = builder.build::<_, JsonArray>(&mut buf);
331            writer.write(&batch).unwrap();
332            writer.close().unwrap();
333            serde_json::from_slice(&buf).unwrap()
334        };
335
336        let json_array = json_value.as_array().unwrap();
337
338        let decoded = {
339            let mut decoder = ReaderBuilder::new(batch.schema().clone())
340                .build_decoder()
341                .unwrap();
342            decoder.serialize(json_array).unwrap();
343            decoder.flush().unwrap().unwrap()
344        };
345
346        assert_eq!(batch, decoded);
347    }
348
349    fn assert_list_view_roundtrip<O: arrow_array::OffsetSizeTrait>() {
350        let flat_field = Arc::new(Field::new("item", DataType::Int32, true));
351        let flat_dt = GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(flat_field);
352
353        let nested_inner = Arc::new(Field::new("item", DataType::Int32, false));
354        let nested_inner_dt = GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(nested_inner);
355        let nested_outer = Arc::new(Field::new("item", nested_inner_dt, true));
356        let nested_dt = GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(nested_outer);
357
358        let schema = Arc::new(Schema::new(vec![
359            Field::new("flat", flat_dt, true),
360            Field::new("nested", nested_dt, true),
361        ]));
362
363        let input = r#"{"flat":[1,2,3],"nested":[[1,2],[3]]}
364{"flat":[4,null]}
365{}
366{"flat":[6],"nested":[[4,5,6]]}
367{"flat":[]}
368"#
369        .as_bytes();
370
371        let batches: Vec<RecordBatch> = ReaderBuilder::new(schema.clone())
372            .with_batch_size(1024)
373            .build(Cursor::new(input))
374            .unwrap()
375            .collect::<Result<Vec<_>, _>>()
376            .unwrap();
377
378        let mut output = Vec::new();
379        let mut writer = WriterBuilder::new().build::<_, LineDelimited>(&mut output);
380        for batch in &batches {
381            writer.write(batch).unwrap();
382        }
383        writer.finish().unwrap();
384
385        assert_eq!(input, &output);
386    }
387
    /// Runs the list-view JSON round-trip check once with `i32` and once with
    /// `i64` as the offset type.
    #[test]
    fn test_json_roundtrip_list_view() {
        assert_list_view_roundtrip::<i32>();
        assert_list_view_roundtrip::<i64>();
    }
393}