arrow_json/reader/
tape.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::reader::serializer::TapeSerializer;
19use arrow_schema::ArrowError;
20use memchr::memchr2;
21use serde::Serialize;
22use std::fmt::Write;
23
/// We decode JSON to a flattened tape representation,
/// allowing for efficient traversal of the JSON data
///
/// This approach is inspired by [simdjson]
///
/// Uses `u32` for offsets to ensure `TapeElement` is 64-bits. A future
/// iteration may increase this to a custom `u56` type.
///
/// [simdjson]: https://github.com/simdjson/simdjson/blob/master/doc/tape.md
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum TapeElement {
    /// The start of an object, i.e. `{`
    ///
    /// Contains the tape index of the corresponding [`Self::EndObject`]
    StartObject(u32),
    /// The end of an object, i.e. `}`
    ///
    /// Contains the tape index of the corresponding [`Self::StartObject`]
    EndObject(u32),
    /// The start of a list, i.e. `[`
    ///
    /// Contains the tape index of the corresponding [`Self::EndList`]
    StartList(u32),
    /// The end of a list, i.e. `]`
    ///
    /// Contains the tape index of the corresponding [`Self::StartList`]
    EndList(u32),
    /// A string value
    ///
    /// Contains the index of the string within the [`Tape`] string data,
    /// which can be resolved with [`Tape::get_string`]
    String(u32),
    /// A numeric value, stored in its textual form
    ///
    /// Contains the index of the number's text within the [`Tape`] string data,
    /// which can be resolved with [`Tape::get_string`]
    Number(u32),

    /// The high 32 bits of an `i64`
    ///
    /// Followed by [`Self::I32`] containing the low 32 bits
    I64(i32),

    /// A 32-bit signed integer
    ///
    /// May be preceded by [`Self::I64`] containing the high bits, in which
    /// case this holds the low 32 bits of the combined `i64`
    I32(i32),

    /// The high 32 bits of the bit pattern of a 64-bit float
    ///
    /// Followed by [`Self::F32`] containing the low 32 bits
    F64(u32),

    /// The bit pattern of a 32-bit float, or the low 32 bits of the bit pattern
    /// of a 64-bit float if preceded by [`Self::F64`]
    F32(u32),

    /// A true literal
    True,
    /// A false literal
    False,
    /// A null literal
    Null,
}
85
/// A decoded JSON tape
///
/// String and numeric data is stored alongside an array of [`TapeElement`]
///
/// The first element is always [`TapeElement::Null`]
///
/// This approach to decoding JSON is inspired by [simdjson]
///
/// [simdjson]: https://github.com/simdjson/simdjson/blob/master/doc/tape.md
#[derive(Debug)]
pub struct Tape<'a> {
    /// The flattened sequence of tape elements
    elements: &'a [TapeElement],
    /// The concatenated text of all strings and numbers
    strings: &'a str,
    /// Byte offsets into `strings`; string `i` spans
    /// `string_offsets[i]..string_offsets[i + 1]`
    string_offsets: &'a [usize],
    /// The number of rows reported by [`Tape::num_rows`]
    num_rows: usize,
}
102
103impl<'a> Tape<'a> {
104    /// Returns the string for the given string index
105    #[inline]
106    pub fn get_string(&self, idx: u32) -> &'a str {
107        let end_offset = self.string_offsets[idx as usize + 1];
108        let start_offset = self.string_offsets[idx as usize];
109        // SAFETY:
110        // Verified offsets
111        unsafe { self.strings.get_unchecked(start_offset..end_offset) }
112    }
113
    /// Returns the tape element at `idx`
    ///
    /// # Panics
    ///
    /// Panics if `idx` is out of bounds for the tape
    pub fn get(&self, idx: u32) -> TapeElement {
        self.elements[idx as usize]
    }
118
119    /// Returns the index of the next field at the same level as `cur_idx`
120    ///
121    /// Return an error if `cur_idx` is not the start of a field
122    pub fn next(&self, cur_idx: u32, expected: &str) -> Result<u32, ArrowError> {
123        match self.get(cur_idx) {
124            TapeElement::String(_)
125            | TapeElement::Number(_)
126            | TapeElement::True
127            | TapeElement::False
128            | TapeElement::Null
129            | TapeElement::I32(_)
130            | TapeElement::F32(_) => Ok(cur_idx + 1),
131            TapeElement::I64(_) | TapeElement::F64(_) => Ok(cur_idx + 2),
132            TapeElement::StartList(end_idx) => Ok(end_idx + 1),
133            TapeElement::StartObject(end_idx) => Ok(end_idx + 1),
134            TapeElement::EndObject(_) | TapeElement::EndList(_) => {
135                Err(self.error(cur_idx, expected))
136            }
137        }
138    }
139
    /// Returns the number of rows recorded for this tape
    pub fn num_rows(&self) -> usize {
        self.num_rows
    }
144
145    /// Serialize the tape element at index `idx` to `out` returning the next field index
146    fn serialize(&self, out: &mut String, idx: u32) -> u32 {
147        match self.get(idx) {
148            TapeElement::StartObject(end) => {
149                out.push('{');
150                let mut cur_idx = idx + 1;
151                while cur_idx < end {
152                    cur_idx = self.serialize(out, cur_idx);
153                    out.push_str(": ");
154                    cur_idx = self.serialize(out, cur_idx);
155                }
156                out.push('}');
157                return end + 1;
158            }
159            TapeElement::EndObject(_) => out.push('}'),
160            TapeElement::StartList(end) => {
161                out.push('[');
162                let mut cur_idx = idx + 1;
163                while cur_idx < end {
164                    cur_idx = self.serialize(out, cur_idx);
165                    if cur_idx < end {
166                        out.push_str(", ");
167                    }
168                }
169                out.push(']');
170                return end + 1;
171            }
172            TapeElement::EndList(_) => out.push(']'),
173            TapeElement::String(s) => {
174                out.push('"');
175                out.push_str(self.get_string(s));
176                out.push('"')
177            }
178            TapeElement::Number(n) => out.push_str(self.get_string(n)),
179            TapeElement::True => out.push_str("true"),
180            TapeElement::False => out.push_str("false"),
181            TapeElement::Null => out.push_str("null"),
182            TapeElement::I64(high) => match self.get(idx + 1) {
183                TapeElement::I32(low) => {
184                    let val = ((high as i64) << 32) | (low as u32) as i64;
185                    let _ = write!(out, "{val}");
186                    return idx + 2;
187                }
188                _ => unreachable!(),
189            },
190            TapeElement::I32(val) => {
191                let _ = write!(out, "{val}");
192            }
193            TapeElement::F64(high) => match self.get(idx + 1) {
194                TapeElement::F32(low) => {
195                    let val = f64::from_bits(((high as u64) << 32) | low as u64);
196                    let _ = write!(out, "{val}");
197                    return idx + 2;
198                }
199                _ => unreachable!(),
200            },
201            TapeElement::F32(val) => {
202                let _ = write!(out, "{}", f32::from_bits(val));
203            }
204        }
205        idx + 1
206    }
207
208    /// Returns an error reading index `idx`
209    pub fn error(&self, idx: u32, expected: &str) -> ArrowError {
210        let mut out = String::with_capacity(64);
211        self.serialize(&mut out, idx);
212        ArrowError::JsonError(format!("expected {expected} got {out}"))
213    }
214}
215
/// States based on <https://www.json.org/json-en.html>
#[derive(Debug, Copy, Clone)]
enum DecoderState {
    /// Decoding an object
    ///
    /// Contains index of start [`TapeElement::StartObject`]
    Object(u32),
    /// Decoding a list
    ///
    /// Contains index of start [`TapeElement::StartList`]
    List(u32),
    /// Decoding the contents of a string, following its opening `"`
    String,
    /// Expecting the start of a value
    Value,
    /// Decoding a number, the first byte of which has already been buffered
    Number,
    /// Expecting the `:` that separates an object key from its value
    Colon,
    /// Decoding the byte that follows a `\` within a string
    Escape,
    /// A unicode escape sequence,
    ///
    /// Consists of a `(high surrogate, low surrogate, decoded length)`
    ///
    /// The first field holds the first `\uXXXX` code unit; the second is only
    /// populated when the first turns out not to be a valid character on its own.
    Unicode(u16, u16, u8),
    /// A boolean or null literal
    ///
    /// Consists of `(literal, decoded length)`
    Literal(Literal, u8),
}
241
242impl DecoderState {
243    fn as_str(&self) -> &'static str {
244        match self {
245            DecoderState::Object(_) => "object",
246            DecoderState::List(_) => "list",
247            DecoderState::String => "string",
248            DecoderState::Value => "value",
249            DecoderState::Number => "number",
250            DecoderState::Colon => "colon",
251            DecoderState::Escape => "escape",
252            DecoderState::Unicode(_, _, _) => "unicode literal",
253            DecoderState::Literal(d, _) => d.as_str(),
254        }
255    }
256}
257
/// The keyword literals recognised by the decoder
#[derive(Debug, Copy, Clone)]
enum Literal {
    /// The `null` keyword
    Null,
    /// The `true` keyword
    True,
    /// The `false` keyword
    False,
}
264
265impl Literal {
266    fn element(&self) -> TapeElement {
267        match self {
268            Literal::Null => TapeElement::Null,
269            Literal::True => TapeElement::True,
270            Literal::False => TapeElement::False,
271        }
272    }
273
274    fn as_str(&self) -> &'static str {
275        match self {
276            Literal::Null => "null",
277            Literal::True => "true",
278            Literal::False => "false",
279        }
280    }
281
282    fn bytes(&self) -> &'static [u8] {
283        self.as_str().as_bytes()
284    }
285}
286
/// Evaluates to the next item of the given iterator, or `break`s out of the
/// innermost enclosing loop once the iterator is exhausted
macro_rules! next {
    ($next:ident) => {
        if let Some(b) = $next.next() {
            b
        } else {
            break
        }
    };
}
296
/// Implements a state machine for decoding JSON to a tape
pub struct TapeDecoder {
    /// The tape elements decoded so far, always starting with [`TapeElement::Null`]
    elements: Vec<TapeElement>,

    /// The number of rows decoded, including any in progress if `!stack.is_empty()`
    cur_row: usize,

    /// Number of rows to read per batch
    batch_size: usize,

    /// A buffer of parsed string data
    ///
    /// Note: if part way through a record, i.e. `stack` is not empty,
    /// this may contain truncated UTF-8 data
    bytes: Vec<u8>,

    /// Offsets into `bytes`
    ///
    /// Always starts with `0`; a further offset is appended after each
    /// completed string or number
    offsets: Vec<usize>,

    /// A stack of [`DecoderState`]
    ///
    /// Empty whenever the decoder is between rows
    stack: Vec<DecoderState>,
}
319
320impl TapeDecoder {
321    /// Create a new [`TapeDecoder`] with the provided batch size
322    /// and an estimated number of fields in each row
323    pub fn new(batch_size: usize, num_fields: usize) -> Self {
324        let tokens_per_row = 2 + num_fields * 2;
325        let mut offsets = Vec::with_capacity(batch_size * (num_fields * 2) + 1);
326        offsets.push(0);
327
328        let mut elements = Vec::with_capacity(batch_size * tokens_per_row);
329        elements.push(TapeElement::Null);
330
331        Self {
332            offsets,
333            elements,
334            batch_size,
335            cur_row: 0,
336            bytes: Vec::with_capacity(num_fields * 2 * 8),
337            stack: Vec::with_capacity(10),
338        }
339    }
340
    /// Decodes JSON from `buf` onto the tape, returning the number of bytes consumed
    ///
    /// Decoding stops when `buf` is exhausted, or before starting a new row once
    /// `batch_size` rows have been buffered. The state of a partially decoded row
    /// is kept on the internal stack, so a later call can resume with the bytes
    /// that follow.
    ///
    /// # Errors
    ///
    /// Returns an error on encountering a byte that is not valid in the current state
    pub fn decode(&mut self, buf: &[u8]) -> Result<usize, ArrowError> {
        let mut iter = BufIter::new(buf);

        while !iter.is_empty() {
            let state = match self.stack.last_mut() {
                Some(l) => l,
                None => {
                    // Between rows: skip separating whitespace and stop if there is
                    // nothing left to read or the batch is already full
                    iter.skip_whitespace();
                    if iter.is_empty() || self.cur_row >= self.batch_size {
                        break;
                    }

                    // Start of row
                    self.cur_row += 1;
                    self.stack.push(DecoderState::Value);
                    self.stack.last_mut().unwrap()
                }
            };

            match state {
                // Decoding an object
                DecoderState::Object(start_idx) => {
                    // Commas are skipped together with whitespace
                    iter.advance_until(|b| !json_whitespace(b) && b != b',');
                    match next!(iter) {
                        b'"' => {
                            // Pushed in reverse order: the key string is decoded first,
                            // then the colon, then the value
                            self.stack.push(DecoderState::Value);
                            self.stack.push(DecoderState::Colon);
                            self.stack.push(DecoderState::String);
                        }
                        b'}' => {
                            // Link the start and end elements to each other
                            let start_idx = *start_idx;
                            let end_idx = self.elements.len() as u32;
                            self.elements[start_idx as usize] = TapeElement::StartObject(end_idx);
                            self.elements.push(TapeElement::EndObject(start_idx));
                            self.stack.pop();
                        }
                        b => return Err(err(b, "parsing object")),
                    }
                }
                // Decoding a list
                DecoderState::List(start_idx) => {
                    // Commas are skipped together with whitespace
                    iter.advance_until(|b| !json_whitespace(b) && b != b',');
                    match iter.peek() {
                        Some(b']') => {
                            iter.next();
                            // Link the start and end elements to each other
                            let start_idx = *start_idx;
                            let end_idx = self.elements.len() as u32;
                            self.elements[start_idx as usize] = TapeElement::StartList(end_idx);
                            self.elements.push(TapeElement::EndList(start_idx));
                            self.stack.pop();
                        }
                        // Leave the byte in place for the value state to consume
                        Some(_) => self.stack.push(DecoderState::Value),
                        None => break,
                    }
                }
                // Decoding a string
                DecoderState::String => {
                    // Copy everything up to the next escape or closing quote
                    let s = iter.skip_chrs(b'\\', b'"');
                    self.bytes.extend_from_slice(s);

                    match next!(iter) {
                        b'\\' => self.stack.push(DecoderState::Escape),
                        b'"' => {
                            // The string's index is that of its start offset
                            let idx = self.offsets.len() - 1;
                            self.elements.push(TapeElement::String(idx as _));
                            self.offsets.push(self.bytes.len());
                            self.stack.pop();
                        }
                        b => unreachable!("{}", b),
                    }
                }
                state @ DecoderState::Value => {
                    iter.skip_whitespace();
                    // The value state is replaced in place by the state for the
                    // kind of value indicated by its first byte
                    *state = match next!(iter) {
                        b'"' => DecoderState::String,
                        b @ b'-' | b @ b'0'..=b'9' => {
                            self.bytes.push(b);
                            DecoderState::Number
                        }
                        // The first byte of the literal has already been matched
                        b'n' => DecoderState::Literal(Literal::Null, 1),
                        b'f' => DecoderState::Literal(Literal::False, 1),
                        b't' => DecoderState::Literal(Literal::True, 1),
                        b'[' => {
                            // The end index is a placeholder until the list is closed
                            let idx = self.elements.len() as u32;
                            self.elements.push(TapeElement::StartList(u32::MAX));
                            DecoderState::List(idx)
                        }
                        b'{' => {
                            // The end index is a placeholder until the object is closed
                            let idx = self.elements.len() as u32;
                            self.elements.push(TapeElement::StartObject(u32::MAX));
                            DecoderState::Object(idx)
                        }
                        b => return Err(err(b, "parsing value")),
                    };
                }
                DecoderState::Number => {
                    let s = iter.advance_until(|b| {
                        !matches!(b, b'0'..=b'9' | b'-' | b'+' | b'.' | b'e' | b'E')
                    });
                    self.bytes.extend_from_slice(s);

                    // The number is only known to be complete once a byte that
                    // cannot be part of it has been seen
                    if !iter.is_empty() {
                        self.stack.pop();
                        let idx = self.offsets.len() - 1;
                        self.elements.push(TapeElement::Number(idx as _));
                        self.offsets.push(self.bytes.len());
                    }
                }
                DecoderState::Colon => {
                    iter.skip_whitespace();
                    match next!(iter) {
                        b':' => self.stack.pop(),
                        b => return Err(err(b, "parsing colon")),
                    };
                }
                DecoderState::Literal(literal, idx) => {
                    let bytes = literal.bytes();
                    // Resume from the number of bytes already matched
                    let expected = bytes.iter().skip(*idx as usize).copied();
                    for (expected, b) in expected.zip(&mut iter) {
                        match b == expected {
                            true => *idx += 1,
                            false => return Err(err(b, "parsing literal")),
                        }
                    }
                    if *idx == bytes.len() as u8 {
                        let element = literal.element();
                        self.stack.pop();
                        self.elements.push(element);
                    }
                }
                DecoderState::Escape => {
                    let v = match next!(iter) {
                        b'u' => {
                            // Swap the escape state for a unicode escape state
                            self.stack.pop();
                            self.stack.push(DecoderState::Unicode(0, 0, 0));
                            continue;
                        }
                        b'"' => b'"',
                        b'\\' => b'\\',
                        b'/' => b'/',
                        b'b' => 8,  // BS
                        b'f' => 12, // FF
                        b'n' => b'\n',
                        b'r' => b'\r',
                        b't' => b'\t',
                        b => return Err(err(b, "parsing escape sequence")),
                    };

                    self.stack.pop();
                    self.bytes.push(v);
                }
                // Parse a unicode escape sequence
                DecoderState::Unicode(high, low, idx) => loop {
                    // `idx` counts the bytes consumed after the initial `\u`, so
                    // decoding can resume if the input ends part way through
                    match *idx {
                        // The four hex digits of the first code unit
                        0..=3 => *high = (*high << 4) | parse_hex(next!(iter))? as u16,
                        4 => {
                            // A code unit that is a valid character on its own is complete
                            if let Some(c) = char::from_u32(*high as u32) {
                                write_char(c, &mut self.bytes);
                                self.stack.pop();
                                break;
                            }

                            // Otherwise a second `\uXXXX` escape must follow
                            match next!(iter) {
                                b'\\' => {}
                                b => return Err(err(b, "parsing surrogate pair escape")),
                            }
                        }
                        5 => match next!(iter) {
                            b'u' => {}
                            b => return Err(err(b, "parsing surrogate pair unicode")),
                        },
                        // The four hex digits of the second code unit
                        6..=9 => *low = (*low << 4) | parse_hex(next!(iter))? as u16,
                        _ => {
                            let c = char_from_surrogate_pair(*low, *high)?;
                            write_char(c, &mut self.bytes);
                            self.stack.pop();
                            break;
                        }
                    }
                    *idx += 1;
                },
            }
        }

        Ok(buf.len() - iter.len())
    }
527
528    /// Writes any type that implements [`Serialize`] into this [`TapeDecoder`]
529    pub fn serialize<S: Serialize>(&mut self, rows: &[S]) -> Result<(), ArrowError> {
530        if let Some(b) = self.stack.last() {
531            return Err(ArrowError::JsonError(format!(
532                "Cannot serialize to tape containing partial decode state {}",
533                b.as_str()
534            )));
535        }
536
537        let mut serializer =
538            TapeSerializer::new(&mut self.elements, &mut self.bytes, &mut self.offsets);
539
540        rows.iter()
541            .try_for_each(|row| row.serialize(&mut serializer))
542            .map_err(|e| ArrowError::JsonError(e.to_string()))?;
543
544        self.cur_row += rows.len();
545
546        Ok(())
547    }
548
    /// The number of buffered rows, including the partially decoded row (if any).
    ///
    /// Reset to zero by [`Self::clear`].
    pub fn num_buffered_rows(&self) -> usize {
        self.cur_row
    }
553
    /// True if the decoder is part way through decoding a row. If so, calling [`Self::finish`]
    /// would return an error.
    ///
    /// This is the case whenever the internal state stack is not empty.
    pub fn has_partial_row(&self) -> bool {
        !self.stack.is_empty()
    }
559
560    /// Finishes the current [`Tape`]
561    pub fn finish(&self) -> Result<Tape<'_>, ArrowError> {
562        if let Some(b) = self.stack.last() {
563            return Err(ArrowError::JsonError(format!(
564                "Truncated record whilst reading {}",
565                b.as_str()
566            )));
567        }
568
569        if self.offsets.len() >= u32::MAX as usize {
570            return Err(ArrowError::JsonError(format!(
571                "Encountered more than {} bytes of string data, consider using a smaller batch size",
572                u32::MAX
573            )));
574        }
575
576        if self.offsets.len() >= u32::MAX as usize {
577            return Err(ArrowError::JsonError(format!(
578                "Encountered more than {} JSON elements, consider using a smaller batch size",
579                u32::MAX
580            )));
581        }
582
583        // Sanity check
584        assert_eq!(
585            self.offsets.last().copied().unwrap_or_default(),
586            self.bytes.len()
587        );
588
589        let strings = simdutf8::basic::from_utf8(&self.bytes)
590            .map_err(|_| ArrowError::JsonError("Encountered non-UTF-8 data".to_string()))?;
591
592        for offset in self.offsets.iter().copied() {
593            if !strings.is_char_boundary(offset) {
594                return Err(ArrowError::JsonError(
595                    "Encountered truncated UTF-8 sequence".to_string(),
596                ));
597            }
598        }
599
600        Ok(Tape {
601            strings,
602            elements: &self.elements,
603            string_offsets: &self.offsets,
604            num_rows: self.cur_row,
605        })
606    }
607
608    /// Clears this [`TapeDecoder`] in preparation to read the next batch
609    pub fn clear(&mut self) {
610        assert!(self.stack.is_empty());
611
612        self.cur_row = 0;
613        self.bytes.clear();
614        self.elements.clear();
615        self.elements.push(TapeElement::Null);
616        self.offsets.clear();
617        self.offsets.push(0);
618    }
619}
620
/// A wrapper around a slice iterator that provides some helper functionality
struct BufIter<'a> {
    /// The complete input buffer
    buf: &'a [u8],
    /// The position of the next byte to read from `buf`
    pos: usize,
}
626
627impl<'a> BufIter<'a> {
628    fn new(buf: &'a [u8]) -> Self {
629        Self { buf, pos: 0 }
630    }
631
632    #[inline]
633    fn as_slice(&self) -> &'a [u8] {
634        &self.buf[self.pos..]
635    }
636
637    #[inline]
638    fn is_empty(&self) -> bool {
639        self.pos >= self.buf.len()
640    }
641
642    fn peek(&self) -> Option<u8> {
643        self.buf.get(self.pos).copied()
644    }
645
646    #[inline]
647    fn advance(&mut self, skip: usize) {
648        self.pos += skip;
649    }
650
651    fn advance_until<F: FnMut(u8) -> bool>(&mut self, f: F) -> &[u8] {
652        let s = self.as_slice();
653        match s.iter().copied().position(f) {
654            Some(x) => {
655                self.advance(x);
656                &s[..x]
657            }
658            None => {
659                self.advance(s.len());
660                s
661            }
662        }
663    }
664
665    fn skip_chrs(&mut self, c1: u8, c2: u8) -> &[u8] {
666        let s = self.as_slice();
667        match memchr2(c1, c2, s) {
668            Some(p) => {
669                self.advance(p);
670                &s[..p]
671            }
672            None => {
673                self.advance(s.len());
674                s
675            }
676        }
677    }
678
679    fn skip_whitespace(&mut self) {
680        self.advance_until(|b| !json_whitespace(b));
681    }
682}
683
684impl Iterator for BufIter<'_> {
685    type Item = u8;
686
687    fn next(&mut self) -> Option<Self::Item> {
688        let b = self.peek();
689        self.pos += 1;
690        b
691    }
692
693    fn size_hint(&self) -> (usize, Option<usize>) {
694        let s = self.buf.len().checked_sub(self.pos).unwrap_or_default();
695        (s, Some(s))
696    }
697}
698
// `size_hint` reports identical lower and upper bounds, so the provided
// `len()` can be used to obtain the number of bytes remaining
impl ExactSizeIterator for BufIter<'_> {}
700
701/// Returns an error for a given byte `b` and context `ctx`
702fn err(b: u8, ctx: &str) -> ArrowError {
703    ArrowError::JsonError(format!(
704        "Encountered unexpected '{}' whilst {ctx}",
705        b as char
706    ))
707}
708
709/// Creates a character from an UTF-16 surrogate pair
710fn char_from_surrogate_pair(low: u16, high: u16) -> Result<char, ArrowError> {
711    match (low, high) {
712        (0xDC00..=0xDFFF, 0xD800..=0xDBFF) => {
713            let n = (((high - 0xD800) as u32) << 10) | ((low - 0xDC00) as u32 + 0x1_0000);
714            char::from_u32(n)
715                .ok_or_else(|| ArrowError::JsonError(format!("Invalid UTF-16 surrogate pair {n}")))
716        }
717        _ => Err(ArrowError::JsonError(format!(
718            "Invalid UTF-16 surrogate pair. High: {high:#02X}, Low: {low:#02X}"
719        ))),
720    }
721}
722
/// Writes `c` as UTF-8 to `out`
///
/// The encoded bytes are appended to the existing contents of `out`.
fn write_char(c: char, out: &mut Vec<u8>) {
    // A single character takes at most four bytes in UTF-8
    let mut scratch = [0u8; 4];
    let encoded = c.encode_utf8(&mut scratch);
    out.extend_from_slice(encoded.as_bytes());
}
728
/// Evaluates to true if `b` is a valid JSON whitespace character,
/// i.e. a space, line feed, carriage return or horizontal tab
#[inline]
fn json_whitespace(b: u8) -> bool {
    b == b' ' || b == b'\n' || b == b'\r' || b == b'\t'
}
734
735/// Parse a hex character to `u8`
736fn parse_hex(b: u8) -> Result<u8, ArrowError> {
737    let digit = char::from(b)
738        .to_digit(16)
739        .ok_or_else(|| err(b, "unicode escape"))?;
740    Ok(digit as u8)
741}
742
743#[cfg(test)]
744mod tests {
745    use super::*;
746
747    #[test]
748    fn test_sizes() {
749        assert_eq!(std::mem::size_of::<DecoderState>(), 8);
750        assert_eq!(std::mem::size_of::<TapeElement>(), 8);
751    }
752
    /// Decodes several whitespace-separated rows in a single call and checks the
    /// exact tape elements, string data and string offsets that are produced
    #[test]
    fn test_basic() {
        // Seven rows covering scalars, empty strings and nested objects and lists
        let a = r#"
        {"hello": "world", "foo": 2, "bar": 45}

        {"foo": "bar"}

        {"fiz": null}

        {"a": true, "b": false, "c": null}

        {"a": "", "": "a"}

        {"a": "b", "object": {"nested": "hello", "foo": 23}, "b": {}, "c": {"foo": null }}

        {"a": ["", "foo", ["bar", "c"]], "b": {"1": []}, "c": {"2": [1, 2, 3]} }
        "#;
        let mut decoder = TapeDecoder::new(16, 2);
        decoder.decode(a.as_bytes()).unwrap();
        assert!(!decoder.has_partial_row());
        assert_eq!(decoder.num_buffered_rows(), 7);

        let finished = decoder.finish().unwrap();
        assert!(!decoder.has_partial_row());
        assert_eq!(decoder.num_buffered_rows(), 7); // didn't call clear() yet
        // Start and end elements carry each other's tape index; strings and
        // numbers carry the index of their text
        assert_eq!(
            finished.elements,
            &[
                TapeElement::Null,
                TapeElement::StartObject(8), // {"hello": "world", "foo": 2, "bar": 45}
                TapeElement::String(0),      // "hello"
                TapeElement::String(1),      // "world"
                TapeElement::String(2),      // "foo"
                TapeElement::Number(3),      // 2
                TapeElement::String(4),      // "bar"
                TapeElement::Number(5),      // 45
                TapeElement::EndObject(1),
                TapeElement::StartObject(12), // {"foo": "bar"}
                TapeElement::String(6),       // "foo"
                TapeElement::String(7),       // "bar"
                TapeElement::EndObject(9),
                TapeElement::StartObject(16), // {"fiz": null}
                TapeElement::String(8),       // "fiz"
                TapeElement::Null,            // null
                TapeElement::EndObject(13),
                TapeElement::StartObject(24), // {"a": true, "b": false, "c": null}
                TapeElement::String(9),       // "a"
                TapeElement::True,            // true
                TapeElement::String(10),      // "b"
                TapeElement::False,           // false
                TapeElement::String(11),      // "c"
                TapeElement::Null,            // null
                TapeElement::EndObject(17),
                TapeElement::StartObject(30), // {"a": "", "": "a"}
                TapeElement::String(12),      // "a"
                TapeElement::String(13),      // ""
                TapeElement::String(14),      // ""
                TapeElement::String(15),      // "a"
                TapeElement::EndObject(25),
                TapeElement::StartObject(49), // {"a": "b", "object": {"nested": "hello", "foo": 23}, "b": {}, "c": {"foo": null }}
                TapeElement::String(16),      // "a"
                TapeElement::String(17),      // "b"
                TapeElement::String(18),      // "object"
                TapeElement::StartObject(40), // {"nested": "hello", "foo": 23}
                TapeElement::String(19),      // "nested"
                TapeElement::String(20),      // "hello"
                TapeElement::String(21),      // "foo"
                TapeElement::Number(22),      // 23
                TapeElement::EndObject(35),
                TapeElement::String(23),      // "b"
                TapeElement::StartObject(43), // {}
                TapeElement::EndObject(42),
                TapeElement::String(24),      // "c"
                TapeElement::StartObject(48), // {"foo": null }
                TapeElement::String(25),      // "foo"
                TapeElement::Null,            // null
                TapeElement::EndObject(45),
                TapeElement::EndObject(31),
                TapeElement::StartObject(75), // {"a": ["", "foo", ["bar", "c"]], "b": {"1": []}, "c": {"2": [1, 2, 3]} }
                TapeElement::String(26),      // "a"
                TapeElement::StartList(59),   // ["", "foo", ["bar", "c"]]
                TapeElement::String(27),      // ""
                TapeElement::String(28),      // "foo"
                TapeElement::StartList(58),   // ["bar", "c"]
                TapeElement::String(29),      // "bar"
                TapeElement::String(30),      // "c"
                TapeElement::EndList(55),
                TapeElement::EndList(52),
                TapeElement::String(31),      // "b"
                TapeElement::StartObject(65), // {"1": []}
                TapeElement::String(32),      // "1"
                TapeElement::StartList(64),   // []
                TapeElement::EndList(63),
                TapeElement::EndObject(61),
                TapeElement::String(33),      // "c"
                TapeElement::StartObject(74), // {"2": [1, 2, 3]}
                TapeElement::String(34),      // "2"
                TapeElement::StartList(73),   // [1, 2, 3]
                TapeElement::Number(35),      // 1
                TapeElement::Number(36),      // 2
                TapeElement::Number(37),      // 3
                TapeElement::EndList(69),
                TapeElement::EndObject(67),
                TapeElement::EndObject(50)
            ]
        );

        // All keys, strings and numbers are concatenated without separators
        assert_eq!(
            finished.strings,
            "helloworldfoo2bar45foobarfizabcaaabobjectnestedhellofoo23bcfooafoobarcb1c2123"
        );
        // Consecutive offsets delimit each entry; equal neighbours denote an empty string
        assert_eq!(
            &finished.string_offsets,
            &[
                0, 5, 10, 13, 14, 17, 19, 22, 25, 28, 29, 30, 31, 32, 32, 32, 33, 34, 35, 41, 47,
                52, 55, 57, 58, 59, 62, 63, 63, 66, 69, 70, 71, 72, 73, 74, 75, 76, 77
            ]
        );

        // Clearing resets the row count
        decoder.clear();
        assert!(!decoder.has_partial_row());
        assert_eq!(decoder.num_buffered_rows(), 0);
    }
876
877    #[test]
878    fn test_invalid() {
879        // Test invalid
880        let mut decoder = TapeDecoder::new(16, 2);
881        let err = decoder.decode(b"hello").unwrap_err().to_string();
882        assert_eq!(
883            err,
884            "Json error: Encountered unexpected 'h' whilst parsing value"
885        );
886
887        let mut decoder = TapeDecoder::new(16, 2);
888        let err = decoder.decode(b"{\"hello\": }").unwrap_err().to_string();
889        assert_eq!(
890            err,
891            "Json error: Encountered unexpected '}' whilst parsing value"
892        );
893
894        let mut decoder = TapeDecoder::new(16, 2);
895        let err = decoder
896            .decode(b"{\"hello\": [ false, tru ]}")
897            .unwrap_err()
898            .to_string();
899        assert_eq!(
900            err,
901            "Json error: Encountered unexpected ' ' whilst parsing literal"
902        );
903
904        let mut decoder = TapeDecoder::new(16, 2);
905        let err = decoder
906            .decode(b"{\"hello\": \"\\ud8\"}")
907            .unwrap_err()
908            .to_string();
909        assert_eq!(
910            err,
911            "Json error: Encountered unexpected '\"' whilst unicode escape"
912        );
913
914        // Missing surrogate pair
915        let mut decoder = TapeDecoder::new(16, 2);
916        let err = decoder
917            .decode(b"{\"hello\": \"\\ud83d\"}")
918            .unwrap_err()
919            .to_string();
920        assert_eq!(
921            err,
922            "Json error: Encountered unexpected '\"' whilst parsing surrogate pair escape"
923        );
924
925        // Test truncation
926        let mut decoder = TapeDecoder::new(16, 2);
927        decoder.decode(b"{\"he").unwrap();
928        assert!(decoder.has_partial_row());
929        assert_eq!(decoder.num_buffered_rows(), 1);
930        let err = decoder.finish().unwrap_err().to_string();
931        assert_eq!(err, "Json error: Truncated record whilst reading string");
932
933        let mut decoder = TapeDecoder::new(16, 2);
934        decoder.decode(b"{\"hello\" : ").unwrap();
935        let err = decoder.finish().unwrap_err().to_string();
936        assert_eq!(err, "Json error: Truncated record whilst reading value");
937
938        let mut decoder = TapeDecoder::new(16, 2);
939        decoder.decode(b"{\"hello\" : [").unwrap();
940        let err = decoder.finish().unwrap_err().to_string();
941        assert_eq!(err, "Json error: Truncated record whilst reading list");
942
943        let mut decoder = TapeDecoder::new(16, 2);
944        decoder.decode(b"{\"hello\" : tru").unwrap();
945        let err = decoder.finish().unwrap_err().to_string();
946        assert_eq!(err, "Json error: Truncated record whilst reading true");
947
948        let mut decoder = TapeDecoder::new(16, 2);
949        decoder.decode(b"{\"hello\" : nu").unwrap();
950        let err = decoder.finish().unwrap_err().to_string();
951        assert_eq!(err, "Json error: Truncated record whilst reading null");
952
953        // Test invalid UTF-8
954        let mut decoder = TapeDecoder::new(16, 2);
955        decoder.decode(b"{\"hello\" : \"world\xFF\"}").unwrap();
956        let err = decoder.finish().unwrap_err().to_string();
957        assert_eq!(err, "Json error: Encountered non-UTF-8 data");
958
959        let mut decoder = TapeDecoder::new(16, 2);
960        decoder.decode(b"{\"\xe2\" : \"\x96\xa1\"}").unwrap();
961        let err = decoder.finish().unwrap_err().to_string();
962        assert_eq!(err, "Json error: Encountered truncated UTF-8 sequence");
963    }
964
965    #[test]
966    fn test_invalid_surrogates() {
967        let mut decoder = TapeDecoder::new(16, 2);
968        let res = decoder.decode(b"{\"test\": \"\\ud800\\ud801\"}");
969        assert!(res.is_err());
970
971        let mut decoder = TapeDecoder::new(16, 2);
972        let res = decoder.decode(b"{\"test\": \"\\udc00\\udc01\"}");
973        assert!(res.is_err());
974    }
975}