arrow_json/reader/
tape.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::reader::serializer::TapeSerializer;
19use arrow_schema::ArrowError;
20use memchr::memchr2;
21use serde::Serialize;
22use std::fmt::Write;
23
/// We decode JSON to a flattened tape representation,
/// allowing for efficient traversal of the JSON data
///
/// Each [`TapeElement`] is a single entry on that tape. Container elements
/// store the tape index of their matching start/end element, so a whole
/// object or list can be skipped without visiting its children.
///
/// This approach is inspired by [simdjson]
///
/// Uses `u32` for offsets to ensure `TapeElement` is 64-bits. A future
/// iteration may increase this to a custom `u56` type.
///
/// [simdjson]: https://github.com/simdjson/simdjson/blob/master/doc/tape.md
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum TapeElement {
    /// The start of an object, i.e. `{`
    ///
    /// Contains the tape index of the corresponding [`Self::EndObject`]
    StartObject(u32),
    /// The end of an object, i.e. `}`
    ///
    /// Contains the tape index of the corresponding [`Self::StartObject`]
    EndObject(u32),
    /// The start of a list, i.e. `[`
    ///
    /// Contains the tape index of the corresponding [`Self::EndList`]
    StartList(u32),
    /// The end of a list, i.e. `]`
    ///
    /// Contains the tape index of the corresponding [`Self::StartList`]
    EndList(u32),
    /// A string value
    ///
    /// Contains the index of the string within the [`Tape`] string data,
    /// as accepted by [`Tape::get_string`]
    String(u32),
    /// A numeric value, stored in its textual form
    ///
    /// Contains the index of the text within the [`Tape`] string data,
    /// as accepted by [`Tape::get_string`]
    Number(u32),

    /// The high 32 bits of an `i64`
    ///
    /// Followed by [`Self::I32`] containing the low 32 bits
    I64(i32),

    /// A 32-bit signed integer
    ///
    /// May be preceded by [`Self::I64`] containing the high bits, in which
    /// case this holds the low 32 bits of an `i64`
    I32(i32),

    /// The high 32 bits of the bit pattern of a 64-bit float
    ///
    /// Followed by [`Self::F32`] containing the low 32 bits
    F64(u32),

    /// The bit pattern of a 32-bit float, or the low 32 bits of a 64-bit
    /// float if preceded by [`Self::F64`]
    F32(u32),

    /// A true literal
    True,
    /// A false literal
    False,
    /// A null literal
    Null,
}
85
/// A decoded JSON tape
///
/// String and numeric data is stored alongside an array of [`TapeElement`]
///
/// The first element is always [`TapeElement::Null`]
///
/// This approach to decoding JSON is inspired by [simdjson]
///
/// [simdjson]: https://github.com/simdjson/simdjson/blob/master/doc/tape.md
#[derive(Debug)]
pub struct Tape<'a> {
    /// The flattened sequence of decoded elements
    elements: &'a [TapeElement],
    /// The concatenated text of every string and number on the tape
    strings: &'a str,
    /// Byte offsets into `strings`; entries `i` and `i + 1` delimit string `i`
    string_offsets: &'a [usize],
    /// The number of rows, as reported by [`Tape::num_rows`]
    num_rows: usize,
}
102
103impl<'a> Tape<'a> {
104    /// Returns the string for the given string index
105    #[inline]
106    pub fn get_string(&self, idx: u32) -> &'a str {
107        let end_offset = self.string_offsets[idx as usize + 1];
108        let start_offset = self.string_offsets[idx as usize];
109        // SAFETY:
110        // Verified offsets
111        unsafe { self.strings.get_unchecked(start_offset..end_offset) }
112    }
113
114    /// Returns the tape element at `idx`
115    pub fn get(&self, idx: u32) -> TapeElement {
116        self.elements[idx as usize]
117    }
118
119    /// Returns the index of the next field at the same level as `cur_idx`
120    ///
121    /// Return an error if `cur_idx` is not the start of a field
122    pub fn next(&self, cur_idx: u32, expected: &str) -> Result<u32, ArrowError> {
123        match self.get(cur_idx) {
124            TapeElement::String(_)
125            | TapeElement::Number(_)
126            | TapeElement::True
127            | TapeElement::False
128            | TapeElement::Null
129            | TapeElement::I32(_)
130            | TapeElement::F32(_) => Ok(cur_idx + 1),
131            TapeElement::I64(_) | TapeElement::F64(_) => Ok(cur_idx + 2),
132            TapeElement::StartList(end_idx) => Ok(end_idx + 1),
133            TapeElement::StartObject(end_idx) => Ok(end_idx + 1),
134            TapeElement::EndObject(_) | TapeElement::EndList(_) => {
135                Err(self.error(cur_idx, expected))
136            }
137        }
138    }
139
140    /// Returns the number of rows
141    pub fn num_rows(&self) -> usize {
142        self.num_rows
143    }
144
145    /// Serialize the tape element at index `idx` to `out` returning the next field index
146    fn serialize(&self, out: &mut String, idx: u32) -> u32 {
147        match self.get(idx) {
148            TapeElement::StartObject(end) => {
149                out.push('{');
150                let mut cur_idx = idx + 1;
151                while cur_idx < end {
152                    cur_idx = self.serialize(out, cur_idx);
153                    out.push_str(": ");
154                    cur_idx = self.serialize(out, cur_idx);
155                }
156                out.push('}');
157                return end + 1;
158            }
159            TapeElement::EndObject(_) => out.push('}'),
160            TapeElement::StartList(end) => {
161                out.push('[');
162                let mut cur_idx = idx + 1;
163                while cur_idx < end {
164                    cur_idx = self.serialize(out, cur_idx);
165                    if cur_idx < end {
166                        out.push_str(", ");
167                    }
168                }
169                out.push(']');
170                return end + 1;
171            }
172            TapeElement::EndList(_) => out.push(']'),
173            TapeElement::String(s) => {
174                out.push('"');
175                out.push_str(self.get_string(s));
176                out.push('"')
177            }
178            TapeElement::Number(n) => out.push_str(self.get_string(n)),
179            TapeElement::True => out.push_str("true"),
180            TapeElement::False => out.push_str("false"),
181            TapeElement::Null => out.push_str("null"),
182            TapeElement::I64(high) => match self.get(idx + 1) {
183                TapeElement::I32(low) => {
184                    let val = ((high as i64) << 32) | (low as u32) as i64;
185                    let _ = write!(out, "{val}");
186                    return idx + 2;
187                }
188                _ => unreachable!(),
189            },
190            TapeElement::I32(val) => {
191                let _ = write!(out, "{val}");
192            }
193            TapeElement::F64(high) => match self.get(idx + 1) {
194                TapeElement::F32(low) => {
195                    let val = f64::from_bits(((high as u64) << 32) | low as u64);
196                    let _ = write!(out, "{val}");
197                    return idx + 2;
198                }
199                _ => unreachable!(),
200            },
201            TapeElement::F32(val) => {
202                let _ = write!(out, "{}", f32::from_bits(val));
203            }
204        }
205        idx + 1
206    }
207
208    /// Returns an error reading index `idx`
209    pub fn error(&self, idx: u32, expected: &str) -> ArrowError {
210        let mut out = String::with_capacity(64);
211        self.serialize(&mut out, idx);
212        ArrowError::JsonError(format!("expected {expected} got {out}"))
213    }
214}
215
/// States based on <https://www.json.org/json-en.html>
///
/// The decoder keeps a stack of these states, the top of the stack
/// describes what is currently being decoded
#[derive(Debug, Copy, Clone)]
enum DecoderState {
    /// Decoding an object
    ///
    /// Contains index of start [`TapeElement::StartObject`]
    Object(u32),
    /// Decoding a list
    ///
    /// Contains index of start [`TapeElement::StartList`]
    List(u32),
    /// Decoding the contents of a string, i.e. after the opening `"`
    String,
    /// Expecting the start of a value
    Value,
    /// Decoding a number, the first byte of which has already been buffered
    Number,
    /// Expecting the `:` separating an object key from its value
    Colon,
    /// Decoding an escape sequence, i.e. after a `\` within a string
    Escape,
    /// A unicode escape sequence,
    ///
    /// Consists of a `(high surrogate, low surrogate, decoded length)`, the
    /// first field holds the code unit of the first `\uXXXX` escape and the
    /// second is only populated when a surrogate pair is being decoded
    Unicode(u16, u16, u8),
    /// A boolean or null literal
    ///
    /// Consists of `(literal, decoded length)`
    Literal(Literal, u8),
}
241
242impl DecoderState {
243    fn as_str(&self) -> &'static str {
244        match self {
245            DecoderState::Object(_) => "object",
246            DecoderState::List(_) => "list",
247            DecoderState::String => "string",
248            DecoderState::Value => "value",
249            DecoderState::Number => "number",
250            DecoderState::Colon => "colon",
251            DecoderState::Escape => "escape",
252            DecoderState::Unicode(_, _, _) => "unicode literal",
253            DecoderState::Literal(d, _) => d.as_str(),
254        }
255    }
256}
257
/// The JSON keyword literals: `null`, `true` and `false`
#[derive(Debug, Copy, Clone)]
enum Literal {
    /// The `null` literal
    Null,
    /// The `true` literal
    True,
    /// The `false` literal
    False,
}
264
265impl Literal {
266    fn element(&self) -> TapeElement {
267        match self {
268            Literal::Null => TapeElement::Null,
269            Literal::True => TapeElement::True,
270            Literal::False => TapeElement::False,
271        }
272    }
273
274    fn as_str(&self) -> &'static str {
275        match self {
276            Literal::Null => "null",
277            Literal::True => "true",
278            Literal::False => "false",
279        }
280    }
281
282    fn bytes(&self) -> &'static [u8] {
283        self.as_str().as_bytes()
284    }
285}
286
/// Yields the next item of the iterator `$next`, or `break`s out of the
/// innermost enclosing loop once the iterator is exhausted
macro_rules! next {
    ($next:ident) => {
        if let Some(b) = $next.next() {
            b
        } else {
            break
        }
    };
}
296
/// Implements a state machine for decoding JSON to a tape
pub struct TapeDecoder {
    /// The decoded tape elements, the first of which is always [`TapeElement::Null`]
    elements: Vec<TapeElement>,

    /// The number of rows decoded, including any in progress if `!stack.is_empty()`
    cur_row: usize,

    /// Number of rows to read per batch
    batch_size: usize,

    /// A buffer of parsed string data
    ///
    /// Note: if part way through a record, i.e. `stack` is not empty,
    /// this may contain truncated UTF-8 data
    bytes: Vec<u8>,

    /// Offsets into `bytes`
    ///
    /// Starts with a single `0`; the length of `bytes` is appended each time
    /// a string or number is completed
    offsets: Vec<usize>,

    /// A stack of [`DecoderState`]
    stack: Vec<DecoderState>,
}
319
320impl TapeDecoder {
321    /// Create a new [`TapeDecoder`] with the provided batch size
322    /// and an estimated number of fields in each row
323    pub fn new(batch_size: usize, num_fields: usize) -> Self {
324        let tokens_per_row = 2 + num_fields * 2;
325        let mut offsets = Vec::with_capacity(batch_size * (num_fields * 2) + 1);
326        offsets.push(0);
327
328        let mut elements = Vec::with_capacity(batch_size * tokens_per_row);
329        elements.push(TapeElement::Null);
330
331        Self {
332            offsets,
333            elements,
334            batch_size,
335            cur_row: 0,
336            bytes: Vec::with_capacity(num_fields * 2 * 8),
337            stack: Vec::with_capacity(10),
338        }
339    }
340
341    pub fn decode(&mut self, buf: &[u8]) -> Result<usize, ArrowError> {
342        let mut iter = BufIter::new(buf);
343
344        while !iter.is_empty() {
345            let state = match self.stack.last_mut() {
346                Some(l) => l,
347                None => {
348                    iter.skip_whitespace();
349                    if iter.is_empty() || self.cur_row >= self.batch_size {
350                        break;
351                    }
352
353                    // Start of row
354                    self.cur_row += 1;
355                    self.stack.push(DecoderState::Value);
356                    self.stack.last_mut().unwrap()
357                }
358            };
359
360            match state {
361                // Decoding an object
362                DecoderState::Object(start_idx) => {
363                    iter.advance_until(|b| !json_whitespace(b) && b != b',');
364                    match next!(iter) {
365                        b'"' => {
366                            self.stack.push(DecoderState::Value);
367                            self.stack.push(DecoderState::Colon);
368                            self.stack.push(DecoderState::String);
369                        }
370                        b'}' => {
371                            let start_idx = *start_idx;
372                            let end_idx = self.elements.len() as u32;
373                            self.elements[start_idx as usize] = TapeElement::StartObject(end_idx);
374                            self.elements.push(TapeElement::EndObject(start_idx));
375                            self.stack.pop();
376                        }
377                        b => return Err(err(b, "parsing object")),
378                    }
379                }
380                // Decoding a list
381                DecoderState::List(start_idx) => {
382                    iter.advance_until(|b| !json_whitespace(b) && b != b',');
383                    match iter.peek() {
384                        Some(b']') => {
385                            iter.next();
386                            let start_idx = *start_idx;
387                            let end_idx = self.elements.len() as u32;
388                            self.elements[start_idx as usize] = TapeElement::StartList(end_idx);
389                            self.elements.push(TapeElement::EndList(start_idx));
390                            self.stack.pop();
391                        }
392                        Some(_) => self.stack.push(DecoderState::Value),
393                        None => break,
394                    }
395                }
396                // Decoding a string
397                DecoderState::String => {
398                    let s = iter.skip_chrs(b'\\', b'"');
399                    self.bytes.extend_from_slice(s);
400
401                    match next!(iter) {
402                        b'\\' => self.stack.push(DecoderState::Escape),
403                        b'"' => {
404                            let idx = self.offsets.len() - 1;
405                            self.elements.push(TapeElement::String(idx as _));
406                            self.offsets.push(self.bytes.len());
407                            self.stack.pop();
408                        }
409                        b => unreachable!("{}", b),
410                    }
411                }
412                state @ DecoderState::Value => {
413                    iter.skip_whitespace();
414                    *state = match next!(iter) {
415                        b'"' => DecoderState::String,
416                        b @ b'-' | b @ b'0'..=b'9' => {
417                            self.bytes.push(b);
418                            DecoderState::Number
419                        }
420                        b'n' => DecoderState::Literal(Literal::Null, 1),
421                        b'f' => DecoderState::Literal(Literal::False, 1),
422                        b't' => DecoderState::Literal(Literal::True, 1),
423                        b'[' => {
424                            let idx = self.elements.len() as u32;
425                            self.elements.push(TapeElement::StartList(u32::MAX));
426                            DecoderState::List(idx)
427                        }
428                        b'{' => {
429                            let idx = self.elements.len() as u32;
430                            self.elements.push(TapeElement::StartObject(u32::MAX));
431                            DecoderState::Object(idx)
432                        }
433                        b => return Err(err(b, "parsing value")),
434                    };
435                }
436                DecoderState::Number => {
437                    let s = iter.advance_until(|b| {
438                        !matches!(b, b'0'..=b'9' | b'-' | b'+' | b'.' | b'e' | b'E')
439                    });
440                    self.bytes.extend_from_slice(s);
441
442                    if !iter.is_empty() {
443                        self.stack.pop();
444                        let idx = self.offsets.len() - 1;
445                        self.elements.push(TapeElement::Number(idx as _));
446                        self.offsets.push(self.bytes.len());
447                    }
448                }
449                DecoderState::Colon => {
450                    iter.skip_whitespace();
451                    match next!(iter) {
452                        b':' => self.stack.pop(),
453                        b => return Err(err(b, "parsing colon")),
454                    };
455                }
456                DecoderState::Literal(literal, idx) => {
457                    let bytes = literal.bytes();
458                    let expected = bytes.iter().skip(*idx as usize).copied();
459                    for (expected, b) in expected.zip(&mut iter) {
460                        match b == expected {
461                            true => *idx += 1,
462                            false => return Err(err(b, "parsing literal")),
463                        }
464                    }
465                    if *idx == bytes.len() as u8 {
466                        let element = literal.element();
467                        self.stack.pop();
468                        self.elements.push(element);
469                    }
470                }
471                DecoderState::Escape => {
472                    let v = match next!(iter) {
473                        b'u' => {
474                            self.stack.pop();
475                            self.stack.push(DecoderState::Unicode(0, 0, 0));
476                            continue;
477                        }
478                        b'"' => b'"',
479                        b'\\' => b'\\',
480                        b'/' => b'/',
481                        b'b' => 8,  // BS
482                        b'f' => 12, // FF
483                        b'n' => b'\n',
484                        b'r' => b'\r',
485                        b't' => b'\t',
486                        b => return Err(err(b, "parsing escape sequence")),
487                    };
488
489                    self.stack.pop();
490                    self.bytes.push(v);
491                }
492                // Parse a unicode escape sequence
493                DecoderState::Unicode(high, low, idx) => loop {
494                    match *idx {
495                        0..=3 => *high = (*high << 4) | parse_hex(next!(iter))? as u16,
496                        4 => {
497                            if let Some(c) = char::from_u32(*high as u32) {
498                                write_char(c, &mut self.bytes);
499                                self.stack.pop();
500                                break;
501                            }
502
503                            match next!(iter) {
504                                b'\\' => {}
505                                b => return Err(err(b, "parsing surrogate pair escape")),
506                            }
507                        }
508                        5 => match next!(iter) {
509                            b'u' => {}
510                            b => return Err(err(b, "parsing surrogate pair unicode")),
511                        },
512                        6..=9 => *low = (*low << 4) | parse_hex(next!(iter))? as u16,
513                        _ => {
514                            let c = char_from_surrogate_pair(*low, *high)?;
515                            write_char(c, &mut self.bytes);
516                            self.stack.pop();
517                            break;
518                        }
519                    }
520                    *idx += 1;
521                },
522            }
523        }
524
525        Ok(buf.len() - iter.len())
526    }
527
528    /// Writes any type that implements [`Serialize`] into this [`TapeDecoder`]
529    pub fn serialize<S: Serialize>(&mut self, rows: &[S]) -> Result<(), ArrowError> {
530        if let Some(b) = self.stack.last() {
531            return Err(ArrowError::JsonError(format!(
532                "Cannot serialize to tape containing partial decode state {}",
533                b.as_str()
534            )));
535        }
536
537        let mut serializer =
538            TapeSerializer::new(&mut self.elements, &mut self.bytes, &mut self.offsets);
539
540        rows.iter()
541            .try_for_each(|row| row.serialize(&mut serializer))
542            .map_err(|e| ArrowError::JsonError(e.to_string()))?;
543
544        self.cur_row += rows.len();
545
546        Ok(())
547    }
548
549    /// The number of buffered rows, including the partially decoded row (if any).
550    pub fn num_buffered_rows(&self) -> usize {
551        self.cur_row
552    }
553
554    /// True if the decoder is part way through decoding a row. If so, calling [`Self::finish`]
555    /// would return an error.
556    pub fn has_partial_row(&self) -> bool {
557        !self.stack.is_empty()
558    }
559
560    /// Finishes the current [`Tape`]
561    pub fn finish(&self) -> Result<Tape<'_>, ArrowError> {
562        if let Some(b) = self.stack.last() {
563            return Err(ArrowError::JsonError(format!(
564                "Truncated record whilst reading {}",
565                b.as_str()
566            )));
567        }
568
569        if self.offsets.len() >= u32::MAX as usize {
570            return Err(ArrowError::JsonError(format!("Encountered more than {} bytes of string data, consider using a smaller batch size", u32::MAX)));
571        }
572
573        if self.offsets.len() >= u32::MAX as usize {
574            return Err(ArrowError::JsonError(format!(
575                "Encountered more than {} JSON elements, consider using a smaller batch size",
576                u32::MAX
577            )));
578        }
579
580        // Sanity check
581        assert_eq!(
582            self.offsets.last().copied().unwrap_or_default(),
583            self.bytes.len()
584        );
585
586        let strings = simdutf8::basic::from_utf8(&self.bytes)
587            .map_err(|_| ArrowError::JsonError("Encountered non-UTF-8 data".to_string()))?;
588
589        for offset in self.offsets.iter().copied() {
590            if !strings.is_char_boundary(offset) {
591                return Err(ArrowError::JsonError(
592                    "Encountered truncated UTF-8 sequence".to_string(),
593                ));
594            }
595        }
596
597        Ok(Tape {
598            strings,
599            elements: &self.elements,
600            string_offsets: &self.offsets,
601            num_rows: self.cur_row,
602        })
603    }
604
605    /// Clears this [`TapeDecoder`] in preparation to read the next batch
606    pub fn clear(&mut self) {
607        assert!(self.stack.is_empty());
608
609        self.cur_row = 0;
610        self.bytes.clear();
611        self.elements.clear();
612        self.elements.push(TapeElement::Null);
613        self.offsets.clear();
614        self.offsets.push(0);
615    }
616}
617
/// A cursor over a byte slice that provides some helper functionality
/// for skipping over and slicing out runs of bytes
struct BufIter<'a> {
    /// The bytes being iterated over
    buf: &'a [u8],
    /// The index into `buf` of the next byte to yield
    pos: usize,
}
623
624impl<'a> BufIter<'a> {
625    fn new(buf: &'a [u8]) -> Self {
626        Self { buf, pos: 0 }
627    }
628
629    #[inline]
630    fn as_slice(&self) -> &'a [u8] {
631        &self.buf[self.pos..]
632    }
633
634    #[inline]
635    fn is_empty(&self) -> bool {
636        self.pos >= self.buf.len()
637    }
638
639    fn peek(&self) -> Option<u8> {
640        self.buf.get(self.pos).copied()
641    }
642
643    #[inline]
644    fn advance(&mut self, skip: usize) {
645        self.pos += skip;
646    }
647
648    fn advance_until<F: FnMut(u8) -> bool>(&mut self, f: F) -> &[u8] {
649        let s = self.as_slice();
650        match s.iter().copied().position(f) {
651            Some(x) => {
652                self.advance(x);
653                &s[..x]
654            }
655            None => {
656                self.advance(s.len());
657                s
658            }
659        }
660    }
661
662    fn skip_chrs(&mut self, c1: u8, c2: u8) -> &[u8] {
663        let s = self.as_slice();
664        match memchr2(c1, c2, s) {
665            Some(p) => {
666                self.advance(p);
667                &s[..p]
668            }
669            None => {
670                self.advance(s.len());
671                s
672            }
673        }
674    }
675
676    fn skip_whitespace(&mut self) {
677        self.advance_until(|b| !json_whitespace(b));
678    }
679}
680
681impl Iterator for BufIter<'_> {
682    type Item = u8;
683
684    fn next(&mut self) -> Option<Self::Item> {
685        let b = self.peek();
686        self.pos += 1;
687        b
688    }
689
690    fn size_hint(&self) -> (usize, Option<usize>) {
691        let s = self.buf.len().checked_sub(self.pos).unwrap_or_default();
692        (s, Some(s))
693    }
694}
695
// `size_hint` reports identical lower and upper bounds, so the provided
// `len()` gives the number of bytes remaining
impl ExactSizeIterator for BufIter<'_> {}
697
698/// Returns an error for a given byte `b` and context `ctx`
699fn err(b: u8, ctx: &str) -> ArrowError {
700    ArrowError::JsonError(format!(
701        "Encountered unexpected '{}' whilst {ctx}",
702        b as char
703    ))
704}
705
706/// Creates a character from an UTF-16 surrogate pair
707fn char_from_surrogate_pair(low: u16, high: u16) -> Result<char, ArrowError> {
708    let n = (((high - 0xD800) as u32) << 10) | ((low - 0xDC00) as u32 + 0x1_0000);
709    char::from_u32(n)
710        .ok_or_else(|| ArrowError::JsonError(format!("Invalid UTF-16 surrogate pair {n}")))
711}
712
/// Appends the UTF-8 encoding of `c` to `out`
fn write_char(c: char, out: &mut Vec<u8>) {
    // A char encodes to at most four bytes of UTF-8
    let mut scratch = [0u8; 4];
    let encoded = c.encode_utf8(&mut scratch);
    out.extend_from_slice(encoded.as_bytes());
}
718
/// Returns true if `b` is one of the JSON whitespace bytes: space,
/// line feed, carriage return or horizontal tab
#[inline]
fn json_whitespace(b: u8) -> bool {
    match b {
        b' ' | b'\t' | b'\n' | b'\r' => true,
        _ => false,
    }
}
724
725/// Parse a hex character to `u8`
726fn parse_hex(b: u8) -> Result<u8, ArrowError> {
727    let digit = char::from(b)
728        .to_digit(16)
729        .ok_or_else(|| err(b, "unicode escape"))?;
730    Ok(digit as u8)
731}
732
733#[cfg(test)]
734mod tests {
735    use super::*;
736
737    #[test]
738    fn test_sizes() {
739        assert_eq!(std::mem::size_of::<DecoderState>(), 8);
740        assert_eq!(std::mem::size_of::<TapeElement>(), 8);
741    }
742
743    #[test]
744    fn test_basic() {
745        let a = r#"
746        {"hello": "world", "foo": 2, "bar": 45}
747
748        {"foo": "bar"}
749
750        {"fiz": null}
751
752        {"a": true, "b": false, "c": null}
753
754        {"a": "", "": "a"}
755
756        {"a": "b", "object": {"nested": "hello", "foo": 23}, "b": {}, "c": {"foo": null }}
757
758        {"a": ["", "foo", ["bar", "c"]], "b": {"1": []}, "c": {"2": [1, 2, 3]} }
759        "#;
760        let mut decoder = TapeDecoder::new(16, 2);
761        decoder.decode(a.as_bytes()).unwrap();
762        assert!(!decoder.has_partial_row());
763        assert_eq!(decoder.num_buffered_rows(), 7);
764
765        let finished = decoder.finish().unwrap();
766        assert!(!decoder.has_partial_row());
767        assert_eq!(decoder.num_buffered_rows(), 7); // didn't call clear() yet
768        assert_eq!(
769            finished.elements,
770            &[
771                TapeElement::Null,
772                TapeElement::StartObject(8), // {"hello": "world", "foo": 2, "bar": 45}
773                TapeElement::String(0),      // "hello"
774                TapeElement::String(1),      // "world"
775                TapeElement::String(2),      // "foo"
776                TapeElement::Number(3),      // 2
777                TapeElement::String(4),      // "bar"
778                TapeElement::Number(5),      // 45
779                TapeElement::EndObject(1),
780                TapeElement::StartObject(12), // {"foo": "bar"}
781                TapeElement::String(6),       // "foo"
782                TapeElement::String(7),       // "bar"
783                TapeElement::EndObject(9),
784                TapeElement::StartObject(16), // {"fiz": null}
785                TapeElement::String(8),       // "fiz
786                TapeElement::Null,            // null
787                TapeElement::EndObject(13),
788                TapeElement::StartObject(24), // {"a": true, "b": false, "c": null}
789                TapeElement::String(9),       // "a"
790                TapeElement::True,            // true
791                TapeElement::String(10),      // "b"
792                TapeElement::False,           // false
793                TapeElement::String(11),      // "c"
794                TapeElement::Null,            // null
795                TapeElement::EndObject(17),
796                TapeElement::StartObject(30), // {"a": "", "": "a"}
797                TapeElement::String(12),      // "a"
798                TapeElement::String(13),      // ""
799                TapeElement::String(14),      // ""
800                TapeElement::String(15),      // "a"
801                TapeElement::EndObject(25),
802                TapeElement::StartObject(49), // {"a": "b", "object": {"nested": "hello", "foo": 23}, "b": {}, "c": {"foo": null }}
803                TapeElement::String(16),      // "a"
804                TapeElement::String(17),      // "b"
805                TapeElement::String(18),      // "object"
806                TapeElement::StartObject(40), // {"nested": "hello", "foo": 23}
807                TapeElement::String(19),      // "nested"
808                TapeElement::String(20),      // "hello"
809                TapeElement::String(21),      // "foo"
810                TapeElement::Number(22),      // 23
811                TapeElement::EndObject(35),
812                TapeElement::String(23),      // "b"
813                TapeElement::StartObject(43), // {}
814                TapeElement::EndObject(42),
815                TapeElement::String(24),      // "c"
816                TapeElement::StartObject(48), // {"foo": null }
817                TapeElement::String(25),      // "foo"
818                TapeElement::Null,            // null
819                TapeElement::EndObject(45),
820                TapeElement::EndObject(31),
821                TapeElement::StartObject(75), // {"a": ["", "foo", ["bar", "c"]], "b": {"1": []}, "c": {"2": [1, 2, 3]} }
822                TapeElement::String(26),      // "a"
823                TapeElement::StartList(59),   // ["", "foo", ["bar", "c"]]
824                TapeElement::String(27),      // ""
825                TapeElement::String(28),      // "foo"
826                TapeElement::StartList(58),   // ["bar", "c"]
827                TapeElement::String(29),      // "bar"
828                TapeElement::String(30),      // "c"
829                TapeElement::EndList(55),
830                TapeElement::EndList(52),
831                TapeElement::String(31),      // "b"
832                TapeElement::StartObject(65), // {"1": []}
833                TapeElement::String(32),      // "1"
834                TapeElement::StartList(64),   // []
835                TapeElement::EndList(63),
836                TapeElement::EndObject(61),
837                TapeElement::String(33),      // "c"
838                TapeElement::StartObject(74), // {"2": [1, 2, 3]}
839                TapeElement::String(34),      // "2"
840                TapeElement::StartList(73),   // [1, 2, 3]
841                TapeElement::Number(35),      // 1
842                TapeElement::Number(36),      // 2
843                TapeElement::Number(37),      // 3
844                TapeElement::EndList(69),
845                TapeElement::EndObject(67),
846                TapeElement::EndObject(50)
847            ]
848        );
849
850        assert_eq!(
851            finished.strings,
852            "helloworldfoo2bar45foobarfizabcaaabobjectnestedhellofoo23bcfooafoobarcb1c2123"
853        );
854        assert_eq!(
855            &finished.string_offsets,
856            &[
857                0, 5, 10, 13, 14, 17, 19, 22, 25, 28, 29, 30, 31, 32, 32, 32, 33, 34, 35, 41, 47,
858                52, 55, 57, 58, 59, 62, 63, 63, 66, 69, 70, 71, 72, 73, 74, 75, 76, 77
859            ]
860        );
861
862        decoder.clear();
863        assert!(!decoder.has_partial_row());
864        assert_eq!(decoder.num_buffered_rows(), 0);
865    }
866
867    #[test]
868    fn test_invalid() {
869        // Test invalid
870        let mut decoder = TapeDecoder::new(16, 2);
871        let err = decoder.decode(b"hello").unwrap_err().to_string();
872        assert_eq!(
873            err,
874            "Json error: Encountered unexpected 'h' whilst parsing value"
875        );
876
877        let mut decoder = TapeDecoder::new(16, 2);
878        let err = decoder.decode(b"{\"hello\": }").unwrap_err().to_string();
879        assert_eq!(
880            err,
881            "Json error: Encountered unexpected '}' whilst parsing value"
882        );
883
884        let mut decoder = TapeDecoder::new(16, 2);
885        let err = decoder
886            .decode(b"{\"hello\": [ false, tru ]}")
887            .unwrap_err()
888            .to_string();
889        assert_eq!(
890            err,
891            "Json error: Encountered unexpected ' ' whilst parsing literal"
892        );
893
894        let mut decoder = TapeDecoder::new(16, 2);
895        let err = decoder
896            .decode(b"{\"hello\": \"\\ud8\"}")
897            .unwrap_err()
898            .to_string();
899        assert_eq!(
900            err,
901            "Json error: Encountered unexpected '\"' whilst unicode escape"
902        );
903
904        // Missing surrogate pair
905        let mut decoder = TapeDecoder::new(16, 2);
906        let err = decoder
907            .decode(b"{\"hello\": \"\\ud83d\"}")
908            .unwrap_err()
909            .to_string();
910        assert_eq!(
911            err,
912            "Json error: Encountered unexpected '\"' whilst parsing surrogate pair escape"
913        );
914
915        // Test truncation
916        let mut decoder = TapeDecoder::new(16, 2);
917        decoder.decode(b"{\"he").unwrap();
918        assert!(decoder.has_partial_row());
919        assert_eq!(decoder.num_buffered_rows(), 1);
920        let err = decoder.finish().unwrap_err().to_string();
921        assert_eq!(err, "Json error: Truncated record whilst reading string");
922
923        let mut decoder = TapeDecoder::new(16, 2);
924        decoder.decode(b"{\"hello\" : ").unwrap();
925        let err = decoder.finish().unwrap_err().to_string();
926        assert_eq!(err, "Json error: Truncated record whilst reading value");
927
928        let mut decoder = TapeDecoder::new(16, 2);
929        decoder.decode(b"{\"hello\" : [").unwrap();
930        let err = decoder.finish().unwrap_err().to_string();
931        assert_eq!(err, "Json error: Truncated record whilst reading list");
932
933        let mut decoder = TapeDecoder::new(16, 2);
934        decoder.decode(b"{\"hello\" : tru").unwrap();
935        let err = decoder.finish().unwrap_err().to_string();
936        assert_eq!(err, "Json error: Truncated record whilst reading true");
937
938        let mut decoder = TapeDecoder::new(16, 2);
939        decoder.decode(b"{\"hello\" : nu").unwrap();
940        let err = decoder.finish().unwrap_err().to_string();
941        assert_eq!(err, "Json error: Truncated record whilst reading null");
942
943        // Test invalid UTF-8
944        let mut decoder = TapeDecoder::new(16, 2);
945        decoder.decode(b"{\"hello\" : \"world\xFF\"}").unwrap();
946        let err = decoder.finish().unwrap_err().to_string();
947        assert_eq!(err, "Json error: Encountered non-UTF-8 data");
948
949        let mut decoder = TapeDecoder::new(16, 2);
950        decoder.decode(b"{\"\xe2\" : \"\x96\xa1\"}").unwrap();
951        let err = decoder.finish().unwrap_err().to_string();
952        assert_eq!(err, "Json error: Encountered truncated UTF-8 sequence");
953    }
954}