// arrow_json/reader/string_array.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow_array::builder::GenericStringBuilder;
use arrow_array::{Array, GenericStringArray, OffsetSizeTrait};
use arrow_data::ArrayData;
use arrow_schema::ArrowError;
use std::marker::PhantomData;

use crate::reader::tape::{Tape, TapeElement};
use crate::reader::ArrayDecoder;

/// Text written to the output array for a JSON `true` when primitives are coerced to strings.
const TRUE: &str = "true";
/// Text written to the output array for a JSON `false` when primitives are coerced to strings.
const FALSE: &str = "false";

/// Decoder that turns tape elements into a string array whose offsets use type `O`.
///
/// The decoding logic lives in this type's [`ArrayDecoder`] implementation.
pub struct StringArrayDecoder<O: OffsetSizeTrait> {
    /// When `true`, booleans and numbers on the tape are accepted and stored as
    /// text; when `false`, they cause decoding to fail.
    coerce_primitive: bool,
    /// Zero-sized marker that ties the decoder to the offset type `O`.
    phantom: PhantomData<O>,
}

impl<O: OffsetSizeTrait> StringArrayDecoder<O> {
    pub fn new(coerce_primitive: bool) -> Self {
        Self {
            coerce_primitive,
            phantom: Default::default(),
        }
    }
}

impl<O: OffsetSizeTrait> ArrayDecoder for StringArrayDecoder<O> {
    /// Decodes the tape elements at the positions in `pos` into a string array.
    ///
    /// Decoding runs in two passes over `pos`:
    ///
    /// 1. A validation pass that rejects unsupported elements and sums up an
    ///    estimate of the value bytes needed, so the builder can be allocated
    ///    once and offset overflow can be reported as an error up front.
    /// 2. A building pass that appends one value (or null) per position.
    ///
    /// # Errors
    ///
    /// * Returns the error produced by `tape.error(pos, "string")` if an
    ///   element is neither a string nor null, and is not a boolean/number
    ///   accepted because `coerce_primitive` is enabled.
    /// * Returns [`ArrowError::JsonError`] if the estimated total value length
    ///   cannot be represented by the offset type `O`.
    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
        let coerce_primitive = self.coerce_primitive;

        // Pass 1: validate every element and estimate the value buffer size.
        let mut data_capacity = 0;
        for p in pos {
            match tape.get(*p) {
                TapeElement::String(idx) => {
                    data_capacity += tape.get_string(idx).len();
                }
                TapeElement::Null => {}
                TapeElement::True if coerce_primitive => {
                    data_capacity += TRUE.len();
                }
                TapeElement::False if coerce_primitive => {
                    data_capacity += FALSE.len();
                }
                TapeElement::Number(idx) if coerce_primitive => {
                    data_capacity += tape.get_string(idx).len();
                }
                TapeElement::I64(_)
                | TapeElement::I32(_)
                | TapeElement::F64(_)
                | TapeElement::F32(_)
                    if coerce_primitive =>
                {
                    // The formatted length is not known without formatting,
                    // so use an arbitrary estimate; the builder grows if the
                    // real text turns out to be longer.
                    data_capacity += 10;
                }
                _ => {
                    return Err(tape.error(*p, "string"));
                }
            }
        }

        if O::from_usize(data_capacity).is_none() {
            return Err(ArrowError::JsonError(format!(
                "offset overflow decoding {}",
                GenericStringArray::<O>::DATA_TYPE
            )));
        }

        let mut builder = GenericStringBuilder::<O>::with_capacity(pos.len(), data_capacity);

        // Pass 2: append the values. Every arm below mirrors an arm accepted
        // in pass 1, which is why the fallbacks are `unreachable!()`.
        for p in pos {
            match tape.get(*p) {
                TapeElement::String(idx) => {
                    builder.append_value(tape.get_string(idx));
                }
                TapeElement::Null => builder.append_null(),
                TapeElement::True if coerce_primitive => {
                    builder.append_value(TRUE);
                }
                TapeElement::False if coerce_primitive => {
                    builder.append_value(FALSE);
                }
                TapeElement::Number(idx) if coerce_primitive => {
                    builder.append_value(tape.get_string(idx));
                }
                // A 64-bit integer occupies two tape slots: the high 32 bits
                // here, and the low 32 bits in the `I32` that follows.
                TapeElement::I64(high) if coerce_primitive => match tape.get(p + 1) {
                    TapeElement::I32(low) => {
                        // Go through `u32` so the low half is zero-extended
                        // rather than sign-extended into the high half.
                        let val = ((high as i64) << 32) | (low as u32) as i64;
                        builder.append_value(val.to_string());
                    }
                    _ => unreachable!(),
                },
                TapeElement::I32(n) if coerce_primitive => {
                    builder.append_value(n.to_string());
                }
                TapeElement::F32(n) if coerce_primitive => {
                    // `F32` carries the raw bit pattern of the float (the
                    // `F64` arm below uses it as the low bits of an `f64`),
                    // so reinterpret the bits instead of formatting them as
                    // an integer.
                    builder.append_value(f32::from_bits(n).to_string());
                }
                // A 64-bit float is likewise split into high bits here and
                // low bits in the `F32` that follows.
                TapeElement::F64(high) if coerce_primitive => match tape.get(p + 1) {
                    TapeElement::F32(low) => {
                        let val = f64::from_bits(((high as u64) << 32) | low as u64);
                        builder.append_value(val.to_string());
                    }
                    _ => unreachable!(),
                },
                _ => unreachable!(),
            }
        }

        Ok(builder.finish().into_data())
    }
}