arrow_json/reader/
string_view_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use arrow_array::builder::GenericByteViewBuilder;
19use arrow_array::types::StringViewType;
20use arrow_array::Array;
21use arrow_data::ArrayData;
22use arrow_schema::ArrowError;
23use std::fmt::Write;
24
25use crate::reader::tape::{Tape, TapeElement};
26use crate::reader::ArrayDecoder;
27
/// Text emitted for a JSON `true` when it is coerced to a string.
const TRUE: &str = "true";
/// Text emitted for a JSON `false` when it is coerced to a string.
const FALSE: &str = "false";
30
/// Decoder that builds string-view arrays from tape elements.
pub struct StringViewArrayDecoder {
    /// When `true`, boolean and numeric elements are accepted and rendered as
    /// text; when `false`, only strings and nulls are accepted.
    coerce_primitive: bool,
}
34
35impl StringViewArrayDecoder {
36    pub fn new(coerce_primitive: bool) -> Self {
37        Self { coerce_primitive }
38    }
39}
40
41impl ArrayDecoder for StringViewArrayDecoder {
42    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
43        let coerce = self.coerce_primitive;
44        let mut data_capacity = 0;
45        for &p in pos {
46            // note that StringView is different that StringArray in that only
47            // "long" strings (longer than 12 bytes) are stored in the buffer.
48            // "short" strings are inlined into a fixed length structure.
49            match tape.get(p) {
50                TapeElement::String(idx) => {
51                    let s = tape.get_string(idx);
52                    // Only increase capacity if the string length is greater than 12 bytes
53                    if s.len() > 12 {
54                        data_capacity += s.len();
55                    }
56                }
57                TapeElement::Null => {
58                    // Do not increase capacity for null values
59                }
60                // For booleans, do not increase capacity (both "true" and "false" are less than
61                // 12 bytes)
62                TapeElement::True if coerce => {}
63                TapeElement::False if coerce => {}
64                // For Number, use the same strategy as for strings
65                TapeElement::Number(idx) if coerce => {
66                    let s = tape.get_string(idx);
67                    if s.len() > 12 {
68                        data_capacity += s.len();
69                    }
70                }
71                // For I64, only add capacity if the absolute value is greater than 999,999,999,999
72                // (the largest number that can fit in 12 bytes)
73                TapeElement::I64(_) if coerce => {
74                    match tape.get(p + 1) {
75                        TapeElement::I32(_) => {
76                            let high = match tape.get(p) {
77                                TapeElement::I64(h) => h,
78                                _ => unreachable!(),
79                            };
80                            let low = match tape.get(p + 1) {
81                                TapeElement::I32(l) => l,
82                                _ => unreachable!(),
83                            };
84                            let val = ((high as i64) << 32) | (low as u32) as i64;
85                            if val.abs() > 999_999_999_999 {
86                                // Only allocate capacity based on the string representation if the number is large
87                                data_capacity += val.to_string().len();
88                            }
89                        }
90                        _ => unreachable!(),
91                    }
92                }
93                // For I32, do not increase capacity (the longest string representation is <= 12 bytes)
94                TapeElement::I32(_) if coerce => {}
95                // For F32 and F64, keep the existing estimate
96                TapeElement::F32(_) if coerce => {
97                    data_capacity += 10;
98                }
99                TapeElement::F64(_) if coerce => {
100                    data_capacity += 10;
101                }
102                _ => {
103                    return Err(tape.error(p, "string"));
104                }
105            }
106        }
107
108        let mut builder = GenericByteViewBuilder::<StringViewType>::with_capacity(data_capacity);
109        // Temporary buffer to avoid per-iteration allocation for numeric types
110        let mut tmp_buf = String::new();
111
112        for &p in pos {
113            match tape.get(p) {
114                TapeElement::String(idx) => {
115                    builder.append_value(tape.get_string(idx));
116                }
117                TapeElement::Null => {
118                    builder.append_null();
119                }
120                TapeElement::True if coerce => {
121                    builder.append_value(TRUE);
122                }
123                TapeElement::False if coerce => {
124                    builder.append_value(FALSE);
125                }
126                TapeElement::Number(idx) if coerce => {
127                    builder.append_value(tape.get_string(idx));
128                }
129                TapeElement::I64(high) if coerce => match tape.get(p + 1) {
130                    TapeElement::I32(low) => {
131                        let val = ((high as i64) << 32) | (low as u32) as i64;
132                        tmp_buf.clear();
133                        // Reuse the temporary buffer instead of allocating a new String
134                        write!(&mut tmp_buf, "{}", val).unwrap();
135                        builder.append_value(&tmp_buf);
136                    }
137                    _ => unreachable!(),
138                },
139                TapeElement::I32(n) if coerce => {
140                    tmp_buf.clear();
141                    write!(&mut tmp_buf, "{}", n).unwrap();
142                    builder.append_value(&tmp_buf);
143                }
144                TapeElement::F32(n) if coerce => {
145                    tmp_buf.clear();
146                    write!(&mut tmp_buf, "{}", n).unwrap();
147                    builder.append_value(&tmp_buf);
148                }
149                TapeElement::F64(high) if coerce => match tape.get(p + 1) {
150                    TapeElement::F32(low) => {
151                        let val = f64::from_bits(((high as u64) << 32) | (low as u64));
152                        tmp_buf.clear();
153                        write!(&mut tmp_buf, "{}", val).unwrap();
154                        builder.append_value(&tmp_buf);
155                    }
156                    _ => unreachable!(),
157                },
158                _ => unreachable!(),
159            }
160        }
161
162        let array = builder.finish();
163        Ok(array.into_data())
164    }
165}