arrow_json/reader/string_view_array.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use arrow_array::builder::GenericByteViewBuilder;
19use arrow_array::types::StringViewType;
20use arrow_array::Array;
21use arrow_data::ArrayData;
22use arrow_schema::ArrowError;
23use std::fmt::Write;
24
25use crate::reader::tape::{Tape, TapeElement};
26use crate::reader::ArrayDecoder;
27
28const TRUE: &str = "true";
29const FALSE: &str = "false";
30
/// JSON array decoder that produces string-view arrays.
pub struct StringViewArrayDecoder {
    /// When `true`, boolean and numeric tape elements are accepted and rendered
    /// as strings; when `false`, only strings and nulls are accepted.
    coerce_primitive: bool,
}

impl StringViewArrayDecoder {
    /// Builds a decoder with the given primitive-coercion setting.
    pub fn new(coerce_primitive: bool) -> Self {
        StringViewArrayDecoder { coerce_primitive }
    }
}
40
41impl ArrayDecoder for StringViewArrayDecoder {
42 fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
43 let coerce = self.coerce_primitive;
44 let mut data_capacity = 0;
45 for &p in pos {
46 // note that StringView is different that StringArray in that only
47 // "long" strings (longer than 12 bytes) are stored in the buffer.
48 // "short" strings are inlined into a fixed length structure.
49 match tape.get(p) {
50 TapeElement::String(idx) => {
51 let s = tape.get_string(idx);
52 // Only increase capacity if the string length is greater than 12 bytes
53 if s.len() > 12 {
54 data_capacity += s.len();
55 }
56 }
57 TapeElement::Null => {
58 // Do not increase capacity for null values
59 }
60 // For booleans, do not increase capacity (both "true" and "false" are less than
61 // 12 bytes)
62 TapeElement::True if coerce => {}
63 TapeElement::False if coerce => {}
64 // For Number, use the same strategy as for strings
65 TapeElement::Number(idx) if coerce => {
66 let s = tape.get_string(idx);
67 if s.len() > 12 {
68 data_capacity += s.len();
69 }
70 }
71 // For I64, only add capacity if the absolute value is greater than 999,999,999,999
72 // (the largest number that can fit in 12 bytes)
73 TapeElement::I64(_) if coerce => {
74 match tape.get(p + 1) {
75 TapeElement::I32(_) => {
76 let high = match tape.get(p) {
77 TapeElement::I64(h) => h,
78 _ => unreachable!(),
79 };
80 let low = match tape.get(p + 1) {
81 TapeElement::I32(l) => l,
82 _ => unreachable!(),
83 };
84 let val = ((high as i64) << 32) | (low as u32) as i64;
85 if val.abs() > 999_999_999_999 {
86 // Only allocate capacity based on the string representation if the number is large
87 data_capacity += val.to_string().len();
88 }
89 }
90 _ => unreachable!(),
91 }
92 }
93 // For I32, do not increase capacity (the longest string representation is <= 12 bytes)
94 TapeElement::I32(_) if coerce => {}
95 // For F32 and F64, keep the existing estimate
96 TapeElement::F32(_) if coerce => {
97 data_capacity += 10;
98 }
99 TapeElement::F64(_) if coerce => {
100 data_capacity += 10;
101 }
102 _ => {
103 return Err(tape.error(p, "string"));
104 }
105 }
106 }
107
108 let mut builder = GenericByteViewBuilder::<StringViewType>::with_capacity(data_capacity);
109 // Temporary buffer to avoid per-iteration allocation for numeric types
110 let mut tmp_buf = String::new();
111
112 for &p in pos {
113 match tape.get(p) {
114 TapeElement::String(idx) => {
115 builder.append_value(tape.get_string(idx));
116 }
117 TapeElement::Null => {
118 builder.append_null();
119 }
120 TapeElement::True if coerce => {
121 builder.append_value(TRUE);
122 }
123 TapeElement::False if coerce => {
124 builder.append_value(FALSE);
125 }
126 TapeElement::Number(idx) if coerce => {
127 builder.append_value(tape.get_string(idx));
128 }
129 TapeElement::I64(high) if coerce => match tape.get(p + 1) {
130 TapeElement::I32(low) => {
131 let val = ((high as i64) << 32) | (low as u32) as i64;
132 tmp_buf.clear();
133 // Reuse the temporary buffer instead of allocating a new String
134 write!(&mut tmp_buf, "{}", val).unwrap();
135 builder.append_value(&tmp_buf);
136 }
137 _ => unreachable!(),
138 },
139 TapeElement::I32(n) if coerce => {
140 tmp_buf.clear();
141 write!(&mut tmp_buf, "{}", n).unwrap();
142 builder.append_value(&tmp_buf);
143 }
144 TapeElement::F32(n) if coerce => {
145 tmp_buf.clear();
146 write!(&mut tmp_buf, "{}", n).unwrap();
147 builder.append_value(&tmp_buf);
148 }
149 TapeElement::F64(high) if coerce => match tape.get(p + 1) {
150 TapeElement::F32(low) => {
151 let val = f64::from_bits(((high as u64) << 32) | (low as u64));
152 tmp_buf.clear();
153 write!(&mut tmp_buf, "{}", val).unwrap();
154 builder.append_value(&tmp_buf);
155 }
156 _ => unreachable!(),
157 },
158 _ => unreachable!(),
159 }
160 }
161
162 let array = builder.finish();
163 Ok(array.into_data())
164 }
165}