arrow_json/reader/string_view_array.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::fmt::Write;
19use std::sync::Arc;
20
21use arrow_array::ArrayRef;
22use arrow_array::builder::GenericByteViewBuilder;
23use arrow_array::types::StringViewType;
24use arrow_schema::ArrowError;
25
26use crate::reader::tape::{Tape, TapeElement};
27use crate::reader::{ArrayDecoder, DecoderContext};
28
/// Text appended for a JSON `true` when primitives are coerced to strings.
const TRUE: &str = "true";
/// Text appended for a JSON `false` when primitives are coerced to strings.
const FALSE: &str = "false";
31
/// [`ArrayDecoder`] that turns tape elements into a string-view array.
///
/// The two flags are captured once from the [`DecoderContext`] at
/// construction time and steer how non-string elements are treated while
/// decoding.
pub struct StringViewArrayDecoder {
    /// When set, booleans and numbers found on the tape are appended as their
    /// textual representation instead of being treated as a type conflict.
    coerce_primitive: bool,
    /// When set, elements that cannot be represented as a string are decoded
    /// as nulls instead of producing an error.
    ignore_type_conflicts: bool,
}
36
37impl StringViewArrayDecoder {
38 pub fn new(ctx: &DecoderContext) -> Self {
39 Self {
40 coerce_primitive: ctx.coerce_primitive(),
41 ignore_type_conflicts: ctx.ignore_type_conflicts(),
42 }
43 }
44}
45
46impl ArrayDecoder for StringViewArrayDecoder {
47 fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayRef, ArrowError> {
48 let coerce = self.coerce_primitive;
49 let mut data_capacity = 0;
50 for &p in pos {
51 // note that StringView is different that StringArray in that only
52 // "long" strings (longer than 12 bytes) are stored in the buffer.
53 // "short" strings are inlined into a fixed length structure.
54 match tape.get(p) {
55 TapeElement::String(idx) => {
56 let s = tape.get_string(idx);
57 // Only increase capacity if the string length is greater than 12 bytes
58 if s.len() > 12 {
59 data_capacity += s.len();
60 }
61 }
62 TapeElement::Null => {
63 // Do not increase capacity for null values
64 }
65 // For booleans, do not increase capacity (both "true" and "false" are less than
66 // 12 bytes)
67 TapeElement::True if coerce => {}
68 TapeElement::False if coerce => {}
69 // For Number, use the same strategy as for strings
70 TapeElement::Number(idx) if coerce => {
71 let s = tape.get_string(idx);
72 if s.len() > 12 {
73 data_capacity += s.len();
74 }
75 }
76 // For I64, only add capacity if the absolute value is greater than 999,999,999,999
77 // (the largest number that can fit in 12 bytes)
78 TapeElement::I64(_) if coerce => {
79 match tape.get(p + 1) {
80 TapeElement::I32(_) => {
81 let high = match tape.get(p) {
82 TapeElement::I64(h) => h,
83 _ => unreachable!(),
84 };
85 let low = match tape.get(p + 1) {
86 TapeElement::I32(l) => l,
87 _ => unreachable!(),
88 };
89 let val = ((high as i64) << 32) | (low as u32) as i64;
90 if val.abs() > 999_999_999_999 {
91 // Only allocate capacity based on the string representation if the number is large
92 data_capacity += val.to_string().len();
93 }
94 }
95 _ => unreachable!(),
96 }
97 }
98 // For I32, do not increase capacity (the longest string representation is <= 12 bytes)
99 TapeElement::I32(_) if coerce => {}
100 // For F32 and F64, keep the existing estimate
101 TapeElement::F32(_) if coerce => {
102 data_capacity += 10;
103 }
104 TapeElement::F64(_) if coerce => {
105 data_capacity += 10;
106 }
107 _ if self.ignore_type_conflicts => {} // treat type conflicts like nulls
108 _ => {
109 return Err(tape.error(p, "string"));
110 }
111 }
112 }
113
114 let mut builder = GenericByteViewBuilder::<StringViewType>::with_capacity(data_capacity);
115 // Temporary buffer to avoid per-iteration allocation for numeric types
116 let mut tmp_buf = String::new();
117
118 for &p in pos {
119 match tape.get(p) {
120 TapeElement::String(idx) => {
121 builder.append_value(tape.get_string(idx));
122 }
123 TapeElement::Null => {
124 builder.append_null();
125 }
126 TapeElement::True if coerce => {
127 builder.append_value(TRUE);
128 }
129 TapeElement::False if coerce => {
130 builder.append_value(FALSE);
131 }
132 TapeElement::Number(idx) if coerce => {
133 builder.append_value(tape.get_string(idx));
134 }
135 TapeElement::I64(high) if coerce => match tape.get(p + 1) {
136 TapeElement::I32(low) => {
137 let val = ((high as i64) << 32) | (low as u32) as i64;
138 tmp_buf.clear();
139 // Reuse the temporary buffer instead of allocating a new String
140 write!(&mut tmp_buf, "{val}").unwrap();
141 builder.append_value(&tmp_buf);
142 }
143 _ => unreachable!(),
144 },
145 TapeElement::I32(n) if coerce => {
146 tmp_buf.clear();
147 write!(&mut tmp_buf, "{n}").unwrap();
148 builder.append_value(&tmp_buf);
149 }
150 TapeElement::F32(n) if coerce => {
151 tmp_buf.clear();
152 write!(&mut tmp_buf, "{n}").unwrap();
153 builder.append_value(&tmp_buf);
154 }
155 TapeElement::F64(high) if coerce => match tape.get(p + 1) {
156 TapeElement::F32(low) => {
157 let val = f64::from_bits(((high as u64) << 32) | (low as u64));
158 tmp_buf.clear();
159 write!(&mut tmp_buf, "{val}").unwrap();
160 builder.append_value(&tmp_buf);
161 }
162 _ => unreachable!(),
163 },
164 _ if self.ignore_type_conflicts => {
165 builder.append_null();
166 }
167 _ => unreachable!(),
168 }
169 }
170
171 Ok(Arc::new(builder.finish()))
172 }
173}