Skip to main content

arrow_json/reader/
string_view_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::fmt::Write;
19use std::sync::Arc;
20
21use arrow_array::ArrayRef;
22use arrow_array::builder::GenericByteViewBuilder;
23use arrow_array::types::StringViewType;
24use arrow_schema::ArrowError;
25
26use crate::reader::tape::{Tape, TapeElement};
27use crate::reader::{ArrayDecoder, DecoderContext};
28
/// Text appended for a tape `True` element when primitives are coerced to strings.
const TRUE: &str = "true";
/// Text appended for a tape `False` element when primitives are coerced to strings.
const FALSE: &str = "false";
31
32pub struct StringViewArrayDecoder {
33    coerce_primitive: bool,
34    ignore_type_conflicts: bool,
35}
36
37impl StringViewArrayDecoder {
38    pub fn new(ctx: &DecoderContext) -> Self {
39        Self {
40            coerce_primitive: ctx.coerce_primitive(),
41            ignore_type_conflicts: ctx.ignore_type_conflicts(),
42        }
43    }
44}
45
46impl ArrayDecoder for StringViewArrayDecoder {
47    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayRef, ArrowError> {
48        let coerce = self.coerce_primitive;
49        let mut data_capacity = 0;
50        for &p in pos {
51            // note that StringView is different that StringArray in that only
52            // "long" strings (longer than 12 bytes) are stored in the buffer.
53            // "short" strings are inlined into a fixed length structure.
54            match tape.get(p) {
55                TapeElement::String(idx) => {
56                    let s = tape.get_string(idx);
57                    // Only increase capacity if the string length is greater than 12 bytes
58                    if s.len() > 12 {
59                        data_capacity += s.len();
60                    }
61                }
62                TapeElement::Null => {
63                    // Do not increase capacity for null values
64                }
65                // For booleans, do not increase capacity (both "true" and "false" are less than
66                // 12 bytes)
67                TapeElement::True if coerce => {}
68                TapeElement::False if coerce => {}
69                // For Number, use the same strategy as for strings
70                TapeElement::Number(idx) if coerce => {
71                    let s = tape.get_string(idx);
72                    if s.len() > 12 {
73                        data_capacity += s.len();
74                    }
75                }
76                // For I64, only add capacity if the absolute value is greater than 999,999,999,999
77                // (the largest number that can fit in 12 bytes)
78                TapeElement::I64(_) if coerce => {
79                    match tape.get(p + 1) {
80                        TapeElement::I32(_) => {
81                            let high = match tape.get(p) {
82                                TapeElement::I64(h) => h,
83                                _ => unreachable!(),
84                            };
85                            let low = match tape.get(p + 1) {
86                                TapeElement::I32(l) => l,
87                                _ => unreachable!(),
88                            };
89                            let val = ((high as i64) << 32) | (low as u32) as i64;
90                            if val.abs() > 999_999_999_999 {
91                                // Only allocate capacity based on the string representation if the number is large
92                                data_capacity += val.to_string().len();
93                            }
94                        }
95                        _ => unreachable!(),
96                    }
97                }
98                // For I32, do not increase capacity (the longest string representation is <= 12 bytes)
99                TapeElement::I32(_) if coerce => {}
100                // For F32 and F64, keep the existing estimate
101                TapeElement::F32(_) if coerce => {
102                    data_capacity += 10;
103                }
104                TapeElement::F64(_) if coerce => {
105                    data_capacity += 10;
106                }
107                _ if self.ignore_type_conflicts => {} // treat type conflicts like nulls
108                _ => {
109                    return Err(tape.error(p, "string"));
110                }
111            }
112        }
113
114        let mut builder = GenericByteViewBuilder::<StringViewType>::with_capacity(data_capacity);
115        // Temporary buffer to avoid per-iteration allocation for numeric types
116        let mut tmp_buf = String::new();
117
118        for &p in pos {
119            match tape.get(p) {
120                TapeElement::String(idx) => {
121                    builder.append_value(tape.get_string(idx));
122                }
123                TapeElement::Null => {
124                    builder.append_null();
125                }
126                TapeElement::True if coerce => {
127                    builder.append_value(TRUE);
128                }
129                TapeElement::False if coerce => {
130                    builder.append_value(FALSE);
131                }
132                TapeElement::Number(idx) if coerce => {
133                    builder.append_value(tape.get_string(idx));
134                }
135                TapeElement::I64(high) if coerce => match tape.get(p + 1) {
136                    TapeElement::I32(low) => {
137                        let val = ((high as i64) << 32) | (low as u32) as i64;
138                        tmp_buf.clear();
139                        // Reuse the temporary buffer instead of allocating a new String
140                        write!(&mut tmp_buf, "{val}").unwrap();
141                        builder.append_value(&tmp_buf);
142                    }
143                    _ => unreachable!(),
144                },
145                TapeElement::I32(n) if coerce => {
146                    tmp_buf.clear();
147                    write!(&mut tmp_buf, "{n}").unwrap();
148                    builder.append_value(&tmp_buf);
149                }
150                TapeElement::F32(n) if coerce => {
151                    tmp_buf.clear();
152                    write!(&mut tmp_buf, "{n}").unwrap();
153                    builder.append_value(&tmp_buf);
154                }
155                TapeElement::F64(high) if coerce => match tape.get(p + 1) {
156                    TapeElement::F32(low) => {
157                        let val = f64::from_bits(((high as u64) << 32) | (low as u64));
158                        tmp_buf.clear();
159                        write!(&mut tmp_buf, "{val}").unwrap();
160                        builder.append_value(&tmp_buf);
161                    }
162                    _ => unreachable!(),
163                },
164                _ if self.ignore_type_conflicts => {
165                    builder.append_null();
166                }
167                _ => unreachable!(),
168            }
169        }
170
171        Ok(Arc::new(builder.finish()))
172    }
173}