arrow_json/reader/
binary_array.rs1use arrow_array::builder::{BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder};
19use arrow_array::{Array, GenericStringArray, OffsetSizeTrait};
20use arrow_data::ArrayData;
21use arrow_schema::ArrowError;
22use std::io::Write;
23use std::marker::PhantomData;
24
25use crate::reader::ArrayDecoder;
26use crate::reader::tape::{Tape, TapeElement};
27
/// Converts a single ASCII hex digit (`0-9`, `a-f`, `A-F`) into its 4-bit value.
///
/// Returns `None` for any byte that is not a hexadecimal digit.
#[inline]
fn decode_hex_digit(byte: u8) -> Option<u8> {
    // `to_digit(16)` only accepts ASCII hex digits and yields a value below 16,
    // so narrowing the result back to `u8` cannot lose information.
    char::from(byte).to_digit(16).map(|value| value as u8)
}
37
38fn invalid_hex_error_at(index: usize, byte: u8) -> ArrowError {
39 ArrowError::JsonError(format!(
40 "invalid hex encoding in binary data: invalid digit 0x{byte:02x} at position {index}"
41 ))
42}
43
44fn decode_hex_to_writer<W: Write>(hex_string: &str, writer: &mut W) -> Result<(), ArrowError> {
45 let bytes = hex_string.as_bytes();
46 let mut iter = bytes.chunks_exact(2);
47 let mut buffer = [0u8; 64];
48 let mut buffered = 0;
49
50 for (pair_index, pair) in (&mut iter).enumerate() {
51 let base = pair_index * 2;
52 let high = decode_hex_digit(pair[0]).ok_or_else(|| invalid_hex_error_at(base, pair[0]))?;
53 let low =
54 decode_hex_digit(pair[1]).ok_or_else(|| invalid_hex_error_at(base + 1, pair[1]))?;
55 buffer[buffered] = (high << 4) | low;
56 buffered += 1;
57
58 if buffered == buffer.len() {
59 writer
60 .write_all(&buffer)
61 .map_err(|e| ArrowError::JsonError(format!("failed to write binary data: {e}")))?;
62 buffered = 0;
63 }
64 }
65
66 let remainder = iter.remainder();
67 if !remainder.is_empty() {
68 let index = (bytes.len() / 2) * 2;
69 let low = decode_hex_digit(remainder[0])
70 .ok_or_else(|| invalid_hex_error_at(index, remainder[0]))?;
71 buffer[buffered] = low;
72 buffered += 1;
73 }
74
75 if buffered > 0 {
76 writer
77 .write_all(&buffer[..buffered])
78 .map_err(|e| ArrowError::JsonError(format!("failed to write binary data: {e}")))?;
79 }
80
81 Ok(())
82}
83
84#[derive(Default)]
85pub struct BinaryArrayDecoder<O: OffsetSizeTrait> {
86 phantom: PhantomData<O>,
87}
88
89impl<O: OffsetSizeTrait> ArrayDecoder for BinaryArrayDecoder<O> {
90 fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
91 let data_capacity = estimate_data_capacity(tape, pos)?;
92
93 if O::from_usize(data_capacity).is_none() {
94 return Err(ArrowError::JsonError(format!(
95 "offset overflow decoding {}",
96 GenericStringArray::<O>::DATA_TYPE
97 )));
98 }
99
100 let mut builder = GenericBinaryBuilder::<O>::with_capacity(pos.len(), data_capacity);
101
102 for p in pos {
103 match tape.get(*p) {
104 TapeElement::String(idx) => {
105 let string = tape.get_string(idx);
106 decode_hex_to_writer(string, &mut builder)?;
109 builder.append_value(b"");
110 }
111 TapeElement::Null => builder.append_null(),
112 _ => unreachable!(),
113 }
114 }
115
116 Ok(builder.finish().into_data())
117 }
118}
119
120#[derive(Default)]
121pub struct FixedSizeBinaryArrayDecoder {
122 len: i32,
123}
124
125impl FixedSizeBinaryArrayDecoder {
126 pub fn new(len: i32) -> Self {
127 Self { len }
128 }
129}
130
131impl ArrayDecoder for FixedSizeBinaryArrayDecoder {
132 fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
133 let mut builder = FixedSizeBinaryBuilder::with_capacity(pos.len(), self.len);
134 let mut scratch = Vec::with_capacity(self.len as usize);
136
137 for p in pos {
138 match tape.get(*p) {
139 TapeElement::String(idx) => {
140 let string = tape.get_string(idx);
141 scratch.clear();
142 scratch.reserve(string.len().div_ceil(2));
143 decode_hex_to_writer(string, &mut scratch)?;
144 builder.append_value(&scratch)?;
145 }
146 TapeElement::Null => builder.append_null(),
147 _ => unreachable!(),
148 }
149 }
150
151 Ok(builder.finish().into_data())
152 }
153}
154
155#[derive(Default)]
156pub struct BinaryViewDecoder {}
157
158impl ArrayDecoder for BinaryViewDecoder {
159 fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
160 let data_capacity = estimate_data_capacity(tape, pos)?;
161 let mut builder = BinaryViewBuilder::with_capacity(data_capacity);
162 let mut scratch = Vec::new();
163
164 for p in pos {
165 match tape.get(*p) {
166 TapeElement::String(idx) => {
167 let string = tape.get_string(idx);
168 scratch.clear();
169 scratch.reserve(string.len().div_ceil(2));
170 decode_hex_to_writer(string, &mut scratch)?;
171 builder.append_value(&scratch);
172 }
173 TapeElement::Null => builder.append_null(),
174 _ => unreachable!(),
175 }
176 }
177
178 Ok(builder.finish().into_data())
179 }
180}
181
182fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> Result<usize, ArrowError> {
183 let mut data_capacity = 0;
184 for p in pos {
185 match tape.get(*p) {
186 TapeElement::String(idx) => {
187 let string_len = tape.get_string(idx).len();
188 let decoded_len = string_len.div_ceil(2);
190 data_capacity += decoded_len;
191 }
192 TapeElement::Null => {}
193 _ => {
194 return Err(tape.error(*p, "binary data encoded as string"));
195 }
196 }
197 }
198 Ok(data_capacity)
199}
200
201#[cfg(test)]
202mod tests {
203 use super::*;
204 use crate::ReaderBuilder;
205 use arrow_schema::{DataType, Field};
206 use std::io::Cursor;
207
208 #[test]
209 fn test_decode_hex_to_writer_empty() {
210 let mut out = Vec::new();
211 decode_hex_to_writer("", &mut out).unwrap();
212 assert!(out.is_empty());
213 }
214
215 #[test]
216 fn test_decode_hex_to_writer_odd_length() {
217 let mut out = Vec::new();
218 decode_hex_to_writer("0f0", &mut out).unwrap();
219 assert_eq!(out, vec![0x0f, 0x00]);
220
221 out.clear();
222 decode_hex_to_writer("a", &mut out).unwrap();
223 assert_eq!(out, vec![0x0a]);
224 }
225
226 #[test]
227 fn test_decode_hex_to_writer_invalid() {
228 let mut out = Vec::new();
229 let err = decode_hex_to_writer("0f0g", &mut out).unwrap_err();
230 match err {
231 ArrowError::JsonError(msg) => {
232 assert!(msg.contains("invalid hex encoding in binary data"));
233 assert!(msg.contains("position 3"));
234 }
235 _ => panic!("expected JsonError"),
236 }
237 }
238
239 #[test]
240 fn test_binary_reader_invalid_hex_is_terminal() {
241 let field = Field::new("item", DataType::Binary, false);
242 let data = b"\"0f0g\"\n\"0f00\"\n";
243 let mut reader = ReaderBuilder::new_with_field(field)
244 .build(Cursor::new(data))
245 .unwrap();
246
247 let err = reader.next().unwrap().unwrap_err().to_string();
248 assert!(err.contains("invalid hex encoding in binary data"));
249
250 match reader.next() {
251 None => {}
252 Some(Err(_)) => {}
253 Some(Ok(_)) => panic!("expected terminal error after invalid hex"),
254 }
255 }
256}