arrow_json/reader/
binary_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use arrow_array::builder::{BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder};
19use arrow_array::{Array, GenericStringArray, OffsetSizeTrait};
20use arrow_data::ArrayData;
21use arrow_schema::ArrowError;
22use std::io::Write;
23use std::marker::PhantomData;
24
25use crate::reader::ArrayDecoder;
26use crate::reader::tape::{Tape, TapeElement};
27
/// Returns the numeric value (`0..=15`) of an ASCII hexadecimal digit.
///
/// Both lowercase and uppercase letters are accepted. Any other byte,
/// including every non-ASCII byte, yields `None`.
#[inline]
fn decode_hex_digit(byte: u8) -> Option<u8> {
    // A radix-16 `to_digit` recognises exactly `0-9`, `a-f` and `A-F`, and its
    // result never exceeds 15, so the narrowing conversion cannot fail.
    char::from(byte)
        .to_digit(16)
        .and_then(|value| u8::try_from(value).ok())
}
37
38fn invalid_hex_error_at(index: usize, byte: u8) -> ArrowError {
39    ArrowError::JsonError(format!(
40        "invalid hex encoding in binary data: invalid digit 0x{byte:02x} at position {index}"
41    ))
42}
43
44fn decode_hex_to_writer<W: Write>(hex_string: &str, writer: &mut W) -> Result<(), ArrowError> {
45    let bytes = hex_string.as_bytes();
46    let mut iter = bytes.chunks_exact(2);
47    let mut buffer = [0u8; 64];
48    let mut buffered = 0;
49
50    for (pair_index, pair) in (&mut iter).enumerate() {
51        let base = pair_index * 2;
52        let high = decode_hex_digit(pair[0]).ok_or_else(|| invalid_hex_error_at(base, pair[0]))?;
53        let low =
54            decode_hex_digit(pair[1]).ok_or_else(|| invalid_hex_error_at(base + 1, pair[1]))?;
55        buffer[buffered] = (high << 4) | low;
56        buffered += 1;
57
58        if buffered == buffer.len() {
59            writer
60                .write_all(&buffer)
61                .map_err(|e| ArrowError::JsonError(format!("failed to write binary data: {e}")))?;
62            buffered = 0;
63        }
64    }
65
66    let remainder = iter.remainder();
67    if !remainder.is_empty() {
68        let index = (bytes.len() / 2) * 2;
69        let low = decode_hex_digit(remainder[0])
70            .ok_or_else(|| invalid_hex_error_at(index, remainder[0]))?;
71        buffer[buffered] = low;
72        buffered += 1;
73    }
74
75    if buffered > 0 {
76        writer
77            .write_all(&buffer[..buffered])
78            .map_err(|e| ArrowError::JsonError(format!("failed to write binary data: {e}")))?;
79    }
80
81    Ok(())
82}
83
84#[derive(Default)]
85pub struct BinaryArrayDecoder<O: OffsetSizeTrait> {
86    phantom: PhantomData<O>,
87}
88
89impl<O: OffsetSizeTrait> ArrayDecoder for BinaryArrayDecoder<O> {
90    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
91        let data_capacity = estimate_data_capacity(tape, pos)?;
92
93        if O::from_usize(data_capacity).is_none() {
94            return Err(ArrowError::JsonError(format!(
95                "offset overflow decoding {}",
96                GenericStringArray::<O>::DATA_TYPE
97            )));
98        }
99
100        let mut builder = GenericBinaryBuilder::<O>::with_capacity(pos.len(), data_capacity);
101
102        for p in pos {
103            match tape.get(*p) {
104                TapeElement::String(idx) => {
105                    let string = tape.get_string(idx);
106                    // Decode directly into the builder for performance. If decoding fails,
107                    // the error is terminal and the builder is discarded by the caller.
108                    decode_hex_to_writer(string, &mut builder)?;
109                    builder.append_value(b"");
110                }
111                TapeElement::Null => builder.append_null(),
112                _ => unreachable!(),
113            }
114        }
115
116        Ok(builder.finish().into_data())
117    }
118}
119
120#[derive(Default)]
121pub struct FixedSizeBinaryArrayDecoder {
122    len: i32,
123}
124
125impl FixedSizeBinaryArrayDecoder {
126    pub fn new(len: i32) -> Self {
127        Self { len }
128    }
129}
130
131impl ArrayDecoder for FixedSizeBinaryArrayDecoder {
132    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
133        let mut builder = FixedSizeBinaryBuilder::with_capacity(pos.len(), self.len);
134        // Preallocate for the decoded byte width (FixedSizeBinary len), not the hex string length.
135        let mut scratch = Vec::with_capacity(self.len as usize);
136
137        for p in pos {
138            match tape.get(*p) {
139                TapeElement::String(idx) => {
140                    let string = tape.get_string(idx);
141                    scratch.clear();
142                    scratch.reserve(string.len().div_ceil(2));
143                    decode_hex_to_writer(string, &mut scratch)?;
144                    builder.append_value(&scratch)?;
145                }
146                TapeElement::Null => builder.append_null(),
147                _ => unreachable!(),
148            }
149        }
150
151        Ok(builder.finish().into_data())
152    }
153}
154
155#[derive(Default)]
156pub struct BinaryViewDecoder {}
157
158impl ArrayDecoder for BinaryViewDecoder {
159    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
160        let data_capacity = estimate_data_capacity(tape, pos)?;
161        let mut builder = BinaryViewBuilder::with_capacity(data_capacity);
162        let mut scratch = Vec::new();
163
164        for p in pos {
165            match tape.get(*p) {
166                TapeElement::String(idx) => {
167                    let string = tape.get_string(idx);
168                    scratch.clear();
169                    scratch.reserve(string.len().div_ceil(2));
170                    decode_hex_to_writer(string, &mut scratch)?;
171                    builder.append_value(&scratch);
172                }
173                TapeElement::Null => builder.append_null(),
174                _ => unreachable!(),
175            }
176        }
177
178        Ok(builder.finish().into_data())
179    }
180}
181
182fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> Result<usize, ArrowError> {
183    let mut data_capacity = 0;
184    for p in pos {
185        match tape.get(*p) {
186            TapeElement::String(idx) => {
187                let string_len = tape.get_string(idx).len();
188                // two hex characters represent one byte
189                let decoded_len = string_len.div_ceil(2);
190                data_capacity += decoded_len;
191            }
192            TapeElement::Null => {}
193            _ => {
194                return Err(tape.error(*p, "binary data encoded as string"));
195            }
196        }
197    }
198    Ok(data_capacity)
199}
200
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ReaderBuilder;
    use arrow_schema::{DataType, Field};
    use std::io::Cursor;

    /// An empty hex string decodes to no bytes at all.
    #[test]
    fn test_decode_hex_to_writer_empty() {
        let mut decoded: Vec<u8> = Vec::new();
        let outcome = decode_hex_to_writer("", &mut decoded);
        outcome.unwrap();
        assert!(decoded.is_empty());
    }

    /// A trailing unpaired digit becomes its own byte, in the low nibble.
    #[test]
    fn test_decode_hex_to_writer_odd_length() {
        let cases: [(&str, &[u8]); 2] = [("0f0", &[0x0f, 0x00]), ("a", &[0x0a])];

        let mut decoded: Vec<u8> = Vec::new();
        for (input, expected) in cases {
            decoded.clear();
            decode_hex_to_writer(input, &mut decoded).unwrap();
            assert_eq!(decoded, expected);
        }
    }

    /// A non-hex byte is reported as a `JsonError` naming its position.
    #[test]
    fn test_decode_hex_to_writer_invalid() {
        let mut decoded: Vec<u8> = Vec::new();
        let failure = decode_hex_to_writer("0f0g", &mut decoded).unwrap_err();

        let ArrowError::JsonError(msg) = failure else {
            panic!("expected JsonError");
        };
        assert!(msg.contains("invalid hex encoding in binary data"));
        assert!(msg.contains("position 3"));
    }

    /// After a row with invalid hex, the reader must not yield a batch.
    #[test]
    fn test_binary_reader_invalid_hex_is_terminal() {
        let data = b"\"0f0g\"\n\"0f00\"\n";
        let item = Field::new("item", DataType::Binary, false);
        let mut reader = ReaderBuilder::new_with_field(item)
            .build(Cursor::new(data))
            .unwrap();

        let first = reader.next().unwrap();
        let err = first.unwrap_err().to_string();
        assert!(err.contains("invalid hex encoding in binary data"));

        // Either end-of-stream or another error is fine; a batch is not.
        assert!(
            !matches!(reader.next(), Some(Ok(_))),
            "expected terminal error after invalid hex"
        );
    }
}
256}