arrow_json/reader/
binary_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use arrow_array::builder::{
19    BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder, GenericStringBuilder,
20};
21use arrow_array::{Array, GenericStringArray, OffsetSizeTrait};
22use arrow_data::ArrayData;
23use arrow_schema::ArrowError;
24use std::marker::PhantomData;
25
26use crate::reader::ArrayDecoder;
27use crate::reader::tape::{Tape, TapeElement};
28
29/// Decode a hex-encoded string into bytes
30fn decode_hex_string(hex_string: &str) -> Result<Vec<u8>, ArrowError> {
31    let mut decoded = Vec::with_capacity(hex_string.len() / 2);
32    for substr in hex_string.as_bytes().chunks(2) {
33        let str = std::str::from_utf8(substr).map_err(|e| {
34            ArrowError::JsonError(format!("invalid utf8 in hex encoded binary data: {e}"))
35        })?;
36        let byte = u8::from_str_radix(str, 16).map_err(|e| {
37            ArrowError::JsonError(format!("invalid hex encoding in binary data: {e}"))
38        })?;
39        decoded.push(byte);
40    }
41    Ok(decoded)
42}
43
44#[derive(Default)]
45pub struct BinaryArrayDecoder<O: OffsetSizeTrait> {
46    phantom: PhantomData<O>,
47}
48
49impl<O: OffsetSizeTrait> ArrayDecoder for BinaryArrayDecoder<O> {
50    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
51        let data_capacity = estimate_data_capacity(tape, pos)?;
52
53        if O::from_usize(data_capacity).is_none() {
54            return Err(ArrowError::JsonError(format!(
55                "offset overflow decoding {}",
56                GenericStringArray::<O>::DATA_TYPE
57            )));
58        }
59
60        let mut builder = GenericBinaryBuilder::<O>::with_capacity(pos.len(), data_capacity);
61
62        GenericStringBuilder::<O>::with_capacity(pos.len(), data_capacity);
63
64        for p in pos {
65            match tape.get(*p) {
66                TapeElement::String(idx) => {
67                    let string = tape.get_string(idx);
68                    let decoded = decode_hex_string(string)?;
69                    builder.append_value(&decoded);
70                }
71                TapeElement::Null => builder.append_null(),
72                _ => unreachable!(),
73            }
74        }
75
76        Ok(builder.finish().into_data())
77    }
78}
79
80#[derive(Default)]
81pub struct FixedSizeBinaryArrayDecoder {
82    len: i32,
83}
84
85impl FixedSizeBinaryArrayDecoder {
86    pub fn new(len: i32) -> Self {
87        Self { len }
88    }
89}
90
91impl ArrayDecoder for FixedSizeBinaryArrayDecoder {
92    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
93        let mut builder = FixedSizeBinaryBuilder::with_capacity(pos.len(), self.len);
94
95        for p in pos {
96            match tape.get(*p) {
97                TapeElement::String(idx) => {
98                    let string = tape.get_string(idx);
99                    let decoded = decode_hex_string(string)?;
100                    builder.append_value(&decoded)?;
101                }
102                TapeElement::Null => builder.append_null(),
103                _ => unreachable!(),
104            }
105        }
106
107        Ok(builder.finish().into_data())
108    }
109}
110
111#[derive(Default)]
112pub struct BinaryViewDecoder {}
113
114impl ArrayDecoder for BinaryViewDecoder {
115    fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
116        let data_capacity = estimate_data_capacity(tape, pos)?;
117        let mut builder = BinaryViewBuilder::with_capacity(data_capacity);
118
119        for p in pos {
120            match tape.get(*p) {
121                TapeElement::String(idx) => {
122                    let string = tape.get_string(idx);
123                    let decoded = decode_hex_string(string)?;
124                    builder.append_value(&decoded);
125                }
126                TapeElement::Null => builder.append_null(),
127                _ => unreachable!(),
128            }
129        }
130
131        Ok(builder.finish().into_data())
132    }
133}
134
135fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> Result<usize, ArrowError> {
136    let mut data_capacity = 0;
137    for p in pos {
138        match tape.get(*p) {
139            TapeElement::String(idx) => {
140                let string_len = tape.get_string(idx).len();
141                // two hex characters represent one byte
142                let decoded_len = string_len / 2;
143                data_capacity += decoded_len;
144            }
145            TapeElement::Null => {}
146            _ => {
147                return Err(tape.error(*p, "binary data encoded as string"));
148            }
149        }
150    }
151    Ok(data_capacity)
152}