arrow_json/reader/
binary_array.rs1use arrow_array::builder::{
19 BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder, GenericStringBuilder,
20};
21use arrow_array::{Array, GenericStringArray, OffsetSizeTrait};
22use arrow_data::ArrayData;
23use arrow_schema::ArrowError;
24use std::marker::PhantomData;
25
26use crate::reader::ArrayDecoder;
27use crate::reader::tape::{Tape, TapeElement};
28
29fn decode_hex_string(hex_string: &str) -> Result<Vec<u8>, ArrowError> {
31 let mut decoded = Vec::with_capacity(hex_string.len() / 2);
32 for substr in hex_string.as_bytes().chunks(2) {
33 let str = std::str::from_utf8(substr).map_err(|e| {
34 ArrowError::JsonError(format!("invalid utf8 in hex encoded binary data: {e}"))
35 })?;
36 let byte = u8::from_str_radix(str, 16).map_err(|e| {
37 ArrowError::JsonError(format!("invalid hex encoding in binary data: {e}"))
38 })?;
39 decoded.push(byte);
40 }
41 Ok(decoded)
42}
43
44#[derive(Default)]
45pub struct BinaryArrayDecoder<O: OffsetSizeTrait> {
46 phantom: PhantomData<O>,
47}
48
49impl<O: OffsetSizeTrait> ArrayDecoder for BinaryArrayDecoder<O> {
50 fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
51 let data_capacity = estimate_data_capacity(tape, pos)?;
52
53 if O::from_usize(data_capacity).is_none() {
54 return Err(ArrowError::JsonError(format!(
55 "offset overflow decoding {}",
56 GenericStringArray::<O>::DATA_TYPE
57 )));
58 }
59
60 let mut builder = GenericBinaryBuilder::<O>::with_capacity(pos.len(), data_capacity);
61
62 GenericStringBuilder::<O>::with_capacity(pos.len(), data_capacity);
63
64 for p in pos {
65 match tape.get(*p) {
66 TapeElement::String(idx) => {
67 let string = tape.get_string(idx);
68 let decoded = decode_hex_string(string)?;
69 builder.append_value(&decoded);
70 }
71 TapeElement::Null => builder.append_null(),
72 _ => unreachable!(),
73 }
74 }
75
76 Ok(builder.finish().into_data())
77 }
78}
79
80#[derive(Default)]
81pub struct FixedSizeBinaryArrayDecoder {
82 len: i32,
83}
84
85impl FixedSizeBinaryArrayDecoder {
86 pub fn new(len: i32) -> Self {
87 Self { len }
88 }
89}
90
91impl ArrayDecoder for FixedSizeBinaryArrayDecoder {
92 fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
93 let mut builder = FixedSizeBinaryBuilder::with_capacity(pos.len(), self.len);
94
95 for p in pos {
96 match tape.get(*p) {
97 TapeElement::String(idx) => {
98 let string = tape.get_string(idx);
99 let decoded = decode_hex_string(string)?;
100 builder.append_value(&decoded)?;
101 }
102 TapeElement::Null => builder.append_null(),
103 _ => unreachable!(),
104 }
105 }
106
107 Ok(builder.finish().into_data())
108 }
109}
110
111#[derive(Default)]
112pub struct BinaryViewDecoder {}
113
114impl ArrayDecoder for BinaryViewDecoder {
115 fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
116 let data_capacity = estimate_data_capacity(tape, pos)?;
117 let mut builder = BinaryViewBuilder::with_capacity(data_capacity);
118
119 for p in pos {
120 match tape.get(*p) {
121 TapeElement::String(idx) => {
122 let string = tape.get_string(idx);
123 let decoded = decode_hex_string(string)?;
124 builder.append_value(&decoded);
125 }
126 TapeElement::Null => builder.append_null(),
127 _ => unreachable!(),
128 }
129 }
130
131 Ok(builder.finish().into_data())
132 }
133}
134
135fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> Result<usize, ArrowError> {
136 let mut data_capacity = 0;
137 for p in pos {
138 match tape.get(*p) {
139 TapeElement::String(idx) => {
140 let string_len = tape.get_string(idx).len();
141 let decoded_len = string_len / 2;
143 data_capacity += decoded_len;
144 }
145 TapeElement::Null => {}
146 _ => {
147 return Err(tape.error(*p, "binary data encoded as string"));
148 }
149 }
150 }
151 Ok(data_capacity)
152}