parquet_variant/
decoder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17use arrow_schema::ArrowError;
18use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
19use std::array::TryFromSliceError;
20
21use crate::utils::{array_from_slice, slice_from_slice, string_from_slice};
22
/// The basic type of a Variant value.
///
/// Each discriminant is the 2-bit value that `get_basic_type` extracts from
/// the low bits of a value header byte.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VariantBasicType {
    /// A primitive value; the concrete kind is a `VariantPrimitiveType`.
    Primitive = 0,
    /// A string whose length is stored in the header byte itself.
    ShortString = 1,
    /// An object value.
    Object = 2,
    /// An array value.
    Array = 3,
}
30
/// The kind of a primitive Variant value.
///
/// Each discriminant is the numeric type id accepted by the `TryFrom<u8>`
/// implementation for this enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VariantPrimitiveType {
    /// The null value.
    Null = 0,
    /// The boolean `true`.
    BooleanTrue = 1,
    /// The boolean `false`.
    BooleanFalse = 2,
    /// 8-bit signed integer.
    Int8 = 3,
    /// 16-bit signed integer.
    Int16 = 4,
    /// 32-bit signed integer.
    Int32 = 5,
    /// 64-bit signed integer.
    Int64 = 6,
    /// 64-bit floating point number.
    Double = 7,
    /// Decimal made of a one-byte scale and a 32-bit integer.
    Decimal4 = 8,
    /// Decimal made of a one-byte scale and a 64-bit integer.
    Decimal8 = 9,
    /// Decimal made of a one-byte scale and a 128-bit integer.
    Decimal16 = 10,
    /// Date stored as a 32-bit count of days since the Unix epoch.
    Date = 11,
    /// Timestamp stored as a 64-bit count of microseconds since the Unix epoch.
    TimestampMicros = 12,
    /// Like `TimestampMicros`, but decoded without a time zone.
    TimestampNtzMicros = 13,
    /// 32-bit floating point number.
    Float = 14,
    /// Binary data preceded by a 4-byte little-endian length.
    Binary = 15,
    /// String data preceded by a 4-byte little-endian length.
    String = 16,
}
51
52/// Extracts the basic type from a header byte
53pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> {
54    // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding
55    let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits
56    let basic_type = match basic_type {
57        0 => VariantBasicType::Primitive,
58        1 => VariantBasicType::ShortString,
59        2 => VariantBasicType::Object,
60        3 => VariantBasicType::Array,
61        _ => {
62            //NOTE:  A 2-bit value has a max of 4 different values (0-3), hence this is unreachable as we
63            // masked `basic_type` with 0x03 above.
64            unreachable!();
65        }
66    };
67    Ok(basic_type)
68}
69
70impl TryFrom<u8> for VariantPrimitiveType {
71    type Error = ArrowError;
72
73    fn try_from(value: u8) -> Result<Self, Self::Error> {
74        match value {
75            0 => Ok(VariantPrimitiveType::Null),
76            1 => Ok(VariantPrimitiveType::BooleanTrue),
77            2 => Ok(VariantPrimitiveType::BooleanFalse),
78            3 => Ok(VariantPrimitiveType::Int8),
79            4 => Ok(VariantPrimitiveType::Int16),
80            5 => Ok(VariantPrimitiveType::Int32),
81            6 => Ok(VariantPrimitiveType::Int64),
82            7 => Ok(VariantPrimitiveType::Double),
83            8 => Ok(VariantPrimitiveType::Decimal4),
84            9 => Ok(VariantPrimitiveType::Decimal8),
85            10 => Ok(VariantPrimitiveType::Decimal16),
86            11 => Ok(VariantPrimitiveType::Date),
87            12 => Ok(VariantPrimitiveType::TimestampMicros),
88            13 => Ok(VariantPrimitiveType::TimestampNtzMicros),
89            14 => Ok(VariantPrimitiveType::Float),
90            15 => Ok(VariantPrimitiveType::Binary),
91            16 => Ok(VariantPrimitiveType::String),
92            _ => Err(ArrowError::InvalidArgumentError(format!(
93                "unknown primitive type: {}",
94                value
95            ))),
96        }
97    }
98}
99/// Extract the primitive type from a Variant value-metadata byte
100pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType, ArrowError> {
101    // last 6 bits contain the primitive-type, see spec
102    VariantPrimitiveType::try_from(metadata >> 2)
103}
104
105/// To be used in `map_err` when unpacking an integer from a slice of bytes.
106fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError {
107    ArrowError::InvalidArgumentError(e.to_string())
108}
109
110/// Decodes an Int8 from the value section of a variant.
111pub(crate) fn decode_int8(data: &[u8]) -> Result<i8, ArrowError> {
112    Ok(i8::from_le_bytes(array_from_slice(data, 0)?))
113}
114
115/// Decodes an Int16 from the value section of a variant.
116pub(crate) fn decode_int16(data: &[u8]) -> Result<i16, ArrowError> {
117    Ok(i16::from_le_bytes(array_from_slice(data, 0)?))
118}
119
120/// Decodes an Int32 from the value section of a variant.
121pub(crate) fn decode_int32(data: &[u8]) -> Result<i32, ArrowError> {
122    Ok(i32::from_le_bytes(array_from_slice(data, 0)?))
123}
124
125/// Decodes an Int64 from the value section of a variant.
126pub(crate) fn decode_int64(data: &[u8]) -> Result<i64, ArrowError> {
127    Ok(i64::from_le_bytes(array_from_slice(data, 0)?))
128}
129
130/// Decodes a Decimal4 from the value section of a variant.
131pub(crate) fn decode_decimal4(data: &[u8]) -> Result<(i32, u8), ArrowError> {
132    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
133    let integer = i32::from_le_bytes(array_from_slice(data, 1)?);
134    Ok((integer, scale))
135}
136
137/// Decodes a Decimal8 from the value section of a variant.
138pub(crate) fn decode_decimal8(data: &[u8]) -> Result<(i64, u8), ArrowError> {
139    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
140    let integer = i64::from_le_bytes(array_from_slice(data, 1)?);
141    Ok((integer, scale))
142}
143
144/// Decodes a Decimal16 from the value section of a variant.
145pub(crate) fn decode_decimal16(data: &[u8]) -> Result<(i128, u8), ArrowError> {
146    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
147    let integer = i128::from_le_bytes(array_from_slice(data, 1)?);
148    Ok((integer, scale))
149}
150
151/// Decodes a Float from the value section of a variant.
152pub(crate) fn decode_float(data: &[u8]) -> Result<f32, ArrowError> {
153    Ok(f32::from_le_bytes(array_from_slice(data, 0)?))
154}
155
156/// Decodes a Double from the value section of a variant.
157pub(crate) fn decode_double(data: &[u8]) -> Result<f64, ArrowError> {
158    Ok(f64::from_le_bytes(array_from_slice(data, 0)?))
159}
160
161/// Decodes a Date from the value section of a variant.
162pub(crate) fn decode_date(data: &[u8]) -> Result<NaiveDate, ArrowError> {
163    let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?);
164    let value = DateTime::UNIX_EPOCH + Duration::days(i64::from(days_since_epoch));
165    Ok(value.date_naive())
166}
167
168/// Decodes a TimestampMicros from the value section of a variant.
169pub(crate) fn decode_timestamp_micros(data: &[u8]) -> Result<DateTime<Utc>, ArrowError> {
170    let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
171    DateTime::from_timestamp_micros(micros_since_epoch).ok_or_else(|| {
172        ArrowError::CastError(format!(
173            "Could not cast `{micros_since_epoch}` microseconds into a DateTime<Utc>"
174        ))
175    })
176}
177
178/// Decodes a TimestampNtzMicros from the value section of a variant.
179pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result<NaiveDateTime, ArrowError> {
180    let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
181    DateTime::from_timestamp_micros(micros_since_epoch)
182        .ok_or_else(|| {
183            ArrowError::CastError(format!(
184                "Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime"
185            ))
186        })
187        .map(|v| v.naive_utc())
188}
189
190/// Decodes a Binary from the value section of a variant.
191pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> {
192    let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
193    let value = slice_from_slice(data, 4..4 + len)?;
194    Ok(value)
195}
196
197/// Decodes a long string from the value section of a variant.
198pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> {
199    let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
200    let string = string_from_slice(data, 4..4 + len)?;
201    Ok(string)
202}
203
204/// Decodes a short string from the value section of a variant.
205pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<&str, ArrowError> {
206    let len = (metadata >> 2) as usize;
207    let string = string_from_slice(data, 0..len)?;
208    Ok(string)
209}
210
#[cfg(test)]
mod tests {
    use super::*;

    // Every test feeds a hand-encoded payload to one decoder and checks the
    // decoded value against the number, date or text it was encoded from.

    #[test]
    fn test_i8() -> Result<(), ArrowError> {
        assert_eq!(decode_int8(&[0x2a])?, 42);
        Ok(())
    }

    #[test]
    fn test_i16() -> Result<(), ArrowError> {
        assert_eq!(decode_int16(&[0xd2, 0x04])?, 1234);
        Ok(())
    }

    #[test]
    fn test_i32() -> Result<(), ArrowError> {
        assert_eq!(decode_int32(&[0x40, 0xe2, 0x01, 0x00])?, 123456);
        Ok(())
    }

    #[test]
    fn test_i64() -> Result<(), ArrowError> {
        let encoded = [0x15, 0x81, 0xe9, 0x7d, 0xf4, 0x10, 0x22, 0x11];
        assert_eq!(decode_int64(&encoded)?, 1234567890123456789);
        Ok(())
    }

    #[test]
    fn test_decimal4() -> Result<(), ArrowError> {
        let encoded = [
            0x02, // Scale
            0xd2, 0x04, 0x00, 0x00, // Integer
        ];
        assert_eq!(decode_decimal4(&encoded)?, (1234, 2));
        Ok(())
    }

    #[test]
    fn test_decimal8() -> Result<(), ArrowError> {
        let encoded = [
            0x02, // Scale
            0xd2, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // Integer
        ];
        assert_eq!(decode_decimal8(&encoded)?, (1234567890, 2));
        Ok(())
    }

    #[test]
    fn test_decimal16() -> Result<(), ArrowError> {
        let encoded = [
            0x02, // Scale
            0xd2, 0xb6, 0x23, 0xc0, 0xf4, 0x10, 0x22, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, // Integer
        ];
        assert_eq!(decode_decimal16(&encoded)?, (1234567891234567890, 2));
        Ok(())
    }

    #[test]
    fn test_float() -> Result<(), ArrowError> {
        let encoded = [0x06, 0x2c, 0x93, 0x4e];
        assert_eq!(decode_float(&encoded)?, 1234567890.1234);
        Ok(())
    }

    #[test]
    fn test_double() -> Result<(), ArrowError> {
        let encoded = [0xc9, 0xe5, 0x87, 0xb4, 0x80, 0x65, 0xd2, 0x41];
        assert_eq!(decode_double(&encoded)?, 1234567890.1234);
        Ok(())
    }

    #[test]
    fn test_date() -> Result<(), ArrowError> {
        let expected = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap();
        assert_eq!(decode_date(&[0xe2, 0x4e, 0x0, 0x0])?, expected);
        Ok(())
    }

    #[test]
    fn test_timestamp_micros() -> Result<(), ArrowError> {
        let encoded = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00];
        let expected = NaiveDate::from_ymd_opt(2025, 4, 16)
            .unwrap()
            .and_hms_milli_opt(16, 34, 56, 780)
            .unwrap()
            .and_utc();
        assert_eq!(decode_timestamp_micros(&encoded)?, expected);
        Ok(())
    }

    #[test]
    fn test_timestampntz_micros() -> Result<(), ArrowError> {
        let encoded = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00];
        let expected = NaiveDate::from_ymd_opt(2025, 4, 16)
            .unwrap()
            .and_hms_milli_opt(16, 34, 56, 780)
            .unwrap();
        assert_eq!(decode_timestampntz_micros(&encoded)?, expected);
        Ok(())
    }

    #[test]
    fn test_binary() -> Result<(), ArrowError> {
        let encoded = [
            0x09, 0, 0, 0, // Length of binary data, 4-byte little-endian
            0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe,
        ];
        let expected: [u8; 9] = [0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe];
        assert_eq!(decode_binary(&encoded)?, expected);
        Ok(())
    }

    #[test]
    fn test_short_string() -> Result<(), ArrowError> {
        // One trailing byte beyond the encoded length must be ignored.
        let encoded = [b'H', b'e', b'l', b'l', b'o', b'o'];
        // Header byte: length 5 in the upper six bits, basic type 1 in the low two.
        let header = (5 << 2) | 1;
        assert_eq!(decode_short_string(header, &encoded)?, "Hello");
        Ok(())
    }

    #[test]
    fn test_string() -> Result<(), ArrowError> {
        // One trailing byte beyond the encoded length must be ignored.
        let encoded = [
            0x05, 0, 0, 0, // Length of string, 4-byte little-endian
            b'H', b'e', b'l', b'l', b'o', b'o',
        ];
        assert_eq!(decode_long_string(&encoded)?, "Hello");
        Ok(())
    }
}