parquet_variant/
decoder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17use crate::utils::{
18    array_from_slice, overflow_error, slice_from_slice_at_offset, string_from_slice,
19};
20use crate::ShortString;
21
22use arrow_schema::ArrowError;
23use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
24
/// The basic type of a [`Variant`] value, encoded in the first two bits of the
/// header byte.
///
/// Each discriminant is the raw two-bit code as it appears in the header, so
/// `variant as u8` yields the encoded value.
///
/// See the [Variant Encoding specification] for details
///
/// [`Variant`]: crate::Variant
/// [Variant Encoding specification]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum VariantBasicType {
    /// A primitive value; the remaining six header bits select the
    /// [`VariantPrimitiveType`].
    Primitive = 0,
    /// A short string; the remaining six header bits hold its length.
    ShortString = 1,
    /// An object value.
    Object = 2,
    /// An array value.
    Array = 3,
}
39
/// The type of [`VariantBasicType::Primitive`], for a primitive [`Variant`]
/// value.
///
/// Each discriminant is the primitive type id stored in the upper six bits of
/// the value header byte.
///
/// See the [Variant Encoding specification] for details
///
/// [`Variant`]: crate::Variant
/// [Variant Encoding specification]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum VariantPrimitiveType {
    /// The null value.
    Null = 0,
    /// Boolean `true`.
    BooleanTrue = 1,
    /// Boolean `false`.
    BooleanFalse = 2,
    /// 8-bit signed integer.
    Int8 = 3,
    /// 16-bit signed little-endian integer.
    Int16 = 4,
    /// 32-bit signed little-endian integer.
    Int32 = 5,
    /// 64-bit signed little-endian integer.
    Int64 = 6,
    /// 64-bit little-endian floating point number.
    Double = 7,
    /// Decimal: one scale byte followed by a 32-bit little-endian unscaled value.
    Decimal4 = 8,
    /// Decimal: one scale byte followed by a 64-bit little-endian unscaled value.
    Decimal8 = 9,
    /// Decimal: one scale byte followed by a 128-bit little-endian unscaled value.
    Decimal16 = 10,
    /// Date, stored as a 32-bit count of days since the Unix epoch.
    Date = 11,
    /// UTC timestamp, stored as a 64-bit count of microseconds since the Unix epoch.
    TimestampMicros = 12,
    /// Timestamp without time zone, stored as a 64-bit count of microseconds
    /// since the Unix epoch.
    TimestampNtzMicros = 13,
    /// 32-bit little-endian floating point number.
    Float = 14,
    /// Binary data, prefixed by a 4-byte little-endian length.
    Binary = 15,
    /// String data, prefixed by a 4-byte little-endian length.
    String = 16,
}
67
68/// Extracts the basic type from a header byte
69pub(crate) fn get_basic_type(header: u8) -> VariantBasicType {
70    // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding
71    let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits
72    match basic_type {
73        0 => VariantBasicType::Primitive,
74        1 => VariantBasicType::ShortString,
75        2 => VariantBasicType::Object,
76        3 => VariantBasicType::Array,
77        _ => {
78            //NOTE:  A 2-bit value has a max of 4 different values (0-3), hence this is unreachable as we
79            // masked `basic_type` with 0x03 above.
80            unreachable!();
81        }
82    }
83}
84
85impl TryFrom<u8> for VariantPrimitiveType {
86    type Error = ArrowError;
87
88    fn try_from(value: u8) -> Result<Self, Self::Error> {
89        match value {
90            0 => Ok(VariantPrimitiveType::Null),
91            1 => Ok(VariantPrimitiveType::BooleanTrue),
92            2 => Ok(VariantPrimitiveType::BooleanFalse),
93            3 => Ok(VariantPrimitiveType::Int8),
94            4 => Ok(VariantPrimitiveType::Int16),
95            5 => Ok(VariantPrimitiveType::Int32),
96            6 => Ok(VariantPrimitiveType::Int64),
97            7 => Ok(VariantPrimitiveType::Double),
98            8 => Ok(VariantPrimitiveType::Decimal4),
99            9 => Ok(VariantPrimitiveType::Decimal8),
100            10 => Ok(VariantPrimitiveType::Decimal16),
101            11 => Ok(VariantPrimitiveType::Date),
102            12 => Ok(VariantPrimitiveType::TimestampMicros),
103            13 => Ok(VariantPrimitiveType::TimestampNtzMicros),
104            14 => Ok(VariantPrimitiveType::Float),
105            15 => Ok(VariantPrimitiveType::Binary),
106            16 => Ok(VariantPrimitiveType::String),
107            _ => Err(ArrowError::InvalidArgumentError(format!(
108                "unknown primitive type: {value}",
109            ))),
110        }
111    }
112}
113
/// Used to unpack offset array entries such as metadata dictionary offsets or object/array value
/// offsets. Also used to unpack object field ids. These are always derived from a two-bit
/// `XXX_size_minus_one` field in the corresponding header byte.
///
/// Each discriminant is the width in bytes of one encoded entry, so `size as usize` can be used
/// directly as a stride.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) enum OffsetSizeBytes {
    /// Entries are one byte wide.
    One = 1,
    /// Entries are two bytes wide.
    Two = 2,
    /// Entries are three bytes wide.
    Three = 3,
    /// Entries are four bytes wide.
    Four = 4,
}
124
125impl OffsetSizeBytes {
126    /// Build from the `offset_size_minus_one` bits (see spec).
127    pub(crate) fn try_new(offset_size_minus_one: u8) -> Result<Self, ArrowError> {
128        use OffsetSizeBytes::*;
129        let result = match offset_size_minus_one {
130            0 => One,
131            1 => Two,
132            2 => Three,
133            3 => Four,
134            _ => {
135                return Err(ArrowError::InvalidArgumentError(
136                    "offset_size_minus_one must be 0–3".to_string(),
137                ))
138            }
139        };
140        Ok(result)
141    }
142
143    /// Return one unsigned little-endian value from `bytes`.
144    ///
145    /// * `bytes` – the byte buffer to index
146    /// * `index` – 0-based index into the buffer
147    ///
148    /// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended to 32 bits as needed.
149    pub(crate) fn unpack_u32(&self, bytes: &[u8], index: usize) -> Result<u32, ArrowError> {
150        self.unpack_u32_at_offset(bytes, 0, index)
151    }
152
153    /// Return one unsigned little-endian value from `bytes`.
154    ///
155    /// * `bytes` – the byte buffer to index
156    /// * `byte_offset` – number of bytes to skip **before** reading the first
157    ///   value (e.g. `1` to move past a header byte).
158    /// * `offset_index` – 0-based index **after** the skipped bytes
159    ///   (`0` is the first value, `1` the next, …).
160    ///
161    /// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended to 32 bits as needed.
162    pub(crate) fn unpack_u32_at_offset(
163        &self,
164        bytes: &[u8],
165        byte_offset: usize,  // how many bytes to skip
166        offset_index: usize, // which offset in an array of offsets
167    ) -> Result<u32, ArrowError> {
168        use OffsetSizeBytes::*;
169
170        // Index into the byte array:
171        // byte_offset + (*self as usize) * offset_index
172        let offset = offset_index
173            .checked_mul(*self as usize)
174            .and_then(|n| n.checked_add(byte_offset))
175            .ok_or_else(|| overflow_error("unpacking offset array value"))?;
176        let value = match self {
177            One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
178            Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
179            Three => {
180                // Let's grab the three byte le-chunk first
181                let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
182                // Let's pad it and construct a padded u32 from it.
183                let mut buf = [0u8; 4];
184                buf[..3].copy_from_slice(&b3_chunks);
185                u32::from_le_bytes(buf)
186            }
187            Four => u32::from_le_bytes(array_from_slice(bytes, offset)?),
188        };
189        Ok(value)
190    }
191}
192
193/// Converts a byte buffer to offset values based on the specific offset size
194pub(crate) fn map_bytes_to_offsets(
195    buffer: &[u8],
196    offset_size: OffsetSizeBytes,
197) -> impl Iterator<Item = usize> + use<'_> {
198    buffer
199        .chunks_exact(offset_size as usize)
200        .map(move |chunk| match offset_size {
201            OffsetSizeBytes::One => chunk[0] as usize,
202            OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize,
203            OffsetSizeBytes::Three => {
204                u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize
205            }
206            OffsetSizeBytes::Four => {
207                u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize
208            }
209        })
210}
211
212/// Extract the primitive type from a Variant value-metadata byte
213pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType, ArrowError> {
214    // last 6 bits contain the primitive-type, see spec
215    VariantPrimitiveType::try_from(metadata >> 2)
216}
217
218/// Decodes an Int8 from the value section of a variant.
219pub(crate) fn decode_int8(data: &[u8]) -> Result<i8, ArrowError> {
220    Ok(i8::from_le_bytes(array_from_slice(data, 0)?))
221}
222
223/// Decodes an Int16 from the value section of a variant.
224pub(crate) fn decode_int16(data: &[u8]) -> Result<i16, ArrowError> {
225    Ok(i16::from_le_bytes(array_from_slice(data, 0)?))
226}
227
228/// Decodes an Int32 from the value section of a variant.
229pub(crate) fn decode_int32(data: &[u8]) -> Result<i32, ArrowError> {
230    Ok(i32::from_le_bytes(array_from_slice(data, 0)?))
231}
232
233/// Decodes an Int64 from the value section of a variant.
234pub(crate) fn decode_int64(data: &[u8]) -> Result<i64, ArrowError> {
235    Ok(i64::from_le_bytes(array_from_slice(data, 0)?))
236}
237
238/// Decodes a Decimal4 from the value section of a variant.
239pub(crate) fn decode_decimal4(data: &[u8]) -> Result<(i32, u8), ArrowError> {
240    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
241    let integer = i32::from_le_bytes(array_from_slice(data, 1)?);
242    Ok((integer, scale))
243}
244
245/// Decodes a Decimal8 from the value section of a variant.
246pub(crate) fn decode_decimal8(data: &[u8]) -> Result<(i64, u8), ArrowError> {
247    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
248    let integer = i64::from_le_bytes(array_from_slice(data, 1)?);
249    Ok((integer, scale))
250}
251
252/// Decodes a Decimal16 from the value section of a variant.
253pub(crate) fn decode_decimal16(data: &[u8]) -> Result<(i128, u8), ArrowError> {
254    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
255    let integer = i128::from_le_bytes(array_from_slice(data, 1)?);
256    Ok((integer, scale))
257}
258
259/// Decodes a Float from the value section of a variant.
260pub(crate) fn decode_float(data: &[u8]) -> Result<f32, ArrowError> {
261    Ok(f32::from_le_bytes(array_from_slice(data, 0)?))
262}
263
264/// Decodes a Double from the value section of a variant.
265pub(crate) fn decode_double(data: &[u8]) -> Result<f64, ArrowError> {
266    Ok(f64::from_le_bytes(array_from_slice(data, 0)?))
267}
268
269/// Decodes a Date from the value section of a variant.
270pub(crate) fn decode_date(data: &[u8]) -> Result<NaiveDate, ArrowError> {
271    let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?);
272    let value = DateTime::UNIX_EPOCH + Duration::days(i64::from(days_since_epoch));
273    Ok(value.date_naive())
274}
275
276/// Decodes a TimestampMicros from the value section of a variant.
277pub(crate) fn decode_timestamp_micros(data: &[u8]) -> Result<DateTime<Utc>, ArrowError> {
278    let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
279    DateTime::from_timestamp_micros(micros_since_epoch).ok_or_else(|| {
280        ArrowError::CastError(format!(
281            "Could not cast `{micros_since_epoch}` microseconds into a DateTime<Utc>"
282        ))
283    })
284}
285
286/// Decodes a TimestampNtzMicros from the value section of a variant.
287pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result<NaiveDateTime, ArrowError> {
288    let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
289    DateTime::from_timestamp_micros(micros_since_epoch)
290        .ok_or_else(|| {
291            ArrowError::CastError(format!(
292                "Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime"
293            ))
294        })
295        .map(|v| v.naive_utc())
296}
297
298/// Decodes a Binary from the value section of a variant.
299pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> {
300    let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
301    slice_from_slice_at_offset(data, 4, 0..len)
302}
303
304/// Decodes a long string from the value section of a variant.
305pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> {
306    let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
307    string_from_slice(data, 4, 0..len)
308}
309
310/// Decodes a short string from the value section of a variant.
311pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<ShortString, ArrowError> {
312    let len = (metadata >> 2) as usize;
313    let string = string_from_slice(data, 0, 0..len)?;
314    ShortString::try_new(string)
315}
316
#[cfg(test)]
mod tests {
    use super::*;
    use paste::paste;

    /// Generates two tests for a fixed-width decoder:
    ///
    /// * `<name>_exact_length` – decoding `$data` yields `$expected`
    /// * `<name>_truncated_length` – decoding `$data` minus its last byte
    ///   fails with `ArrowError::InvalidArgumentError`
    macro_rules! test_decoder_bounds {
        ($test_name:ident, $data:expr, $decode_fn:ident, $expected:expr) => {
            paste! {
                #[test]
                fn [<$test_name _exact_length>]() {
                    let result = $decode_fn(&$data).unwrap();
                    assert_eq!(result, $expected);
                }

                #[test]
                fn [<$test_name _truncated_length>]() {
                    // Remove the last byte of data so that there is not enough to decode
                    let truncated_data = &$data[.. $data.len() - 1];
                    let result = $decode_fn(truncated_data);
                    assert!(matches!(result, Err(ArrowError::InvalidArgumentError(_))));
                }
            }
        };
    }

    mod integer {
        use super::*;

        test_decoder_bounds!(test_i8, [0x2a], decode_int8, 42);
        test_decoder_bounds!(test_i16, [0xd2, 0x04], decode_int16, 1234);
        test_decoder_bounds!(test_i32, [0x40, 0xe2, 0x01, 0x00], decode_int32, 123456);
        test_decoder_bounds!(
            test_i64,
            [0x15, 0x81, 0xe9, 0x7d, 0xf4, 0x10, 0x22, 0x11],
            decode_int64,
            1234567890123456789
        );
    }

    mod decimal {
        use super::*;

        test_decoder_bounds!(
            test_decimal4,
            [
                0x02, // Scale
                0xd2, 0x04, 0x00, 0x00, // Unscaled Value
            ],
            decode_decimal4,
            (1234, 2)
        );

        test_decoder_bounds!(
            test_decimal8,
            [
                0x02, // Scale
                0xd2, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // Unscaled Value
            ],
            decode_decimal8,
            (1234567890, 2)
        );

        test_decoder_bounds!(
            test_decimal16,
            [
                0x02, // Scale
                0xd2, 0xb6, 0x23, 0xc0, 0xf4, 0x10, 0x22, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, // Unscaled Value
            ],
            decode_decimal16,
            (1234567891234567890, 2)
        );
    }

    mod float {
        use super::*;

        test_decoder_bounds!(
            test_float,
            [0x06, 0x2c, 0x93, 0x4e],
            decode_float,
            1234567890.1234
        );

        test_decoder_bounds!(
            test_double,
            [0xc9, 0xe5, 0x87, 0xb4, 0x80, 0x65, 0xd2, 0x41],
            decode_double,
            1234567890.1234
        );
    }

    mod datetime {
        use super::*;

        test_decoder_bounds!(
            test_date,
            [0xe2, 0x4e, 0x0, 0x0],
            decode_date,
            NaiveDate::from_ymd_opt(2025, 4, 16).unwrap()
        );

        test_decoder_bounds!(
            test_timestamp_micros,
            [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00],
            decode_timestamp_micros,
            NaiveDate::from_ymd_opt(2025, 4, 16)
                .unwrap()
                .and_hms_milli_opt(16, 34, 56, 780)
                .unwrap()
                .and_utc()
        );

        test_decoder_bounds!(
            test_timestampntz_micros,
            [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00],
            decode_timestampntz_micros,
            NaiveDate::from_ymd_opt(2025, 4, 16)
                .unwrap()
                .and_hms_milli_opt(16, 34, 56, 780)
                .unwrap()
        );
    }

    #[test]
    fn test_binary_exact_length() {
        let data = [
            0x09, 0, 0, 0, // Length of binary data, 4-byte little-endian
            0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe,
        ];
        let result = decode_binary(&data).unwrap();
        assert_eq!(
            result,
            [0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe]
        );
    }

    #[test]
    fn test_binary_truncated_length() {
        let data = [
            0x09, 0, 0, 0, // Length of binary data, 4-byte little-endian
            0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca,
        ];
        let result = decode_binary(&data);
        assert!(matches!(result, Err(ArrowError::InvalidArgumentError(_))));
    }

    #[test]
    fn test_short_string_exact_length() {
        // The buffer holds one byte more than the encoded length (5); only the
        // first five bytes must be decoded.
        let data = [b'H', b'e', b'l', b'l', b'o', b'o'];
        let result = decode_short_string(1 | 5 << 2, &data).unwrap();
        assert_eq!(result.0, "Hello");
    }

    #[test]
    fn test_short_string_truncated_length() {
        let data = [b'H', b'e', b'l'];
        let result = decode_short_string(1 | 5 << 2, &data);
        assert!(matches!(result, Err(ArrowError::InvalidArgumentError(_))));
    }

    #[test]
    fn test_string_exact_length() {
        // The buffer holds one byte more than the encoded length (5); only the
        // first five bytes must be decoded.
        let data = [
            0x05, 0, 0, 0, // Length of string, 4-byte little-endian
            b'H', b'e', b'l', b'l', b'o', b'o',
        ];
        let result = decode_long_string(&data).unwrap();
        assert_eq!(result, "Hello");
    }

    #[test]
    fn test_string_truncated_length() {
        let data = [
            0x05, 0, 0, 0, // Length of string, 4-byte little-endian
            b'H', b'e', b'l',
        ];
        let result = decode_long_string(&data);
        assert!(matches!(result, Err(ArrowError::InvalidArgumentError(_))));
    }

    #[test]
    fn test_get_basic_type() {
        assert_eq!(get_basic_type(0b0000_0000), VariantBasicType::Primitive);
        assert_eq!(get_basic_type(0b0000_0001), VariantBasicType::ShortString);
        assert_eq!(get_basic_type(0b0000_0010), VariantBasicType::Object);
        assert_eq!(get_basic_type(0b0000_0011), VariantBasicType::Array);

        // only the two low bits are significant
        assert_eq!(get_basic_type(0b1111_1100), VariantBasicType::Primitive);
        assert_eq!(get_basic_type(0b1111_1111), VariantBasicType::Array);
    }

    #[test]
    fn test_get_primitive_type() {
        // the primitive type id lives in the upper six bits
        assert_eq!(
            get_primitive_type(0).unwrap(),
            VariantPrimitiveType::Null
        );
        assert_eq!(
            get_primitive_type(6 << 2).unwrap(),
            VariantPrimitiveType::Int64
        );
        assert_eq!(
            get_primitive_type(16 << 2).unwrap(),
            VariantPrimitiveType::String
        );

        // the two low bits do not affect the result
        assert_eq!(
            get_primitive_type(3 << 2 | 0b11).unwrap(),
            VariantPrimitiveType::Int8
        );

        // ids above 16 are unknown
        assert!(matches!(
            get_primitive_type(17 << 2),
            Err(ArrowError::InvalidArgumentError(_))
        ));
        assert!(matches!(
            get_primitive_type(63 << 2),
            Err(ArrowError::InvalidArgumentError(_))
        ));
    }

    #[test]
    fn test_map_bytes_to_offsets() {
        let one: Vec<usize> =
            map_bytes_to_offsets(&[0x01, 0xAB, 0xCD], OffsetSizeBytes::One).collect();
        assert_eq!(one, vec![0x01, 0xAB, 0xCD]);

        let two: Vec<usize> =
            map_bytes_to_offsets(&[0x34, 0x12, 0x78, 0x56], OffsetSizeBytes::Two).collect();
        assert_eq!(two, vec![0x1234, 0x5678]);

        let three: Vec<usize> = map_bytes_to_offsets(
            &[0x01, 0x02, 0x03, 0xFF, 0x00, 0x00],
            OffsetSizeBytes::Three,
        )
        .collect();
        assert_eq!(three, vec![0x030201, 0x0000FF]);

        let four: Vec<usize> = map_bytes_to_offsets(
            &[0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90],
            OffsetSizeBytes::Four,
        )
        .collect();
        assert_eq!(four, vec![0x1234_5678, 0x90AB_CDEF]);

        // a trailing partial entry is ignored
        let partial: Vec<usize> =
            map_bytes_to_offsets(&[0x34, 0x12, 0x78], OffsetSizeBytes::Two).collect();
        assert_eq!(partial, vec![0x1234]);
    }

    #[test]
    fn test_offset() {
        assert_eq!(OffsetSizeBytes::try_new(0).unwrap(), OffsetSizeBytes::One);
        assert_eq!(OffsetSizeBytes::try_new(1).unwrap(), OffsetSizeBytes::Two);
        assert_eq!(OffsetSizeBytes::try_new(2).unwrap(), OffsetSizeBytes::Three);
        assert_eq!(OffsetSizeBytes::try_new(3).unwrap(), OffsetSizeBytes::Four);

        // everything outside 0-3 must error
        assert!(OffsetSizeBytes::try_new(4).is_err());
        assert!(OffsetSizeBytes::try_new(255).is_err());
    }

    #[test]
    fn unpack_u32_all_widths() {
        // One-byte offsets
        let buf_one = [0x01u8, 0xAB, 0xCD];
        assert_eq!(OffsetSizeBytes::One.unpack_u32(&buf_one, 0).unwrap(), 0x01);
        assert_eq!(OffsetSizeBytes::One.unpack_u32(&buf_one, 2).unwrap(), 0xCD);

        // Two-byte offsets (little-endian 0x1234, 0x5678)
        let buf_two = [0x34, 0x12, 0x78, 0x56];
        assert_eq!(
            OffsetSizeBytes::Two.unpack_u32(&buf_two, 0).unwrap(),
            0x1234
        );
        assert_eq!(
            OffsetSizeBytes::Two.unpack_u32(&buf_two, 1).unwrap(),
            0x5678
        );

        // Three-byte offsets (0x030201 and 0x0000FF)
        let buf_three = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00];
        assert_eq!(
            OffsetSizeBytes::Three.unpack_u32(&buf_three, 0).unwrap(),
            0x030201
        );
        assert_eq!(
            OffsetSizeBytes::Three.unpack_u32(&buf_three, 1).unwrap(),
            0x0000FF
        );

        // Four-byte offsets (0x12345678, 0x90ABCDEF)
        let buf_four = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90];
        assert_eq!(
            OffsetSizeBytes::Four.unpack_u32(&buf_four, 0).unwrap(),
            0x1234_5678
        );
        assert_eq!(
            OffsetSizeBytes::Four.unpack_u32(&buf_four, 1).unwrap(),
            0x90AB_CDEF
        );
    }

    #[test]
    fn unpack_u32_out_of_bounds() {
        let tiny = [0x00u8]; // deliberately too short
        assert!(OffsetSizeBytes::Two.unpack_u32(&tiny, 0).is_err());
        assert!(OffsetSizeBytes::Three.unpack_u32(&tiny, 0).is_err());
    }

    #[test]
    fn unpack_simple() {
        let buf = [
            0x41, // header
            0x02, 0x00, // dictionary_size = 2
            0x00, 0x00, // offset[0] = 0
            0x05, 0x00, // offset[1] = 5
            0x09, 0x00, // offset[2] = 9
        ];

        let width = OffsetSizeBytes::Two;

        // dictionary_size starts immediately after the header byte
        let dict_size = width.unpack_u32_at_offset(&buf, 1, 0).unwrap();
        assert_eq!(dict_size, 2);

        // offset array immediately follows the dictionary size
        let first = width.unpack_u32_at_offset(&buf, 1, 1).unwrap();
        assert_eq!(first, 0);

        let second = width.unpack_u32_at_offset(&buf, 1, 2).unwrap();
        assert_eq!(second, 5);

        let third = width.unpack_u32_at_offset(&buf, 1, 3).unwrap();
        assert_eq!(third, 9);

        // reading one entry past the end of the buffer must fail
        let err = width.unpack_u32_at_offset(&buf, 1, 4);
        assert!(err.is_err())
    }
}