parquet_variant/
decoder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17use crate::utils::{
18    array_from_slice, overflow_error, slice_from_slice_at_offset, string_from_slice,
19};
20use crate::ShortString;
21
22use arrow_schema::ArrowError;
23use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
24
/// Top-level category of a [`Variant`] value.
///
/// The category lives in the two lowest bits of the value's header byte, and
/// each discriminant below is exactly that two-bit pattern.
///
/// Refer to the [Variant Encoding specification] for the full layout.
///
/// [`Variant`]: crate::Variant
/// [Variant Encoding specification]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum VariantBasicType {
    /// Bit pattern `00`: the upper six header bits carry a primitive type id
    /// (see [`VariantPrimitiveType`]).
    Primitive = 0b00,
    /// Bit pattern `01`: the upper six header bits carry the string length.
    ShortString = 0b01,
    /// Bit pattern `10`.
    Object = 0b10,
    /// Bit pattern `11`.
    Array = 0b11,
}
39
/// Identifies which primitive a [`Variant`] holds when its basic type is
/// [`VariantBasicType::Primitive`].
///
/// Each discriminant is the numeric type id accepted by the `TryFrom<u8>`
/// implementation for this enum.
///
/// Refer to the [Variant Encoding specification] for the full list.
///
/// [`Variant`]: crate::Variant
/// [Variant Encoding specification]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum VariantPrimitiveType {
    /// Type id 0.
    Null = 0,
    /// Type id 1: the boolean `true`.
    BooleanTrue = 1,
    /// Type id 2: the boolean `false`.
    BooleanFalse = 2,
    /// Type id 3: 8-bit signed integer.
    Int8 = 3,
    /// Type id 4: 16-bit signed integer.
    Int16 = 4,
    /// Type id 5: 32-bit signed integer.
    Int32 = 5,
    /// Type id 6: 64-bit signed integer.
    Int64 = 6,
    /// Type id 7: 64-bit floating point number.
    Double = 7,
    /// Type id 8: decimal with a 4-byte unscaled value.
    Decimal4 = 8,
    /// Type id 9: decimal with an 8-byte unscaled value.
    Decimal8 = 9,
    /// Type id 10: decimal with a 16-byte unscaled value.
    Decimal16 = 10,
    /// Type id 11: calendar date.
    Date = 11,
    /// Type id 12: timestamp with microsecond precision.
    TimestampMicros = 12,
    /// Type id 13: timestamp with microsecond precision and no time zone.
    TimestampNtzMicros = 13,
    /// Type id 14: 32-bit floating point number.
    Float = 14,
    /// Type id 15: arbitrary bytes.
    Binary = 15,
    /// Type id 16: string.
    String = 16,
}
67
68/// Extracts the basic type from a header byte
69pub(crate) fn get_basic_type(header: u8) -> VariantBasicType {
70    // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding
71    let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits
72    match basic_type {
73        0 => VariantBasicType::Primitive,
74        1 => VariantBasicType::ShortString,
75        2 => VariantBasicType::Object,
76        3 => VariantBasicType::Array,
77        _ => {
78            //NOTE:  A 2-bit value has a max of 4 different values (0-3), hence this is unreachable as we
79            // masked `basic_type` with 0x03 above.
80            unreachable!();
81        }
82    }
83}
84
85impl TryFrom<u8> for VariantPrimitiveType {
86    type Error = ArrowError;
87
88    fn try_from(value: u8) -> Result<Self, Self::Error> {
89        match value {
90            0 => Ok(VariantPrimitiveType::Null),
91            1 => Ok(VariantPrimitiveType::BooleanTrue),
92            2 => Ok(VariantPrimitiveType::BooleanFalse),
93            3 => Ok(VariantPrimitiveType::Int8),
94            4 => Ok(VariantPrimitiveType::Int16),
95            5 => Ok(VariantPrimitiveType::Int32),
96            6 => Ok(VariantPrimitiveType::Int64),
97            7 => Ok(VariantPrimitiveType::Double),
98            8 => Ok(VariantPrimitiveType::Decimal4),
99            9 => Ok(VariantPrimitiveType::Decimal8),
100            10 => Ok(VariantPrimitiveType::Decimal16),
101            11 => Ok(VariantPrimitiveType::Date),
102            12 => Ok(VariantPrimitiveType::TimestampMicros),
103            13 => Ok(VariantPrimitiveType::TimestampNtzMicros),
104            14 => Ok(VariantPrimitiveType::Float),
105            15 => Ok(VariantPrimitiveType::Binary),
106            16 => Ok(VariantPrimitiveType::String),
107            _ => Err(ArrowError::InvalidArgumentError(format!(
108                "unknown primitive type: {value}",
109            ))),
110        }
111    }
112}
113
/// Width, in bytes, of one packed unsigned integer.
///
/// Used when reading offset array entries (metadata dictionary offsets,
/// object/array value offsets) as well as object field ids. The width always
/// comes from a two-bit `XXX_size_minus_one` field in the corresponding header
/// byte, and each discriminant equals the width in bytes.
#[derive(Debug, Clone, Copy, PartialEq)]
pub(crate) enum OffsetSizeBytes {
    /// Values occupy a single byte.
    One = 1,
    /// Values occupy two bytes.
    Two = 2,
    /// Values occupy three bytes.
    Three = 3,
    /// Values occupy four bytes.
    Four = 4,
}
124
125impl OffsetSizeBytes {
126    /// Build from the `offset_size_minus_one` bits (see spec).
127    pub(crate) fn try_new(offset_size_minus_one: u8) -> Result<Self, ArrowError> {
128        use OffsetSizeBytes::*;
129        let result = match offset_size_minus_one {
130            0 => One,
131            1 => Two,
132            2 => Three,
133            3 => Four,
134            _ => {
135                return Err(ArrowError::InvalidArgumentError(
136                    "offset_size_minus_one must be 0–3".to_string(),
137                ))
138            }
139        };
140        Ok(result)
141    }
142
143    /// Return one unsigned little-endian value from `bytes`.
144    ///
145    /// * `bytes` – the byte buffer to index
146    /// * `index` – 0-based index into the buffer
147    ///
148    /// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended to 32 bits as needed.
149    pub(crate) fn unpack_u32(&self, bytes: &[u8], index: usize) -> Result<u32, ArrowError> {
150        self.unpack_u32_at_offset(bytes, 0, index)
151    }
152
153    /// Return one unsigned little-endian value from `bytes`.
154    ///
155    /// * `bytes` – the byte buffer to index
156    /// * `byte_offset` – number of bytes to skip **before** reading the first
157    ///   value (e.g. `1` to move past a header byte).
158    /// * `offset_index` – 0-based index **after** the skipped bytes
159    ///   (`0` is the first value, `1` the next, …).
160    ///
161    /// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended to 32 bits as needed.
162    pub(crate) fn unpack_u32_at_offset(
163        &self,
164        bytes: &[u8],
165        byte_offset: usize,  // how many bytes to skip
166        offset_index: usize, // which offset in an array of offsets
167    ) -> Result<u32, ArrowError> {
168        use OffsetSizeBytes::*;
169
170        // Index into the byte array:
171        // byte_offset + (*self as usize) * offset_index
172        let offset = offset_index
173            .checked_mul(*self as usize)
174            .and_then(|n| n.checked_add(byte_offset))
175            .ok_or_else(|| overflow_error("unpacking offset array value"))?;
176        let value = match self {
177            One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
178            Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
179            Three => {
180                // Let's grab the three byte le-chunk first
181                let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
182                // Let's pad it and construct a padded u32 from it.
183                let mut buf = [0u8; 4];
184                buf[..3].copy_from_slice(&b3_chunks);
185                u32::from_le_bytes(buf)
186            }
187            Four => u32::from_le_bytes(array_from_slice(bytes, offset)?),
188        };
189        Ok(value)
190    }
191}
192
193/// Converts a byte buffer to offset values based on the specific offset size
194pub(crate) fn map_bytes_to_offsets(
195    buffer: &[u8],
196    offset_size: OffsetSizeBytes,
197) -> impl Iterator<Item = usize> + use<'_> {
198    buffer
199        .chunks_exact(offset_size as usize)
200        .map(move |chunk| match offset_size {
201            OffsetSizeBytes::One => chunk[0] as usize,
202            OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize,
203            OffsetSizeBytes::Three => {
204                u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize
205            }
206            OffsetSizeBytes::Four => {
207                u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize
208            }
209        })
210}
211
212/// Extract the primitive type from a Variant value-metadata byte
213pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType, ArrowError> {
214    // last 6 bits contain the primitive-type, see spec
215    VariantPrimitiveType::try_from(metadata >> 2)
216}
217
218/// Decodes an Int8 from the value section of a variant.
219pub(crate) fn decode_int8(data: &[u8]) -> Result<i8, ArrowError> {
220    Ok(i8::from_le_bytes(array_from_slice(data, 0)?))
221}
222
223/// Decodes an Int16 from the value section of a variant.
224pub(crate) fn decode_int16(data: &[u8]) -> Result<i16, ArrowError> {
225    Ok(i16::from_le_bytes(array_from_slice(data, 0)?))
226}
227
228/// Decodes an Int32 from the value section of a variant.
229pub(crate) fn decode_int32(data: &[u8]) -> Result<i32, ArrowError> {
230    Ok(i32::from_le_bytes(array_from_slice(data, 0)?))
231}
232
233/// Decodes an Int64 from the value section of a variant.
234pub(crate) fn decode_int64(data: &[u8]) -> Result<i64, ArrowError> {
235    Ok(i64::from_le_bytes(array_from_slice(data, 0)?))
236}
237
238/// Decodes a Decimal4 from the value section of a variant.
239pub(crate) fn decode_decimal4(data: &[u8]) -> Result<(i32, u8), ArrowError> {
240    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
241    let integer = i32::from_le_bytes(array_from_slice(data, 1)?);
242    Ok((integer, scale))
243}
244
245/// Decodes a Decimal8 from the value section of a variant.
246pub(crate) fn decode_decimal8(data: &[u8]) -> Result<(i64, u8), ArrowError> {
247    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
248    let integer = i64::from_le_bytes(array_from_slice(data, 1)?);
249    Ok((integer, scale))
250}
251
252/// Decodes a Decimal16 from the value section of a variant.
253pub(crate) fn decode_decimal16(data: &[u8]) -> Result<(i128, u8), ArrowError> {
254    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
255    let integer = i128::from_le_bytes(array_from_slice(data, 1)?);
256    Ok((integer, scale))
257}
258
259/// Decodes a Float from the value section of a variant.
260pub(crate) fn decode_float(data: &[u8]) -> Result<f32, ArrowError> {
261    Ok(f32::from_le_bytes(array_from_slice(data, 0)?))
262}
263
264/// Decodes a Double from the value section of a variant.
265pub(crate) fn decode_double(data: &[u8]) -> Result<f64, ArrowError> {
266    Ok(f64::from_le_bytes(array_from_slice(data, 0)?))
267}
268
269/// Decodes a Date from the value section of a variant.
270pub(crate) fn decode_date(data: &[u8]) -> Result<NaiveDate, ArrowError> {
271    let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?);
272    let value = DateTime::UNIX_EPOCH + Duration::days(i64::from(days_since_epoch));
273    Ok(value.date_naive())
274}
275
276/// Decodes a TimestampMicros from the value section of a variant.
277pub(crate) fn decode_timestamp_micros(data: &[u8]) -> Result<DateTime<Utc>, ArrowError> {
278    let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
279    DateTime::from_timestamp_micros(micros_since_epoch).ok_or_else(|| {
280        ArrowError::CastError(format!(
281            "Could not cast `{micros_since_epoch}` microseconds into a DateTime<Utc>"
282        ))
283    })
284}
285
286/// Decodes a TimestampNtzMicros from the value section of a variant.
287pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result<NaiveDateTime, ArrowError> {
288    let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
289    DateTime::from_timestamp_micros(micros_since_epoch)
290        .ok_or_else(|| {
291            ArrowError::CastError(format!(
292                "Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime"
293            ))
294        })
295        .map(|v| v.naive_utc())
296}
297
298/// Decodes a Binary from the value section of a variant.
299pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> {
300    let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
301    slice_from_slice_at_offset(data, 4, 0..len)
302}
303
304/// Decodes a long string from the value section of a variant.
305pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> {
306    let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
307    string_from_slice(data, 4, 0..len)
308}
309
310/// Decodes a short string from the value section of a variant.
311pub(crate) fn decode_short_string(
312    metadata: u8,
313    data: &[u8],
314) -> Result<ShortString<'_>, ArrowError> {
315    let len = (metadata >> 2) as usize;
316    let string = string_from_slice(data, 0, 0..len)?;
317    ShortString::try_new(string)
318}
319
#[cfg(test)]
mod tests {
    use super::*;
    use paste::paste;

    /// Generates two tests for a decoder function:
    ///
    /// * `<name>_exact_length` decodes `$data` and compares with `$expected`;
    /// * `<name>_truncated_length` removes the final byte of `$data` and
    ///   expects an `ArrowError::InvalidArgumentError`.
    macro_rules! test_decoder_bounds {
        ($test_name:ident, $data:expr, $decode_fn:ident, $expected:expr) => {
            paste! {
                #[test]
                fn [<$test_name _exact_length>]() {
                    assert_eq!($decode_fn(&$data).unwrap(), $expected);
                }

                #[test]
                fn [<$test_name _truncated_length>]() {
                    // One byte short of what the decoder needs.
                    let full: &[u8] = &$data;
                    let short = &full[..full.len() - 1];
                    assert!(matches!(
                        $decode_fn(short),
                        Err(ArrowError::InvalidArgumentError(_))
                    ));
                }
            }
        };
    }

    mod integer {
        use super::*;

        test_decoder_bounds!(test_i8, [0x2a], decode_int8, 42);
        test_decoder_bounds!(test_i16, [0xd2, 0x04], decode_int16, 1234);
        test_decoder_bounds!(test_i32, [0x40, 0xe2, 0x01, 0x00], decode_int32, 123456);
        test_decoder_bounds!(
            test_i64,
            [0x15, 0x81, 0xe9, 0x7d, 0xf4, 0x10, 0x22, 0x11],
            decode_int64,
            1234567890123456789
        );
    }

    mod decimal {
        use super::*;

        // Every fixture is a scale byte followed by the little-endian
        // unscaled value.
        test_decoder_bounds!(
            test_decimal4,
            [
                0x02, // scale
                0xd2, 0x04, 0x00, 0x00, // unscaled value
            ],
            decode_decimal4,
            (1234, 2)
        );

        test_decoder_bounds!(
            test_decimal8,
            [
                0x02, // scale
                0xd2, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // unscaled value
            ],
            decode_decimal8,
            (1234567890, 2)
        );

        test_decoder_bounds!(
            test_decimal16,
            [
                0x02, // scale
                0xd2, 0xb6, 0x23, 0xc0, 0xf4, 0x10, 0x22, 0x11, // unscaled value, low half
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // unscaled value, high half
            ],
            decode_decimal16,
            (1234567891234567890, 2)
        );
    }

    mod float {
        use super::*;

        test_decoder_bounds!(
            test_float,
            [0x06, 0x2c, 0x93, 0x4e],
            decode_float,
            1234567890.1234
        );

        test_decoder_bounds!(
            test_double,
            [0xc9, 0xe5, 0x87, 0xb4, 0x80, 0x65, 0xd2, 0x41],
            decode_double,
            1234567890.1234
        );
    }

    mod datetime {
        use super::*;

        test_decoder_bounds!(
            test_date,
            [0xe2, 0x4e, 0x0, 0x0],
            decode_date,
            NaiveDate::from_ymd_opt(2025, 4, 16).unwrap()
        );

        test_decoder_bounds!(
            test_timestamp_micros,
            [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00],
            decode_timestamp_micros,
            NaiveDate::from_ymd_opt(2025, 4, 16)
                .unwrap()
                .and_hms_milli_opt(16, 34, 56, 780)
                .unwrap()
                .and_utc()
        );

        test_decoder_bounds!(
            test_timestampntz_micros,
            [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00],
            decode_timestampntz_micros,
            NaiveDate::from_ymd_opt(2025, 4, 16)
                .unwrap()
                .and_hms_milli_opt(16, 34, 56, 780)
                .unwrap()
        );
    }

    #[test]
    fn test_binary_exact_length() {
        let encoded = [
            0x09, 0, 0, 0, // payload length, 4-byte little-endian
            0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe,
        ];
        let payload = decode_binary(&encoded).unwrap();
        assert_eq!(
            payload,
            [0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe]
        );
    }

    #[test]
    fn test_binary_truncated_length() {
        // The prefix announces nine bytes but only eight follow.
        let encoded = [
            0x09, 0, 0, 0, // payload length, 4-byte little-endian
            0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca,
        ];
        assert!(matches!(
            decode_binary(&encoded),
            Err(ArrowError::InvalidArgumentError(_))
        ));
    }

    #[test]
    fn test_short_string_exact_length() {
        // Header: basic type 1 in the low bits, length 5 in the upper six.
        let header = (5 << 2) | 1;
        let bytes = [b'H', b'e', b'l', b'l', b'o', b'o'];
        let decoded = decode_short_string(header, &bytes).unwrap();
        assert_eq!(decoded.0, "Hello");
    }

    #[test]
    fn test_short_string_truncated_length() {
        // Header asks for five bytes, the buffer only has three.
        let header = (5 << 2) | 1;
        let bytes = [b'H', b'e', b'l'];
        assert!(matches!(
            decode_short_string(header, &bytes),
            Err(ArrowError::InvalidArgumentError(_))
        ));
    }

    #[test]
    fn test_string_exact_length() {
        let encoded = [
            0x05, 0, 0, 0, // string length, 4-byte little-endian
            b'H', b'e', b'l', b'l', b'o', b'o',
        ];
        assert_eq!(decode_long_string(&encoded).unwrap(), "Hello");
    }

    #[test]
    fn test_string_truncated_length() {
        // The prefix announces five bytes but only three follow.
        let encoded = [
            0x05, 0, 0, 0, // string length, 4-byte little-endian
            b'H', b'e', b'l',
        ];
        assert!(matches!(
            decode_long_string(&encoded),
            Err(ArrowError::InvalidArgumentError(_))
        ));
    }

    #[test]
    fn test_offset() {
        let valid = [
            (0, OffsetSizeBytes::One),
            (1, OffsetSizeBytes::Two),
            (2, OffsetSizeBytes::Three),
            (3, OffsetSizeBytes::Four),
        ];
        for (size_minus_one, expected) in valid.iter() {
            assert_eq!(OffsetSizeBytes::try_new(*size_minus_one).unwrap(), *expected);
        }

        // everything outside 0-3 must error
        for size_minus_one in [4u8, 255].iter() {
            assert!(OffsetSizeBytes::try_new(*size_minus_one).is_err());
        }
    }

    #[test]
    fn unpack_u32_all_widths() {
        // One-byte values
        let one = [0x01u8, 0xAB, 0xCD];
        // Two-byte values (little-endian 0x1234, 0x5678)
        let two = [0x34u8, 0x12, 0x78, 0x56];
        // Three-byte values (0x030201 and 0x0000FF)
        let three = [0x01u8, 0x02, 0x03, 0xFF, 0x00, 0x00];
        // Four-byte values (0x12345678, 0x90ABCDEF)
        let four = [0x78u8, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90];

        // (width, buffer, index, expected value)
        let cases: [(OffsetSizeBytes, &[u8], usize, u32); 8] = [
            (OffsetSizeBytes::One, &one[..], 0, 0x01),
            (OffsetSizeBytes::One, &one[..], 2, 0xCD),
            (OffsetSizeBytes::Two, &two[..], 0, 0x1234),
            (OffsetSizeBytes::Two, &two[..], 1, 0x5678),
            (OffsetSizeBytes::Three, &three[..], 0, 0x030201),
            (OffsetSizeBytes::Three, &three[..], 1, 0x0000FF),
            (OffsetSizeBytes::Four, &four[..], 0, 0x1234_5678),
            (OffsetSizeBytes::Four, &four[..], 1, 0x90AB_CDEF),
        ];
        for (width, bytes, index, expected) in cases.iter() {
            assert_eq!(width.unpack_u32(bytes, *index).unwrap(), *expected);
        }
    }

    #[test]
    fn unpack_u32_out_of_bounds() {
        let tiny = [0x00u8]; // deliberately too short
        for width in [OffsetSizeBytes::Two, OffsetSizeBytes::Three].iter() {
            assert!(width.unpack_u32(&tiny, 0).is_err());
        }
    }

    #[test]
    fn unpack_simple() {
        let buf = [
            0x41, // header
            0x02, 0x00, // dictionary_size = 2
            0x00, 0x00, // offset[0] = 0
            0x05, 0x00, // offset[1] = 5
            0x09, 0x00, // offset[2] = 9
        ];

        let width = OffsetSizeBytes::Two;

        // Skipping the header byte, slot 0 is dictionary_size and slots 1..=3
        // are the offset array that immediately follows it.
        let expected_slots = [2u32, 0, 5, 9];
        for (slot, expected) in expected_slots.iter().enumerate() {
            let value = width.unpack_u32_at_offset(&buf, 1, slot).unwrap();
            assert_eq!(value, *expected);
        }

        // Slot 4 would start past the end of the buffer.
        assert!(width.unpack_u32_at_offset(&buf, 1, 4).is_err());
    }
}