parquet/util/
bit_pack.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Vectorised bit-packing utilities
19
20/// Macro that generates an unpack function taking the number of bits as a const generic
21macro_rules! unpack_impl {
22    ($t:ty, $bytes:literal, $bits:tt) => {
23        pub fn unpack<const NUM_BITS: usize>(input: &[u8], output: &mut [$t; $bits]) {
24            if NUM_BITS == 0 {
25                for out in output {
26                    *out = 0;
27                }
28                return;
29            }
30
31            assert!(NUM_BITS <= $bytes * 8);
32
33            let mask = match NUM_BITS {
34                $bits => <$t>::MAX,
35                _ => ((1 << NUM_BITS) - 1),
36            };
37
38            assert!(input.len() >= NUM_BITS * $bytes);
39
40            let r = |output_idx: usize| {
41                <$t>::from_le_bytes(
42                    input[output_idx * $bytes..output_idx * $bytes + $bytes]
43                        .try_into()
44                        .unwrap(),
45                )
46            };
47
48            seq_macro::seq!(i in 0..$bits {
49                let start_bit = i * NUM_BITS;
50                let end_bit = start_bit + NUM_BITS;
51
52                let start_bit_offset = start_bit % $bits;
53                let end_bit_offset = end_bit % $bits;
54                let start_byte = start_bit / $bits;
55                let end_byte = end_bit / $bits;
56                if start_byte != end_byte && end_bit_offset != 0 {
57                    let val = r(start_byte);
58                    let a = val >> start_bit_offset;
59                    let val = r(end_byte);
60                    let b = val << (NUM_BITS - end_bit_offset);
61
62                    output[i] = a | (b & mask);
63                } else {
64                    let val = r(start_byte);
65                    output[i] = (val >> start_bit_offset) & mask;
66                }
67            });
68        }
69    };
70}
71
72/// Macro that generates unpack functions that accept num_bits as a parameter
73macro_rules! unpack {
74    ($name:ident, $t:ty, $bytes:literal, $bits:tt) => {
75        mod $name {
76            unpack_impl!($t, $bytes, $bits);
77        }
78
79        /// Unpack packed `input` into `output` with a bit width of `num_bits`
80        pub fn $name(input: &[u8], output: &mut [$t; $bits], num_bits: usize) {
81            // This will get optimised into a jump table
82            seq_macro::seq!(i in 0..=$bits {
83                if i == num_bits {
84                    return $name::unpack::<i>(input, output);
85                }
86            });
87            unreachable!("invalid num_bits {}", num_bits);
88        }
89    };
90}
91
92unpack!(unpack8, u8, 1, 8);
93unpack!(unpack16, u16, 2, 16);
94unpack!(unpack32, u32, 4, 32);
95unpack!(unpack64, u64, 8, 64);
96
97#[cfg(test)]
98mod tests {
99    use super::*;
100
101    #[test]
102    fn test_basic() {
103        let input = [0xFF; 4096];
104
105        for i in 0..=8 {
106            let mut output = [0; 8];
107            unpack8(&input, &mut output, i);
108            for (idx, out) in output.iter().enumerate() {
109                assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}");
110            }
111        }
112
113        for i in 0..=16 {
114            let mut output = [0; 16];
115            unpack16(&input, &mut output, i);
116            for (idx, out) in output.iter().enumerate() {
117                assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}");
118            }
119        }
120
121        for i in 0..=32 {
122            let mut output = [0; 32];
123            unpack32(&input, &mut output, i);
124            for (idx, out) in output.iter().enumerate() {
125                assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}");
126            }
127        }
128
129        for i in 0..=64 {
130            let mut output = [0; 64];
131            unpack64(&input, &mut output, i);
132            for (idx, out) in output.iter().enumerate() {
133                assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}");
134            }
135        }
136    }
137}