Skip to main content

parquet/arrow/record_reader/
buffer.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::arrow::buffer::bit_util::iter_set_bits_rev;
19use crate::errors::Result;
20
21/// A buffer that supports padding with nulls
22pub trait ValuesBuffer {
23    /// Create a new buffer with capacity for at least `capacity` elements
24    ///
25    /// This allows pre-allocating buffers to avoid reallocations during reading,
26    /// improving performance when the number of values is known in advance.
27    fn with_capacity(capacity: usize) -> Self;
28
29    /// If a column contains nulls, more level data may be read than value data, as null
30    /// values are not encoded. Therefore, first the levels data is read, the null count
31    /// determined, and then the corresponding number of values read to a [`ValuesBuffer`].
32    ///
33    /// It is then necessary to move this values data into positions that correspond to
34    /// the non-null level positions. This is what this method does.
35    ///
36    /// It is provided with:
37    ///
38    /// - `read_offset` - the offset in [`ValuesBuffer`] to start null padding from
39    /// - `values_read` - the number of values read
40    /// - `levels_read` - the number of levels read
41    /// - `valid_mask` - a packed mask of valid levels
42    ///
43    /// Returns an error if the inputs are inconsistent, for example because the
44    /// decoded data was corrupt. This must not panic on such input.
45    fn pad_nulls(
46        &mut self,
47        read_offset: usize,
48        values_read: usize,
49        levels_read: usize,
50        valid_mask: &[u8],
51    ) -> Result<()>;
52}
53
54impl<T: Copy + Default> ValuesBuffer for Vec<T> {
55    fn with_capacity(capacity: usize) -> Self {
56        Vec::with_capacity(capacity)
57    }
58
59    fn pad_nulls(
60        &mut self,
61        read_offset: usize,
62        values_read: usize,
63        levels_read: usize,
64        valid_mask: &[u8],
65    ) -> Result<()> {
66        self.resize(read_offset + levels_read, T::default());
67
68        let values_range = read_offset..read_offset + values_read;
69        for (value_pos, level_pos) in values_range.rev().zip(iter_set_bits_rev(valid_mask)) {
70            debug_assert!(level_pos >= value_pos);
71            if level_pos <= value_pos {
72                break;
73            }
74            self[level_pos] = self[value_pos];
75        }
76        Ok(())
77    }
78}