parquet/arrow/buffer/
view_buffer.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::arrow::record_reader::buffer::ValuesBuffer;
19use arrow_array::{builder::make_view, make_array, ArrayRef};
20use arrow_buffer::Buffer;
21use arrow_data::ArrayDataBuilder;
22use arrow_schema::DataType as ArrowType;
23
24/// A buffer of view type byte arrays that can be converted into
25/// `GenericByteViewArray`
26///
27/// Note this does not reuse `GenericByteViewBuilder` due to the need to call `pad_nulls`
28/// and reuse the existing logic for Vec in the parquet crate
29#[derive(Debug, Default)]
30pub struct ViewBuffer {
31    pub views: Vec<u128>,
32    pub buffers: Vec<Buffer>,
33}
34
35impl ViewBuffer {
36    pub fn is_empty(&self) -> bool {
37        self.views.is_empty()
38    }
39
40    pub fn append_block(&mut self, block: Buffer) -> u32 {
41        let block_id = self.buffers.len() as u32;
42        self.buffers.push(block);
43        block_id
44    }
45
46    /// # Safety
47    /// This method is only safe when:
48    /// - `block` is a valid index, i.e., the return value of `append_block`
49    /// - `offset` and `offset + len` are valid indices into the buffer
50    /// - The `(offset, offset + len)` is valid value for the native type.
51    pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
52        let b = self.buffers.get_unchecked(block as usize);
53        let end = offset.saturating_add(len);
54        let b = b.get_unchecked(offset as usize..end as usize);
55
56        let view = make_view(b, block, offset);
57
58        self.views.push(view);
59    }
60
61    /// Directly append a view to the view array.
62    /// This is used when we create a StringViewArray from a dictionary whose values are StringViewArray.
63    ///
64    /// # Safety
65    /// The `view` must be a valid view as per the ByteView spec.
66    pub unsafe fn append_raw_view_unchecked(&mut self, view: &u128) {
67        self.views.push(*view);
68    }
69
70    /// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer`
71    pub fn into_array(self, null_buffer: Option<Buffer>, data_type: &ArrowType) -> ArrayRef {
72        let len = self.views.len();
73        let views = Buffer::from_vec(self.views);
74        match data_type {
75            ArrowType::Utf8View => {
76                let builder = ArrayDataBuilder::new(ArrowType::Utf8View)
77                    .len(len)
78                    .add_buffer(views)
79                    .add_buffers(self.buffers)
80                    .null_bit_buffer(null_buffer);
81                // We have checked that the data is utf8 when building the buffer, so it is safe
82                let array = unsafe { builder.build_unchecked() };
83                make_array(array)
84            }
85            ArrowType::BinaryView => {
86                let builder = ArrayDataBuilder::new(ArrowType::BinaryView)
87                    .len(len)
88                    .add_buffer(views)
89                    .add_buffers(self.buffers)
90                    .null_bit_buffer(null_buffer);
91                let array = unsafe { builder.build_unchecked() };
92                make_array(array)
93            }
94            _ => panic!("Unsupported data type: {:?}", data_type),
95        }
96    }
97}
98
99impl ValuesBuffer for ViewBuffer {
100    fn pad_nulls(
101        &mut self,
102        read_offset: usize,
103        values_read: usize,
104        levels_read: usize,
105        valid_mask: &[u8],
106    ) {
107        self.views
108            .pad_nulls(read_offset, values_read, levels_read, valid_mask);
109    }
110}
111
#[cfg(test)]
mod tests {

    use arrow_array::Array;

    use super::*;

    /// Builds a buffer with a single block and three views over it:
    /// `"0"`, `"123456789"` and `"long string to test string view"`.
    fn three_value_buffer() -> ViewBuffer {
        let mut buffer = ViewBuffer::default();
        let block_id =
            buffer.append_block(Buffer::from(b"0123456789long string to test string view"));

        for (offset, len) in [(0, 1), (1, 9), (10, 31)] {
            // SAFETY: `block_id` was just returned by `append_block`, and
            // every `(offset, len)` pair lies inside the 41-byte ASCII block.
            unsafe { buffer.append_view_unchecked(block_id, offset, len) };
        }

        buffer
    }

    /// Downcasts `array` to a `StringViewArray`, panicking on any other type.
    fn as_string_view(array: &ArrayRef) -> &arrow::array::StringViewArray {
        array
            .as_any()
            .downcast_ref::<arrow::array::StringViewArray>()
            .unwrap()
    }

    #[test]
    fn test_view_buffer_empty() {
        let array = ViewBuffer::default().into_array(None, &ArrowType::Utf8View);
        assert_eq!(as_string_view(&array).len(), 0);
    }

    #[test]
    fn test_view_buffer_append_view() {
        let array = three_value_buffer().into_array(None, &ArrowType::Utf8View);

        let actual: Vec<_> = as_string_view(&array).iter().collect();
        let expected = vec![
            Some("0"),
            Some("123456789"),
            Some("long string to test string view"),
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_view_buffer_pad_null() {
        let mut buffer = three_value_buffer();

        let valid = [true, false, false, true, false, false, true];
        let valid_mask = Buffer::from_iter(valid.iter().copied());

        buffer.pad_nulls(1, 2, valid.len() - 1, valid_mask.as_slice());

        let array = buffer.into_array(Some(valid_mask), &ArrowType::Utf8View);

        let actual: Vec<_> = as_string_view(&array).iter().collect();
        let expected = vec![
            Some("0"),
            None,
            None,
            Some("123456789"),
            None,
            None,
            Some("long string to test string view"),
        ];
        assert_eq!(actual, expected);
    }
}