Skip to main content

parquet/arrow/buffer/
view_buffer.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
18use crate::arrow::record_reader::buffer::ValuesBuffer;
19use arrow_array::{ArrayRef, BinaryViewArray, StringViewArray};
20use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
21use arrow_schema::DataType as ArrowType;
22use std::sync::Arc;
23
24/// A buffer of view type byte arrays that can be converted into
25/// `GenericByteViewArray`
26///
27/// Note this does not reuse `GenericByteViewBuilder` due to the need to call `pad_nulls`
28/// and reuse the existing logic for Vec in the parquet crate
29#[derive(Debug, Default)]
30pub struct ViewBuffer {
31    pub views: Vec<u128>,
32    pub buffers: Vec<Buffer>,
33}
34
35impl ViewBuffer {
36    /// Create a new ViewBuffer with capacity for the specified number of views
37    pub fn with_capacity(capacity: usize) -> Self {
38        Self {
39            views: Vec::with_capacity(capacity),
40            buffers: Vec::new(),
41        }
42    }
43
44    pub fn is_empty(&self) -> bool {
45        self.views.is_empty()
46    }
47
48    pub fn append_block(&mut self, block: Buffer) -> u32 {
49        let block_id = self.buffers.len() as u32;
50        self.buffers.push(block);
51        block_id
52    }
53
54    /// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer`
55    pub fn into_array(self, null_buffer: Option<Buffer>, data_type: &ArrowType) -> ArrayRef {
56        let len = self.views.len();
57        let views = ScalarBuffer::from(self.views);
58        let nulls = null_buffer.and_then(|b| NullBuffer::from_unsliced_buffer(b, len));
59        match data_type {
60            ArrowType::Utf8View => {
61                // Safety: views were created correctly, and checked that the data is utf8 when building the buffer
62                unsafe { Arc::new(StringViewArray::new_unchecked(views, self.buffers, nulls)) }
63            }
64            ArrowType::BinaryView => {
65                // Safety: views were created correctly
66                unsafe { Arc::new(BinaryViewArray::new_unchecked(views, self.buffers, nulls)) }
67            }
68            _ => panic!("Unsupported data type: {data_type}"),
69        }
70    }
71}
72
73impl ValuesBuffer for ViewBuffer {
74    fn with_capacity(capacity: usize) -> Self {
75        Self::with_capacity(capacity)
76    }
77
78    fn pad_nulls(
79        &mut self,
80        read_offset: usize,
81        values_read: usize,
82        levels_read: usize,
83        valid_mask: &[u8],
84    ) {
85        self.views
86            .pad_nulls(read_offset, values_read, levels_read, valid_mask);
87    }
88}
89
#[cfg(test)]
mod tests {

    use arrow::array::make_view;
    use arrow_array::Array;

    use super::*;

    /// Bytes backing every view created by the tests below.
    const PAYLOAD: &[u8; 41] = b"0123456789long string to test string view";

    /// Builds a buffer holding one block (`PAYLOAD`) and three views into it.
    fn three_view_buffer() -> ViewBuffer {
        let mut buffer = ViewBuffer::with_capacity(0);
        let block_id = buffer.append_block(Buffer::from(PAYLOAD));
        buffer.views.extend([
            make_view(&PAYLOAD[0..1], block_id, 0),
            make_view(&PAYLOAD[1..10], block_id, 1),
            make_view(&PAYLOAD[10..41], block_id, 10),
        ]);
        buffer
    }

    /// Downcasts `array` to a string view array, panicking on a type mismatch.
    fn string_views(array: &ArrayRef) -> &StringViewArray {
        array.as_any().downcast_ref::<StringViewArray>().unwrap()
    }

    #[test]
    fn test_view_buffer_empty() {
        let array = ViewBuffer::with_capacity(0).into_array(None, &ArrowType::Utf8View);
        assert_eq!(string_views(&array).len(), 0);
    }

    #[test]
    fn test_view_buffer_append_view() {
        let array = three_view_buffer().into_array(None, &ArrowType::Utf8View);

        let actual: Vec<Option<&str>> = string_views(&array).iter().collect();
        let expected = vec![
            Some("0"),
            Some("123456789"),
            Some("long string to test string view"),
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_view_buffer_pad_null() {
        let mut buffer = three_view_buffer();

        // Validity of the seven output slots; the three values land on the `true` ones.
        let valid = [true, false, false, true, false, false, true];
        let valid_mask: Buffer = valid.iter().copied().collect();

        buffer.pad_nulls(1, 2, valid.len() - 1, valid_mask.as_slice());

        let array = buffer.into_array(Some(valid_mask), &ArrowType::Utf8View);

        let actual: Vec<Option<&str>> = string_views(&array).iter().collect();
        let expected = vec![
            Some("0"),
            None,
            None,
            Some("123456789"),
            None,
            None,
            Some("long string to test string view"),
        ];
        assert_eq!(actual, expected);
    }
}