Skip to main content

parquet/arrow/buffer/
view_buffer.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::arrow::record_reader::buffer::ValuesBuffer;
19use arrow_array::{ArrayRef, BinaryViewArray, StringViewArray};
20use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
21use arrow_schema::DataType as ArrowType;
22use std::sync::Arc;
23
24/// A buffer of view type byte arrays that can be converted into
25/// `GenericByteViewArray`
26///
27/// Note this does not reuse `GenericByteViewBuilder` due to the need to call `pad_nulls`
28/// and reuse the existing logic for Vec in the parquet crate
29#[derive(Debug, Default)]
30pub struct ViewBuffer {
31    pub views: Vec<u128>,
32    pub buffers: Vec<Buffer>,
33}
34
35impl ViewBuffer {
36    pub fn is_empty(&self) -> bool {
37        self.views.is_empty()
38    }
39
40    pub fn append_block(&mut self, block: Buffer) -> u32 {
41        let block_id = self.buffers.len() as u32;
42        self.buffers.push(block);
43        block_id
44    }
45
46    /// Directly append a view to the view array.
47    /// This is used when we create a StringViewArray from a dictionary whose values are StringViewArray.
48    ///
49    /// # Safety
50    /// The `view` must be a valid view as per the ByteView spec.
51    pub unsafe fn append_raw_view_unchecked(&mut self, view: u128) {
52        self.views.push(view);
53    }
54
55    /// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer`
56    pub fn into_array(self, null_buffer: Option<Buffer>, data_type: &ArrowType) -> ArrayRef {
57        let len = self.views.len();
58        let views = ScalarBuffer::from(self.views);
59        let nulls = null_buffer.and_then(|b| NullBuffer::from_unsliced_buffer(b, len));
60        match data_type {
61            ArrowType::Utf8View => {
62                // Safety: views were created correctly, and checked that the data is utf8 when building the buffer
63                unsafe { Arc::new(StringViewArray::new_unchecked(views, self.buffers, nulls)) }
64            }
65            ArrowType::BinaryView => {
66                // Safety: views were created correctly
67                unsafe { Arc::new(BinaryViewArray::new_unchecked(views, self.buffers, nulls)) }
68            }
69            _ => panic!("Unsupported data type: {data_type}"),
70        }
71    }
72}
73
74impl ValuesBuffer for ViewBuffer {
75    fn pad_nulls(
76        &mut self,
77        read_offset: usize,
78        values_read: usize,
79        levels_read: usize,
80        valid_mask: &[u8],
81    ) {
82        self.views
83            .pad_nulls(read_offset, values_read, levels_read, valid_mask);
84    }
85}
86
#[cfg(test)]
mod tests {
    use arrow::array::make_view;
    use arrow_array::Array;

    use super::*;

    /// Bytes backing every view in these tests: a 1-byte value, a 9-byte value and a
    /// 31-byte value laid out back to back.
    const DATA: &[u8] = b"0123456789long string to test string view";

    /// Builds a buffer holding `DATA` as its only block, plus three views into it.
    fn three_view_buffer() -> ViewBuffer {
        let mut buffer = ViewBuffer::default();
        let block_id = buffer.append_block(Buffer::from(DATA));

        let views = [
            make_view(&DATA[0..1], block_id, 0),
            make_view(&DATA[1..10], block_id, 1),
            make_view(&DATA[10..41], block_id, 10),
        ];
        for view in views {
            // SAFETY: each view was produced by `make_view` from a slice of `DATA`, using
            // the id of the block that holds `DATA` and the slice's starting offset.
            unsafe { buffer.append_raw_view_unchecked(view) };
        }
        buffer
    }

    /// Downcasts `array` to a string view array, panicking if it is a different type.
    fn as_string_view(array: &ArrayRef) -> &arrow::array::StringViewArray {
        array
            .as_any()
            .downcast_ref::<arrow::array::StringViewArray>()
            .unwrap()
    }

    #[test]
    fn test_view_buffer_empty() {
        let array = ViewBuffer::default().into_array(None, &ArrowType::Utf8View);
        assert_eq!(as_string_view(&array).len(), 0);
    }

    #[test]
    fn test_view_buffer_append_view() {
        let array = three_view_buffer().into_array(None, &ArrowType::Utf8View);

        let actual: Vec<_> = as_string_view(&array).iter().collect();
        let expected = vec![
            Some("0"),
            Some("123456789"),
            Some("long string to test string view"),
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_view_buffer_pad_null() {
        let mut buffer = three_view_buffer();

        let valid = [true, false, false, true, false, false, true];
        let valid_mask = Buffer::from_iter(valid.iter().copied());

        buffer.pad_nulls(1, 2, valid.len() - 1, valid_mask.as_slice());

        let array = buffer.into_array(Some(valid_mask), &ArrowType::Utf8View);

        let actual: Vec<_> = as_string_view(&array).iter().collect();
        let expected = vec![
            Some("0"),
            None,
            None,
            Some("123456789"),
            None,
            None,
            Some("long string to test string view"),
        ];
        assert_eq!(actual, expected);
    }
}