Skip to main content

parquet/arrow/buffer/
view_buffer.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
18use crate::arrow::record_reader::buffer::ValuesBuffer;
19use arrow_array::{ArrayRef, BinaryViewArray, StringViewArray};
20use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, ScalarBuffer};
21use arrow_schema::DataType as ArrowType;
22use std::sync::Arc;
23
24/// A buffer of view type byte arrays that can be converted into
25/// `GenericByteViewArray`
26///
27/// Note this does not reuse `GenericByteViewBuilder` due to the need to call `pad_nulls`
28/// and reuse the existing logic for Vec in the parquet crate
29#[derive(Debug, Default)]
30pub struct ViewBuffer {
31    pub views: Vec<u128>,
32    pub buffers: Vec<Buffer>,
33}
34
35impl ViewBuffer {
36    pub fn is_empty(&self) -> bool {
37        self.views.is_empty()
38    }
39
40    pub fn append_block(&mut self, block: Buffer) -> u32 {
41        let block_id = self.buffers.len() as u32;
42        self.buffers.push(block);
43        block_id
44    }
45
46    /// Directly append a view to the view array.
47    /// This is used when we create a StringViewArray from a dictionary whose values are StringViewArray.
48    ///
49    /// # Safety
50    /// The `view` must be a valid view as per the ByteView spec.
51    pub unsafe fn append_raw_view_unchecked(&mut self, view: u128) {
52        self.views.push(view);
53    }
54
55    /// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer`
56    pub fn into_array(self, null_buffer: Option<Buffer>, data_type: &ArrowType) -> ArrayRef {
57        let len = self.views.len();
58        let views = ScalarBuffer::from(self.views);
59        let nulls = null_buffer
60            .map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, len)))
61            .filter(|n| n.null_count() != 0);
62        match data_type {
63            ArrowType::Utf8View => {
64                // Safety: views were created correctly, and checked that the data is utf8 when building the buffer
65                unsafe { Arc::new(StringViewArray::new_unchecked(views, self.buffers, nulls)) }
66            }
67            ArrowType::BinaryView => {
68                // Safety: views were created correctly
69                unsafe { Arc::new(BinaryViewArray::new_unchecked(views, self.buffers, nulls)) }
70            }
71            _ => panic!("Unsupported data type: {data_type}"),
72        }
73    }
74}
75
76impl ValuesBuffer for ViewBuffer {
77    fn pad_nulls(
78        &mut self,
79        read_offset: usize,
80        values_read: usize,
81        levels_read: usize,
82        valid_mask: &[u8],
83    ) {
84        self.views
85            .pad_nulls(read_offset, values_read, levels_read, valid_mask);
86    }
87}
88
#[cfg(test)]
mod tests {
    use super::*;

    use arrow::array::make_view;
    use arrow_array::Array;

    /// Builds a buffer holding a single data block and three views into it:
    /// `"0"`, `"123456789"` and `"long string to test string view"`.
    fn three_value_buffer() -> ViewBuffer {
        let data = b"0123456789long string to test string view";
        let mut buffer = ViewBuffer::default();
        let block_id = buffer.append_block(Buffer::from(data));

        // Each (start, end) byte range becomes one view; the view's offset
        // into the block is the start of the range.
        for (start, end) in [(0usize, 1usize), (1, 10), (10, 41)] {
            let view = make_view(&data[start..end], block_id, start as u32);
            // SAFETY: the view was produced by `make_view` for a range of the
            // block registered above.
            unsafe { buffer.append_raw_view_unchecked(view) };
        }
        buffer
    }

    /// Downcasts `array` to a string view array, panicking on any other type.
    fn as_string_view(array: &ArrayRef) -> &arrow::array::StringViewArray {
        array
            .as_any()
            .downcast_ref::<arrow::array::StringViewArray>()
            .unwrap()
    }

    /// An empty buffer converts into an empty array.
    #[test]
    fn test_view_buffer_empty() {
        let array = ViewBuffer::default().into_array(None, &ArrowType::Utf8View);
        let strings = as_string_view(&array);
        assert_eq!(strings.len(), 0);
    }

    /// Appended raw views come back out as the strings they point at.
    #[test]
    fn test_view_buffer_append_view() {
        let array = three_value_buffer().into_array(None, &ArrowType::Utf8View);
        let string_array = as_string_view(&array);

        let actual: Vec<_> = string_array.iter().collect();
        let expected = vec![
            Some("0"),
            Some("123456789"),
            Some("long string to test string view"),
        ];
        assert_eq!(actual, expected);
    }

    /// After `pad_nulls`, the values sit in the slots the mask marks valid.
    #[test]
    fn test_view_buffer_pad_null() {
        let mut buffer = three_value_buffer();

        // Slots 0, 3 and 6 are valid; all others are null.
        let valid = [true, false, false, true, false, false, true];
        let valid_mask = Buffer::from_iter(valid.iter().copied());

        buffer.pad_nulls(1, 2, valid.len() - 1, valid_mask.as_slice());

        let array = buffer.into_array(Some(valid_mask), &ArrowType::Utf8View);
        let strings = as_string_view(&array);

        let actual: Vec<_> = strings.iter().collect();
        let expected = vec![
            Some("0"),
            None,
            None,
            Some("123456789"),
            None,
            None,
            Some("long string to test string view"),
        ];
        assert_eq!(actual, expected);
    }
}