parquet_variant/
utils.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17use std::{array::TryFromSliceError, ops::Range, str};
18
19use arrow_schema::ArrowError;
20
21use std::fmt::Debug;
22use std::slice::SliceIndex;
23
24/// Helper for reporting integer overflow errors in a consistent way.
25pub(crate) fn overflow_error(msg: &str) -> ArrowError {
26    ArrowError::InvalidArgumentError(format!("Integer overflow computing {msg}"))
27}
28
29#[inline]
30pub(crate) fn slice_from_slice<I: SliceIndex<[u8]> + Clone + Debug>(
31    bytes: &[u8],
32    index: I,
33) -> Result<&I::Output, ArrowError> {
34    bytes.get(index.clone()).ok_or_else(|| {
35        ArrowError::InvalidArgumentError(format!(
36            "Tried to extract byte(s) {index:?} from {}-byte buffer",
37            bytes.len(),
38        ))
39    })
40}
41
42/// Helper to safely slice bytes with offset calculations.
43///
44/// Equivalent to `slice_from_slice(bytes, (base_offset + range.start)..(base_offset + range.end))`
45/// but using checked addition to prevent integer overflow panics on 32-bit systems.
46#[inline]
47pub(crate) fn slice_from_slice_at_offset(
48    bytes: &[u8],
49    base_offset: usize,
50    range: Range<usize>,
51) -> Result<&[u8], ArrowError> {
52    let start_byte = base_offset
53        .checked_add(range.start)
54        .ok_or_else(|| overflow_error("slice start"))?;
55    let end_byte = base_offset
56        .checked_add(range.end)
57        .ok_or_else(|| overflow_error("slice end"))?;
58    slice_from_slice(bytes, start_byte..end_byte)
59}
60
61pub(crate) fn array_from_slice<const N: usize>(
62    bytes: &[u8],
63    offset: usize,
64) -> Result<[u8; N], ArrowError> {
65    slice_from_slice_at_offset(bytes, offset, 0..N)?
66        .try_into()
67        .map_err(|e: TryFromSliceError| ArrowError::InvalidArgumentError(e.to_string()))
68}
69
70pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<u8, ArrowError> {
71    slice
72        .first()
73        .copied()
74        .ok_or_else(|| ArrowError::InvalidArgumentError("Received empty bytes".to_string()))
75}
76
77/// Helper to get a &str from a slice at the given offset and range, or an error if it contains invalid UTF-8 data.
78#[inline]
79pub(crate) fn string_from_slice(
80    slice: &[u8],
81    offset: usize,
82    range: Range<usize>,
83) -> Result<&str, ArrowError> {
84    let offset_buffer = slice_from_slice_at_offset(slice, offset, range)?;
85
86    //Use simdutf8 by default
87    #[cfg(feature = "simdutf8")]
88    {
89        simdutf8::basic::from_utf8(offset_buffer).map_err(|_| {
90            // Use simdutf8::compat to return details about the decoding error
91            let e = simdutf8::compat::from_utf8(offset_buffer).unwrap_err();
92            ArrowError::InvalidArgumentError(format!("encountered non UTF-8 data: {e}"))
93        })
94    }
95
96    //Use std::str if simdutf8 is not enabled
97    #[cfg(not(feature = "simdutf8"))]
98    str::from_utf8(offset_buffer)
99        .map_err(|_| ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()))
100}
101
/// Performs a binary search over a range of indices using a fallible key extraction
/// function; a failed key extraction immediately terminates the search.
///
/// This is similar to the standard library's `binary_search_by`, but generalized to ranges
/// instead of slices: the caller supplies the keys on demand, by index. The search narrows
/// the range by comparing extracted keys against `target`, so the keys must be in ascending
/// order across `range` for the result to be meaningful.
///
/// # Arguments
/// * `range` - The range of indices to search in
/// * `target` - The target value to search for
/// * `key_extractor` - A function that produces the comparable key for a given index.
///   This function can fail and return `None`.
///
/// # Returns
/// * `Some(Ok(index))` - A key equal to `target` was found at the given index
/// * `Some(Err(index))` - No equal key was found; `target` would be inserted at the given index
/// * `None` - Key extraction failed for an index that the search probed
pub(crate) fn try_binary_search_range_by<K, F>(
    range: Range<usize>,
    target: &K,
    key_extractor: F,
) -> Option<Result<usize, usize>>
where
    K: Ord,
    F: Fn(usize) -> Option<K>,
{
    use std::cmp::Ordering;

    let (mut lo, mut hi) = (range.start, range.end);
    loop {
        // An empty (or inverted) window means the target is absent; `lo` is where it belongs.
        if lo >= hi {
            return Some(Err(lo));
        }

        // Midpoint computed as an offset from `lo` so the sum cannot overflow.
        let probe = lo + (hi - lo) / 2;
        match key_extractor(probe)?.cmp(target) {
            Ordering::Less => lo = probe + 1,
            Ordering::Greater => hi = probe,
            Ordering::Equal => return Some(Ok(probe)),
        }
    }
}
140
/// Verifies the expected size of type `T`, for a type that should only grow if absolutely
/// necessary.
///
/// Does nothing when `size_of::<T>()` equals `expected`. On a mismatch it indexes an empty
/// array with the actual size, which always fails with an out-of-bounds error that mentions
/// that size.
#[allow(unused)]
pub(crate) const fn expect_size_of<T>(expected: usize) {
    let actual = std::mem::size_of::<T>();
    if actual == expected {
        return;
    }
    // Deliberately out of bounds: the resulting error message reports `actual` as the index.
    let _ = [""; 0][actual];
}
148}