Skip to main content

parquet_variant/
utils.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17use std::{array::TryFromSliceError, ops::Range, str};
18
19use crate::VariantPathElement;
20use arrow_schema::ArrowError;
21
22use std::cmp::Ordering;
23use std::fmt::Debug;
24use std::slice::SliceIndex;
25
26/// Helper for reporting integer overflow errors in a consistent way.
27pub(crate) fn overflow_error(msg: &str) -> ArrowError {
28    ArrowError::InvalidArgumentError(format!("Integer overflow computing {msg}"))
29}
30
31#[inline]
32pub(crate) fn slice_from_slice<I: SliceIndex<[u8]> + Clone + Debug>(
33    bytes: &[u8],
34    index: I,
35) -> Result<&I::Output, ArrowError> {
36    bytes.get(index.clone()).ok_or_else(|| {
37        ArrowError::InvalidArgumentError(format!(
38            "Tried to extract byte(s) {index:?} from {}-byte buffer",
39            bytes.len(),
40        ))
41    })
42}
43
44/// Helper to safely slice bytes with offset calculations.
45///
46/// Equivalent to `slice_from_slice(bytes, (base_offset + range.start)..(base_offset + range.end))`
47/// but using checked addition to prevent integer overflow panics on 32-bit systems.
48#[inline]
49pub(crate) fn slice_from_slice_at_offset(
50    bytes: &[u8],
51    base_offset: usize,
52    range: Range<usize>,
53) -> Result<&[u8], ArrowError> {
54    let start_byte = base_offset
55        .checked_add(range.start)
56        .ok_or_else(|| overflow_error("slice start"))?;
57    let end_byte = base_offset
58        .checked_add(range.end)
59        .ok_or_else(|| overflow_error("slice end"))?;
60    slice_from_slice(bytes, start_byte..end_byte)
61}
62
63pub(crate) fn array_from_slice<const N: usize>(
64    bytes: &[u8],
65    offset: usize,
66) -> Result<[u8; N], ArrowError> {
67    slice_from_slice_at_offset(bytes, offset, 0..N)?
68        .try_into()
69        .map_err(|e: TryFromSliceError| ArrowError::InvalidArgumentError(e.to_string()))
70}
71
72pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<u8, ArrowError> {
73    slice
74        .first()
75        .copied()
76        .ok_or_else(|| ArrowError::InvalidArgumentError("Received empty bytes".to_string()))
77}
78
79/// Helper to get a &str from a slice at the given offset and range, or an error if it contains invalid UTF-8 data.
80#[inline]
81pub(crate) fn string_from_slice(
82    slice: &[u8],
83    offset: usize,
84    range: Range<usize>,
85) -> Result<&str, ArrowError> {
86    let offset_buffer = slice_from_slice_at_offset(slice, offset, range)?;
87
88    //Use simdutf8 by default
89    #[cfg(feature = "simdutf8")]
90    {
91        simdutf8::basic::from_utf8(offset_buffer).map_err(|_| {
92            // Use simdutf8::compat to return details about the decoding error
93            let e = simdutf8::compat::from_utf8(offset_buffer).unwrap_err();
94            ArrowError::InvalidArgumentError(format!("encountered non UTF-8 data: {e}"))
95        })
96    }
97
98    //Use std::str if simdutf8 is not enabled
99    #[cfg(not(feature = "simdutf8"))]
100    str::from_utf8(offset_buffer)
101        .map_err(|_| ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()))
102}
103
/// Performs a binary search over a range of indices using a fallible comparison function; a
/// failed comparison immediately terminates the search.
///
/// This is similar to the standard library's `binary_search_by`, but generalized to ranges
/// instead of slices.
///
/// # Arguments
/// * `range` - The half-open range of indices to search. An empty range yields
///   `Some(Err(range.start))` without calling `cmp`.
/// * `cmp` - Called with a candidate index; reports how the element at that index orders
///   relative to the search target (`Less`: the element sorts before the target, so the search
///   continues to the right; `Greater`: it sorts after, so the search continues to the left),
///   or `None` if the comparison could not be performed. As with `binary_search_by`, the
///   results must be consistent with a sorted sequence for the outcome to be meaningful. The
///   function may carry mutable state (`FnMut`).
///
/// # Returns
/// * `Some(Ok(index))` - Element found at the given index
/// * `Some(Err(index))` - Element not found, but would be inserted at the given index
/// * `None` - `cmp` returned `None`
pub(crate) fn try_binary_search_range_by<F>(
    range: Range<usize>,
    mut cmp: F,
) -> Option<Result<usize, usize>>
where
    F: FnMut(usize) -> Option<Ordering>,
{
    let Range { mut start, mut end } = range;
    while start < end {
        // Written as `start + half` rather than `(start + end) / 2` so it cannot overflow.
        let mid = start + (end - start) / 2;
        match cmp(mid)? {
            Ordering::Equal => return Some(Ok(mid)),
            Ordering::Greater => end = mid,
            Ordering::Less => start = mid + 1,
        }
    }

    // Nothing matched: `start` is now the position where the target would be inserted.
    Some(Err(start))
}
139
/// Checks that `T` occupies exactly `expected` bytes; intended for types that should only grow
/// if absolutely necessary.
///
/// Does nothing when the size matches. On a mismatch it indexes an empty array with the actual
/// size, which panics out of bounds (a plain `const fn` cannot format a message, so the
/// offending size is surfaced through the index instead).
#[allow(unused)]
pub(crate) const fn expect_size_of<T>(expected: usize) {
    let actual = std::mem::size_of::<T>();
    if actual == expected {
        return;
    }
    // Deliberately out of bounds: the array has no elements.
    let _ = [""; 0][actual];
}
148
/// Returns `true` if the magnitude of `n` can be represented in `N` bits, i.e. `|n| < 2^N`.
///
/// The sign is ignored: `n` and `-n` always give the same answer. For `N >= 64` every `i64`
/// fits, so the result is always `true`.
pub(crate) fn fits_precision<const N: u32>(n: impl Into<i64>) -> bool {
    // A magnitude fits in N bits exactly when at least `64 - N` of its leading bits are zero.
    // `saturating_sub` keeps the threshold at zero for N > 64, where a plain subtraction would
    // overflow (panic in debug builds, wrong answer in release builds).
    let required_leading_zeros = i64::BITS.saturating_sub(N);
    n.into().unsigned_abs().leading_zeros() >= required_leading_zeros
}
152
153/// Parse a path string into a vector of [`VariantPathElement`].
154///
155/// # Syntax
156/// - `.field` or `field` - access object field (do not support special char)
157/// - `[index]` - access array element by index
158/// - `[field]` - access object field (support special char with escape `\`)
159///
160/// # Escape Rules
161/// Inside brackets `[...]`:
162/// - `\\` -> literal `\`
163/// - `\]` -> literal `]`
164/// - Any other `\x` -> literal `x`
165///
166/// Outside brackets, no escaping is supported.
167///
168/// # Examples
169/// - `""` -> empty path
170/// - `"foo"` -> single field `foo`
171/// - `"foo.bar"` -> nested fields `foo`, `bar`
172/// - `"[1]"` -> array index 1
173/// - `"['1']"` or `"["1"]"`-> field `1`
174/// - `"foo[1].bar"` -> field `foo`, index 1, field `bar`
175/// - `"['a.b']"` -> field `a.b` (dot is literal inside bracket)
176/// - `"['a\]b']"` -> field `a]b` (escaped `]`
177/// - etc.
178///
179/// # Errors
180/// - Leading `.` (e.g., `".foo"`)
181/// - Trailing `.` (e.g., `"foo."`)
182/// - Unclosed '[' (e.g., `"foo[1"`)
183/// - Unexpected ']' (e.g., `"foo]"`)
184/// - Trailing '`' inside bracket (treated as unclosed bracket)
185#[inline]
186pub(crate) fn parse_path(s: &str) -> Result<Vec<VariantPathElement<'_>>, ArrowError> {
187    let scan_field = |start: usize| {
188        s[start..]
189            .find(['.', '[', ']'])
190            .map_or_else(|| s.len(), |p| start + p)
191    };
192
193    let bytes = s.as_bytes();
194    if let Some(b'.') = bytes.first() {
195        return Err(ArrowError::ParseError("Unexpected leading '.'".into()));
196    }
197
198    let mut elements = Vec::new();
199    let mut i = 0;
200
201    while i < bytes.len() {
202        let (elem, end) = match bytes[i] {
203            b'.' => {
204                i += 1; // skip the dot; a field must follow
205                let end = scan_field(i);
206                if end == i {
207                    return Err(ArrowError::ParseError(match bytes.get(i) {
208                        None => "Unexpected trailing '.'".into(),
209                        Some(&c) => format!("Unexpected '{}' at byte {i}", c as char),
210                    }));
211                }
212                (VariantPathElement::field(&s[i..end]), end)
213            }
214            b'[' => {
215                let (element, end) = parse_in_bracket(s, i)?;
216                (element, end)
217            }
218            b']' => {
219                return Err(ArrowError::ParseError(format!(
220                    "Unexpected ']' at byte {i}"
221                )));
222            }
223            _ => {
224                let end = scan_field(i);
225                (VariantPathElement::field(&s[i..end]), end)
226            }
227        };
228        elements.push(elem);
229        i = end;
230    }
231
232    Ok(elements)
233}
234
235/// Parse `[digits | field]` starting at `i` (which points to `[`).
236/// Returns (VariantPathElement, position after `]`).
237fn parse_in_bracket(s: &str, i: usize) -> Result<(VariantPathElement<'_>, usize), ArrowError> {
238    let start = i + 1; // skip '['
239
240    let mut unescaped = String::new();
241    let mut chars = s[start..].char_indices().peekable();
242    let mut end = None;
243
244    while let Some((offset, c)) = chars.next() {
245        match c {
246            // Escape: take next char literally
247            '\\' => {
248                if let Some((_, next)) = chars.next() {
249                    unescaped.push(next);
250                }
251                // Trailing backslash will be handled as 'unclosed [' below
252            }
253            ']' => {
254                // Unescaped ']' ends the bracket
255                end = Some(start + offset);
256                break;
257            }
258            _ => {
259                unescaped.push(c);
260            }
261        }
262    }
263
264    let end = match end {
265        Some(e) => e,
266        None => {
267            return Err(ArrowError::ParseError(format!("Unclosed '[' at byte {i}")));
268        }
269    };
270
271    let element = if let Some(inner) = unescaped
272        .strip_prefix('\'')
273        .and_then(|s| s.strip_suffix('\''))
274        .or_else(|| {
275            unescaped
276                .strip_prefix('"')
277                .and_then(|s| s.strip_suffix('"'))
278        }) {
279        // Quoted field name, e.g., ['field'] or ['123'] or ["123"]
280        VariantPathElement::field(inner.to_string())
281    } else {
282        let Ok(idx) = unescaped.parse() else {
283            return Err(ArrowError::ParseError(format!(
284                "Invalid token in bracket request: `{unescaped}`. Expected a quoted string or a number(e.g., `['field']` or `[123]`)"
285            )));
286        };
287        VariantPathElement::index(idx)
288    };
289
290    Ok((element, end + 1))
291}
292
#[cfg(test)]
mod test {
    use super::*;

    /// With 10 bits of precision, magnitudes up to 1023 fit and 1024 does not, regardless of
    /// sign.
    #[test]
    fn test_fits_precision() {
        let cases: [(i32, bool); 4] = [
            (1023, true),
            (1024, false),
            (-1023, true),
            (-1024, false),
        ];
        for (value, expected) in cases {
            assert_eq!(fits_precision::<10>(value), expected, "value = {value}");
        }
    }
}