parquet/util/
utf8.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`check_valid_utf8`] validation function
19use crate::errors::{ParquetError, Result};
20
21/// Check that `val` is a valid UTF-8 sequence.
22///
23/// If the `simdutf8` feature is enabled, this function will use
24/// SIMD-accelerated validation from the [`simdutf8`] crate. Otherwise, it will use
25/// [`std::str::from_utf8`].
26///
27/// # Errors
28///
29/// Returns `Err::General` with a message compatible with [`std::str::from_utf8`] on failure.
30///
31/// # Example
32/// ```
33/// use parquet::utf8::check_valid_utf8;
34/// assert!(check_valid_utf8(b"hello").is_ok());
35/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98\x8E").is_ok());
36/// // invalid UTF-8
37/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98").is_err());
38/// ```
39///
40/// [`simdutf8`]: https://crates.io/crates/simdutf8
41#[inline(always)]
42pub fn check_valid_utf8(val: &[u8]) -> Result<()> {
43    #[cfg(feature = "simdutf8")]
44    match simdutf8::basic::from_utf8(val) {
45        Ok(_) => Ok(()),
46        Err(_) => {
47            // Use simdutf8::compat to return details about the decoding error
48            let e = simdutf8::compat::from_utf8(val).unwrap_err();
49            Err(general_err!("encountered non UTF-8 data: {}", e))
50        }
51    }
52    #[cfg(not(feature = "simdutf8"))]
53    match std::str::from_utf8(val) {
54        Ok(_) => Ok(()),
55        Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)),
56    }
57}