parquet/util/utf8.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`check_valid_utf8`] validation function
19use crate::errors::{ParquetError, Result};
20
21/// Check that `val` is a valid UTF-8 sequence.
22///
23/// If the `simdutf8` feature is enabled, this function will use
24/// SIMD-accelerated validation from the [`simdutf8`] crate. Otherwise, it will use
25/// [`std::str::from_utf8`].
26///
27/// # Errors
28///
29/// Returns `Err::General` with a message compatible with [`std::str::from_utf8`] on failure.
30///
31/// # Example
32/// ```
33/// use parquet::utf8::check_valid_utf8;
34/// assert!(check_valid_utf8(b"hello").is_ok());
35/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98\x8E").is_ok());
36/// // invalid UTF-8
37/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98").is_err());
38/// ```
39///
40/// [`simdutf8`]: https://crates.io/crates/simdutf8
41#[inline(always)]
42pub fn check_valid_utf8(val: &[u8]) -> Result<()> {
43 #[cfg(feature = "simdutf8")]
44 match simdutf8::basic::from_utf8(val) {
45 Ok(_) => Ok(()),
46 Err(_) => {
47 // Use simdutf8::compat to return details about the decoding error
48 let e = simdutf8::compat::from_utf8(val).unwrap_err();
49 Err(general_err!("encountered non UTF-8 data: {}", e))
50 }
51 }
52 #[cfg(not(feature = "simdutf8"))]
53 match std::str::from_utf8(val) {
54 Ok(_) => Ok(()),
55 Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)),
56 }
57}