arrow/util/
test_util.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Utils to make testing easier
19
20use rand::{rngs::StdRng, Rng, SeedableRng};
21use std::{env, error::Error, fs, io::Write, path::PathBuf};
22
23/// Returns a vector of size `n`, filled with randomly generated bytes.
24pub fn random_bytes(n: usize) -> Vec<u8> {
25    let mut result = vec![];
26    let mut rng = seedable_rng();
27    for _ in 0..n {
28        result.push(rng.gen_range(0..255));
29    }
30    result
31}
32
33/// Returns fixed seedable RNG
34pub fn seedable_rng() -> StdRng {
35    StdRng::seed_from_u64(42)
36}
37
/// Creates (or truncates) `file_name` inside `<current dir>/target/debug/testdata`,
/// fills it with `content`, and returns a handle opened for both reading and
/// writing.
///
/// The returned handle is opened independently of the one used to write
/// `content`, so its cursor starts at the beginning of the file.
///
/// TODO: Originates from `parquet` utils, can be merged in [ARROW-4064]
///
/// # Panics
///
/// Panics if the current directory cannot be determined, or if creating the
/// directory, or creating, writing, syncing or re-opening the file fails. The
/// panic message includes the offending path and the underlying I/O error.
pub fn get_temp_file(file_name: &str, content: &[u8]) -> fs::File {
    // build tmp path to a file in "target/debug/testdata"
    let mut path_buf = env::current_dir()
        .unwrap_or_else(|e| panic!("failed to determine current directory: {e}"));
    path_buf.extend(["target", "debug", "testdata"]);
    fs::create_dir_all(&path_buf)
        .unwrap_or_else(|e| panic!("failed to create dir `{}`: {e}", path_buf.display()));
    path_buf.push(file_name);

    // write file content; the writer is dropped (closed) at the end of this scope
    {
        let mut tmp_file = fs::File::create(&path_buf)
            .unwrap_or_else(|e| panic!("failed to create `{}`: {e}", path_buf.display()));
        tmp_file
            .write_all(content)
            .unwrap_or_else(|e| panic!("failed to write `{}`: {e}", path_buf.display()));
        tmp_file
            .sync_all()
            .unwrap_or_else(|e| panic!("failed to sync `{}`: {e}", path_buf.display()));
    }

    // return file handle for both read and write
    fs::OpenOptions::new()
        .read(true)
        .write(true)
        .open(&path_buf)
        .unwrap_or_else(|e| panic!("failed to re-open `{}`: {e}", path_buf.display()))
}
63
64/// Returns the arrow test data directory, which is by default stored
65/// in a git submodule rooted at `arrow/testing/data`.
66///
67/// The default can be overridden by the optional environment
68/// variable `ARROW_TEST_DATA`
69///
70/// panics when the directory can not be found.
71///
72/// Example:
73/// ```
74/// let testdata = arrow::util::test_util::arrow_test_data();
75/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata);
76/// assert!(std::path::PathBuf::from(csvdata).exists());
77/// ```
78pub fn arrow_test_data() -> String {
79    match get_data_dir("ARROW_TEST_DATA", "../testing/data") {
80        Ok(pb) => pb.display().to_string(),
81        Err(err) => panic!("failed to get arrow data dir: {err}"),
82    }
83}
84
85/// Returns the parquest test data directory, which is by default
86/// stored in a git submodule rooted at
87/// `arrow/parquest-testing/data`.
88///
89/// The default can be overridden by the optional environment variable
90/// `PARQUET_TEST_DATA`
91///
92/// panics when the directory can not be found.
93///
94/// Example:
95/// ```
96/// let testdata = arrow::util::test_util::parquet_test_data();
97/// let filename = format!("{}/binary.parquet", testdata);
98/// assert!(std::path::PathBuf::from(filename).exists());
99/// ```
100pub fn parquet_test_data() -> String {
101    match get_data_dir("PARQUET_TEST_DATA", "../parquet-testing/data") {
102        Ok(pb) => pb.display().to_string(),
103        Err(err) => panic!("failed to get parquet data dir: {err}"),
104    }
105}
106
107/// Returns a directory path for finding test data.
108///
109/// udf_env: name of an environment variable
110///
111/// submodule_dir: fallback path (relative to CARGO_MANIFEST_DIR)
112///
113///  Returns either:
114/// The path referred to in `udf_env` if that variable is set and refers to a directory
115/// The submodule_data directory relative to CARGO_MANIFEST_PATH
116fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result<PathBuf, Box<dyn Error>> {
117    // Try user defined env.
118    if let Ok(dir) = env::var(udf_env) {
119        let trimmed = dir.trim().to_string();
120        if !trimmed.is_empty() {
121            let pb = PathBuf::from(trimmed);
122            if pb.is_dir() {
123                return Ok(pb);
124            } else {
125                return Err(format!(
126                    "the data dir `{}` defined by env {} not found",
127                    pb.display(),
128                    udf_env
129                )
130                .into());
131            }
132        }
133    }
134
135    // The env is undefined or its value is trimmed to empty, let's try default dir.
136
137    // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package",
138    // set by `cargo run` or `cargo test`, see:
139    // https://doc.rust-lang.org/cargo/reference/environment-variables.html
140    let dir = env!("CARGO_MANIFEST_DIR");
141
142    let pb = PathBuf::from(dir).join(submodule_data);
143    if pb.is_dir() {
144        Ok(pb)
145    } else {
146        Err(format!(
147            "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\
148             HINT: try running `git submodule update --init`",
149            udf_env,
150            pb.display(),
151        ).into())
152    }
153}
154
/// An iterator that deliberately misreports, through `size_hint`, how many
/// items it is going to yield
#[derive(Debug, Clone)]
pub struct BadIterator<T> {
    /// Number of items yielded so far
    cur: usize,
    /// Number of items this iterator will *actually* yield
    limit: usize,
    /// Number of items this iterator claims it will yield
    claimed: usize,
    /// The items to yield. When there are fewer of them than `limit`,
    /// they are cycled through repeatedly
    pub items: Vec<T>,
}

impl<T> BadIterator<T> {
    /// Builds an iterator that yields `limit` items while reporting `claimed`
    /// as the upper bound of its `size_hint`.
    ///
    /// # Panics
    ///
    /// Panics if `items` is empty.
    pub fn new(limit: usize, claimed: usize, items: Vec<T>) -> Self {
        assert!(!items.is_empty());
        BadIterator {
            items,
            claimed,
            limit,
            cur: 0,
        }
    }
}

impl<T: Clone> Iterator for BadIterator<T> {
    type Item = T;

    /// Yields a clone of the next item, wrapping around `items`, until
    /// `limit` items have been produced.
    fn next(&mut self) -> Option<Self::Item> {
        if self.cur >= self.limit {
            return None;
        }
        // Wrap around so a short `items` vector can still serve `limit` items.
        let slot = self.cur % self.items.len();
        let produced = self.items[slot].clone();
        self.cur += 1;
        Some(produced)
    }

    /// Reports the claimed length as the upper bound, regardless of how many
    /// items are really left.
    fn size_hint(&self) -> (usize, Option<usize>) {
        (0, Some(self.claimed))
    }
}
202
#[cfg(test)]
mod tests {
    use super::*;

    // Walks `get_data_dir` through each combination of "env var state" and
    // "fallback path state". The steps mutate one process environment variable
    // and rely on running in exactly this order.
    #[test]
    fn test_data_dir() {
        // Name of the scratch environment variable used only by this test.
        let udf_env = "get_data_dir";
        let cwd = env::current_dir().unwrap();

        // A path that is expected to be an existing directory (parent of the cwd).
        let existing_pb = cwd.join("..");
        let existing = existing_pb.display().to_string();
        let existing_str = existing.as_str();

        // A path that is expected not to exist.
        let non_existing = cwd.join("non-existing-dir").display().to_string();
        let non_existing_str = non_existing.as_str();

        // Env points at a missing dir: error, even though the fallback exists.
        env::set_var(udf_env, non_existing_str);
        let res = get_data_dir(udf_env, existing_str);
        assert!(res.is_err());

        // Env is set but empty: the fallback is used.
        env::set_var(udf_env, "");
        let res = get_data_dir(udf_env, existing_str);
        assert!(res.is_ok());
        assert_eq!(res.unwrap(), existing_pb);

        // Env is only whitespace: treated like empty, the fallback is used.
        env::set_var(udf_env, " ");
        let res = get_data_dir(udf_env, existing_str);
        assert!(res.is_ok());
        assert_eq!(res.unwrap(), existing_pb);

        // Env points at an existing dir: that dir is returned.
        env::set_var(udf_env, existing_str);
        let res = get_data_dir(udf_env, existing_str);
        assert!(res.is_ok());
        assert_eq!(res.unwrap(), existing_pb);

        // Env is unset and the fallback is missing: error.
        env::remove_var(udf_env);
        let res = get_data_dir(udf_env, non_existing_str);
        assert!(res.is_err());

        // Env is unset and the fallback exists: the fallback is returned.
        let res = get_data_dir(udf_env, existing_str);
        assert!(res.is_ok());
        assert_eq!(res.unwrap(), existing_pb);
    }

    // Both public helpers must resolve to a real directory; each of them
    // panics instead of returning when its data directory cannot be found.
    #[test]
    fn test_happy() {
        let res = arrow_test_data();
        assert!(PathBuf::from(res).is_dir());

        let res = parquet_test_data();
        assert!(PathBuf::from(res).is_dir());
    }
}