arrow/lib.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! A complete, safe, native Rust implementation of [Apache Arrow](https://arrow.apache.org), a cross-language
//! development platform for in-memory data.
//!
//! Please see the [arrow crates.io](https://crates.io/crates/arrow)
//! page for feature flags and tips to improve performance.
//!
//! # Columnar Format
//!
//! The [`array`] module provides statically typed implementations of all the array types as defined
//! by the [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html)
//!
//! For example, an [`Int32Array`](array::Int32Array) represents a nullable array of `i32`
//!
//! ```rust
//! # use arrow::array::{Array, Int32Array};
//! let array = Int32Array::from(vec![Some(1), None, Some(3)]);
//! assert_eq!(array.len(), 3);
//! assert_eq!(array.value(0), 1);
//! assert_eq!(array.is_null(1), true);
//!
//! let collected: Vec<_> = array.iter().collect();
//! assert_eq!(collected, vec![Some(1), None, Some(3)]);
//! assert_eq!(array.values(), &[1, 0, 3])
//! ```
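//!
//! Arrays can also be built incrementally. As a brief sketch, the corresponding
//! builder type ([`Int32Builder`](array::Int32Builder) here) appends values and
//! nulls one at a time:
//!
//! ```rust
//! # use arrow::array::{Array, Int32Builder};
//! let mut builder = Int32Builder::new();
//! builder.append_value(1);
//! builder.append_null();
//! builder.append_value(3);
//! let array = builder.finish();
//! assert_eq!(array.len(), 3);
//! assert!(array.is_null(1));
//! ```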
//!
//! It is also possible to write code that is generic over the concrete array type.
//! For example, the following function is generic over all primitive array types,
//! so the Rust compiler generates a specialized, optimized implementation for each
//! concrete type it is invoked with.
//!
//! ```rust
//! # use std::iter::Sum;
//! # use arrow::array::{Float32Array, PrimitiveArray, TimestampNanosecondArray};
//! # use arrow::datatypes::ArrowPrimitiveType;
//! #
//! fn sum<T>(array: &PrimitiveArray<T>) -> T::Native
//! where
//!     T: ArrowPrimitiveType,
//!     T::Native: Sum
//! {
//!     array.iter().map(|v| v.unwrap_or_default()).sum()
//! }
//!
//! assert_eq!(sum(&Float32Array::from(vec![1.1, 2.9, 3.])), 7.);
//! assert_eq!(sum(&TimestampNanosecondArray::from(vec![1, 2, 3])), 6);
//! ```
//!
//! And the following uses [`ArrayAccessor`] to implement a generic function
//! over all arrays with comparable values.
//!
//! [`ArrayAccessor`]: array::ArrayAccessor
//!
//! ```rust
//! # use arrow::array::{ArrayAccessor, ArrayIter, Int32Array, StringArray};
//! # use arrow::datatypes::ArrowPrimitiveType;
//! #
//! fn min<T: ArrayAccessor>(array: T) -> Option<T::Item>
//! where
//!     T::Item: Ord
//! {
//!     ArrayIter::new(array).filter_map(|v| v).min()
//! }
//!
//! assert_eq!(min(&Int32Array::from(vec![4, 2, 1, 6])), Some(1));
//! assert_eq!(min(&StringArray::from(vec!["b", "a", "c"])), Some("a"));
//! ```
//!
//! **For more examples and details, consult the [arrow_array] docs.**
//!
//! # Type Erasure / Trait Objects
//!
//! It is common to write code that handles any type of array, without necessarily
//! knowing its concrete type. This is done using the [`Array`] trait, with
//! [`DataType`] determining the appropriate `downcast_ref`.
//!
//! [`DataType`]: datatypes::DataType
//!
//! ```rust
//! # use arrow::array::{Array, Float32Array};
//! # use arrow::array::StringArray;
//! # use arrow::datatypes::DataType;
//! #
//! fn impl_string(array: &StringArray) {}
//! fn impl_f32(array: &Float32Array) {}
//!
//! fn impl_dyn(array: &dyn Array) {
//!     match array.data_type() {
//!         // downcast `dyn Array` to concrete `StringArray`
//!         DataType::Utf8 => impl_string(array.as_any().downcast_ref().unwrap()),
//!         // downcast `dyn Array` to concrete `Float32Array`
//!         DataType::Float32 => impl_f32(array.as_any().downcast_ref().unwrap()),
//!         _ => unimplemented!()
//!     }
//! }
//! ```
//!
//! You can use the [`AsArray`] extension trait to facilitate downcasting:
//!
//! [`AsArray`]: crate::array::AsArray
//!
//! ```rust
//! # use arrow::array::{Array, Float32Array, AsArray};
//! # use arrow::array::StringArray;
//! # use arrow::datatypes::DataType;
//! #
//! fn impl_string(array: &StringArray) {}
//! fn impl_f32(array: &Float32Array) {}
//!
//! fn impl_dyn(array: &dyn Array) {
//!     match array.data_type() {
//!         DataType::Utf8 => impl_string(array.as_string()),
//!         DataType::Float32 => impl_f32(array.as_primitive()),
//!         _ => unimplemented!()
//!     }
//! }
//! ```
//!
//! It is also common to want to write a function that returns one of a number of possible
//! array implementations. [`ArrayRef`] is a type-alias for [`Arc<dyn Array>`](array::Array),
//! which is frequently used for this purpose.
//!
//! ```rust
//! # use std::str::FromStr;
//! # use std::sync::Arc;
//! # use arrow::array::{ArrayRef, Int32Array, PrimitiveArray};
//! # use arrow::datatypes::{ArrowPrimitiveType, DataType, Int32Type, UInt32Type};
//! #
//! fn parse_to_primitive<'a, T, I>(iter: I) -> PrimitiveArray<T>
//! where
//!     T: ArrowPrimitiveType,
//!     T::Native: FromStr,
//!     I: IntoIterator<Item=&'a str>,
//! {
//!     PrimitiveArray::from_iter(iter.into_iter().map(|val| T::Native::from_str(val).ok()))
//! }
//!
//! fn parse_strings<'a, I>(iter: I, to_data_type: DataType) -> ArrayRef
//! where
//!     I: IntoIterator<Item=&'a str>,
//! {
//!     match to_data_type {
//!         DataType::Int32 => Arc::new(parse_to_primitive::<Int32Type, _>(iter)) as _,
//!         DataType::UInt32 => Arc::new(parse_to_primitive::<UInt32Type, _>(iter)) as _,
//!         _ => unimplemented!()
//!     }
//! }
//!
//! let array = parse_strings(["1", "2", "3"], DataType::Int32);
//! let integers = array.as_any().downcast_ref::<Int32Array>().unwrap();
//! assert_eq!(integers.values(), &[1, 2, 3])
//! ```
//!
//! # Compute Kernels
//!
//! The [`compute`] module provides optimized implementations of many common operations,
//! for example the `parse_strings` operation above could also be implemented as follows:
//!
//! ```
//! # use std::sync::Arc;
//! # use arrow::error::Result;
//! # use arrow::array::{ArrayRef, StringArray, UInt32Array};
//! # use arrow::datatypes::DataType;
//! #
//! fn parse_strings<'a, I>(iter: I, to_data_type: &DataType) -> Result<ArrayRef>
//! where
//!     I: IntoIterator<Item=&'a str>,
//! {
//!     let array = StringArray::from_iter(iter.into_iter().map(Some));
//!     arrow::compute::cast(&array, to_data_type)
//! }
//!
//! let array = parse_strings(["1", "2", "3"], &DataType::UInt32).unwrap();
//! let integers = array.as_any().downcast_ref::<UInt32Array>().unwrap();
//! assert_eq!(integers.values(), &[1, 2, 3])
//! ```
//!
//! This module also implements many common vertical operations:
//!
//! * All mathematical binary operators, such as [`sub`](compute::kernels::numeric::sub)
//! * All boolean binary operators, such as [`equality`](compute::kernels::cmp::eq)
//! * [`cast`](compute::kernels::cast::cast)
//! * [`filter`](compute::kernels::filter::filter)
//! * [`take`](compute::kernels::take::take)
//! * [`sort`](compute::kernels::sort::sort)
//! * some string operators such as [`substring`](compute::kernels::substring::substring) and [`length`](compute::kernels::length::length)
//!
//! ```
//! # use arrow::compute::kernels::cmp::gt;
//! # use arrow_array::cast::AsArray;
//! # use arrow_array::Int32Array;
//! # use arrow_array::types::Int32Type;
//! # use arrow_select::filter::filter;
//! let array = Int32Array::from_iter(0..100);
//! // Create a 32-bit integer scalar (single) value:
//! let scalar = Int32Array::new_scalar(60);
//! // find all rows in the array that are greater than 60
//! let predicate = gt(&array, &scalar).unwrap();
//! // copy all matching rows into a new array
//! let filtered = filter(&array, &predicate).unwrap();
//!
//! let expected = Int32Array::from_iter(61..100);
//! assert_eq!(&expected, filtered.as_primitive::<Int32Type>());
//! ```
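//!
//! The selection kernels compose in the same way. As a brief sketch,
//! [`sort_to_indices`](compute::kernels::sort::sort_to_indices) computes a sort
//! permutation from one column, which [`take`](compute::kernels::take::take) can
//! then apply to another, keeping the two columns aligned:
//!
//! ```
//! # use arrow::array::{Int32Array, StringArray};
//! # use arrow::compute::kernels::{sort::sort_to_indices, take::take};
//! # use arrow_array::cast::AsArray;
//! let keys = Int32Array::from(vec![3, 1, 2]);
//! let names = StringArray::from(vec!["c", "a", "b"]);
//! // compute the permutation that would sort `keys` ascending
//! let indices = sort_to_indices(&keys, None, None).unwrap();
//! // apply that permutation to `names`
//! let sorted = take(&names, &indices, None).unwrap();
//! let sorted = sorted.as_string::<i32>();
//! assert_eq!(sorted.value(0), "a");
//! assert_eq!(sorted.value(1), "b");
//! assert_eq!(sorted.value(2), "c");
//! ```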
//!
//! As well as some horizontal operations, such as:
//!
//! * [`min`](compute::kernels::aggregate::min) and [`max`](compute::kernels::aggregate::max)
//! * [`sum`](compute::kernels::aggregate::sum)
//!
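//! For example, a minimal sketch of the aggregate kernels, which reduce an array
//! to a single value and ignore nulls:
//!
//! ```
//! # use arrow::array::Int32Array;
//! # use arrow::compute::kernels::aggregate::{max, min, sum};
//! let array = Int32Array::from(vec![Some(1), None, Some(5), Some(2)]);
//! assert_eq!(min(&array), Some(1));
//! assert_eq!(max(&array), Some(5));
//! assert_eq!(sum(&array), Some(8));
//! ```
//!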
//! # Tabular Representation
//!
//! It is common to want to group one or more columns together into a tabular representation.
//! This is provided by [`RecordBatch`], which combines a [`Schema`](datatypes::Schema)
//! and a corresponding list of [`ArrayRef`].
//!
//! ```
//! # use std::sync::Arc;
//! # use arrow::array::{Float32Array, Int32Array};
//! # use arrow::record_batch::RecordBatch;
//! #
//! let col_1 = Arc::new(Int32Array::from_iter([1, 2, 3])) as _;
//! let col_2 = Arc::new(Float32Array::from_iter([1., 6.3, 4.])) as _;
//!
//! let batch = RecordBatch::try_from_iter([("col_1", col_1), ("col_2", col_2)]).unwrap();
//! ```
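//!
//! The assembled batch exposes its [`Schema`](datatypes::Schema) and columns; a
//! short usage sketch:
//!
//! ```
//! # use std::sync::Arc;
//! # use arrow::array::{Float32Array, Int32Array};
//! # use arrow::record_batch::RecordBatch;
//! # let col_1 = Arc::new(Int32Array::from_iter([1, 2, 3])) as _;
//! # let col_2 = Arc::new(Float32Array::from_iter([1., 6.3, 4.])) as _;
//! # let batch = RecordBatch::try_from_iter([("col_1", col_1), ("col_2", col_2)]).unwrap();
//! assert_eq!(batch.num_columns(), 2);
//! assert_eq!(batch.num_rows(), 3);
//! assert_eq!(batch.schema().field(0).name(), "col_1");
//! ```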
//!
//! # IO
//!
//! This crate provides readers and writers for various formats to/from [`RecordBatch`]
//!
//! * JSON: [`Reader`](json::reader::Reader) and [`Writer`](json::writer::Writer)
//! * CSV: [`Reader`](csv::reader::Reader) and [`Writer`](csv::writer::Writer)
//! * IPC: [`Reader`](ipc::reader::StreamReader) and [`Writer`](ipc::writer::FileWriter)
//!
//! Parquet is published as a [separate crate](https://crates.io/crates/parquet)
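//!
//! For example, a minimal CSV round-trip sketch, assuming the default `csv`
//! feature is enabled:
//!
//! ```
//! # use std::io::Cursor;
//! # use std::sync::Arc;
//! # use arrow::array::{ArrayRef, Int32Array};
//! # use arrow::csv::{reader::ReaderBuilder, writer::Writer};
//! # use arrow::record_batch::RecordBatch;
//! let col = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
//! let batch = RecordBatch::try_from_iter([("c1", col)]).unwrap();
//!
//! // write the batch to an in-memory CSV buffer, including a header row
//! let mut buf = Vec::new();
//! Writer::new(&mut buf).write(&batch).unwrap();
//!
//! // read it back using the batch's own schema
//! let mut reader = ReaderBuilder::new(batch.schema())
//!     .with_header(true)
//!     .build(Cursor::new(buf))
//!     .unwrap();
//! let read = reader.next().unwrap().unwrap();
//! assert_eq!(read.num_rows(), 3);
//! ```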
//!
//! # Serde Compatibility
//!
//! [`arrow_json::reader::Decoder`] provides a mechanism to convert arbitrary, serde-compatible
//! structures into [`RecordBatch`].
//!
//! Whilst likely less performant than implementing a custom builder, as described in
//! [arrow_array::builder], this provides a simple mechanism to get up and running quickly.
//!
//! ```
//! # use std::sync::Arc;
//! # use arrow_json::ReaderBuilder;
//! # use arrow_schema::{DataType, Field, Schema};
//! # use serde::Serialize;
//! # use arrow_array::cast::AsArray;
//! # use arrow_array::types::{Float32Type, Int32Type};
//! #
//! #[derive(Serialize)]
//! struct MyStruct {
//!     int32: i32,
//!     string: String,
//! }
//!
//! let schema = Schema::new(vec![
//!     Field::new("int32", DataType::Int32, false),
//!     Field::new("string", DataType::Utf8, false),
//! ]);
//!
//! let rows = vec![
//!     MyStruct{ int32: 5, string: "bar".to_string() },
//!     MyStruct{ int32: 8, string: "foo".to_string() },
//! ];
//!
//! let mut decoder = ReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap();
//! decoder.serialize(&rows).unwrap();
//!
//! let batch = decoder.flush().unwrap().unwrap();
//!
//! // Expect batch containing two columns
//! let int32 = batch.column(0).as_primitive::<Int32Type>();
//! assert_eq!(int32.values(), &[5, 8]);
//!
//! let string = batch.column(1).as_string::<i32>();
//! assert_eq!(string.value(0), "bar");
//! assert_eq!(string.value(1), "foo");
//! ```
//!
//! # Crate Topology
//!
//! The [`arrow`] project is implemented as multiple sub-crates, which are then re-exported by
//! this top-level crate.
//!
//! Crate authors can choose to depend on this top-level crate, or just
//! the sub-crates they need.
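//!
//! For example, a hypothetical `Cargo.toml` could take either approach (the
//! version requirements below are placeholders, not pinned recommendations):
//!
//! ```toml
//! [dependencies]
//! # either the umbrella crate, which re-exports everything...
//! arrow = "..."
//!
//! # ...or just the sub-crates you need:
//! # arrow-array = "..."
//! # arrow-schema = "..."
//! ```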
//!
//! The current list of sub-crates is:
//!
//! * [`arrow-arith`][arrow_arith] - arithmetic kernels
//! * [`arrow-array`][arrow_array] - type-safe arrow array abstractions
//! * [`arrow-buffer`][arrow_buffer] - buffer abstractions for arrow arrays
//! * [`arrow-cast`][arrow_cast] - cast kernels for arrow arrays
//! * [`arrow-csv`][arrow_csv] - read/write CSV to arrow format
//! * [`arrow-data`][arrow_data] - the underlying data of arrow arrays
//! * [`arrow-ipc`][arrow_ipc] - read/write IPC to arrow format
//! * [`arrow-json`][arrow_json] - read/write JSON to arrow format
//! * [`arrow-ord`][arrow_ord] - ordering kernels for arrow arrays
//! * [`arrow-row`][arrow_row] - comparable row format
//! * [`arrow-schema`][arrow_schema] - the logical types for arrow arrays
//! * [`arrow-select`][arrow_select] - selection kernels for arrow arrays
//! * [`arrow-string`][arrow_string] - string kernels for arrow arrays
//!
//! Some functionality is also distributed independently of this crate:
//!
//! * [`arrow-flight`] - support for [Arrow Flight RPC]
//! * [`arrow-integration-test`] - support for [Arrow JSON Test Format]
//! * [`parquet`](https://docs.rs/parquet/latest/parquet/) - support for [Apache Parquet]
//!
//! # Safety and Security
//!
//! Like many crates, this crate makes use of unsafe where prudent. However, it endeavours to be
//! sound. Specifically, **it should not be possible to trigger undefined behaviour using safe APIs.**
//!
//! If you think you have found an instance where this is possible, please file
//! a ticket in our [issue tracker] and it will be triaged and fixed. For more information on
//! arrow's use of unsafe, see [here](https://github.com/apache/arrow-rs/tree/main/arrow#safety).
//!
//! # Higher-level Processing
//!
//! This crate aims to provide reusable, low-level primitives for operating on columnar data. For
//! more sophisticated query processing workloads, consider checking out [DataFusion]. This
//! orchestrates the primitives exported by this crate into an embeddable query engine, with
//! SQL and DataFrame frontends, and heavily influences this crate's roadmap.
//!
//! [`arrow`]: https://github.com/apache/arrow-rs
//! [`array`]: mod@array
//! [`Array`]: array::Array
//! [`ArrayRef`]: array::ArrayRef
//! [`ArrayData`]: array::ArrayData
//! [`make_array`]: array::make_array
//! [`Buffer`]: buffer::Buffer
//! [`RecordBatch`]: record_batch::RecordBatch
//! [`arrow-flight`]: https://docs.rs/arrow-flight/latest/arrow_flight/
//! [`arrow-integration-test`]: https://docs.rs/arrow-integration-test/latest/arrow_integration_test/
//! [`parquet`]: https://docs.rs/parquet/latest/parquet/
//! [Arrow Flight RPC]: https://arrow.apache.org/docs/format/Flight.html
//! [Arrow JSON Test Format]: https://github.com/apache/arrow/blob/master/docs/source/format/Integration.rst#json-test-data-format
//! [Apache Parquet]: https://parquet.apache.org/
//! [DataFusion]: https://github.com/apache/arrow-datafusion
//! [issue tracker]: https://github.com/apache/arrow-rs/issues

#![deny(clippy::redundant_clone)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![allow(rustdoc::invalid_html_tags)]
pub use arrow_array::{downcast_dictionary_array, downcast_primitive_array};

pub use arrow_buffer::{alloc, buffer};

/// Arrow crate version
pub const ARROW_VERSION: &str = env!("CARGO_PKG_VERSION");

pub mod array;
pub mod compute;
#[cfg(feature = "csv")]
pub use arrow_csv as csv;
pub mod datatypes;
pub mod error;
#[cfg(feature = "ffi")]
pub use arrow_array::ffi;
#[cfg(feature = "ffi")]
pub use arrow_array::ffi_stream;
#[cfg(feature = "ipc")]
pub use arrow_ipc as ipc;
#[cfg(feature = "json")]
pub use arrow_json as json;
#[cfg(feature = "pyarrow")]
pub mod pyarrow;

/// Contains the `RecordBatch` type and associated traits
pub mod record_batch {
    pub use arrow_array::{
        RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, RecordBatchWriter,
    };
}
pub use arrow_array::temporal_conversions;
pub use arrow_row as row;
pub mod tensor;
pub mod util;