arrow_avro/lib.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Convert data to / from the [Apache Arrow] memory format and [Apache Avro].
19//!
20//! This crate provides:
21//! - a [`reader`] that decodes Avro (Object Container Files, Avro Single‑Object encoding,
22//! and Confluent Schema Registry wire format) into Arrow `RecordBatch`es,
23//! - and a [`writer`] that encodes Arrow `RecordBatch`es into Avro (OCF or SOE).
24//!
25//! If you’re new to Arrow or Avro, see:
26//! - Arrow project site: <https://arrow.apache.org/>
27//! - Avro 1.11.1 specification: <https://avro.apache.org/docs/1.11.1/specification/>
28//!
29//! ## Example: OCF (Object Container File) round‑trip *(runnable)*
30//!
31//! The example below creates an Arrow table, writes an **Avro OCF** fully in memory,
32//! and then reads it back. OCF is a self‑describing file format that embeds the Avro
33//! schema in a header with optional compression and block sync markers.
34//! Spec: <https://avro.apache.org/docs/1.11.1/specification/#object-container-files>
35//!
36//! ```
37//! use std::io::Cursor;
38//! use std::sync::Arc;
39//! use arrow_array::{ArrayRef, Int32Array, RecordBatch};
40//! use arrow_schema::{DataType, Field, Schema};
41//! use arrow_avro::writer::AvroWriter;
42//! use arrow_avro::reader::ReaderBuilder;
43//!
44//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
45//! // Build a tiny Arrow batch
46//! let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
47//! let batch = RecordBatch::try_new(
48//! Arc::new(schema.clone()),
49//! vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
50//! )?;
51//!
52//! // Write an Avro **Object Container File** (OCF) to a Vec<u8>
53//! let sink: Vec<u8> = Vec::new();
54//! let mut w = AvroWriter::new(sink, schema.clone())?;
55//! w.write(&batch)?;
56//! w.finish()?;
57//! let bytes = w.into_inner();
58//! assert!(!bytes.is_empty());
59//!
60//! // Read it back
61//! let mut r = ReaderBuilder::new().build(Cursor::new(bytes))?;
62//! let out = r.next().unwrap()?;
63//! assert_eq!(out.num_rows(), 3);
64//! # Ok(()) }
65//! ```
66//!
67//! ## Quickstart: SOE (Single‑Object Encoding) round‑trip *(runnable)*
68//!
//! Avro **Single‑Object Encoding (SOE)** prefixes an Avro body with a 2‑byte marker
//! `0xC3 0x01` followed by an **8‑byte little‑endian CRC‑64‑AVRO Rabin fingerprint**
//! of the writer schema. Spec:
//! <https://avro.apache.org/docs/1.11.1/specification/#single-object-encoding>
73//!
74//! This example registers the writer schema (computing a Rabin fingerprint), writes a
75//! single‑row Avro body (using `AvroStreamWriter`), constructs the SOE frame, and decodes it back to Arrow.
76//!
77//! ```
78//! use std::collections::HashMap;
79//! use std::sync::Arc;
80//! use arrow_array::{ArrayRef, Int64Array, RecordBatch};
81//! use arrow_schema::{DataType, Field, Schema};
82//! use arrow_avro::writer::{AvroStreamWriter, WriterBuilder};
83//! use arrow_avro::reader::ReaderBuilder;
84//! use arrow_avro::schema::{AvroSchema, SchemaStore, FingerprintStrategy, SCHEMA_METADATA_KEY};
85//!
86//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
87//! // Writer schema: { "type":"record","name":"User","fields":[{"name":"x","type":"long"}] }
88//! let writer_json = r#"{"type":"record","name":"User","fields":[{"name":"x","type":"long"}]}"#;
89//! let mut store = SchemaStore::new(); // Rabin CRC‑64‑AVRO by default
90//! let _fp = store.register(AvroSchema::new(writer_json.to_string()))?;
91//!
92//! // Build an Arrow schema that references the same Avro JSON
93//! let mut md = HashMap::new();
94//! md.insert(SCHEMA_METADATA_KEY.to_string(), writer_json.to_string());
95//! let schema = Schema::new_with_metadata(
96//! vec![Field::new("x", DataType::Int64, false)],
97//! md,
98//! );
99//!
100//! // One‑row batch: { x: 7 }
101//! let batch = RecordBatch::try_new(
102//! Arc::new(schema.clone()),
103//! vec![Arc::new(Int64Array::from(vec![7])) as ArrayRef],
104//! )?;
105//!
106//! // Stream‑write a single record; the writer adds **SOE** (C3 01 + Rabin) automatically.
107//! let sink: Vec<u8> = Vec::new();
108//! let mut w: AvroStreamWriter<Vec<u8>> = WriterBuilder::new(schema.clone())
109//! .with_fingerprint_strategy(FingerprintStrategy::Rabin)
110//! .build(sink)?;
111//! w.write(&batch)?;
112//! w.finish()?;
113//! let frame = w.into_inner(); // already: C3 01 + 8B LE Rabin + Avro body
114//! assert!(frame.len() > 10);
115//!
116//! // Decode
117//! let mut dec = ReaderBuilder::new()
118//! .with_writer_schema_store(store)
119//! .build_decoder()?;
120//! dec.decode(&frame)?;
121//! let out = dec.flush()?.expect("one row");
122//! assert_eq!(out.num_rows(), 1);
123//! # Ok(()) }
124//! ```
125//!
126//! ## `async` Reading (`async` feature)
127//!
128//! The [`reader`] module provides async APIs for reading Avro files when the `async`
129//! feature is enabled.
130//!
131//! [`AsyncAvroFileReader`] implements `Stream<Item = Result<RecordBatch, ArrowError>>`,
132//! allowing efficient async streaming of record batches. When the `object_store` feature
133//! is enabled, [`AvroObjectReader`] provides integration with object storage services
134//! such as S3 via the [object_store] crate.
135//!
136//! ```ignore
137//! use std::sync::Arc;
138//! use arrow_avro::reader::{AsyncAvroFileReader, AvroObjectReader};
139//! use futures::TryStreamExt;
140//! use object_store::ObjectStore;
141//! use object_store::local::LocalFileSystem;
142//! use object_store::path::Path;
143//!
144//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
145//! let store: Arc<dyn ObjectStore> = Arc::new(LocalFileSystem::new());
146//! let path = Path::from("data/example.avro");
147//! let meta = store.head(&path).await?;
148//!
149//! let reader = AvroObjectReader::new(store, path);
150//! let stream = AsyncAvroFileReader::builder(reader, meta.size, 1024)
151//! .try_build()
152//! .await?;
153//!
154//! let batches: Vec<_> = stream.try_collect().await?;
155//! # Ok(())
156//! # }
157//! ```
158//!
159//! [object_store]: https://docs.rs/object_store/latest/object_store/
160//!
161//! ---
162//!
163//! ### Modules
164//!
165//! - [`reader`]: read Avro (OCF, SOE, Confluent) into Arrow `RecordBatch`es.
166//! - With the `async` feature: [`AsyncAvroFileReader`] for async streaming reads.
167//! - With the `object_store` feature: [`AvroObjectReader`] for reading from cloud storage.
168//! - [`writer`]: write Arrow `RecordBatch`es as Avro (OCF, SOE, Confluent, Apicurio).
169//! - [`schema`]: Avro schema parsing / fingerprints / registries.
170//! - [`compression`]: codecs used for **OCF block compression** (i.e., Deflate, Snappy, Zstandard, BZip2, and XZ).
171//! - [`codec`]: internal Avro-Arrow type conversion and row decode/encode plans.
172//!
173//! [`AsyncAvroFileReader`]: reader::AsyncAvroFileReader
174//! [`AvroObjectReader`]: reader::AvroObjectReader
175//!
176//! ### Features
177//!
178//! **OCF compression (enabled by default)**
179//! - `deflate` — enable DEFLATE block compression (via `flate2`).
180//! - `snappy` — enable Snappy block compression with 4‑byte BE CRC32 (per Avro).
181//! - `zstd` — enable Zstandard block compression.
182//! - `bzip2` — enable BZip2 block compression.
183//! - `xz` — enable XZ/LZMA block compression.
184//!
185//! **Async & Object Store (opt‑in)**
186//! - `async` — enable async APIs for reading Avro (`AsyncAvroFileReader`, `AsyncFileReader` trait).
187//! - `object_store` — enable integration with the [`object_store`] crate for reading Avro
188//! from cloud storage (S3, GCS, Azure Blob, etc.) via `AvroObjectReader`. Implies `async`.
189//!
190//! **Schema fingerprints & helpers (opt‑in)**
191//! - `md5` — enable MD5 writer‑schema fingerprints.
192//! - `sha256` — enable SHA‑256 writer‑schema fingerprints.
193//! - `small_decimals` — support for compact Arrow representations of small Avro decimals (`Decimal32` and `Decimal64`).
194//! - `avro_custom_types` — interpret Avro fields annotated with Arrow‑specific logical
195//! types such as `arrow.duration-nanos`, `arrow.duration-micros`,
196//! `arrow.duration-millis`, or `arrow.duration-seconds` as Arrow `Duration(TimeUnit)`.
197//! - `canonical_extension_types` — enable support for Arrow [canonical extension types]
198//! from `arrow-schema` so `arrow-avro` can respect them during Avro↔Arrow mapping.
199//!
200//! **Notes**
//! - OCF compression codecs apply only to **Object Container Files**; they do not affect
//!   Avro single‑object encodings (SOE, Confluent, Apicurio).
203//!
204//! [`object_store`]: https://docs.rs/object_store/latest/object_store/
205//!
206//! [canonical extension types]: https://arrow.apache.org/docs/format/CanonicalExtensions.html
207//!
208//! [Apache Arrow]: https://arrow.apache.org/
209//! [Apache Avro]: https://avro.apache.org/
210
211#![doc(
212 html_logo_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_white-bg.svg",
213 html_favicon_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_transparent-bg.svg"
214)]
215#![cfg_attr(docsrs, feature(doc_cfg))]
216#![warn(missing_docs)]
217
/// Core functionality for reading Avro data into Arrow arrays
///
/// Implements the primary reader interface and record decoding logic for
/// Object Container Files, single‑object encoding, and Confluent framing.
pub mod reader;

/// Core functionality for writing Arrow arrays as Avro data
///
/// Implements the primary writer interface and record encoding logic.
pub mod writer;

/// Avro schema parsing and representation
///
/// Provides types for parsing and representing Avro schema definitions,
/// including schema fingerprints and schema stores.
pub mod schema;

/// Compression codec implementations for Avro
///
/// Provides support for the block compression algorithms used in Avro
/// Object Container Files: Deflate, Snappy, Zstandard, BZip2, and XZ
/// (each behind its corresponding cargo feature).
pub mod compression;

/// Data type conversions between Avro and Arrow types
///
/// This module contains the necessary types and functions to convert between
/// Avro data types and Arrow data types.
pub mod codec;

/// Error types returned by this crate (`AvroError` and its variants)
pub mod errors;
247
/// Extension trait for [`codec::AvroField`] to add Utf8View support
///
/// This trait adds methods for working with Utf8View support to the
/// `AvroField` struct, allowing string columns to be decoded as
/// `StringViewArray` rather than `StringArray`.
pub trait AvroFieldExt {
    /// Returns a new field with Utf8View support enabled for string data
    ///
    /// This will convert any string data to use StringViewArray instead of StringArray.
    fn with_utf8view(&self) -> Self;
}
257
impl AvroFieldExt for codec::AvroField {
    fn with_utf8view(&self) -> Self {
        // Delegate to the inherent `AvroField::with_utf8view`. The fully
        // qualified path is deliberate: it makes clear we are calling the
        // inherent method rather than recursing into this trait method,
        // which shares the same name.
        codec::AvroField::with_utf8view(self)
    }
}
263
#[cfg(test)]
mod test_util {
    /// Builds the path to a file in the Arrow testing-data tree.
    ///
    /// Uses the directory named by the `ARROW_TEST_DATA` environment variable
    /// when it is set; otherwise falls back to the relative `../testing/data`
    /// checkout location.
    pub fn arrow_test_data(path: &str) -> String {
        let base = std::env::var("ARROW_TEST_DATA")
            .unwrap_or_else(|_| String::from("../testing/data"));
        format!("{base}/{path}")
    }
}