arrow_avro/lib.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Convert data to / from the [Apache Arrow] memory format and [Apache Avro].
19//!
20//! This crate provides:
21//! - a [`reader`] that decodes Avro (Object Container Files, Avro Single‑Object encoding,
22//! and Confluent Schema Registry wire format) into Arrow `RecordBatch`es,
23//! - and a [`writer`] that encodes Arrow `RecordBatch`es into Avro (OCF or SOE).
24//!
25//! If you’re new to Arrow or Avro, see:
26//! - Arrow project site: <https://arrow.apache.org/>
27//! - Avro 1.11.1 specification: <https://avro.apache.org/docs/1.11.1/specification/>
28//!
29//! ## Example: OCF (Object Container File) round‑trip *(runnable)*
30//!
31//! The example below creates an Arrow table, writes an **Avro OCF** fully in memory,
32//! and then reads it back. OCF is a self‑describing file format that embeds the Avro
33//! schema in a header with optional compression and block sync markers.
34//! Spec: <https://avro.apache.org/docs/1.11.1/specification/#object-container-files>
35//!
36//! ```
37//! use std::io::Cursor;
38//! use std::sync::Arc;
39//! use arrow_array::{ArrayRef, Int32Array, RecordBatch};
40//! use arrow_schema::{DataType, Field, Schema};
41//! use arrow_avro::writer::AvroWriter;
42//! use arrow_avro::reader::ReaderBuilder;
43//!
44//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
45//! // Build a tiny Arrow batch
46//! let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
47//! let batch = RecordBatch::try_new(
48//! Arc::new(schema.clone()),
49//! vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
50//! )?;
51//!
52//! // Write an Avro **Object Container File** (OCF) to a Vec<u8>
53//! let sink: Vec<u8> = Vec::new();
54//! let mut w = AvroWriter::new(sink, schema.clone())?;
55//! w.write(&batch)?;
56//! w.finish()?;
57//! let bytes = w.into_inner();
58//! assert!(!bytes.is_empty());
59//!
60//! // Read it back
61//! let mut r = ReaderBuilder::new().build(Cursor::new(bytes))?;
62//! let out = r.next().unwrap()?;
63//! assert_eq!(out.num_rows(), 3);
64//! # Ok(()) }
65//! ```
66//!
67//! ## Quickstart: SOE (Single‑Object Encoding) round‑trip *(runnable)*
68//!
//! An Avro **Single‑Object Encoding (SOE)** frame consists of the 2‑byte marker
//! `0xC3 0x01`, followed by the **8‑byte little‑endian CRC‑64‑AVRO Rabin fingerprint**
//! of the writer schema, and then the Avro‑encoded body. Spec:
72//! <https://avro.apache.org/docs/1.11.1/specification/#single-object-encoding>
73//!
74//! This example registers the writer schema (computing a Rabin fingerprint), writes a
75//! single‑row Avro body (using `AvroStreamWriter`), constructs the SOE frame, and decodes it back to Arrow.
76//!
77//! ```
78//! use std::collections::HashMap;
79//! use std::sync::Arc;
80//! use arrow_array::{ArrayRef, Int64Array, RecordBatch};
81//! use arrow_schema::{DataType, Field, Schema};
82//! use arrow_avro::writer::{AvroStreamWriter, WriterBuilder};
83//! use arrow_avro::reader::ReaderBuilder;
84//! use arrow_avro::schema::{AvroSchema, SchemaStore, FingerprintStrategy, SCHEMA_METADATA_KEY};
85//!
86//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
87//! // Writer schema: { "type":"record","name":"User","fields":[{"name":"x","type":"long"}] }
88//! let writer_json = r#"{"type":"record","name":"User","fields":[{"name":"x","type":"long"}]}"#;
89//! let mut store = SchemaStore::new(); // Rabin CRC‑64‑AVRO by default
90//! let _fp = store.register(AvroSchema::new(writer_json.to_string()))?;
91//!
92//! // Build an Arrow schema that references the same Avro JSON
93//! let mut md = HashMap::new();
94//! md.insert(SCHEMA_METADATA_KEY.to_string(), writer_json.to_string());
95//! let schema = Schema::new_with_metadata(
96//! vec![Field::new("x", DataType::Int64, false)],
97//! md,
98//! );
99//!
100//! // One‑row batch: { x: 7 }
101//! let batch = RecordBatch::try_new(
102//! Arc::new(schema.clone()),
103//! vec![Arc::new(Int64Array::from(vec![7])) as ArrayRef],
104//! )?;
105//!
106//! // Stream‑write a single record; the writer adds **SOE** (C3 01 + Rabin) automatically.
107//! let sink: Vec<u8> = Vec::new();
108//! let mut w: AvroStreamWriter<Vec<u8>> = WriterBuilder::new(schema.clone())
109//! .with_fingerprint_strategy(FingerprintStrategy::Rabin)
110//! .build(sink)?;
111//! w.write(&batch)?;
112//! w.finish()?;
113//! let frame = w.into_inner(); // already: C3 01 + 8B LE Rabin + Avro body
114//! assert!(frame.len() > 10);
115//!
116//! // Decode
117//! let mut dec = ReaderBuilder::new()
118//! .with_writer_schema_store(store)
119//! .build_decoder()?;
120//! dec.decode(&frame)?;
121//! let out = dec.flush()?.expect("one row");
122//! assert_eq!(out.num_rows(), 1);
123//! # Ok(()) }
124//! ```
125//!
126//! ---
127//!
128//! ### Modules
129//!
130//! - [`reader`]: read Avro (OCF, SOE, Confluent) into Arrow `RecordBatch`es.
131//! - [`writer`]: write Arrow `RecordBatch`es as Avro (OCF, SOE, Confluent, Apicurio).
132//! - [`schema`]: Avro schema parsing / fingerprints / registries.
133//! - [`compression`]: codecs used for **OCF block compression** (i.e., Deflate, Snappy, Zstandard, BZip2, and XZ).
134//! - [`codec`]: internal Avro-Arrow type conversion and row decode/encode plans.
135//!
136//! ### Features
137//!
138//! **OCF compression (enabled by default)**
139//! - `deflate` — enable DEFLATE block compression (via `flate2`).
140//! - `snappy` — enable Snappy block compression with 4‑byte BE CRC32 (per Avro).
141//! - `zstd` — enable Zstandard block compression.
142//! - `bzip2` — enable BZip2 block compression.
143//! - `xz` — enable XZ/LZMA block compression.
144//!
145//! **Schema fingerprints & helpers (opt‑in)**
146//! - `md5` — enable MD5 writer‑schema fingerprints.
147//! - `sha256` — enable SHA‑256 writer‑schema fingerprints.
148//! - `small_decimals` — support for compact Arrow representations of small Avro decimals (`Decimal32` and `Decimal64`).
149//! - `avro_custom_types` — interpret Avro fields annotated with Arrow‑specific logical
150//! types such as `arrow.duration-nanos`, `arrow.duration-micros`,
151//! `arrow.duration-millis`, or `arrow.duration-seconds` as Arrow `Duration(TimeUnit)`.
152//! - `canonical_extension_types` — enable support for Arrow [canonical extension types]
153//! from `arrow-schema` so `arrow-avro` can respect them during Avro↔Arrow mapping.
154//!
155//! **Notes**
156//! - OCF compression codecs apply only to **Object Container Files**; they do not affect Avro
157//! single object encodings.
158//!
159//! [canonical extension types]: https://arrow.apache.org/docs/format/CanonicalExtensions.html
160//!
161//! [Apache Arrow]: https://arrow.apache.org/
162//! [Apache Avro]: https://avro.apache.org/
163
164#![doc(
165 html_logo_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_white-bg.svg",
166 html_favicon_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_transparent-bg.svg"
167)]
168#![cfg_attr(docsrs, feature(doc_cfg))]
169#![warn(missing_docs)]
170
/// Core functionality for reading Avro data into Arrow arrays
///
/// Implements the primary reader interface ([`reader::ReaderBuilder`]) and the
/// record decoding logic for OCF files and single-object streams.
pub mod reader;

/// Core functionality for writing Arrow arrays as Avro data
///
/// Implements the primary writer interface ([`writer::AvroWriter`] for OCF,
/// [`writer::AvroStreamWriter`] for framed streams) and record encoding logic.
pub mod writer;

/// Avro schema parsing and representation
///
/// Provides types for parsing and representing Avro schema definitions,
/// including writer-schema fingerprints and schema stores
/// (e.g. [`schema::AvroSchema`], [`schema::SchemaStore`]).
pub mod schema;

/// Compression codec implementations for Avro
///
/// Provides support for the block compression algorithms used in Avro
/// Object Container Files — Deflate, Snappy, Zstandard, BZip2, and XZ —
/// each gated behind a cargo feature (see the crate-level docs).
pub mod compression;

/// Data type conversions between Avro and Arrow types
///
/// This module contains the necessary types and functions to convert between
/// Avro data types and Arrow data types.
pub mod codec;
197
/// Extension trait adding `Utf8View` support to [`codec::AvroField`]
///
/// Provides an opt-in for decoding Avro string data into Arrow's
/// `StringViewArray` representation instead of the default `StringArray`.
pub trait AvroFieldExt {
    /// Returns a new field with `Utf8View` support enabled for string data
    ///
    /// When enabled, string data decodes to `StringViewArray` instead of
    /// `StringArray`. The original field is not modified.
    fn with_utf8view(&self) -> Self;
}
207
// Blanket wiring of the extension trait onto the concrete field type.
impl AvroFieldExt for codec::AvroField {
    fn with_utf8view(&self) -> Self {
        // Delegate via the fully-qualified path so this resolves to the
        // inherent `AvroField::with_utf8view`, not back into this trait method.
        codec::AvroField::with_utf8view(self)
    }
}
213
#[cfg(test)]
mod test_util {
    /// Resolves `path` against the Arrow testing-data directory.
    ///
    /// Honors the `ARROW_TEST_DATA` environment variable when it is set;
    /// otherwise falls back to the relative `../testing/data` checkout.
    pub fn arrow_test_data(path: &str) -> String {
        std::env::var("ARROW_TEST_DATA")
            .map(|dir| format!("{dir}/{path}"))
            .unwrap_or_else(|_| format!("../testing/data/{path}"))
    }
}
222}