arrow_avro/reader/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Avro reader
19//!
20//! Facilities to read Apache Avro–encoded data into Arrow's `RecordBatch` format.
21//!
22//! ### Limitations
23//!
24//!- **Avro unions with > 127 branches are not supported.**
25//!  When decoding Avro unions to Arrow `UnionArray`, Arrow stores the union
26//!  type identifiers in an **8‑bit signed** buffer (`i8`). This implies a
27//!  practical limit of **127** distinct branch ids. Inputs that resolve to
28//!  more than 127 branches will return an error. If you truly need more,
29//!  model the schema as a **union of unions**, per the Arrow format spec.
30//!
31//!  See: Arrow Columnar Format — Dense Union (“types buffer: 8‑bit signed;
32//!  a union with more than 127 possible types can be modeled as a union of
33//!  unions”).
34//!
35//! This module exposes three layers of the API surface, from highest to lowest-level:
36//!
37//! * [`ReaderBuilder`](crate::reader::ReaderBuilder): configures how Avro is read (batch size, strict union handling,
38//!   string representation, reader schema, etc.) and produces either:
39//!   * a `Reader` for **Avro Object Container Files (OCF)** read from any `BufRead`, or
40//!   * a low-level `Decoder` for **single‑object encoded** Avro bytes and Confluent
41//!     **Schema Registry** framed messages.
42//! * [`Reader`](crate::reader::Reader): a convenient, synchronous iterator over `RecordBatch` decoded from an OCF
43//!   input. Implements [`Iterator<Item = Result<RecordBatch, ArrowError>>`] and
44//!   `RecordBatchReader`.
45//! * [`Decoder`](crate::reader::Decoder): a push‑based row decoder that consumes SOE framed Avro bytes and yields ready
46//!   `RecordBatch` values when batches fill. This is suitable for integrating with async
47//!   byte streams, network protocols, or other custom data sources.
48//!
49//! ## Encodings and when to use which type
50//!
51//! * **Object Container File (OCF)**: A self‑describing file format with a header containing
52//!   the writer schema, optional compression codec, and a sync marker, followed by one or
53//!   more data blocks. Use `Reader` for this format. See the Avro 1.11.1 specification
54//!   (“Object Container Files”). <https://avro.apache.org/docs/1.11.1/specification/#object-container-files>
55//! * **Single‑Object Encoding**: A stream‑friendly framing that prefixes each record body with
56//!   the 2‑byte marker `0xC3 0x01` followed by the **8‑byte little‑endian CRC‑64‑AVRO Rabin
57//!   fingerprint** of the writer schema, then the Avro binary body. Use `Decoder` with a
58//!   populated `SchemaStore` to resolve fingerprints to full schemas.
59//!   See “Single object encoding” in the Avro 1.11.1 spec.
60//!   <https://avro.apache.org/docs/1.11.1/specification/#single-object-encoding>
61//! * **Confluent Schema Registry wire format**: A 1‑byte magic `0x00`, a **4‑byte big‑endian**
62//!   schema ID, then the Avro‑encoded body. Use `Decoder` with a `SchemaStore` configured
63//!   for `FingerprintAlgorithm::Id` and entries keyed by `Fingerprint::Id`. See
64//!   Confluent’s “Wire format” documentation.
65//!   <https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
66//! * **Apicurio Schema Registry wire format**: A 1‑byte magic `0x00`, an **8‑byte big‑endian**
67//!   global schema ID, then the Avro‑encoded body. Use `Decoder` with a `SchemaStore` configured
68//!   for `FingerprintAlgorithm::Id64` and entries keyed by `Fingerprint::Id64`. See
69//!   Apicurio’s “Avro SerDe” documentation.
70//!   <https://www.apicur.io/registry/docs/apicurio-registry/1.3.3.Final/getting-started/assembly-using-kafka-client-serdes.html#registry-serdes-types-avro-registry>
71//!
72//! ## Basic file usage (OCF)
73//!
74//! Use `ReaderBuilder::build` to construct a `Reader` from any `BufRead`. The doctest below
75//! creates a tiny OCF in memory using `AvroWriter` and then reads it back.
76//!
77//! ```
78//! use std::io::Cursor;
79//! use std::sync::Arc;
80//! use arrow_array::{ArrayRef, Int32Array, RecordBatch};
81//! use arrow_schema::{DataType, Field, Schema};
82//! use arrow_avro::writer::AvroWriter;
83//! use arrow_avro::reader::ReaderBuilder;
84//!
85//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
86//! // Build a minimal Arrow schema and batch
87//! let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
88//! let batch = RecordBatch::try_new(
89//!     Arc::new(schema.clone()),
90//!     vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
91//! )?;
92//!
93//! // Write an Avro OCF to memory
94//! let buffer: Vec<u8> = Vec::new();
95//! let mut writer = AvroWriter::new(buffer, schema.clone())?;
96//! writer.write(&batch)?;
97//! writer.finish()?;
98//! let bytes = writer.into_inner();
99//!
100//! // Read it back with ReaderBuilder
101//! let mut reader = ReaderBuilder::new().build(Cursor::new(bytes))?;
102//! let out = reader.next().unwrap()?;
103//! assert_eq!(out.num_rows(), 3);
104//! # Ok(()) }
105//! ```
106//!
107//! ## Streaming usage (single‑object / Confluent / Apicurio)
108//!
109//! The `Decoder` lets you integrate Avro decoding with **any** source of bytes by
110//! periodically calling `Decoder::decode` with new data and calling `Decoder::flush`
111//! to get a `RecordBatch` once at least one row is complete.
112//!
113//! The example below shows how to decode from an arbitrary stream of `bytes::Bytes` using
114//! `futures` utilities. Note: this is illustrative and keeps a single in‑memory `Bytes`
115//! buffer for simplicity—real applications typically maintain a rolling buffer.
116//!
117//! ```
118//! use bytes::{Buf, Bytes};
119//! use futures::{Stream, StreamExt};
120//! use std::task::{Poll, ready};
121//! use arrow_array::RecordBatch;
122//! use arrow_schema::ArrowError;
123//! use arrow_avro::reader::Decoder;
124//!
125//! /// Decode a stream of Avro-framed bytes into RecordBatch values.
126//! fn decode_stream<S: Stream<Item = Bytes> + Unpin>(
127//!     mut decoder: Decoder,
128//!     mut input: S,
129//! ) -> impl Stream<Item = Result<RecordBatch, ArrowError>> {
130//!     let mut buffered = Bytes::new();
131//!     futures::stream::poll_fn(move |cx| {
132//!         loop {
133//!             if buffered.is_empty() {
134//!                 buffered = match ready!(input.poll_next_unpin(cx)) {
135//!                     Some(b) => b,
136//!                     None => break, // EOF
137//!                 };
138//!             }
139//!             // Feed as much as possible
140//!             let decoded = match decoder.decode(buffered.as_ref()) {
141//!                 Ok(n) => n,
142//!                 Err(e) => return Poll::Ready(Some(Err(e))),
143//!             };
144//!             let read = buffered.len();
145//!             buffered.advance(decoded);
146//!             if decoded != read {
147//!                 // decoder made partial progress; request more bytes
148//!                 break
149//!             }
150//!         }
151//!         // Return a batch if one or more rows are complete
152//!         Poll::Ready(decoder.flush().transpose())
153//!     })
154//! }
155//! ```
156//!
157//! ### Building and using a `Decoder` for **single‑object encoding** (Rabin fingerprints)
158//!
159//! The doctest below **writes** a single‑object framed record using the Avro writer
160//! (no manual varints) for the writer schema
161//! (`{"type":"record","name":"User","fields":[{"name":"id","type":"long"}]}`)
162//! and then decodes it into a `RecordBatch`.
163//!
164//! ```
165//! use std::sync::Arc;
166//! use std::collections::HashMap;
167//! use arrow_array::{ArrayRef, Int64Array, RecordBatch};
168//! use arrow_schema::{DataType, Field, Schema};
169//! use arrow_avro::schema::{AvroSchema, SchemaStore, SCHEMA_METADATA_KEY, FingerprintStrategy};
170//! use arrow_avro::writer::{WriterBuilder, format::AvroSoeFormat};
171//! use arrow_avro::reader::ReaderBuilder;
172//!
173//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
174//! // Register the writer schema (Rabin fingerprint by default).
175//! let mut store = SchemaStore::new();
176//! let avro_schema = AvroSchema::new(r#"{"type":"record","name":"User","fields":[
177//!   {"name":"id","type":"long"}]}"#.to_string());
178//! let _fp = store.register(avro_schema.clone())?;
179//!
180//! // Create a single-object framed record { id: 42 } with the Avro writer.
181//! let mut md = HashMap::new();
182//! md.insert(SCHEMA_METADATA_KEY.to_string(), avro_schema.json_string.clone());
183//! let arrow = Schema::new_with_metadata(vec![Field::new("id", DataType::Int64, false)], md);
184//! let batch = RecordBatch::try_new(
185//!     Arc::new(arrow.clone()),
186//!     vec![Arc::new(Int64Array::from(vec![42])) as ArrayRef],
187//! )?;
188//! let mut w = WriterBuilder::new(arrow)
189//!     .with_fingerprint_strategy(FingerprintStrategy::Rabin) // SOE prefix
190//!     .build::<_, AvroSoeFormat>(Vec::new())?;
191//! w.write(&batch)?;
192//! w.finish()?;
193//! let frame = w.into_inner(); // C3 01 + fp + Avro body
194//!
195//! // Decode with a `Decoder`
196//! let mut dec = ReaderBuilder::new()
197//!   .with_writer_schema_store(store)
198//!   .with_batch_size(1024)
199//!   .build_decoder()?;
200//!
201//! dec.decode(&frame)?;
202//! let out = dec.flush()?.expect("one batch");
203//! assert_eq!(out.num_rows(), 1);
204//! # Ok(()) }
205//! ```
206//!
207//! See Avro 1.11.1 “Single object encoding” for details of the 2‑byte marker
208//! and little‑endian CRC‑64‑AVRO fingerprint:
209//! <https://avro.apache.org/docs/1.11.1/specification/#single-object-encoding>
210//!
211//! ### Building and using a `Decoder` for **Confluent Schema Registry** framing
212//!
213//! The Confluent wire format is: 1‑byte magic `0x00`, then a **4‑byte big‑endian** schema ID,
214//! then the Avro body. The doctest below crafts two messages for the same schema ID and
215//! decodes them into a single `RecordBatch` with two rows.
216//!
217//! ```
218//! use std::sync::Arc;
219//! use std::collections::HashMap;
220//! use arrow_array::{ArrayRef, Int64Array, StringArray, RecordBatch};
221//! use arrow_schema::{DataType, Field, Schema};
222//! use arrow_avro::schema::{AvroSchema, SchemaStore, Fingerprint, FingerprintAlgorithm, SCHEMA_METADATA_KEY, FingerprintStrategy};
223//! use arrow_avro::writer::{WriterBuilder, format::AvroSoeFormat};
224//! use arrow_avro::reader::ReaderBuilder;
225//!
226//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
227//! // Set up a store keyed by numeric IDs (Confluent).
228//! let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
229//! let schema_id = 7u32;
230//! let avro_schema = AvroSchema::new(r#"{"type":"record","name":"User","fields":[
231//!   {"name":"id","type":"long"}, {"name":"name","type":"string"}]}"#.to_string());
232//! store.set(Fingerprint::Id(schema_id), avro_schema.clone())?;
233//!
234//! // Write two Confluent-framed messages {id:1,name:"a"} and {id:2,name:"b"}.
235//! fn msg(id: i64, name: &str, schema: &AvroSchema, schema_id: u32) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
236//!     let mut md = HashMap::new();
237//!     md.insert(SCHEMA_METADATA_KEY.to_string(), schema.json_string.clone());
238//!     let arrow = Schema::new_with_metadata(
239//!         vec![Field::new("id", DataType::Int64, false), Field::new("name", DataType::Utf8, false)],
240//!         md,
241//!     );
242//!     let batch = RecordBatch::try_new(
243//!         Arc::new(arrow.clone()),
244//!         vec![
245//!           Arc::new(Int64Array::from(vec![id])) as ArrayRef,
246//!           Arc::new(StringArray::from(vec![name])) as ArrayRef,
247//!         ],
248//!     )?;
249//!     let mut w = WriterBuilder::new(arrow)
250//!         .with_fingerprint_strategy(FingerprintStrategy::Id(schema_id)) // 0x00 + ID + body
251//!         .build::<_, AvroSoeFormat>(Vec::new())?;
252//!     w.write(&batch)?; w.finish()?;
253//!     Ok(w.into_inner())
254//! }
255//! let m1 = msg(1, "a", &avro_schema, schema_id)?;
256//! let m2 = msg(2, "b", &avro_schema, schema_id)?;
257//!
258//! // Decode both into a single batch.
259//! let mut dec = ReaderBuilder::new()
260//!   .with_writer_schema_store(store)
261//!   .with_batch_size(1024)
262//!   .build_decoder()?;
263//! dec.decode(&m1)?;
264//! dec.decode(&m2)?;
265//! let batch = dec.flush()?.expect("batch");
266//! assert_eq!(batch.num_rows(), 2);
267//! # Ok(()) }
268//! ```
269//!
270//! See Confluent’s “Wire format” notes: magic byte `0x00`, 4‑byte **big‑endian** schema ID,
271//! then the Avro‑encoded payload.
272//! <https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
273//!
274//! ## Schema resolution (reader vs. writer schemas)
275//!
276//! Avro supports resolving data written with one schema (“writer”) into another (“reader”)
277//! using rules like **field aliases**, **default values**, and **numeric promotions**.
278//! In practice this lets you evolve schemas over time while remaining compatible with old data.
279//!
280//! *Spec background:* See Avro’s **Schema Resolution** (aliases, defaults) and the Confluent
281//! **Wire format** (magic `0x00` + big‑endian schema id + Avro body).
282//! <https://avro.apache.org/docs/1.11.1/specification/#schema-resolution>
283//! <https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
284//!
285//! ### OCF example: rename a field and add a default via a reader schema
286//!
287//! Below we write an OCF with a *writer schema* having fields `id: long`, `name: string`.
288//! We then read it with a *reader schema* that:
289//! - **renames** `name` to `full_name` via `aliases`, and
290//! - **adds** `is_active: boolean` with a **default** value `true`.
291//!
292//! ```
293//! use std::io::Cursor;
294//! use std::sync::Arc;
295//! use arrow_array::{ArrayRef, Int64Array, StringArray, RecordBatch};
296//! use arrow_schema::{DataType, Field, Schema};
297//! use arrow_avro::writer::AvroWriter;
298//! use arrow_avro::reader::ReaderBuilder;
299//! use arrow_avro::schema::AvroSchema;
300//!
301//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
302//! // Writer (past version): { id: long, name: string }
303//! let writer_arrow = Schema::new(vec![
304//!     Field::new("id", DataType::Int64, false),
305//!     Field::new("name", DataType::Utf8, false),
306//! ]);
307//! let batch = RecordBatch::try_new(
308//!     Arc::new(writer_arrow.clone()),
309//!     vec![
310//!         Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef,
311//!         Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
312//!     ],
313//! )?;
314//!
315//! // Write an OCF entirely in memory
316//! let mut w = AvroWriter::new(Vec::<u8>::new(), writer_arrow)?;
317//! w.write(&batch)?;
318//! w.finish()?;
319//! let bytes = w.into_inner();
320//!
321//! // Reader (current version):
322//! //  - record name "topLevelRecord" matches the crate's default for OCF
323//! //  - rename `name` -> `full_name` using aliases (optional)
324//! let reader_json = r#"
325//! {
326//!   "type": "record",
327//!   "name": "topLevelRecord",
328//!   "fields": [
329//!     { "name": "id", "type": "long" },
330//!     { "name": "full_name", "type": ["null","string"], "aliases": ["name"], "default": null },
331//!     { "name": "is_active", "type": "boolean", "default": true }
332//!   ]
333//! }"#;
334//!
335//! let mut reader = ReaderBuilder::new()
336//!   .with_reader_schema(AvroSchema::new(reader_json.to_string()))
337//!   .build(Cursor::new(bytes))?;
338//!
339//! let out = reader.next().unwrap()?;
340//! assert_eq!(out.num_rows(), 2);
341//! # Ok(()) }
342//! ```
343//!
344//! ### Confluent single‑object example: resolve *past* writer versions to the topic’s **current** reader schema
345//!
346//! In this scenario, the **reader schema** is the topic’s *current* schema, while the two
347//! **writer schemas** registered under Confluent IDs **0** and **1** represent *past versions*.
348//! The decoder uses the reader schema to resolve both versions.
349//!
350//! ```
351//! use std::sync::Arc;
352//! use std::collections::HashMap;
353//! use arrow_avro::reader::ReaderBuilder;
354//! use arrow_avro::schema::{
355//!     AvroSchema, Fingerprint, FingerprintAlgorithm, SchemaStore,
356//!     SCHEMA_METADATA_KEY, FingerprintStrategy,
357//! };
358//! use arrow_array::{ArrayRef, Int32Array, Int64Array, StringArray, RecordBatch};
359//! use arrow_schema::{DataType, Field, Schema};
360//!
361//! fn main() -> Result<(), Box<dyn std::error::Error>> {
362//!     // Reader: current topic schema (no reader-added fields)
363//!     //   {"type":"record","name":"User","fields":[
364//!     //     {"name":"id","type":"long"},
365//!     //     {"name":"name","type":"string"}]}
366//!     let reader_schema = AvroSchema::new(
367//!         r#"{"type":"record","name":"User",
368//!             "fields":[{"name":"id","type":"long"},{"name":"name","type":"string"}]}"#
369//!             .to_string(),
370//!     );
371//!
372//!     // Register two *writer* schemas under Confluent IDs 0 and 1
373//!     let writer_v0 = AvroSchema::new(
374//!         r#"{"type":"record","name":"User",
375//!             "fields":[{"name":"id","type":"int"},{"name":"name","type":"string"}]}"#
376//!             .to_string(),
377//!     );
378//!     let writer_v1 = AvroSchema::new(
379//!         r#"{"type":"record","name":"User",
380//!             "fields":[{"name":"id","type":"long"},{"name":"name","type":"string"},
381//!                       {"name":"email","type":["null","string"],"default":null}]}"#
382//!             .to_string(),
383//!     );
384//!
385//!     let id_v0: u32 = 0;
386//!     let id_v1: u32 = 1;
387//!
388//!     let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id); // integer IDs
389//!     store.set(Fingerprint::Id(id_v0), writer_v0.clone())?;
390//!     store.set(Fingerprint::Id(id_v1), writer_v1.clone())?;
391//!
392//!     // Write two Confluent-framed messages using each writer version
393//!     // frame0: writer v0 body {id:1001_i32, name:"v0-alice"}
394//!     let mut md0 = HashMap::new();
395//!     md0.insert(SCHEMA_METADATA_KEY.to_string(), writer_v0.json_string.clone());
396//!     let arrow0 = Schema::new_with_metadata(
397//!         vec![Field::new("id", DataType::Int32, false),
398//!              Field::new("name", DataType::Utf8, false)], md0);
399//!     let batch0 = RecordBatch::try_new(
400//!         Arc::new(arrow0.clone()),
401//!         vec![Arc::new(Int32Array::from(vec![1001])) as ArrayRef,
402//!              Arc::new(StringArray::from(vec!["v0-alice"])) as ArrayRef])?;
403//!     let mut w0 = arrow_avro::writer::WriterBuilder::new(arrow0)
404//!         .with_fingerprint_strategy(FingerprintStrategy::Id(id_v0))
405//!         .build::<_, arrow_avro::writer::format::AvroSoeFormat>(Vec::new())?;
406//!     w0.write(&batch0)?; w0.finish()?;
407//!     let frame0 = w0.into_inner(); // 0x00 + id_v0 + body
408//!
409//!     // frame1: writer v1 body {id:2002_i64, name:"v1-bob", email: Some("bob@example.com")}
410//!     let mut md1 = HashMap::new();
411//!    md1.insert(SCHEMA_METADATA_KEY.to_string(), writer_v1.json_string.clone());
412//!     let arrow1 = Schema::new_with_metadata(
413//!         vec![Field::new("id", DataType::Int64, false),
414//!              Field::new("name", DataType::Utf8, false),
415//!              Field::new("email", DataType::Utf8, true)], md1);
416//!     let batch1 = RecordBatch::try_new(
417//!         Arc::new(arrow1.clone()),
418//!         vec![Arc::new(Int64Array::from(vec![2002])) as ArrayRef,
419//!              Arc::new(StringArray::from(vec!["v1-bob"])) as ArrayRef,
420//!              Arc::new(StringArray::from(vec![Some("bob@example.com")])) as ArrayRef])?;
421//!     let mut w1 = arrow_avro::writer::WriterBuilder::new(arrow1)
422//!         .with_fingerprint_strategy(FingerprintStrategy::Id(id_v1))
423//!         .build::<_, arrow_avro::writer::format::AvroSoeFormat>(Vec::new())?;
424//!     w1.write(&batch1)?; w1.finish()?;
425//!     let frame1 = w1.into_inner(); // 0x00 + id_v1 + body
426//!
427//!     // Build a streaming Decoder that understands Confluent framing
428//!     let mut decoder = ReaderBuilder::new()
429//!         .with_reader_schema(reader_schema)
430//!         .with_writer_schema_store(store)
431//!         .with_batch_size(8) // small demo batches
432//!         .build_decoder()?;
433//!
434//!     // Decode each whole frame, then drain completed rows with flush()
435//!     let mut total_rows = 0usize;
436//!
437//!     let consumed0 = decoder.decode(&frame0)?;
438//!     assert_eq!(consumed0, frame0.len(), "decoder must consume the whole frame");
439//!     while let Some(batch) = decoder.flush()? { total_rows += batch.num_rows(); }
440//!
441//!     let consumed1 = decoder.decode(&frame1)?;
442//!     assert_eq!(consumed1, frame1.len(), "decoder must consume the whole frame");
443//!     while let Some(batch) = decoder.flush()? { total_rows += batch.num_rows(); }
444//!
445//!     // We sent 2 records so we should get 2 rows (possibly one per flush)
446//!     assert_eq!(total_rows, 2);
447//!     Ok(())
448//! }
449//! ```
450//!
451//! ## Schema evolution and batch boundaries
452//!
453//! `Decoder` supports mid‑stream schema changes when the input framing carries a schema
454//! fingerprint (single‑object or Confluent). When a new fingerprint is observed:
455//!
456//! * If the current `RecordBatch` is **empty**, the decoder switches to the new schema
457//!   immediately.
458//! * If not, the decoder finishes the current batch first and only then switches.
459//!
460//! Consequently, the schema of batches produced by `Decoder::flush` may change over time,
461//! and `Decoder` intentionally does **not** implement `RecordBatchReader`. In contrast,
462//! `Reader` (OCF) has a single writer schema for the entire file and therefore implements
463//! `RecordBatchReader`.
464//!
465//! ## Performance & memory
466//!
467//! * `batch_size` controls the maximum number of rows per `RecordBatch`. Larger batches
468//!   amortize per‑batch overhead; smaller batches reduce peak memory usage and latency.
469//! * When `utf8_view` is enabled, string columns use Arrow’s `StringViewArray`, which can
470//!   reduce allocations for short strings.
471//! * For OCF, blocks may be compressed; `Reader` will decompress using the codec specified
472//!   in the file header and feed uncompressed bytes to the row `Decoder`.
473//!
474//! ## Error handling
475//!
476//! * Incomplete inputs return parse errors with "Unexpected EOF"; callers typically provide
477//!   more bytes and try again.
478//! * If a fingerprint is unknown to the provided `SchemaStore`, decoding fails with a
479//!   descriptive error. Populate the store up front to avoid this.
480//!
481//! ---
482use crate::codec::AvroFieldBuilder;
483use crate::reader::header::read_header;
484use crate::schema::{
485    AvroSchema, CONFLUENT_MAGIC, Fingerprint, FingerprintAlgorithm, SCHEMA_METADATA_KEY,
486    SINGLE_OBJECT_MAGIC, Schema, SchemaStore,
487};
488use arrow_array::{RecordBatch, RecordBatchReader};
489use arrow_schema::{ArrowError, SchemaRef};
490use block::BlockDecoder;
491use header::Header;
492use indexmap::IndexMap;
493use record::RecordDecoder;
494use std::io::BufRead;
495
496mod block;
497mod cursor;
498mod header;
499mod record;
500mod vlq;
501
502fn is_incomplete_data(err: &ArrowError) -> bool {
503    matches!(
504        err,
505        ArrowError::ParseError(msg)
506            if msg.contains("Unexpected EOF")
507    )
508}
509
510/// A low‑level, push‑based decoder from Avro bytes to Arrow `RecordBatch`.
511///
512/// `Decoder` is designed for **streaming** scenarios:
513///
514/// * You *feed* freshly received bytes using `Self::decode`, potentially multiple times,
515///   until at least one row is complete.
516/// * You then *drain* completed rows with `Self::flush`, which yields a `RecordBatch`
517///   if any rows were finished since the last flush.
518///
519/// Unlike `Reader`, which is specialized for Avro **Object Container Files**, `Decoder`
520/// understands **framed single‑object** inputs and **Confluent Schema Registry** messages,
521/// switching schemas mid‑stream when the framing indicates a new fingerprint.
522///
523/// ### Supported prefixes
524///
525/// On each new row boundary, `Decoder` tries to match one of the following "prefixes":
526///
527/// * **Single‑Object encoding**: magic `0xC3 0x01` + schema fingerprint (length depends on
528///   the configured `FingerprintAlgorithm`); see `SINGLE_OBJECT_MAGIC`.
529/// * **Confluent wire format**: magic `0x00` + 4‑byte big‑endian schema id; see
530///   `CONFLUENT_MAGIC`.
531///
532/// The active fingerprint determines which cached row decoder is used to decode the following
533/// record body bytes.
534///
535/// ### Schema switching semantics
536///
537/// When a new fingerprint is observed:
538///
539/// * If the current batch is empty, the decoder switches immediately;
540/// * Otherwise, the current batch is finalized on the next `flush` and only then
541///   does the decoder switch to the new schema. This guarantees that a single `RecordBatch`
542///   never mixes rows with different schemas.
543///
544/// ### Examples
545///
546/// Build and use a `Decoder` for single‑object encoding:
547///
548/// ```
549/// use arrow_avro::schema::{AvroSchema, SchemaStore};
550/// use arrow_avro::reader::ReaderBuilder;
551///
552/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
553/// // Use a record schema at the top level so we can build an Arrow RecordBatch
554/// let mut store = SchemaStore::new(); // Rabin fingerprinting by default
555/// let avro = AvroSchema::new(
556///     r#"{"type":"record","name":"E","fields":[{"name":"x","type":"long"}]}"#.to_string()
557/// );
558/// let fp = store.register(avro)?;
559///
560/// // --- Hidden: write a single-object framed row {x:7} ---
561/// # use std::sync::Arc;
562/// # use std::collections::HashMap;
563/// # use arrow_array::{ArrayRef, Int64Array, RecordBatch};
564/// # use arrow_schema::{DataType, Field, Schema};
565/// # use arrow_avro::schema::{SCHEMA_METADATA_KEY, FingerprintStrategy};
566/// # use arrow_avro::writer::{WriterBuilder, format::AvroSoeFormat};
567/// # let mut md = HashMap::new();
568/// # md.insert(SCHEMA_METADATA_KEY.to_string(),
569/// #     r#"{"type":"record","name":"E","fields":[{"name":"x","type":"long"}]}"#.to_string());
570/// # let arrow = Schema::new_with_metadata(vec![Field::new("x", DataType::Int64, false)], md);
571/// # let batch = RecordBatch::try_new(Arc::new(arrow.clone()), vec![Arc::new(Int64Array::from(vec![7])) as ArrayRef])?;
572/// # let mut w = WriterBuilder::new(arrow)
573/// #     .with_fingerprint_strategy(fp.into())
574/// #     .build::<_, AvroSoeFormat>(Vec::new())?;
575/// # w.write(&batch)?; w.finish()?; let frame = w.into_inner();
576///
577/// let mut decoder = ReaderBuilder::new()
578///     .with_writer_schema_store(store)
579///     .with_batch_size(16)
580///     .build_decoder()?;
581///
582/// # decoder.decode(&frame)?;
583/// let batch = decoder.flush()?.expect("one row");
584/// assert_eq!(batch.num_rows(), 1);
585/// # Ok(()) }
586/// ```
587///
588/// *Background:* Avro's single‑object encoding is defined as `0xC3 0x01` + 8‑byte
589/// little‑endian CRC‑64‑AVRO fingerprint of the **writer schema** + Avro binary body.
590/// See the Avro 1.11.1 spec for details. <https://avro.apache.org/docs/1.11.1/specification/#single-object-encoding>
591///
592/// Build and use a `Decoder` for Confluent Registry messages:
593///
594/// ```
595/// use arrow_avro::schema::{AvroSchema, SchemaStore, Fingerprint, FingerprintAlgorithm};
596/// use arrow_avro::reader::ReaderBuilder;
597///
598/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
599/// let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
600/// store.set(Fingerprint::Id(1234), AvroSchema::new(r#"{"type":"record","name":"E","fields":[{"name":"x","type":"long"}]}"#.to_string()))?;
601///
602/// // --- Hidden: encode two Confluent-framed messages {x:1} and {x:2} ---
603/// # use std::sync::Arc;
604/// # use std::collections::HashMap;
605/// # use arrow_array::{ArrayRef, Int64Array, RecordBatch};
606/// # use arrow_schema::{DataType, Field, Schema};
607/// # use arrow_avro::schema::{SCHEMA_METADATA_KEY, FingerprintStrategy};
608/// # use arrow_avro::writer::{WriterBuilder, format::AvroSoeFormat};
609/// # fn msg(x: i64) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
610/// #   let mut md = HashMap::new();
611/// #   md.insert(SCHEMA_METADATA_KEY.to_string(),
612/// #     r#"{"type":"record","name":"E","fields":[{"name":"x","type":"long"}]}"#.to_string());
613/// #   let arrow = Schema::new_with_metadata(vec![Field::new("x", DataType::Int64, false)], md);
614/// #   let batch = RecordBatch::try_new(Arc::new(arrow.clone()), vec![Arc::new(Int64Array::from(vec![x])) as ArrayRef])?;
615/// #   let mut w = WriterBuilder::new(arrow)
616/// #       .with_fingerprint_strategy(FingerprintStrategy::Id(1234))
617/// #       .build::<_, AvroSoeFormat>(Vec::new())?;
618/// #   w.write(&batch)?; w.finish()?; Ok(w.into_inner())
619/// # }
620/// # let m1 = msg(1)?;
621/// # let m2 = msg(2)?;
622///
623/// let mut decoder = ReaderBuilder::new()
624///     .with_writer_schema_store(store)
625///     .build_decoder()?;
626/// # decoder.decode(&m1)?;
627/// # decoder.decode(&m2)?;
628/// let batch = decoder.flush()?.expect("two rows");
629/// assert_eq!(batch.num_rows(), 2);
630/// # Ok(()) }
631/// ```
632#[derive(Debug)]
633pub struct Decoder {
    // Row decoder currently in use: `decode` feeds record bodies to it and `schema`
    // reports its Arrow schema.
634    active_decoder: RecordDecoder,
    // NOTE(review): presumably the fingerprint of the schema behind `active_decoder`,
    // `None` while no fingerprint has been selected — confirm against `handle_prefix`.
635    active_fingerprint: Option<Fingerprint>,
    // Configured maximum number of rows per batch; exposed through `batch_size()`.
636    batch_size: usize,
    // Rows that may still be decoded before `decode` stops: decremented by one for every
    // decoded record body, and `decode` does no work once it reaches zero.
    // NOTE(review): presumably restored to `batch_size` by `flush` — confirm.
637    remaining_capacity: usize,
    // Row decoders keyed by schema fingerprint.
    // NOTE(review): presumably holds the decoders that are not currently active so a
    // schema switch can reuse them — confirm against the switching code.
638    cache: IndexMap<Fingerprint, RecordDecoder>,
    // NOTE(review): presumably selects which prefix layout (magic bytes and fingerprint
    // width) `handle_prefix` expects — confirm.
639    fingerprint_algorithm: FingerprintAlgorithm,
    // Fingerprint and row decoder of a schema that has been seen but not switched to yet.
    // `decode` calls `apply_pending_schema_if_batch_empty` after every consumed prefix.
    // NOTE(review): presumably also applied when the current batch is flushed — confirm.
640    pending_schema: Option<(Fingerprint, RecordDecoder)>,
    // True once a prefix has been consumed and the matching record body is still to be
    // decoded; cleared after that body is decoded. Lets a body that straddles two
    // `decode` calls be resumed without re-reading the prefix.
641    awaiting_body: bool,
642}
643
644impl Decoder {
645    /// Returns the Arrow schema for the rows decoded by this decoder.
646    ///
647    /// **Note:** With single‑object or Confluent framing, the schema may change
648    /// at a row boundary when the input indicates a new fingerprint.
649    pub fn schema(&self) -> SchemaRef {
650        self.active_decoder.schema().clone()
651    }
652
653    /// Returns the configured maximum number of rows per batch.
654    pub fn batch_size(&self) -> usize {
655        self.batch_size
656    }
657
658    /// Feed a chunk of bytes into the decoder.
659    ///
660    /// This will:
661    ///
662    /// * Decode at most `Self::batch_size` rows;
663    /// * Return the number of input bytes **consumed** from `data` (which may be 0 if more
664    ///   bytes are required, or less than `data.len()` if a prefix/body straddles the
665    ///   chunk boundary);
666    /// * Defer producing a `RecordBatch` until you call `Self::flush`.
667    ///
668    /// # Returns
669    /// The number of bytes consumed from `data`.
670    ///
671    /// # Errors
672    /// Returns an error if:
673    ///
674    /// * The input indicates an unknown fingerprint (not present in the provided
675    ///   `SchemaStore`;
676    /// * The Avro body is malformed;
677    /// * A strict‑mode union rule is violated (see `ReaderBuilder::with_strict_mode`).
678    pub fn decode(&mut self, data: &[u8]) -> Result<usize, ArrowError> {
679        let mut total_consumed = 0usize;
680        while total_consumed < data.len() && self.remaining_capacity > 0 {
681            if self.awaiting_body {
682                match self.active_decoder.decode(&data[total_consumed..], 1) {
683                    Ok(n) => {
684                        self.remaining_capacity -= 1;
685                        total_consumed += n;
686                        self.awaiting_body = false;
687                        continue;
688                    }
689                    Err(ref e) if is_incomplete_data(e) => break,
690                    err => return err,
691                };
692            }
693            match self.handle_prefix(&data[total_consumed..])? {
694                Some(0) => break, // Insufficient bytes
695                Some(n) => {
696                    total_consumed += n;
697                    self.apply_pending_schema_if_batch_empty();
698                    self.awaiting_body = true;
699                }
700                None => {
701                    return Err(ArrowError::ParseError(
702                        "Missing magic bytes and fingerprint".to_string(),
703                    ));
704                }
705            }
706        }
707        Ok(total_consumed)
708    }
709
710    // Attempt to handle a prefix at the current position.
711    // * Ok(None) – buffer does not start with the prefix.
712    // * Ok(Some(0)) – prefix detected, but the buffer is too short; caller should await more bytes.
713    // * Ok(Some(n)) – consumed `n > 0` bytes of a complete prefix (magic and fingerprint).
714    fn handle_prefix(&mut self, buf: &[u8]) -> Result<Option<usize>, ArrowError> {
715        match self.fingerprint_algorithm {
716            FingerprintAlgorithm::Rabin => {
717                self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
718                    Fingerprint::Rabin(u64::from_le_bytes(bytes))
719                })
720            }
721            FingerprintAlgorithm::Id => self.handle_prefix_common(buf, &CONFLUENT_MAGIC, |bytes| {
722                Fingerprint::Id(u32::from_be_bytes(bytes))
723            }),
724            FingerprintAlgorithm::Id64 => {
725                self.handle_prefix_common(buf, &CONFLUENT_MAGIC, |bytes| {
726                    Fingerprint::Id64(u64::from_be_bytes(bytes))
727                })
728            }
729            #[cfg(feature = "md5")]
730            FingerprintAlgorithm::MD5 => {
731                self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
732                    Fingerprint::MD5(bytes)
733                })
734            }
735            #[cfg(feature = "sha256")]
736            FingerprintAlgorithm::SHA256 => {
737                self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
738                    Fingerprint::SHA256(bytes)
739                })
740            }
741        }
742    }
743
744    /// This method checks for the provided `magic` bytes at the start of `buf` and, if present,
745    /// attempts to read the following fingerprint of `N` bytes, converting it to a
746    /// `Fingerprint` using `fingerprint_from`.
747    fn handle_prefix_common<const MAGIC_LEN: usize, const N: usize>(
748        &mut self,
749        buf: &[u8],
750        magic: &[u8; MAGIC_LEN],
751        fingerprint_from: impl FnOnce([u8; N]) -> Fingerprint,
752    ) -> Result<Option<usize>, ArrowError> {
753        // Need at least the magic bytes to decide
754        // 2 bytes for Avro Spec and 1 byte for Confluent Wire Protocol.
755        if buf.len() < MAGIC_LEN {
756            return Ok(Some(0));
757        }
758        // Bail out early if the magic does not match.
759        if &buf[..MAGIC_LEN] != magic {
760            return Ok(None);
761        }
762        // Try to parse the fingerprint that follows the magic.
763        let consumed_fp = self.handle_fingerprint(&buf[MAGIC_LEN..], fingerprint_from)?;
764        // Convert the inner result into a “bytes consumed” count.
765        // NOTE: Incomplete fingerprint consumes no bytes.
766        Ok(Some(consumed_fp.map_or(0, |n| n + MAGIC_LEN)))
767    }
768
769    // Attempts to read and install a new fingerprint of `N` bytes.
770    //
771    // * Ok(None) – insufficient bytes (`buf.len() < `N`).
772    // * Ok(Some(N)) – fingerprint consumed (always `N`).
773    fn handle_fingerprint<const N: usize>(
774        &mut self,
775        buf: &[u8],
776        fingerprint_from: impl FnOnce([u8; N]) -> Fingerprint,
777    ) -> Result<Option<usize>, ArrowError> {
778        // Need enough bytes to get fingerprint (next N bytes)
779        let Some(fingerprint_bytes) = buf.get(..N) else {
780            return Ok(None); // insufficient bytes
781        };
782        // SAFETY: length checked above.
783        let new_fingerprint = fingerprint_from(fingerprint_bytes.try_into().unwrap());
784        // If the fingerprint indicates a schema change, prepare to switch decoders.
785        if self.active_fingerprint != Some(new_fingerprint) {
786            let Some(new_decoder) = self.cache.shift_remove(&new_fingerprint) else {
787                return Err(ArrowError::ParseError(format!(
788                    "Unknown fingerprint: {new_fingerprint:?}"
789                )));
790            };
791            self.pending_schema = Some((new_fingerprint, new_decoder));
792            // If there are already decoded rows, we must flush them first.
793            // Reducing `remaining_capacity` to 0 ensures `flush` is called next.
794            if self.remaining_capacity < self.batch_size {
795                self.remaining_capacity = 0;
796            }
797        }
798        Ok(Some(N))
799    }
800
801    fn apply_pending_schema(&mut self) {
802        if let Some((new_fingerprint, new_decoder)) = self.pending_schema.take() {
803            if let Some(old_fingerprint) = self.active_fingerprint.replace(new_fingerprint) {
804                let old_decoder = std::mem::replace(&mut self.active_decoder, new_decoder);
805                self.cache.shift_remove(&old_fingerprint);
806                self.cache.insert(old_fingerprint, old_decoder);
807            } else {
808                self.active_decoder = new_decoder;
809            }
810        }
811    }
812
813    fn apply_pending_schema_if_batch_empty(&mut self) {
814        if self.batch_is_empty() {
815            self.apply_pending_schema();
816        }
817    }
818
819    fn flush_and_reset(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
820        if self.batch_is_empty() {
821            return Ok(None);
822        }
823        let batch = self.active_decoder.flush()?;
824        self.remaining_capacity = self.batch_size;
825        Ok(Some(batch))
826    }
827
828    /// Produce a `RecordBatch` if at least one row is fully decoded, returning
829    /// `Ok(None)` if no new rows are available.
830    ///
831    /// If a schema change was detected while decoding rows for the current batch, the
832    /// schema switch is applied **after** flushing this batch, so the **next** batch
833    /// (if any) may have a different schema.
834    pub fn flush(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
835        // We must flush the active decoder before switching to the pending one.
836        let batch = self.flush_and_reset();
837        self.apply_pending_schema();
838        batch
839    }
840
841    /// Returns the number of rows that can be added to this decoder before it is full.
842    pub fn capacity(&self) -> usize {
843        self.remaining_capacity
844    }
845
846    /// Returns true if the decoder has reached its capacity for the current batch.
847    pub fn batch_is_full(&self) -> bool {
848        self.remaining_capacity == 0
849    }
850
851    /// Returns true if the decoder has not decoded any batches yet (i.e., the current batch is empty).
852    pub fn batch_is_empty(&self) -> bool {
853        self.remaining_capacity == self.batch_size
854    }
855
856    // Decode either the block count or remaining capacity from `data` (an OCF block payload).
857    //
858    // Returns the number of bytes consumed from `data` along with the number of records decoded.
859    fn decode_block(&mut self, data: &[u8], count: usize) -> Result<(usize, usize), ArrowError> {
860        // OCF decoding never interleaves records across blocks, so no chunking.
861        let to_decode = std::cmp::min(count, self.remaining_capacity);
862        if to_decode == 0 {
863            return Ok((0, 0));
864        }
865        let consumed = self.active_decoder.decode(data, to_decode)?;
866        self.remaining_capacity -= to_decode;
867        Ok((consumed, to_decode))
868    }
869
870    // Produce a `RecordBatch` if at least one row is fully decoded, returning
871    // `Ok(None)` if no new rows are available.
872    fn flush_block(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
873        self.flush_and_reset()
874    }
875}
876
877/// A builder that configures and constructs Avro readers and decoders.
878///
879/// `ReaderBuilder` is the primary entry point for this module. It supports:
880///
881/// * OCF reading via `Self::build`, returning a `Reader` over any `BufRead`;
882/// * streaming decoding via `Self::build_decoder`, returning a `Decoder`.
883///
884/// ### Options
885///
886/// * **`batch_size`**: Max rows per `RecordBatch` (default: `1024`). See `Self::with_batch_size`.
887/// * **`utf8_view`**: Use Arrow `StringViewArray` for string columns (default: `false`).
888///   See `Self::with_utf8_view`.
889/// * **`strict_mode`**: Opt‑in to stricter union handling (default: `false`).
890///   See `Self::with_strict_mode`.
891/// * **`reader_schema`**: Optional reader schema (projection / evolution) used when decoding
892///   values (default: `None`). See `Self::with_reader_schema`.
893/// * **`projection`**: Optional projection of **top‑level record fields** by index (default: `None`).
894///
895///   If set, the effective reader schema is **pruned** to include only the projected fields, in the
896///   specified order:
897///
898///   * If a reader schema is provided, that schema is pruned.
899///   * Otherwise, a reader schema is derived from the writer schema and then pruned.
900///   * For streaming `Decoder` with multiple writer schemas and no reader schema, a projected reader
901///     schema is derived **per writer schema** in the `SchemaStore`.
902///
903///   See `Self::with_projection`.
904/// * **`writer_schema_store`**: Required for building a `Decoder` for single‑object or
905///   Confluent framing. Maps fingerprints to Avro schemas. See `Self::with_writer_schema_store`.
906/// * **`active_fingerprint`**: Optional starting fingerprint for streaming decode when the
907///   first frame omits one (rare). See `Self::with_active_fingerprint`.
908///
909/// ### Examples
910///
911/// Read an OCF file in batches of 4096 rows:
912///
913/// ```no_run
914/// use std::fs::File;
915/// use std::io::BufReader;
916/// use arrow_avro::reader::ReaderBuilder;
917///
918/// let file = File::open("data.avro")?;
919/// let mut reader = ReaderBuilder::new()
920///     .with_batch_size(4096)
921///     .build(BufReader::new(file))?;
922/// # Ok::<(), Box<dyn std::error::Error>>(())
923/// ```
924///
925/// Build a `Decoder` for Confluent messages:
926///
927/// ```
928/// use arrow_avro::schema::{AvroSchema, SchemaStore, Fingerprint, FingerprintAlgorithm};
929/// use arrow_avro::reader::ReaderBuilder;
930///
931/// let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
932/// store.set(Fingerprint::Id(1234), AvroSchema::new(r#"{"type":"record","name":"E","fields":[]}"#.to_string()))?;
933///
934/// let decoder = ReaderBuilder::new()
935///     .with_writer_schema_store(store)
936///     .build_decoder()?;
937/// # Ok::<(), Box<dyn std::error::Error>>(())
938/// ```
#[derive(Debug)]
pub struct ReaderBuilder {
    /// Maximum number of rows per produced batch (default `1024`).
    batch_size: usize,
    /// Forwarded to `AvroFieldBuilder::with_strict_mode` when building record decoders.
    strict_mode: bool,
    /// Forwarded to `AvroFieldBuilder::with_utf8view` when building record decoders.
    utf8_view: bool,
    /// Optional reader schema handed to the decoder construction.
    reader_schema: Option<AvroSchema>,
    /// Optional top-level field indices used to prune the effective reader schema.
    projection: Option<Vec<usize>>,
    /// Fingerprint-to-schema store; required by `build_decoder`.
    writer_schema_store: Option<SchemaStore>,
    /// Optional starting fingerprint for streaming decode; when unset the first
    /// fingerprint of the schema store is used.
    active_fingerprint: Option<Fingerprint>,
}
949
950impl Default for ReaderBuilder {
951    fn default() -> Self {
952        Self {
953            batch_size: 1024,
954            strict_mode: false,
955            utf8_view: false,
956            reader_schema: None,
957            projection: None,
958            writer_schema_store: None,
959            active_fingerprint: None,
960        }
961    }
962}
963
964impl ReaderBuilder {
965    /// Creates a new `ReaderBuilder` with defaults:
966    ///
967    /// * `batch_size = 1024`
968    /// * `strict_mode = false`
969    /// * `utf8_view = false`
970    /// * `reader_schema = None`
971    /// * `projection = None`
972    /// * `writer_schema_store = None`
973    /// * `active_fingerprint = None`
974    pub fn new() -> Self {
975        Self::default()
976    }
977
978    fn make_record_decoder(
979        &self,
980        writer_schema: &Schema,
981        reader_schema: Option<&Schema>,
982    ) -> Result<RecordDecoder, ArrowError> {
983        let mut builder = AvroFieldBuilder::new(writer_schema);
984        if let Some(reader_schema) = reader_schema {
985            builder = builder.with_reader_schema(reader_schema);
986        }
987        let root = builder
988            .with_utf8view(self.utf8_view)
989            .with_strict_mode(self.strict_mode)
990            .build()?;
991        RecordDecoder::try_new_with_options(root.data_type())
992    }
993
994    fn make_record_decoder_from_schemas(
995        &self,
996        writer_schema: &Schema,
997        reader_schema: Option<&AvroSchema>,
998    ) -> Result<RecordDecoder, ArrowError> {
999        let reader_schema_raw = reader_schema.map(|s| s.schema()).transpose()?;
1000        self.make_record_decoder(writer_schema, reader_schema_raw.as_ref())
1001    }
1002
1003    fn make_decoder_with_parts(
1004        &self,
1005        active_decoder: RecordDecoder,
1006        active_fingerprint: Option<Fingerprint>,
1007        cache: IndexMap<Fingerprint, RecordDecoder>,
1008        fingerprint_algorithm: FingerprintAlgorithm,
1009    ) -> Decoder {
1010        Decoder {
1011            batch_size: self.batch_size,
1012            remaining_capacity: self.batch_size,
1013            active_fingerprint,
1014            active_decoder,
1015            cache,
1016            fingerprint_algorithm,
1017            pending_schema: None,
1018            awaiting_body: false,
1019        }
1020    }
1021
1022    fn make_decoder(
1023        &self,
1024        header: Option<&Header>,
1025        reader_schema: Option<&AvroSchema>,
1026    ) -> Result<Decoder, ArrowError> {
1027        if let Some(hdr) = header {
1028            let writer_schema = hdr
1029                .schema()
1030                .map_err(|e| ArrowError::ExternalError(Box::new(e)))?
1031                .ok_or_else(|| {
1032                    ArrowError::ParseError("No Avro schema present in file header".into())
1033                })?;
1034            let projected_reader_schema = self
1035                .projection
1036                .as_deref()
1037                .map(|projection| {
1038                    let base_schema = if let Some(reader_schema) = reader_schema {
1039                        reader_schema.clone()
1040                    } else {
1041                        let raw = hdr.get(SCHEMA_METADATA_KEY).ok_or_else(|| {
1042                            ArrowError::ParseError(
1043                                "No Avro schema present in file header".to_string(),
1044                            )
1045                        })?;
1046                        let json_string = std::str::from_utf8(raw)
1047                            .map_err(|e| {
1048                                ArrowError::ParseError(format!(
1049                                    "Invalid UTF-8 in Avro schema header: {e}"
1050                                ))
1051                            })?
1052                            .to_string();
1053                        AvroSchema::new(json_string)
1054                    };
1055                    base_schema.project(projection)
1056                })
1057                .transpose()?;
1058            let effective_reader_schema = projected_reader_schema.as_ref().or(reader_schema);
1059            let record_decoder =
1060                self.make_record_decoder_from_schemas(&writer_schema, effective_reader_schema)?;
1061            return Ok(self.make_decoder_with_parts(
1062                record_decoder,
1063                None,
1064                IndexMap::new(),
1065                FingerprintAlgorithm::Rabin,
1066            ));
1067        }
1068        let store = self.writer_schema_store.as_ref().ok_or_else(|| {
1069            ArrowError::ParseError("Writer schema store required for raw Avro".into())
1070        })?;
1071        let fingerprints = store.fingerprints();
1072        if fingerprints.is_empty() {
1073            return Err(ArrowError::ParseError(
1074                "Writer schema store must contain at least one schema".into(),
1075            ));
1076        }
1077        let start_fingerprint = self
1078            .active_fingerprint
1079            .or_else(|| fingerprints.first().copied())
1080            .ok_or_else(|| {
1081                ArrowError::ParseError("Could not determine initial schema fingerprint".into())
1082            })?;
1083        let projection = self.projection.as_deref();
1084        let projected_reader_schema = match (projection, reader_schema) {
1085            (Some(projection), Some(reader_schema)) => Some(reader_schema.project(projection)?),
1086            _ => None,
1087        };
1088        let mut cache = IndexMap::with_capacity(fingerprints.len().saturating_sub(1));
1089        let mut active_decoder: Option<RecordDecoder> = None;
1090        for fingerprint in store.fingerprints() {
1091            let avro_schema = match store.lookup(&fingerprint) {
1092                Some(schema) => schema,
1093                None => {
1094                    return Err(ArrowError::ComputeError(format!(
1095                        "Fingerprint {fingerprint:?} not found in schema store",
1096                    )));
1097                }
1098            };
1099            let writer_schema = avro_schema.schema()?;
1100            let record_decoder = match projection {
1101                None => self.make_record_decoder_from_schemas(&writer_schema, reader_schema)?,
1102                Some(projection) => {
1103                    if let Some(ref pruned_reader_schema) = projected_reader_schema {
1104                        self.make_record_decoder_from_schemas(
1105                            &writer_schema,
1106                            Some(pruned_reader_schema),
1107                        )?
1108                    } else {
1109                        let derived_reader_schema = avro_schema.project(projection)?;
1110                        self.make_record_decoder_from_schemas(
1111                            &writer_schema,
1112                            Some(&derived_reader_schema),
1113                        )?
1114                    }
1115                }
1116            };
1117            if fingerprint == start_fingerprint {
1118                active_decoder = Some(record_decoder);
1119            } else {
1120                cache.insert(fingerprint, record_decoder);
1121            }
1122        }
1123        let active_decoder = active_decoder.ok_or_else(|| {
1124            ArrowError::ComputeError(format!(
1125                "Initial fingerprint {start_fingerprint:?} not found in schema store"
1126            ))
1127        })?;
1128        Ok(self.make_decoder_with_parts(
1129            active_decoder,
1130            Some(start_fingerprint),
1131            cache,
1132            store.fingerprint_algorithm(),
1133        ))
1134    }
1135
1136    /// Sets the **row‑based batch size**.
1137    ///
1138    /// Each call to `Decoder::flush` or each iteration of `Reader` yields a batch with
1139    /// *up to* this many rows. Larger batches can reduce overhead; smaller batches can
1140    /// reduce peak memory usage and latency.
1141    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
1142        self.batch_size = batch_size;
1143        self
1144    }
1145
1146    /// Choose Arrow's `StringViewArray` for UTF‑8 string data.
1147    ///
1148    /// When enabled, textual Avro fields are loaded into Arrow’s **StringViewArray**
1149    /// instead of the standard `StringArray`. This can improve performance for workloads
1150    /// with many short strings by reducing allocations.
1151    pub fn with_utf8_view(mut self, utf8_view: bool) -> Self {
1152        self.utf8_view = utf8_view;
1153        self
1154    }
1155
1156    /// Returns whether `StringViewArray` is enabled for string data.
1157    pub fn use_utf8view(&self) -> bool {
1158        self.utf8_view
1159    }
1160
1161    /// Enable stricter behavior for certain Avro unions (e.g., `[T, "null"]`).
1162    ///
1163    /// When `true`, ambiguous or lossy unions that would otherwise be coerced may instead
1164    /// produce a descriptive error. Use this to catch schema issues early during ingestion.
1165    pub fn with_strict_mode(mut self, strict_mode: bool) -> Self {
1166        self.strict_mode = strict_mode;
1167        self
1168    }
1169
1170    /// Sets the **reader schema** used during decoding.
1171    ///
1172    /// If not provided, the writer schema from the OCF header (for `Reader`) or the
1173    /// schema looked up from the fingerprint (for `Decoder`) is used directly.
1174    ///
1175    /// A reader schema can be used for **schema evolution** or **projection**.
1176    pub fn with_reader_schema(mut self, schema: AvroSchema) -> Self {
1177        self.reader_schema = Some(schema);
1178        self
1179    }
1180
1181    /// Sets an explicit top-level field projection by index.
1182    ///
1183    /// The provided `projection` is a list of indices into the **top-level record** fields.
1184    /// The output schema will contain only these fields, in the specified order.
1185    ///
1186    /// Internally, this is implemented by pruning the effective Avro *reader schema*:
1187    ///
1188    /// * If a reader schema is provided via `Self::with_reader_schema`, that schema is pruned.
1189    /// * Otherwise, a reader schema is derived from the writer schema and then pruned.
1190    /// * For streaming `Decoder` with multiple writer schemas and no reader schema, a projected
1191    ///   reader schema is derived **per writer schema** in the `SchemaStore`.
1192    ///
1193    /// # Example
1194    ///
1195    /// Read only specific columns from an Avro OCF file:
1196    ///
1197    /// ```
1198    /// use std::io::Cursor;
1199    /// use std::sync::Arc;
1200    /// use arrow_array::{ArrayRef, Int32Array, StringArray, Float64Array, RecordBatch};
1201    /// use arrow_schema::{DataType, Field, Schema};
1202    /// use arrow_avro::writer::AvroWriter;
1203    /// use arrow_avro::reader::ReaderBuilder;
1204    ///
1205    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
1206    /// // Original schema has three fields: id, name, value
1207    /// let schema = Schema::new(vec![
1208    ///     Field::new("id", DataType::Int32, false),
1209    ///     Field::new("name", DataType::Utf8, false),
1210    ///     Field::new("value", DataType::Float64, false),
1211    /// ]);
1212    /// let batch = RecordBatch::try_new(
1213    ///     Arc::new(schema.clone()),
1214    ///     vec![
1215    ///         Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
1216    ///         Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef,
1217    ///         Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])) as ArrayRef,
1218    ///     ],
1219    /// )?;
1220    ///
1221    /// // Write Avro OCF
1222    /// let mut writer = AvroWriter::new(Vec::new(), schema)?;
1223    /// writer.write(&batch)?;
1224    /// writer.finish()?;
1225    /// let bytes = writer.into_inner();
1226    ///
1227    /// // Read only fields at indices 2 and 0 (value, id) — in that order
1228    /// let mut reader = ReaderBuilder::new()
1229    ///     .with_projection(vec![2, 0])
1230    ///     .build(Cursor::new(bytes))?;
1231    ///
1232    /// let out = reader.next().unwrap()?;
1233    /// assert_eq!(out.num_columns(), 2);
1234    /// assert_eq!(out.schema().field(0).name(), "value");
1235    /// assert_eq!(out.schema().field(1).name(), "id");
1236    /// # Ok(()) }
1237    /// ```
1238    pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
1239        self.projection = Some(projection);
1240        self
1241    }
1242
1243    /// Sets the `SchemaStore` used to resolve writer schemas by fingerprint.
1244    ///
1245    /// This is required when building a `Decoder` for **single‑object encoding** or the
1246    /// **Confluent** wire format. The store maps a fingerprint (Rabin / MD5 / SHA‑256 /
1247    /// ID) to a full Avro schema.
1248    ///
1249    /// Defaults to `None`.
1250    pub fn with_writer_schema_store(mut self, store: SchemaStore) -> Self {
1251        self.writer_schema_store = Some(store);
1252        self
1253    }
1254
1255    /// Sets the initial schema fingerprint for stream decoding.
1256    ///
1257    /// This can be useful for streams that **do not include** a fingerprint before the first
1258    /// record body (uncommon). If not set, the first observed fingerprint is used.
1259    pub fn with_active_fingerprint(mut self, fp: Fingerprint) -> Self {
1260        self.active_fingerprint = Some(fp);
1261        self
1262    }
1263
1264    /// Build a `Reader` (OCF) from this builder and a `BufRead`.
1265    ///
1266    /// This reads and validates the OCF header, initializes an internal row decoder from
1267    /// the discovered writer (and optional reader) schema, and prepares to iterate blocks,
1268    /// decompressing if necessary.
1269    pub fn build<R: BufRead>(self, mut reader: R) -> Result<Reader<R>, ArrowError> {
1270        let header = read_header(&mut reader)?;
1271        let decoder = self.make_decoder(Some(&header), self.reader_schema.as_ref())?;
1272        Ok(Reader {
1273            reader,
1274            header,
1275            decoder,
1276            block_decoder: BlockDecoder::default(),
1277            block_data: Vec::new(),
1278            block_count: 0,
1279            block_cursor: 0,
1280            finished: false,
1281        })
1282    }
1283
1284    /// Build a streaming `Decoder` from this builder.
1285    ///
1286    /// # Requirements
1287    /// * `SchemaStore` **must** be provided via `Self::with_writer_schema_store`.
1288    /// * The store should contain **all** fingerprints that may appear on the stream.
1289    ///
1290    /// # Errors
1291    /// * Returns [`ArrowError::InvalidArgumentError`] if the schema store is missing
1292    pub fn build_decoder(self) -> Result<Decoder, ArrowError> {
1293        if self.writer_schema_store.is_none() {
1294            return Err(ArrowError::InvalidArgumentError(
1295                "Building a decoder requires a writer schema store".to_string(),
1296            ));
1297        }
1298        self.make_decoder(None, self.reader_schema.as_ref())
1299    }
1300}
1301
/// A high‑level Avro **Object Container File** reader.
///
/// `Reader` pulls blocks from a `BufRead` source, handles optional block compression,
/// and decodes them row‑by‑row into Arrow `RecordBatch` values using an internal
/// `Decoder`. It implements both:
///
/// * [`Iterator<Item = Result<RecordBatch, ArrowError>>`], and
/// * `RecordBatchReader`, guaranteeing a consistent schema across all produced batches.
#[derive(Debug)]
pub struct Reader<R: BufRead> {
    /// Underlying buffered input.
    reader: R,
    /// Parsed OCF header; also consulted for the block compression codec.
    header: Header,
    /// Row decoder that accumulates decoded rows until a batch is flushed.
    decoder: Decoder,
    /// Decodes block framing from the raw input.
    block_decoder: BlockDecoder,
    /// Payload of the current block, decompressed if a codec is configured.
    block_data: Vec<u8>,
    /// Records of the current block that have not been decoded yet.
    block_count: usize,
    /// Byte offset into `block_data` of the next undecoded record.
    block_cursor: usize,
    /// Set once the input has reported end of data.
    finished: bool,
}
1322
1323impl<R: BufRead> Reader<R> {
1324    /// Returns the Arrow schema discovered from the Avro file header (or derived via
1325    /// the optional reader schema).
1326    pub fn schema(&self) -> SchemaRef {
1327        self.decoder.schema()
1328    }
1329
1330    /// Returns a reference to the parsed Avro container‑file header (magic, metadata, codec, sync).
1331    pub fn avro_header(&self) -> &Header {
1332        &self.header
1333    }
1334
1335    /// Reads the next `RecordBatch` from the Avro file, or `Ok(None)` on EOF.
1336    ///
1337    /// Batches are bounded by `batch_size`; a single OCF block may yield multiple batches,
1338    /// and a batch may also span multiple blocks.
1339    fn read(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
1340        'outer: while !self.finished && !self.decoder.batch_is_full() {
1341            while self.block_cursor == self.block_data.len() {
1342                let buf = self.reader.fill_buf()?;
1343                if buf.is_empty() {
1344                    self.finished = true;
1345                    break 'outer;
1346                }
1347                // Try to decode another block from the buffered reader.
1348                let consumed = self.block_decoder.decode(buf)?;
1349                self.reader.consume(consumed);
1350                if let Some(block) = self.block_decoder.flush() {
1351                    // Successfully decoded a block.
1352                    self.block_data = if let Some(ref codec) = self.header.compression()? {
1353                        codec.decompress(&block.data)?
1354                    } else {
1355                        block.data
1356                    };
1357                    self.block_count = block.count;
1358                    self.block_cursor = 0;
1359                } else if consumed == 0 {
1360                    // The block decoder made no progress on a non-empty buffer.
1361                    return Err(ArrowError::ParseError(
1362                        "Could not decode next Avro block from partial data".to_string(),
1363                    ));
1364                }
1365            }
1366            // Decode as many rows as will fit in the current batch
1367            if self.block_cursor < self.block_data.len() {
1368                let (consumed, records_decoded) = self
1369                    .decoder
1370                    .decode_block(&self.block_data[self.block_cursor..], self.block_count)?;
1371                self.block_cursor += consumed;
1372                self.block_count -= records_decoded;
1373            }
1374        }
1375        self.decoder.flush_block()
1376    }
1377}
1378
1379impl<R: BufRead> Iterator for Reader<R> {
1380    type Item = Result<RecordBatch, ArrowError>;
1381
1382    fn next(&mut self) -> Option<Self::Item> {
1383        self.read().transpose()
1384    }
1385}
1386
1387impl<R: BufRead> RecordBatchReader for Reader<R> {
1388    fn schema(&self) -> SchemaRef {
1389        self.schema()
1390    }
1391}
1392
1393#[cfg(test)]
1394mod test {
1395    use crate::codec::AvroFieldBuilder;
1396    use crate::reader::record::RecordDecoder;
1397    use crate::reader::{Decoder, Reader, ReaderBuilder};
1398    use crate::schema::{
1399        AVRO_ENUM_SYMBOLS_METADATA_KEY, AVRO_NAME_METADATA_KEY, AVRO_NAMESPACE_METADATA_KEY,
1400        AvroSchema, CONFLUENT_MAGIC, Fingerprint, FingerprintAlgorithm, PrimitiveType,
1401        SINGLE_OBJECT_MAGIC, SchemaStore,
1402    };
1403    use crate::test_util::arrow_test_data;
1404    use crate::writer::AvroWriter;
1405    use arrow_array::builder::{
1406        ArrayBuilder, BooleanBuilder, Float32Builder, Int32Builder, Int64Builder, ListBuilder,
1407        MapBuilder, StringBuilder, StructBuilder,
1408    };
1409    #[cfg(feature = "snappy")]
1410    use arrow_array::builder::{Float64Builder, MapFieldNames};
1411    use arrow_array::cast::AsArray;
1412    #[cfg(not(feature = "avro_custom_types"))]
1413    use arrow_array::types::Int64Type;
1414    #[cfg(feature = "avro_custom_types")]
1415    use arrow_array::types::{
1416        DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
1417        DurationSecondType,
1418    };
1419    use arrow_array::types::{Int32Type, IntervalMonthDayNanoType};
1420    use arrow_array::*;
1421    #[cfg(feature = "snappy")]
1422    use arrow_buffer::{Buffer, NullBuffer};
1423    use arrow_buffer::{IntervalMonthDayNano, OffsetBuffer, ScalarBuffer, i256};
1424    #[cfg(feature = "avro_custom_types")]
1425    use arrow_schema::{
1426        ArrowError, DataType, Field, FieldRef, Fields, IntervalUnit, Schema, TimeUnit, UnionFields,
1427        UnionMode,
1428    };
1429    #[cfg(not(feature = "avro_custom_types"))]
1430    use arrow_schema::{
1431        ArrowError, DataType, Field, FieldRef, Fields, IntervalUnit, Schema, UnionFields, UnionMode,
1432    };
1433    use bytes::Bytes;
1434    use futures::executor::block_on;
1435    use futures::{Stream, StreamExt, TryStreamExt, stream};
1436    use serde_json::{Value, json};
1437    use std::collections::HashMap;
1438    use std::fs::File;
1439    use std::io::{BufReader, Cursor};
1440    use std::sync::Arc;
1441
1442    fn files() -> impl Iterator<Item = &'static str> {
1443        [
1444            // TODO: avoid requiring snappy for this file
1445            #[cfg(feature = "snappy")]
1446            "avro/alltypes_plain.avro",
1447            #[cfg(feature = "snappy")]
1448            "avro/alltypes_plain.snappy.avro",
1449            #[cfg(feature = "zstd")]
1450            "avro/alltypes_plain.zstandard.avro",
1451            #[cfg(feature = "bzip2")]
1452            "avro/alltypes_plain.bzip2.avro",
1453            #[cfg(feature = "xz")]
1454            "avro/alltypes_plain.xz.avro",
1455        ]
1456        .into_iter()
1457    }
1458
1459    fn read_file(path: &str, batch_size: usize, utf8_view: bool) -> RecordBatch {
1460        let file = File::open(path).unwrap();
1461        let reader = ReaderBuilder::new()
1462            .with_batch_size(batch_size)
1463            .with_utf8_view(utf8_view)
1464            .build(BufReader::new(file))
1465            .unwrap();
1466        let schema = reader.schema();
1467        let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
1468        arrow::compute::concat_batches(&schema, &batches).unwrap()
1469    }
1470
1471    fn read_file_strict(
1472        path: &str,
1473        batch_size: usize,
1474        utf8_view: bool,
1475    ) -> Result<Reader<BufReader<File>>, ArrowError> {
1476        let file = File::open(path)?;
1477        ReaderBuilder::new()
1478            .with_batch_size(batch_size)
1479            .with_utf8_view(utf8_view)
1480            .with_strict_mode(true)
1481            .build(BufReader::new(file))
1482    }
1483
    /// Test helper that wraps a `Decoder` and a stream of byte chunks into a stream
    /// of decoded record batches.
    ///
    /// Feeds one chunk from `input` to `decoder.decode`, raising a `ParseError` if the
    /// decoder reports fewer consumed bytes than the chunk contains, and then yields
    /// the batch returned by `decoder.flush()`, if there is one.
    ///
    /// NOTE(review): only the first item of `input` is pulled (`if let`, not a loop);
    /// any later items are never polled — confirm this is intentional.
    fn decode_stream<S: Stream<Item = Bytes> + Unpin>(
        mut decoder: Decoder,
        mut input: S,
    ) -> impl Stream<Item = Result<RecordBatch, ArrowError>> {
        async_stream::try_stream! {
            if let Some(data) = input.next().await {
                let consumed = decoder.decode(&data)?;
                // A short read means the decoder stopped before the end of the chunk.
                if consumed < data.len() {
                    Err(ArrowError::ParseError(
                        "did not consume all bytes".to_string(),
                    ))?;
                }
            }
            // Emit whatever rows were buffered by the decode call above.
            if let Some(batch) = decoder.flush()? {
                yield batch
            }
        }
    }
1502
1503    fn make_record_schema(pt: PrimitiveType) -> AvroSchema {
1504        let js = format!(
1505            r#"{{"type":"record","name":"TestRecord","fields":[{{"name":"a","type":"{}"}}]}}"#,
1506            pt.as_ref()
1507        );
1508        AvroSchema::new(js)
1509    }
1510
1511    fn make_two_schema_store() -> (
1512        SchemaStore,
1513        Fingerprint,
1514        Fingerprint,
1515        AvroSchema,
1516        AvroSchema,
1517    ) {
1518        let schema_int = make_record_schema(PrimitiveType::Int);
1519        let schema_long = make_record_schema(PrimitiveType::Long);
1520        let mut store = SchemaStore::new();
1521        let fp_int = store
1522            .register(schema_int.clone())
1523            .expect("register int schema");
1524        let fp_long = store
1525            .register(schema_long.clone())
1526            .expect("register long schema");
1527        (store, fp_int, fp_long, schema_int, schema_long)
1528    }
1529
1530    fn make_prefix(fp: Fingerprint) -> Vec<u8> {
1531        match fp {
1532            Fingerprint::Rabin(v) => {
1533                let mut out = Vec::with_capacity(2 + 8);
1534                out.extend_from_slice(&SINGLE_OBJECT_MAGIC);
1535                out.extend_from_slice(&v.to_le_bytes());
1536                out
1537            }
1538            Fingerprint::Id(v) => {
1539                panic!("make_prefix expects a Rabin fingerprint, got ({v})");
1540            }
1541            Fingerprint::Id64(v) => {
1542                panic!("make_prefix expects a Rabin fingerprint, got ({v})");
1543            }
1544            #[cfg(feature = "md5")]
1545            Fingerprint::MD5(v) => {
1546                panic!("make_prefix expects a Rabin fingerprint, got ({v:?})");
1547            }
1548            #[cfg(feature = "sha256")]
1549            Fingerprint::SHA256(id) => {
1550                panic!("make_prefix expects a Rabin fingerprint, got ({id:?})");
1551            }
1552        }
1553    }
1554
1555    fn make_decoder(store: &SchemaStore, fp: Fingerprint, reader_schema: &AvroSchema) -> Decoder {
1556        ReaderBuilder::new()
1557            .with_batch_size(8)
1558            .with_reader_schema(reader_schema.clone())
1559            .with_writer_schema_store(store.clone())
1560            .with_active_fingerprint(fp)
1561            .build_decoder()
1562            .expect("decoder")
1563    }
1564
1565    fn make_id_prefix(id: u32, additional: usize) -> Vec<u8> {
1566        let capacity = CONFLUENT_MAGIC.len() + size_of::<u32>() + additional;
1567        let mut out = Vec::with_capacity(capacity);
1568        out.extend_from_slice(&CONFLUENT_MAGIC);
1569        out.extend_from_slice(&id.to_be_bytes());
1570        out
1571    }
1572
1573    fn make_message_id(id: u32, value: i64) -> Vec<u8> {
1574        let encoded_value = encode_zigzag(value);
1575        let mut msg = make_id_prefix(id, encoded_value.len());
1576        msg.extend_from_slice(&encoded_value);
1577        msg
1578    }
1579
1580    fn make_id64_prefix(id: u64, additional: usize) -> Vec<u8> {
1581        let capacity = CONFLUENT_MAGIC.len() + size_of::<u64>() + additional;
1582        let mut out = Vec::with_capacity(capacity);
1583        out.extend_from_slice(&CONFLUENT_MAGIC);
1584        out.extend_from_slice(&id.to_be_bytes());
1585        out
1586    }
1587
1588    fn make_message_id64(id: u64, value: i64) -> Vec<u8> {
1589        let encoded_value = encode_zigzag(value);
1590        let mut msg = make_id64_prefix(id, encoded_value.len());
1591        msg.extend_from_slice(&encoded_value);
1592        msg
1593    }
1594
1595    fn make_value_schema(pt: PrimitiveType) -> AvroSchema {
1596        let json_schema = format!(
1597            r#"{{"type":"record","name":"S","fields":[{{"name":"v","type":"{}"}}]}}"#,
1598            pt.as_ref()
1599        );
1600        AvroSchema::new(json_schema)
1601    }
1602
1603    fn encode_zigzag(value: i64) -> Vec<u8> {
1604        let mut n = ((value << 1) ^ (value >> 63)) as u64;
1605        let mut out = Vec::new();
1606        loop {
1607            if (n & !0x7F) == 0 {
1608                out.push(n as u8);
1609                break;
1610            } else {
1611                out.push(((n & 0x7F) | 0x80) as u8);
1612                n >>= 7;
1613            }
1614        }
1615        out
1616    }
1617
1618    fn make_message(fp: Fingerprint, value: i64) -> Vec<u8> {
1619        let mut msg = make_prefix(fp);
1620        msg.extend_from_slice(&encode_zigzag(value));
1621        msg
1622    }
1623
1624    fn load_writer_schema_json(path: &str) -> Value {
1625        let file = File::open(path).unwrap();
1626        let header = super::read_header(BufReader::new(file)).unwrap();
1627        let schema = header.schema().unwrap().unwrap();
1628        serde_json::to_value(&schema).unwrap()
1629    }
1630
1631    fn make_reader_schema_with_promotions(
1632        path: &str,
1633        promotions: &HashMap<&str, &str>,
1634    ) -> AvroSchema {
1635        let mut root = load_writer_schema_json(path);
1636        assert_eq!(root["type"], "record", "writer schema must be a record");
1637        let fields = root
1638            .get_mut("fields")
1639            .and_then(|f| f.as_array_mut())
1640            .expect("record has fields");
1641        for f in fields.iter_mut() {
1642            let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
1643                continue;
1644            };
1645            if let Some(new_ty) = promotions.get(name) {
1646                let ty = f.get_mut("type").expect("field has a type");
1647                match ty {
1648                    Value::String(_) => {
1649                        *ty = Value::String((*new_ty).to_string());
1650                    }
1651                    // Union
1652                    Value::Array(arr) => {
1653                        for b in arr.iter_mut() {
1654                            match b {
1655                                Value::String(s) if s != "null" => {
1656                                    *b = Value::String((*new_ty).to_string());
1657                                    break;
1658                                }
1659                                Value::Object(_) => {
1660                                    *b = Value::String((*new_ty).to_string());
1661                                    break;
1662                                }
1663                                _ => {}
1664                            }
1665                        }
1666                    }
1667                    Value::Object(_) => {
1668                        *ty = Value::String((*new_ty).to_string());
1669                    }
1670                    _ => {}
1671                }
1672            }
1673        }
1674        AvroSchema::new(root.to_string())
1675    }
1676
1677    fn make_reader_schema_with_enum_remap(
1678        path: &str,
1679        remap: &HashMap<&str, Vec<&str>>,
1680    ) -> AvroSchema {
1681        let mut root = load_writer_schema_json(path);
1682        assert_eq!(root["type"], "record", "writer schema must be a record");
1683        let fields = root
1684            .get_mut("fields")
1685            .and_then(|f| f.as_array_mut())
1686            .expect("record has fields");
1687
1688        fn to_symbols_array(symbols: &[&str]) -> Value {
1689            Value::Array(symbols.iter().map(|s| Value::String((*s).into())).collect())
1690        }
1691
1692        fn update_enum_symbols(ty: &mut Value, symbols: &Value) {
1693            match ty {
1694                Value::Object(map) => {
1695                    if matches!(map.get("type"), Some(Value::String(t)) if t == "enum") {
1696                        map.insert("symbols".to_string(), symbols.clone());
1697                    }
1698                }
1699                Value::Array(arr) => {
1700                    for b in arr.iter_mut() {
1701                        if let Value::Object(map) = b {
1702                            if matches!(map.get("type"), Some(Value::String(t)) if t == "enum") {
1703                                map.insert("symbols".to_string(), symbols.clone());
1704                            }
1705                        }
1706                    }
1707                }
1708                _ => {}
1709            }
1710        }
1711        for f in fields.iter_mut() {
1712            let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
1713                continue;
1714            };
1715            if let Some(new_symbols) = remap.get(name) {
1716                let symbols_val = to_symbols_array(new_symbols);
1717                let ty = f.get_mut("type").expect("field has a type");
1718                update_enum_symbols(ty, &symbols_val);
1719            }
1720        }
1721        AvroSchema::new(root.to_string())
1722    }
1723
1724    fn read_alltypes_with_reader_schema(path: &str, reader_schema: AvroSchema) -> RecordBatch {
1725        let file = File::open(path).unwrap();
1726        let reader = ReaderBuilder::new()
1727            .with_batch_size(1024)
1728            .with_utf8_view(false)
1729            .with_reader_schema(reader_schema)
1730            .build(BufReader::new(file))
1731            .unwrap();
1732        let schema = reader.schema();
1733        let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
1734        arrow::compute::concat_batches(&schema, &batches).unwrap()
1735    }
1736
1737    fn make_reader_schema_with_selected_fields_in_order(
1738        path: &str,
1739        selected: &[&str],
1740    ) -> AvroSchema {
1741        let mut root = load_writer_schema_json(path);
1742        assert_eq!(root["type"], "record", "writer schema must be a record");
1743        let writer_fields = root
1744            .get("fields")
1745            .and_then(|f| f.as_array())
1746            .expect("record has fields");
1747        let mut field_map: HashMap<String, Value> = HashMap::with_capacity(writer_fields.len());
1748        for f in writer_fields {
1749            if let Some(name) = f.get("name").and_then(|n| n.as_str()) {
1750                field_map.insert(name.to_string(), f.clone());
1751            }
1752        }
1753        let mut new_fields = Vec::with_capacity(selected.len());
1754        for name in selected {
1755            let f = field_map
1756                .get(*name)
1757                .unwrap_or_else(|| panic!("field '{name}' not found in writer schema"))
1758                .clone();
1759            new_fields.push(f);
1760        }
1761        root["fields"] = Value::Array(new_fields);
1762        AvroSchema::new(root.to_string())
1763    }
1764
1765    fn write_ocf(schema: &Schema, batches: &[RecordBatch]) -> Vec<u8> {
1766        let mut w = AvroWriter::new(Vec::<u8>::new(), schema.clone()).expect("writer");
1767        for b in batches {
1768            w.write(b).expect("write");
1769        }
1770        w.finish().expect("finish");
1771        w.into_inner()
1772    }
1773
1774    #[test]
1775    fn ocf_projection_no_reader_schema_reorder() -> Result<(), Box<dyn std::error::Error>> {
1776        // Writer: { id: int, name: string, is_active: boolean }
1777        let writer_schema = Schema::new(vec![
1778            Field::new("id", DataType::Int32, false),
1779            Field::new("name", DataType::Utf8, false),
1780            Field::new("is_active", DataType::Boolean, false),
1781        ]);
1782        let batch = RecordBatch::try_new(
1783            Arc::new(writer_schema.clone()),
1784            vec![
1785                Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef,
1786                Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
1787                Arc::new(BooleanArray::from(vec![true, false])) as ArrayRef,
1788            ],
1789        )?;
1790        let bytes = write_ocf(&writer_schema, &[batch]);
1791        // Project and reorder: [is_active, id]
1792        let mut reader = ReaderBuilder::new()
1793            .with_projection(vec![2, 0])
1794            .build(Cursor::new(bytes))?;
1795        let out = reader.next().unwrap()?;
1796        assert_eq!(out.num_columns(), 2);
1797        assert_eq!(out.schema().field(0).name(), "is_active");
1798        assert_eq!(out.schema().field(1).name(), "id");
1799        let is_active = out.column(0).as_boolean();
1800        assert!(is_active.value(0));
1801        assert!(!is_active.value(1));
1802        let id = out.column(1).as_primitive::<Int32Type>();
1803        assert_eq!(id.value(0), 1);
1804        assert_eq!(id.value(1), 2);
1805        Ok(())
1806    }
1807
1808    #[test]
1809    fn ocf_projection_with_reader_schema_alias_and_default()
1810    -> Result<(), Box<dyn std::error::Error>> {
1811        // Writer: { id: long, name: string }
1812        let writer_schema = Schema::new(vec![
1813            Field::new("id", DataType::Int64, false),
1814            Field::new("name", DataType::Utf8, false),
1815        ]);
1816        let batch = RecordBatch::try_new(
1817            Arc::new(writer_schema.clone()),
1818            vec![
1819                Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef,
1820                Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
1821            ],
1822        )?;
1823        let bytes = write_ocf(&writer_schema, &[batch]);
1824        // Reader adds alias + default field:
1825        //  - rename `name` -> `full_name` via aliases
1826        //  - add `is_active` with default true
1827        let reader_json = r#"
1828    {
1829      "type": "record",
1830      "name": "topLevelRecord",
1831      "fields": [
1832        { "name": "id", "type": "long" },
1833        { "name": "full_name", "type": ["null","string"], "aliases": ["name"], "default": null },
1834        { "name": "is_active", "type": "boolean", "default": true }
1835      ]
1836    }"#;
1837        // Project only [full_name, is_active] (indices relative to the reader schema)
1838        let mut reader = ReaderBuilder::new()
1839            .with_reader_schema(AvroSchema::new(reader_json.to_string()))
1840            .with_projection(vec![1, 2])
1841            .build(Cursor::new(bytes))?;
1842        let out = reader.next().unwrap()?;
1843        assert_eq!(out.num_columns(), 2);
1844        assert_eq!(out.schema().field(0).name(), "full_name");
1845        assert_eq!(out.schema().field(1).name(), "is_active");
1846        let full_name = out.column(0).as_string::<i32>();
1847        assert_eq!(full_name.value(0), "a");
1848        assert_eq!(full_name.value(1), "b");
1849        let is_active = out.column(1).as_boolean();
1850        assert!(is_active.value(0));
1851        assert!(is_active.value(1));
1852        Ok(())
1853    }
1854
1855    #[test]
1856    fn projection_errors_out_of_bounds_and_duplicate() -> Result<(), Box<dyn std::error::Error>> {
1857        let writer_schema = Schema::new(vec![
1858            Field::new("a", DataType::Int32, false),
1859            Field::new("b", DataType::Int32, false),
1860        ]);
1861        let batch = RecordBatch::try_new(
1862            Arc::new(writer_schema.clone()),
1863            vec![
1864                Arc::new(Int32Array::from(vec![1])) as ArrayRef,
1865                Arc::new(Int32Array::from(vec![2])) as ArrayRef,
1866            ],
1867        )?;
1868        let bytes = write_ocf(&writer_schema, &[batch]);
1869        let err = ReaderBuilder::new()
1870            .with_projection(vec![2])
1871            .build(Cursor::new(bytes.clone()))
1872            .unwrap_err();
1873        assert!(matches!(err, ArrowError::AvroError(_)));
1874        assert!(err.to_string().contains("out of bounds"));
1875        let err = ReaderBuilder::new()
1876            .with_projection(vec![0, 0])
1877            .build(Cursor::new(bytes))
1878            .unwrap_err();
1879        assert!(matches!(err, ArrowError::AvroError(_)));
1880        assert!(err.to_string().contains("Duplicate projection index"));
1881        Ok(())
1882    }
1883
1884    #[test]
1885    #[cfg(feature = "snappy")]
1886    fn test_alltypes_plain_with_projection_and_reader_schema() {
1887        use std::fs::File;
1888        use std::io::BufReader;
1889        let path = arrow_test_data("avro/alltypes_plain.avro");
1890        // Build a reader schema that selects [double_col, id, tinyint_col] in that order
1891        let reader_schema = make_reader_schema_with_selected_fields_in_order(
1892            &path,
1893            &["double_col", "id", "tinyint_col"],
1894        );
1895        let file = File::open(&path).expect("open avro/alltypes_plain.avro");
1896        let reader = ReaderBuilder::new()
1897            .with_batch_size(1024)
1898            .with_reader_schema(reader_schema)
1899            .with_projection(vec![1, 2]) // Select indices 1 and 2 from reader schema: [id, tinyint_col]
1900            .build(BufReader::new(file))
1901            .expect("build reader with projection and reader schema");
1902        let schema = reader.schema();
1903        // Verify the projected schema has exactly 2 fields in the correct order
1904        assert_eq!(schema.fields().len(), 2);
1905        assert_eq!(schema.field(0).name(), "id");
1906        assert_eq!(schema.field(1).name(), "tinyint_col");
1907        let batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>().unwrap();
1908        assert_eq!(batches.len(), 1);
1909        let batch = &batches[0];
1910        assert_eq!(batch.num_rows(), 8);
1911        assert_eq!(batch.num_columns(), 2);
1912        // Build expected batch with exact values from alltypes_plain.avro:
1913        // - id values: [4, 5, 6, 7, 2, 3, 0, 1]
1914        // - tinyint_col values: [0, 1, 0, 1, 0, 1, 0, 1] (i.e., row_index % 2)
1915        let expected = RecordBatch::try_from_iter_with_nullable([
1916            (
1917                "id",
1918                Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as ArrayRef,
1919                true,
1920            ),
1921            (
1922                "tinyint_col",
1923                Arc::new(Int32Array::from(vec![0, 1, 0, 1, 0, 1, 0, 1])) as ArrayRef,
1924                true,
1925            ),
1926        ])
1927        .unwrap();
1928        assert_eq!(
1929            batch, &expected,
1930            "Projected batch mismatch for alltypes_plain.avro with reader schema and projection [1, 2]"
1931        );
1932    }
1933
1934    #[test]
1935    #[cfg(feature = "snappy")]
1936    fn test_alltypes_plain_with_projection() {
1937        use std::fs::File;
1938        use std::io::BufReader;
1939        let path = arrow_test_data("avro/alltypes_plain.avro");
1940        let file = File::open(&path).expect("open avro/alltypes_plain.avro");
1941        let reader = ReaderBuilder::new()
1942            .with_batch_size(1024)
1943            .with_projection(vec![2, 0, 5])
1944            .build(BufReader::new(file))
1945            .expect("build reader with projection");
1946        let schema = reader.schema();
1947        assert_eq!(schema.fields().len(), 3);
1948        assert_eq!(schema.field(0).name(), "tinyint_col");
1949        assert_eq!(schema.field(1).name(), "id");
1950        assert_eq!(schema.field(2).name(), "bigint_col");
1951        let batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>().unwrap();
1952        assert_eq!(batches.len(), 1);
1953        let batch = &batches[0];
1954        assert_eq!(batch.num_rows(), 8);
1955        assert_eq!(batch.num_columns(), 3);
1956        let expected = RecordBatch::try_from_iter_with_nullable([
1957            (
1958                "tinyint_col",
1959                Arc::new(Int32Array::from(vec![0, 1, 0, 1, 0, 1, 0, 1])) as ArrayRef,
1960                true,
1961            ),
1962            (
1963                "id",
1964                Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as ArrayRef,
1965                true,
1966            ),
1967            (
1968                "bigint_col",
1969                Arc::new(Int64Array::from(vec![0, 10, 0, 10, 0, 10, 0, 10])) as ArrayRef,
1970                true,
1971            ),
1972        ])
1973        .unwrap();
1974        assert_eq!(
1975            batch, &expected,
1976            "Projected batch mismatch for alltypes_plain.avro with projection [2, 0, 5]"
1977        );
1978    }
1979
1980    #[test]
1981    fn writer_string_reader_nullable_with_alias() -> Result<(), Box<dyn std::error::Error>> {
1982        let writer_schema = Schema::new(vec![
1983            Field::new("id", DataType::Int64, false),
1984            Field::new("name", DataType::Utf8, false),
1985        ]);
1986        let batch = RecordBatch::try_new(
1987            Arc::new(writer_schema.clone()),
1988            vec![
1989                Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef,
1990                Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
1991            ],
1992        )?;
1993        let bytes = write_ocf(&writer_schema, &[batch]);
1994        let reader_json = r#"
1995    {
1996      "type": "record",
1997      "name": "topLevelRecord",
1998      "fields": [
1999        { "name": "id", "type": "long" },
2000        { "name": "full_name", "type": ["null","string"], "aliases": ["name"], "default": null },
2001        { "name": "is_active", "type": "boolean", "default": true }
2002      ]
2003    }"#;
2004        let mut reader = ReaderBuilder::new()
2005            .with_reader_schema(AvroSchema::new(reader_json.to_string()))
2006            .build(Cursor::new(bytes))?;
2007        let out = reader.next().unwrap()?;
2008        let full_name = out.column(1).as_string::<i32>();
2009        assert_eq!(full_name.value(0), "a");
2010        assert_eq!(full_name.value(1), "b");
2011        Ok(())
2012    }
2013
2014    #[test]
2015    fn writer_string_reader_string_null_order_second() -> Result<(), Box<dyn std::error::Error>> {
2016        // Writer: { name: string }
2017        let writer_schema = Schema::new(vec![Field::new("name", DataType::Utf8, false)]);
2018        let batch = RecordBatch::try_new(
2019            Arc::new(writer_schema.clone()),
2020            vec![Arc::new(StringArray::from(vec!["x", "y"])) as ArrayRef],
2021        )?;
2022        let bytes = write_ocf(&writer_schema, &[batch]);
2023
2024        // Reader: ["string","null"] (NullSecond)
2025        let reader_json = r#"
2026    {
2027      "type":"record", "name":"topLevelRecord",
2028      "fields":[ { "name":"name", "type":["string","null"], "default":"x" } ]
2029    }"#;
2030
2031        let mut reader = ReaderBuilder::new()
2032            .with_reader_schema(AvroSchema::new(reader_json.to_string()))
2033            .build(Cursor::new(bytes))?;
2034
2035        let out = reader.next().unwrap()?;
2036        assert_eq!(out.num_rows(), 2);
2037
2038        // Should decode as non-null strings (writer non-union -> reader union)
2039        let name = out.column(0).as_string::<i32>();
2040        assert_eq!(name.value(0), "x");
2041        assert_eq!(name.value(1), "y");
2042
2043        Ok(())
2044    }
2045
2046    #[test]
2047    fn promotion_writer_int_reader_nullable_long() -> Result<(), Box<dyn std::error::Error>> {
2048        // Writer: { v: int }
2049        let writer_schema = Schema::new(vec![Field::new("v", DataType::Int32, false)]);
2050        let batch = RecordBatch::try_new(
2051            Arc::new(writer_schema.clone()),
2052            vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
2053        )?;
2054        let bytes = write_ocf(&writer_schema, &[batch]);
2055
2056        // Reader: { v: ["null","long"] }
2057        let reader_json = r#"
2058    {
2059      "type":"record", "name":"topLevelRecord",
2060      "fields":[ { "name":"v", "type":["null","long"], "default": null } ]
2061    }"#;
2062
2063        let mut reader = ReaderBuilder::new()
2064            .with_reader_schema(AvroSchema::new(reader_json.to_string()))
2065            .build(Cursor::new(bytes))?;
2066
2067        let out = reader.next().unwrap()?;
2068        assert_eq!(out.num_rows(), 3);
2069
2070        // Should have promoted to Int64 and be non-null (no union tag in writer)
2071        let v = out
2072            .column(0)
2073            .as_primitive::<arrow_array::types::Int64Type>();
2074        assert_eq!(v.values(), &[1, 2, 3]);
2075        assert!(
2076            out.column(0).nulls().is_none(),
2077            "expected no validity bitmap for all-valid column"
2078        );
2079
2080        Ok(())
2081    }
2082
2083    #[test]
2084    fn test_alltypes_schema_promotion_mixed() {
2085        for file in files() {
2086            let file = arrow_test_data(file);
2087            let mut promotions: HashMap<&str, &str> = HashMap::new();
2088            promotions.insert("id", "long");
2089            promotions.insert("tinyint_col", "float");
2090            promotions.insert("smallint_col", "double");
2091            promotions.insert("int_col", "double");
2092            promotions.insert("bigint_col", "double");
2093            promotions.insert("float_col", "double");
2094            promotions.insert("date_string_col", "string");
2095            promotions.insert("string_col", "string");
2096            let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
2097            let batch = read_alltypes_with_reader_schema(&file, reader_schema);
2098            let expected = RecordBatch::try_from_iter_with_nullable([
2099                (
2100                    "id",
2101                    Arc::new(Int64Array::from(vec![4i64, 5, 6, 7, 2, 3, 0, 1])) as _,
2102                    true,
2103                ),
2104                (
2105                    "bool_col",
2106                    Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
2107                    true,
2108                ),
2109                (
2110                    "tinyint_col",
2111                    Arc::new(Float32Array::from_iter_values(
2112                        (0..8).map(|x| (x % 2) as f32),
2113                    )) as _,
2114                    true,
2115                ),
2116                (
2117                    "smallint_col",
2118                    Arc::new(Float64Array::from_iter_values(
2119                        (0..8).map(|x| (x % 2) as f64),
2120                    )) as _,
2121                    true,
2122                ),
2123                (
2124                    "int_col",
2125                    Arc::new(Float64Array::from_iter_values(
2126                        (0..8).map(|x| (x % 2) as f64),
2127                    )) as _,
2128                    true,
2129                ),
2130                (
2131                    "bigint_col",
2132                    Arc::new(Float64Array::from_iter_values(
2133                        (0..8).map(|x| ((x % 2) * 10) as f64),
2134                    )) as _,
2135                    true,
2136                ),
2137                (
2138                    "float_col",
2139                    Arc::new(Float64Array::from_iter_values(
2140                        (0..8).map(|x| ((x % 2) as f32 * 1.1f32) as f64),
2141                    )) as _,
2142                    true,
2143                ),
2144                (
2145                    "double_col",
2146                    Arc::new(Float64Array::from_iter_values(
2147                        (0..8).map(|x| (x % 2) as f64 * 10.1),
2148                    )) as _,
2149                    true,
2150                ),
2151                (
2152                    "date_string_col",
2153                    Arc::new(StringArray::from(vec![
2154                        "03/01/09", "03/01/09", "04/01/09", "04/01/09", "02/01/09", "02/01/09",
2155                        "01/01/09", "01/01/09",
2156                    ])) as _,
2157                    true,
2158                ),
2159                (
2160                    "string_col",
2161                    Arc::new(StringArray::from(
2162                        (0..8)
2163                            .map(|x| if x % 2 == 0 { "0" } else { "1" })
2164                            .collect::<Vec<_>>(),
2165                    )) as _,
2166                    true,
2167                ),
2168                (
2169                    "timestamp_col",
2170                    Arc::new(
2171                        TimestampMicrosecondArray::from_iter_values([
2172                            1235865600000000, // 2009-03-01T00:00:00.000
2173                            1235865660000000, // 2009-03-01T00:01:00.000
2174                            1238544000000000, // 2009-04-01T00:00:00.000
2175                            1238544060000000, // 2009-04-01T00:01:00.000
2176                            1233446400000000, // 2009-02-01T00:00:00.000
2177                            1233446460000000, // 2009-02-01T00:01:00.000
2178                            1230768000000000, // 2009-01-01T00:00:00.000
2179                            1230768060000000, // 2009-01-01T00:01:00.000
2180                        ])
2181                        .with_timezone("+00:00"),
2182                    ) as _,
2183                    true,
2184                ),
2185            ])
2186            .unwrap();
2187            assert_eq!(batch, expected, "mismatch for file {file}");
2188        }
2189    }
2190
2191    #[test]
2192    fn test_alltypes_schema_promotion_long_to_float_only() {
2193        for file in files() {
2194            let file = arrow_test_data(file);
2195            let mut promotions: HashMap<&str, &str> = HashMap::new();
2196            promotions.insert("bigint_col", "float");
2197            let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
2198            let batch = read_alltypes_with_reader_schema(&file, reader_schema);
2199            let expected = RecordBatch::try_from_iter_with_nullable([
2200                (
2201                    "id",
2202                    Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
2203                    true,
2204                ),
2205                (
2206                    "bool_col",
2207                    Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
2208                    true,
2209                ),
2210                (
2211                    "tinyint_col",
2212                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2213                    true,
2214                ),
2215                (
2216                    "smallint_col",
2217                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2218                    true,
2219                ),
2220                (
2221                    "int_col",
2222                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2223                    true,
2224                ),
2225                (
2226                    "bigint_col",
2227                    Arc::new(Float32Array::from_iter_values(
2228                        (0..8).map(|x| ((x % 2) * 10) as f32),
2229                    )) as _,
2230                    true,
2231                ),
2232                (
2233                    "float_col",
2234                    Arc::new(Float32Array::from_iter_values(
2235                        (0..8).map(|x| (x % 2) as f32 * 1.1),
2236                    )) as _,
2237                    true,
2238                ),
2239                (
2240                    "double_col",
2241                    Arc::new(Float64Array::from_iter_values(
2242                        (0..8).map(|x| (x % 2) as f64 * 10.1),
2243                    )) as _,
2244                    true,
2245                ),
2246                (
2247                    "date_string_col",
2248                    Arc::new(BinaryArray::from_iter_values([
2249                        [48, 51, 47, 48, 49, 47, 48, 57],
2250                        [48, 51, 47, 48, 49, 47, 48, 57],
2251                        [48, 52, 47, 48, 49, 47, 48, 57],
2252                        [48, 52, 47, 48, 49, 47, 48, 57],
2253                        [48, 50, 47, 48, 49, 47, 48, 57],
2254                        [48, 50, 47, 48, 49, 47, 48, 57],
2255                        [48, 49, 47, 48, 49, 47, 48, 57],
2256                        [48, 49, 47, 48, 49, 47, 48, 57],
2257                    ])) as _,
2258                    true,
2259                ),
2260                (
2261                    "string_col",
2262                    Arc::new(BinaryArray::from_iter_values((0..8).map(|x| [48 + x % 2]))) as _,
2263                    true,
2264                ),
2265                (
2266                    "timestamp_col",
2267                    Arc::new(
2268                        TimestampMicrosecondArray::from_iter_values([
2269                            1235865600000000, // 2009-03-01T00:00:00.000
2270                            1235865660000000, // 2009-03-01T00:01:00.000
2271                            1238544000000000, // 2009-04-01T00:00:00.000
2272                            1238544060000000, // 2009-04-01T00:01:00.000
2273                            1233446400000000, // 2009-02-01T00:00:00.000
2274                            1233446460000000, // 2009-02-01T00:01:00.000
2275                            1230768000000000, // 2009-01-01T00:00:00.000
2276                            1230768060000000, // 2009-01-01T00:01:00.000
2277                        ])
2278                        .with_timezone("+00:00"),
2279                    ) as _,
2280                    true,
2281                ),
2282            ])
2283            .unwrap();
2284            assert_eq!(batch, expected, "mismatch for file {file}");
2285        }
2286    }
2287
2288    #[test]
2289    fn test_alltypes_schema_promotion_bytes_to_string_only() {
2290        for file in files() {
2291            let file = arrow_test_data(file);
2292            let mut promotions: HashMap<&str, &str> = HashMap::new();
2293            promotions.insert("date_string_col", "string");
2294            promotions.insert("string_col", "string");
2295            let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
2296            let batch = read_alltypes_with_reader_schema(&file, reader_schema);
2297            let expected = RecordBatch::try_from_iter_with_nullable([
2298                (
2299                    "id",
2300                    Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
2301                    true,
2302                ),
2303                (
2304                    "bool_col",
2305                    Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
2306                    true,
2307                ),
2308                (
2309                    "tinyint_col",
2310                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2311                    true,
2312                ),
2313                (
2314                    "smallint_col",
2315                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2316                    true,
2317                ),
2318                (
2319                    "int_col",
2320                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2321                    true,
2322                ),
2323                (
2324                    "bigint_col",
2325                    Arc::new(Int64Array::from_iter_values((0..8).map(|x| (x % 2) * 10))) as _,
2326                    true,
2327                ),
2328                (
2329                    "float_col",
2330                    Arc::new(Float32Array::from_iter_values(
2331                        (0..8).map(|x| (x % 2) as f32 * 1.1),
2332                    )) as _,
2333                    true,
2334                ),
2335                (
2336                    "double_col",
2337                    Arc::new(Float64Array::from_iter_values(
2338                        (0..8).map(|x| (x % 2) as f64 * 10.1),
2339                    )) as _,
2340                    true,
2341                ),
2342                (
2343                    "date_string_col",
2344                    Arc::new(StringArray::from(vec![
2345                        "03/01/09", "03/01/09", "04/01/09", "04/01/09", "02/01/09", "02/01/09",
2346                        "01/01/09", "01/01/09",
2347                    ])) as _,
2348                    true,
2349                ),
2350                (
2351                    "string_col",
2352                    Arc::new(StringArray::from(
2353                        (0..8)
2354                            .map(|x| if x % 2 == 0 { "0" } else { "1" })
2355                            .collect::<Vec<_>>(),
2356                    )) as _,
2357                    true,
2358                ),
2359                (
2360                    "timestamp_col",
2361                    Arc::new(
2362                        TimestampMicrosecondArray::from_iter_values([
2363                            1235865600000000, // 2009-03-01T00:00:00.000
2364                            1235865660000000, // 2009-03-01T00:01:00.000
2365                            1238544000000000, // 2009-04-01T00:00:00.000
2366                            1238544060000000, // 2009-04-01T00:01:00.000
2367                            1233446400000000, // 2009-02-01T00:00:00.000
2368                            1233446460000000, // 2009-02-01T00:01:00.000
2369                            1230768000000000, // 2009-01-01T00:00:00.000
2370                            1230768060000000, // 2009-01-01T00:01:00.000
2371                        ])
2372                        .with_timezone("+00:00"),
2373                    ) as _,
2374                    true,
2375                ),
2376            ])
2377            .unwrap();
2378            assert_eq!(batch, expected, "mismatch for file {file}");
2379        }
2380    }
2381
2382    #[test]
2383    // TODO: avoid requiring snappy for this file
2384    #[cfg(feature = "snappy")]
2385    fn test_alltypes_illegal_promotion_bool_to_double_errors() {
2386        let file = arrow_test_data("avro/alltypes_plain.avro");
2387        let mut promotions: HashMap<&str, &str> = HashMap::new();
2388        promotions.insert("bool_col", "double"); // illegal
2389        let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
2390        let file_handle = File::open(&file).unwrap();
2391        let result = ReaderBuilder::new()
2392            .with_reader_schema(reader_schema)
2393            .build(BufReader::new(file_handle));
2394        let err = result.expect_err("expected illegal promotion to error");
2395        let msg = err.to_string();
2396        assert!(
2397            msg.contains("Illegal promotion") || msg.contains("illegal promotion"),
2398            "unexpected error: {msg}"
2399        );
2400    }
2401
2402    #[test]
2403    fn test_simple_enum_with_reader_schema_mapping() {
2404        let file = arrow_test_data("avro/simple_enum.avro");
2405        let mut remap: HashMap<&str, Vec<&str>> = HashMap::new();
2406        remap.insert("f1", vec!["d", "c", "b", "a"]);
2407        remap.insert("f2", vec!["h", "g", "f", "e"]);
2408        remap.insert("f3", vec!["k", "i", "j"]);
2409        let reader_schema = make_reader_schema_with_enum_remap(&file, &remap);
2410        let actual = read_alltypes_with_reader_schema(&file, reader_schema);
2411        let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
2412        // f1
2413        let f1_keys = Int32Array::from(vec![3, 2, 1, 0]);
2414        let f1_vals = StringArray::from(vec!["d", "c", "b", "a"]);
2415        let f1 = DictionaryArray::<Int32Type>::try_new(f1_keys, Arc::new(f1_vals)).unwrap();
2416        let mut md_f1 = HashMap::new();
2417        md_f1.insert(
2418            AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
2419            r#"["d","c","b","a"]"#.to_string(),
2420        );
2421        // New named-type metadata
2422        md_f1.insert("avro.name".to_string(), "enum1".to_string());
2423        md_f1.insert("avro.namespace".to_string(), "ns1".to_string());
2424        let f1_field = Field::new("f1", dict_type.clone(), false).with_metadata(md_f1);
2425        // f2
2426        let f2_keys = Int32Array::from(vec![1, 0, 3, 2]);
2427        let f2_vals = StringArray::from(vec!["h", "g", "f", "e"]);
2428        let f2 = DictionaryArray::<Int32Type>::try_new(f2_keys, Arc::new(f2_vals)).unwrap();
2429        let mut md_f2 = HashMap::new();
2430        md_f2.insert(
2431            AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
2432            r#"["h","g","f","e"]"#.to_string(),
2433        );
2434        // New named-type metadata
2435        md_f2.insert("avro.name".to_string(), "enum2".to_string());
2436        md_f2.insert("avro.namespace".to_string(), "ns2".to_string());
2437        let f2_field = Field::new("f2", dict_type.clone(), false).with_metadata(md_f2);
2438        // f3
2439        let f3_keys = Int32Array::from(vec![Some(2), Some(0), None, Some(1)]);
2440        let f3_vals = StringArray::from(vec!["k", "i", "j"]);
2441        let f3 = DictionaryArray::<Int32Type>::try_new(f3_keys, Arc::new(f3_vals)).unwrap();
2442        let mut md_f3 = HashMap::new();
2443        md_f3.insert(
2444            AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
2445            r#"["k","i","j"]"#.to_string(),
2446        );
2447        // New named-type metadata
2448        md_f3.insert("avro.name".to_string(), "enum3".to_string());
2449        md_f3.insert("avro.namespace".to_string(), "ns1".to_string());
2450        let f3_field = Field::new("f3", dict_type.clone(), true).with_metadata(md_f3);
2451        let expected_schema = Arc::new(Schema::new(vec![f1_field, f2_field, f3_field]));
2452        let expected = RecordBatch::try_new(
2453            expected_schema,
2454            vec![Arc::new(f1) as ArrayRef, Arc::new(f2), Arc::new(f3)],
2455        )
2456        .unwrap();
2457        assert_eq!(actual, expected);
2458    }
2459
2460    #[test]
2461    fn test_schema_store_register_lookup() {
2462        let schema_int = make_record_schema(PrimitiveType::Int);
2463        let schema_long = make_record_schema(PrimitiveType::Long);
2464        let mut store = SchemaStore::new();
2465        let fp_int = store.register(schema_int.clone()).unwrap();
2466        let fp_long = store.register(schema_long.clone()).unwrap();
2467        assert_eq!(store.lookup(&fp_int).cloned(), Some(schema_int));
2468        assert_eq!(store.lookup(&fp_long).cloned(), Some(schema_long));
2469        assert_eq!(store.fingerprint_algorithm(), FingerprintAlgorithm::Rabin);
2470    }
2471
2472    #[test]
2473    fn test_unknown_fingerprint_is_error() {
2474        let (store, fp_int, _fp_long, _schema_int, schema_long) = make_two_schema_store();
2475        let unknown_fp = Fingerprint::Rabin(0xDEAD_BEEF_DEAD_BEEF);
2476        let prefix = make_prefix(unknown_fp);
2477        let mut decoder = make_decoder(&store, fp_int, &schema_long);
2478        let err = decoder.decode(&prefix).expect_err("decode should error");
2479        let msg = err.to_string();
2480        assert!(
2481            msg.contains("Unknown fingerprint"),
2482            "unexpected message: {msg}"
2483        );
2484    }
2485
2486    #[test]
2487    fn test_handle_prefix_incomplete_magic() {
2488        let (store, fp_int, _fp_long, _schema_int, schema_long) = make_two_schema_store();
2489        let mut decoder = make_decoder(&store, fp_int, &schema_long);
2490        let buf = &SINGLE_OBJECT_MAGIC[..1];
2491        let res = decoder.handle_prefix(buf).unwrap();
2492        assert_eq!(res, Some(0));
2493        assert!(decoder.pending_schema.is_none());
2494    }
2495
2496    #[test]
2497    fn test_handle_prefix_magic_mismatch() {
2498        let (store, fp_int, _fp_long, _schema_int, schema_long) = make_two_schema_store();
2499        let mut decoder = make_decoder(&store, fp_int, &schema_long);
2500        let buf = [0xFFu8, 0x00u8, 0x01u8];
2501        let res = decoder.handle_prefix(&buf).unwrap();
2502        assert!(res.is_none());
2503    }
2504
2505    #[test]
2506    fn test_handle_prefix_incomplete_fingerprint() {
2507        let (store, fp_int, fp_long, _schema_int, schema_long) = make_two_schema_store();
2508        let mut decoder = make_decoder(&store, fp_int, &schema_long);
2509        let long_bytes = match fp_long {
2510            Fingerprint::Rabin(v) => v.to_le_bytes(),
2511            Fingerprint::Id(id) => panic!("expected Rabin fingerprint, got ({id})"),
2512            Fingerprint::Id64(id) => panic!("expected Rabin fingerprint, got ({id})"),
2513            #[cfg(feature = "md5")]
2514            Fingerprint::MD5(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
2515            #[cfg(feature = "sha256")]
2516            Fingerprint::SHA256(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
2517        };
2518        let mut buf = Vec::from(SINGLE_OBJECT_MAGIC);
2519        buf.extend_from_slice(&long_bytes[..4]);
2520        let res = decoder.handle_prefix(&buf).unwrap();
2521        assert_eq!(res, Some(0));
2522        assert!(decoder.pending_schema.is_none());
2523    }
2524
2525    #[test]
2526    fn test_handle_prefix_valid_prefix_switches_schema() {
2527        let (store, fp_int, fp_long, _schema_int, schema_long) = make_two_schema_store();
2528        let mut decoder = make_decoder(&store, fp_int, &schema_long);
2529        let writer_schema_long = schema_long.schema().unwrap();
2530        let root_long = AvroFieldBuilder::new(&writer_schema_long).build().unwrap();
2531        let long_decoder = RecordDecoder::try_new_with_options(root_long.data_type()).unwrap();
2532        let _ = decoder.cache.insert(fp_long, long_decoder);
2533        let mut buf = Vec::from(SINGLE_OBJECT_MAGIC);
2534        match fp_long {
2535            Fingerprint::Rabin(v) => buf.extend_from_slice(&v.to_le_bytes()),
2536            Fingerprint::Id(id) => panic!("expected Rabin fingerprint, got ({id})"),
2537            Fingerprint::Id64(id) => panic!("expected Rabin fingerprint, got ({id})"),
2538            #[cfg(feature = "md5")]
2539            Fingerprint::MD5(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
2540            #[cfg(feature = "sha256")]
2541            Fingerprint::SHA256(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
2542        }
2543        let consumed = decoder.handle_prefix(&buf).unwrap().unwrap();
2544        assert_eq!(consumed, buf.len());
2545        assert!(decoder.pending_schema.is_some());
2546        assert_eq!(decoder.pending_schema.as_ref().unwrap().0, fp_long);
2547    }
2548
2549    #[test]
2550    fn test_decoder_projection_multiple_writer_schemas_no_reader_schema()
2551    -> Result<(), Box<dyn std::error::Error>> {
2552        // Two writer schemas with different shapes
2553        let writer_v1 = AvroSchema::new(
2554            r#"{"type":"record","name":"E","fields":[{"name":"a","type":"int"},{"name":"b","type":"string"}]}"#
2555                .to_string(),
2556        );
2557        let writer_v2 = AvroSchema::new(
2558            r#"{"type":"record","name":"E","fields":[{"name":"a","type":"long"},{"name":"b","type":"string"},{"name":"c","type":"int"}]}"#
2559                .to_string(),
2560        );
2561        let mut store = SchemaStore::new();
2562        let fp1 = store.register(writer_v1)?;
2563        let fp2 = store.register(writer_v2)?;
2564        let mut decoder = ReaderBuilder::new()
2565            .with_writer_schema_store(store)
2566            .with_active_fingerprint(fp1)
2567            .with_batch_size(8)
2568            .with_projection(vec![1])
2569            .build_decoder()?;
2570        // Message for v1: {a:1, b:"x"}
2571        let mut msg1 = make_prefix(fp1);
2572        msg1.extend_from_slice(&encode_zigzag(1)); // a = 1
2573        msg1.push((1u8) << 1);
2574        msg1.extend_from_slice(b"x");
2575        // Message for v2: {a:2, b:"y", c:7}
2576        let mut msg2 = make_prefix(fp2);
2577        msg2.extend_from_slice(&encode_zigzag(2)); // a = 2
2578        msg2.push((1u8) << 1);
2579        msg2.extend_from_slice(b"y");
2580        msg2.extend_from_slice(&encode_zigzag(7)); // c = 7
2581        decoder.decode(&msg1)?;
2582        let batch1 = decoder.flush()?.expect("batch1");
2583        assert_eq!(batch1.num_columns(), 1);
2584        assert_eq!(batch1.schema().field(0).name(), "b");
2585        let b1 = batch1.column(0).as_string::<i32>();
2586        assert_eq!(b1.value(0), "x");
2587        decoder.decode(&msg2)?;
2588        let batch2 = decoder.flush()?.expect("batch2");
2589        assert_eq!(batch2.num_columns(), 1);
2590        assert_eq!(batch2.schema().field(0).name(), "b");
2591        let b2 = batch2.column(0).as_string::<i32>();
2592        assert_eq!(b2.value(0), "y");
2593        Ok(())
2594    }
2595
2596    #[test]
2597    fn test_two_messages_same_schema() {
2598        let writer_schema = make_value_schema(PrimitiveType::Int);
2599        let reader_schema = writer_schema.clone();
2600        let mut store = SchemaStore::new();
2601        let fp = store.register(writer_schema).unwrap();
2602        let msg1 = make_message(fp, 42);
2603        let msg2 = make_message(fp, 11);
2604        let input = [msg1.clone(), msg2.clone()].concat();
2605        let mut decoder = ReaderBuilder::new()
2606            .with_batch_size(8)
2607            .with_reader_schema(reader_schema.clone())
2608            .with_writer_schema_store(store)
2609            .with_active_fingerprint(fp)
2610            .build_decoder()
2611            .unwrap();
2612        let _ = decoder.decode(&input).unwrap();
2613        let batch = decoder.flush().unwrap().expect("batch");
2614        assert_eq!(batch.num_rows(), 2);
2615        let col = batch
2616            .column(0)
2617            .as_any()
2618            .downcast_ref::<Int32Array>()
2619            .unwrap();
2620        assert_eq!(col.value(0), 42);
2621        assert_eq!(col.value(1), 11);
2622    }
2623
2624    #[test]
2625    fn test_two_messages_schema_switch() {
2626        let w_int = make_value_schema(PrimitiveType::Int);
2627        let w_long = make_value_schema(PrimitiveType::Long);
2628        let mut store = SchemaStore::new();
2629        let fp_int = store.register(w_int).unwrap();
2630        let fp_long = store.register(w_long).unwrap();
2631        let msg_int = make_message(fp_int, 1);
2632        let msg_long = make_message(fp_long, 123456789_i64);
2633        let mut decoder = ReaderBuilder::new()
2634            .with_batch_size(8)
2635            .with_writer_schema_store(store)
2636            .with_active_fingerprint(fp_int)
2637            .build_decoder()
2638            .unwrap();
2639        let _ = decoder.decode(&msg_int).unwrap();
2640        let batch1 = decoder.flush().unwrap().expect("batch1");
2641        assert_eq!(batch1.num_rows(), 1);
2642        assert_eq!(
2643            batch1
2644                .column(0)
2645                .as_any()
2646                .downcast_ref::<Int32Array>()
2647                .unwrap()
2648                .value(0),
2649            1
2650        );
2651        let _ = decoder.decode(&msg_long).unwrap();
2652        let batch2 = decoder.flush().unwrap().expect("batch2");
2653        assert_eq!(batch2.num_rows(), 1);
2654        assert_eq!(
2655            batch2
2656                .column(0)
2657                .as_any()
2658                .downcast_ref::<Int64Array>()
2659                .unwrap()
2660                .value(0),
2661            123456789_i64
2662        );
2663    }
2664
2665    #[test]
2666    fn test_two_messages_same_schema_id() {
2667        let writer_schema = make_value_schema(PrimitiveType::Int);
2668        let reader_schema = writer_schema.clone();
2669        let id = 100u32;
2670        // Set up store with None fingerprint algorithm and register schema by id
2671        let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
2672        let _ = store
2673            .set(Fingerprint::Id(id), writer_schema.clone())
2674            .expect("set id schema");
2675        let msg1 = make_message_id(id, 21);
2676        let msg2 = make_message_id(id, 22);
2677        let input = [msg1.clone(), msg2.clone()].concat();
2678        let mut decoder = ReaderBuilder::new()
2679            .with_batch_size(8)
2680            .with_reader_schema(reader_schema)
2681            .with_writer_schema_store(store)
2682            .with_active_fingerprint(Fingerprint::Id(id))
2683            .build_decoder()
2684            .unwrap();
2685        let _ = decoder.decode(&input).unwrap();
2686        let batch = decoder.flush().unwrap().expect("batch");
2687        assert_eq!(batch.num_rows(), 2);
2688        let col = batch
2689            .column(0)
2690            .as_any()
2691            .downcast_ref::<Int32Array>()
2692            .unwrap();
2693        assert_eq!(col.value(0), 21);
2694        assert_eq!(col.value(1), 22);
2695    }
2696
2697    #[test]
2698    fn test_unknown_id_fingerprint_is_error() {
2699        let writer_schema = make_value_schema(PrimitiveType::Int);
2700        let id_known = 7u32;
2701        let id_unknown = 9u32;
2702        let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
2703        let _ = store
2704            .set(Fingerprint::Id(id_known), writer_schema.clone())
2705            .expect("set id schema");
2706        let mut decoder = ReaderBuilder::new()
2707            .with_batch_size(8)
2708            .with_reader_schema(writer_schema)
2709            .with_writer_schema_store(store)
2710            .with_active_fingerprint(Fingerprint::Id(id_known))
2711            .build_decoder()
2712            .unwrap();
2713        let prefix = make_id_prefix(id_unknown, 0);
2714        let err = decoder.decode(&prefix).expect_err("decode should error");
2715        let msg = err.to_string();
2716        assert!(
2717            msg.contains("Unknown fingerprint"),
2718            "unexpected message: {msg}"
2719        );
2720    }
2721
2722    #[test]
2723    fn test_handle_prefix_id_incomplete_magic() {
2724        let writer_schema = make_value_schema(PrimitiveType::Int);
2725        let id = 5u32;
2726        let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
2727        let _ = store
2728            .set(Fingerprint::Id(id), writer_schema.clone())
2729            .expect("set id schema");
2730        let mut decoder = ReaderBuilder::new()
2731            .with_batch_size(8)
2732            .with_reader_schema(writer_schema)
2733            .with_writer_schema_store(store)
2734            .with_active_fingerprint(Fingerprint::Id(id))
2735            .build_decoder()
2736            .unwrap();
2737        let buf = &CONFLUENT_MAGIC[..0]; // empty incomplete magic
2738        let res = decoder.handle_prefix(buf).unwrap();
2739        assert_eq!(res, Some(0));
2740        assert!(decoder.pending_schema.is_none());
2741    }
2742
2743    #[test]
2744    fn test_two_messages_same_schema_id64() {
2745        let writer_schema = make_value_schema(PrimitiveType::Int);
2746        let reader_schema = writer_schema.clone();
2747        let id = 100u64;
2748        // Set up store with None fingerprint algorithm and register schema by id
2749        let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id64);
2750        let _ = store
2751            .set(Fingerprint::Id64(id), writer_schema.clone())
2752            .expect("set id schema");
2753        let msg1 = make_message_id64(id, 21);
2754        let msg2 = make_message_id64(id, 22);
2755        let input = [msg1.clone(), msg2.clone()].concat();
2756        let mut decoder = ReaderBuilder::new()
2757            .with_batch_size(8)
2758            .with_reader_schema(reader_schema)
2759            .with_writer_schema_store(store)
2760            .with_active_fingerprint(Fingerprint::Id64(id))
2761            .build_decoder()
2762            .unwrap();
2763        let _ = decoder.decode(&input).unwrap();
2764        let batch = decoder.flush().unwrap().expect("batch");
2765        assert_eq!(batch.num_rows(), 2);
2766        let col = batch
2767            .column(0)
2768            .as_any()
2769            .downcast_ref::<Int32Array>()
2770            .unwrap();
2771        assert_eq!(col.value(0), 21);
2772        assert_eq!(col.value(1), 22);
2773    }
2774
2775    #[test]
2776    fn test_decode_stream_with_schema() {
2777        struct TestCase<'a> {
2778            name: &'a str,
2779            schema: &'a str,
2780            expected_error: Option<&'a str>,
2781        }
2782        let tests = vec![
2783            TestCase {
2784                name: "success",
2785                schema: r#"{"type":"record","name":"test","fields":[{"name":"f2","type":"string"}]}"#,
2786                expected_error: None,
2787            },
2788            TestCase {
2789                name: "valid schema invalid data",
2790                schema: r#"{"type":"record","name":"test","fields":[{"name":"f2","type":"long"}]}"#,
2791                expected_error: Some("did not consume all bytes"),
2792            },
2793        ];
2794        for test in tests {
2795            let avro_schema = AvroSchema::new(test.schema.to_string());
2796            let mut store = SchemaStore::new();
2797            let fp = store.register(avro_schema.clone()).unwrap();
2798            let prefix = make_prefix(fp);
2799            let record_val = "some_string";
2800            let mut body = prefix;
2801            body.push((record_val.len() as u8) << 1);
2802            body.extend_from_slice(record_val.as_bytes());
2803            let decoder_res = ReaderBuilder::new()
2804                .with_batch_size(1)
2805                .with_writer_schema_store(store)
2806                .with_active_fingerprint(fp)
2807                .build_decoder();
2808            let decoder = match decoder_res {
2809                Ok(d) => d,
2810                Err(e) => {
2811                    if let Some(expected) = test.expected_error {
2812                        assert!(
2813                            e.to_string().contains(expected),
2814                            "Test '{}' failed at build – expected '{expected}', got '{e}'",
2815                            test.name
2816                        );
2817                        continue;
2818                    } else {
2819                        panic!("Test '{}' failed during build: {e}", test.name);
2820                    }
2821                }
2822            };
2823            let stream = Box::pin(stream::once(async { Bytes::from(body) }));
2824            let decoded_stream = decode_stream(decoder, stream);
2825            let batches_result: Result<Vec<RecordBatch>, ArrowError> =
2826                block_on(decoded_stream.try_collect());
2827            match (batches_result, test.expected_error) {
2828                (Ok(batches), None) => {
2829                    let batch =
2830                        arrow::compute::concat_batches(&batches[0].schema(), &batches).unwrap();
2831                    let expected_field = Field::new("f2", DataType::Utf8, false);
2832                    let expected_schema = Arc::new(Schema::new(vec![expected_field]));
2833                    let expected_array = Arc::new(StringArray::from(vec![record_val]));
2834                    let expected_batch =
2835                        RecordBatch::try_new(expected_schema, vec![expected_array]).unwrap();
2836                    assert_eq!(batch, expected_batch, "Test '{}'", test.name);
2837                }
2838                (Err(e), Some(expected)) => {
2839                    assert!(
2840                        e.to_string().contains(expected),
2841                        "Test '{}' – expected error containing '{expected}', got '{e}'",
2842                        test.name
2843                    );
2844                }
2845                (Ok(_), Some(expected)) => {
2846                    panic!(
2847                        "Test '{}' expected failure ('{expected}') but succeeded",
2848                        test.name
2849                    );
2850                }
2851                (Err(e), None) => {
2852                    panic!("Test '{}' unexpectedly failed with '{e}'", test.name);
2853                }
2854            }
2855        }
2856    }
2857
2858    #[test]
2859    fn test_utf8view_support() {
2860        struct TestHelper;
2861        impl TestHelper {
2862            fn with_utf8view(field: &Field) -> Field {
2863                match field.data_type() {
2864                    DataType::Utf8 => {
2865                        Field::new(field.name(), DataType::Utf8View, field.is_nullable())
2866                            .with_metadata(field.metadata().clone())
2867                    }
2868                    _ => field.clone(),
2869                }
2870            }
2871        }
2872
2873        let field = TestHelper::with_utf8view(&Field::new("str_field", DataType::Utf8, false));
2874
2875        assert_eq!(field.data_type(), &DataType::Utf8View);
2876
2877        let array = StringViewArray::from(vec!["test1", "test2"]);
2878        let batch =
2879            RecordBatch::try_from_iter(vec![("str_field", Arc::new(array) as ArrayRef)]).unwrap();
2880
2881        assert!(batch.column(0).as_any().is::<StringViewArray>());
2882    }
2883
2884    fn make_reader_schema_with_default_fields(
2885        path: &str,
2886        default_fields: Vec<Value>,
2887    ) -> AvroSchema {
2888        let mut root = load_writer_schema_json(path);
2889        assert_eq!(root["type"], "record", "writer schema must be a record");
2890        root.as_object_mut()
2891            .expect("schema is a JSON object")
2892            .insert("fields".to_string(), Value::Array(default_fields));
2893        AvroSchema::new(root.to_string())
2894    }
2895
2896    #[test]
2897    fn test_schema_resolution_defaults_all_supported_types() {
2898        let path = "test/data/skippable_types.avro";
2899        let duration_default = "\u{0000}".repeat(12);
2900        let reader_schema = make_reader_schema_with_default_fields(
2901            path,
2902            vec![
2903                serde_json::json!({"name":"d_bool","type":"boolean","default":true}),
2904                serde_json::json!({"name":"d_int","type":"int","default":42}),
2905                serde_json::json!({"name":"d_long","type":"long","default":12345}),
2906                serde_json::json!({"name":"d_float","type":"float","default":1.5}),
2907                serde_json::json!({"name":"d_double","type":"double","default":2.25}),
2908                serde_json::json!({"name":"d_bytes","type":"bytes","default":"XYZ"}),
2909                serde_json::json!({"name":"d_string","type":"string","default":"hello"}),
2910                serde_json::json!({"name":"d_date","type":{"type":"int","logicalType":"date"},"default":0}),
2911                serde_json::json!({"name":"d_time_ms","type":{"type":"int","logicalType":"time-millis"},"default":1000}),
2912                serde_json::json!({"name":"d_time_us","type":{"type":"long","logicalType":"time-micros"},"default":2000}),
2913                serde_json::json!({"name":"d_ts_ms","type":{"type":"long","logicalType":"local-timestamp-millis"},"default":0}),
2914                serde_json::json!({"name":"d_ts_us","type":{"type":"long","logicalType":"local-timestamp-micros"},"default":0}),
2915                serde_json::json!({"name":"d_decimal","type":{"type":"bytes","logicalType":"decimal","precision":10,"scale":2},"default":""}),
2916                serde_json::json!({"name":"d_fixed","type":{"type":"fixed","name":"F4","size":4},"default":"ABCD"}),
2917                serde_json::json!({"name":"d_enum","type":{"type":"enum","name":"E","symbols":["A","B","C"]},"default":"A"}),
2918                serde_json::json!({"name":"d_duration","type":{"type":"fixed","name":"Dur","size":12,"logicalType":"duration"},"default":duration_default}),
2919                serde_json::json!({"name":"d_uuid","type":{"type":"string","logicalType":"uuid"},"default":"00000000-0000-0000-0000-000000000000"}),
2920                serde_json::json!({"name":"d_array","type":{"type":"array","items":"int"},"default":[1,2,3]}),
2921                serde_json::json!({"name":"d_map","type":{"type":"map","values":"long"},"default":{"a":1,"b":2}}),
2922                serde_json::json!({"name":"d_record","type":{
2923              "type":"record","name":"DefaultRec","fields":[
2924                  {"name":"x","type":"int"},
2925                  {"name":"y","type":["null","string"],"default":null}
2926              ]
2927        },"default":{"x":7}}),
2928                serde_json::json!({"name":"d_nullable_null","type":["null","int"],"default":null}),
2929                serde_json::json!({"name":"d_nullable_value","type":["int","null"],"default":123}),
2930            ],
2931        );
2932        let actual = read_alltypes_with_reader_schema(path, reader_schema);
2933        let num_rows = actual.num_rows();
2934        assert!(num_rows > 0, "skippable_types.avro should contain rows");
2935        assert_eq!(
2936            actual.num_columns(),
2937            22,
2938            "expected exactly our defaulted fields"
2939        );
2940        let mut arrays: Vec<Arc<dyn Array>> = Vec::with_capacity(22);
2941        arrays.push(Arc::new(BooleanArray::from_iter(std::iter::repeat_n(
2942            Some(true),
2943            num_rows,
2944        ))));
2945        arrays.push(Arc::new(Int32Array::from_iter_values(std::iter::repeat_n(
2946            42, num_rows,
2947        ))));
2948        arrays.push(Arc::new(Int64Array::from_iter_values(std::iter::repeat_n(
2949            12345, num_rows,
2950        ))));
2951        arrays.push(Arc::new(Float32Array::from_iter_values(
2952            std::iter::repeat_n(1.5f32, num_rows),
2953        )));
2954        arrays.push(Arc::new(Float64Array::from_iter_values(
2955            std::iter::repeat_n(2.25f64, num_rows),
2956        )));
2957        arrays.push(Arc::new(BinaryArray::from_iter_values(
2958            std::iter::repeat_n(b"XYZ".as_ref(), num_rows),
2959        )));
2960        arrays.push(Arc::new(StringArray::from_iter_values(
2961            std::iter::repeat_n("hello", num_rows),
2962        )));
2963        arrays.push(Arc::new(Date32Array::from_iter_values(
2964            std::iter::repeat_n(0, num_rows),
2965        )));
2966        arrays.push(Arc::new(Time32MillisecondArray::from_iter_values(
2967            std::iter::repeat_n(1_000, num_rows),
2968        )));
2969        arrays.push(Arc::new(Time64MicrosecondArray::from_iter_values(
2970            std::iter::repeat_n(2_000i64, num_rows),
2971        )));
2972        arrays.push(Arc::new(TimestampMillisecondArray::from_iter_values(
2973            std::iter::repeat_n(0i64, num_rows),
2974        )));
2975        arrays.push(Arc::new(TimestampMicrosecondArray::from_iter_values(
2976            std::iter::repeat_n(0i64, num_rows),
2977        )));
2978        #[cfg(feature = "small_decimals")]
2979        let decimal = Decimal64Array::from_iter_values(std::iter::repeat_n(0i64, num_rows))
2980            .with_precision_and_scale(10, 2)
2981            .unwrap();
2982        #[cfg(not(feature = "small_decimals"))]
2983        let decimal = Decimal128Array::from_iter_values(std::iter::repeat_n(0i128, num_rows))
2984            .with_precision_and_scale(10, 2)
2985            .unwrap();
2986        arrays.push(Arc::new(decimal));
2987        let fixed_iter = std::iter::repeat_n(Some(*b"ABCD"), num_rows);
2988        arrays.push(Arc::new(
2989            FixedSizeBinaryArray::try_from_sparse_iter_with_size(fixed_iter, 4).unwrap(),
2990        ));
2991        let enum_keys = Int32Array::from_iter_values(std::iter::repeat_n(0, num_rows));
2992        let enum_values = StringArray::from_iter_values(["A", "B", "C"]);
2993        let enum_arr =
2994            DictionaryArray::<Int32Type>::try_new(enum_keys, Arc::new(enum_values)).unwrap();
2995        arrays.push(Arc::new(enum_arr));
2996        let duration_values = std::iter::repeat_n(
2997            Some(IntervalMonthDayNanoType::make_value(0, 0, 0)),
2998            num_rows,
2999        );
3000        let duration_arr: IntervalMonthDayNanoArray = duration_values.collect();
3001        arrays.push(Arc::new(duration_arr));
3002        let uuid_bytes = [0u8; 16];
3003        let uuid_iter = std::iter::repeat_n(Some(uuid_bytes), num_rows);
3004        arrays.push(Arc::new(
3005            FixedSizeBinaryArray::try_from_sparse_iter_with_size(uuid_iter, 16).unwrap(),
3006        ));
3007        let item_field = Arc::new(Field::new(
3008            Field::LIST_FIELD_DEFAULT_NAME,
3009            DataType::Int32,
3010            false,
3011        ));
3012        let mut list_builder = ListBuilder::new(Int32Builder::new()).with_field(item_field);
3013        for _ in 0..num_rows {
3014            list_builder.values().append_value(1);
3015            list_builder.values().append_value(2);
3016            list_builder.values().append_value(3);
3017            list_builder.append(true);
3018        }
3019        arrays.push(Arc::new(list_builder.finish()));
3020        let values_field = Arc::new(Field::new("value", DataType::Int64, false));
3021        let mut map_builder = MapBuilder::new(
3022            Some(builder::MapFieldNames {
3023                entry: "entries".to_string(),
3024                key: "key".to_string(),
3025                value: "value".to_string(),
3026            }),
3027            StringBuilder::new(),
3028            Int64Builder::new(),
3029        )
3030        .with_values_field(values_field);
3031        for _ in 0..num_rows {
3032            let (keys, vals) = map_builder.entries();
3033            keys.append_value("a");
3034            vals.append_value(1);
3035            keys.append_value("b");
3036            vals.append_value(2);
3037            map_builder.append(true).unwrap();
3038        }
3039        arrays.push(Arc::new(map_builder.finish()));
3040        let rec_fields: Fields = Fields::from(vec![
3041            Field::new("x", DataType::Int32, false),
3042            Field::new("y", DataType::Utf8, true),
3043        ]);
3044        let mut sb = StructBuilder::new(
3045            rec_fields.clone(),
3046            vec![
3047                Box::new(Int32Builder::new()),
3048                Box::new(StringBuilder::new()),
3049            ],
3050        );
3051        for _ in 0..num_rows {
3052            sb.field_builder::<Int32Builder>(0).unwrap().append_value(7);
3053            sb.field_builder::<StringBuilder>(1).unwrap().append_null();
3054            sb.append(true);
3055        }
3056        arrays.push(Arc::new(sb.finish()));
3057        arrays.push(Arc::new(Int32Array::from_iter(std::iter::repeat_n(
3058            None::<i32>,
3059            num_rows,
3060        ))));
3061        arrays.push(Arc::new(Int32Array::from_iter_values(std::iter::repeat_n(
3062            123, num_rows,
3063        ))));
3064        let expected = RecordBatch::try_new(actual.schema(), arrays).unwrap();
3065        assert_eq!(
3066            actual, expected,
3067            "defaults should materialize correctly for all fields"
3068        );
3069    }
3070
3071    #[test]
3072    fn test_schema_resolution_default_enum_invalid_symbol_errors() {
3073        let path = "test/data/skippable_types.avro";
3074        let bad_schema = make_reader_schema_with_default_fields(
3075            path,
3076            vec![serde_json::json!({
3077                "name":"bad_enum",
3078                "type":{"type":"enum","name":"E","symbols":["A","B","C"]},
3079                "default":"Z"
3080            })],
3081        );
3082        let file = File::open(path).unwrap();
3083        let res = ReaderBuilder::new()
3084            .with_reader_schema(bad_schema)
3085            .build(BufReader::new(file));
3086        let err = res.expect_err("expected enum default validation to fail");
3087        let msg = err.to_string();
3088        let lower_msg = msg.to_lowercase();
3089        assert!(
3090            lower_msg.contains("enum")
3091                && (lower_msg.contains("symbol") || lower_msg.contains("default")),
3092            "unexpected error: {msg}"
3093        );
3094    }
3095
3096    #[test]
3097    fn test_schema_resolution_default_fixed_size_mismatch_errors() {
3098        let path = "test/data/skippable_types.avro";
3099        let bad_schema = make_reader_schema_with_default_fields(
3100            path,
3101            vec![serde_json::json!({
3102                "name":"bad_fixed",
3103                "type":{"type":"fixed","name":"F","size":4},
3104                "default":"ABC"
3105            })],
3106        );
3107        let file = File::open(path).unwrap();
3108        let res = ReaderBuilder::new()
3109            .with_reader_schema(bad_schema)
3110            .build(BufReader::new(file));
3111        let err = res.expect_err("expected fixed default validation to fail");
3112        let msg = err.to_string();
3113        let lower_msg = msg.to_lowercase();
3114        assert!(
3115            lower_msg.contains("fixed")
3116                && (lower_msg.contains("size")
3117                    || lower_msg.contains("length")
3118                    || lower_msg.contains("does not match")),
3119            "unexpected error: {msg}"
3120        );
3121    }
3122
3123    #[test]
3124    // TODO: avoid requiring snappy for this file
3125    #[cfg(feature = "snappy")]
3126    fn test_alltypes_skip_writer_fields_keep_double_only() {
3127        let file = arrow_test_data("avro/alltypes_plain.avro");
3128        let reader_schema =
3129            make_reader_schema_with_selected_fields_in_order(&file, &["double_col"]);
3130        let batch = read_alltypes_with_reader_schema(&file, reader_schema);
3131        let expected = RecordBatch::try_from_iter_with_nullable([(
3132            "double_col",
3133            Arc::new(Float64Array::from_iter_values(
3134                (0..8).map(|x| (x % 2) as f64 * 10.1),
3135            )) as _,
3136            true,
3137        )])
3138        .unwrap();
3139        assert_eq!(batch, expected);
3140    }
3141
3142    #[test]
3143    // TODO: avoid requiring snappy for this file
3144    #[cfg(feature = "snappy")]
3145    fn test_alltypes_skip_writer_fields_reorder_and_skip_many() {
3146        let file = arrow_test_data("avro/alltypes_plain.avro");
3147        let reader_schema =
3148            make_reader_schema_with_selected_fields_in_order(&file, &["timestamp_col", "id"]);
3149        let batch = read_alltypes_with_reader_schema(&file, reader_schema);
3150        let expected = RecordBatch::try_from_iter_with_nullable([
3151            (
3152                "timestamp_col",
3153                Arc::new(
3154                    TimestampMicrosecondArray::from_iter_values([
3155                        1235865600000000, // 2009-03-01T00:00:00.000
3156                        1235865660000000, // 2009-03-01T00:01:00.000
3157                        1238544000000000, // 2009-04-01T00:00:00.000
3158                        1238544060000000, // 2009-04-01T00:01:00.000
3159                        1233446400000000, // 2009-02-01T00:00:00.000
3160                        1233446460000000, // 2009-02-01T00:01:00.000
3161                        1230768000000000, // 2009-01-01T00:00:00.000
3162                        1230768060000000, // 2009-01-01T00:01:00.000
3163                    ])
3164                    .with_timezone("+00:00"),
3165                ) as _,
3166                true,
3167            ),
3168            (
3169                "id",
3170                Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
3171                true,
3172            ),
3173        ])
3174        .unwrap();
3175        assert_eq!(batch, expected);
3176    }
3177
3178    #[test]
3179    fn test_skippable_types_project_each_field_individually() {
3180        let path = "test/data/skippable_types.avro";
3181        let full = read_file(path, 1024, false);
3182        let schema_full = full.schema();
3183        let num_rows = full.num_rows();
3184        let writer_json = load_writer_schema_json(path);
3185        assert_eq!(
3186            writer_json["type"], "record",
3187            "writer schema must be a record"
3188        );
3189        let fields_json = writer_json
3190            .get("fields")
3191            .and_then(|f| f.as_array())
3192            .expect("record has fields");
3193        assert_eq!(
3194            schema_full.fields().len(),
3195            fields_json.len(),
3196            "full read column count vs writer fields"
3197        );
3198        fn rebuild_list_array_with_element(
3199            col: &ArrayRef,
3200            new_elem: Arc<Field>,
3201            is_large: bool,
3202        ) -> ArrayRef {
3203            if is_large {
3204                let list = col
3205                    .as_any()
3206                    .downcast_ref::<LargeListArray>()
3207                    .expect("expected LargeListArray");
3208                let offsets = list.offsets().clone();
3209                let values = list.values().clone();
3210                let validity = list.nulls().cloned();
3211                Arc::new(LargeListArray::try_new(new_elem, offsets, values, validity).unwrap())
3212            } else {
3213                let list = col
3214                    .as_any()
3215                    .downcast_ref::<ListArray>()
3216                    .expect("expected ListArray");
3217                let offsets = list.offsets().clone();
3218                let values = list.values().clone();
3219                let validity = list.nulls().cloned();
3220                Arc::new(ListArray::try_new(new_elem, offsets, values, validity).unwrap())
3221            }
3222        }
3223        for (idx, f) in fields_json.iter().enumerate() {
3224            let name = f
3225                .get("name")
3226                .and_then(|n| n.as_str())
3227                .unwrap_or_else(|| panic!("field at index {idx} has no name"));
3228            let reader_schema = make_reader_schema_with_selected_fields_in_order(path, &[name]);
3229            let projected = read_alltypes_with_reader_schema(path, reader_schema);
3230            assert_eq!(
3231                projected.num_columns(),
3232                1,
3233                "projected batch should contain exactly the selected column '{name}'"
3234            );
3235            assert_eq!(
3236                projected.num_rows(),
3237                num_rows,
3238                "row count mismatch for projected column '{name}'"
3239            );
3240            let col_full = full.column(idx).clone();
3241            let full_field = schema_full.field(idx).as_ref().clone();
3242            let proj_field_ref = projected.schema().field(0).clone();
3243            let proj_field = proj_field_ref.as_ref();
3244            let top_meta = proj_field.metadata().clone();
3245            let (expected_field_ref, expected_col): (Arc<Field>, ArrayRef) =
3246                match (full_field.data_type(), proj_field.data_type()) {
3247                    (&DataType::List(_), DataType::List(proj_elem)) => {
3248                        let new_col =
3249                            rebuild_list_array_with_element(&col_full, proj_elem.clone(), false);
3250                        let nf = Field::new(
3251                            full_field.name().clone(),
3252                            proj_field.data_type().clone(),
3253                            full_field.is_nullable(),
3254                        )
3255                        .with_metadata(top_meta);
3256                        (Arc::new(nf), new_col)
3257                    }
3258                    (&DataType::LargeList(_), DataType::LargeList(proj_elem)) => {
3259                        let new_col =
3260                            rebuild_list_array_with_element(&col_full, proj_elem.clone(), true);
3261                        let nf = Field::new(
3262                            full_field.name().clone(),
3263                            proj_field.data_type().clone(),
3264                            full_field.is_nullable(),
3265                        )
3266                        .with_metadata(top_meta);
3267                        (Arc::new(nf), new_col)
3268                    }
3269                    _ => {
3270                        let nf = full_field.with_metadata(top_meta);
3271                        (Arc::new(nf), col_full)
3272                    }
3273                };
3274
3275            let expected = RecordBatch::try_new(
3276                Arc::new(Schema::new(vec![expected_field_ref])),
3277                vec![expected_col],
3278            )
3279            .unwrap();
3280            assert_eq!(
3281                projected, expected,
3282                "projected column '{name}' mismatch vs full read column"
3283            );
3284        }
3285    }
3286
3287    #[test]
3288    fn test_union_fields_avro_nullable_and_general_unions() {
3289        let path = "test/data/union_fields.avro";
3290        let batch = read_file(path, 1024, false);
3291        let schema = batch.schema();
3292        let idx = schema.index_of("nullable_int_nullfirst").unwrap();
3293        let a = batch.column(idx).as_primitive::<Int32Type>();
3294        assert_eq!(a.len(), 4);
3295        assert!(a.is_null(0));
3296        assert_eq!(a.value(1), 42);
3297        assert!(a.is_null(2));
3298        assert_eq!(a.value(3), 0);
3299        let idx = schema.index_of("nullable_string_nullsecond").unwrap();
3300        let s = batch
3301            .column(idx)
3302            .as_any()
3303            .downcast_ref::<StringArray>()
3304            .expect("nullable_string_nullsecond should be Utf8");
3305        assert_eq!(s.len(), 4);
3306        assert_eq!(s.value(0), "s1");
3307        assert!(s.is_null(1));
3308        assert_eq!(s.value(2), "s3");
3309        assert!(s.is_valid(3)); // empty string, not null
3310        assert_eq!(s.value(3), "");
3311        let idx = schema.index_of("union_prim").unwrap();
3312        let u = batch
3313            .column(idx)
3314            .as_any()
3315            .downcast_ref::<UnionArray>()
3316            .expect("union_prim should be Union");
3317        let fields = match u.data_type() {
3318            DataType::Union(fields, mode) => {
3319                assert!(matches!(mode, UnionMode::Dense), "expect dense unions");
3320                fields
3321            }
3322            other => panic!("expected Union, got {other:?}"),
3323        };
3324        let tid_by_name = |name: &str| -> i8 {
3325            for (tid, f) in fields.iter() {
3326                if f.name() == name {
3327                    return tid;
3328                }
3329            }
3330            panic!("union child '{name}' not found");
3331        };
3332        let expected_type_ids = vec![
3333            tid_by_name("long"),
3334            tid_by_name("int"),
3335            tid_by_name("float"),
3336            tid_by_name("double"),
3337        ];
3338        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
3339        assert_eq!(
3340            type_ids, expected_type_ids,
3341            "branch selection for union_prim rows"
3342        );
3343        let longs = u
3344            .child(tid_by_name("long"))
3345            .as_any()
3346            .downcast_ref::<Int64Array>()
3347            .unwrap();
3348        assert_eq!(longs.len(), 1);
3349        let ints = u
3350            .child(tid_by_name("int"))
3351            .as_any()
3352            .downcast_ref::<Int32Array>()
3353            .unwrap();
3354        assert_eq!(ints.len(), 1);
3355        let floats = u
3356            .child(tid_by_name("float"))
3357            .as_any()
3358            .downcast_ref::<Float32Array>()
3359            .unwrap();
3360        assert_eq!(floats.len(), 1);
3361        let doubles = u
3362            .child(tid_by_name("double"))
3363            .as_any()
3364            .downcast_ref::<Float64Array>()
3365            .unwrap();
3366        assert_eq!(doubles.len(), 1);
3367        let idx = schema.index_of("union_bytes_vs_string").unwrap();
3368        let u = batch
3369            .column(idx)
3370            .as_any()
3371            .downcast_ref::<UnionArray>()
3372            .expect("union_bytes_vs_string should be Union");
3373        let fields = match u.data_type() {
3374            DataType::Union(fields, _) => fields,
3375            other => panic!("expected Union, got {other:?}"),
3376        };
3377        let tid_by_name = |name: &str| -> i8 {
3378            for (tid, f) in fields.iter() {
3379                if f.name() == name {
3380                    return tid;
3381                }
3382            }
3383            panic!("union child '{name}' not found");
3384        };
3385        let tid_bytes = tid_by_name("bytes");
3386        let tid_string = tid_by_name("string");
3387        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
3388        assert_eq!(
3389            type_ids,
3390            vec![tid_bytes, tid_string, tid_string, tid_bytes],
3391            "branch selection for bytes/string union"
3392        );
3393        let s_child = u
3394            .child(tid_string)
3395            .as_any()
3396            .downcast_ref::<StringArray>()
3397            .unwrap();
3398        assert_eq!(s_child.len(), 2);
3399        assert_eq!(s_child.value(0), "hello");
3400        assert_eq!(s_child.value(1), "world");
3401        let b_child = u
3402            .child(tid_bytes)
3403            .as_any()
3404            .downcast_ref::<BinaryArray>()
3405            .unwrap();
3406        assert_eq!(b_child.len(), 2);
3407        assert_eq!(b_child.value(0), &[0x00, 0xFF, 0x7F]);
3408        assert_eq!(b_child.value(1), b""); // previously: &[]
3409        let idx = schema.index_of("union_enum_records_array_map").unwrap();
3410        let u = batch
3411            .column(idx)
3412            .as_any()
3413            .downcast_ref::<UnionArray>()
3414            .expect("union_enum_records_array_map should be Union");
3415        let fields = match u.data_type() {
3416            DataType::Union(fields, _) => fields,
3417            other => panic!("expected Union, got {other:?}"),
3418        };
3419        let mut tid_enum: Option<i8> = None;
3420        let mut tid_rec_a: Option<i8> = None;
3421        let mut tid_rec_b: Option<i8> = None;
3422        let mut tid_array: Option<i8> = None;
3423        for (tid, f) in fields.iter() {
3424            match f.data_type() {
3425                DataType::Dictionary(_, _) => tid_enum = Some(tid),
3426                DataType::Struct(childs) => {
3427                    if childs.len() == 2 && childs[0].name() == "a" && childs[1].name() == "b" {
3428                        tid_rec_a = Some(tid);
3429                    } else if childs.len() == 2
3430                        && childs[0].name() == "x"
3431                        && childs[1].name() == "y"
3432                    {
3433                        tid_rec_b = Some(tid);
3434                    }
3435                }
3436                DataType::List(_) => tid_array = Some(tid),
3437                _ => {}
3438            }
3439        }
3440        let (tid_enum, tid_rec_a, tid_rec_b, tid_array) = (
3441            tid_enum.expect("enum child"),
3442            tid_rec_a.expect("RecA child"),
3443            tid_rec_b.expect("RecB child"),
3444            tid_array.expect("array<long> child"),
3445        );
3446        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
3447        assert_eq!(
3448            type_ids,
3449            vec![tid_enum, tid_rec_a, tid_rec_b, tid_array],
3450            "branch selection for complex union"
3451        );
3452        let dict = u
3453            .child(tid_enum)
3454            .as_any()
3455            .downcast_ref::<DictionaryArray<Int32Type>>()
3456            .unwrap();
3457        assert_eq!(dict.len(), 1);
3458        assert!(dict.is_valid(0));
3459        let rec_a = u
3460            .child(tid_rec_a)
3461            .as_any()
3462            .downcast_ref::<StructArray>()
3463            .unwrap();
3464        assert_eq!(rec_a.len(), 1);
3465        let a_val = rec_a
3466            .column_by_name("a")
3467            .unwrap()
3468            .as_any()
3469            .downcast_ref::<Int32Array>()
3470            .unwrap();
3471        assert_eq!(a_val.value(0), 7);
3472        let b_val = rec_a
3473            .column_by_name("b")
3474            .unwrap()
3475            .as_any()
3476            .downcast_ref::<StringArray>()
3477            .unwrap();
3478        assert_eq!(b_val.value(0), "x");
3479        // RecB row: {"x": 123456789, "y": b"\xFF\x00"}
3480        let rec_b = u
3481            .child(tid_rec_b)
3482            .as_any()
3483            .downcast_ref::<StructArray>()
3484            .unwrap();
3485        let x_val = rec_b
3486            .column_by_name("x")
3487            .unwrap()
3488            .as_any()
3489            .downcast_ref::<Int64Array>()
3490            .unwrap();
3491        assert_eq!(x_val.value(0), 123_456_789_i64);
3492        let y_val = rec_b
3493            .column_by_name("y")
3494            .unwrap()
3495            .as_any()
3496            .downcast_ref::<BinaryArray>()
3497            .unwrap();
3498        assert_eq!(y_val.value(0), &[0xFF, 0x00]);
3499        let arr = u
3500            .child(tid_array)
3501            .as_any()
3502            .downcast_ref::<ListArray>()
3503            .unwrap();
3504        assert_eq!(arr.len(), 1);
3505        let first_values = arr.value(0);
3506        let longs = first_values.as_any().downcast_ref::<Int64Array>().unwrap();
3507        assert_eq!(longs.len(), 3);
3508        assert_eq!(longs.value(0), 1);
3509        assert_eq!(longs.value(1), 2);
3510        assert_eq!(longs.value(2), 3);
3511        let idx = schema.index_of("union_date_or_fixed4").unwrap();
3512        let u = batch
3513            .column(idx)
3514            .as_any()
3515            .downcast_ref::<UnionArray>()
3516            .expect("union_date_or_fixed4 should be Union");
3517        let fields = match u.data_type() {
3518            DataType::Union(fields, _) => fields,
3519            other => panic!("expected Union, got {other:?}"),
3520        };
3521        let mut tid_date: Option<i8> = None;
3522        let mut tid_fixed: Option<i8> = None;
3523        for (tid, f) in fields.iter() {
3524            match f.data_type() {
3525                DataType::Date32 => tid_date = Some(tid),
3526                DataType::FixedSizeBinary(4) => tid_fixed = Some(tid),
3527                _ => {}
3528            }
3529        }
3530        let (tid_date, tid_fixed) = (tid_date.expect("date"), tid_fixed.expect("fixed(4)"));
3531        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
3532        assert_eq!(
3533            type_ids,
3534            vec![tid_date, tid_fixed, tid_date, tid_fixed],
3535            "branch selection for date/fixed4 union"
3536        );
3537        let dates = u
3538            .child(tid_date)
3539            .as_any()
3540            .downcast_ref::<Date32Array>()
3541            .unwrap();
3542        assert_eq!(dates.len(), 2);
3543        assert_eq!(dates.value(0), 19_000); // ~2022‑01‑15
3544        assert_eq!(dates.value(1), 0); // epoch
3545        let fixed = u
3546            .child(tid_fixed)
3547            .as_any()
3548            .downcast_ref::<FixedSizeBinaryArray>()
3549            .unwrap();
3550        assert_eq!(fixed.len(), 2);
3551        assert_eq!(fixed.value(0), b"ABCD");
3552        assert_eq!(fixed.value(1), &[0x00, 0x11, 0x22, 0x33]);
3553    }
3554
3555    #[test]
3556    fn test_union_schema_resolution_all_type_combinations() {
3557        let path = "test/data/union_fields.avro";
3558        let baseline = read_file(path, 1024, false);
3559        let baseline_schema = baseline.schema();
3560        let mut root = load_writer_schema_json(path);
3561        assert_eq!(root["type"], "record", "writer schema must be a record");
3562        let fields = root
3563            .get_mut("fields")
3564            .and_then(|f| f.as_array_mut())
3565            .expect("record has fields");
3566        fn is_named_type(obj: &Value, ty: &str, nm: &str) -> bool {
3567            obj.get("type").and_then(|v| v.as_str()) == Some(ty)
3568                && obj.get("name").and_then(|v| v.as_str()) == Some(nm)
3569        }
3570        fn is_logical(obj: &Value, prim: &str, lt: &str) -> bool {
3571            obj.get("type").and_then(|v| v.as_str()) == Some(prim)
3572                && obj.get("logicalType").and_then(|v| v.as_str()) == Some(lt)
3573        }
3574        fn find_first(arr: &[Value], pred: impl Fn(&Value) -> bool) -> Option<Value> {
3575            arr.iter().find(|v| pred(v)).cloned()
3576        }
3577        fn prim(s: &str) -> Value {
3578            Value::String(s.to_string())
3579        }
3580        for f in fields.iter_mut() {
3581            let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
3582                continue;
3583            };
3584            match name {
3585                // Flip null ordering – should not affect values
3586                "nullable_int_nullfirst" => {
3587                    f["type"] = json!(["int", "null"]);
3588                }
3589                "nullable_string_nullsecond" => {
3590                    f["type"] = json!(["null", "string"]);
3591                }
3592                "union_prim" => {
3593                    let orig = f["type"].as_array().unwrap().clone();
3594                    let long = prim("long");
3595                    let double = prim("double");
3596                    let string = prim("string");
3597                    let bytes = prim("bytes");
3598                    let boolean = prim("boolean");
3599                    assert!(orig.contains(&long));
3600                    assert!(orig.contains(&double));
3601                    assert!(orig.contains(&string));
3602                    assert!(orig.contains(&bytes));
3603                    assert!(orig.contains(&boolean));
3604                    f["type"] = json!([long, double, string, bytes, boolean]);
3605                }
3606                "union_bytes_vs_string" => {
3607                    f["type"] = json!(["string", "bytes"]);
3608                }
3609                "union_fixed_dur_decfix" => {
3610                    let orig = f["type"].as_array().unwrap().clone();
3611                    let fx8 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx8")).unwrap();
3612                    let dur12 = find_first(&orig, |o| is_named_type(o, "fixed", "Dur12")).unwrap();
3613                    let decfix16 =
3614                        find_first(&orig, |o| is_named_type(o, "fixed", "DecFix16")).unwrap();
3615                    f["type"] = json!([decfix16, dur12, fx8]);
3616                }
3617                "union_enum_records_array_map" => {
3618                    let orig = f["type"].as_array().unwrap().clone();
3619                    let enum_color = find_first(&orig, |o| {
3620                        o.get("type").and_then(|v| v.as_str()) == Some("enum")
3621                    })
3622                    .unwrap();
3623                    let rec_a = find_first(&orig, |o| is_named_type(o, "record", "RecA")).unwrap();
3624                    let rec_b = find_first(&orig, |o| is_named_type(o, "record", "RecB")).unwrap();
3625                    let arr = find_first(&orig, |o| {
3626                        o.get("type").and_then(|v| v.as_str()) == Some("array")
3627                    })
3628                    .unwrap();
3629                    let map = find_first(&orig, |o| {
3630                        o.get("type").and_then(|v| v.as_str()) == Some("map")
3631                    })
3632                    .unwrap();
3633                    f["type"] = json!([arr, map, rec_b, rec_a, enum_color]);
3634                }
3635                "union_date_or_fixed4" => {
3636                    let orig = f["type"].as_array().unwrap().clone();
3637                    let date = find_first(&orig, |o| is_logical(o, "int", "date")).unwrap();
3638                    let fx4 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx4")).unwrap();
3639                    f["type"] = json!([fx4, date]);
3640                }
3641                "union_time_millis_or_enum" => {
3642                    let orig = f["type"].as_array().unwrap().clone();
3643                    let time_ms =
3644                        find_first(&orig, |o| is_logical(o, "int", "time-millis")).unwrap();
3645                    let en = find_first(&orig, |o| {
3646                        o.get("type").and_then(|v| v.as_str()) == Some("enum")
3647                    })
3648                    .unwrap();
3649                    f["type"] = json!([en, time_ms]);
3650                }
3651                "union_time_micros_or_string" => {
3652                    let orig = f["type"].as_array().unwrap().clone();
3653                    let time_us =
3654                        find_first(&orig, |o| is_logical(o, "long", "time-micros")).unwrap();
3655                    f["type"] = json!(["string", time_us]);
3656                }
3657                "union_ts_millis_utc_or_array" => {
3658                    let orig = f["type"].as_array().unwrap().clone();
3659                    let ts_ms =
3660                        find_first(&orig, |o| is_logical(o, "long", "timestamp-millis")).unwrap();
3661                    let arr = find_first(&orig, |o| {
3662                        o.get("type").and_then(|v| v.as_str()) == Some("array")
3663                    })
3664                    .unwrap();
3665                    f["type"] = json!([arr, ts_ms]);
3666                }
3667                "union_ts_micros_local_or_bytes" => {
3668                    let orig = f["type"].as_array().unwrap().clone();
3669                    let lts_us =
3670                        find_first(&orig, |o| is_logical(o, "long", "local-timestamp-micros"))
3671                            .unwrap();
3672                    f["type"] = json!(["bytes", lts_us]);
3673                }
3674                "union_uuid_or_fixed10" => {
3675                    let orig = f["type"].as_array().unwrap().clone();
3676                    let uuid = find_first(&orig, |o| is_logical(o, "string", "uuid")).unwrap();
3677                    let fx10 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx10")).unwrap();
3678                    f["type"] = json!([fx10, uuid]);
3679                }
3680                "union_dec_bytes_or_dec_fixed" => {
3681                    let orig = f["type"].as_array().unwrap().clone();
3682                    let dec_bytes = find_first(&orig, |o| {
3683                        o.get("type").and_then(|v| v.as_str()) == Some("bytes")
3684                            && o.get("logicalType").and_then(|v| v.as_str()) == Some("decimal")
3685                    })
3686                    .unwrap();
3687                    let dec_fix = find_first(&orig, |o| {
3688                        is_named_type(o, "fixed", "DecFix20")
3689                            && o.get("logicalType").and_then(|v| v.as_str()) == Some("decimal")
3690                    })
3691                    .unwrap();
3692                    f["type"] = json!([dec_fix, dec_bytes]);
3693                }
3694                "union_null_bytes_string" => {
3695                    f["type"] = json!(["bytes", "string", "null"]);
3696                }
3697                "array_of_union" => {
3698                    let obj = f
3699                        .get_mut("type")
3700                        .expect("array type")
3701                        .as_object_mut()
3702                        .unwrap();
3703                    obj.insert("items".to_string(), json!(["string", "long"]));
3704                }
3705                "map_of_union" => {
3706                    let obj = f
3707                        .get_mut("type")
3708                        .expect("map type")
3709                        .as_object_mut()
3710                        .unwrap();
3711                    obj.insert("values".to_string(), json!(["double", "null"]));
3712                }
3713                "record_with_union_field" => {
3714                    let rec = f
3715                        .get_mut("type")
3716                        .expect("record type")
3717                        .as_object_mut()
3718                        .unwrap();
3719                    let rec_fields = rec.get_mut("fields").unwrap().as_array_mut().unwrap();
3720                    let mut found = false;
3721                    for rf in rec_fields.iter_mut() {
3722                        if rf.get("name").and_then(|v| v.as_str()) == Some("u") {
3723                            rf["type"] = json!(["string", "long"]); // rely on int→long promotion
3724                            found = true;
3725                            break;
3726                        }
3727                    }
3728                    assert!(found, "field 'u' expected in HasUnion");
3729                }
3730                "union_ts_micros_utc_or_map" => {
3731                    let orig = f["type"].as_array().unwrap().clone();
3732                    let ts_us =
3733                        find_first(&orig, |o| is_logical(o, "long", "timestamp-micros")).unwrap();
3734                    let map = find_first(&orig, |o| {
3735                        o.get("type").and_then(|v| v.as_str()) == Some("map")
3736                    })
3737                    .unwrap();
3738                    f["type"] = json!([map, ts_us]);
3739                }
3740                "union_ts_millis_local_or_string" => {
3741                    let orig = f["type"].as_array().unwrap().clone();
3742                    let lts_ms =
3743                        find_first(&orig, |o| is_logical(o, "long", "local-timestamp-millis"))
3744                            .unwrap();
3745                    f["type"] = json!(["string", lts_ms]);
3746                }
3747                "union_bool_or_string" => {
3748                    f["type"] = json!(["string", "boolean"]);
3749                }
3750                _ => {}
3751            }
3752        }
3753        let reader_schema = AvroSchema::new(root.to_string());
3754        let resolved = read_alltypes_with_reader_schema(path, reader_schema);
3755
3756        fn branch_token(dt: &DataType) -> String {
3757            match dt {
3758                DataType::Null => "null".into(),
3759                DataType::Boolean => "boolean".into(),
3760                DataType::Int32 => "int".into(),
3761                DataType::Int64 => "long".into(),
3762                DataType::Float32 => "float".into(),
3763                DataType::Float64 => "double".into(),
3764                DataType::Binary => "bytes".into(),
3765                DataType::Utf8 => "string".into(),
3766                DataType::Date32 => "date".into(),
3767                DataType::Time32(arrow_schema::TimeUnit::Millisecond) => "time-millis".into(),
3768                DataType::Time64(arrow_schema::TimeUnit::Microsecond) => "time-micros".into(),
3769                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => if tz.is_some() {
3770                    "timestamp-millis"
3771                } else {
3772                    "local-timestamp-millis"
3773                }
3774                .into(),
3775                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => if tz.is_some() {
3776                    "timestamp-micros"
3777                } else {
3778                    "local-timestamp-micros"
3779                }
3780                .into(),
3781                DataType::Interval(IntervalUnit::MonthDayNano) => "duration".into(),
3782                DataType::FixedSizeBinary(n) => format!("fixed{n}"),
3783                DataType::Dictionary(_, _) => "enum".into(),
3784                DataType::Decimal128(p, s) => format!("decimal({p},{s})"),
3785                DataType::Decimal256(p, s) => format!("decimal({p},{s})"),
3786                #[cfg(feature = "small_decimals")]
3787                DataType::Decimal64(p, s) => format!("decimal({p},{s})"),
3788                DataType::Struct(fields) => {
3789                    if fields.len() == 2 && fields[0].name() == "a" && fields[1].name() == "b" {
3790                        "record:RecA".into()
3791                    } else if fields.len() == 2
3792                        && fields[0].name() == "x"
3793                        && fields[1].name() == "y"
3794                    {
3795                        "record:RecB".into()
3796                    } else {
3797                        "record".into()
3798                    }
3799                }
3800                DataType::List(_) => "array".into(),
3801                DataType::Map(_, _) => "map".into(),
3802                other => format!("{other:?}"),
3803            }
3804        }
3805
3806        fn union_tokens(u: &UnionArray) -> (Vec<i8>, HashMap<i8, String>) {
3807            let fields = match u.data_type() {
3808                DataType::Union(fields, _) => fields,
3809                other => panic!("expected Union, got {other:?}"),
3810            };
3811            let mut dict: HashMap<i8, String> = HashMap::with_capacity(fields.len());
3812            for (tid, f) in fields.iter() {
3813                dict.insert(tid, branch_token(f.data_type()));
3814            }
3815            let ids: Vec<i8> = u.type_ids().iter().copied().collect();
3816            (ids, dict)
3817        }
3818
3819        fn expected_token(field_name: &str, writer_token: &str) -> String {
3820            match field_name {
3821                "union_prim" => match writer_token {
3822                    "int" => "long".into(),
3823                    "float" => "double".into(),
3824                    other => other.into(),
3825                },
3826                "record_with_union_field.u" => match writer_token {
3827                    "int" => "long".into(),
3828                    other => other.into(),
3829                },
3830                _ => writer_token.into(),
3831            }
3832        }
3833
3834        fn get_union<'a>(
3835            rb: &'a RecordBatch,
3836            schema: arrow_schema::SchemaRef,
3837            fname: &str,
3838        ) -> &'a UnionArray {
3839            let idx = schema.index_of(fname).unwrap();
3840            rb.column(idx)
3841                .as_any()
3842                .downcast_ref::<UnionArray>()
3843                .unwrap_or_else(|| panic!("{fname} should be a Union"))
3844        }
3845
3846        fn assert_union_equivalent(field_name: &str, u_writer: &UnionArray, u_reader: &UnionArray) {
3847            let (ids_w, dict_w) = union_tokens(u_writer);
3848            let (ids_r, dict_r) = union_tokens(u_reader);
3849            assert_eq!(
3850                ids_w.len(),
3851                ids_r.len(),
3852                "{field_name}: row count mismatch between baseline and resolved"
3853            );
3854            for (i, (id_w, id_r)) in ids_w.iter().zip(ids_r.iter()).enumerate() {
3855                let w_tok = dict_w.get(id_w).unwrap();
3856                let want = expected_token(field_name, w_tok);
3857                let got = dict_r.get(id_r).unwrap();
3858                assert_eq!(
3859                    got, &want,
3860                    "{field_name}: row {i} resolved to wrong union branch (writer={w_tok}, expected={want}, got={got})"
3861                );
3862            }
3863        }
3864
3865        for (fname, dt) in [
3866            ("nullable_int_nullfirst", DataType::Int32),
3867            ("nullable_string_nullsecond", DataType::Utf8),
3868        ] {
3869            let idx_b = baseline_schema.index_of(fname).unwrap();
3870            let idx_r = resolved.schema().index_of(fname).unwrap();
3871            let col_b = baseline.column(idx_b);
3872            let col_r = resolved.column(idx_r);
3873            assert_eq!(
3874                col_b.data_type(),
3875                &dt,
3876                "baseline {fname} should decode as non-union with nullability"
3877            );
3878            assert_eq!(
3879                col_b.as_ref(),
3880                col_r.as_ref(),
3881                "{fname}: values must be identical regardless of null-branch order"
3882            );
3883        }
3884        let union_fields = [
3885            "union_prim",
3886            "union_bytes_vs_string",
3887            "union_fixed_dur_decfix",
3888            "union_enum_records_array_map",
3889            "union_date_or_fixed4",
3890            "union_time_millis_or_enum",
3891            "union_time_micros_or_string",
3892            "union_ts_millis_utc_or_array",
3893            "union_ts_micros_local_or_bytes",
3894            "union_uuid_or_fixed10",
3895            "union_dec_bytes_or_dec_fixed",
3896            "union_null_bytes_string",
3897            "union_ts_micros_utc_or_map",
3898            "union_ts_millis_local_or_string",
3899            "union_bool_or_string",
3900        ];
3901        for fname in union_fields {
3902            let u_b = get_union(&baseline, baseline_schema.clone(), fname);
3903            let u_r = get_union(&resolved, resolved.schema(), fname);
3904            assert_union_equivalent(fname, u_b, u_r);
3905        }
3906        {
3907            let fname = "array_of_union";
3908            let idx_b = baseline_schema.index_of(fname).unwrap();
3909            let idx_r = resolved.schema().index_of(fname).unwrap();
3910            let arr_b = baseline
3911                .column(idx_b)
3912                .as_any()
3913                .downcast_ref::<ListArray>()
3914                .expect("array_of_union should be a List");
3915            let arr_r = resolved
3916                .column(idx_r)
3917                .as_any()
3918                .downcast_ref::<ListArray>()
3919                .expect("array_of_union should be a List");
3920            assert_eq!(
3921                arr_b.value_offsets(),
3922                arr_r.value_offsets(),
3923                "{fname}: list offsets changed after resolution"
3924            );
3925            let u_b = arr_b
3926                .values()
3927                .as_any()
3928                .downcast_ref::<UnionArray>()
3929                .expect("array items should be Union");
3930            let u_r = arr_r
3931                .values()
3932                .as_any()
3933                .downcast_ref::<UnionArray>()
3934                .expect("array items should be Union");
3935            let (ids_b, dict_b) = union_tokens(u_b);
3936            let (ids_r, dict_r) = union_tokens(u_r);
3937            assert_eq!(ids_b.len(), ids_r.len(), "{fname}: values length mismatch");
3938            for (i, (id_b, id_r)) in ids_b.iter().zip(ids_r.iter()).enumerate() {
3939                let w_tok = dict_b.get(id_b).unwrap();
3940                let got = dict_r.get(id_r).unwrap();
3941                assert_eq!(
3942                    got, w_tok,
3943                    "{fname}: value {i} resolved to wrong branch (writer={w_tok}, got={got})"
3944                );
3945            }
3946        }
3947        {
3948            let fname = "map_of_union";
3949            let idx_b = baseline_schema.index_of(fname).unwrap();
3950            let idx_r = resolved.schema().index_of(fname).unwrap();
3951            let map_b = baseline
3952                .column(idx_b)
3953                .as_any()
3954                .downcast_ref::<MapArray>()
3955                .expect("map_of_union should be a Map");
3956            let map_r = resolved
3957                .column(idx_r)
3958                .as_any()
3959                .downcast_ref::<MapArray>()
3960                .expect("map_of_union should be a Map");
3961            assert_eq!(
3962                map_b.value_offsets(),
3963                map_r.value_offsets(),
3964                "{fname}: map value offsets changed after resolution"
3965            );
3966            let ent_b = map_b.entries();
3967            let ent_r = map_r.entries();
3968            let val_b_any = ent_b.column(1).as_ref();
3969            let val_r_any = ent_r.column(1).as_ref();
3970            let b_union = val_b_any.as_any().downcast_ref::<UnionArray>();
3971            let r_union = val_r_any.as_any().downcast_ref::<UnionArray>();
3972            if let (Some(u_b), Some(u_r)) = (b_union, r_union) {
3973                assert_union_equivalent(fname, u_b, u_r);
3974            } else {
3975                assert_eq!(
3976                    val_b_any.data_type(),
3977                    val_r_any.data_type(),
3978                    "{fname}: value data types differ after resolution"
3979                );
3980                assert_eq!(
3981                    val_b_any, val_r_any,
3982                    "{fname}: value arrays differ after resolution (nullable value column case)"
3983                );
3984                let value_nullable = |m: &MapArray| -> bool {
3985                    match m.data_type() {
3986                        DataType::Map(entries_field, _sorted) => match entries_field.data_type() {
3987                            DataType::Struct(fields) => {
3988                                assert_eq!(fields.len(), 2, "entries struct must have 2 fields");
3989                                assert_eq!(fields[0].name(), "key");
3990                                assert_eq!(fields[1].name(), "value");
3991                                fields[1].is_nullable()
3992                            }
3993                            other => panic!("Map entries field must be Struct, got {other:?}"),
3994                        },
3995                        other => panic!("expected Map data type, got {other:?}"),
3996                    }
3997                };
3998                assert!(
3999                    value_nullable(map_b),
4000                    "{fname}: baseline Map value field should be nullable per Arrow spec"
4001                );
4002                assert!(
4003                    value_nullable(map_r),
4004                    "{fname}: resolved Map value field should be nullable per Arrow spec"
4005                );
4006            }
4007        }
4008        {
4009            let fname = "record_with_union_field";
4010            let idx_b = baseline_schema.index_of(fname).unwrap();
4011            let idx_r = resolved.schema().index_of(fname).unwrap();
4012            let rec_b = baseline
4013                .column(idx_b)
4014                .as_any()
4015                .downcast_ref::<StructArray>()
4016                .expect("record_with_union_field should be a Struct");
4017            let rec_r = resolved
4018                .column(idx_r)
4019                .as_any()
4020                .downcast_ref::<StructArray>()
4021                .expect("record_with_union_field should be a Struct");
4022            let u_b = rec_b
4023                .column_by_name("u")
4024                .unwrap()
4025                .as_any()
4026                .downcast_ref::<UnionArray>()
4027                .expect("field 'u' should be Union (baseline)");
4028            let u_r = rec_r
4029                .column_by_name("u")
4030                .unwrap()
4031                .as_any()
4032                .downcast_ref::<UnionArray>()
4033                .expect("field 'u' should be Union (resolved)");
4034            assert_union_equivalent("record_with_union_field.u", u_b, u_r);
4035        }
4036    }
4037
4038    #[test]
4039    fn test_union_fields_end_to_end_expected_arrays() {
4040        fn tid_by_name(fields: &UnionFields, want: &str) -> i8 {
4041            for (tid, f) in fields.iter() {
4042                if f.name() == want {
4043                    return tid;
4044                }
4045            }
4046            panic!("union child '{want}' not found")
4047        }
4048
4049        fn tid_by_dt(fields: &UnionFields, pred: impl Fn(&DataType) -> bool) -> i8 {
4050            for (tid, f) in fields.iter() {
4051                if pred(f.data_type()) {
4052                    return tid;
4053                }
4054            }
4055            panic!("no union child matches predicate");
4056        }
4057
4058        fn uuid16_from_str(s: &str) -> [u8; 16] {
4059            fn hex(b: u8) -> u8 {
4060                match b {
4061                    b'0'..=b'9' => b - b'0',
4062                    b'a'..=b'f' => b - b'a' + 10,
4063                    b'A'..=b'F' => b - b'A' + 10,
4064                    _ => panic!("invalid hex"),
4065                }
4066            }
4067            let mut out = [0u8; 16];
4068            let bytes = s.as_bytes();
4069            let (mut i, mut j) = (0, 0);
4070            while i < bytes.len() {
4071                if bytes[i] == b'-' {
4072                    i += 1;
4073                    continue;
4074                }
4075                let hi = hex(bytes[i]);
4076                let lo = hex(bytes[i + 1]);
4077                out[j] = (hi << 4) | lo;
4078                j += 1;
4079                i += 2;
4080            }
4081            assert_eq!(j, 16, "uuid must decode to 16 bytes");
4082            out
4083        }
4084
4085        fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
4086            match dt {
4087                DataType::Null => Arc::new(NullArray::new(0)),
4088                DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
4089                DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
4090                DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
4091                DataType::Float32 => Arc::new(arrow_array::Float32Array::from(Vec::<f32>::new())),
4092                DataType::Float64 => Arc::new(arrow_array::Float64Array::from(Vec::<f64>::new())),
4093                DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
4094                DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
4095                DataType::Date32 => Arc::new(arrow_array::Date32Array::from(Vec::<i32>::new())),
4096                DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
4097                    Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
4098                }
4099                DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
4100                    Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
4101                }
4102                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
4103                    let a = TimestampMillisecondArray::from(Vec::<i64>::new());
4104                    Arc::new(if let Some(tz) = tz {
4105                        a.with_timezone(tz.clone())
4106                    } else {
4107                        a
4108                    })
4109                }
4110                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
4111                    let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
4112                    Arc::new(if let Some(tz) = tz {
4113                        a.with_timezone(tz.clone())
4114                    } else {
4115                        a
4116                    })
4117                }
4118                DataType::Interval(IntervalUnit::MonthDayNano) => {
4119                    Arc::new(arrow_array::IntervalMonthDayNanoArray::from(Vec::<
4120                        IntervalMonthDayNano,
4121                    >::new(
4122                    )))
4123                }
4124                DataType::FixedSizeBinary(n) => Arc::new(FixedSizeBinaryArray::new_null(*n, 0)),
4125                DataType::Dictionary(k, v) => {
4126                    assert_eq!(**k, DataType::Int32, "expect int32 keys for enums");
4127                    let keys = Int32Array::from(Vec::<i32>::new());
4128                    let values = match v.as_ref() {
4129                        DataType::Utf8 => {
4130                            Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
4131                        }
4132                        other => panic!("unexpected dictionary value type {other:?}"),
4133                    };
4134                    Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4135                }
4136                DataType::List(field) => {
4137                    let values: ArrayRef = match field.data_type() {
4138                        DataType::Int32 => {
4139                            Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
4140                        }
4141                        DataType::Int64 => {
4142                            Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
4143                        }
4144                        DataType::Utf8 => {
4145                            Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
4146                        }
4147                        DataType::Union(_, _) => {
4148                            let (uf, _) = if let DataType::Union(f, m) = field.data_type() {
4149                                (f.clone(), m)
4150                            } else {
4151                                unreachable!()
4152                            };
4153                            let children: Vec<ArrayRef> = uf
4154                                .iter()
4155                                .map(|(_, f)| empty_child_for(f.data_type()))
4156                                .collect();
4157                            Arc::new(
4158                                UnionArray::try_new(
4159                                    uf.clone(),
4160                                    ScalarBuffer::<i8>::from(Vec::<i8>::new()),
4161                                    Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
4162                                    children,
4163                                )
4164                                .unwrap(),
4165                            ) as ArrayRef
4166                        }
4167                        other => panic!("unsupported list item type: {other:?}"),
4168                    };
4169                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
4170                    Arc::new(ListArray::try_new(field.clone(), offsets, values, None).unwrap())
4171                }
4172                DataType::Map(entry_field, ordered) => {
4173                    let DataType::Struct(childs) = entry_field.data_type() else {
4174                        panic!("map entries must be struct")
4175                    };
4176                    let key_field = &childs[0];
4177                    let val_field = &childs[1];
4178                    assert_eq!(key_field.data_type(), &DataType::Utf8);
4179                    let keys = StringArray::from(Vec::<&str>::new());
4180                    let vals: ArrayRef = match val_field.data_type() {
4181                        DataType::Float64 => {
4182                            Arc::new(arrow_array::Float64Array::from(Vec::<f64>::new())) as ArrayRef
4183                        }
4184                        DataType::Int64 => {
4185                            Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
4186                        }
4187                        DataType::Utf8 => {
4188                            Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
4189                        }
4190                        DataType::Union(uf, _) => {
4191                            let ch: Vec<ArrayRef> = uf
4192                                .iter()
4193                                .map(|(_, f)| empty_child_for(f.data_type()))
4194                                .collect();
4195                            Arc::new(
4196                                UnionArray::try_new(
4197                                    uf.clone(),
4198                                    ScalarBuffer::<i8>::from(Vec::<i8>::new()),
4199                                    Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
4200                                    ch,
4201                                )
4202                                .unwrap(),
4203                            ) as ArrayRef
4204                        }
4205                        other => panic!("unsupported map value type: {other:?}"),
4206                    };
4207                    let entries = StructArray::new(
4208                        Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4209                        vec![Arc::new(keys) as ArrayRef, vals],
4210                        None,
4211                    );
4212                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
4213                    Arc::new(MapArray::new(
4214                        entry_field.clone(),
4215                        offsets,
4216                        entries,
4217                        None,
4218                        *ordered,
4219                    ))
4220                }
4221                other => panic!("empty_child_for: unhandled type {other:?}"),
4222            }
4223        }
4224
4225        fn mk_dense_union(
4226            fields: &UnionFields,
4227            type_ids: Vec<i8>,
4228            offsets: Vec<i32>,
4229            provide: impl Fn(&Field) -> Option<ArrayRef>,
4230        ) -> ArrayRef {
4231            let children: Vec<ArrayRef> = fields
4232                .iter()
4233                .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
4234                .collect();
4235
4236            Arc::new(
4237                UnionArray::try_new(
4238                    fields.clone(),
4239                    ScalarBuffer::<i8>::from(type_ids),
4240                    Some(ScalarBuffer::<i32>::from(offsets)),
4241                    children,
4242                )
4243                .unwrap(),
4244            ) as ArrayRef
4245        }
4246
4247        // Dates / times / timestamps from the Avro content block:
4248        let date_a: i32 = 19_000;
4249        let time_ms_a: i32 = 13 * 3_600_000 + 45 * 60_000 + 30_000 + 123;
4250        let time_us_b: i64 = 23 * 3_600_000_000 + 59 * 60_000_000 + 59 * 1_000_000 + 999_999;
4251        let ts_ms_2024_01_01: i64 = 1_704_067_200_000;
4252        let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1000;
4253        // Fixed / bytes-like values:
4254        let fx8_a: [u8; 8] = *b"ABCDEFGH";
4255        let fx4_abcd: [u8; 4] = *b"ABCD";
4256        let fx4_misc: [u8; 4] = [0x00, 0x11, 0x22, 0x33];
4257        let fx10_ascii: [u8; 10] = *b"0123456789";
4258        let fx10_aa: [u8; 10] = [0xAA; 10];
4259        // Duration logical values as MonthDayNano:
4260        let dur_a = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
4261        let dur_b = IntervalMonthDayNanoType::make_value(12, 31, 999_000_000);
4262        // UUID logical values (stored as 16-byte FixedSizeBinary in Arrow):
4263        let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
4264        let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
4265        // Decimals from Avro content:
4266        let dec_b_scale2_pos: i128 = 123_456; // "1234.56" bytes-decimal -> (precision=10, scale=2)
4267        let dec_fix16_neg: i128 = -101; // "-1.01" fixed(16) decimal(10,2)
4268        let dec_fix20_s4: i128 = 1_234_567_891_234; // "123456789.1234" fixed(20) decimal(20,4)
4269        let dec_fix20_s4_neg: i128 = -123; // "-0.0123" fixed(20) decimal(20,4)
4270        let path = "test/data/union_fields.avro";
4271        let actual = read_file(path, 1024, false);
4272        let schema = actual.schema();
4273        // Helper to fetch union metadata for a column
4274        let get_union = |name: &str| -> (UnionFields, UnionMode) {
4275            let idx = schema.index_of(name).unwrap();
4276            match schema.field(idx).data_type() {
4277                DataType::Union(f, m) => (f.clone(), *m),
4278                other => panic!("{name} should be a Union, got {other:?}"),
4279            }
4280        };
4281        let mut expected_cols: Vec<ArrayRef> = Vec::with_capacity(schema.fields().len());
4282        // 1) ["null","int"]: Int32 (nullable)
4283        expected_cols.push(Arc::new(Int32Array::from(vec![
4284            None,
4285            Some(42),
4286            None,
4287            Some(0),
4288        ])));
4289        // 2) ["string","null"]: Utf8 (nullable)
4290        expected_cols.push(Arc::new(StringArray::from(vec![
4291            Some("s1"),
4292            None,
4293            Some("s3"),
4294            Some(""),
4295        ])));
4296        // 3) union_prim: ["boolean","int","long","float","double","bytes","string"]
4297        {
4298            let (uf, mode) = get_union("union_prim");
4299            assert!(matches!(mode, UnionMode::Dense));
4300            let generated_names: Vec<&str> = uf.iter().map(|(_, f)| f.name().as_str()).collect();
4301            let expected_names = vec![
4302                "boolean", "int", "long", "float", "double", "bytes", "string",
4303            ];
4304            assert_eq!(
4305                generated_names, expected_names,
4306                "Field names for union_prim are incorrect"
4307            );
4308            let tids = vec![
4309                tid_by_name(&uf, "long"),
4310                tid_by_name(&uf, "int"),
4311                tid_by_name(&uf, "float"),
4312                tid_by_name(&uf, "double"),
4313            ];
4314            let offs = vec![0, 0, 0, 0];
4315            let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4316                "int" => Some(Arc::new(Int32Array::from(vec![-1])) as ArrayRef),
4317                "long" => Some(Arc::new(Int64Array::from(vec![1_234_567_890_123i64])) as ArrayRef),
4318                "float" => {
4319                    Some(Arc::new(arrow_array::Float32Array::from(vec![1.25f32])) as ArrayRef)
4320                }
4321                "double" => {
4322                    Some(Arc::new(arrow_array::Float64Array::from(vec![-2.5f64])) as ArrayRef)
4323                }
4324                _ => None,
4325            });
4326            expected_cols.push(arr);
4327        }
4328        // 4) union_bytes_vs_string: ["bytes","string"]
4329        {
4330            let (uf, _) = get_union("union_bytes_vs_string");
4331            let tids = vec![
4332                tid_by_name(&uf, "bytes"),
4333                tid_by_name(&uf, "string"),
4334                tid_by_name(&uf, "string"),
4335                tid_by_name(&uf, "bytes"),
4336            ];
4337            let offs = vec![0, 0, 1, 1];
4338            let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4339                "bytes" => Some(
4340                    Arc::new(BinaryArray::from(vec![&[0x00, 0xFF, 0x7F][..], &[][..]])) as ArrayRef,
4341                ),
4342                "string" => Some(Arc::new(StringArray::from(vec!["hello", "world"])) as ArrayRef),
4343                _ => None,
4344            });
4345            expected_cols.push(arr);
4346        }
4347        // 5) union_fixed_dur_decfix: [Fx8, Dur12, DecFix16(decimal(10,2))]
4348        {
4349            let (uf, _) = get_union("union_fixed_dur_decfix");
4350            let tid_fx8 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(8)));
4351            let tid_dur = tid_by_dt(&uf, |dt| {
4352                matches!(
4353                    dt,
4354                    DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano)
4355                )
4356            });
4357            let tid_dec = tid_by_dt(&uf, |dt| match dt {
4358                #[cfg(feature = "small_decimals")]
4359                DataType::Decimal64(10, 2) => true,
4360                DataType::Decimal128(10, 2) | DataType::Decimal256(10, 2) => true,
4361                _ => false,
4362            });
4363            let tids = vec![tid_fx8, tid_dur, tid_dec, tid_dur];
4364            let offs = vec![0, 0, 0, 1];
4365            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4366                DataType::FixedSizeBinary(8) => {
4367                    let it = [Some(fx8_a)].into_iter();
4368                    Some(Arc::new(
4369                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 8).unwrap(),
4370                    ) as ArrayRef)
4371                }
4372                DataType::Interval(IntervalUnit::MonthDayNano) => {
4373                    Some(Arc::new(arrow_array::IntervalMonthDayNanoArray::from(vec![
4374                        dur_a, dur_b,
4375                    ])) as ArrayRef)
4376                }
4377                #[cfg(feature = "small_decimals")]
4378                DataType::Decimal64(10, 2) => {
4379                    let a = arrow_array::Decimal64Array::from_iter_values([dec_fix16_neg as i64]);
4380                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4381                }
4382                DataType::Decimal128(10, 2) => {
4383                    let a = arrow_array::Decimal128Array::from_iter_values([dec_fix16_neg]);
4384                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4385                }
4386                DataType::Decimal256(10, 2) => {
4387                    let a = arrow_array::Decimal256Array::from_iter_values([i256::from_i128(
4388                        dec_fix16_neg,
4389                    )]);
4390                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4391                }
4392                _ => None,
4393            });
4394            let generated_names: Vec<&str> = uf.iter().map(|(_, f)| f.name().as_str()).collect();
4395            let expected_names = vec!["Fx8", "Dur12", "DecFix16"];
4396            assert_eq!(
4397                generated_names, expected_names,
4398                "Data type names were not generated correctly for union_fixed_dur_decfix"
4399            );
4400            expected_cols.push(arr);
4401        }
4402        // 6) union_enum_records_array_map: [enum ColorU, record RecA, record RecB, array<long>, map<string>]
4403        {
4404            let (uf, _) = get_union("union_enum_records_array_map");
4405            let tid_enum = tid_by_dt(&uf, |dt| matches!(dt, DataType::Dictionary(_, _)));
4406            let tid_reca = tid_by_dt(&uf, |dt| {
4407                if let DataType::Struct(fs) = dt {
4408                    fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b"
4409                } else {
4410                    false
4411                }
4412            });
4413            let tid_recb = tid_by_dt(&uf, |dt| {
4414                if let DataType::Struct(fs) = dt {
4415                    fs.len() == 2 && fs[0].name() == "x" && fs[1].name() == "y"
4416                } else {
4417                    false
4418                }
4419            });
4420            let tid_arr = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
4421            let tids = vec![tid_enum, tid_reca, tid_recb, tid_arr];
4422            let offs = vec![0, 0, 0, 0];
4423            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4424                DataType::Dictionary(_, _) => {
4425                    let keys = Int32Array::from(vec![0i32]); // "RED"
4426                    let values =
4427                        Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
4428                    Some(
4429                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4430                            as ArrayRef,
4431                    )
4432                }
4433                DataType::Struct(fs)
4434                    if fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b" =>
4435                {
4436                    let a = Int32Array::from(vec![7]);
4437                    let b = StringArray::from(vec!["x"]);
4438                    Some(Arc::new(StructArray::new(
4439                        fs.clone(),
4440                        vec![Arc::new(a), Arc::new(b)],
4441                        None,
4442                    )) as ArrayRef)
4443                }
4444                DataType::Struct(fs)
4445                    if fs.len() == 2 && fs[0].name() == "x" && fs[1].name() == "y" =>
4446                {
4447                    let x = Int64Array::from(vec![123_456_789i64]);
4448                    let y = BinaryArray::from(vec![&[0xFF, 0x00][..]]);
4449                    Some(Arc::new(StructArray::new(
4450                        fs.clone(),
4451                        vec![Arc::new(x), Arc::new(y)],
4452                        None,
4453                    )) as ArrayRef)
4454                }
4455                DataType::List(field) => {
4456                    let values = Int64Array::from(vec![1i64, 2, 3]);
4457                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
4458                    Some(Arc::new(
4459                        ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
4460                    ) as ArrayRef)
4461                }
4462                DataType::Map(_, _) => None,
4463                other => panic!("unexpected child {other:?}"),
4464            });
4465            expected_cols.push(arr);
4466        }
4467        // 7) union_date_or_fixed4: [date32, fixed(4)]
4468        {
4469            let (uf, _) = get_union("union_date_or_fixed4");
4470            let tid_date = tid_by_dt(&uf, |dt| matches!(dt, DataType::Date32));
4471            let tid_fx4 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(4)));
4472            let tids = vec![tid_date, tid_fx4, tid_date, tid_fx4];
4473            let offs = vec![0, 0, 1, 1];
4474            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4475                DataType::Date32 => {
4476                    Some(Arc::new(arrow_array::Date32Array::from(vec![date_a, 0])) as ArrayRef)
4477                }
4478                DataType::FixedSizeBinary(4) => {
4479                    let it = [Some(fx4_abcd), Some(fx4_misc)].into_iter();
4480                    Some(Arc::new(
4481                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
4482                    ) as ArrayRef)
4483                }
4484                _ => None,
4485            });
4486            expected_cols.push(arr);
4487        }
4488        // 8) union_time_millis_or_enum: [time-millis, enum OnOff]
4489        {
4490            let (uf, _) = get_union("union_time_millis_or_enum");
4491            let tid_ms = tid_by_dt(&uf, |dt| {
4492                matches!(dt, DataType::Time32(arrow_schema::TimeUnit::Millisecond))
4493            });
4494            let tid_en = tid_by_dt(&uf, |dt| matches!(dt, DataType::Dictionary(_, _)));
4495            let tids = vec![tid_ms, tid_en, tid_en, tid_ms];
4496            let offs = vec![0, 0, 1, 1];
4497            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4498                DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
4499                    Some(Arc::new(Time32MillisecondArray::from(vec![time_ms_a, 0])) as ArrayRef)
4500                }
4501                DataType::Dictionary(_, _) => {
4502                    let keys = Int32Array::from(vec![0i32, 1]); // "ON", "OFF"
4503                    let values = Arc::new(StringArray::from(vec!["ON", "OFF"])) as ArrayRef;
4504                    Some(
4505                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4506                            as ArrayRef,
4507                    )
4508                }
4509                _ => None,
4510            });
4511            expected_cols.push(arr);
4512        }
4513        // 9) union_time_micros_or_string: [time-micros, string]
4514        {
4515            let (uf, _) = get_union("union_time_micros_or_string");
4516            let tid_us = tid_by_dt(&uf, |dt| {
4517                matches!(dt, DataType::Time64(arrow_schema::TimeUnit::Microsecond))
4518            });
4519            let tid_s = tid_by_name(&uf, "string");
4520            let tids = vec![tid_s, tid_us, tid_s, tid_s];
4521            let offs = vec![0, 0, 1, 2];
4522            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4523                DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
4524                    Some(Arc::new(Time64MicrosecondArray::from(vec![time_us_b])) as ArrayRef)
4525                }
4526                DataType::Utf8 => {
4527                    Some(Arc::new(StringArray::from(vec!["evening", "night", ""])) as ArrayRef)
4528                }
4529                _ => None,
4530            });
4531            expected_cols.push(arr);
4532        }
4533        // 10) union_ts_millis_utc_or_array: [timestamp-millis(TZ), array<int>]
4534        {
4535            let (uf, _) = get_union("union_ts_millis_utc_or_array");
4536            let tid_ts = tid_by_dt(&uf, |dt| {
4537                matches!(
4538                    dt,
4539                    DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _)
4540                )
4541            });
4542            let tid_arr = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
4543            let tids = vec![tid_ts, tid_arr, tid_arr, tid_ts];
4544            let offs = vec![0, 0, 1, 1];
4545            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4546                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
4547                    let a = TimestampMillisecondArray::from(vec![
4548                        ts_ms_2024_01_01,
4549                        ts_ms_2024_01_01 + 86_400_000,
4550                    ]);
4551                    Some(Arc::new(if let Some(tz) = tz {
4552                        a.with_timezone(tz.clone())
4553                    } else {
4554                        a
4555                    }) as ArrayRef)
4556                }
4557                DataType::List(field) => {
4558                    let values = Int32Array::from(vec![0, 1, 2, -1, 0, 1]);
4559                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 6]));
4560                    Some(Arc::new(
4561                        ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
4562                    ) as ArrayRef)
4563                }
4564                _ => None,
4565            });
4566            expected_cols.push(arr);
4567        }
4568        // 11) union_ts_micros_local_or_bytes: [local-timestamp-micros, bytes]
4569        {
4570            let (uf, _) = get_union("union_ts_micros_local_or_bytes");
4571            let tid_lts = tid_by_dt(&uf, |dt| {
4572                matches!(
4573                    dt,
4574                    DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None)
4575                )
4576            });
4577            let tid_b = tid_by_name(&uf, "bytes");
4578            let tids = vec![tid_b, tid_lts, tid_b, tid_b];
4579            let offs = vec![0, 0, 1, 2];
4580            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4581                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) => Some(Arc::new(
4582                    TimestampMicrosecondArray::from(vec![ts_us_2024_01_01]),
4583                )
4584                    as ArrayRef),
4585                DataType::Binary => Some(Arc::new(BinaryArray::from(vec![
4586                    &b"\x11\x22\x33"[..],
4587                    &b"\x00"[..],
4588                    &b"\x10\x20\x30\x40"[..],
4589                ])) as ArrayRef),
4590                _ => None,
4591            });
4592            expected_cols.push(arr);
4593        }
4594        // 12) union_uuid_or_fixed10: [uuid(string)->fixed(16), fixed(10)]
4595        {
4596            let (uf, _) = get_union("union_uuid_or_fixed10");
4597            let tid_fx16 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(16)));
4598            let tid_fx10 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(10)));
4599            let tids = vec![tid_fx16, tid_fx10, tid_fx16, tid_fx10];
4600            let offs = vec![0, 0, 1, 1];
4601            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4602                DataType::FixedSizeBinary(16) => {
4603                    let it = [Some(uuid1), Some(uuid2)].into_iter();
4604                    Some(Arc::new(
4605                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
4606                    ) as ArrayRef)
4607                }
4608                DataType::FixedSizeBinary(10) => {
4609                    let it = [Some(fx10_ascii), Some(fx10_aa)].into_iter();
4610                    Some(Arc::new(
4611                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
4612                    ) as ArrayRef)
4613                }
4614                _ => None,
4615            });
4616            expected_cols.push(arr);
4617        }
4618        // 13) union_dec_bytes_or_dec_fixed: [bytes dec(10,2), fixed(20) dec(20,4)]
4619        {
4620            let (uf, _) = get_union("union_dec_bytes_or_dec_fixed");
4621            let tid_b10s2 = tid_by_dt(&uf, |dt| match dt {
4622                #[cfg(feature = "small_decimals")]
4623                DataType::Decimal64(10, 2) => true,
4624                DataType::Decimal128(10, 2) | DataType::Decimal256(10, 2) => true,
4625                _ => false,
4626            });
4627            let tid_f20s4 = tid_by_dt(&uf, |dt| {
4628                matches!(
4629                    dt,
4630                    DataType::Decimal128(20, 4) | DataType::Decimal256(20, 4)
4631                )
4632            });
4633            let tids = vec![tid_b10s2, tid_f20s4, tid_b10s2, tid_f20s4];
4634            let offs = vec![0, 0, 1, 1];
4635            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4636                #[cfg(feature = "small_decimals")]
4637                DataType::Decimal64(10, 2) => {
4638                    let a = Decimal64Array::from_iter_values([dec_b_scale2_pos as i64, 0i64]);
4639                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4640                }
4641                DataType::Decimal128(10, 2) => {
4642                    let a = Decimal128Array::from_iter_values([dec_b_scale2_pos, 0]);
4643                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4644                }
4645                DataType::Decimal256(10, 2) => {
4646                    let a = Decimal256Array::from_iter_values([
4647                        i256::from_i128(dec_b_scale2_pos),
4648                        i256::from(0),
4649                    ]);
4650                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4651                }
4652                DataType::Decimal128(20, 4) => {
4653                    let a = Decimal128Array::from_iter_values([dec_fix20_s4_neg, dec_fix20_s4]);
4654                    Some(Arc::new(a.with_precision_and_scale(20, 4).unwrap()) as ArrayRef)
4655                }
4656                DataType::Decimal256(20, 4) => {
4657                    let a = Decimal256Array::from_iter_values([
4658                        i256::from_i128(dec_fix20_s4_neg),
4659                        i256::from_i128(dec_fix20_s4),
4660                    ]);
4661                    Some(Arc::new(a.with_precision_and_scale(20, 4).unwrap()) as ArrayRef)
4662                }
4663                _ => None,
4664            });
4665            expected_cols.push(arr);
4666        }
4667        // 14) union_null_bytes_string: ["null","bytes","string"]
4668        {
4669            let (uf, _) = get_union("union_null_bytes_string");
4670            let tid_n = tid_by_name(&uf, "null");
4671            let tid_b = tid_by_name(&uf, "bytes");
4672            let tid_s = tid_by_name(&uf, "string");
4673            let tids = vec![tid_n, tid_b, tid_s, tid_s];
4674            let offs = vec![0, 0, 0, 1];
4675            let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4676                "null" => Some(Arc::new(arrow_array::NullArray::new(1)) as ArrayRef),
4677                "bytes" => Some(Arc::new(BinaryArray::from(vec![&b"\x01\x02"[..]])) as ArrayRef),
4678                "string" => Some(Arc::new(StringArray::from(vec!["text", "u"])) as ArrayRef),
4679                _ => None,
4680            });
4681            expected_cols.push(arr);
4682        }
4683        // 15) array_of_union: array<[long,string]>
4684        {
4685            let idx = schema.index_of("array_of_union").unwrap();
4686            let dt = schema.field(idx).data_type().clone();
4687            let (item_field, _) = match &dt {
4688                DataType::List(f) => (f.clone(), ()),
4689                other => panic!("array_of_union must be List, got {other:?}"),
4690            };
4691            let (uf, _) = match item_field.data_type() {
4692                DataType::Union(f, m) => (f.clone(), m),
4693                other => panic!("array_of_union items must be Union, got {other:?}"),
4694            };
4695            let tid_l = tid_by_name(&uf, "long");
4696            let tid_s = tid_by_name(&uf, "string");
4697            let type_ids = vec![tid_l, tid_s, tid_l, tid_s, tid_l, tid_l, tid_s, tid_l];
4698            let offsets = vec![0, 0, 1, 1, 2, 3, 2, 4];
4699            let values_union =
4700                mk_dense_union(&uf, type_ids, offsets, |f| match f.name().as_str() {
4701                    "long" => {
4702                        Some(Arc::new(Int64Array::from(vec![1i64, -5, 42, -1, 0])) as ArrayRef)
4703                    }
4704                    "string" => Some(Arc::new(StringArray::from(vec!["a", "", "z"])) as ArrayRef),
4705                    _ => None,
4706                });
4707            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 6, 8]));
4708            expected_cols.push(Arc::new(
4709                ListArray::try_new(item_field.clone(), list_offsets, values_union, None).unwrap(),
4710            ));
4711        }
4712        // 16) map_of_union: map<[null,double]>
4713        {
4714            let idx = schema.index_of("map_of_union").unwrap();
4715            let dt = schema.field(idx).data_type().clone();
4716            let (entry_field, ordered) = match &dt {
4717                DataType::Map(f, ordered) => (f.clone(), *ordered),
4718                other => panic!("map_of_union must be Map, got {other:?}"),
4719            };
4720            let DataType::Struct(entry_fields) = entry_field.data_type() else {
4721                panic!("map entries must be struct")
4722            };
4723            let key_field = entry_fields[0].clone();
4724            let val_field = entry_fields[1].clone();
4725            let keys = StringArray::from(vec!["a", "b", "x", "pi"]);
4726            let rounded_pi = (std::f64::consts::PI * 100_000.0).round() / 100_000.0;
4727            let values: ArrayRef = match val_field.data_type() {
4728                DataType::Union(uf, _) => {
4729                    let tid_n = tid_by_name(uf, "null");
4730                    let tid_d = tid_by_name(uf, "double");
4731                    let tids = vec![tid_n, tid_d, tid_d, tid_d];
4732                    let offs = vec![0, 0, 1, 2];
4733                    mk_dense_union(uf, tids, offs, |f| match f.name().as_str() {
4734                        "null" => Some(Arc::new(NullArray::new(1)) as ArrayRef),
4735                        "double" => Some(Arc::new(arrow_array::Float64Array::from(vec![
4736                            2.5f64, -0.5f64, rounded_pi,
4737                        ])) as ArrayRef),
4738                        _ => None,
4739                    })
4740                }
4741                DataType::Float64 => Arc::new(arrow_array::Float64Array::from(vec![
4742                    None,
4743                    Some(2.5),
4744                    Some(-0.5),
4745                    Some(rounded_pi),
4746                ])),
4747                other => panic!("unexpected map value type {other:?}"),
4748            };
4749            let entries = StructArray::new(
4750                Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4751                vec![Arc::new(keys) as ArrayRef, values],
4752                None,
4753            );
4754            let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 3, 4]));
4755            expected_cols.push(Arc::new(MapArray::new(
4756                entry_field,
4757                offsets,
4758                entries,
4759                None,
4760                ordered,
4761            )));
4762        }
4763        // 17) record_with_union_field: struct { id:int, u:[int,string] }
4764        {
4765            let idx = schema.index_of("record_with_union_field").unwrap();
4766            let DataType::Struct(rec_fields) = schema.field(idx).data_type() else {
4767                panic!("record_with_union_field should be Struct")
4768            };
4769            let id = Int32Array::from(vec![1, 2, 3, 4]);
4770            let u_field = rec_fields.iter().find(|f| f.name() == "u").unwrap();
4771            let DataType::Union(uf, _) = u_field.data_type() else {
4772                panic!("u must be Union")
4773            };
4774            let tid_i = tid_by_name(uf, "int");
4775            let tid_s = tid_by_name(uf, "string");
4776            let tids = vec![tid_s, tid_i, tid_i, tid_s];
4777            let offs = vec![0, 0, 1, 1];
4778            let u = mk_dense_union(uf, tids, offs, |f| match f.name().as_str() {
4779                "int" => Some(Arc::new(Int32Array::from(vec![99, 0])) as ArrayRef),
4780                "string" => Some(Arc::new(StringArray::from(vec!["one", "four"])) as ArrayRef),
4781                _ => None,
4782            });
4783            let rec = StructArray::new(rec_fields.clone(), vec![Arc::new(id) as ArrayRef, u], None);
4784            expected_cols.push(Arc::new(rec));
4785        }
4786        // 18) union_ts_micros_utc_or_map: [timestamp-micros(TZ), map<long>]
4787        {
4788            let (uf, _) = get_union("union_ts_micros_utc_or_map");
4789            let tid_ts = tid_by_dt(&uf, |dt| {
4790                matches!(
4791                    dt,
4792                    DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some(_))
4793                )
4794            });
4795            let tid_map = tid_by_dt(&uf, |dt| matches!(dt, DataType::Map(_, _)));
4796            let tids = vec![tid_ts, tid_map, tid_ts, tid_map];
4797            let offs = vec![0, 0, 1, 1];
4798            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4799                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
4800                    let a = TimestampMicrosecondArray::from(vec![ts_us_2024_01_01, 0i64]);
4801                    Some(Arc::new(if let Some(tz) = tz {
4802                        a.with_timezone(tz.clone())
4803                    } else {
4804                        a
4805                    }) as ArrayRef)
4806                }
4807                DataType::Map(entry_field, ordered) => {
4808                    let DataType::Struct(fs) = entry_field.data_type() else {
4809                        panic!("map entries must be struct")
4810                    };
4811                    let key_field = fs[0].clone();
4812                    let val_field = fs[1].clone();
4813                    assert_eq!(key_field.data_type(), &DataType::Utf8);
4814                    assert_eq!(val_field.data_type(), &DataType::Int64);
4815                    let keys = StringArray::from(vec!["k1", "k2", "n"]);
4816                    let vals = Int64Array::from(vec![1i64, 2, 0]);
4817                    let entries = StructArray::new(
4818                        Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4819                        vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
4820                        None,
4821                    );
4822                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
4823                    Some(Arc::new(MapArray::new(
4824                        entry_field.clone(),
4825                        offsets,
4826                        entries,
4827                        None,
4828                        *ordered,
4829                    )) as ArrayRef)
4830                }
4831                _ => None,
4832            });
4833            expected_cols.push(arr);
4834        }
4835        // 19) union_ts_millis_local_or_string: [local-timestamp-millis, string]
4836        {
4837            let (uf, _) = get_union("union_ts_millis_local_or_string");
4838            let tid_ts = tid_by_dt(&uf, |dt| {
4839                matches!(
4840                    dt,
4841                    DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None)
4842                )
4843            });
4844            let tid_s = tid_by_name(&uf, "string");
4845            let tids = vec![tid_s, tid_ts, tid_s, tid_s];
4846            let offs = vec![0, 0, 1, 2];
4847            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4848                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None) => Some(Arc::new(
4849                    TimestampMillisecondArray::from(vec![ts_ms_2024_01_01]),
4850                )
4851                    as ArrayRef),
4852                DataType::Utf8 => {
4853                    Some(
4854                        Arc::new(StringArray::from(vec!["local midnight", "done", ""])) as ArrayRef,
4855                    )
4856                }
4857                _ => None,
4858            });
4859            expected_cols.push(arr);
4860        }
4861        // 20) union_bool_or_string: ["boolean","string"]
4862        {
4863            let (uf, _) = get_union("union_bool_or_string");
4864            let tid_b = tid_by_name(&uf, "boolean");
4865            let tid_s = tid_by_name(&uf, "string");
4866            let tids = vec![tid_b, tid_s, tid_b, tid_s];
4867            let offs = vec![0, 0, 1, 1];
4868            let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4869                "boolean" => Some(Arc::new(BooleanArray::from(vec![true, false])) as ArrayRef),
4870                "string" => Some(Arc::new(StringArray::from(vec!["no", "yes"])) as ArrayRef),
4871                _ => None,
4872            });
4873            expected_cols.push(arr);
4874        }
4875        let expected = RecordBatch::try_new(schema.clone(), expected_cols).unwrap();
4876        assert_eq!(
4877            actual, expected,
4878            "full end-to-end equality for union_fields.avro"
4879        );
4880    }
4881
4882    #[test]
4883    fn test_read_zero_byte_avro_file() {
4884        let batch = read_file("test/data/zero_byte.avro", 3, false);
4885        let schema = batch.schema();
4886        assert_eq!(schema.fields().len(), 1);
4887        let field = schema.field(0);
4888        assert_eq!(field.name(), "data");
4889        assert_eq!(field.data_type(), &DataType::Binary);
4890        assert!(field.is_nullable());
4891        assert_eq!(batch.num_rows(), 3);
4892        assert_eq!(batch.num_columns(), 1);
4893        let binary_array = batch
4894            .column(0)
4895            .as_any()
4896            .downcast_ref::<BinaryArray>()
4897            .unwrap();
4898        assert!(binary_array.is_null(0));
4899        assert!(binary_array.is_valid(1));
4900        assert_eq!(binary_array.value(1), b"");
4901        assert!(binary_array.is_valid(2));
4902        assert_eq!(binary_array.value(2), b"some bytes");
4903    }
4904
4905    #[test]
4906    fn test_alltypes() {
4907        let expected = RecordBatch::try_from_iter_with_nullable([
4908            (
4909                "id",
4910                Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
4911                true,
4912            ),
4913            (
4914                "bool_col",
4915                Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
4916                true,
4917            ),
4918            (
4919                "tinyint_col",
4920                Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
4921                true,
4922            ),
4923            (
4924                "smallint_col",
4925                Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
4926                true,
4927            ),
4928            (
4929                "int_col",
4930                Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
4931                true,
4932            ),
4933            (
4934                "bigint_col",
4935                Arc::new(Int64Array::from_iter_values((0..8).map(|x| (x % 2) * 10))) as _,
4936                true,
4937            ),
4938            (
4939                "float_col",
4940                Arc::new(Float32Array::from_iter_values(
4941                    (0..8).map(|x| (x % 2) as f32 * 1.1),
4942                )) as _,
4943                true,
4944            ),
4945            (
4946                "double_col",
4947                Arc::new(Float64Array::from_iter_values(
4948                    (0..8).map(|x| (x % 2) as f64 * 10.1),
4949                )) as _,
4950                true,
4951            ),
4952            (
4953                "date_string_col",
4954                Arc::new(BinaryArray::from_iter_values([
4955                    [48, 51, 47, 48, 49, 47, 48, 57],
4956                    [48, 51, 47, 48, 49, 47, 48, 57],
4957                    [48, 52, 47, 48, 49, 47, 48, 57],
4958                    [48, 52, 47, 48, 49, 47, 48, 57],
4959                    [48, 50, 47, 48, 49, 47, 48, 57],
4960                    [48, 50, 47, 48, 49, 47, 48, 57],
4961                    [48, 49, 47, 48, 49, 47, 48, 57],
4962                    [48, 49, 47, 48, 49, 47, 48, 57],
4963                ])) as _,
4964                true,
4965            ),
4966            (
4967                "string_col",
4968                Arc::new(BinaryArray::from_iter_values((0..8).map(|x| [48 + x % 2]))) as _,
4969                true,
4970            ),
4971            (
4972                "timestamp_col",
4973                Arc::new(
4974                    TimestampMicrosecondArray::from_iter_values([
4975                        1235865600000000, // 2009-03-01T00:00:00.000
4976                        1235865660000000, // 2009-03-01T00:01:00.000
4977                        1238544000000000, // 2009-04-01T00:00:00.000
4978                        1238544060000000, // 2009-04-01T00:01:00.000
4979                        1233446400000000, // 2009-02-01T00:00:00.000
4980                        1233446460000000, // 2009-02-01T00:01:00.000
4981                        1230768000000000, // 2009-01-01T00:00:00.000
4982                        1230768060000000, // 2009-01-01T00:01:00.000
4983                    ])
4984                    .with_timezone("+00:00"),
4985                ) as _,
4986                true,
4987            ),
4988        ])
4989        .unwrap();
4990
4991        for file in files() {
4992            let file = arrow_test_data(file);
4993
4994            assert_eq!(read_file(&file, 8, false), expected);
4995            assert_eq!(read_file(&file, 3, false), expected);
4996        }
4997    }
4998
4999    #[test]
5000    // TODO: avoid requiring snappy for this file
5001    #[cfg(feature = "snappy")]
5002    fn test_alltypes_dictionary() {
5003        let file = "avro/alltypes_dictionary.avro";
5004        let expected = RecordBatch::try_from_iter_with_nullable([
5005            ("id", Arc::new(Int32Array::from(vec![0, 1])) as _, true),
5006            (
5007                "bool_col",
5008                Arc::new(BooleanArray::from(vec![Some(true), Some(false)])) as _,
5009                true,
5010            ),
5011            (
5012                "tinyint_col",
5013                Arc::new(Int32Array::from(vec![0, 1])) as _,
5014                true,
5015            ),
5016            (
5017                "smallint_col",
5018                Arc::new(Int32Array::from(vec![0, 1])) as _,
5019                true,
5020            ),
5021            ("int_col", Arc::new(Int32Array::from(vec![0, 1])) as _, true),
5022            (
5023                "bigint_col",
5024                Arc::new(Int64Array::from(vec![0, 10])) as _,
5025                true,
5026            ),
5027            (
5028                "float_col",
5029                Arc::new(Float32Array::from(vec![0.0, 1.1])) as _,
5030                true,
5031            ),
5032            (
5033                "double_col",
5034                Arc::new(Float64Array::from(vec![0.0, 10.1])) as _,
5035                true,
5036            ),
5037            (
5038                "date_string_col",
5039                Arc::new(BinaryArray::from_iter_values([b"01/01/09", b"01/01/09"])) as _,
5040                true,
5041            ),
5042            (
5043                "string_col",
5044                Arc::new(BinaryArray::from_iter_values([b"0", b"1"])) as _,
5045                true,
5046            ),
5047            (
5048                "timestamp_col",
5049                Arc::new(
5050                    TimestampMicrosecondArray::from_iter_values([
5051                        1230768000000000, // 2009-01-01T00:00:00.000
5052                        1230768060000000, // 2009-01-01T00:01:00.000
5053                    ])
5054                    .with_timezone("+00:00"),
5055                ) as _,
5056                true,
5057            ),
5058        ])
5059        .unwrap();
5060        let file_path = arrow_test_data(file);
5061        let batch_large = read_file(&file_path, 8, false);
5062        assert_eq!(
5063            batch_large, expected,
5064            "Decoded RecordBatch does not match for file {file}"
5065        );
5066        let batch_small = read_file(&file_path, 3, false);
5067        assert_eq!(
5068            batch_small, expected,
5069            "Decoded RecordBatch (batch size 3) does not match for file {file}"
5070        );
5071    }
5072
5073    #[test]
5074    fn test_alltypes_nulls_plain() {
5075        let file = "avro/alltypes_nulls_plain.avro";
5076        let expected = RecordBatch::try_from_iter_with_nullable([
5077            (
5078                "string_col",
5079                Arc::new(StringArray::from(vec![None::<&str>])) as _,
5080                true,
5081            ),
5082            ("int_col", Arc::new(Int32Array::from(vec![None])) as _, true),
5083            (
5084                "bool_col",
5085                Arc::new(BooleanArray::from(vec![None])) as _,
5086                true,
5087            ),
5088            (
5089                "bigint_col",
5090                Arc::new(Int64Array::from(vec![None])) as _,
5091                true,
5092            ),
5093            (
5094                "float_col",
5095                Arc::new(Float32Array::from(vec![None])) as _,
5096                true,
5097            ),
5098            (
5099                "double_col",
5100                Arc::new(Float64Array::from(vec![None])) as _,
5101                true,
5102            ),
5103            (
5104                "bytes_col",
5105                Arc::new(BinaryArray::from(vec![None::<&[u8]>])) as _,
5106                true,
5107            ),
5108        ])
5109        .unwrap();
5110        let file_path = arrow_test_data(file);
5111        let batch_large = read_file(&file_path, 8, false);
5112        assert_eq!(
5113            batch_large, expected,
5114            "Decoded RecordBatch does not match for file {file}"
5115        );
5116        let batch_small = read_file(&file_path, 3, false);
5117        assert_eq!(
5118            batch_small, expected,
5119            "Decoded RecordBatch (batch size 3) does not match for file {file}"
5120        );
5121    }
5122
5123    #[test]
5124    // TODO: avoid requiring snappy for this file
5125    #[cfg(feature = "snappy")]
5126    fn test_binary() {
5127        let file = arrow_test_data("avro/binary.avro");
5128        let batch = read_file(&file, 8, false);
5129        let expected = RecordBatch::try_from_iter_with_nullable([(
5130            "foo",
5131            Arc::new(BinaryArray::from_iter_values(vec![
5132                b"\x00" as &[u8],
5133                b"\x01" as &[u8],
5134                b"\x02" as &[u8],
5135                b"\x03" as &[u8],
5136                b"\x04" as &[u8],
5137                b"\x05" as &[u8],
5138                b"\x06" as &[u8],
5139                b"\x07" as &[u8],
5140                b"\x08" as &[u8],
5141                b"\t" as &[u8],
5142                b"\n" as &[u8],
5143                b"\x0b" as &[u8],
5144            ])) as Arc<dyn Array>,
5145            true,
5146        )])
5147        .unwrap();
5148        assert_eq!(batch, expected);
5149    }
5150
5151    #[test]
5152    // TODO: avoid requiring snappy for these files
5153    #[cfg(feature = "snappy")]
5154    fn test_decimal() {
5155        // Choose expected Arrow types depending on the `small_decimals` feature flag.
5156        // With `small_decimals` enabled, Decimal32/Decimal64 are used where their
5157        // precision allows; otherwise, those cases resolve to Decimal128.
5158        #[cfg(feature = "small_decimals")]
5159        let files: [(&str, DataType, HashMap<String, String>); 8] = [
5160            (
5161                "avro/fixed_length_decimal.avro",
5162                DataType::Decimal128(25, 2),
5163                HashMap::from([
5164                    (
5165                        "avro.namespace".to_string(),
5166                        "topLevelRecord.value".to_string(),
5167                    ),
5168                    ("avro.name".to_string(), "fixed".to_string()),
5169                ]),
5170            ),
5171            (
5172                "avro/fixed_length_decimal_legacy.avro",
5173                DataType::Decimal64(13, 2),
5174                HashMap::from([
5175                    (
5176                        "avro.namespace".to_string(),
5177                        "topLevelRecord.value".to_string(),
5178                    ),
5179                    ("avro.name".to_string(), "fixed".to_string()),
5180                ]),
5181            ),
5182            (
5183                "avro/int32_decimal.avro",
5184                DataType::Decimal32(4, 2),
5185                HashMap::from([
5186                    (
5187                        "avro.namespace".to_string(),
5188                        "topLevelRecord.value".to_string(),
5189                    ),
5190                    ("avro.name".to_string(), "fixed".to_string()),
5191                ]),
5192            ),
5193            (
5194                "avro/int64_decimal.avro",
5195                DataType::Decimal64(10, 2),
5196                HashMap::from([
5197                    (
5198                        "avro.namespace".to_string(),
5199                        "topLevelRecord.value".to_string(),
5200                    ),
5201                    ("avro.name".to_string(), "fixed".to_string()),
5202                ]),
5203            ),
5204            (
5205                "test/data/int256_decimal.avro",
5206                DataType::Decimal256(76, 10),
5207                HashMap::new(),
5208            ),
5209            (
5210                "test/data/fixed256_decimal.avro",
5211                DataType::Decimal256(76, 10),
5212                HashMap::from([("avro.name".to_string(), "Decimal256Fixed".to_string())]),
5213            ),
5214            (
5215                "test/data/fixed_length_decimal_legacy_32.avro",
5216                DataType::Decimal32(9, 2),
5217                HashMap::from([("avro.name".to_string(), "Decimal32FixedLegacy".to_string())]),
5218            ),
5219            (
5220                "test/data/int128_decimal.avro",
5221                DataType::Decimal128(38, 2),
5222                HashMap::new(),
5223            ),
5224        ];
5225        #[cfg(not(feature = "small_decimals"))]
5226        let files: [(&str, DataType, HashMap<String, String>); 8] = [
5227            (
5228                "avro/fixed_length_decimal.avro",
5229                DataType::Decimal128(25, 2),
5230                HashMap::from([
5231                    (
5232                        "avro.namespace".to_string(),
5233                        "topLevelRecord.value".to_string(),
5234                    ),
5235                    ("avro.name".to_string(), "fixed".to_string()),
5236                ]),
5237            ),
5238            (
5239                "avro/fixed_length_decimal_legacy.avro",
5240                DataType::Decimal128(13, 2),
5241                HashMap::from([
5242                    (
5243                        "avro.namespace".to_string(),
5244                        "topLevelRecord.value".to_string(),
5245                    ),
5246                    ("avro.name".to_string(), "fixed".to_string()),
5247                ]),
5248            ),
5249            (
5250                "avro/int32_decimal.avro",
5251                DataType::Decimal128(4, 2),
5252                HashMap::from([
5253                    (
5254                        "avro.namespace".to_string(),
5255                        "topLevelRecord.value".to_string(),
5256                    ),
5257                    ("avro.name".to_string(), "fixed".to_string()),
5258                ]),
5259            ),
5260            (
5261                "avro/int64_decimal.avro",
5262                DataType::Decimal128(10, 2),
5263                HashMap::from([
5264                    (
5265                        "avro.namespace".to_string(),
5266                        "topLevelRecord.value".to_string(),
5267                    ),
5268                    ("avro.name".to_string(), "fixed".to_string()),
5269                ]),
5270            ),
5271            (
5272                "test/data/int256_decimal.avro",
5273                DataType::Decimal256(76, 10),
5274                HashMap::new(),
5275            ),
5276            (
5277                "test/data/fixed256_decimal.avro",
5278                DataType::Decimal256(76, 10),
5279                HashMap::from([("avro.name".to_string(), "Decimal256Fixed".to_string())]),
5280            ),
5281            (
5282                "test/data/fixed_length_decimal_legacy_32.avro",
5283                DataType::Decimal128(9, 2),
5284                HashMap::from([("avro.name".to_string(), "Decimal32FixedLegacy".to_string())]),
5285            ),
5286            (
5287                "test/data/int128_decimal.avro",
5288                DataType::Decimal128(38, 2),
5289                HashMap::new(),
5290            ),
5291        ];
5292        for (file, expected_dt, mut metadata) in files {
5293            let (precision, scale) = match expected_dt {
5294                DataType::Decimal32(p, s)
5295                | DataType::Decimal64(p, s)
5296                | DataType::Decimal128(p, s)
5297                | DataType::Decimal256(p, s) => (p, s),
5298                _ => unreachable!("Unexpected decimal type in test inputs"),
5299            };
5300            assert!(scale >= 0, "test data uses non-negative scales only");
5301            let scale_u32 = scale as u32;
5302            let file_path: String = if file.starts_with("avro/") {
5303                arrow_test_data(file)
5304            } else {
5305                std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
5306                    .join(file)
5307                    .to_string_lossy()
5308                    .into_owned()
5309            };
5310            let pow10: i128 = 10i128.pow(scale_u32);
5311            let values_i128: Vec<i128> = (1..=24).map(|n| (n as i128) * pow10).collect();
5312            let build_expected = |dt: &DataType, values: &[i128]| -> ArrayRef {
5313                match *dt {
5314                    #[cfg(feature = "small_decimals")]
5315                    DataType::Decimal32(p, s) => {
5316                        let it = values.iter().map(|&v| v as i32);
5317                        Arc::new(
5318                            Decimal32Array::from_iter_values(it)
5319                                .with_precision_and_scale(p, s)
5320                                .unwrap(),
5321                        )
5322                    }
5323                    #[cfg(feature = "small_decimals")]
5324                    DataType::Decimal64(p, s) => {
5325                        let it = values.iter().map(|&v| v as i64);
5326                        Arc::new(
5327                            Decimal64Array::from_iter_values(it)
5328                                .with_precision_and_scale(p, s)
5329                                .unwrap(),
5330                        )
5331                    }
5332                    DataType::Decimal128(p, s) => {
5333                        let it = values.iter().copied();
5334                        Arc::new(
5335                            Decimal128Array::from_iter_values(it)
5336                                .with_precision_and_scale(p, s)
5337                                .unwrap(),
5338                        )
5339                    }
5340                    DataType::Decimal256(p, s) => {
5341                        let it = values.iter().map(|&v| i256::from_i128(v));
5342                        Arc::new(
5343                            Decimal256Array::from_iter_values(it)
5344                                .with_precision_and_scale(p, s)
5345                                .unwrap(),
5346                        )
5347                    }
5348                    _ => unreachable!("Unexpected decimal type in test"),
5349                }
5350            };
5351            let actual_batch = read_file(&file_path, 8, false);
5352            let actual_nullable = actual_batch.schema().field(0).is_nullable();
5353            let expected_array = build_expected(&expected_dt, &values_i128);
5354            metadata.insert("precision".to_string(), precision.to_string());
5355            metadata.insert("scale".to_string(), scale.to_string());
5356            let field =
5357                Field::new("value", expected_dt.clone(), actual_nullable).with_metadata(metadata);
5358            let expected_schema = Arc::new(Schema::new(vec![field]));
5359            let expected_batch =
5360                RecordBatch::try_new(expected_schema.clone(), vec![expected_array]).unwrap();
5361            assert_eq!(
5362                actual_batch, expected_batch,
5363                "Decoded RecordBatch does not match for {file}"
5364            );
5365            let actual_batch_small = read_file(&file_path, 3, false);
5366            assert_eq!(
5367                actual_batch_small, expected_batch,
5368                "Decoded RecordBatch does not match for {file} with batch size 3"
5369            );
5370        }
5371    }
5372
5373    #[test]
5374    fn test_read_duration_logical_types_feature_toggle() -> Result<(), ArrowError> {
5375        let file_path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
5376            .join("test/data/duration_logical_types.avro")
5377            .to_string_lossy()
5378            .into_owned();
5379
5380        let actual_batch = read_file(&file_path, 4, false);
5381
5382        let expected_batch = {
5383            #[cfg(feature = "avro_custom_types")]
5384            {
5385                let schema = Arc::new(Schema::new(vec![
5386                    Field::new(
5387                        "duration_time_nanos",
5388                        DataType::Duration(TimeUnit::Nanosecond),
5389                        false,
5390                    ),
5391                    Field::new(
5392                        "duration_time_micros",
5393                        DataType::Duration(TimeUnit::Microsecond),
5394                        false,
5395                    ),
5396                    Field::new(
5397                        "duration_time_millis",
5398                        DataType::Duration(TimeUnit::Millisecond),
5399                        false,
5400                    ),
5401                    Field::new(
5402                        "duration_time_seconds",
5403                        DataType::Duration(TimeUnit::Second),
5404                        false,
5405                    ),
5406                ]));
5407
5408                let nanos = Arc::new(PrimitiveArray::<DurationNanosecondType>::from(vec![
5409                    10, 20, 30, 40,
5410                ])) as ArrayRef;
5411                let micros = Arc::new(PrimitiveArray::<DurationMicrosecondType>::from(vec![
5412                    100, 200, 300, 400,
5413                ])) as ArrayRef;
5414                let millis = Arc::new(PrimitiveArray::<DurationMillisecondType>::from(vec![
5415                    1000, 2000, 3000, 4000,
5416                ])) as ArrayRef;
5417                let seconds = Arc::new(PrimitiveArray::<DurationSecondType>::from(vec![1, 2, 3, 4]))
5418                    as ArrayRef;
5419
5420                RecordBatch::try_new(schema, vec![nanos, micros, millis, seconds])?
5421            }
5422            #[cfg(not(feature = "avro_custom_types"))]
5423            {
5424                let schema = Arc::new(Schema::new(vec![
5425                    Field::new("duration_time_nanos", DataType::Int64, false).with_metadata(
5426                        [(
5427                            "logicalType".to_string(),
5428                            "arrow.duration-nanos".to_string(),
5429                        )]
5430                        .into(),
5431                    ),
5432                    Field::new("duration_time_micros", DataType::Int64, false).with_metadata(
5433                        [(
5434                            "logicalType".to_string(),
5435                            "arrow.duration-micros".to_string(),
5436                        )]
5437                        .into(),
5438                    ),
5439                    Field::new("duration_time_millis", DataType::Int64, false).with_metadata(
5440                        [(
5441                            "logicalType".to_string(),
5442                            "arrow.duration-millis".to_string(),
5443                        )]
5444                        .into(),
5445                    ),
5446                    Field::new("duration_time_seconds", DataType::Int64, false).with_metadata(
5447                        [(
5448                            "logicalType".to_string(),
5449                            "arrow.duration-seconds".to_string(),
5450                        )]
5451                        .into(),
5452                    ),
5453                ]));
5454
5455                let nanos =
5456                    Arc::new(PrimitiveArray::<Int64Type>::from(vec![10, 20, 30, 40])) as ArrayRef;
5457                let micros = Arc::new(PrimitiveArray::<Int64Type>::from(vec![100, 200, 300, 400]))
5458                    as ArrayRef;
5459                let millis = Arc::new(PrimitiveArray::<Int64Type>::from(vec![
5460                    1000, 2000, 3000, 4000,
5461                ])) as ArrayRef;
5462                let seconds =
5463                    Arc::new(PrimitiveArray::<Int64Type>::from(vec![1, 2, 3, 4])) as ArrayRef;
5464
5465                RecordBatch::try_new(schema, vec![nanos, micros, millis, seconds])?
5466            }
5467        };
5468
5469        assert_eq!(actual_batch, expected_batch);
5470
5471        Ok(())
5472    }
5473
5474    #[test]
5475    // TODO: avoid requiring snappy for this file
5476    #[cfg(feature = "snappy")]
5477    fn test_dict_pages_offset_zero() {
5478        let file = arrow_test_data("avro/dict-page-offset-zero.avro");
5479        let batch = read_file(&file, 32, false);
5480        let num_rows = batch.num_rows();
5481        let expected_field = Int32Array::from(vec![Some(1552); num_rows]);
5482        let expected = RecordBatch::try_from_iter_with_nullable([(
5483            "l_partkey",
5484            Arc::new(expected_field) as Arc<dyn Array>,
5485            true,
5486        )])
5487        .unwrap();
5488        assert_eq!(batch, expected);
5489    }
5490
5491    #[test]
5492    // TODO: avoid requiring snappy for this file
5493    #[cfg(feature = "snappy")]
5494    fn test_list_columns() {
5495        let file = arrow_test_data("avro/list_columns.avro");
5496        let mut int64_list_builder = ListBuilder::new(Int64Builder::new());
5497        {
5498            {
5499                let values = int64_list_builder.values();
5500                values.append_value(1);
5501                values.append_value(2);
5502                values.append_value(3);
5503            }
5504            int64_list_builder.append(true);
5505        }
5506        {
5507            {
5508                let values = int64_list_builder.values();
5509                values.append_null();
5510                values.append_value(1);
5511            }
5512            int64_list_builder.append(true);
5513        }
5514        {
5515            {
5516                let values = int64_list_builder.values();
5517                values.append_value(4);
5518            }
5519            int64_list_builder.append(true);
5520        }
5521        let int64_list = int64_list_builder.finish();
5522        let mut utf8_list_builder = ListBuilder::new(StringBuilder::new());
5523        {
5524            {
5525                let values = utf8_list_builder.values();
5526                values.append_value("abc");
5527                values.append_value("efg");
5528                values.append_value("hij");
5529            }
5530            utf8_list_builder.append(true);
5531        }
5532        {
5533            utf8_list_builder.append(false);
5534        }
5535        {
5536            {
5537                let values = utf8_list_builder.values();
5538                values.append_value("efg");
5539                values.append_null();
5540                values.append_value("hij");
5541                values.append_value("xyz");
5542            }
5543            utf8_list_builder.append(true);
5544        }
5545        let utf8_list = utf8_list_builder.finish();
5546        let expected = RecordBatch::try_from_iter_with_nullable([
5547            ("int64_list", Arc::new(int64_list) as Arc<dyn Array>, true),
5548            ("utf8_list", Arc::new(utf8_list) as Arc<dyn Array>, true),
5549        ])
5550        .unwrap();
5551        let batch = read_file(&file, 8, false);
5552        assert_eq!(batch, expected);
5553    }
5554
5555    #[test]
5556    #[cfg(feature = "snappy")]
5557    fn test_nested_lists() {
5558        use arrow_data::ArrayDataBuilder;
5559        let file = arrow_test_data("avro/nested_lists.snappy.avro");
5560        let inner_values = StringArray::from(vec![
5561            Some("a"),
5562            Some("b"),
5563            Some("c"),
5564            Some("d"),
5565            Some("a"),
5566            Some("b"),
5567            Some("c"),
5568            Some("d"),
5569            Some("e"),
5570            Some("a"),
5571            Some("b"),
5572            Some("c"),
5573            Some("d"),
5574            Some("e"),
5575            Some("f"),
5576        ]);
5577        let inner_offsets = Buffer::from_slice_ref([0, 2, 3, 3, 4, 6, 8, 8, 9, 11, 13, 14, 14, 15]);
5578        let inner_validity = [
5579            true, true, false, true, true, true, false, true, true, true, true, false, true,
5580        ];
5581        let inner_null_buffer = Buffer::from_iter(inner_validity.iter().copied());
5582        let inner_field = Field::new("item", DataType::Utf8, true);
5583        let inner_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(inner_field)))
5584            .len(13)
5585            .add_buffer(inner_offsets)
5586            .add_child_data(inner_values.to_data())
5587            .null_bit_buffer(Some(inner_null_buffer))
5588            .build()
5589            .unwrap();
5590        let inner_list_array = ListArray::from(inner_list_data);
5591        let middle_offsets = Buffer::from_slice_ref([0, 2, 4, 6, 8, 11, 13]);
5592        let middle_validity = [true; 6];
5593        let middle_null_buffer = Buffer::from_iter(middle_validity.iter().copied());
5594        let middle_field = Field::new("item", inner_list_array.data_type().clone(), true);
5595        let middle_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(middle_field)))
5596            .len(6)
5597            .add_buffer(middle_offsets)
5598            .add_child_data(inner_list_array.to_data())
5599            .null_bit_buffer(Some(middle_null_buffer))
5600            .build()
5601            .unwrap();
5602        let middle_list_array = ListArray::from(middle_list_data);
5603        let outer_offsets = Buffer::from_slice_ref([0, 2, 4, 6]);
5604        let outer_null_buffer = Buffer::from_slice_ref([0b111]); // all 3 rows valid
5605        let outer_field = Field::new("item", middle_list_array.data_type().clone(), true);
5606        let outer_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(outer_field)))
5607            .len(3)
5608            .add_buffer(outer_offsets)
5609            .add_child_data(middle_list_array.to_data())
5610            .null_bit_buffer(Some(outer_null_buffer))
5611            .build()
5612            .unwrap();
5613        let a_expected = ListArray::from(outer_list_data);
5614        let b_expected = Int32Array::from(vec![1, 1, 1]);
5615        let expected = RecordBatch::try_from_iter_with_nullable([
5616            ("a", Arc::new(a_expected) as Arc<dyn Array>, true),
5617            ("b", Arc::new(b_expected) as Arc<dyn Array>, true),
5618        ])
5619        .unwrap();
5620        let left = read_file(&file, 8, false);
5621        assert_eq!(left, expected, "Mismatch for batch size=8");
5622        let left_small = read_file(&file, 3, false);
5623        assert_eq!(left_small, expected, "Mismatch for batch size=3");
5624    }
5625
5626    #[test]
5627    fn test_simple() {
5628        let tests = [
5629            ("avro/simple_enum.avro", 4, build_expected_enum(), 2),
5630            ("avro/simple_fixed.avro", 2, build_expected_fixed(), 1),
5631        ];
5632
5633        fn build_expected_enum() -> RecordBatch {
5634            // Build the DictionaryArrays for f1, f2, f3
5635            let keys_f1 = Int32Array::from(vec![0, 1, 2, 3]);
5636            let vals_f1 = StringArray::from(vec!["a", "b", "c", "d"]);
5637            let f1_dict =
5638                DictionaryArray::<Int32Type>::try_new(keys_f1, Arc::new(vals_f1)).unwrap();
5639            let keys_f2 = Int32Array::from(vec![2, 3, 0, 1]);
5640            let vals_f2 = StringArray::from(vec!["e", "f", "g", "h"]);
5641            let f2_dict =
5642                DictionaryArray::<Int32Type>::try_new(keys_f2, Arc::new(vals_f2)).unwrap();
5643            let keys_f3 = Int32Array::from(vec![Some(1), Some(2), None, Some(0)]);
5644            let vals_f3 = StringArray::from(vec!["i", "j", "k"]);
5645            let f3_dict =
5646                DictionaryArray::<Int32Type>::try_new(keys_f3, Arc::new(vals_f3)).unwrap();
5647            let dict_type =
5648                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
5649            let mut md_f1 = HashMap::new();
5650            md_f1.insert(
5651                AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5652                r#"["a","b","c","d"]"#.to_string(),
5653            );
5654            md_f1.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum1".to_string());
5655            md_f1.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns1".to_string());
5656            let f1_field = Field::new("f1", dict_type.clone(), false).with_metadata(md_f1);
5657            let mut md_f2 = HashMap::new();
5658            md_f2.insert(
5659                AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5660                r#"["e","f","g","h"]"#.to_string(),
5661            );
5662            md_f2.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum2".to_string());
5663            md_f2.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns2".to_string());
5664            let f2_field = Field::new("f2", dict_type.clone(), false).with_metadata(md_f2);
5665            let mut md_f3 = HashMap::new();
5666            md_f3.insert(
5667                AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5668                r#"["i","j","k"]"#.to_string(),
5669            );
5670            md_f3.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum3".to_string());
5671            md_f3.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns1".to_string());
5672            let f3_field = Field::new("f3", dict_type.clone(), true).with_metadata(md_f3);
5673            let expected_schema = Arc::new(Schema::new(vec![f1_field, f2_field, f3_field]));
5674            RecordBatch::try_new(
5675                expected_schema,
5676                vec![
5677                    Arc::new(f1_dict) as Arc<dyn Array>,
5678                    Arc::new(f2_dict) as Arc<dyn Array>,
5679                    Arc::new(f3_dict) as Arc<dyn Array>,
5680                ],
5681            )
5682            .unwrap()
5683        }
5684
5685        fn build_expected_fixed() -> RecordBatch {
5686            let f1 =
5687                FixedSizeBinaryArray::try_from_iter(vec![b"abcde", b"12345"].into_iter()).unwrap();
5688            let f2 =
5689                FixedSizeBinaryArray::try_from_iter(vec![b"fghijklmno", b"1234567890"].into_iter())
5690                    .unwrap();
5691            let f3 = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
5692                vec![Some(b"ABCDEF" as &[u8]), None].into_iter(),
5693                6,
5694            )
5695            .unwrap();
5696
5697            // Add Avro named-type metadata for fixed fields
5698            let mut md_f1 = HashMap::new();
5699            md_f1.insert(
5700                crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5701                "fixed1".to_string(),
5702            );
5703            md_f1.insert(
5704                crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5705                "ns1".to_string(),
5706            );
5707
5708            let mut md_f2 = HashMap::new();
5709            md_f2.insert(
5710                crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5711                "fixed2".to_string(),
5712            );
5713            md_f2.insert(
5714                crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5715                "ns2".to_string(),
5716            );
5717
5718            let mut md_f3 = HashMap::new();
5719            md_f3.insert(
5720                crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5721                "fixed3".to_string(),
5722            );
5723            md_f3.insert(
5724                crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5725                "ns1".to_string(),
5726            );
5727
5728            let expected_schema = Arc::new(Schema::new(vec![
5729                Field::new("f1", DataType::FixedSizeBinary(5), false).with_metadata(md_f1),
5730                Field::new("f2", DataType::FixedSizeBinary(10), false).with_metadata(md_f2),
5731                Field::new("f3", DataType::FixedSizeBinary(6), true).with_metadata(md_f3),
5732            ]));
5733
5734            RecordBatch::try_new(
5735                expected_schema,
5736                vec![
5737                    Arc::new(f1) as Arc<dyn Array>,
5738                    Arc::new(f2) as Arc<dyn Array>,
5739                    Arc::new(f3) as Arc<dyn Array>,
5740                ],
5741            )
5742            .unwrap()
5743        }
5744        for (file_name, batch_size, expected, alt_batch_size) in tests {
5745            let file = arrow_test_data(file_name);
5746            let actual = read_file(&file, batch_size, false);
5747            assert_eq!(actual, expected);
5748            let actual2 = read_file(&file, alt_batch_size, false);
5749            assert_eq!(actual2, expected);
5750        }
5751    }
5752
5753    #[test]
5754    #[cfg(feature = "snappy")]
5755    fn test_single_nan() {
5756        let file = arrow_test_data("avro/single_nan.avro");
5757        let actual = read_file(&file, 1, false);
5758        use arrow_array::Float64Array;
5759        let schema = Arc::new(Schema::new(vec![Field::new(
5760            "mycol",
5761            DataType::Float64,
5762            true,
5763        )]));
5764        let col = Float64Array::from(vec![None]);
5765        let expected = RecordBatch::try_new(schema, vec![Arc::new(col)]).unwrap();
5766        assert_eq!(actual, expected);
5767        let actual2 = read_file(&file, 2, false);
5768        assert_eq!(actual2, expected);
5769    }
5770
5771    #[test]
5772    fn test_duration_uuid() {
5773        let batch = read_file("test/data/duration_uuid.avro", 4, false);
5774        let schema = batch.schema();
5775        let fields = schema.fields();
5776        assert_eq!(fields.len(), 2);
5777        assert_eq!(fields[0].name(), "duration_field");
5778        assert_eq!(
5779            fields[0].data_type(),
5780            &DataType::Interval(IntervalUnit::MonthDayNano)
5781        );
5782        assert_eq!(fields[1].name(), "uuid_field");
5783        assert_eq!(fields[1].data_type(), &DataType::FixedSizeBinary(16));
5784        assert_eq!(batch.num_rows(), 4);
5785        assert_eq!(batch.num_columns(), 2);
5786        let duration_array = batch
5787            .column(0)
5788            .as_any()
5789            .downcast_ref::<IntervalMonthDayNanoArray>()
5790            .unwrap();
5791        let expected_duration_array: IntervalMonthDayNanoArray = [
5792            Some(IntervalMonthDayNanoType::make_value(1, 15, 500_000_000)),
5793            Some(IntervalMonthDayNanoType::make_value(0, 5, 2_500_000_000)),
5794            Some(IntervalMonthDayNanoType::make_value(2, 0, 0)),
5795            Some(IntervalMonthDayNanoType::make_value(12, 31, 999_000_000)),
5796        ]
5797        .iter()
5798        .copied()
5799        .collect();
5800        assert_eq!(&expected_duration_array, duration_array);
5801        let uuid_array = batch
5802            .column(1)
5803            .as_any()
5804            .downcast_ref::<FixedSizeBinaryArray>()
5805            .unwrap();
5806        let expected_uuid_array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
5807            [
5808                Some([
5809                    0xfe, 0x7b, 0xc3, 0x0b, 0x4c, 0xe8, 0x4c, 0x5e, 0xb6, 0x7c, 0x22, 0x34, 0xa2,
5810                    0xd3, 0x8e, 0x66,
5811                ]),
5812                Some([
5813                    0xb3, 0x3f, 0x2a, 0xd7, 0x97, 0xb4, 0x4d, 0xe1, 0x8b, 0xfe, 0x94, 0x94, 0x1d,
5814                    0x60, 0x15, 0x6e,
5815                ]),
5816                Some([
5817                    0x5f, 0x74, 0x92, 0x64, 0x07, 0x4b, 0x40, 0x05, 0x84, 0xbf, 0x11, 0x5e, 0xa8,
5818                    0x4e, 0xd2, 0x0a,
5819                ]),
5820                Some([
5821                    0x08, 0x26, 0xcc, 0x06, 0xd2, 0xe3, 0x45, 0x99, 0xb4, 0xad, 0xaf, 0x5f, 0xa6,
5822                    0x90, 0x5c, 0xdb,
5823                ]),
5824            ]
5825            .into_iter(),
5826            16,
5827        )
5828        .unwrap();
5829        assert_eq!(&expected_uuid_array, uuid_array);
5830    }
5831
5832    #[test]
5833    #[cfg(feature = "snappy")]
5834    fn test_datapage_v2() {
5835        let file = arrow_test_data("avro/datapage_v2.snappy.avro");
5836        let batch = read_file(&file, 8, false);
5837        let a = StringArray::from(vec![
5838            Some("abc"),
5839            Some("abc"),
5840            Some("abc"),
5841            None,
5842            Some("abc"),
5843        ]);
5844        let b = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4), Some(5)]);
5845        let c = Float64Array::from(vec![Some(2.0), Some(3.0), Some(4.0), Some(5.0), Some(2.0)]);
5846        let d = BooleanArray::from(vec![
5847            Some(true),
5848            Some(true),
5849            Some(true),
5850            Some(false),
5851            Some(true),
5852        ]);
5853        let e_values = Int32Array::from(vec![
5854            Some(1),
5855            Some(2),
5856            Some(3),
5857            Some(1),
5858            Some(2),
5859            Some(3),
5860            Some(1),
5861            Some(2),
5862        ]);
5863        let e_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 3, 3, 3, 6, 8]));
5864        let e_validity = Some(NullBuffer::from(vec![true, false, false, true, true]));
5865        let field_e = Arc::new(Field::new("item", DataType::Int32, true));
5866        let e = ListArray::new(field_e, e_offsets, Arc::new(e_values), e_validity);
5867        let expected = RecordBatch::try_from_iter_with_nullable([
5868            ("a", Arc::new(a) as Arc<dyn Array>, true),
5869            ("b", Arc::new(b) as Arc<dyn Array>, true),
5870            ("c", Arc::new(c) as Arc<dyn Array>, true),
5871            ("d", Arc::new(d) as Arc<dyn Array>, true),
5872            ("e", Arc::new(e) as Arc<dyn Array>, true),
5873        ])
5874        .unwrap();
5875        assert_eq!(batch, expected);
5876    }
5877
5878    #[test]
5879    fn test_nested_records() {
5880        let f1_f1_1 = StringArray::from(vec!["aaa", "bbb"]);
5881        let f1_f1_2 = Int32Array::from(vec![10, 20]);
5882        let rounded_pi = (std::f64::consts::PI * 100.0).round() / 100.0;
5883        let f1_f1_3_1 = Float64Array::from(vec![rounded_pi, rounded_pi]);
5884        let f1_f1_3 = StructArray::from(vec![(
5885            Arc::new(Field::new("f1_3_1", DataType::Float64, false)),
5886            Arc::new(f1_f1_3_1) as Arc<dyn Array>,
5887        )]);
5888        // Add Avro named-type metadata to nested field f1_3 (ns3.record3)
5889        let mut f1_3_md: HashMap<String, String> = HashMap::new();
5890        f1_3_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns3".to_string());
5891        f1_3_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record3".to_string());
5892        let f1_expected = StructArray::from(vec![
5893            (
5894                Arc::new(Field::new("f1_1", DataType::Utf8, false)),
5895                Arc::new(f1_f1_1) as Arc<dyn Array>,
5896            ),
5897            (
5898                Arc::new(Field::new("f1_2", DataType::Int32, false)),
5899                Arc::new(f1_f1_2) as Arc<dyn Array>,
5900            ),
5901            (
5902                Arc::new(
5903                    Field::new(
5904                        "f1_3",
5905                        DataType::Struct(Fields::from(vec![Field::new(
5906                            "f1_3_1",
5907                            DataType::Float64,
5908                            false,
5909                        )])),
5910                        false,
5911                    )
5912                    .with_metadata(f1_3_md),
5913                ),
5914                Arc::new(f1_f1_3) as Arc<dyn Array>,
5915            ),
5916        ]);
5917        let f2_fields = [
5918            Field::new("f2_1", DataType::Boolean, false),
5919            Field::new("f2_2", DataType::Float32, false),
5920        ];
5921        let f2_struct_builder = StructBuilder::new(
5922            f2_fields
5923                .iter()
5924                .map(|f| Arc::new(f.clone()))
5925                .collect::<Vec<Arc<Field>>>(),
5926            vec![
5927                Box::new(BooleanBuilder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>,
5928                Box::new(Float32Builder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>,
5929            ],
5930        );
5931        let mut f2_list_builder = ListBuilder::new(f2_struct_builder);
5932        {
5933            let struct_builder = f2_list_builder.values();
5934            struct_builder.append(true);
5935            {
5936                let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
5937                b.append_value(true);
5938            }
5939            {
5940                let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
5941                b.append_value(1.2_f32);
5942            }
5943            struct_builder.append(true);
5944            {
5945                let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
5946                b.append_value(true);
5947            }
5948            {
5949                let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
5950                b.append_value(2.2_f32);
5951            }
5952            f2_list_builder.append(true);
5953        }
5954        {
5955            let struct_builder = f2_list_builder.values();
5956            struct_builder.append(true);
5957            {
5958                let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
5959                b.append_value(false);
5960            }
5961            {
5962                let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
5963                b.append_value(10.2_f32);
5964            }
5965            f2_list_builder.append(true);
5966        }
5967
5968        let list_array_with_nullable_items = f2_list_builder.finish();
5969        // Add Avro named-type metadata to f2's list item (ns4.record4)
5970        let mut f2_item_md: HashMap<String, String> = HashMap::new();
5971        f2_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record4".to_string());
5972        f2_item_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns4".to_string());
5973        let item_field = Arc::new(
5974            Field::new(
5975                "item",
5976                list_array_with_nullable_items.values().data_type().clone(),
5977                false, // items are non-nullable for f2
5978            )
5979            .with_metadata(f2_item_md),
5980        );
5981        let list_data_type = DataType::List(item_field);
5982        let f2_array_data = list_array_with_nullable_items
5983            .to_data()
5984            .into_builder()
5985            .data_type(list_data_type)
5986            .build()
5987            .unwrap();
5988        let f2_expected = ListArray::from(f2_array_data);
5989        let mut f3_struct_builder = StructBuilder::new(
5990            vec![Arc::new(Field::new("f3_1", DataType::Utf8, false))],
5991            vec![Box::new(StringBuilder::new()) as Box<dyn ArrayBuilder>],
5992        );
5993        f3_struct_builder.append(true);
5994        {
5995            let b = f3_struct_builder.field_builder::<StringBuilder>(0).unwrap();
5996            b.append_value("xyz");
5997        }
5998        f3_struct_builder.append(false);
5999        {
6000            let b = f3_struct_builder.field_builder::<StringBuilder>(0).unwrap();
6001            b.append_null();
6002        }
6003        let f3_expected = f3_struct_builder.finish();
6004        let f4_fields = [Field::new("f4_1", DataType::Int64, false)];
6005        let f4_struct_builder = StructBuilder::new(
6006            f4_fields
6007                .iter()
6008                .map(|f| Arc::new(f.clone()))
6009                .collect::<Vec<Arc<Field>>>(),
6010            vec![Box::new(Int64Builder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>],
6011        );
6012        let mut f4_list_builder = ListBuilder::new(f4_struct_builder);
6013        {
6014            let struct_builder = f4_list_builder.values();
6015            struct_builder.append(true);
6016            {
6017                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
6018                b.append_value(200);
6019            }
6020            struct_builder.append(false);
6021            {
6022                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
6023                b.append_null();
6024            }
6025            f4_list_builder.append(true);
6026        }
6027        {
6028            let struct_builder = f4_list_builder.values();
6029            struct_builder.append(false);
6030            {
6031                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
6032                b.append_null();
6033            }
6034            struct_builder.append(true);
6035            {
6036                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
6037                b.append_value(300);
6038            }
6039            f4_list_builder.append(true);
6040        }
6041        let f4_expected = f4_list_builder.finish();
6042        // Add Avro named-type metadata to f4's list item (ns6.record6), item is nullable
6043        let mut f4_item_md: HashMap<String, String> = HashMap::new();
6044        f4_item_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns6".to_string());
6045        f4_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record6".to_string());
6046        let f4_item_field = Arc::new(
6047            Field::new("item", f4_expected.values().data_type().clone(), true)
6048                .with_metadata(f4_item_md),
6049        );
6050        let f4_list_data_type = DataType::List(f4_item_field);
6051        let f4_array_data = f4_expected
6052            .to_data()
6053            .into_builder()
6054            .data_type(f4_list_data_type)
6055            .build()
6056            .unwrap();
6057        let f4_expected = ListArray::from(f4_array_data);
6058        // Build Schema with Avro named-type metadata on the top-level f1 and f3 fields
6059        let mut f1_md: HashMap<String, String> = HashMap::new();
6060        f1_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record2".to_string());
6061        f1_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns2".to_string());
6062        let mut f3_md: HashMap<String, String> = HashMap::new();
6063        f3_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns5".to_string());
6064        f3_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record5".to_string());
6065        let expected_schema = Schema::new(vec![
6066            Field::new("f1", f1_expected.data_type().clone(), false).with_metadata(f1_md),
6067            Field::new("f2", f2_expected.data_type().clone(), false),
6068            Field::new("f3", f3_expected.data_type().clone(), true).with_metadata(f3_md),
6069            Field::new("f4", f4_expected.data_type().clone(), false),
6070        ]);
6071        let expected = RecordBatch::try_new(
6072            Arc::new(expected_schema),
6073            vec![
6074                Arc::new(f1_expected) as Arc<dyn Array>,
6075                Arc::new(f2_expected) as Arc<dyn Array>,
6076                Arc::new(f3_expected) as Arc<dyn Array>,
6077                Arc::new(f4_expected) as Arc<dyn Array>,
6078            ],
6079        )
6080        .unwrap();
6081        let file = arrow_test_data("avro/nested_records.avro");
6082        let batch_large = read_file(&file, 8, false);
6083        assert_eq!(
6084            batch_large, expected,
6085            "Decoded RecordBatch does not match expected data for nested records (batch size 8)"
6086        );
6087        let batch_small = read_file(&file, 3, false);
6088        assert_eq!(
6089            batch_small, expected,
6090            "Decoded RecordBatch does not match expected data for nested records (batch size 3)"
6091        );
6092    }
6093
6094    #[test]
6095    // TODO: avoid requiring snappy for this file
6096    #[cfg(feature = "snappy")]
6097    fn test_repeated_no_annotation() {
6098        use arrow_data::ArrayDataBuilder;
6099        let file = arrow_test_data("avro/repeated_no_annotation.avro");
6100        let batch_large = read_file(&file, 8, false);
6101        // id column
6102        let id_array = Int32Array::from(vec![1, 2, 3, 4, 5, 6]);
6103        // Build the inner Struct<number:int64, kind:utf8>
6104        let number_array = Int64Array::from(vec![
6105            Some(5555555555),
6106            Some(1111111111),
6107            Some(1111111111),
6108            Some(2222222222),
6109            Some(3333333333),
6110        ]);
6111        let kind_array =
6112            StringArray::from(vec![None, Some("home"), Some("home"), None, Some("mobile")]);
6113        let phone_fields = Fields::from(vec![
6114            Field::new("number", DataType::Int64, true),
6115            Field::new("kind", DataType::Utf8, true),
6116        ]);
6117        let phone_struct_data = ArrayDataBuilder::new(DataType::Struct(phone_fields))
6118            .len(5)
6119            .child_data(vec![number_array.into_data(), kind_array.into_data()])
6120            .build()
6121            .unwrap();
6122        let phone_struct_array = StructArray::from(phone_struct_data);
6123        // Build List<item: Struct<...>> with Avro named-type metadata on the *element* field
6124        let phone_list_offsets = Buffer::from_slice_ref([0i32, 0, 0, 0, 1, 2, 5]);
6125        let phone_list_validity = Buffer::from_iter([false, false, true, true, true, true]);
6126        // The Avro schema names this inner record "phone" in namespace "topLevelRecord.phoneNumbers"
6127        let mut phone_item_md = HashMap::new();
6128        phone_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "phone".to_string());
6129        phone_item_md.insert(
6130            AVRO_NAMESPACE_METADATA_KEY.to_string(),
6131            "topLevelRecord.phoneNumbers".to_string(),
6132        );
6133        let phone_item_field = Field::new("item", phone_struct_array.data_type().clone(), true)
6134            .with_metadata(phone_item_md);
6135        let phone_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(phone_item_field)))
6136            .len(6)
6137            .add_buffer(phone_list_offsets)
6138            .null_bit_buffer(Some(phone_list_validity))
6139            .child_data(vec![phone_struct_array.into_data()])
6140            .build()
6141            .unwrap();
6142        let phone_list_array = ListArray::from(phone_list_data);
6143        // Wrap in Struct { phone: List<...> }
6144        let phone_numbers_validity = Buffer::from_iter([false, false, true, true, true, true]);
6145        let phone_numbers_field = Field::new("phone", phone_list_array.data_type().clone(), true);
6146        let phone_numbers_struct_data =
6147            ArrayDataBuilder::new(DataType::Struct(Fields::from(vec![phone_numbers_field])))
6148                .len(6)
6149                .null_bit_buffer(Some(phone_numbers_validity))
6150                .child_data(vec![phone_list_array.into_data()])
6151                .build()
6152                .unwrap();
6153        let phone_numbers_struct_array = StructArray::from(phone_numbers_struct_data);
6154        // Build the expected Schema, annotating the top-level "phoneNumbers" field with Avro name/namespace
6155        let mut phone_numbers_md = HashMap::new();
6156        phone_numbers_md.insert(
6157            AVRO_NAME_METADATA_KEY.to_string(),
6158            "phoneNumbers".to_string(),
6159        );
6160        phone_numbers_md.insert(
6161            AVRO_NAMESPACE_METADATA_KEY.to_string(),
6162            "topLevelRecord".to_string(),
6163        );
6164        let id_field = Field::new("id", DataType::Int32, true);
6165        let phone_numbers_schema_field = Field::new(
6166            "phoneNumbers",
6167            phone_numbers_struct_array.data_type().clone(),
6168            true,
6169        )
6170        .with_metadata(phone_numbers_md);
6171        let expected_schema = Schema::new(vec![id_field, phone_numbers_schema_field]);
6172        // Final expected RecordBatch (arrays already carry matching list-element metadata)
6173        let expected = RecordBatch::try_new(
6174            Arc::new(expected_schema),
6175            vec![
6176                Arc::new(id_array) as _,
6177                Arc::new(phone_numbers_struct_array) as _,
6178            ],
6179        )
6180        .unwrap();
6181        assert_eq!(batch_large, expected, "Mismatch for batch_size=8");
6182        let batch_small = read_file(&file, 3, false);
6183        assert_eq!(batch_small, expected, "Mismatch for batch_size=3");
6184    }
6185
6186    #[test]
6187    // TODO: avoid requiring snappy for this file
6188    #[cfg(feature = "snappy")]
6189    fn test_nonnullable_impala() {
6190        let file = arrow_test_data("avro/nonnullable.impala.avro");
6191        let id = Int64Array::from(vec![Some(8)]);
6192        let mut int_array_builder = ListBuilder::new(Int32Builder::new());
6193        {
6194            let vb = int_array_builder.values();
6195            vb.append_value(-1);
6196        }
6197        int_array_builder.append(true); // finalize one sub-list
6198        let int_array = int_array_builder.finish();
6199        let mut iaa_builder = ListBuilder::new(ListBuilder::new(Int32Builder::new()));
6200        {
6201            let inner_list_builder = iaa_builder.values();
6202            {
6203                let vb = inner_list_builder.values();
6204                vb.append_value(-1);
6205                vb.append_value(-2);
6206            }
6207            inner_list_builder.append(true);
6208            inner_list_builder.append(true);
6209        }
6210        iaa_builder.append(true);
6211        let int_array_array = iaa_builder.finish();
6212        let field_names = MapFieldNames {
6213            entry: "entries".to_string(),
6214            key: "key".to_string(),
6215            value: "value".to_string(),
6216        };
6217        let mut int_map_builder =
6218            MapBuilder::new(Some(field_names), StringBuilder::new(), Int32Builder::new());
6219        {
6220            let (keys, vals) = int_map_builder.entries();
6221            keys.append_value("k1");
6222            vals.append_value(-1);
6223        }
6224        int_map_builder.append(true).unwrap(); // finalize map for row 0
6225        let int_map = int_map_builder.finish();
6226        let field_names2 = MapFieldNames {
6227            entry: "entries".to_string(),
6228            key: "key".to_string(),
6229            value: "value".to_string(),
6230        };
6231        let mut ima_builder = ListBuilder::new(MapBuilder::new(
6232            Some(field_names2),
6233            StringBuilder::new(),
6234            Int32Builder::new(),
6235        ));
6236        {
6237            let map_builder = ima_builder.values();
6238            map_builder.append(true).unwrap();
6239            {
6240                let (keys, vals) = map_builder.entries();
6241                keys.append_value("k1");
6242                vals.append_value(1);
6243            }
6244            map_builder.append(true).unwrap();
6245            map_builder.append(true).unwrap();
6246            map_builder.append(true).unwrap();
6247        }
6248        ima_builder.append(true);
6249        let int_map_array_ = ima_builder.finish();
6250        // Helper metadata maps
6251        let meta_nested_struct: HashMap<String, String> = [
6252            ("avro.name", "nested_Struct"),
6253            ("avro.namespace", "topLevelRecord"),
6254        ]
6255        .into_iter()
6256        .map(|(k, v)| (k.to_string(), v.to_string()))
6257        .collect();
6258        let meta_c: HashMap<String, String> = [
6259            ("avro.name", "c"),
6260            ("avro.namespace", "topLevelRecord.nested_Struct"),
6261        ]
6262        .into_iter()
6263        .map(|(k, v)| (k.to_string(), v.to_string()))
6264        .collect();
6265        let meta_d_item_struct: HashMap<String, String> = [
6266            ("avro.name", "D"),
6267            ("avro.namespace", "topLevelRecord.nested_Struct.c"),
6268        ]
6269        .into_iter()
6270        .map(|(k, v)| (k.to_string(), v.to_string()))
6271        .collect();
6272        let meta_g_value: HashMap<String, String> = [
6273            ("avro.name", "G"),
6274            ("avro.namespace", "topLevelRecord.nested_Struct"),
6275        ]
6276        .into_iter()
6277        .map(|(k, v)| (k.to_string(), v.to_string()))
6278        .collect();
6279        let meta_h: HashMap<String, String> = [
6280            ("avro.name", "h"),
6281            ("avro.namespace", "topLevelRecord.nested_Struct.G"),
6282        ]
6283        .into_iter()
6284        .map(|(k, v)| (k.to_string(), v.to_string()))
6285        .collect();
6286        // Types used multiple times below
6287        let ef_struct_field = Arc::new(
6288            Field::new(
6289                "item",
6290                DataType::Struct(
6291                    vec![
6292                        Field::new("e", DataType::Int32, true),
6293                        Field::new("f", DataType::Utf8, true),
6294                    ]
6295                    .into(),
6296                ),
6297                true,
6298            )
6299            .with_metadata(meta_d_item_struct.clone()),
6300        );
6301        let d_inner_list_field = Arc::new(Field::new(
6302            "item",
6303            DataType::List(ef_struct_field.clone()),
6304            true,
6305        ));
6306        let d_field = Field::new("D", DataType::List(d_inner_list_field.clone()), true);
6307        // G.value.h.i : List<Float64>
6308        let i_list_field = Arc::new(Field::new("item", DataType::Float64, true));
6309        let i_field = Field::new("i", DataType::List(i_list_field.clone()), true);
6310        // G.value.h : Struct<{ i: List<Float64> }> with metadata (h)
6311        let h_field = Field::new("h", DataType::Struct(vec![i_field.clone()].into()), true)
6312            .with_metadata(meta_h.clone());
6313        // G.value : Struct<{ h: ... }> with metadata (G)
6314        let g_value_struct_field = Field::new(
6315            "value",
6316            DataType::Struct(vec![h_field.clone()].into()),
6317            true,
6318        )
6319        .with_metadata(meta_g_value.clone());
6320        // entries struct for Map G
6321        let entries_struct_field = Field::new(
6322            "entries",
6323            DataType::Struct(
6324                vec![
6325                    Field::new("key", DataType::Utf8, false),
6326                    g_value_struct_field.clone(),
6327                ]
6328                .into(),
6329            ),
6330            false,
6331        );
6332        // Top-level nested_Struct fields (include metadata on "c")
6333        let a_field = Arc::new(Field::new("a", DataType::Int32, true));
6334        let b_field = Arc::new(Field::new(
6335            "B",
6336            DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
6337            true,
6338        ));
6339        let c_field = Arc::new(
6340            Field::new("c", DataType::Struct(vec![d_field.clone()].into()), true)
6341                .with_metadata(meta_c.clone()),
6342        );
6343        let g_field = Arc::new(Field::new(
6344            "G",
6345            DataType::Map(Arc::new(entries_struct_field.clone()), false),
6346            true,
6347        ));
6348        // Now create builders that match these exact field types (so nested types carry metadata)
6349        let mut nested_sb = StructBuilder::new(
6350            vec![
6351                a_field.clone(),
6352                b_field.clone(),
6353                c_field.clone(),
6354                g_field.clone(),
6355            ],
6356            vec![
6357                Box::new(Int32Builder::new()),
6358                Box::new(ListBuilder::new(Int32Builder::new())),
6359                {
6360                    // builder for "c" with correctly typed "D" including metadata on inner list item
6361                    Box::new(StructBuilder::new(
6362                        vec![Arc::new(d_field.clone())],
6363                        vec![Box::new({
6364                            let ef_struct_builder = StructBuilder::new(
6365                                vec![
6366                                    Arc::new(Field::new("e", DataType::Int32, true)),
6367                                    Arc::new(Field::new("f", DataType::Utf8, true)),
6368                                ],
6369                                vec![
6370                                    Box::new(Int32Builder::new()),
6371                                    Box::new(StringBuilder::new()),
6372                                ],
6373                            );
6374                            // Inner list that holds Struct<e,f> with Avro named-type metadata ("D")
6375                            let list_of_ef = ListBuilder::new(ef_struct_builder)
6376                                .with_field(ef_struct_field.clone());
6377                            // Outer list for "D"
6378                            ListBuilder::new(list_of_ef)
6379                        })],
6380                    ))
6381                },
6382                {
6383                    let map_field_names = MapFieldNames {
6384                        entry: "entries".to_string(),
6385                        key: "key".to_string(),
6386                        value: "value".to_string(),
6387                    };
6388                    let i_list_builder = ListBuilder::new(Float64Builder::new());
6389                    let h_struct_builder = StructBuilder::new(
6390                        vec![Arc::new(Field::new(
6391                            "i",
6392                            DataType::List(i_list_field.clone()),
6393                            true,
6394                        ))],
6395                        vec![Box::new(i_list_builder)],
6396                    );
6397                    let g_value_builder = StructBuilder::new(
6398                        vec![Arc::new(
6399                            Field::new("h", DataType::Struct(vec![i_field.clone()].into()), true)
6400                                .with_metadata(meta_h.clone()),
6401                        )],
6402                        vec![Box::new(h_struct_builder)],
6403                    );
6404                    // Use with_values_field to attach metadata to "value" field in the map's entries
6405                    let map_builder = MapBuilder::new(
6406                        Some(map_field_names),
6407                        StringBuilder::new(),
6408                        g_value_builder,
6409                    )
6410                    .with_values_field(Arc::new(
6411                        Field::new(
6412                            "value",
6413                            DataType::Struct(vec![h_field.clone()].into()),
6414                            true,
6415                        )
6416                        .with_metadata(meta_g_value.clone()),
6417                    ));
6418
6419                    Box::new(map_builder)
6420                },
6421            ],
6422        );
6423        nested_sb.append(true);
6424        {
6425            let a_builder = nested_sb.field_builder::<Int32Builder>(0).unwrap();
6426            a_builder.append_value(-1);
6427        }
6428        {
6429            let b_builder = nested_sb
6430                .field_builder::<ListBuilder<Int32Builder>>(1)
6431                .unwrap();
6432            {
6433                let vb = b_builder.values();
6434                vb.append_value(-1);
6435            }
6436            b_builder.append(true);
6437        }
6438        {
6439            let c_struct_builder = nested_sb.field_builder::<StructBuilder>(2).unwrap();
6440            c_struct_builder.append(true);
6441            let d_list_builder = c_struct_builder
6442                .field_builder::<ListBuilder<ListBuilder<StructBuilder>>>(0)
6443                .unwrap();
6444            {
6445                let sub_list_builder = d_list_builder.values();
6446                {
6447                    let ef_struct = sub_list_builder.values();
6448                    ef_struct.append(true);
6449                    {
6450                        let e_b = ef_struct.field_builder::<Int32Builder>(0).unwrap();
6451                        e_b.append_value(-1);
6452                        let f_b = ef_struct.field_builder::<StringBuilder>(1).unwrap();
6453                        f_b.append_value("nonnullable");
6454                    }
6455                    sub_list_builder.append(true);
6456                }
6457                d_list_builder.append(true);
6458            }
6459        }
6460        {
6461            let g_map_builder = nested_sb
6462                .field_builder::<MapBuilder<StringBuilder, StructBuilder>>(3)
6463                .unwrap();
6464            g_map_builder.append(true).unwrap();
6465        }
6466        let nested_struct = nested_sb.finish();
6467        let schema = Arc::new(arrow_schema::Schema::new(vec![
6468            Field::new("ID", id.data_type().clone(), true),
6469            Field::new("Int_Array", int_array.data_type().clone(), true),
6470            Field::new("int_array_array", int_array_array.data_type().clone(), true),
6471            Field::new("Int_Map", int_map.data_type().clone(), true),
6472            Field::new("int_map_array", int_map_array_.data_type().clone(), true),
6473            Field::new("nested_Struct", nested_struct.data_type().clone(), true)
6474                .with_metadata(meta_nested_struct.clone()),
6475        ]));
6476        let expected = RecordBatch::try_new(
6477            schema,
6478            vec![
6479                Arc::new(id) as Arc<dyn Array>,
6480                Arc::new(int_array),
6481                Arc::new(int_array_array),
6482                Arc::new(int_map),
6483                Arc::new(int_map_array_),
6484                Arc::new(nested_struct),
6485            ],
6486        )
6487        .unwrap();
6488        let batch_large = read_file(&file, 8, false);
6489        assert_eq!(batch_large, expected, "Mismatch for batch_size=8");
6490        let batch_small = read_file(&file, 3, false);
6491        assert_eq!(batch_small, expected, "Mismatch for batch_size=3");
6492    }
6493
6494    #[test]
6495    fn test_nonnullable_impala_strict() {
6496        let file = arrow_test_data("avro/nonnullable.impala.avro");
6497        let err = read_file_strict(&file, 8, false).unwrap_err();
6498        assert!(err.to_string().contains(
6499            "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
6500        ));
6501    }
6502
6503    #[test]
6504    // TODO: avoid requiring snappy for this file
6505    #[cfg(feature = "snappy")]
6506    fn test_nullable_impala() {
6507        let file = arrow_test_data("avro/nullable.impala.avro");
6508        let batch1 = read_file(&file, 3, false);
6509        let batch2 = read_file(&file, 8, false);
6510        assert_eq!(batch1, batch2);
6511        let batch = batch1;
6512        assert_eq!(batch.num_rows(), 7);
6513        let id_array = batch
6514            .column(0)
6515            .as_any()
6516            .downcast_ref::<Int64Array>()
6517            .expect("id column should be an Int64Array");
6518        let expected_ids = [1, 2, 3, 4, 5, 6, 7];
6519        for (i, &expected_id) in expected_ids.iter().enumerate() {
6520            assert_eq!(id_array.value(i), expected_id, "Mismatch in id at row {i}",);
6521        }
6522        let int_array = batch
6523            .column(1)
6524            .as_any()
6525            .downcast_ref::<ListArray>()
6526            .expect("int_array column should be a ListArray");
6527        {
6528            let offsets = int_array.value_offsets();
6529            let start = offsets[0] as usize;
6530            let end = offsets[1] as usize;
6531            let values = int_array
6532                .values()
6533                .as_any()
6534                .downcast_ref::<Int32Array>()
6535                .expect("Values of int_array should be an Int32Array");
6536            let row0: Vec<Option<i32>> = (start..end).map(|i| Some(values.value(i))).collect();
6537            assert_eq!(
6538                row0,
6539                vec![Some(1), Some(2), Some(3)],
6540                "Mismatch in int_array row 0"
6541            );
6542        }
6543        let nested_struct = batch
6544            .column(5)
6545            .as_any()
6546            .downcast_ref::<StructArray>()
6547            .expect("nested_struct column should be a StructArray");
6548        let a_array = nested_struct
6549            .column_by_name("A")
6550            .expect("Field A should exist in nested_struct")
6551            .as_any()
6552            .downcast_ref::<Int32Array>()
6553            .expect("Field A should be an Int32Array");
6554        assert_eq!(a_array.value(0), 1, "Mismatch in nested_struct.A at row 0");
6555        assert!(
6556            !a_array.is_valid(1),
6557            "Expected null in nested_struct.A at row 1"
6558        );
6559        assert!(
6560            !a_array.is_valid(3),
6561            "Expected null in nested_struct.A at row 3"
6562        );
6563        assert_eq!(a_array.value(6), 7, "Mismatch in nested_struct.A at row 6");
6564    }
6565
6566    #[test]
6567    fn test_nullable_impala_strict() {
6568        let file = arrow_test_data("avro/nullable.impala.avro");
6569        let err = read_file_strict(&file, 8, false).unwrap_err();
6570        assert!(err.to_string().contains(
6571            "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
6572        ));
6573    }
6574
6575    #[test]
6576    fn test_nested_record_type_reuse() {
6577        // The .avro file has the following schema:
6578        // {
6579        // "type" : "record",
6580        // "name" : "Record",
6581        // "fields" : [ {
6582        //     "name" : "nested",
6583        //     "type" : {
6584        //     "type" : "record",
6585        //     "name" : "Nested",
6586        //     "fields" : [ {
6587        //         "name" : "nested_int",
6588        //         "type" : "int"
6589        //     } ]
6590        //     }
6591        // }, {
6592        //     "name" : "nestedRecord",
6593        //     "type" : "Nested"
6594        // }, {
6595        //     "name" : "nestedArray",
6596        //     "type" : {
6597        //     "type" : "array",
6598        //     "items" : "Nested"
6599        //     }
6600        // } ]
6601        // }
6602        let batch = read_file("test/data/nested_record_reuse.avro", 8, false);
6603        let schema = batch.schema();
6604
6605        // Verify schema structure
6606        assert_eq!(schema.fields().len(), 3);
6607        let fields = schema.fields();
6608        assert_eq!(fields[0].name(), "nested");
6609        assert_eq!(fields[1].name(), "nestedRecord");
6610        assert_eq!(fields[2].name(), "nestedArray");
6611        assert!(matches!(fields[0].data_type(), DataType::Struct(_)));
6612        assert!(matches!(fields[1].data_type(), DataType::Struct(_)));
6613        assert!(matches!(fields[2].data_type(), DataType::List(_)));
6614
6615        // Validate that the nested record type
6616        if let DataType::Struct(nested_fields) = fields[0].data_type() {
6617            assert_eq!(nested_fields.len(), 1);
6618            assert_eq!(nested_fields[0].name(), "nested_int");
6619            assert_eq!(nested_fields[0].data_type(), &DataType::Int32);
6620        }
6621
6622        // Validate that the nested record type is reused
6623        assert_eq!(fields[0].data_type(), fields[1].data_type());
6624        if let DataType::List(array_field) = fields[2].data_type() {
6625            assert_eq!(array_field.data_type(), fields[0].data_type());
6626        }
6627
6628        // Validate data
6629        assert_eq!(batch.num_rows(), 2);
6630        assert_eq!(batch.num_columns(), 3);
6631
6632        // Validate the first column (nested)
6633        let nested_col = batch
6634            .column(0)
6635            .as_any()
6636            .downcast_ref::<StructArray>()
6637            .unwrap();
6638        let nested_int_array = nested_col
6639            .column_by_name("nested_int")
6640            .unwrap()
6641            .as_any()
6642            .downcast_ref::<Int32Array>()
6643            .unwrap();
6644        assert_eq!(nested_int_array.value(0), 42);
6645        assert_eq!(nested_int_array.value(1), 99);
6646
6647        // Validate the second column (nestedRecord)
6648        let nested_record_col = batch
6649            .column(1)
6650            .as_any()
6651            .downcast_ref::<StructArray>()
6652            .unwrap();
6653        let nested_record_int_array = nested_record_col
6654            .column_by_name("nested_int")
6655            .unwrap()
6656            .as_any()
6657            .downcast_ref::<Int32Array>()
6658            .unwrap();
6659        assert_eq!(nested_record_int_array.value(0), 100);
6660        assert_eq!(nested_record_int_array.value(1), 200);
6661
6662        // Validate the third column (nestedArray)
6663        let nested_array_col = batch
6664            .column(2)
6665            .as_any()
6666            .downcast_ref::<ListArray>()
6667            .unwrap();
6668        assert_eq!(nested_array_col.len(), 2);
6669        let first_array_struct = nested_array_col.value(0);
6670        let first_array_struct_array = first_array_struct
6671            .as_any()
6672            .downcast_ref::<StructArray>()
6673            .unwrap();
6674        let first_array_int_values = first_array_struct_array
6675            .column_by_name("nested_int")
6676            .unwrap()
6677            .as_any()
6678            .downcast_ref::<Int32Array>()
6679            .unwrap();
6680        assert_eq!(first_array_int_values.len(), 3);
6681        assert_eq!(first_array_int_values.value(0), 1);
6682        assert_eq!(first_array_int_values.value(1), 2);
6683        assert_eq!(first_array_int_values.value(2), 3);
6684    }
6685
6686    #[test]
6687    fn test_enum_type_reuse() {
6688        // The .avro file has the following schema:
6689        // {
6690        //     "type" : "record",
6691        //     "name" : "Record",
6692        //     "fields" : [ {
6693        //       "name" : "status",
6694        //       "type" : {
6695        //         "type" : "enum",
6696        //         "name" : "Status",
6697        //         "symbols" : [ "ACTIVE", "INACTIVE", "PENDING" ]
6698        //       }
6699        //     }, {
6700        //       "name" : "backupStatus",
6701        //       "type" : "Status"
6702        //     }, {
6703        //       "name" : "statusHistory",
6704        //       "type" : {
6705        //         "type" : "array",
6706        //         "items" : "Status"
6707        //       }
6708        //     } ]
6709        //   }
6710        let batch = read_file("test/data/enum_reuse.avro", 8, false);
6711        let schema = batch.schema();
6712
6713        // Verify schema structure
6714        assert_eq!(schema.fields().len(), 3);
6715        let fields = schema.fields();
6716        assert_eq!(fields[0].name(), "status");
6717        assert_eq!(fields[1].name(), "backupStatus");
6718        assert_eq!(fields[2].name(), "statusHistory");
6719        assert!(matches!(fields[0].data_type(), DataType::Dictionary(_, _)));
6720        assert!(matches!(fields[1].data_type(), DataType::Dictionary(_, _)));
6721        assert!(matches!(fields[2].data_type(), DataType::List(_)));
6722
6723        if let DataType::Dictionary(key_type, value_type) = fields[0].data_type() {
6724            assert_eq!(key_type.as_ref(), &DataType::Int32);
6725            assert_eq!(value_type.as_ref(), &DataType::Utf8);
6726        }
6727
6728        // Validate that the enum types are reused
6729        assert_eq!(fields[0].data_type(), fields[1].data_type());
6730        if let DataType::List(array_field) = fields[2].data_type() {
6731            assert_eq!(array_field.data_type(), fields[0].data_type());
6732        }
6733
6734        // Validate data - should have 2 rows
6735        assert_eq!(batch.num_rows(), 2);
6736        assert_eq!(batch.num_columns(), 3);
6737
6738        // Get status enum values
6739        let status_col = batch
6740            .column(0)
6741            .as_any()
6742            .downcast_ref::<DictionaryArray<Int32Type>>()
6743            .unwrap();
6744        let status_values = status_col
6745            .values()
6746            .as_any()
6747            .downcast_ref::<StringArray>()
6748            .unwrap();
6749
6750        // First row should be "ACTIVE", second row should be "PENDING"
6751        assert_eq!(
6752            status_values.value(status_col.key(0).unwrap() as usize),
6753            "ACTIVE"
6754        );
6755        assert_eq!(
6756            status_values.value(status_col.key(1).unwrap() as usize),
6757            "PENDING"
6758        );
6759
6760        // Get backupStatus enum values (same as status)
6761        let backup_status_col = batch
6762            .column(1)
6763            .as_any()
6764            .downcast_ref::<DictionaryArray<Int32Type>>()
6765            .unwrap();
6766        let backup_status_values = backup_status_col
6767            .values()
6768            .as_any()
6769            .downcast_ref::<StringArray>()
6770            .unwrap();
6771
6772        // First row should be "INACTIVE", second row should be "ACTIVE"
6773        assert_eq!(
6774            backup_status_values.value(backup_status_col.key(0).unwrap() as usize),
6775            "INACTIVE"
6776        );
6777        assert_eq!(
6778            backup_status_values.value(backup_status_col.key(1).unwrap() as usize),
6779            "ACTIVE"
6780        );
6781
6782        // Get statusHistory array
6783        let status_history_col = batch
6784            .column(2)
6785            .as_any()
6786            .downcast_ref::<ListArray>()
6787            .unwrap();
6788        assert_eq!(status_history_col.len(), 2);
6789
6790        // Validate first row's array data
6791        let first_array_dict = status_history_col.value(0);
6792        let first_array_dict_array = first_array_dict
6793            .as_any()
6794            .downcast_ref::<DictionaryArray<Int32Type>>()
6795            .unwrap();
6796        let first_array_values = first_array_dict_array
6797            .values()
6798            .as_any()
6799            .downcast_ref::<StringArray>()
6800            .unwrap();
6801
6802        // First row: ["PENDING", "ACTIVE", "INACTIVE"]
6803        assert_eq!(first_array_dict_array.len(), 3);
6804        assert_eq!(
6805            first_array_values.value(first_array_dict_array.key(0).unwrap() as usize),
6806            "PENDING"
6807        );
6808        assert_eq!(
6809            first_array_values.value(first_array_dict_array.key(1).unwrap() as usize),
6810            "ACTIVE"
6811        );
6812        assert_eq!(
6813            first_array_values.value(first_array_dict_array.key(2).unwrap() as usize),
6814            "INACTIVE"
6815        );
6816    }
6817
6818    #[test]
6819    fn comprehensive_e2e_test() {
6820        let path = "test/data/comprehensive_e2e.avro";
6821        let batch = read_file(path, 1024, false);
6822        let schema = batch.schema();
6823
6824        #[inline]
6825        fn tid_by_name(fields: &UnionFields, want: &str) -> i8 {
6826            for (tid, f) in fields.iter() {
6827                if f.name() == want {
6828                    return tid;
6829                }
6830            }
6831            panic!("union child '{want}' not found");
6832        }
6833
6834        #[inline]
6835        fn tid_by_dt(fields: &UnionFields, pred: impl Fn(&DataType) -> bool) -> i8 {
6836            for (tid, f) in fields.iter() {
6837                if pred(f.data_type()) {
6838                    return tid;
6839                }
6840            }
6841            panic!("no union child matches predicate");
6842        }
6843
6844        fn mk_dense_union(
6845            fields: &UnionFields,
6846            type_ids: Vec<i8>,
6847            offsets: Vec<i32>,
6848            provide: impl Fn(&Field) -> Option<ArrayRef>,
6849        ) -> ArrayRef {
6850            fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
6851                match dt {
6852                    DataType::Null => Arc::new(NullArray::new(0)),
6853                    DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
6854                    DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
6855                    DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
6856                    DataType::Float32 => Arc::new(Float32Array::from(Vec::<f32>::new())),
6857                    DataType::Float64 => Arc::new(Float64Array::from(Vec::<f64>::new())),
6858                    DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
6859                    DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
6860                    DataType::Date32 => Arc::new(Date32Array::from(Vec::<i32>::new())),
6861                    DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
6862                        Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
6863                    }
6864                    DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
6865                        Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
6866                    }
6867                    DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
6868                        let a = TimestampMillisecondArray::from(Vec::<i64>::new());
6869                        Arc::new(if let Some(tz) = tz {
6870                            a.with_timezone(tz.clone())
6871                        } else {
6872                            a
6873                        })
6874                    }
6875                    DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
6876                        let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
6877                        Arc::new(if let Some(tz) = tz {
6878                            a.with_timezone(tz.clone())
6879                        } else {
6880                            a
6881                        })
6882                    }
6883                    DataType::Interval(IntervalUnit::MonthDayNano) => Arc::new(
6884                        IntervalMonthDayNanoArray::from(Vec::<IntervalMonthDayNano>::new()),
6885                    ),
6886                    DataType::FixedSizeBinary(sz) => Arc::new(
6887                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(
6888                            std::iter::empty::<Option<Vec<u8>>>(),
6889                            *sz,
6890                        )
6891                        .unwrap(),
6892                    ),
6893                    DataType::Dictionary(_, _) => {
6894                        let keys = Int32Array::from(Vec::<i32>::new());
6895                        let values = Arc::new(StringArray::from(Vec::<&str>::new()));
6896                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
6897                    }
6898                    DataType::Struct(fields) => {
6899                        let children: Vec<ArrayRef> = fields
6900                            .iter()
6901                            .map(|f| empty_child_for(f.data_type()) as ArrayRef)
6902                            .collect();
6903                        Arc::new(StructArray::new(fields.clone(), children, None))
6904                    }
6905                    DataType::List(field) => {
6906                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
6907                        Arc::new(
6908                            ListArray::try_new(
6909                                field.clone(),
6910                                offsets,
6911                                empty_child_for(field.data_type()),
6912                                None,
6913                            )
6914                            .unwrap(),
6915                        )
6916                    }
6917                    DataType::Map(entry_field, is_sorted) => {
6918                        let (key_field, val_field) = match entry_field.data_type() {
6919                            DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
6920                            other => panic!("unexpected map entries type: {other:?}"),
6921                        };
6922                        let keys = StringArray::from(Vec::<&str>::new());
6923                        let vals: ArrayRef = match val_field.data_type() {
6924                            DataType::Null => Arc::new(NullArray::new(0)) as ArrayRef,
6925                            DataType::Boolean => {
6926                                Arc::new(BooleanArray::from(Vec::<bool>::new())) as ArrayRef
6927                            }
6928                            DataType::Int32 => {
6929                                Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
6930                            }
6931                            DataType::Int64 => {
6932                                Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
6933                            }
6934                            DataType::Float32 => {
6935                                Arc::new(Float32Array::from(Vec::<f32>::new())) as ArrayRef
6936                            }
6937                            DataType::Float64 => {
6938                                Arc::new(Float64Array::from(Vec::<f64>::new())) as ArrayRef
6939                            }
6940                            DataType::Utf8 => {
6941                                Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
6942                            }
6943                            DataType::Binary => {
6944                                Arc::new(BinaryArray::from(Vec::<&[u8]>::new())) as ArrayRef
6945                            }
6946                            DataType::Union(uf, _) => {
6947                                let children: Vec<ArrayRef> = uf
6948                                    .iter()
6949                                    .map(|(_, f)| empty_child_for(f.data_type()))
6950                                    .collect();
6951                                Arc::new(
6952                                    UnionArray::try_new(
6953                                        uf.clone(),
6954                                        ScalarBuffer::<i8>::from(Vec::<i8>::new()),
6955                                        Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
6956                                        children,
6957                                    )
6958                                    .unwrap(),
6959                                ) as ArrayRef
6960                            }
6961                            other => panic!("unsupported map value type: {other:?}"),
6962                        };
6963                        let entries = StructArray::new(
6964                            Fields::from(vec![
6965                                key_field.as_ref().clone(),
6966                                val_field.as_ref().clone(),
6967                            ]),
6968                            vec![Arc::new(keys) as ArrayRef, vals],
6969                            None,
6970                        );
6971                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
6972                        Arc::new(MapArray::new(
6973                            entry_field.clone(),
6974                            offsets,
6975                            entries,
6976                            None,
6977                            *is_sorted,
6978                        ))
6979                    }
6980                    other => panic!("empty_child_for: unhandled type {other:?}"),
6981                }
6982            }
6983            let children: Vec<ArrayRef> = fields
6984                .iter()
6985                .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
6986                .collect();
6987            Arc::new(
6988                UnionArray::try_new(
6989                    fields.clone(),
6990                    ScalarBuffer::<i8>::from(type_ids),
6991                    Some(ScalarBuffer::<i32>::from(offsets)),
6992                    children,
6993                )
6994                .unwrap(),
6995            ) as ArrayRef
6996        }
6997
6998        #[inline]
6999        fn uuid16_from_str(s: &str) -> [u8; 16] {
7000            let mut out = [0u8; 16];
7001            let mut idx = 0usize;
7002            let mut hi: Option<u8> = None;
7003            for ch in s.chars() {
7004                if ch == '-' {
7005                    continue;
7006                }
7007                let v = ch.to_digit(16).expect("invalid hex digit in UUID") as u8;
7008                if let Some(h) = hi {
7009                    out[idx] = (h << 4) | v;
7010                    idx += 1;
7011                    hi = None;
7012                } else {
7013                    hi = Some(v);
7014                }
7015            }
7016            assert_eq!(idx, 16, "UUID must decode to 16 bytes");
7017            out
7018        }
7019        let date_a: i32 = 19_000; // 2022-01-08
7020        let time_ms_a: i32 = 12 * 3_600_000 + 34 * 60_000 + 56_000 + 789;
7021        let time_us_eod: i64 = 86_400_000_000 - 1;
7022        let ts_ms_2024_01_01: i64 = 1_704_067_200_000; // 2024-01-01T00:00:00Z
7023        let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1_000;
7024        let dur_small = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
7025        let dur_zero = IntervalMonthDayNanoType::make_value(0, 0, 0);
7026        let dur_large =
7027            IntervalMonthDayNanoType::make_value(12, 31, ((86_400_000 - 1) as i64) * 1_000_000);
7028        let dur_2years = IntervalMonthDayNanoType::make_value(24, 0, 0);
7029        let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
7030        let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
7031
7032        #[inline]
7033        fn push_like(
7034            reader_schema: &arrow_schema::Schema,
7035            name: &str,
7036            arr: ArrayRef,
7037            fields: &mut Vec<FieldRef>,
7038            cols: &mut Vec<ArrayRef>,
7039        ) {
7040            let src = reader_schema
7041                .field_with_name(name)
7042                .unwrap_or_else(|_| panic!("source schema missing field '{name}'"));
7043            let mut f = Field::new(name, arr.data_type().clone(), src.is_nullable());
7044            let md = src.metadata();
7045            if !md.is_empty() {
7046                f = f.with_metadata(md.clone());
7047            }
7048            fields.push(Arc::new(f));
7049            cols.push(arr);
7050        }
7051
7052        let mut fields: Vec<FieldRef> = Vec::new();
7053        let mut columns: Vec<ArrayRef> = Vec::new();
7054        push_like(
7055            schema.as_ref(),
7056            "id",
7057            Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef,
7058            &mut fields,
7059            &mut columns,
7060        );
7061        push_like(
7062            schema.as_ref(),
7063            "flag",
7064            Arc::new(BooleanArray::from(vec![true, false, true, false])) as ArrayRef,
7065            &mut fields,
7066            &mut columns,
7067        );
7068        push_like(
7069            schema.as_ref(),
7070            "ratio_f32",
7071            Arc::new(Float32Array::from(vec![1.25f32, -0.0, 3.5, 9.75])) as ArrayRef,
7072            &mut fields,
7073            &mut columns,
7074        );
7075        push_like(
7076            schema.as_ref(),
7077            "ratio_f64",
7078            Arc::new(Float64Array::from(vec![2.5f64, -1.0, 7.0, -2.25])) as ArrayRef,
7079            &mut fields,
7080            &mut columns,
7081        );
7082        push_like(
7083            schema.as_ref(),
7084            "count_i32",
7085            Arc::new(Int32Array::from(vec![7, -1, 0, 123])) as ArrayRef,
7086            &mut fields,
7087            &mut columns,
7088        );
7089        push_like(
7090            schema.as_ref(),
7091            "count_i64",
7092            Arc::new(Int64Array::from(vec![
7093                7_000_000_000i64,
7094                -2,
7095                0,
7096                -9_876_543_210i64,
7097            ])) as ArrayRef,
7098            &mut fields,
7099            &mut columns,
7100        );
7101        push_like(
7102            schema.as_ref(),
7103            "opt_i32_nullfirst",
7104            Arc::new(Int32Array::from(vec![None, Some(42), None, Some(0)])) as ArrayRef,
7105            &mut fields,
7106            &mut columns,
7107        );
7108        push_like(
7109            schema.as_ref(),
7110            "opt_str_nullsecond",
7111            Arc::new(StringArray::from(vec![
7112                Some("alpha"),
7113                None,
7114                Some("s3"),
7115                Some(""),
7116            ])) as ArrayRef,
7117            &mut fields,
7118            &mut columns,
7119        );
7120        {
7121            let uf = match schema
7122                .field_with_name("tri_union_prim")
7123                .unwrap()
7124                .data_type()
7125            {
7126                DataType::Union(f, UnionMode::Dense) => f.clone(),
7127                other => panic!("tri_union_prim should be dense union, got {other:?}"),
7128            };
7129            let tid_i = tid_by_name(&uf, "int");
7130            let tid_s = tid_by_name(&uf, "string");
7131            let tid_b = tid_by_name(&uf, "boolean");
7132            let tids = vec![tid_i, tid_s, tid_b, tid_s];
7133            let offs = vec![0, 0, 0, 1];
7134            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7135                DataType::Int32 => Some(Arc::new(Int32Array::from(vec![0])) as ArrayRef),
7136                DataType::Utf8 => Some(Arc::new(StringArray::from(vec!["hi", ""])) as ArrayRef),
7137                DataType::Boolean => Some(Arc::new(BooleanArray::from(vec![true])) as ArrayRef),
7138                _ => None,
7139            });
7140            push_like(
7141                schema.as_ref(),
7142                "tri_union_prim",
7143                arr,
7144                &mut fields,
7145                &mut columns,
7146            );
7147        }
7148
7149        push_like(
7150            schema.as_ref(),
7151            "str_utf8",
7152            Arc::new(StringArray::from(vec!["hello", "", "world", "✓ unicode"])) as ArrayRef,
7153            &mut fields,
7154            &mut columns,
7155        );
7156        push_like(
7157            schema.as_ref(),
7158            "raw_bytes",
7159            Arc::new(BinaryArray::from(vec![
7160                b"\x00\x01".as_ref(),
7161                b"".as_ref(),
7162                b"\xFF\x00".as_ref(),
7163                b"\x10\x20\x30\x40".as_ref(),
7164            ])) as ArrayRef,
7165            &mut fields,
7166            &mut columns,
7167        );
7168        {
7169            let it = [
7170                Some(*b"0123456789ABCDEF"),
7171                Some([0u8; 16]),
7172                Some(*b"ABCDEFGHIJKLMNOP"),
7173                Some([0xAA; 16]),
7174            ]
7175            .into_iter();
7176            let arr =
7177                Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap())
7178                    as ArrayRef;
7179            push_like(
7180                schema.as_ref(),
7181                "fx16_plain",
7182                arr,
7183                &mut fields,
7184                &mut columns,
7185            );
7186        }
7187        {
7188            #[cfg(feature = "small_decimals")]
7189            let dec10_2 = Arc::new(
7190                Decimal64Array::from_iter_values([123456i64, -1, 0, 9_999_999_999i64])
7191                    .with_precision_and_scale(10, 2)
7192                    .unwrap(),
7193            ) as ArrayRef;
7194            #[cfg(not(feature = "small_decimals"))]
7195            let dec10_2 = Arc::new(
7196                Decimal128Array::from_iter_values([123456i128, -1, 0, 9_999_999_999i128])
7197                    .with_precision_and_scale(10, 2)
7198                    .unwrap(),
7199            ) as ArrayRef;
7200            push_like(
7201                schema.as_ref(),
7202                "dec_bytes_s10_2",
7203                dec10_2,
7204                &mut fields,
7205                &mut columns,
7206            );
7207        }
7208        {
7209            #[cfg(feature = "small_decimals")]
7210            let dec20_4 = Arc::new(
7211                Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
7212                    .with_precision_and_scale(20, 4)
7213                    .unwrap(),
7214            ) as ArrayRef;
7215            #[cfg(not(feature = "small_decimals"))]
7216            let dec20_4 = Arc::new(
7217                Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
7218                    .with_precision_and_scale(20, 4)
7219                    .unwrap(),
7220            ) as ArrayRef;
7221            push_like(
7222                schema.as_ref(),
7223                "dec_fix_s20_4",
7224                dec20_4,
7225                &mut fields,
7226                &mut columns,
7227            );
7228        }
7229        {
7230            let it = [Some(uuid1), Some(uuid2), Some(uuid1), Some(uuid2)].into_iter();
7231            let arr =
7232                Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap())
7233                    as ArrayRef;
7234            push_like(schema.as_ref(), "uuid_str", arr, &mut fields, &mut columns);
7235        }
7236        push_like(
7237            schema.as_ref(),
7238            "d_date",
7239            Arc::new(Date32Array::from(vec![date_a, 0, 1, 365])) as ArrayRef,
7240            &mut fields,
7241            &mut columns,
7242        );
7243        push_like(
7244            schema.as_ref(),
7245            "t_millis",
7246            Arc::new(Time32MillisecondArray::from(vec![
7247                time_ms_a,
7248                0,
7249                1,
7250                86_400_000 - 1,
7251            ])) as ArrayRef,
7252            &mut fields,
7253            &mut columns,
7254        );
7255        push_like(
7256            schema.as_ref(),
7257            "t_micros",
7258            Arc::new(Time64MicrosecondArray::from(vec![
7259                time_us_eod,
7260                0,
7261                1,
7262                1_000_000,
7263            ])) as ArrayRef,
7264            &mut fields,
7265            &mut columns,
7266        );
7267        {
7268            let a = TimestampMillisecondArray::from(vec![
7269                ts_ms_2024_01_01,
7270                -1,
7271                ts_ms_2024_01_01 + 123,
7272                0,
7273            ])
7274            .with_timezone("+00:00");
7275            push_like(
7276                schema.as_ref(),
7277                "ts_millis_utc",
7278                Arc::new(a) as ArrayRef,
7279                &mut fields,
7280                &mut columns,
7281            );
7282        }
7283        {
7284            let a = TimestampMicrosecondArray::from(vec![
7285                ts_us_2024_01_01,
7286                1,
7287                ts_us_2024_01_01 + 456,
7288                0,
7289            ])
7290            .with_timezone("+00:00");
7291            push_like(
7292                schema.as_ref(),
7293                "ts_micros_utc",
7294                Arc::new(a) as ArrayRef,
7295                &mut fields,
7296                &mut columns,
7297            );
7298        }
7299        push_like(
7300            schema.as_ref(),
7301            "ts_millis_local",
7302            Arc::new(TimestampMillisecondArray::from(vec![
7303                ts_ms_2024_01_01 + 86_400_000,
7304                0,
7305                ts_ms_2024_01_01 + 789,
7306                123_456_789,
7307            ])) as ArrayRef,
7308            &mut fields,
7309            &mut columns,
7310        );
7311        push_like(
7312            schema.as_ref(),
7313            "ts_micros_local",
7314            Arc::new(TimestampMicrosecondArray::from(vec![
7315                ts_us_2024_01_01 + 123_456,
7316                0,
7317                ts_us_2024_01_01 + 101_112,
7318                987_654_321,
7319            ])) as ArrayRef,
7320            &mut fields,
7321            &mut columns,
7322        );
7323        {
7324            let v = vec![dur_small, dur_zero, dur_large, dur_2years];
7325            push_like(
7326                schema.as_ref(),
7327                "interval_mdn",
7328                Arc::new(IntervalMonthDayNanoArray::from(v)) as ArrayRef,
7329                &mut fields,
7330                &mut columns,
7331            );
7332        }
7333        {
7334            let keys = Int32Array::from(vec![1, 2, 3, 0]); // NEW, PROCESSING, DONE, UNKNOWN
7335            let values = Arc::new(StringArray::from(vec![
7336                "UNKNOWN",
7337                "NEW",
7338                "PROCESSING",
7339                "DONE",
7340            ])) as ArrayRef;
7341            let dict = DictionaryArray::<Int32Type>::try_new(keys, values).unwrap();
7342            push_like(
7343                schema.as_ref(),
7344                "status",
7345                Arc::new(dict) as ArrayRef,
7346                &mut fields,
7347                &mut columns,
7348            );
7349        }
7350        {
7351            let list_field = match schema.field_with_name("arr_union").unwrap().data_type() {
7352                DataType::List(f) => f.clone(),
7353                other => panic!("arr_union should be List, got {other:?}"),
7354            };
7355            let uf = match list_field.data_type() {
7356                DataType::Union(f, UnionMode::Dense) => f.clone(),
7357                other => panic!("arr_union item should be union, got {other:?}"),
7358            };
7359            let tid_l = tid_by_name(&uf, "long");
7360            let tid_s = tid_by_name(&uf, "string");
7361            let tid_n = tid_by_name(&uf, "null");
7362            let type_ids = vec![
7363                tid_l, tid_s, tid_n, tid_l, tid_n, tid_s, tid_l, tid_l, tid_s, tid_n, tid_l,
7364            ];
7365            let offsets = vec![0, 0, 0, 1, 1, 1, 2, 3, 2, 2, 4];
7366            let values = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
7367                DataType::Int64 => {
7368                    Some(Arc::new(Int64Array::from(vec![1i64, -3, 0, -1, 0])) as ArrayRef)
7369                }
7370                DataType::Utf8 => {
7371                    Some(Arc::new(StringArray::from(vec!["x", "z", "end"])) as ArrayRef)
7372                }
7373                DataType::Null => Some(Arc::new(NullArray::new(3)) as ArrayRef),
7374                _ => None,
7375            });
7376            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 4, 7, 8, 11]));
7377            let arr = Arc::new(ListArray::try_new(list_field, list_offsets, values, None).unwrap())
7378                as ArrayRef;
7379            push_like(schema.as_ref(), "arr_union", arr, &mut fields, &mut columns);
7380        }
7381        {
7382            let (entry_field, entries_fields, uf, is_sorted) =
7383                match schema.field_with_name("map_union").unwrap().data_type() {
7384                    DataType::Map(entry_field, is_sorted) => {
7385                        let fs = match entry_field.data_type() {
7386                            DataType::Struct(fs) => fs.clone(),
7387                            other => panic!("map entries must be struct, got {other:?}"),
7388                        };
7389                        let val_f = fs[1].clone();
7390                        let uf = match val_f.data_type() {
7391                            DataType::Union(f, UnionMode::Dense) => f.clone(),
7392                            other => panic!("map value must be union, got {other:?}"),
7393                        };
7394                        (entry_field.clone(), fs, uf, *is_sorted)
7395                    }
7396                    other => panic!("map_union should be Map, got {other:?}"),
7397                };
7398            let keys = StringArray::from(vec!["a", "b", "c", "neg", "pi", "ok"]);
7399            let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4, 4, 6]));
7400            let tid_null = tid_by_name(&uf, "null");
7401            let tid_d = tid_by_name(&uf, "double");
7402            let tid_s = tid_by_name(&uf, "string");
7403            let type_ids = vec![tid_d, tid_null, tid_s, tid_d, tid_d, tid_s];
7404            let offsets = vec![0, 0, 0, 1, 2, 1];
7405            let pi_5dp = (std::f64::consts::PI * 100_000.0).trunc() / 100_000.0;
7406            let vals = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
7407                DataType::Float64 => {
7408                    Some(Arc::new(Float64Array::from(vec![1.5f64, -0.5, pi_5dp])) as ArrayRef)
7409                }
7410                DataType::Utf8 => {
7411                    Some(Arc::new(StringArray::from(vec!["yes", "true"])) as ArrayRef)
7412                }
7413                DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
7414                _ => None,
7415            });
7416            let entries = StructArray::new(
7417                entries_fields.clone(),
7418                vec![Arc::new(keys) as ArrayRef, vals],
7419                None,
7420            );
7421            let map =
7422                Arc::new(MapArray::new(entry_field, moff, entries, None, is_sorted)) as ArrayRef;
7423            push_like(schema.as_ref(), "map_union", map, &mut fields, &mut columns);
7424        }
7425        {
7426            let fs = match schema.field_with_name("address").unwrap().data_type() {
7427                DataType::Struct(fs) => fs.clone(),
7428                other => panic!("address should be Struct, got {other:?}"),
7429            };
7430            let street = Arc::new(StringArray::from(vec![
7431                "100 Main",
7432                "",
7433                "42 Galaxy Way",
7434                "End Ave",
7435            ])) as ArrayRef;
7436            let zip = Arc::new(Int32Array::from(vec![12345, 0, 42424, 1])) as ArrayRef;
7437            let country = Arc::new(StringArray::from(vec!["US", "CA", "US", "GB"])) as ArrayRef;
7438            let arr = Arc::new(StructArray::new(fs, vec![street, zip, country], None)) as ArrayRef;
7439            push_like(schema.as_ref(), "address", arr, &mut fields, &mut columns);
7440        }
7441        {
7442            let fs = match schema.field_with_name("maybe_auth").unwrap().data_type() {
7443                DataType::Struct(fs) => fs.clone(),
7444                other => panic!("maybe_auth should be Struct, got {other:?}"),
7445            };
7446            let user =
7447                Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])) as ArrayRef;
7448            let token_values: Vec<Option<&[u8]>> = vec![
7449                None,                           // row 1: null
7450                Some(b"\x01\x02\x03".as_ref()), // row 2: bytes
7451                None,                           // row 3: null
7452                Some(b"".as_ref()),             // row 4: empty bytes
7453            ];
7454            let token = Arc::new(BinaryArray::from(token_values)) as ArrayRef;
7455            let arr = Arc::new(StructArray::new(fs, vec![user, token], None)) as ArrayRef;
7456            push_like(
7457                schema.as_ref(),
7458                "maybe_auth",
7459                arr,
7460                &mut fields,
7461                &mut columns,
7462            );
7463        }
7464        {
7465            let uf = match schema
7466                .field_with_name("union_enum_record_array_map")
7467                .unwrap()
7468                .data_type()
7469            {
7470                DataType::Union(f, UnionMode::Dense) => f.clone(),
7471                other => panic!("union_enum_record_array_map should be union, got {other:?}"),
7472            };
7473            let mut tid_enum: Option<i8> = None;
7474            let mut tid_rec_a: Option<i8> = None;
7475            let mut tid_array: Option<i8> = None;
7476            let mut tid_map: Option<i8> = None;
7477            let mut map_entry_field: Option<FieldRef> = None;
7478            let mut map_sorted: bool = false;
7479            for (tid, f) in uf.iter() {
7480                match f.data_type() {
7481                    DataType::Dictionary(_, _) => tid_enum = Some(tid),
7482                    DataType::Struct(childs)
7483                        if childs.len() == 2
7484                            && childs[0].name() == "a"
7485                            && childs[1].name() == "b" =>
7486                    {
7487                        tid_rec_a = Some(tid)
7488                    }
7489                    DataType::List(item) if matches!(item.data_type(), DataType::Int64) => {
7490                        tid_array = Some(tid)
7491                    }
7492                    DataType::Map(ef, is_sorted) => {
7493                        tid_map = Some(tid);
7494                        map_entry_field = Some(ef.clone());
7495                        map_sorted = *is_sorted;
7496                    }
7497                    _ => {}
7498                }
7499            }
7500            let (tid_enum, tid_rec_a, tid_array, tid_map) = (
7501                tid_enum.unwrap(),
7502                tid_rec_a.unwrap(),
7503                tid_array.unwrap(),
7504                tid_map.unwrap(),
7505            );
7506            let tids = vec![tid_enum, tid_rec_a, tid_array, tid_map];
7507            let offs = vec![0, 0, 0, 0];
7508            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7509                DataType::Dictionary(_, _) => {
7510                    let keys = Int32Array::from(vec![0i32]);
7511                    let values =
7512                        Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
7513                    Some(
7514                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
7515                            as ArrayRef,
7516                    )
7517                }
7518                DataType::Struct(fs)
7519                    if fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b" =>
7520                {
7521                    let a = Int32Array::from(vec![7]);
7522                    let b = StringArray::from(vec!["rec"]);
7523                    Some(Arc::new(StructArray::new(
7524                        fs.clone(),
7525                        vec![Arc::new(a), Arc::new(b)],
7526                        None,
7527                    )) as ArrayRef)
7528                }
7529                DataType::List(field) => {
7530                    let values = Int64Array::from(vec![1i64, 2, 3]);
7531                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
7532                    Some(Arc::new(
7533                        ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
7534                    ) as ArrayRef)
7535                }
7536                DataType::Map(_, _) => {
7537                    let entry_field = map_entry_field.clone().unwrap();
7538                    let (key_field, val_field) = match entry_field.data_type() {
7539                        DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
7540                        _ => unreachable!(),
7541                    };
7542                    let keys = StringArray::from(vec!["k"]);
7543                    let vals = StringArray::from(vec!["v"]);
7544                    let entries = StructArray::new(
7545                        Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
7546                        vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
7547                        None,
7548                    );
7549                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1]));
7550                    Some(Arc::new(MapArray::new(
7551                        entry_field.clone(),
7552                        offsets,
7553                        entries,
7554                        None,
7555                        map_sorted,
7556                    )) as ArrayRef)
7557                }
7558                _ => None,
7559            });
7560            push_like(
7561                schema.as_ref(),
7562                "union_enum_record_array_map",
7563                arr,
7564                &mut fields,
7565                &mut columns,
7566            );
7567        }
7568        {
7569            let uf = match schema
7570                .field_with_name("union_date_or_fixed4")
7571                .unwrap()
7572                .data_type()
7573            {
7574                DataType::Union(f, UnionMode::Dense) => f.clone(),
7575                other => panic!("union_date_or_fixed4 should be union, got {other:?}"),
7576            };
7577            let tid_date = tid_by_dt(&uf, |dt| matches!(dt, DataType::Date32));
7578            let tid_fx4 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(4)));
7579            let tids = vec![tid_date, tid_fx4, tid_date, tid_fx4];
7580            let offs = vec![0, 0, 1, 1];
7581            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7582                DataType::Date32 => Some(Arc::new(Date32Array::from(vec![date_a, 0])) as ArrayRef),
7583                DataType::FixedSizeBinary(4) => {
7584                    let it = [Some(*b"\x00\x11\x22\x33"), Some(*b"ABCD")].into_iter();
7585                    Some(Arc::new(
7586                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
7587                    ) as ArrayRef)
7588                }
7589                _ => None,
7590            });
7591            push_like(
7592                schema.as_ref(),
7593                "union_date_or_fixed4",
7594                arr,
7595                &mut fields,
7596                &mut columns,
7597            );
7598        }
7599        {
7600            let uf = match schema
7601                .field_with_name("union_interval_or_string")
7602                .unwrap()
7603                .data_type()
7604            {
7605                DataType::Union(f, UnionMode::Dense) => f.clone(),
7606                other => panic!("union_interval_or_string should be union, got {other:?}"),
7607            };
7608            let tid_dur = tid_by_dt(&uf, |dt| {
7609                matches!(dt, DataType::Interval(IntervalUnit::MonthDayNano))
7610            });
7611            let tid_str = tid_by_dt(&uf, |dt| matches!(dt, DataType::Utf8));
7612            let tids = vec![tid_dur, tid_str, tid_dur, tid_str];
7613            let offs = vec![0, 0, 1, 1];
7614            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7615                DataType::Interval(IntervalUnit::MonthDayNano) => Some(Arc::new(
7616                    IntervalMonthDayNanoArray::from(vec![dur_small, dur_large]),
7617                )
7618                    as ArrayRef),
7619                DataType::Utf8 => Some(Arc::new(StringArray::from(vec![
7620                    "duration-as-text",
7621                    "iso-8601-period-P1Y",
7622                ])) as ArrayRef),
7623                _ => None,
7624            });
7625            push_like(
7626                schema.as_ref(),
7627                "union_interval_or_string",
7628                arr,
7629                &mut fields,
7630                &mut columns,
7631            );
7632        }
7633        {
7634            let uf = match schema
7635                .field_with_name("union_uuid_or_fixed10")
7636                .unwrap()
7637                .data_type()
7638            {
7639                DataType::Union(f, UnionMode::Dense) => f.clone(),
7640                other => panic!("union_uuid_or_fixed10 should be union, got {other:?}"),
7641            };
7642            let tid_uuid = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(16)));
7643            let tid_fx10 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(10)));
7644            let tids = vec![tid_uuid, tid_fx10, tid_uuid, tid_fx10];
7645            let offs = vec![0, 0, 1, 1];
7646            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7647                DataType::FixedSizeBinary(16) => {
7648                    let it = [Some(uuid1), Some(uuid2)].into_iter();
7649                    Some(Arc::new(
7650                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
7651                    ) as ArrayRef)
7652                }
7653                DataType::FixedSizeBinary(10) => {
7654                    let fx10_a = [0xAAu8; 10];
7655                    let fx10_b = [0x00u8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99];
7656                    let it = [Some(fx10_a), Some(fx10_b)].into_iter();
7657                    Some(Arc::new(
7658                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
7659                    ) as ArrayRef)
7660                }
7661                _ => None,
7662            });
7663            push_like(
7664                schema.as_ref(),
7665                "union_uuid_or_fixed10",
7666                arr,
7667                &mut fields,
7668                &mut columns,
7669            );
7670        }
7671        {
7672            let list_field = match schema
7673                .field_with_name("array_records_with_union")
7674                .unwrap()
7675                .data_type()
7676            {
7677                DataType::List(f) => f.clone(),
7678                other => panic!("array_records_with_union should be List, got {other:?}"),
7679            };
7680            let kv_fields = match list_field.data_type() {
7681                DataType::Struct(fs) => fs.clone(),
7682                other => panic!("array_records_with_union items must be Struct, got {other:?}"),
7683            };
7684            let val_field = kv_fields
7685                .iter()
7686                .find(|f| f.name() == "val")
7687                .unwrap()
7688                .clone();
7689            let uf = match val_field.data_type() {
7690                DataType::Union(f, UnionMode::Dense) => f.clone(),
7691                other => panic!("KV.val should be union, got {other:?}"),
7692            };
7693            let keys = Arc::new(StringArray::from(vec!["k1", "k2", "k", "k3", "x"])) as ArrayRef;
7694            let tid_null = tid_by_name(&uf, "null");
7695            let tid_i = tid_by_name(&uf, "int");
7696            let tid_l = tid_by_name(&uf, "long");
7697            let type_ids = vec![tid_i, tid_null, tid_l, tid_null, tid_i];
7698            let offsets = vec![0, 0, 0, 1, 1];
7699            let vals = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
7700                DataType::Int32 => Some(Arc::new(Int32Array::from(vec![5, -5])) as ArrayRef),
7701                DataType::Int64 => Some(Arc::new(Int64Array::from(vec![99i64])) as ArrayRef),
7702                DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
7703                _ => None,
7704            });
7705            let values_struct =
7706                Arc::new(StructArray::new(kv_fields.clone(), vec![keys, vals], None)) as ArrayRef;
7707            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5]));
7708            let arr = Arc::new(
7709                ListArray::try_new(list_field, list_offsets, values_struct, None).unwrap(),
7710            ) as ArrayRef;
7711            push_like(
7712                schema.as_ref(),
7713                "array_records_with_union",
7714                arr,
7715                &mut fields,
7716                &mut columns,
7717            );
7718        }
7719        {
7720            let uf = match schema
7721                .field_with_name("union_map_or_array_int")
7722                .unwrap()
7723                .data_type()
7724            {
7725                DataType::Union(f, UnionMode::Dense) => f.clone(),
7726                other => panic!("union_map_or_array_int should be union, got {other:?}"),
7727            };
7728            let tid_map = tid_by_dt(&uf, |dt| matches!(dt, DataType::Map(_, _)));
7729            let tid_list = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
7730            let map_child: ArrayRef = {
7731                let (entry_field, is_sorted) = match uf
7732                    .iter()
7733                    .find(|(tid, _)| *tid == tid_map)
7734                    .unwrap()
7735                    .1
7736                    .data_type()
7737                {
7738                    DataType::Map(ef, is_sorted) => (ef.clone(), *is_sorted),
7739                    _ => unreachable!(),
7740                };
7741                let (key_field, val_field) = match entry_field.data_type() {
7742                    DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
7743                    _ => unreachable!(),
7744                };
7745                let keys = StringArray::from(vec!["x", "y", "only"]);
7746                let vals = Int32Array::from(vec![1, 2, 10]);
7747                let entries = StructArray::new(
7748                    Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
7749                    vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
7750                    None,
7751                );
7752                let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
7753                Arc::new(MapArray::new(entry_field, moff, entries, None, is_sorted)) as ArrayRef
7754            };
7755            let list_child: ArrayRef = {
7756                let list_field = match uf
7757                    .iter()
7758                    .find(|(tid, _)| *tid == tid_list)
7759                    .unwrap()
7760                    .1
7761                    .data_type()
7762                {
7763                    DataType::List(f) => f.clone(),
7764                    _ => unreachable!(),
7765                };
7766                let values = Int32Array::from(vec![1, 2, 3, 0]);
7767                let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4]));
7768                Arc::new(ListArray::try_new(list_field, offsets, Arc::new(values), None).unwrap())
7769                    as ArrayRef
7770            };
7771            let tids = vec![tid_map, tid_list, tid_map, tid_list];
7772            let offs = vec![0, 0, 1, 1];
7773            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7774                DataType::Map(_, _) => Some(map_child.clone()),
7775                DataType::List(_) => Some(list_child.clone()),
7776                _ => None,
7777            });
7778            push_like(
7779                schema.as_ref(),
7780                "union_map_or_array_int",
7781                arr,
7782                &mut fields,
7783                &mut columns,
7784            );
7785        }
7786        push_like(
7787            schema.as_ref(),
7788            "renamed_with_default",
7789            Arc::new(Int32Array::from(vec![100, 42, 7, 42])) as ArrayRef,
7790            &mut fields,
7791            &mut columns,
7792        );
7793        {
7794            let fs = match schema.field_with_name("person").unwrap().data_type() {
7795                DataType::Struct(fs) => fs.clone(),
7796                other => panic!("person should be Struct, got {other:?}"),
7797            };
7798            let name =
7799                Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "Dave"])) as ArrayRef;
7800            let age = Arc::new(Int32Array::from(vec![30, 0, 25, 41])) as ArrayRef;
7801            let arr = Arc::new(StructArray::new(fs, vec![name, age], None)) as ArrayRef;
7802            push_like(schema.as_ref(), "person", arr, &mut fields, &mut columns);
7803        }
7804        let expected =
7805            RecordBatch::try_new(Arc::new(Schema::new(Fields::from(fields))), columns).unwrap();
7806        assert_eq!(
7807            expected, batch,
7808            "entire RecordBatch mismatch (schema, all columns, all rows)"
7809        );
7810    }
7811    #[test]
7812    fn comprehensive_e2e_resolution_test() {
7813        use serde_json::Value;
7814        use std::collections::HashMap;
7815
7816        // Build a reader schema that stresses Avro schema‑resolution
7817        //
7818        // Changes relative to writer schema:
7819        // * Rename fields using writer aliases:    id -> identifier, renamed_with_default -> old_count
7820        // * Promote numeric types:                 count_i32 (int) -> long, ratio_f32 (float) -> double
7821        // * Reorder many union branches (reverse), incl. nested unions
7822        // * Reorder array/map union item/value branches
7823        // * Rename nested Address field:           street -> street_name (uses alias in writer)
7824        // * Change Person type name/namespace:     com.example.Person (matches writer alias)
7825        // * Reverse top‑level field order
7826        //
7827        // Reader‑side aliases are added wherever names change (per Avro spec).
7828        fn make_comprehensive_reader_schema(path: &str) -> AvroSchema {
7829            fn set_type_string(f: &mut Value, new_ty: &str) {
7830                if let Some(ty) = f.get_mut("type") {
7831                    match ty {
7832                        Value::String(_) | Value::Object(_) => {
7833                            *ty = Value::String(new_ty.to_string());
7834                        }
7835                        Value::Array(arr) => {
7836                            for b in arr.iter_mut() {
7837                                match b {
7838                                    Value::String(s) if s != "null" => {
7839                                        *b = Value::String(new_ty.to_string());
7840                                        break;
7841                                    }
7842                                    Value::Object(_) => {
7843                                        *b = Value::String(new_ty.to_string());
7844                                        break;
7845                                    }
7846                                    _ => {}
7847                                }
7848                            }
7849                        }
7850                        _ => {}
7851                    }
7852                }
7853            }
7854            fn reverse_union_array(f: &mut Value) {
7855                if let Some(arr) = f.get_mut("type").and_then(|t| t.as_array_mut()) {
7856                    arr.reverse();
7857                }
7858            }
7859            fn reverse_items_union(f: &mut Value) {
7860                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7861                    if let Some(items) = obj.get_mut("items").and_then(|v| v.as_array_mut()) {
7862                        items.reverse();
7863                    }
7864                }
7865            }
7866            fn reverse_map_values_union(f: &mut Value) {
7867                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7868                    if let Some(values) = obj.get_mut("values").and_then(|v| v.as_array_mut()) {
7869                        values.reverse();
7870                    }
7871                }
7872            }
7873            fn reverse_nested_union_in_record(f: &mut Value, field_name: &str) {
7874                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7875                    if let Some(fields) = obj.get_mut("fields").and_then(|v| v.as_array_mut()) {
7876                        for ff in fields.iter_mut() {
7877                            if ff.get("name").and_then(|n| n.as_str()) == Some(field_name) {
7878                                if let Some(ty) = ff.get_mut("type") {
7879                                    if let Some(arr) = ty.as_array_mut() {
7880                                        arr.reverse();
7881                                    }
7882                                }
7883                            }
7884                        }
7885                    }
7886                }
7887            }
7888            fn rename_nested_field_with_alias(f: &mut Value, old: &str, new: &str) {
7889                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7890                    if let Some(fields) = obj.get_mut("fields").and_then(|v| v.as_array_mut()) {
7891                        for ff in fields.iter_mut() {
7892                            if ff.get("name").and_then(|n| n.as_str()) == Some(old) {
7893                                ff["name"] = Value::String(new.to_string());
7894                                ff["aliases"] = Value::Array(vec![Value::String(old.to_string())]);
7895                            }
7896                        }
7897                    }
7898                }
7899            }
7900            let mut root = load_writer_schema_json(path);
7901            assert_eq!(root["type"], "record", "writer schema must be a record");
7902            let fields = root
7903                .get_mut("fields")
7904                .and_then(|f| f.as_array_mut())
7905                .expect("record has fields");
7906            for f in fields.iter_mut() {
7907                let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
7908                    continue;
7909                };
7910                match name {
7911                    // Field aliasing (reader‑side aliases added)
7912                    "id" => {
7913                        f["name"] = Value::String("identifier".into());
7914                        f["aliases"] = Value::Array(vec![Value::String("id".into())]);
7915                    }
7916                    "renamed_with_default" => {
7917                        f["name"] = Value::String("old_count".into());
7918                        f["aliases"] =
7919                            Value::Array(vec![Value::String("renamed_with_default".into())]);
7920                    }
7921                    // Promotions
7922                    "count_i32" => set_type_string(f, "long"),
7923                    "ratio_f32" => set_type_string(f, "double"),
7924                    // Union reorder (exercise resolution)
7925                    "opt_str_nullsecond" => reverse_union_array(f),
7926                    "union_enum_record_array_map" => reverse_union_array(f),
7927                    "union_date_or_fixed4" => reverse_union_array(f),
7928                    "union_interval_or_string" => reverse_union_array(f),
7929                    "union_uuid_or_fixed10" => reverse_union_array(f),
7930                    "union_map_or_array_int" => reverse_union_array(f),
7931                    "maybe_auth" => reverse_nested_union_in_record(f, "token"),
7932                    // Array/Map unions
7933                    "arr_union" => reverse_items_union(f),
7934                    "map_union" => reverse_map_values_union(f),
7935                    // Nested rename using reader‑side alias
7936                    "address" => rename_nested_field_with_alias(f, "street", "street_name"),
7937                    // Type‑name alias for nested record
7938                    "person" => {
7939                        if let Some(tobj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7940                            tobj.insert("name".to_string(), Value::String("Person".into()));
7941                            tobj.insert(
7942                                "namespace".to_string(),
7943                                Value::String("com.example".into()),
7944                            );
7945                            tobj.insert(
7946                                "aliases".into(),
7947                                Value::Array(vec![
7948                                    Value::String("PersonV2".into()),
7949                                    Value::String("com.example.v2.PersonV2".into()),
7950                                ]),
7951                            );
7952                        }
7953                    }
7954                    _ => {}
7955                }
7956            }
7957            fields.reverse();
7958            AvroSchema::new(root.to_string())
7959        }
7960
7961        let path = "test/data/comprehensive_e2e.avro";
7962        let reader_schema = make_comprehensive_reader_schema(path);
7963        let batch = read_alltypes_with_reader_schema(path, reader_schema.clone());
7964
7965        const UUID_EXT_KEY: &str = "ARROW:extension:name";
7966        const UUID_LOGICAL_KEY: &str = "logicalType";
7967
7968        let uuid_md_top: Option<HashMap<String, String>> = batch
7969            .schema()
7970            .field_with_name("uuid_str")
7971            .ok()
7972            .and_then(|f| {
7973                let md = f.metadata();
7974                let has_ext = md.get(UUID_EXT_KEY).is_some();
7975                let is_uuid_logical = md
7976                    .get(UUID_LOGICAL_KEY)
7977                    .map(|v| v.trim_matches('"') == "uuid")
7978                    .unwrap_or(false);
7979                if has_ext || is_uuid_logical {
7980                    Some(md.clone())
7981                } else {
7982                    None
7983                }
7984            });
7985
7986        let uuid_md_union: Option<HashMap<String, String>> = batch
7987            .schema()
7988            .field_with_name("union_uuid_or_fixed10")
7989            .ok()
7990            .and_then(|f| match f.data_type() {
7991                DataType::Union(uf, _) => uf
7992                    .iter()
7993                    .find(|(_, child)| child.name() == "uuid")
7994                    .and_then(|(_, child)| {
7995                        let md = child.metadata();
7996                        let has_ext = md.get(UUID_EXT_KEY).is_some();
7997                        let is_uuid_logical = md
7998                            .get(UUID_LOGICAL_KEY)
7999                            .map(|v| v.trim_matches('"') == "uuid")
8000                            .unwrap_or(false);
8001                        if has_ext || is_uuid_logical {
8002                            Some(md.clone())
8003                        } else {
8004                            None
8005                        }
8006                    }),
8007                _ => None,
8008            });
8009
8010        let add_uuid_ext_top = |f: Field| -> Field {
8011            if let Some(md) = &uuid_md_top {
8012                f.with_metadata(md.clone())
8013            } else {
8014                f
8015            }
8016        };
8017        let add_uuid_ext_union = |f: Field| -> Field {
8018            if let Some(md) = &uuid_md_union {
8019                f.with_metadata(md.clone())
8020            } else {
8021                f
8022            }
8023        };
8024
8025        #[inline]
8026        fn uuid16_from_str(s: &str) -> [u8; 16] {
8027            let mut out = [0u8; 16];
8028            let mut idx = 0usize;
8029            let mut hi: Option<u8> = None;
8030            for ch in s.chars() {
8031                if ch == '-' {
8032                    continue;
8033                }
8034                let v = ch.to_digit(16).expect("invalid hex digit in UUID") as u8;
8035                if let Some(h) = hi {
8036                    out[idx] = (h << 4) | v;
8037                    idx += 1;
8038                    hi = None;
8039                } else {
8040                    hi = Some(v);
8041                }
8042            }
8043            assert_eq!(idx, 16, "UUID must decode to 16 bytes");
8044            out
8045        }
8046
8047        fn mk_dense_union(
8048            fields: &UnionFields,
8049            type_ids: Vec<i8>,
8050            offsets: Vec<i32>,
8051            provide: impl Fn(&Field) -> Option<ArrayRef>,
8052        ) -> ArrayRef {
8053            fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
8054                match dt {
8055                    DataType::Null => Arc::new(NullArray::new(0)),
8056                    DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
8057                    DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
8058                    DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
8059                    DataType::Float32 => Arc::new(Float32Array::from(Vec::<f32>::new())),
8060                    DataType::Float64 => Arc::new(Float64Array::from(Vec::<f64>::new())),
8061                    DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
8062                    DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
8063                    DataType::Date32 => Arc::new(Date32Array::from(Vec::<i32>::new())),
8064                    DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
8065                        Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
8066                    }
8067                    DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
8068                        Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
8069                    }
8070                    DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
8071                        let a = TimestampMillisecondArray::from(Vec::<i64>::new());
8072                        Arc::new(if let Some(tz) = tz {
8073                            a.with_timezone(tz.clone())
8074                        } else {
8075                            a
8076                        })
8077                    }
8078                    DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
8079                        let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
8080                        Arc::new(if let Some(tz) = tz {
8081                            a.with_timezone(tz.clone())
8082                        } else {
8083                            a
8084                        })
8085                    }
8086                    DataType::Interval(IntervalUnit::MonthDayNano) => Arc::new(
8087                        IntervalMonthDayNanoArray::from(Vec::<IntervalMonthDayNano>::new()),
8088                    ),
8089                    DataType::FixedSizeBinary(sz) => Arc::new(
8090                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(
8091                            std::iter::empty::<Option<Vec<u8>>>(),
8092                            *sz,
8093                        )
8094                        .unwrap(),
8095                    ),
8096                    DataType::Dictionary(_, _) => {
8097                        let keys = Int32Array::from(Vec::<i32>::new());
8098                        let values = Arc::new(StringArray::from(Vec::<&str>::new()));
8099                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
8100                    }
8101                    DataType::Struct(fields) => {
8102                        let children: Vec<ArrayRef> = fields
8103                            .iter()
8104                            .map(|f| empty_child_for(f.data_type()) as ArrayRef)
8105                            .collect();
8106                        Arc::new(StructArray::new(fields.clone(), children, None))
8107                    }
8108                    DataType::List(field) => {
8109                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
8110                        Arc::new(
8111                            ListArray::try_new(
8112                                field.clone(),
8113                                offsets,
8114                                empty_child_for(field.data_type()),
8115                                None,
8116                            )
8117                            .unwrap(),
8118                        )
8119                    }
8120                    DataType::Map(entry_field, is_sorted) => {
8121                        let (key_field, val_field) = match entry_field.data_type() {
8122                            DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
8123                            other => panic!("unexpected map entries type: {other:?}"),
8124                        };
8125                        let keys = StringArray::from(Vec::<&str>::new());
8126                        let vals: ArrayRef = match val_field.data_type() {
8127                            DataType::Null => Arc::new(NullArray::new(0)) as ArrayRef,
8128                            DataType::Boolean => {
8129                                Arc::new(BooleanArray::from(Vec::<bool>::new())) as ArrayRef
8130                            }
8131                            DataType::Int32 => {
8132                                Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
8133                            }
8134                            DataType::Int64 => {
8135                                Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
8136                            }
8137                            DataType::Float32 => {
8138                                Arc::new(Float32Array::from(Vec::<f32>::new())) as ArrayRef
8139                            }
8140                            DataType::Float64 => {
8141                                Arc::new(Float64Array::from(Vec::<f64>::new())) as ArrayRef
8142                            }
8143                            DataType::Utf8 => {
8144                                Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
8145                            }
8146                            DataType::Binary => {
8147                                Arc::new(BinaryArray::from(Vec::<&[u8]>::new())) as ArrayRef
8148                            }
8149                            DataType::Union(uf, _) => {
8150                                let children: Vec<ArrayRef> = uf
8151                                    .iter()
8152                                    .map(|(_, f)| empty_child_for(f.data_type()))
8153                                    .collect();
8154                                Arc::new(
8155                                    UnionArray::try_new(
8156                                        uf.clone(),
8157                                        ScalarBuffer::<i8>::from(Vec::<i8>::new()),
8158                                        Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
8159                                        children,
8160                                    )
8161                                    .unwrap(),
8162                                ) as ArrayRef
8163                            }
8164                            other => panic!("unsupported map value type: {other:?}"),
8165                        };
8166                        let entries = StructArray::new(
8167                            Fields::from(vec![
8168                                key_field.as_ref().clone(),
8169                                val_field.as_ref().clone(),
8170                            ]),
8171                            vec![Arc::new(keys) as ArrayRef, vals],
8172                            None,
8173                        );
8174                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
8175                        Arc::new(MapArray::new(
8176                            entry_field.clone(),
8177                            offsets,
8178                            entries,
8179                            None,
8180                            *is_sorted,
8181                        ))
8182                    }
8183                    other => panic!("empty_child_for: unhandled type {other:?}"),
8184                }
8185            }
8186            let children: Vec<ArrayRef> = fields
8187                .iter()
8188                .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
8189                .collect();
8190            Arc::new(
8191                UnionArray::try_new(
8192                    fields.clone(),
8193                    ScalarBuffer::<i8>::from(type_ids),
8194                    Some(ScalarBuffer::<i32>::from(offsets)),
8195                    children,
8196                )
8197                .unwrap(),
8198            ) as ArrayRef
8199        }
8200        let date_a: i32 = 19_000; // 2022-01-08
8201        let time_ms_a: i32 = 12 * 3_600_000 + 34 * 60_000 + 56_000 + 789;
8202        let time_us_eod: i64 = 86_400_000_000 - 1;
8203        let ts_ms_2024_01_01: i64 = 1_704_067_200_000; // 2024-01-01T00:00:00Z
8204        let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1_000;
8205        let dur_small = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
8206        let dur_zero = IntervalMonthDayNanoType::make_value(0, 0, 0);
8207        let dur_large =
8208            IntervalMonthDayNanoType::make_value(12, 31, ((86_400_000 - 1) as i64) * 1_000_000);
8209        let dur_2years = IntervalMonthDayNanoType::make_value(24, 0, 0);
8210        let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
8211        let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
8212        let item_name = Field::LIST_FIELD_DEFAULT_NAME;
8213        let uf_tri = UnionFields::try_new(
8214            vec![0, 1, 2],
8215            vec![
8216                Field::new("int", DataType::Int32, false),
8217                Field::new("string", DataType::Utf8, false),
8218                Field::new("boolean", DataType::Boolean, false),
8219            ],
8220        )
8221        .unwrap();
8222        let uf_arr_items = UnionFields::try_new(
8223            vec![0, 1, 2],
8224            vec![
8225                Field::new("null", DataType::Null, false),
8226                Field::new("string", DataType::Utf8, false),
8227                Field::new("long", DataType::Int64, false),
8228            ],
8229        )
8230        .unwrap();
8231        let arr_items_field = Arc::new(Field::new(
8232            item_name,
8233            DataType::Union(uf_arr_items.clone(), UnionMode::Dense),
8234            true,
8235        ));
8236        let uf_map_vals = UnionFields::try_new(
8237            vec![0, 1, 2],
8238            vec![
8239                Field::new("string", DataType::Utf8, false),
8240                Field::new("double", DataType::Float64, false),
8241                Field::new("null", DataType::Null, false),
8242            ],
8243        )
8244        .unwrap();
8245        let map_entries_field = Arc::new(Field::new(
8246            "entries",
8247            DataType::Struct(Fields::from(vec![
8248                Field::new("key", DataType::Utf8, false),
8249                Field::new(
8250                    "value",
8251                    DataType::Union(uf_map_vals.clone(), UnionMode::Dense),
8252                    true,
8253                ),
8254            ])),
8255            false,
8256        ));
8257        // Enum metadata for Color (now includes name/namespace)
8258        let mut enum_md_color = {
8259            let mut m = HashMap::<String, String>::new();
8260            m.insert(
8261                crate::schema::AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
8262                serde_json::to_string(&vec!["RED", "GREEN", "BLUE"]).unwrap(),
8263            );
8264            m
8265        };
8266        enum_md_color.insert(AVRO_NAME_METADATA_KEY.to_string(), "Color".to_string());
8267        enum_md_color.insert(
8268            AVRO_NAMESPACE_METADATA_KEY.to_string(),
8269            "org.apache.arrow.avrotests.v1.types".to_string(),
8270        );
8271        let union_rec_a_fields = Fields::from(vec![
8272            Field::new("a", DataType::Int32, false),
8273            Field::new("b", DataType::Utf8, false),
8274        ]);
8275        let union_rec_b_fields = Fields::from(vec![
8276            Field::new("x", DataType::Int64, false),
8277            Field::new("y", DataType::Binary, false),
8278        ]);
8279        let union_map_entries = Arc::new(Field::new(
8280            "entries",
8281            DataType::Struct(Fields::from(vec![
8282                Field::new("key", DataType::Utf8, false),
8283                Field::new("value", DataType::Utf8, false),
8284            ])),
8285            false,
8286        ));
8287        let rec_a_md = {
8288            let mut m = HashMap::<String, String>::new();
8289            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "RecA".to_string());
8290            m.insert(
8291                AVRO_NAMESPACE_METADATA_KEY.to_string(),
8292                "org.apache.arrow.avrotests.v1.types".to_string(),
8293            );
8294            m
8295        };
8296        let rec_b_md = {
8297            let mut m = HashMap::<String, String>::new();
8298            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "RecB".to_string());
8299            m.insert(
8300                AVRO_NAMESPACE_METADATA_KEY.to_string(),
8301                "org.apache.arrow.avrotests.v1.types".to_string(),
8302            );
8303            m
8304        };
8305        let uf_union_big = UnionFields::try_new(
8306            vec![0, 1, 2, 3, 4],
8307            vec![
8308                Field::new(
8309                    "map",
8310                    DataType::Map(union_map_entries.clone(), false),
8311                    false,
8312                ),
8313                Field::new(
8314                    "array",
8315                    DataType::List(Arc::new(Field::new(item_name, DataType::Int64, false))),
8316                    false,
8317                ),
8318                Field::new(
8319                    "org.apache.arrow.avrotests.v1.types.RecB",
8320                    DataType::Struct(union_rec_b_fields.clone()),
8321                    false,
8322                )
8323                .with_metadata(rec_b_md.clone()),
8324                Field::new(
8325                    "org.apache.arrow.avrotests.v1.types.RecA",
8326                    DataType::Struct(union_rec_a_fields.clone()),
8327                    false,
8328                )
8329                .with_metadata(rec_a_md.clone()),
8330                Field::new(
8331                    "org.apache.arrow.avrotests.v1.types.Color",
8332                    DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
8333                    false,
8334                )
8335                .with_metadata(enum_md_color.clone()),
8336            ],
8337        )
8338        .unwrap();
8339        let fx4_md = {
8340            let mut m = HashMap::<String, String>::new();
8341            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx4".to_string());
8342            m.insert(
8343                AVRO_NAMESPACE_METADATA_KEY.to_string(),
8344                "org.apache.arrow.avrotests.v1".to_string(),
8345            );
8346            m
8347        };
8348        let uf_date_fixed4 = UnionFields::try_new(
8349            vec![0, 1],
8350            vec![
8351                Field::new(
8352                    "org.apache.arrow.avrotests.v1.Fx4",
8353                    DataType::FixedSizeBinary(4),
8354                    false,
8355                )
8356                .with_metadata(fx4_md.clone()),
8357                Field::new("date", DataType::Date32, false),
8358            ],
8359        )
8360        .unwrap();
8361        let dur12u_md = {
8362            let mut m = HashMap::<String, String>::new();
8363            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Dur12U".to_string());
8364            m.insert(
8365                AVRO_NAMESPACE_METADATA_KEY.to_string(),
8366                "org.apache.arrow.avrotests.v1".to_string(),
8367            );
8368            m
8369        };
8370        let uf_dur_or_str = UnionFields::try_new(
8371            vec![0, 1],
8372            vec![
8373                Field::new("string", DataType::Utf8, false),
8374                Field::new(
8375                    "org.apache.arrow.avrotests.v1.Dur12U",
8376                    DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano),
8377                    false,
8378                )
8379                .with_metadata(dur12u_md.clone()),
8380            ],
8381        )
8382        .unwrap();
8383        let fx10_md = {
8384            let mut m = HashMap::<String, String>::new();
8385            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx10".to_string());
8386            m.insert(
8387                AVRO_NAMESPACE_METADATA_KEY.to_string(),
8388                "org.apache.arrow.avrotests.v1".to_string(),
8389            );
8390            m
8391        };
8392        let uf_uuid_or_fx10 = UnionFields::try_new(
8393            vec![0, 1],
8394            vec![
8395                Field::new(
8396                    "org.apache.arrow.avrotests.v1.Fx10",
8397                    DataType::FixedSizeBinary(10),
8398                    false,
8399                )
8400                .with_metadata(fx10_md.clone()),
8401                add_uuid_ext_union(Field::new("uuid", DataType::FixedSizeBinary(16), false)),
8402            ],
8403        )
8404        .unwrap();
8405        let uf_kv_val = UnionFields::try_new(
8406            vec![0, 1, 2],
8407            vec![
8408                Field::new("null", DataType::Null, false),
8409                Field::new("int", DataType::Int32, false),
8410                Field::new("long", DataType::Int64, false),
8411            ],
8412        )
8413        .unwrap();
8414        let kv_fields = Fields::from(vec![
8415            Field::new("key", DataType::Utf8, false),
8416            Field::new(
8417                "val",
8418                DataType::Union(uf_kv_val.clone(), UnionMode::Dense),
8419                true,
8420            ),
8421        ]);
8422        let kv_item_field = Arc::new(Field::new(
8423            item_name,
8424            DataType::Struct(kv_fields.clone()),
8425            false,
8426        ));
8427        let map_int_entries = Arc::new(Field::new(
8428            "entries",
8429            DataType::Struct(Fields::from(vec![
8430                Field::new("key", DataType::Utf8, false),
8431                Field::new("value", DataType::Int32, false),
8432            ])),
8433            false,
8434        ));
8435        let uf_map_or_array = UnionFields::try_new(
8436            vec![0, 1],
8437            vec![
8438                Field::new(
8439                    "array",
8440                    DataType::List(Arc::new(Field::new(item_name, DataType::Int32, false))),
8441                    false,
8442                ),
8443                Field::new("map", DataType::Map(map_int_entries.clone(), false), false),
8444            ],
8445        )
8446        .unwrap();
8447        let mut enum_md_status = {
8448            let mut m = HashMap::<String, String>::new();
8449            m.insert(
8450                crate::schema::AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
8451                serde_json::to_string(&vec!["UNKNOWN", "NEW", "PROCESSING", "DONE"]).unwrap(),
8452            );
8453            m
8454        };
8455        enum_md_status.insert(AVRO_NAME_METADATA_KEY.to_string(), "Status".to_string());
8456        enum_md_status.insert(
8457            AVRO_NAMESPACE_METADATA_KEY.to_string(),
8458            "org.apache.arrow.avrotests.v1.types".to_string(),
8459        );
8460        let mut dec20_md = HashMap::<String, String>::new();
8461        dec20_md.insert("precision".to_string(), "20".to_string());
8462        dec20_md.insert("scale".to_string(), "4".to_string());
8463        dec20_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "DecFix20".to_string());
8464        dec20_md.insert(
8465            AVRO_NAMESPACE_METADATA_KEY.to_string(),
8466            "org.apache.arrow.avrotests.v1.types".to_string(),
8467        );
8468        let mut dec10_md = HashMap::<String, String>::new();
8469        dec10_md.insert("precision".to_string(), "10".to_string());
8470        dec10_md.insert("scale".to_string(), "2".to_string());
8471        let fx16_top_md = {
8472            let mut m = HashMap::<String, String>::new();
8473            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx16".to_string());
8474            m.insert(
8475                AVRO_NAMESPACE_METADATA_KEY.to_string(),
8476                "org.apache.arrow.avrotests.v1.types".to_string(),
8477            );
8478            m
8479        };
8480        let dur12_top_md = {
8481            let mut m = HashMap::<String, String>::new();
8482            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Dur12".to_string());
8483            m.insert(
8484                AVRO_NAMESPACE_METADATA_KEY.to_string(),
8485                "org.apache.arrow.avrotests.v1.types".to_string(),
8486            );
8487            m
8488        };
8489        #[cfg(feature = "small_decimals")]
8490        let dec20_dt = DataType::Decimal128(20, 4);
8491        #[cfg(not(feature = "small_decimals"))]
8492        let dec20_dt = DataType::Decimal128(20, 4);
8493        #[cfg(feature = "small_decimals")]
8494        let dec10_dt = DataType::Decimal64(10, 2);
8495        #[cfg(not(feature = "small_decimals"))]
8496        let dec10_dt = DataType::Decimal128(10, 2);
8497        let fields: Vec<FieldRef> = vec![
8498            Arc::new(Field::new(
8499                "person",
8500                DataType::Struct(Fields::from(vec![
8501                    Field::new("name", DataType::Utf8, false),
8502                    Field::new("age", DataType::Int32, false),
8503                ])),
8504                false,
8505            )),
8506            Arc::new(Field::new("old_count", DataType::Int32, false)),
8507            Arc::new(Field::new(
8508                "union_map_or_array_int",
8509                DataType::Union(uf_map_or_array.clone(), UnionMode::Dense),
8510                false,
8511            )),
8512            Arc::new(Field::new(
8513                "array_records_with_union",
8514                DataType::List(kv_item_field.clone()),
8515                false,
8516            )),
8517            Arc::new(Field::new(
8518                "union_uuid_or_fixed10",
8519                DataType::Union(uf_uuid_or_fx10.clone(), UnionMode::Dense),
8520                false,
8521            )),
8522            Arc::new(Field::new(
8523                "union_interval_or_string",
8524                DataType::Union(uf_dur_or_str.clone(), UnionMode::Dense),
8525                false,
8526            )),
8527            Arc::new(Field::new(
8528                "union_date_or_fixed4",
8529                DataType::Union(uf_date_fixed4.clone(), UnionMode::Dense),
8530                false,
8531            )),
8532            Arc::new(Field::new(
8533                "union_enum_record_array_map",
8534                DataType::Union(uf_union_big.clone(), UnionMode::Dense),
8535                false,
8536            )),
8537            Arc::new(Field::new(
8538                "maybe_auth",
8539                DataType::Struct(Fields::from(vec![
8540                    Field::new("user", DataType::Utf8, false),
8541                    Field::new("token", DataType::Binary, true), // [bytes,null] -> nullable bytes
8542                ])),
8543                false,
8544            )),
8545            Arc::new(Field::new(
8546                "address",
8547                DataType::Struct(Fields::from(vec![
8548                    Field::new("street_name", DataType::Utf8, false),
8549                    Field::new("zip", DataType::Int32, false),
8550                    Field::new("country", DataType::Utf8, false),
8551                ])),
8552                false,
8553            )),
8554            Arc::new(Field::new(
8555                "map_union",
8556                DataType::Map(map_entries_field.clone(), false),
8557                false,
8558            )),
8559            Arc::new(Field::new(
8560                "arr_union",
8561                DataType::List(arr_items_field.clone()),
8562                false,
8563            )),
8564            Arc::new(
8565                Field::new(
8566                    "status",
8567                    DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
8568                    false,
8569                )
8570                .with_metadata(enum_md_status.clone()),
8571            ),
8572            Arc::new(
8573                Field::new(
8574                    "interval_mdn",
8575                    DataType::Interval(IntervalUnit::MonthDayNano),
8576                    false,
8577                )
8578                .with_metadata(dur12_top_md.clone()),
8579            ),
8580            Arc::new(Field::new(
8581                "ts_micros_local",
8582                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None),
8583                false,
8584            )),
8585            Arc::new(Field::new(
8586                "ts_millis_local",
8587                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None),
8588                false,
8589            )),
8590            Arc::new(Field::new(
8591                "ts_micros_utc",
8592                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some("+00:00".into())),
8593                false,
8594            )),
8595            Arc::new(Field::new(
8596                "ts_millis_utc",
8597                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, Some("+00:00".into())),
8598                false,
8599            )),
8600            Arc::new(Field::new(
8601                "t_micros",
8602                DataType::Time64(arrow_schema::TimeUnit::Microsecond),
8603                false,
8604            )),
8605            Arc::new(Field::new(
8606                "t_millis",
8607                DataType::Time32(arrow_schema::TimeUnit::Millisecond),
8608                false,
8609            )),
8610            Arc::new(Field::new("d_date", DataType::Date32, false)),
8611            Arc::new(add_uuid_ext_top(Field::new(
8612                "uuid_str",
8613                DataType::FixedSizeBinary(16),
8614                false,
8615            ))),
8616            Arc::new(Field::new("dec_fix_s20_4", dec20_dt, false).with_metadata(dec20_md.clone())),
8617            Arc::new(
8618                Field::new("dec_bytes_s10_2", dec10_dt, false).with_metadata(dec10_md.clone()),
8619            ),
8620            Arc::new(
8621                Field::new("fx16_plain", DataType::FixedSizeBinary(16), false)
8622                    .with_metadata(fx16_top_md.clone()),
8623            ),
8624            Arc::new(Field::new("raw_bytes", DataType::Binary, false)),
8625            Arc::new(Field::new("str_utf8", DataType::Utf8, false)),
8626            Arc::new(Field::new(
8627                "tri_union_prim",
8628                DataType::Union(uf_tri.clone(), UnionMode::Dense),
8629                false,
8630            )),
8631            Arc::new(Field::new("opt_str_nullsecond", DataType::Utf8, true)),
8632            Arc::new(Field::new("opt_i32_nullfirst", DataType::Int32, true)),
8633            Arc::new(Field::new("count_i64", DataType::Int64, false)),
8634            Arc::new(Field::new("count_i32", DataType::Int64, false)),
8635            Arc::new(Field::new("ratio_f64", DataType::Float64, false)),
8636            Arc::new(Field::new("ratio_f32", DataType::Float64, false)),
8637            Arc::new(Field::new("flag", DataType::Boolean, false)),
8638            Arc::new(Field::new("identifier", DataType::Int64, false)),
8639        ];
8640        let expected_schema = Arc::new(arrow_schema::Schema::new(Fields::from(fields)));
8641        let mut cols: Vec<ArrayRef> = vec![
8642            Arc::new(StructArray::new(
8643                match expected_schema
8644                    .field_with_name("person")
8645                    .unwrap()
8646                    .data_type()
8647                {
8648                    DataType::Struct(fs) => fs.clone(),
8649                    _ => unreachable!(),
8650                },
8651                vec![
8652                    Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "Dave"])) as ArrayRef,
8653                    Arc::new(Int32Array::from(vec![30, 0, 25, 41])) as ArrayRef,
8654                ],
8655                None,
8656            )) as ArrayRef,
8657            Arc::new(Int32Array::from(vec![100, 42, 7, 42])) as ArrayRef,
8658        ];
8659        {
8660            let map_child: ArrayRef = {
8661                let keys = StringArray::from(vec!["x", "y", "only"]);
8662                let vals = Int32Array::from(vec![1, 2, 10]);
8663                let entries = StructArray::new(
8664                    Fields::from(vec![
8665                        Field::new("key", DataType::Utf8, false),
8666                        Field::new("value", DataType::Int32, false),
8667                    ]),
8668                    vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
8669                    None,
8670                );
8671                let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
8672                Arc::new(MapArray::new(
8673                    map_int_entries.clone(),
8674                    moff,
8675                    entries,
8676                    None,
8677                    false,
8678                )) as ArrayRef
8679            };
8680            let list_child: ArrayRef = {
8681                let values = Int32Array::from(vec![1, 2, 3, 0]);
8682                let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4]));
8683                Arc::new(
8684                    ListArray::try_new(
8685                        Arc::new(Field::new(item_name, DataType::Int32, false)),
8686                        offsets,
8687                        Arc::new(values),
8688                        None,
8689                    )
8690                    .unwrap(),
8691                ) as ArrayRef
8692            };
8693            let tids = vec![1, 0, 1, 0];
8694            let offs = vec![0, 0, 1, 1];
8695            let arr = mk_dense_union(&uf_map_or_array, tids, offs, |f| match f.name().as_str() {
8696                "array" => Some(list_child.clone()),
8697                "map" => Some(map_child.clone()),
8698                _ => None,
8699            });
8700            cols.push(arr);
8701        }
8702        {
8703            let keys = Arc::new(StringArray::from(vec!["k1", "k2", "k", "k3", "x"])) as ArrayRef;
8704            let type_ids = vec![1, 0, 2, 0, 1];
8705            let offsets = vec![0, 0, 0, 1, 1];
8706            let vals = mk_dense_union(&uf_kv_val, type_ids, offsets, |f| match f.data_type() {
8707                DataType::Int32 => Some(Arc::new(Int32Array::from(vec![5, -5])) as ArrayRef),
8708                DataType::Int64 => Some(Arc::new(Int64Array::from(vec![99i64])) as ArrayRef),
8709                DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
8710                _ => None,
8711            });
8712            let values_struct =
8713                Arc::new(StructArray::new(kv_fields.clone(), vec![keys, vals], None));
8714            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5]));
8715            let arr = Arc::new(
8716                ListArray::try_new(kv_item_field.clone(), list_offsets, values_struct, None)
8717                    .unwrap(),
8718            ) as ArrayRef;
8719            cols.push(arr);
8720        }
8721        {
8722            let type_ids = vec![1, 0, 1, 0]; // [uuid, fixed10, uuid, fixed10] but uf order = [fixed10, uuid]
8723            let offs = vec![0, 0, 1, 1];
8724            let arr = mk_dense_union(&uf_uuid_or_fx10, type_ids, offs, |f| match f.data_type() {
8725                DataType::FixedSizeBinary(16) => {
8726                    let it = [Some(uuid1), Some(uuid2)].into_iter();
8727                    Some(Arc::new(
8728                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
8729                    ) as ArrayRef)
8730                }
8731                DataType::FixedSizeBinary(10) => {
8732                    let fx10_a = [0xAAu8; 10];
8733                    let fx10_b = [0x00u8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99];
8734                    let it = [Some(fx10_a), Some(fx10_b)].into_iter();
8735                    Some(Arc::new(
8736                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
8737                    ) as ArrayRef)
8738                }
8739                _ => None,
8740            });
8741            cols.push(arr);
8742        }
8743        {
8744            let type_ids = vec![1, 0, 1, 0]; // [duration, string, duration, string] but uf order = [string, duration]
8745            let offs = vec![0, 0, 1, 1];
8746            let arr = mk_dense_union(&uf_dur_or_str, type_ids, offs, |f| match f.data_type() {
8747                DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano) => Some(Arc::new(
8748                    IntervalMonthDayNanoArray::from(vec![dur_small, dur_large]),
8749                )
8750                    as ArrayRef),
8751                DataType::Utf8 => Some(Arc::new(StringArray::from(vec![
8752                    "duration-as-text",
8753                    "iso-8601-period-P1Y",
8754                ])) as ArrayRef),
8755                _ => None,
8756            });
8757            cols.push(arr);
8758        }
8759        {
8760            let type_ids = vec![1, 0, 1, 0]; // [date, fixed, date, fixed] but uf order = [fixed, date]
8761            let offs = vec![0, 0, 1, 1];
8762            let arr = mk_dense_union(&uf_date_fixed4, type_ids, offs, |f| match f.data_type() {
8763                DataType::Date32 => Some(Arc::new(Date32Array::from(vec![date_a, 0])) as ArrayRef),
8764                DataType::FixedSizeBinary(4) => {
8765                    let it = [Some(*b"\x00\x11\x22\x33"), Some(*b"ABCD")].into_iter();
8766                    Some(Arc::new(
8767                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
8768                    ) as ArrayRef)
8769                }
8770                _ => None,
8771            });
8772            cols.push(arr);
8773        }
8774        {
8775            let tids = vec![4, 3, 1, 0]; // uf order = [map(0), array(1), RecB(2), RecA(3), enum(4)]
8776            let offs = vec![0, 0, 0, 0];
8777            let arr = mk_dense_union(&uf_union_big, tids, offs, |f| match f.data_type() {
8778                DataType::Dictionary(_, _) => {
8779                    let keys = Int32Array::from(vec![0i32]);
8780                    let values =
8781                        Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
8782                    Some(
8783                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
8784                            as ArrayRef,
8785                    )
8786                }
8787                DataType::Struct(fs) if fs == &union_rec_a_fields => {
8788                    let a = Int32Array::from(vec![7]);
8789                    let b = StringArray::from(vec!["rec"]);
8790                    Some(Arc::new(StructArray::new(
8791                        fs.clone(),
8792                        vec![Arc::new(a) as ArrayRef, Arc::new(b) as ArrayRef],
8793                        None,
8794                    )) as ArrayRef)
8795                }
8796                DataType::List(_) => {
8797                    let values = Int64Array::from(vec![1i64, 2, 3]);
8798                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
8799                    Some(Arc::new(
8800                        ListArray::try_new(
8801                            Arc::new(Field::new(item_name, DataType::Int64, false)),
8802                            offsets,
8803                            Arc::new(values),
8804                            None,
8805                        )
8806                        .unwrap(),
8807                    ) as ArrayRef)
8808                }
8809                DataType::Map(_, _) => {
8810                    let keys = StringArray::from(vec!["k"]);
8811                    let vals = StringArray::from(vec!["v"]);
8812                    let entries = StructArray::new(
8813                        Fields::from(vec![
8814                            Field::new("key", DataType::Utf8, false),
8815                            Field::new("value", DataType::Utf8, false),
8816                        ]),
8817                        vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
8818                        None,
8819                    );
8820                    let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1]));
8821                    Some(Arc::new(MapArray::new(
8822                        union_map_entries.clone(),
8823                        moff,
8824                        entries,
8825                        None,
8826                        false,
8827                    )) as ArrayRef)
8828                }
8829                _ => None,
8830            });
8831            cols.push(arr);
8832        }
8833        {
8834            let fs = match expected_schema
8835                .field_with_name("maybe_auth")
8836                .unwrap()
8837                .data_type()
8838            {
8839                DataType::Struct(fs) => fs.clone(),
8840                _ => unreachable!(),
8841            };
8842            let user =
8843                Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])) as ArrayRef;
8844            let token_values: Vec<Option<&[u8]>> = vec![
8845                None,
8846                Some(b"\x01\x02\x03".as_ref()),
8847                None,
8848                Some(b"".as_ref()),
8849            ];
8850            let token = Arc::new(BinaryArray::from(token_values)) as ArrayRef;
8851            cols.push(Arc::new(StructArray::new(fs, vec![user, token], None)) as ArrayRef);
8852        }
8853        {
8854            let fs = match expected_schema
8855                .field_with_name("address")
8856                .unwrap()
8857                .data_type()
8858            {
8859                DataType::Struct(fs) => fs.clone(),
8860                _ => unreachable!(),
8861            };
8862            let street = Arc::new(StringArray::from(vec![
8863                "100 Main",
8864                "",
8865                "42 Galaxy Way",
8866                "End Ave",
8867            ])) as ArrayRef;
8868            let zip = Arc::new(Int32Array::from(vec![12345, 0, 42424, 1])) as ArrayRef;
8869            let country = Arc::new(StringArray::from(vec!["US", "CA", "US", "GB"])) as ArrayRef;
8870            cols.push(Arc::new(StructArray::new(fs, vec![street, zip, country], None)) as ArrayRef);
8871        }
8872        {
8873            let keys = StringArray::from(vec!["a", "b", "c", "neg", "pi", "ok"]);
8874            let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4, 4, 6]));
8875            let tid_s = 0; // string
8876            let tid_d = 1; // double
8877            let tid_n = 2; // null
8878            let type_ids = vec![tid_d, tid_n, tid_s, tid_d, tid_d, tid_s];
8879            let offsets = vec![0, 0, 0, 1, 2, 1];
8880            let pi_5dp = (std::f64::consts::PI * 100_000.0).trunc() / 100_000.0;
8881            let vals = mk_dense_union(&uf_map_vals, type_ids, offsets, |f| match f.data_type() {
8882                DataType::Float64 => {
8883                    Some(Arc::new(Float64Array::from(vec![1.5f64, -0.5, pi_5dp])) as ArrayRef)
8884                }
8885                DataType::Utf8 => {
8886                    Some(Arc::new(StringArray::from(vec!["yes", "true"])) as ArrayRef)
8887                }
8888                DataType::Null => Some(Arc::new(NullArray::new(1)) as ArrayRef),
8889                _ => None,
8890            });
8891            let entries = StructArray::new(
8892                Fields::from(vec![
8893                    Field::new("key", DataType::Utf8, false),
8894                    Field::new(
8895                        "value",
8896                        DataType::Union(uf_map_vals.clone(), UnionMode::Dense),
8897                        true,
8898                    ),
8899                ]),
8900                vec![Arc::new(keys) as ArrayRef, vals],
8901                None,
8902            );
8903            let map = Arc::new(MapArray::new(
8904                map_entries_field.clone(),
8905                moff,
8906                entries,
8907                None,
8908                false,
8909            )) as ArrayRef;
8910            cols.push(map);
8911        }
8912        {
8913            let type_ids = vec![
8914                2, 1, 0, 2, 0, 1, 2, 2, 1, 0,
8915                2, // long,string,null,long,null,string,long,long,string,null,long
8916            ];
8917            let offsets = vec![0, 0, 0, 1, 1, 1, 2, 3, 2, 2, 4];
8918            let values =
8919                mk_dense_union(&uf_arr_items, type_ids, offsets, |f| match f.data_type() {
8920                    DataType::Int64 => {
8921                        Some(Arc::new(Int64Array::from(vec![1i64, -3, 0, -1, 0])) as ArrayRef)
8922                    }
8923                    DataType::Utf8 => {
8924                        Some(Arc::new(StringArray::from(vec!["x", "z", "end"])) as ArrayRef)
8925                    }
8926                    DataType::Null => Some(Arc::new(NullArray::new(3)) as ArrayRef),
8927                    _ => None,
8928                });
8929            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 4, 7, 8, 11]));
8930            let arr = Arc::new(
8931                ListArray::try_new(arr_items_field.clone(), list_offsets, values, None).unwrap(),
8932            ) as ArrayRef;
8933            cols.push(arr);
8934        }
8935        {
8936            let keys = Int32Array::from(vec![1, 2, 3, 0]); // NEW, PROCESSING, DONE, UNKNOWN
8937            let values = Arc::new(StringArray::from(vec![
8938                "UNKNOWN",
8939                "NEW",
8940                "PROCESSING",
8941                "DONE",
8942            ])) as ArrayRef;
8943            let dict = DictionaryArray::<Int32Type>::try_new(keys, values).unwrap();
8944            cols.push(Arc::new(dict) as ArrayRef);
8945        }
8946        cols.push(Arc::new(IntervalMonthDayNanoArray::from(vec![
8947            dur_small, dur_zero, dur_large, dur_2years,
8948        ])) as ArrayRef);
8949        cols.push(Arc::new(TimestampMicrosecondArray::from(vec![
8950            ts_us_2024_01_01 + 123_456,
8951            0,
8952            ts_us_2024_01_01 + 101_112,
8953            987_654_321,
8954        ])) as ArrayRef);
8955        cols.push(Arc::new(TimestampMillisecondArray::from(vec![
8956            ts_ms_2024_01_01 + 86_400_000,
8957            0,
8958            ts_ms_2024_01_01 + 789,
8959            123_456_789,
8960        ])) as ArrayRef);
8961        {
8962            let a = TimestampMicrosecondArray::from(vec![
8963                ts_us_2024_01_01,
8964                1,
8965                ts_us_2024_01_01 + 456,
8966                0,
8967            ])
8968            .with_timezone("+00:00");
8969            cols.push(Arc::new(a) as ArrayRef);
8970        }
8971        {
8972            let a = TimestampMillisecondArray::from(vec![
8973                ts_ms_2024_01_01,
8974                -1,
8975                ts_ms_2024_01_01 + 123,
8976                0,
8977            ])
8978            .with_timezone("+00:00");
8979            cols.push(Arc::new(a) as ArrayRef);
8980        }
8981        cols.push(Arc::new(Time64MicrosecondArray::from(vec![
8982            time_us_eod,
8983            0,
8984            1,
8985            1_000_000,
8986        ])) as ArrayRef);
8987        cols.push(Arc::new(Time32MillisecondArray::from(vec![
8988            time_ms_a,
8989            0,
8990            1,
8991            86_400_000 - 1,
8992        ])) as ArrayRef);
8993        cols.push(Arc::new(Date32Array::from(vec![date_a, 0, 1, 365])) as ArrayRef);
8994        {
8995            let it = [Some(uuid1), Some(uuid2), Some(uuid1), Some(uuid2)].into_iter();
8996            cols.push(Arc::new(
8997                FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
8998            ) as ArrayRef);
8999        }
9000        {
9001            #[cfg(feature = "small_decimals")]
9002            let arr = Arc::new(
9003                Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
9004                    .with_precision_and_scale(20, 4)
9005                    .unwrap(),
9006            ) as ArrayRef;
9007            #[cfg(not(feature = "small_decimals"))]
9008            let arr = Arc::new(
9009                Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
9010                    .with_precision_and_scale(20, 4)
9011                    .unwrap(),
9012            ) as ArrayRef;
9013            cols.push(arr);
9014        }
9015        {
9016            #[cfg(feature = "small_decimals")]
9017            let arr = Arc::new(
9018                Decimal64Array::from_iter_values([123456i64, -1, 0, 9_999_999_999i64])
9019                    .with_precision_and_scale(10, 2)
9020                    .unwrap(),
9021            ) as ArrayRef;
9022            #[cfg(not(feature = "small_decimals"))]
9023            let arr = Arc::new(
9024                Decimal128Array::from_iter_values([123456i128, -1, 0, 9_999_999_999i128])
9025                    .with_precision_and_scale(10, 2)
9026                    .unwrap(),
9027            ) as ArrayRef;
9028            cols.push(arr);
9029        }
9030        {
9031            let it = [
9032                Some(*b"0123456789ABCDEF"),
9033                Some([0u8; 16]),
9034                Some(*b"ABCDEFGHIJKLMNOP"),
9035                Some([0xAA; 16]),
9036            ]
9037            .into_iter();
9038            cols.push(Arc::new(
9039                FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
9040            ) as ArrayRef);
9041        }
9042        cols.push(Arc::new(BinaryArray::from(vec![
9043            b"\x00\x01".as_ref(),
9044            b"".as_ref(),
9045            b"\xFF\x00".as_ref(),
9046            b"\x10\x20\x30\x40".as_ref(),
9047        ])) as ArrayRef);
9048        cols.push(Arc::new(StringArray::from(vec!["hello", "", "world", "✓ unicode"])) as ArrayRef);
9049        {
9050            let tids = vec![0, 1, 2, 1];
9051            let offs = vec![0, 0, 0, 1];
9052            let arr = mk_dense_union(&uf_tri, tids, offs, |f| match f.data_type() {
9053                DataType::Int32 => Some(Arc::new(Int32Array::from(vec![0])) as ArrayRef),
9054                DataType::Utf8 => Some(Arc::new(StringArray::from(vec!["hi", ""])) as ArrayRef),
9055                DataType::Boolean => Some(Arc::new(BooleanArray::from(vec![true])) as ArrayRef),
9056                _ => None,
9057            });
9058            cols.push(arr);
9059        }
9060        cols.push(Arc::new(StringArray::from(vec![
9061            Some("alpha"),
9062            None,
9063            Some("s3"),
9064            Some(""),
9065        ])) as ArrayRef);
9066        cols.push(Arc::new(Int32Array::from(vec![None, Some(42), None, Some(0)])) as ArrayRef);
9067        cols.push(Arc::new(Int64Array::from(vec![
9068            7_000_000_000i64,
9069            -2,
9070            0,
9071            -9_876_543_210i64,
9072        ])) as ArrayRef);
9073        cols.push(Arc::new(Int64Array::from(vec![7i64, -1, 0, 123])) as ArrayRef);
9074        cols.push(Arc::new(Float64Array::from(vec![2.5f64, -1.0, 7.0, -2.25])) as ArrayRef);
9075        cols.push(Arc::new(Float64Array::from(vec![1.25f64, -0.0, 3.5, 9.75])) as ArrayRef);
9076        cols.push(Arc::new(BooleanArray::from(vec![true, false, true, false])) as ArrayRef);
9077        cols.push(Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef);
9078        let expected = RecordBatch::try_new(expected_schema, cols).unwrap();
9079        assert_eq!(
9080            expected, batch,
9081            "entire RecordBatch mismatch (schema, all columns, all rows)"
9082        );
9083    }
9084}