arrow_avro/reader/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Avro reader
19//!
20//! Facilities to read Apache Avro–encoded data into Arrow's `RecordBatch` format.
21//!
22//! ### Limitations
23//!
//! - **Avro unions with > 127 branches are not supported.**
//!   When decoding Avro unions to Arrow `UnionArray`, Arrow stores the union
//!   type identifiers in an **8‑bit signed** buffer (`i8`). This implies a
//!   practical limit of **127** distinct branch ids. Inputs that resolve to
//!   more than 127 branches will return an error. If you truly need more,
//!   model the schema as a **union of unions**, per the Arrow format spec.
//!
//!   See: Arrow Columnar Format — Dense Union (“types buffer: 8‑bit signed;
//!   a union with more than 127 possible types can be modeled as a union of
//!   unions”).
34//!
35//! This module exposes three layers of the API surface, from highest to lowest-level:
36//!
37//! * [`ReaderBuilder`](crate::reader::ReaderBuilder): configures how Avro is read (batch size, strict union handling,
38//!   string representation, reader schema, etc.) and produces either:
39//!   * a `Reader` for **Avro Object Container Files (OCF)** read from any `BufRead`, or
40//!   * a low-level `Decoder` for **single‑object encoded** Avro bytes and Confluent
41//!     **Schema Registry** framed messages.
42//! * [`Reader`](crate::reader::Reader): a convenient, synchronous iterator over `RecordBatch` decoded from an OCF
43//!   input. Implements [`Iterator<Item = Result<RecordBatch, ArrowError>>`] and
44//!   `RecordBatchReader`.
//! * [`Decoder`](crate::reader::Decoder): a push‑based row decoder that consumes framed Avro bytes
//!   (single‑object encoding, or the Confluent / Apicurio schema‑registry wire formats) and yields ready
//!   `RecordBatch` values when batches fill. This is suitable for integrating with async
//!   byte streams, network protocols, or other custom data sources.
48//!
49//! ## Encodings and when to use which type
50//!
51//! * **Object Container File (OCF)**: A self‑describing file format with a header containing
52//!   the writer schema, optional compression codec, and a sync marker, followed by one or
53//!   more data blocks. Use `Reader` for this format. See the Avro 1.11.1 specification
54//!   (“Object Container Files”). <https://avro.apache.org/docs/1.11.1/specification/#object-container-files>
55//! * **Single‑Object Encoding**: A stream‑friendly framing that prefixes each record body with
56//!   the 2‑byte marker `0xC3 0x01` followed by the **8‑byte little‑endian CRC‑64‑AVRO Rabin
57//!   fingerprint** of the writer schema, then the Avro binary body. Use `Decoder` with a
58//!   populated `SchemaStore` to resolve fingerprints to full schemas.
59//!   See “Single object encoding” in the Avro 1.11.1 spec.
60//!   <https://avro.apache.org/docs/1.11.1/specification/#single-object-encoding>
61//! * **Confluent Schema Registry wire format**: A 1‑byte magic `0x00`, a **4‑byte big‑endian**
62//!   schema ID, then the Avro‑encoded body. Use `Decoder` with a `SchemaStore` configured
63//!   for `FingerprintAlgorithm::Id` and entries keyed by `Fingerprint::Id`. See
64//!   Confluent’s “Wire format” documentation.
65//!   <https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
//! * **Apicurio Schema Registry wire format**: A 1‑byte magic `0x00`, an **8‑byte big‑endian**
67//!   global schema ID, then the Avro‑encoded body. Use `Decoder` with a `SchemaStore` configured
68//!   for `FingerprintAlgorithm::Id64` and entries keyed by `Fingerprint::Id64`. See
69//!   Apicurio’s “Avro SerDe” documentation.
70//!   <https://www.apicur.io/registry/docs/apicurio-registry/1.3.3.Final/getting-started/assembly-using-kafka-client-serdes.html#registry-serdes-types-avro-registry>
71//!
72//! ## Basic file usage (OCF)
73//!
74//! Use `ReaderBuilder::build` to construct a `Reader` from any `BufRead`. The doctest below
75//! creates a tiny OCF in memory using `AvroWriter` and then reads it back.
76//!
77//! ```
78//! use std::io::Cursor;
79//! use std::sync::Arc;
80//! use arrow_array::{ArrayRef, Int32Array, RecordBatch};
81//! use arrow_schema::{DataType, Field, Schema};
82//! use arrow_avro::writer::AvroWriter;
83//! use arrow_avro::reader::ReaderBuilder;
84//!
85//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
86//! // Build a minimal Arrow schema and batch
87//! let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
88//! let batch = RecordBatch::try_new(
89//!     Arc::new(schema.clone()),
90//!     vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
91//! )?;
92//!
93//! // Write an Avro OCF to memory
94//! let buffer: Vec<u8> = Vec::new();
95//! let mut writer = AvroWriter::new(buffer, schema.clone())?;
96//! writer.write(&batch)?;
97//! writer.finish()?;
98//! let bytes = writer.into_inner();
99//!
100//! // Read it back with ReaderBuilder
101//! let mut reader = ReaderBuilder::new().build(Cursor::new(bytes))?;
102//! let out = reader.next().unwrap()?;
103//! assert_eq!(out.num_rows(), 3);
104//! # Ok(()) }
105//! ```
106//!
107//! ## Streaming usage (single‑object / Confluent / Apicurio)
108//!
109//! The `Decoder` lets you integrate Avro decoding with **any** source of bytes by
110//! periodically calling `Decoder::decode` with new data and calling `Decoder::flush`
111//! to get a `RecordBatch` once at least one row is complete.
112//!
113//! The example below shows how to decode from an arbitrary stream of `bytes::Bytes` using
114//! `futures` utilities. Note: this is illustrative and keeps a single in‑memory `Bytes`
115//! buffer for simplicity—real applications typically maintain a rolling buffer.
116//!
117//! ```
118//! use bytes::{Buf, Bytes};
119//! use futures::{Stream, StreamExt};
120//! use std::task::{Poll, ready};
121//! use arrow_array::RecordBatch;
122//! use arrow_schema::ArrowError;
123//! use arrow_avro::reader::Decoder;
124//!
125//! /// Decode a stream of Avro-framed bytes into RecordBatch values.
126//! fn decode_stream<S: Stream<Item = Bytes> + Unpin>(
127//!     mut decoder: Decoder,
128//!     mut input: S,
129//! ) -> impl Stream<Item = Result<RecordBatch, ArrowError>> {
130//!     let mut buffered = Bytes::new();
131//!     futures::stream::poll_fn(move |cx| {
132//!         loop {
133//!             if buffered.is_empty() {
134//!                 buffered = match ready!(input.poll_next_unpin(cx)) {
135//!                     Some(b) => b,
136//!                     None => break, // EOF
137//!                 };
138//!             }
139//!             // Feed as much as possible
140//!             let decoded = match decoder.decode(buffered.as_ref()) {
141//!                 Ok(n) => n,
142//!                 Err(e) => return Poll::Ready(Some(Err(e))),
143//!             };
144//!             let read = buffered.len();
145//!             buffered.advance(decoded);
146//!             if decoded != read {
147//!                 // decoder made partial progress; request more bytes
148//!                 break
149//!             }
150//!         }
151//!         // Return a batch if one or more rows are complete
152//!         Poll::Ready(decoder.flush().transpose())
153//!     })
154//! }
155//! ```
156//!
157//! ### Building and using a `Decoder` for **single‑object encoding** (Rabin fingerprints)
158//!
159//! The doctest below **writes** a single‑object framed record using the Avro writer
160//! (no manual varints) for the writer schema
161//! (`{"type":"record","name":"User","fields":[{"name":"id","type":"long"}]}`)
162//! and then decodes it into a `RecordBatch`.
163//!
164//! ```
165//! use std::sync::Arc;
166//! use std::collections::HashMap;
167//! use arrow_array::{ArrayRef, Int64Array, RecordBatch};
168//! use arrow_schema::{DataType, Field, Schema};
169//! use arrow_avro::schema::{AvroSchema, SchemaStore, SCHEMA_METADATA_KEY, FingerprintStrategy};
170//! use arrow_avro::writer::{WriterBuilder, format::AvroSoeFormat};
171//! use arrow_avro::reader::ReaderBuilder;
172//!
173//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
174//! // Register the writer schema (Rabin fingerprint by default).
175//! let mut store = SchemaStore::new();
176//! let avro_schema = AvroSchema::new(r#"{"type":"record","name":"User","fields":[
177//!   {"name":"id","type":"long"}]}"#.to_string());
178//! let _fp = store.register(avro_schema.clone())?;
179//!
180//! // Create a single-object framed record { id: 42 } with the Avro writer.
181//! let mut md = HashMap::new();
182//! md.insert(SCHEMA_METADATA_KEY.to_string(), avro_schema.json_string.clone());
183//! let arrow = Schema::new_with_metadata(vec![Field::new("id", DataType::Int64, false)], md);
184//! let batch = RecordBatch::try_new(
185//!     Arc::new(arrow.clone()),
186//!     vec![Arc::new(Int64Array::from(vec![42])) as ArrayRef],
187//! )?;
188//! let mut w = WriterBuilder::new(arrow)
189//!     .with_fingerprint_strategy(FingerprintStrategy::Rabin) // SOE prefix
190//!     .build::<_, AvroSoeFormat>(Vec::new())?;
191//! w.write(&batch)?;
192//! w.finish()?;
193//! let frame = w.into_inner(); // C3 01 + fp + Avro body
194//!
195//! // Decode with a `Decoder`
196//! let mut dec = ReaderBuilder::new()
197//!   .with_writer_schema_store(store)
198//!   .with_batch_size(1024)
199//!   .build_decoder()?;
200//!
201//! dec.decode(&frame)?;
202//! let out = dec.flush()?.expect("one batch");
203//! assert_eq!(out.num_rows(), 1);
204//! # Ok(()) }
205//! ```
206//!
207//! See Avro 1.11.1 “Single object encoding” for details of the 2‑byte marker
208//! and little‑endian CRC‑64‑AVRO fingerprint:
209//! <https://avro.apache.org/docs/1.11.1/specification/#single-object-encoding>
210//!
211//! ### Building and using a `Decoder` for **Confluent Schema Registry** framing
212//!
213//! The Confluent wire format is: 1‑byte magic `0x00`, then a **4‑byte big‑endian** schema ID,
214//! then the Avro body. The doctest below crafts two messages for the same schema ID and
215//! decodes them into a single `RecordBatch` with two rows.
216//!
217//! ```
218//! use std::sync::Arc;
219//! use std::collections::HashMap;
220//! use arrow_array::{ArrayRef, Int64Array, StringArray, RecordBatch};
221//! use arrow_schema::{DataType, Field, Schema};
222//! use arrow_avro::schema::{AvroSchema, SchemaStore, Fingerprint, FingerprintAlgorithm, SCHEMA_METADATA_KEY, FingerprintStrategy};
223//! use arrow_avro::writer::{WriterBuilder, format::AvroSoeFormat};
224//! use arrow_avro::reader::ReaderBuilder;
225//!
226//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
227//! // Set up a store keyed by numeric IDs (Confluent).
228//! let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
229//! let schema_id = 7u32;
230//! let avro_schema = AvroSchema::new(r#"{"type":"record","name":"User","fields":[
231//!   {"name":"id","type":"long"}, {"name":"name","type":"string"}]}"#.to_string());
232//! store.set(Fingerprint::Id(schema_id), avro_schema.clone())?;
233//!
234//! // Write two Confluent-framed messages {id:1,name:"a"} and {id:2,name:"b"}.
235//! fn msg(id: i64, name: &str, schema: &AvroSchema, schema_id: u32) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
236//!     let mut md = HashMap::new();
237//!     md.insert(SCHEMA_METADATA_KEY.to_string(), schema.json_string.clone());
238//!     let arrow = Schema::new_with_metadata(
239//!         vec![Field::new("id", DataType::Int64, false), Field::new("name", DataType::Utf8, false)],
240//!         md,
241//!     );
242//!     let batch = RecordBatch::try_new(
243//!         Arc::new(arrow.clone()),
244//!         vec![
245//!           Arc::new(Int64Array::from(vec![id])) as ArrayRef,
246//!           Arc::new(StringArray::from(vec![name])) as ArrayRef,
247//!         ],
248//!     )?;
249//!     let mut w = WriterBuilder::new(arrow)
250//!         .with_fingerprint_strategy(FingerprintStrategy::Id(schema_id)) // 0x00 + ID + body
251//!         .build::<_, AvroSoeFormat>(Vec::new())?;
252//!     w.write(&batch)?; w.finish()?;
253//!     Ok(w.into_inner())
254//! }
255//! let m1 = msg(1, "a", &avro_schema, schema_id)?;
256//! let m2 = msg(2, "b", &avro_schema, schema_id)?;
257//!
258//! // Decode both into a single batch.
259//! let mut dec = ReaderBuilder::new()
260//!   .with_writer_schema_store(store)
261//!   .with_batch_size(1024)
262//!   .build_decoder()?;
263//! dec.decode(&m1)?;
264//! dec.decode(&m2)?;
265//! let batch = dec.flush()?.expect("batch");
266//! assert_eq!(batch.num_rows(), 2);
267//! # Ok(()) }
268//! ```
269//!
270//! See Confluent’s “Wire format” notes: magic byte `0x00`, 4‑byte **big‑endian** schema ID,
271//! then the Avro‑encoded payload.
272//! <https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
273//!
274//! ## Schema resolution (reader vs. writer schemas)
275//!
276//! Avro supports resolving data written with one schema (“writer”) into another (“reader”)
277//! using rules like **field aliases**, **default values**, and **numeric promotions**.
278//! In practice this lets you evolve schemas over time while remaining compatible with old data.
279//!
280//! *Spec background:* See Avro’s **Schema Resolution** (aliases, defaults) and the Confluent
281//! **Wire format** (magic `0x00` + big‑endian schema id + Avro body).
282//! <https://avro.apache.org/docs/1.11.1/specification/#schema-resolution>
283//! <https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
284//!
285//! ### OCF example: rename a field and add a default via a reader schema
286//!
287//! Below we write an OCF with a *writer schema* having fields `id: long`, `name: string`.
288//! We then read it with a *reader schema* that:
289//! - **renames** `name` to `full_name` via `aliases`, and
290//! - **adds** `is_active: boolean` with a **default** value `true`.
291//!
292//! ```
293//! use std::io::Cursor;
294//! use std::sync::Arc;
295//! use arrow_array::{ArrayRef, Int64Array, StringArray, RecordBatch};
296//! use arrow_schema::{DataType, Field, Schema};
297//! use arrow_avro::writer::AvroWriter;
298//! use arrow_avro::reader::ReaderBuilder;
299//! use arrow_avro::schema::AvroSchema;
300//!
301//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
302//! // Writer (past version): { id: long, name: string }
303//! let writer_arrow = Schema::new(vec![
304//!     Field::new("id", DataType::Int64, false),
305//!     Field::new("name", DataType::Utf8, false),
306//! ]);
307//! let batch = RecordBatch::try_new(
308//!     Arc::new(writer_arrow.clone()),
309//!     vec![
310//!         Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef,
311//!         Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
312//!     ],
313//! )?;
314//!
315//! // Write an OCF entirely in memory
316//! let mut w = AvroWriter::new(Vec::<u8>::new(), writer_arrow)?;
317//! w.write(&batch)?;
318//! w.finish()?;
319//! let bytes = w.into_inner();
320//!
321//! // Reader (current version):
322//! //  - record name "topLevelRecord" matches the crate's default for OCF
323//! //  - rename `name` -> `full_name` using aliases (optional)
324//! let reader_json = r#"
325//! {
326//!   "type": "record",
327//!   "name": "topLevelRecord",
328//!   "fields": [
329//!     { "name": "id", "type": "long" },
330//!     { "name": "full_name", "type": ["null","string"], "aliases": ["name"], "default": null },
331//!     { "name": "is_active", "type": "boolean", "default": true }
332//!   ]
333//! }"#;
334//!
335//! let mut reader = ReaderBuilder::new()
336//!   .with_reader_schema(AvroSchema::new(reader_json.to_string()))
337//!   .build(Cursor::new(bytes))?;
338//!
339//! let out = reader.next().unwrap()?;
340//! assert_eq!(out.num_rows(), 2);
341//! # Ok(()) }
342//! ```
343//!
344//! ### Confluent single‑object example: resolve *past* writer versions to the topic’s **current** reader schema
345//!
//! In this scenario, the **reader schema** is the topic’s *current* schema, while the two
//! **writer schemas** registered under Confluent IDs **0** and **1** represent *past versions*.
//! The decoder uses the reader schema to resolve both versions.
349//!
350//! ```
351//! use std::sync::Arc;
352//! use std::collections::HashMap;
353//! use arrow_avro::reader::ReaderBuilder;
354//! use arrow_avro::schema::{
355//!     AvroSchema, Fingerprint, FingerprintAlgorithm, SchemaStore,
356//!     SCHEMA_METADATA_KEY, FingerprintStrategy,
357//! };
358//! use arrow_array::{ArrayRef, Int32Array, Int64Array, StringArray, RecordBatch};
359//! use arrow_schema::{DataType, Field, Schema};
360//!
361//! fn main() -> Result<(), Box<dyn std::error::Error>> {
362//!     // Reader: current topic schema (no reader-added fields)
363//!     //   {"type":"record","name":"User","fields":[
364//!     //     {"name":"id","type":"long"},
365//!     //     {"name":"name","type":"string"}]}
366//!     let reader_schema = AvroSchema::new(
367//!         r#"{"type":"record","name":"User",
368//!             "fields":[{"name":"id","type":"long"},{"name":"name","type":"string"}]}"#
369//!             .to_string(),
370//!     );
371//!
372//!     // Register two *writer* schemas under Confluent IDs 0 and 1
373//!     let writer_v0 = AvroSchema::new(
374//!         r#"{"type":"record","name":"User",
375//!             "fields":[{"name":"id","type":"int"},{"name":"name","type":"string"}]}"#
376//!             .to_string(),
377//!     );
378//!     let writer_v1 = AvroSchema::new(
379//!         r#"{"type":"record","name":"User",
380//!             "fields":[{"name":"id","type":"long"},{"name":"name","type":"string"},
381//!                       {"name":"email","type":["null","string"],"default":null}]}"#
382//!             .to_string(),
383//!     );
384//!
385//!     let id_v0: u32 = 0;
386//!     let id_v1: u32 = 1;
387//!
388//!     let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id); // integer IDs
389//!     store.set(Fingerprint::Id(id_v0), writer_v0.clone())?;
390//!     store.set(Fingerprint::Id(id_v1), writer_v1.clone())?;
391//!
392//!     // Write two Confluent-framed messages using each writer version
393//!     // frame0: writer v0 body {id:1001_i32, name:"v0-alice"}
394//!     let mut md0 = HashMap::new();
395//!     md0.insert(SCHEMA_METADATA_KEY.to_string(), writer_v0.json_string.clone());
396//!     let arrow0 = Schema::new_with_metadata(
397//!         vec![Field::new("id", DataType::Int32, false),
398//!              Field::new("name", DataType::Utf8, false)], md0);
399//!     let batch0 = RecordBatch::try_new(
400//!         Arc::new(arrow0.clone()),
401//!         vec![Arc::new(Int32Array::from(vec![1001])) as ArrayRef,
402//!              Arc::new(StringArray::from(vec!["v0-alice"])) as ArrayRef])?;
403//!     let mut w0 = arrow_avro::writer::WriterBuilder::new(arrow0)
404//!         .with_fingerprint_strategy(FingerprintStrategy::Id(id_v0))
405//!         .build::<_, arrow_avro::writer::format::AvroSoeFormat>(Vec::new())?;
406//!     w0.write(&batch0)?; w0.finish()?;
407//!     let frame0 = w0.into_inner(); // 0x00 + id_v0 + body
408//!
409//!     // frame1: writer v1 body {id:2002_i64, name:"v1-bob", email: Some("bob@example.com")}
410//!     let mut md1 = HashMap::new();
411//!    md1.insert(SCHEMA_METADATA_KEY.to_string(), writer_v1.json_string.clone());
412//!     let arrow1 = Schema::new_with_metadata(
413//!         vec![Field::new("id", DataType::Int64, false),
414//!              Field::new("name", DataType::Utf8, false),
415//!              Field::new("email", DataType::Utf8, true)], md1);
416//!     let batch1 = RecordBatch::try_new(
417//!         Arc::new(arrow1.clone()),
418//!         vec![Arc::new(Int64Array::from(vec![2002])) as ArrayRef,
419//!              Arc::new(StringArray::from(vec!["v1-bob"])) as ArrayRef,
420//!              Arc::new(StringArray::from(vec![Some("bob@example.com")])) as ArrayRef])?;
421//!     let mut w1 = arrow_avro::writer::WriterBuilder::new(arrow1)
422//!         .with_fingerprint_strategy(FingerprintStrategy::Id(id_v1))
423//!         .build::<_, arrow_avro::writer::format::AvroSoeFormat>(Vec::new())?;
424//!     w1.write(&batch1)?; w1.finish()?;
425//!     let frame1 = w1.into_inner(); // 0x00 + id_v1 + body
426//!
427//!     // Build a streaming Decoder that understands Confluent framing
428//!     let mut decoder = ReaderBuilder::new()
429//!         .with_reader_schema(reader_schema)
430//!         .with_writer_schema_store(store)
431//!         .with_batch_size(8) // small demo batches
432//!         .build_decoder()?;
433//!
434//!     // Decode each whole frame, then drain completed rows with flush()
435//!     let mut total_rows = 0usize;
436//!
437//!     let consumed0 = decoder.decode(&frame0)?;
438//!     assert_eq!(consumed0, frame0.len(), "decoder must consume the whole frame");
439//!     while let Some(batch) = decoder.flush()? { total_rows += batch.num_rows(); }
440//!
441//!     let consumed1 = decoder.decode(&frame1)?;
442//!     assert_eq!(consumed1, frame1.len(), "decoder must consume the whole frame");
443//!     while let Some(batch) = decoder.flush()? { total_rows += batch.num_rows(); }
444//!
445//!     // We sent 2 records so we should get 2 rows (possibly one per flush)
446//!     assert_eq!(total_rows, 2);
447//!     Ok(())
448//! }
449//! ```
450//!
451//! ## Schema evolution and batch boundaries
452//!
453//! `Decoder` supports mid‑stream schema changes when the input framing carries a schema
454//! fingerprint (single‑object or Confluent). When a new fingerprint is observed:
455//!
456//! * If the current `RecordBatch` is **empty**, the decoder switches to the new schema
457//!   immediately.
458//! * If not, the decoder finishes the current batch first and only then switches.
459//!
460//! Consequently, the schema of batches produced by `Decoder::flush` may change over time,
461//! and `Decoder` intentionally does **not** implement `RecordBatchReader`. In contrast,
462//! `Reader` (OCF) has a single writer schema for the entire file and therefore implements
463//! `RecordBatchReader`.
464//!
465//! ## Performance & memory
466//!
467//! * `batch_size` controls the maximum number of rows per `RecordBatch`. Larger batches
468//!   amortize per‑batch overhead; smaller batches reduce peak memory usage and latency.
469//! * When `utf8_view` is enabled, string columns use Arrow’s `StringViewArray`, which can
470//!   reduce allocations for short strings.
471//! * For OCF, blocks may be compressed; `Reader` will decompress using the codec specified
472//!   in the file header and feed uncompressed bytes to the row `Decoder`.
473//!
474//! ## Error handling
475//!
476//! * Incomplete inputs return parse errors with "Unexpected EOF"; callers typically provide
477//!   more bytes and try again.
478//! * If a fingerprint is unknown to the provided `SchemaStore`, decoding fails with a
479//!   descriptive error. Populate the store up front to avoid this.
480//!
481//! ---
482use crate::codec::AvroFieldBuilder;
483use crate::reader::header::read_header;
484use crate::schema::{
485    AvroSchema, CONFLUENT_MAGIC, Fingerprint, FingerprintAlgorithm, SINGLE_OBJECT_MAGIC, Schema,
486    SchemaStore,
487};
488use arrow_array::{RecordBatch, RecordBatchReader};
489use arrow_schema::{ArrowError, SchemaRef};
490use block::BlockDecoder;
491use header::Header;
492use indexmap::IndexMap;
493use record::RecordDecoder;
494use std::io::BufRead;
495
496mod block;
497mod cursor;
498mod header;
499mod record;
500mod vlq;
501
502fn is_incomplete_data(err: &ArrowError) -> bool {
503    matches!(
504        err,
505        ArrowError::ParseError(msg)
506            if msg.contains("Unexpected EOF")
507    )
508}
509
510/// A low‑level, push‑based decoder from Avro bytes to Arrow `RecordBatch`.
511///
512/// `Decoder` is designed for **streaming** scenarios:
513///
514/// * You *feed* freshly received bytes using `Self::decode`, potentially multiple times,
515///   until at least one row is complete.
516/// * You then *drain* completed rows with `Self::flush`, which yields a `RecordBatch`
517///   if any rows were finished since the last flush.
518///
519/// Unlike `Reader`, which is specialized for Avro **Object Container Files**, `Decoder`
520/// understands **framed single‑object** inputs and **Confluent Schema Registry** messages,
521/// switching schemas mid‑stream when the framing indicates a new fingerprint.
522///
523/// ### Supported prefixes
524///
525/// On each new row boundary, `Decoder` tries to match one of the following "prefixes":
526///
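/// * **Single‑Object encoding**: magic `0xC3 0x01` + schema fingerprint (length depends on
///   the configured `FingerprintAlgorithm`); see `SINGLE_OBJECT_MAGIC`.
/// * **Confluent wire format**: magic `0x00` + 4‑byte big‑endian schema id; see
///   `CONFLUENT_MAGIC`.
/// * **Apicurio wire format**: magic `0x00` + 8‑byte big‑endian global schema id; used when
///   the `SchemaStore` is configured for `FingerprintAlgorithm::Id64`.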
531///
532/// The active fingerprint determines which cached row decoder is used to decode the following
533/// record body bytes.
534///
535/// ### Schema switching semantics
536///
537/// When a new fingerprint is observed:
538///
539/// * If the current batch is empty, the decoder switches immediately;
540/// * Otherwise, the current batch is finalized on the next `flush` and only then
541///   does the decoder switch to the new schema. This guarantees that a single `RecordBatch`
542///   never mixes rows with different schemas.
543///
544/// ### Examples
545///
546/// Build and use a `Decoder` for single‑object encoding:
547///
548/// ```
549/// use arrow_avro::schema::{AvroSchema, SchemaStore};
550/// use arrow_avro::reader::ReaderBuilder;
551///
552/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
553/// // Use a record schema at the top level so we can build an Arrow RecordBatch
554/// let mut store = SchemaStore::new(); // Rabin fingerprinting by default
555/// let avro = AvroSchema::new(
556///     r#"{"type":"record","name":"E","fields":[{"name":"x","type":"long"}]}"#.to_string()
557/// );
558/// let fp = store.register(avro)?;
559///
560/// // --- Hidden: write a single-object framed row {x:7} ---
561/// # use std::sync::Arc;
562/// # use std::collections::HashMap;
563/// # use arrow_array::{ArrayRef, Int64Array, RecordBatch};
564/// # use arrow_schema::{DataType, Field, Schema};
565/// # use arrow_avro::schema::{SCHEMA_METADATA_KEY, FingerprintStrategy};
566/// # use arrow_avro::writer::{WriterBuilder, format::AvroSoeFormat};
567/// # let mut md = HashMap::new();
568/// # md.insert(SCHEMA_METADATA_KEY.to_string(),
569/// #     r#"{"type":"record","name":"E","fields":[{"name":"x","type":"long"}]}"#.to_string());
570/// # let arrow = Schema::new_with_metadata(vec![Field::new("x", DataType::Int64, false)], md);
571/// # let batch = RecordBatch::try_new(Arc::new(arrow.clone()), vec![Arc::new(Int64Array::from(vec![7])) as ArrayRef])?;
572/// # let mut w = WriterBuilder::new(arrow)
573/// #     .with_fingerprint_strategy(fp.into())
574/// #     .build::<_, AvroSoeFormat>(Vec::new())?;
575/// # w.write(&batch)?; w.finish()?; let frame = w.into_inner();
576///
577/// let mut decoder = ReaderBuilder::new()
578///     .with_writer_schema_store(store)
579///     .with_batch_size(16)
580///     .build_decoder()?;
581///
582/// # decoder.decode(&frame)?;
583/// let batch = decoder.flush()?.expect("one row");
584/// assert_eq!(batch.num_rows(), 1);
585/// # Ok(()) }
586/// ```
587///
588/// *Background:* Avro's single‑object encoding is defined as `0xC3 0x01` + 8‑byte
589/// little‑endian CRC‑64‑AVRO fingerprint of the **writer schema** + Avro binary body.
590/// See the Avro 1.11.1 spec for details. <https://avro.apache.org/docs/1.11.1/specification/#single-object-encoding>
591///
592/// Build and use a `Decoder` for Confluent Registry messages:
593///
594/// ```
595/// use arrow_avro::schema::{AvroSchema, SchemaStore, Fingerprint, FingerprintAlgorithm};
596/// use arrow_avro::reader::ReaderBuilder;
597///
598/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
599/// let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
600/// store.set(Fingerprint::Id(1234), AvroSchema::new(r#"{"type":"record","name":"E","fields":[{"name":"x","type":"long"}]}"#.to_string()))?;
601///
602/// // --- Hidden: encode two Confluent-framed messages {x:1} and {x:2} ---
603/// # use std::sync::Arc;
604/// # use std::collections::HashMap;
605/// # use arrow_array::{ArrayRef, Int64Array, RecordBatch};
606/// # use arrow_schema::{DataType, Field, Schema};
607/// # use arrow_avro::schema::{SCHEMA_METADATA_KEY, FingerprintStrategy};
608/// # use arrow_avro::writer::{WriterBuilder, format::AvroSoeFormat};
609/// # fn msg(x: i64) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
610/// #   let mut md = HashMap::new();
611/// #   md.insert(SCHEMA_METADATA_KEY.to_string(),
612/// #     r#"{"type":"record","name":"E","fields":[{"name":"x","type":"long"}]}"#.to_string());
613/// #   let arrow = Schema::new_with_metadata(vec![Field::new("x", DataType::Int64, false)], md);
614/// #   let batch = RecordBatch::try_new(Arc::new(arrow.clone()), vec![Arc::new(Int64Array::from(vec![x])) as ArrayRef])?;
615/// #   let mut w = WriterBuilder::new(arrow)
616/// #       .with_fingerprint_strategy(FingerprintStrategy::Id(1234))
617/// #       .build::<_, AvroSoeFormat>(Vec::new())?;
618/// #   w.write(&batch)?; w.finish()?; Ok(w.into_inner())
619/// # }
620/// # let m1 = msg(1)?;
621/// # let m2 = msg(2)?;
622///
623/// let mut decoder = ReaderBuilder::new()
624///     .with_writer_schema_store(store)
625///     .build_decoder()?;
626/// # decoder.decode(&m1)?;
627/// # decoder.decode(&m2)?;
628/// let batch = decoder.flush()?.expect("two rows");
629/// assert_eq!(batch.num_rows(), 2);
630/// # Ok(()) }
631/// ```
#[derive(Debug)]
pub struct Decoder {
    /// Row decoder that `decode` feeds record bodies into; `schema()` reports its schema.
    active_decoder: RecordDecoder,
    /// NOTE(review): presumably the fingerprint that selected `active_decoder`; it is not
    /// read in the visible part of this file — confirm.
    active_fingerprint: Option<Fingerprint>,
    /// Configured maximum number of rows per batch, as returned by `batch_size()`.
    batch_size: usize,
    /// Number of rows `decode` may still accept; it is decremented once per decoded row
    /// and `decode` stops consuming input when it reaches zero.
    remaining_capacity: usize,
    /// Row decoders keyed by fingerprint.
    /// NOTE(review): presumably reused when the stream switches schemas — confirm.
    cache: IndexMap<Fingerprint, RecordDecoder>,
    /// NOTE(review): presumably determines how the fingerprint part of a prefix is
    /// parsed — confirm in `handle_prefix`.
    fingerprint_algorithm: FingerprintAlgorithm,
    /// NOTE(review): looks like a schema switch that is deferred until the current batch
    /// is empty — confirm against `apply_pending_schema_if_batch_empty`.
    pending_schema: Option<(Fingerprint, RecordDecoder)>,
    /// Set once a prefix has been consumed and cleared once the record body that
    /// follows it has been decoded.
    awaiting_body: bool,
}
643
644impl Decoder {
645    /// Returns the Arrow schema for the rows decoded by this decoder.
646    ///
647    /// **Note:** With single‑object or Confluent framing, the schema may change
648    /// at a row boundary when the input indicates a new fingerprint.
649    pub fn schema(&self) -> SchemaRef {
650        self.active_decoder.schema().clone()
651    }
652
653    /// Returns the configured maximum number of rows per batch.
654    pub fn batch_size(&self) -> usize {
655        self.batch_size
656    }
657
658    /// Feed a chunk of bytes into the decoder.
659    ///
660    /// This will:
661    ///
662    /// * Decode at most `Self::batch_size` rows;
663    /// * Return the number of input bytes **consumed** from `data` (which may be 0 if more
664    ///   bytes are required, or less than `data.len()` if a prefix/body straddles the
665    ///   chunk boundary);
666    /// * Defer producing a `RecordBatch` until you call `Self::flush`.
667    ///
668    /// # Returns
669    /// The number of bytes consumed from `data`.
670    ///
671    /// # Errors
672    /// Returns an error if:
673    ///
674    /// * The input indicates an unknown fingerprint (not present in the provided
675    ///   `SchemaStore`;
676    /// * The Avro body is malformed;
677    /// * A strict‑mode union rule is violated (see `ReaderBuilder::with_strict_mode`).
678    pub fn decode(&mut self, data: &[u8]) -> Result<usize, ArrowError> {
679        let mut total_consumed = 0usize;
680        while total_consumed < data.len() && self.remaining_capacity > 0 {
681            if self.awaiting_body {
682                match self.active_decoder.decode(&data[total_consumed..], 1) {
683                    Ok(n) => {
684                        self.remaining_capacity -= 1;
685                        total_consumed += n;
686                        self.awaiting_body = false;
687                        continue;
688                    }
689                    Err(ref e) if is_incomplete_data(e) => break,
690                    err => return err,
691                };
692            }
693            match self.handle_prefix(&data[total_consumed..])? {
694                Some(0) => break, // Insufficient bytes
695                Some(n) => {
696                    total_consumed += n;
697                    self.apply_pending_schema_if_batch_empty();
698                    self.awaiting_body = true;
699                }
700                None => {
701                    return Err(ArrowError::ParseError(
702                        "Missing magic bytes and fingerprint".to_string(),
703                    ));
704                }
705            }
706        }
707        Ok(total_consumed)
708    }
709
710    // Attempt to handle a prefix at the current position.
711    // * Ok(None) – buffer does not start with the prefix.
712    // * Ok(Some(0)) – prefix detected, but the buffer is too short; caller should await more bytes.
713    // * Ok(Some(n)) – consumed `n > 0` bytes of a complete prefix (magic and fingerprint).
714    fn handle_prefix(&mut self, buf: &[u8]) -> Result<Option<usize>, ArrowError> {
715        match self.fingerprint_algorithm {
716            FingerprintAlgorithm::Rabin => {
717                self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
718                    Fingerprint::Rabin(u64::from_le_bytes(bytes))
719                })
720            }
721            FingerprintAlgorithm::Id => self.handle_prefix_common(buf, &CONFLUENT_MAGIC, |bytes| {
722                Fingerprint::Id(u32::from_be_bytes(bytes))
723            }),
724            FingerprintAlgorithm::Id64 => {
725                self.handle_prefix_common(buf, &CONFLUENT_MAGIC, |bytes| {
726                    Fingerprint::Id64(u64::from_be_bytes(bytes))
727                })
728            }
729            #[cfg(feature = "md5")]
730            FingerprintAlgorithm::MD5 => {
731                self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
732                    Fingerprint::MD5(bytes)
733                })
734            }
735            #[cfg(feature = "sha256")]
736            FingerprintAlgorithm::SHA256 => {
737                self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
738                    Fingerprint::SHA256(bytes)
739                })
740            }
741        }
742    }
743
744    /// This method checks for the provided `magic` bytes at the start of `buf` and, if present,
745    /// attempts to read the following fingerprint of `N` bytes, converting it to a
746    /// `Fingerprint` using `fingerprint_from`.
747    fn handle_prefix_common<const MAGIC_LEN: usize, const N: usize>(
748        &mut self,
749        buf: &[u8],
750        magic: &[u8; MAGIC_LEN],
751        fingerprint_from: impl FnOnce([u8; N]) -> Fingerprint,
752    ) -> Result<Option<usize>, ArrowError> {
753        // Need at least the magic bytes to decide
754        // 2 bytes for Avro Spec and 1 byte for Confluent Wire Protocol.
755        if buf.len() < MAGIC_LEN {
756            return Ok(Some(0));
757        }
758        // Bail out early if the magic does not match.
759        if &buf[..MAGIC_LEN] != magic {
760            return Ok(None);
761        }
762        // Try to parse the fingerprint that follows the magic.
763        let consumed_fp = self.handle_fingerprint(&buf[MAGIC_LEN..], fingerprint_from)?;
764        // Convert the inner result into a “bytes consumed” count.
765        // NOTE: Incomplete fingerprint consumes no bytes.
766        Ok(Some(consumed_fp.map_or(0, |n| n + MAGIC_LEN)))
767    }
768
769    // Attempts to read and install a new fingerprint of `N` bytes.
770    //
771    // * Ok(None) – insufficient bytes (`buf.len() < `N`).
772    // * Ok(Some(N)) – fingerprint consumed (always `N`).
773    fn handle_fingerprint<const N: usize>(
774        &mut self,
775        buf: &[u8],
776        fingerprint_from: impl FnOnce([u8; N]) -> Fingerprint,
777    ) -> Result<Option<usize>, ArrowError> {
778        // Need enough bytes to get fingerprint (next N bytes)
779        let Some(fingerprint_bytes) = buf.get(..N) else {
780            return Ok(None); // insufficient bytes
781        };
782        // SAFETY: length checked above.
783        let new_fingerprint = fingerprint_from(fingerprint_bytes.try_into().unwrap());
784        // If the fingerprint indicates a schema change, prepare to switch decoders.
785        if self.active_fingerprint != Some(new_fingerprint) {
786            let Some(new_decoder) = self.cache.shift_remove(&new_fingerprint) else {
787                return Err(ArrowError::ParseError(format!(
788                    "Unknown fingerprint: {new_fingerprint:?}"
789                )));
790            };
791            self.pending_schema = Some((new_fingerprint, new_decoder));
792            // If there are already decoded rows, we must flush them first.
793            // Reducing `remaining_capacity` to 0 ensures `flush` is called next.
794            if self.remaining_capacity < self.batch_size {
795                self.remaining_capacity = 0;
796            }
797        }
798        Ok(Some(N))
799    }
800
801    fn apply_pending_schema(&mut self) {
802        if let Some((new_fingerprint, new_decoder)) = self.pending_schema.take() {
803            if let Some(old_fingerprint) = self.active_fingerprint.replace(new_fingerprint) {
804                let old_decoder = std::mem::replace(&mut self.active_decoder, new_decoder);
805                self.cache.shift_remove(&old_fingerprint);
806                self.cache.insert(old_fingerprint, old_decoder);
807            } else {
808                self.active_decoder = new_decoder;
809            }
810        }
811    }
812
813    fn apply_pending_schema_if_batch_empty(&mut self) {
814        if self.batch_is_empty() {
815            self.apply_pending_schema();
816        }
817    }
818
819    fn flush_and_reset(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
820        if self.batch_is_empty() {
821            return Ok(None);
822        }
823        let batch = self.active_decoder.flush()?;
824        self.remaining_capacity = self.batch_size;
825        Ok(Some(batch))
826    }
827
828    /// Produce a `RecordBatch` if at least one row is fully decoded, returning
829    /// `Ok(None)` if no new rows are available.
830    ///
831    /// If a schema change was detected while decoding rows for the current batch, the
832    /// schema switch is applied **after** flushing this batch, so the **next** batch
833    /// (if any) may have a different schema.
834    pub fn flush(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
835        // We must flush the active decoder before switching to the pending one.
836        let batch = self.flush_and_reset();
837        self.apply_pending_schema();
838        batch
839    }
840
    /// Returns the number of rows that can be added to this decoder before it is full.
    ///
    /// Starts at `Self::batch_size` and is restored to that value whenever a
    /// non-empty batch is flushed.
    pub fn capacity(&self) -> usize {
        self.remaining_capacity
    }
845
846    /// Returns true if the decoder has reached its capacity for the current batch.
847    pub fn batch_is_full(&self) -> bool {
848        self.remaining_capacity == 0
849    }
850
851    /// Returns true if the decoder has not decoded any batches yet (i.e., the current batch is empty).
852    pub fn batch_is_empty(&self) -> bool {
853        self.remaining_capacity == self.batch_size
854    }
855
856    // Decode either the block count or remaining capacity from `data` (an OCF block payload).
857    //
858    // Returns the number of bytes consumed from `data` along with the number of records decoded.
859    fn decode_block(&mut self, data: &[u8], count: usize) -> Result<(usize, usize), ArrowError> {
860        // OCF decoding never interleaves records across blocks, so no chunking.
861        let to_decode = std::cmp::min(count, self.remaining_capacity);
862        if to_decode == 0 {
863            return Ok((0, 0));
864        }
865        let consumed = self.active_decoder.decode(data, to_decode)?;
866        self.remaining_capacity -= to_decode;
867        Ok((consumed, to_decode))
868    }
869
    // Produce a `RecordBatch` if at least one row is fully decoded, returning
    // `Ok(None)` if no new rows are available.
    //
    // Unlike `flush`, this does not apply a pending schema switch; it only delegates
    // to `flush_and_reset`.
    fn flush_block(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
        self.flush_and_reset()
    }
875}
876
877/// A builder that configures and constructs Avro readers and decoders.
878///
879/// `ReaderBuilder` is the primary entry point for this module. It supports:
880///
881/// * OCF reading via `Self::build`, returning a `Reader` over any `BufRead`;
882/// * streaming decoding via `Self::build_decoder`, returning a `Decoder`.
883///
884/// ### Options
885///
886/// * **`batch_size`**: Max rows per `RecordBatch` (default: `1024`). See `Self::with_batch_size`.
887/// * **`utf8_view`**: Use Arrow `StringViewArray` for string columns (default: `false`).
888///   See `Self::with_utf8_view`.
889/// * **`strict_mode`**: Opt‑in to stricter union handling (default: `false`).
890///   See `Self::with_strict_mode`.
891/// * **`reader_schema`**: Optional reader schema (projection / evolution) used when decoding
892///   values (default: `None`). See `Self::with_reader_schema`.
893/// * **`writer_schema_store`**: Required for building a `Decoder` for single‑object or
894///   Confluent framing. Maps fingerprints to Avro schemas. See `Self::with_writer_schema_store`.
895/// * **`active_fingerprint`**: Optional starting fingerprint for streaming decode when the
896///   first frame omits one (rare). See `Self::with_active_fingerprint`.
897///
898/// ### Examples
899///
900/// Read an OCF file in batches of 4096 rows:
901///
902/// ```no_run
903/// use std::fs::File;
904/// use std::io::BufReader;
905/// use arrow_avro::reader::ReaderBuilder;
906///
907/// let file = File::open("data.avro")?;
908/// let mut reader = ReaderBuilder::new()
909///     .with_batch_size(4096)
910///     .build(BufReader::new(file))?;
911/// # Ok::<(), Box<dyn std::error::Error>>(())
912/// ```
913///
914/// Build a `Decoder` for Confluent messages:
915///
916/// ```
917/// use arrow_avro::schema::{AvroSchema, SchemaStore, Fingerprint, FingerprintAlgorithm};
918/// use arrow_avro::reader::ReaderBuilder;
919///
920/// let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
921/// store.set(Fingerprint::Id(1234), AvroSchema::new(r#"{"type":"record","name":"E","fields":[]}"#.to_string()))?;
922///
923/// let decoder = ReaderBuilder::new()
924///     .with_writer_schema_store(store)
925///     .build_decoder()?;
926/// # Ok::<(), Box<dyn std::error::Error>>(())
927/// ```
#[derive(Debug)]
pub struct ReaderBuilder {
    // Maximum number of rows per produced `RecordBatch`.
    batch_size: usize,
    // Whether stricter union handling is requested.
    strict_mode: bool,
    // Whether string columns are decoded into `StringViewArray`.
    utf8_view: bool,
    // Optional reader schema used for projection / schema evolution.
    reader_schema: Option<AvroSchema>,
    // Store mapping fingerprints to writer schemas; required by `build_decoder`.
    writer_schema_store: Option<SchemaStore>,
    // Optional fingerprint of the schema a streaming decoder should start with.
    active_fingerprint: Option<Fingerprint>,
}
937
938impl Default for ReaderBuilder {
939    fn default() -> Self {
940        Self {
941            batch_size: 1024,
942            strict_mode: false,
943            utf8_view: false,
944            reader_schema: None,
945            writer_schema_store: None,
946            active_fingerprint: None,
947        }
948    }
949}
950
951impl ReaderBuilder {
952    /// Creates a new `ReaderBuilder` with defaults:
953    ///
954    /// * `batch_size = 1024`
955    /// * `strict_mode = false`
956    /// * `utf8_view = false`
957    /// * `reader_schema = None`
958    /// * `writer_schema_store = None`
959    /// * `active_fingerprint = None`
960    pub fn new() -> Self {
961        Self::default()
962    }
963
964    fn make_record_decoder(
965        &self,
966        writer_schema: &Schema,
967        reader_schema: Option<&Schema>,
968    ) -> Result<RecordDecoder, ArrowError> {
969        let mut builder = AvroFieldBuilder::new(writer_schema);
970        if let Some(reader_schema) = reader_schema {
971            builder = builder.with_reader_schema(reader_schema);
972        }
973        let root = builder
974            .with_utf8view(self.utf8_view)
975            .with_strict_mode(self.strict_mode)
976            .build()?;
977        RecordDecoder::try_new_with_options(root.data_type())
978    }
979
980    fn make_record_decoder_from_schemas(
981        &self,
982        writer_schema: &Schema,
983        reader_schema: Option<&AvroSchema>,
984    ) -> Result<RecordDecoder, ArrowError> {
985        let reader_schema_raw = reader_schema.map(|s| s.schema()).transpose()?;
986        self.make_record_decoder(writer_schema, reader_schema_raw.as_ref())
987    }
988
989    fn make_decoder_with_parts(
990        &self,
991        active_decoder: RecordDecoder,
992        active_fingerprint: Option<Fingerprint>,
993        cache: IndexMap<Fingerprint, RecordDecoder>,
994        fingerprint_algorithm: FingerprintAlgorithm,
995    ) -> Decoder {
996        Decoder {
997            batch_size: self.batch_size,
998            remaining_capacity: self.batch_size,
999            active_fingerprint,
1000            active_decoder,
1001            cache,
1002            fingerprint_algorithm,
1003            pending_schema: None,
1004            awaiting_body: false,
1005        }
1006    }
1007
1008    fn make_decoder(
1009        &self,
1010        header: Option<&Header>,
1011        reader_schema: Option<&AvroSchema>,
1012    ) -> Result<Decoder, ArrowError> {
1013        if let Some(hdr) = header {
1014            let writer_schema = hdr
1015                .schema()
1016                .map_err(|e| ArrowError::ExternalError(Box::new(e)))?
1017                .ok_or_else(|| {
1018                    ArrowError::ParseError("No Avro schema present in file header".into())
1019                })?;
1020            let record_decoder =
1021                self.make_record_decoder_from_schemas(&writer_schema, reader_schema)?;
1022            return Ok(self.make_decoder_with_parts(
1023                record_decoder,
1024                None,
1025                IndexMap::new(),
1026                FingerprintAlgorithm::Rabin,
1027            ));
1028        }
1029        let store = self.writer_schema_store.as_ref().ok_or_else(|| {
1030            ArrowError::ParseError("Writer schema store required for raw Avro".into())
1031        })?;
1032        let fingerprints = store.fingerprints();
1033        if fingerprints.is_empty() {
1034            return Err(ArrowError::ParseError(
1035                "Writer schema store must contain at least one schema".into(),
1036            ));
1037        }
1038        let start_fingerprint = self
1039            .active_fingerprint
1040            .or_else(|| fingerprints.first().copied())
1041            .ok_or_else(|| {
1042                ArrowError::ParseError("Could not determine initial schema fingerprint".into())
1043            })?;
1044        let mut cache = IndexMap::with_capacity(fingerprints.len().saturating_sub(1));
1045        let mut active_decoder: Option<RecordDecoder> = None;
1046        for fingerprint in store.fingerprints() {
1047            let avro_schema = match store.lookup(&fingerprint) {
1048                Some(schema) => schema,
1049                None => {
1050                    return Err(ArrowError::ComputeError(format!(
1051                        "Fingerprint {fingerprint:?} not found in schema store",
1052                    )));
1053                }
1054            };
1055            let writer_schema = avro_schema.schema()?;
1056            let record_decoder =
1057                self.make_record_decoder_from_schemas(&writer_schema, reader_schema)?;
1058            if fingerprint == start_fingerprint {
1059                active_decoder = Some(record_decoder);
1060            } else {
1061                cache.insert(fingerprint, record_decoder);
1062            }
1063        }
1064        let active_decoder = active_decoder.ok_or_else(|| {
1065            ArrowError::ComputeError(format!(
1066                "Initial fingerprint {start_fingerprint:?} not found in schema store"
1067            ))
1068        })?;
1069        Ok(self.make_decoder_with_parts(
1070            active_decoder,
1071            Some(start_fingerprint),
1072            cache,
1073            store.fingerprint_algorithm(),
1074        ))
1075    }
1076
1077    /// Sets the **row‑based batch size**.
1078    ///
1079    /// Each call to `Decoder::flush` or each iteration of `Reader` yields a batch with
1080    /// *up to* this many rows. Larger batches can reduce overhead; smaller batches can
1081    /// reduce peak memory usage and latency.
1082    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
1083        self.batch_size = batch_size;
1084        self
1085    }
1086
1087    /// Choose Arrow's `StringViewArray` for UTF‑8 string data.
1088    ///
1089    /// When enabled, textual Avro fields are loaded into Arrow’s **StringViewArray**
1090    /// instead of the standard `StringArray`. This can improve performance for workloads
1091    /// with many short strings by reducing allocations.
1092    pub fn with_utf8_view(mut self, utf8_view: bool) -> Self {
1093        self.utf8_view = utf8_view;
1094        self
1095    }
1096
    /// Returns whether `StringViewArray` is enabled for string data.
    ///
    /// Reflects the value last set through `Self::with_utf8_view` (default: `false`).
    pub fn use_utf8view(&self) -> bool {
        self.utf8_view
    }
1101
1102    /// Enable stricter behavior for certain Avro unions (e.g., `[T, "null"]`).
1103    ///
1104    /// When `true`, ambiguous or lossy unions that would otherwise be coerced may instead
1105    /// produce a descriptive error. Use this to catch schema issues early during ingestion.
1106    pub fn with_strict_mode(mut self, strict_mode: bool) -> Self {
1107        self.strict_mode = strict_mode;
1108        self
1109    }
1110
1111    /// Sets the **reader schema** used during decoding.
1112    ///
1113    /// If not provided, the writer schema from the OCF header (for `Reader`) or the
1114    /// schema looked up from the fingerprint (for `Decoder`) is used directly.
1115    ///
1116    /// A reader schema can be used for **schema evolution** or **projection**.
1117    pub fn with_reader_schema(mut self, schema: AvroSchema) -> Self {
1118        self.reader_schema = Some(schema);
1119        self
1120    }
1121
1122    /// Sets the `SchemaStore` used to resolve writer schemas by fingerprint.
1123    ///
1124    /// This is required when building a `Decoder` for **single‑object encoding** or the
1125    /// **Confluent** wire format. The store maps a fingerprint (Rabin / MD5 / SHA‑256 /
1126    /// ID) to a full Avro schema.
1127    ///
1128    /// Defaults to `None`.
1129    pub fn with_writer_schema_store(mut self, store: SchemaStore) -> Self {
1130        self.writer_schema_store = Some(store);
1131        self
1132    }
1133
1134    /// Sets the initial schema fingerprint for stream decoding.
1135    ///
1136    /// This can be useful for streams that **do not include** a fingerprint before the first
1137    /// record body (uncommon). If not set, the first observed fingerprint is used.
1138    pub fn with_active_fingerprint(mut self, fp: Fingerprint) -> Self {
1139        self.active_fingerprint = Some(fp);
1140        self
1141    }
1142
1143    /// Build a `Reader` (OCF) from this builder and a `BufRead`.
1144    ///
1145    /// This reads and validates the OCF header, initializes an internal row decoder from
1146    /// the discovered writer (and optional reader) schema, and prepares to iterate blocks,
1147    /// decompressing if necessary.
1148    pub fn build<R: BufRead>(self, mut reader: R) -> Result<Reader<R>, ArrowError> {
1149        let header = read_header(&mut reader)?;
1150        let decoder = self.make_decoder(Some(&header), self.reader_schema.as_ref())?;
1151        Ok(Reader {
1152            reader,
1153            header,
1154            decoder,
1155            block_decoder: BlockDecoder::default(),
1156            block_data: Vec::new(),
1157            block_count: 0,
1158            block_cursor: 0,
1159            finished: false,
1160        })
1161    }
1162
1163    /// Build a streaming `Decoder` from this builder.
1164    ///
1165    /// # Requirements
1166    /// * `SchemaStore` **must** be provided via `Self::with_writer_schema_store`.
1167    /// * The store should contain **all** fingerprints that may appear on the stream.
1168    ///
1169    /// # Errors
1170    /// * Returns [`ArrowError::InvalidArgumentError`] if the schema store is missing
1171    pub fn build_decoder(self) -> Result<Decoder, ArrowError> {
1172        if self.writer_schema_store.is_none() {
1173            return Err(ArrowError::InvalidArgumentError(
1174                "Building a decoder requires a writer schema store".to_string(),
1175            ));
1176        }
1177        self.make_decoder(None, self.reader_schema.as_ref())
1178    }
1179}
1180
1181/// A high‑level Avro **Object Container File** reader.
1182///
1183/// `Reader` pulls blocks from a `BufRead` source, handles optional block compression,
1184/// and decodes them row‑by‑row into Arrow `RecordBatch` values using an internal
1185/// `Decoder`. It implements both:
1186///
1187/// * [`Iterator<Item = Result<RecordBatch, ArrowError>>`], and
1188/// * `RecordBatchReader`, guaranteeing a consistent schema across all produced batches.
1189///
#[derive(Debug)]
pub struct Reader<R: BufRead> {
    // Underlying buffered byte source, positioned after the OCF header.
    reader: R,
    // Parsed OCF header; consulted for the compression codec of each block.
    header: Header,
    // Row decoder that accumulates records into batches.
    decoder: Decoder,
    // Incremental decoder for OCF block framing.
    block_decoder: BlockDecoder,
    // Payload of the current block (decompressed when a codec is configured).
    block_data: Vec<u8>,
    // Number of records of the current block that are still to be decoded.
    block_count: usize,
    // Byte offset into `block_data` of the next undecoded record.
    block_cursor: usize,
    // Set once the underlying reader reports end of input.
    finished: bool,
}
1201
1202impl<R: BufRead> Reader<R> {
    /// Returns the Arrow schema discovered from the Avro file header (or derived via
    /// the optional reader schema).
    ///
    /// The value is taken from the internal `Decoder`.
    pub fn schema(&self) -> SchemaRef {
        self.decoder.schema()
    }
1208
    /// Returns a reference to the parsed Avro container‑file header (magic, metadata, codec, sync).
    ///
    /// This is the header that was read when the `Reader` was built.
    pub fn avro_header(&self) -> &Header {
        &self.header
    }
1213
1214    /// Reads the next `RecordBatch` from the Avro file, or `Ok(None)` on EOF.
1215    ///
1216    /// Batches are bounded by `batch_size`; a single OCF block may yield multiple batches,
1217    /// and a batch may also span multiple blocks.
1218    fn read(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
1219        'outer: while !self.finished && !self.decoder.batch_is_full() {
1220            while self.block_cursor == self.block_data.len() {
1221                let buf = self.reader.fill_buf()?;
1222                if buf.is_empty() {
1223                    self.finished = true;
1224                    break 'outer;
1225                }
1226                // Try to decode another block from the buffered reader.
1227                let consumed = self.block_decoder.decode(buf)?;
1228                self.reader.consume(consumed);
1229                if let Some(block) = self.block_decoder.flush() {
1230                    // Successfully decoded a block.
1231                    self.block_data = if let Some(ref codec) = self.header.compression()? {
1232                        codec.decompress(&block.data)?
1233                    } else {
1234                        block.data
1235                    };
1236                    self.block_count = block.count;
1237                    self.block_cursor = 0;
1238                } else if consumed == 0 {
1239                    // The block decoder made no progress on a non-empty buffer.
1240                    return Err(ArrowError::ParseError(
1241                        "Could not decode next Avro block from partial data".to_string(),
1242                    ));
1243                }
1244            }
1245            // Decode as many rows as will fit in the current batch
1246            if self.block_cursor < self.block_data.len() {
1247                let (consumed, records_decoded) = self
1248                    .decoder
1249                    .decode_block(&self.block_data[self.block_cursor..], self.block_count)?;
1250                self.block_cursor += consumed;
1251                self.block_count -= records_decoded;
1252            }
1253        }
1254        self.decoder.flush_block()
1255    }
1256}
1257
1258impl<R: BufRead> Iterator for Reader<R> {
1259    type Item = Result<RecordBatch, ArrowError>;
1260
1261    fn next(&mut self) -> Option<Self::Item> {
1262        self.read().transpose()
1263    }
1264}
1265
1266impl<R: BufRead> RecordBatchReader for Reader<R> {
1267    fn schema(&self) -> SchemaRef {
1268        self.schema()
1269    }
1270}
1271
1272#[cfg(test)]
1273mod test {
1274    use crate::codec::AvroFieldBuilder;
1275    use crate::reader::record::RecordDecoder;
1276    use crate::reader::{Decoder, Reader, ReaderBuilder};
1277    use crate::schema::{
1278        AVRO_ENUM_SYMBOLS_METADATA_KEY, AVRO_NAME_METADATA_KEY, AVRO_NAMESPACE_METADATA_KEY,
1279        AvroSchema, CONFLUENT_MAGIC, Fingerprint, FingerprintAlgorithm, PrimitiveType,
1280        SINGLE_OBJECT_MAGIC, SchemaStore,
1281    };
1282    use crate::test_util::arrow_test_data;
1283    use crate::writer::AvroWriter;
1284    use arrow_array::builder::{
1285        ArrayBuilder, BooleanBuilder, Float32Builder, Int32Builder, Int64Builder, ListBuilder,
1286        MapBuilder, StringBuilder, StructBuilder,
1287    };
1288    #[cfg(feature = "snappy")]
1289    use arrow_array::builder::{Float64Builder, MapFieldNames};
1290    use arrow_array::cast::AsArray;
1291    #[cfg(not(feature = "avro_custom_types"))]
1292    use arrow_array::types::Int64Type;
1293    #[cfg(feature = "avro_custom_types")]
1294    use arrow_array::types::{
1295        DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
1296        DurationSecondType,
1297    };
1298    use arrow_array::types::{Int32Type, IntervalMonthDayNanoType};
1299    use arrow_array::*;
1300    #[cfg(feature = "snappy")]
1301    use arrow_buffer::{Buffer, NullBuffer};
1302    use arrow_buffer::{IntervalMonthDayNano, OffsetBuffer, ScalarBuffer, i256};
1303    #[cfg(feature = "avro_custom_types")]
1304    use arrow_schema::{
1305        ArrowError, DataType, Field, FieldRef, Fields, IntervalUnit, Schema, TimeUnit, UnionFields,
1306        UnionMode,
1307    };
1308    #[cfg(not(feature = "avro_custom_types"))]
1309    use arrow_schema::{
1310        ArrowError, DataType, Field, FieldRef, Fields, IntervalUnit, Schema, UnionFields, UnionMode,
1311    };
1312    use bytes::Bytes;
1313    use futures::executor::block_on;
1314    use futures::{Stream, StreamExt, TryStreamExt, stream};
1315    use serde_json::{Value, json};
1316    use std::collections::HashMap;
1317    use std::fs::File;
1318    use std::io::{BufReader, Cursor};
1319    use std::sync::Arc;
1320
1321    fn files() -> impl Iterator<Item = &'static str> {
1322        [
1323            // TODO: avoid requiring snappy for this file
1324            #[cfg(feature = "snappy")]
1325            "avro/alltypes_plain.avro",
1326            #[cfg(feature = "snappy")]
1327            "avro/alltypes_plain.snappy.avro",
1328            #[cfg(feature = "zstd")]
1329            "avro/alltypes_plain.zstandard.avro",
1330            #[cfg(feature = "bzip2")]
1331            "avro/alltypes_plain.bzip2.avro",
1332            #[cfg(feature = "xz")]
1333            "avro/alltypes_plain.xz.avro",
1334        ]
1335        .into_iter()
1336    }
1337
1338    fn read_file(path: &str, batch_size: usize, utf8_view: bool) -> RecordBatch {
1339        let file = File::open(path).unwrap();
1340        let reader = ReaderBuilder::new()
1341            .with_batch_size(batch_size)
1342            .with_utf8_view(utf8_view)
1343            .build(BufReader::new(file))
1344            .unwrap();
1345        let schema = reader.schema();
1346        let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
1347        arrow::compute::concat_batches(&schema, &batches).unwrap()
1348    }
1349
1350    fn read_file_strict(
1351        path: &str,
1352        batch_size: usize,
1353        utf8_view: bool,
1354    ) -> Result<Reader<BufReader<File>>, ArrowError> {
1355        let file = File::open(path)?;
1356        ReaderBuilder::new()
1357            .with_batch_size(batch_size)
1358            .with_utf8_view(utf8_view)
1359            .with_strict_mode(true)
1360            .build(BufReader::new(file))
1361    }
1362
1363    fn decode_stream<S: Stream<Item = Bytes> + Unpin>(
1364        mut decoder: Decoder,
1365        mut input: S,
1366    ) -> impl Stream<Item = Result<RecordBatch, ArrowError>> {
1367        async_stream::try_stream! {
1368            if let Some(data) = input.next().await {
1369                let consumed = decoder.decode(&data)?;
1370                if consumed < data.len() {
1371                    Err(ArrowError::ParseError(
1372                        "did not consume all bytes".to_string(),
1373                    ))?;
1374                }
1375            }
1376            if let Some(batch) = decoder.flush()? {
1377                yield batch
1378            }
1379        }
1380    }
1381
1382    fn make_record_schema(pt: PrimitiveType) -> AvroSchema {
1383        let js = format!(
1384            r#"{{"type":"record","name":"TestRecord","fields":[{{"name":"a","type":"{}"}}]}}"#,
1385            pt.as_ref()
1386        );
1387        AvroSchema::new(js)
1388    }
1389
1390    fn make_two_schema_store() -> (
1391        SchemaStore,
1392        Fingerprint,
1393        Fingerprint,
1394        AvroSchema,
1395        AvroSchema,
1396    ) {
1397        let schema_int = make_record_schema(PrimitiveType::Int);
1398        let schema_long = make_record_schema(PrimitiveType::Long);
1399        let mut store = SchemaStore::new();
1400        let fp_int = store
1401            .register(schema_int.clone())
1402            .expect("register int schema");
1403        let fp_long = store
1404            .register(schema_long.clone())
1405            .expect("register long schema");
1406        (store, fp_int, fp_long, schema_int, schema_long)
1407    }
1408
1409    fn make_prefix(fp: Fingerprint) -> Vec<u8> {
1410        match fp {
1411            Fingerprint::Rabin(v) => {
1412                let mut out = Vec::with_capacity(2 + 8);
1413                out.extend_from_slice(&SINGLE_OBJECT_MAGIC);
1414                out.extend_from_slice(&v.to_le_bytes());
1415                out
1416            }
1417            Fingerprint::Id(v) => {
1418                panic!("make_prefix expects a Rabin fingerprint, got ({v})");
1419            }
1420            Fingerprint::Id64(v) => {
1421                panic!("make_prefix expects a Rabin fingerprint, got ({v})");
1422            }
1423            #[cfg(feature = "md5")]
1424            Fingerprint::MD5(v) => {
1425                panic!("make_prefix expects a Rabin fingerprint, got ({v:?})");
1426            }
1427            #[cfg(feature = "sha256")]
1428            Fingerprint::SHA256(id) => {
1429                panic!("make_prefix expects a Rabin fingerprint, got ({id:?})");
1430            }
1431        }
1432    }
1433
1434    fn make_decoder(store: &SchemaStore, fp: Fingerprint, reader_schema: &AvroSchema) -> Decoder {
1435        ReaderBuilder::new()
1436            .with_batch_size(8)
1437            .with_reader_schema(reader_schema.clone())
1438            .with_writer_schema_store(store.clone())
1439            .with_active_fingerprint(fp)
1440            .build_decoder()
1441            .expect("decoder")
1442    }
1443
1444    fn make_id_prefix(id: u32, additional: usize) -> Vec<u8> {
1445        let capacity = CONFLUENT_MAGIC.len() + size_of::<u32>() + additional;
1446        let mut out = Vec::with_capacity(capacity);
1447        out.extend_from_slice(&CONFLUENT_MAGIC);
1448        out.extend_from_slice(&id.to_be_bytes());
1449        out
1450    }
1451
1452    fn make_message_id(id: u32, value: i64) -> Vec<u8> {
1453        let encoded_value = encode_zigzag(value);
1454        let mut msg = make_id_prefix(id, encoded_value.len());
1455        msg.extend_from_slice(&encoded_value);
1456        msg
1457    }
1458
1459    fn make_id64_prefix(id: u64, additional: usize) -> Vec<u8> {
1460        let capacity = CONFLUENT_MAGIC.len() + size_of::<u64>() + additional;
1461        let mut out = Vec::with_capacity(capacity);
1462        out.extend_from_slice(&CONFLUENT_MAGIC);
1463        out.extend_from_slice(&id.to_be_bytes());
1464        out
1465    }
1466
1467    fn make_message_id64(id: u64, value: i64) -> Vec<u8> {
1468        let encoded_value = encode_zigzag(value);
1469        let mut msg = make_id64_prefix(id, encoded_value.len());
1470        msg.extend_from_slice(&encoded_value);
1471        msg
1472    }
1473
1474    fn make_value_schema(pt: PrimitiveType) -> AvroSchema {
1475        let json_schema = format!(
1476            r#"{{"type":"record","name":"S","fields":[{{"name":"v","type":"{}"}}]}}"#,
1477            pt.as_ref()
1478        );
1479        AvroSchema::new(json_schema)
1480    }
1481
1482    fn encode_zigzag(value: i64) -> Vec<u8> {
1483        let mut n = ((value << 1) ^ (value >> 63)) as u64;
1484        let mut out = Vec::new();
1485        loop {
1486            if (n & !0x7F) == 0 {
1487                out.push(n as u8);
1488                break;
1489            } else {
1490                out.push(((n & 0x7F) | 0x80) as u8);
1491                n >>= 7;
1492            }
1493        }
1494        out
1495    }
1496
1497    fn make_message(fp: Fingerprint, value: i64) -> Vec<u8> {
1498        let mut msg = make_prefix(fp);
1499        msg.extend_from_slice(&encode_zigzag(value));
1500        msg
1501    }
1502
1503    fn load_writer_schema_json(path: &str) -> Value {
1504        let file = File::open(path).unwrap();
1505        let header = super::read_header(BufReader::new(file)).unwrap();
1506        let schema = header.schema().unwrap().unwrap();
1507        serde_json::to_value(&schema).unwrap()
1508    }
1509
1510    fn make_reader_schema_with_promotions(
1511        path: &str,
1512        promotions: &HashMap<&str, &str>,
1513    ) -> AvroSchema {
1514        let mut root = load_writer_schema_json(path);
1515        assert_eq!(root["type"], "record", "writer schema must be a record");
1516        let fields = root
1517            .get_mut("fields")
1518            .and_then(|f| f.as_array_mut())
1519            .expect("record has fields");
1520        for f in fields.iter_mut() {
1521            let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
1522                continue;
1523            };
1524            if let Some(new_ty) = promotions.get(name) {
1525                let ty = f.get_mut("type").expect("field has a type");
1526                match ty {
1527                    Value::String(_) => {
1528                        *ty = Value::String((*new_ty).to_string());
1529                    }
1530                    // Union
1531                    Value::Array(arr) => {
1532                        for b in arr.iter_mut() {
1533                            match b {
1534                                Value::String(s) if s != "null" => {
1535                                    *b = Value::String((*new_ty).to_string());
1536                                    break;
1537                                }
1538                                Value::Object(_) => {
1539                                    *b = Value::String((*new_ty).to_string());
1540                                    break;
1541                                }
1542                                _ => {}
1543                            }
1544                        }
1545                    }
1546                    Value::Object(_) => {
1547                        *ty = Value::String((*new_ty).to_string());
1548                    }
1549                    _ => {}
1550                }
1551            }
1552        }
1553        AvroSchema::new(root.to_string())
1554    }
1555
1556    fn make_reader_schema_with_enum_remap(
1557        path: &str,
1558        remap: &HashMap<&str, Vec<&str>>,
1559    ) -> AvroSchema {
1560        let mut root = load_writer_schema_json(path);
1561        assert_eq!(root["type"], "record", "writer schema must be a record");
1562        let fields = root
1563            .get_mut("fields")
1564            .and_then(|f| f.as_array_mut())
1565            .expect("record has fields");
1566
1567        fn to_symbols_array(symbols: &[&str]) -> Value {
1568            Value::Array(symbols.iter().map(|s| Value::String((*s).into())).collect())
1569        }
1570
1571        fn update_enum_symbols(ty: &mut Value, symbols: &Value) {
1572            match ty {
1573                Value::Object(map) => {
1574                    if matches!(map.get("type"), Some(Value::String(t)) if t == "enum") {
1575                        map.insert("symbols".to_string(), symbols.clone());
1576                    }
1577                }
1578                Value::Array(arr) => {
1579                    for b in arr.iter_mut() {
1580                        if let Value::Object(map) = b {
1581                            if matches!(map.get("type"), Some(Value::String(t)) if t == "enum") {
1582                                map.insert("symbols".to_string(), symbols.clone());
1583                            }
1584                        }
1585                    }
1586                }
1587                _ => {}
1588            }
1589        }
1590        for f in fields.iter_mut() {
1591            let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
1592                continue;
1593            };
1594            if let Some(new_symbols) = remap.get(name) {
1595                let symbols_val = to_symbols_array(new_symbols);
1596                let ty = f.get_mut("type").expect("field has a type");
1597                update_enum_symbols(ty, &symbols_val);
1598            }
1599        }
1600        AvroSchema::new(root.to_string())
1601    }
1602
1603    fn read_alltypes_with_reader_schema(path: &str, reader_schema: AvroSchema) -> RecordBatch {
1604        let file = File::open(path).unwrap();
1605        let reader = ReaderBuilder::new()
1606            .with_batch_size(1024)
1607            .with_utf8_view(false)
1608            .with_reader_schema(reader_schema)
1609            .build(BufReader::new(file))
1610            .unwrap();
1611        let schema = reader.schema();
1612        let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
1613        arrow::compute::concat_batches(&schema, &batches).unwrap()
1614    }
1615
1616    fn make_reader_schema_with_selected_fields_in_order(
1617        path: &str,
1618        selected: &[&str],
1619    ) -> AvroSchema {
1620        let mut root = load_writer_schema_json(path);
1621        assert_eq!(root["type"], "record", "writer schema must be a record");
1622        let writer_fields = root
1623            .get("fields")
1624            .and_then(|f| f.as_array())
1625            .expect("record has fields");
1626        let mut field_map: HashMap<String, Value> = HashMap::with_capacity(writer_fields.len());
1627        for f in writer_fields {
1628            if let Some(name) = f.get("name").and_then(|n| n.as_str()) {
1629                field_map.insert(name.to_string(), f.clone());
1630            }
1631        }
1632        let mut new_fields = Vec::with_capacity(selected.len());
1633        for name in selected {
1634            let f = field_map
1635                .get(*name)
1636                .unwrap_or_else(|| panic!("field '{name}' not found in writer schema"))
1637                .clone();
1638            new_fields.push(f);
1639        }
1640        root["fields"] = Value::Array(new_fields);
1641        AvroSchema::new(root.to_string())
1642    }
1643
1644    fn write_ocf(schema: &Schema, batches: &[RecordBatch]) -> Vec<u8> {
1645        let mut w = AvroWriter::new(Vec::<u8>::new(), schema.clone()).expect("writer");
1646        for b in batches {
1647            w.write(b).expect("write");
1648        }
1649        w.finish().expect("finish");
1650        w.into_inner()
1651    }
1652
1653    #[test]
1654    fn writer_string_reader_nullable_with_alias() -> Result<(), Box<dyn std::error::Error>> {
1655        // Writer: { id: long, name: string }
1656        let writer_schema = Schema::new(vec![
1657            Field::new("id", DataType::Int64, false),
1658            Field::new("name", DataType::Utf8, false),
1659        ]);
1660        let batch = RecordBatch::try_new(
1661            Arc::new(writer_schema.clone()),
1662            vec![
1663                Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef,
1664                Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
1665            ],
1666        )?;
1667        let bytes = write_ocf(&writer_schema, &[batch]);
1668        let reader_json = r#"
1669    {
1670      "type": "record",
1671      "name": "topLevelRecord",
1672      "fields": [
1673        { "name": "id", "type": "long" },
1674        { "name": "full_name", "type": ["null","string"], "aliases": ["name"], "default": null },
1675        { "name": "is_active", "type": "boolean", "default": true }
1676      ]
1677    }"#;
1678        let mut reader = ReaderBuilder::new()
1679            .with_reader_schema(AvroSchema::new(reader_json.to_string()))
1680            .build(Cursor::new(bytes))?;
1681        let out = reader.next().unwrap()?;
1682        // Evolved aliased field should be non-null and match original writer values
1683        let full_name = out.column(1).as_string::<i32>();
1684        assert_eq!(full_name.value(0), "a");
1685        assert_eq!(full_name.value(1), "b");
1686
1687        Ok(())
1688    }
1689
1690    #[test]
1691    fn writer_string_reader_string_null_order_second() -> Result<(), Box<dyn std::error::Error>> {
1692        // Writer: { name: string }
1693        let writer_schema = Schema::new(vec![Field::new("name", DataType::Utf8, false)]);
1694        let batch = RecordBatch::try_new(
1695            Arc::new(writer_schema.clone()),
1696            vec![Arc::new(StringArray::from(vec!["x", "y"])) as ArrayRef],
1697        )?;
1698        let bytes = write_ocf(&writer_schema, &[batch]);
1699
1700        // Reader: ["string","null"] (NullSecond)
1701        let reader_json = r#"
1702    {
1703      "type":"record", "name":"topLevelRecord",
1704      "fields":[ { "name":"name", "type":["string","null"], "default":"x" } ]
1705    }"#;
1706
1707        let mut reader = ReaderBuilder::new()
1708            .with_reader_schema(AvroSchema::new(reader_json.to_string()))
1709            .build(Cursor::new(bytes))?;
1710
1711        let out = reader.next().unwrap()?;
1712        assert_eq!(out.num_rows(), 2);
1713
1714        // Should decode as non-null strings (writer non-union -> reader union)
1715        let name = out.column(0).as_string::<i32>();
1716        assert_eq!(name.value(0), "x");
1717        assert_eq!(name.value(1), "y");
1718
1719        Ok(())
1720    }
1721
1722    #[test]
1723    fn promotion_writer_int_reader_nullable_long() -> Result<(), Box<dyn std::error::Error>> {
1724        // Writer: { v: int }
1725        let writer_schema = Schema::new(vec![Field::new("v", DataType::Int32, false)]);
1726        let batch = RecordBatch::try_new(
1727            Arc::new(writer_schema.clone()),
1728            vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
1729        )?;
1730        let bytes = write_ocf(&writer_schema, &[batch]);
1731
1732        // Reader: { v: ["null","long"] }
1733        let reader_json = r#"
1734    {
1735      "type":"record", "name":"topLevelRecord",
1736      "fields":[ { "name":"v", "type":["null","long"], "default": null } ]
1737    }"#;
1738
1739        let mut reader = ReaderBuilder::new()
1740            .with_reader_schema(AvroSchema::new(reader_json.to_string()))
1741            .build(Cursor::new(bytes))?;
1742
1743        let out = reader.next().unwrap()?;
1744        assert_eq!(out.num_rows(), 3);
1745
1746        // Should have promoted to Int64 and be non-null (no union tag in writer)
1747        let v = out
1748            .column(0)
1749            .as_primitive::<arrow_array::types::Int64Type>();
1750        assert_eq!(v.values(), &[1, 2, 3]);
1751        assert!(
1752            out.column(0).nulls().is_none(),
1753            "expected no validity bitmap for all-valid column"
1754        );
1755
1756        Ok(())
1757    }
1758
1759    #[test]
1760    fn test_alltypes_schema_promotion_mixed() {
1761        for file in files() {
1762            let file = arrow_test_data(file);
1763            let mut promotions: HashMap<&str, &str> = HashMap::new();
1764            promotions.insert("id", "long");
1765            promotions.insert("tinyint_col", "float");
1766            promotions.insert("smallint_col", "double");
1767            promotions.insert("int_col", "double");
1768            promotions.insert("bigint_col", "double");
1769            promotions.insert("float_col", "double");
1770            promotions.insert("date_string_col", "string");
1771            promotions.insert("string_col", "string");
1772            let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
1773            let batch = read_alltypes_with_reader_schema(&file, reader_schema);
1774            let expected = RecordBatch::try_from_iter_with_nullable([
1775                (
1776                    "id",
1777                    Arc::new(Int64Array::from(vec![4i64, 5, 6, 7, 2, 3, 0, 1])) as _,
1778                    true,
1779                ),
1780                (
1781                    "bool_col",
1782                    Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
1783                    true,
1784                ),
1785                (
1786                    "tinyint_col",
1787                    Arc::new(Float32Array::from_iter_values(
1788                        (0..8).map(|x| (x % 2) as f32),
1789                    )) as _,
1790                    true,
1791                ),
1792                (
1793                    "smallint_col",
1794                    Arc::new(Float64Array::from_iter_values(
1795                        (0..8).map(|x| (x % 2) as f64),
1796                    )) as _,
1797                    true,
1798                ),
1799                (
1800                    "int_col",
1801                    Arc::new(Float64Array::from_iter_values(
1802                        (0..8).map(|x| (x % 2) as f64),
1803                    )) as _,
1804                    true,
1805                ),
1806                (
1807                    "bigint_col",
1808                    Arc::new(Float64Array::from_iter_values(
1809                        (0..8).map(|x| ((x % 2) * 10) as f64),
1810                    )) as _,
1811                    true,
1812                ),
1813                (
1814                    "float_col",
1815                    Arc::new(Float64Array::from_iter_values(
1816                        (0..8).map(|x| ((x % 2) as f32 * 1.1f32) as f64),
1817                    )) as _,
1818                    true,
1819                ),
1820                (
1821                    "double_col",
1822                    Arc::new(Float64Array::from_iter_values(
1823                        (0..8).map(|x| (x % 2) as f64 * 10.1),
1824                    )) as _,
1825                    true,
1826                ),
1827                (
1828                    "date_string_col",
1829                    Arc::new(StringArray::from(vec![
1830                        "03/01/09", "03/01/09", "04/01/09", "04/01/09", "02/01/09", "02/01/09",
1831                        "01/01/09", "01/01/09",
1832                    ])) as _,
1833                    true,
1834                ),
1835                (
1836                    "string_col",
1837                    Arc::new(StringArray::from(
1838                        (0..8)
1839                            .map(|x| if x % 2 == 0 { "0" } else { "1" })
1840                            .collect::<Vec<_>>(),
1841                    )) as _,
1842                    true,
1843                ),
1844                (
1845                    "timestamp_col",
1846                    Arc::new(
1847                        TimestampMicrosecondArray::from_iter_values([
1848                            1235865600000000, // 2009-03-01T00:00:00.000
1849                            1235865660000000, // 2009-03-01T00:01:00.000
1850                            1238544000000000, // 2009-04-01T00:00:00.000
1851                            1238544060000000, // 2009-04-01T00:01:00.000
1852                            1233446400000000, // 2009-02-01T00:00:00.000
1853                            1233446460000000, // 2009-02-01T00:01:00.000
1854                            1230768000000000, // 2009-01-01T00:00:00.000
1855                            1230768060000000, // 2009-01-01T00:01:00.000
1856                        ])
1857                        .with_timezone("+00:00"),
1858                    ) as _,
1859                    true,
1860                ),
1861            ])
1862            .unwrap();
1863            assert_eq!(batch, expected, "mismatch for file {file}");
1864        }
1865    }
1866
1867    #[test]
1868    fn test_alltypes_schema_promotion_long_to_float_only() {
1869        for file in files() {
1870            let file = arrow_test_data(file);
1871            let mut promotions: HashMap<&str, &str> = HashMap::new();
1872            promotions.insert("bigint_col", "float");
1873            let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
1874            let batch = read_alltypes_with_reader_schema(&file, reader_schema);
1875            let expected = RecordBatch::try_from_iter_with_nullable([
1876                (
1877                    "id",
1878                    Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
1879                    true,
1880                ),
1881                (
1882                    "bool_col",
1883                    Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
1884                    true,
1885                ),
1886                (
1887                    "tinyint_col",
1888                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
1889                    true,
1890                ),
1891                (
1892                    "smallint_col",
1893                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
1894                    true,
1895                ),
1896                (
1897                    "int_col",
1898                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
1899                    true,
1900                ),
1901                (
1902                    "bigint_col",
1903                    Arc::new(Float32Array::from_iter_values(
1904                        (0..8).map(|x| ((x % 2) * 10) as f32),
1905                    )) as _,
1906                    true,
1907                ),
1908                (
1909                    "float_col",
1910                    Arc::new(Float32Array::from_iter_values(
1911                        (0..8).map(|x| (x % 2) as f32 * 1.1),
1912                    )) as _,
1913                    true,
1914                ),
1915                (
1916                    "double_col",
1917                    Arc::new(Float64Array::from_iter_values(
1918                        (0..8).map(|x| (x % 2) as f64 * 10.1),
1919                    )) as _,
1920                    true,
1921                ),
1922                (
1923                    "date_string_col",
1924                    Arc::new(BinaryArray::from_iter_values([
1925                        [48, 51, 47, 48, 49, 47, 48, 57],
1926                        [48, 51, 47, 48, 49, 47, 48, 57],
1927                        [48, 52, 47, 48, 49, 47, 48, 57],
1928                        [48, 52, 47, 48, 49, 47, 48, 57],
1929                        [48, 50, 47, 48, 49, 47, 48, 57],
1930                        [48, 50, 47, 48, 49, 47, 48, 57],
1931                        [48, 49, 47, 48, 49, 47, 48, 57],
1932                        [48, 49, 47, 48, 49, 47, 48, 57],
1933                    ])) as _,
1934                    true,
1935                ),
1936                (
1937                    "string_col",
1938                    Arc::new(BinaryArray::from_iter_values((0..8).map(|x| [48 + x % 2]))) as _,
1939                    true,
1940                ),
1941                (
1942                    "timestamp_col",
1943                    Arc::new(
1944                        TimestampMicrosecondArray::from_iter_values([
1945                            1235865600000000, // 2009-03-01T00:00:00.000
1946                            1235865660000000, // 2009-03-01T00:01:00.000
1947                            1238544000000000, // 2009-04-01T00:00:00.000
1948                            1238544060000000, // 2009-04-01T00:01:00.000
1949                            1233446400000000, // 2009-02-01T00:00:00.000
1950                            1233446460000000, // 2009-02-01T00:01:00.000
1951                            1230768000000000, // 2009-01-01T00:00:00.000
1952                            1230768060000000, // 2009-01-01T00:01:00.000
1953                        ])
1954                        .with_timezone("+00:00"),
1955                    ) as _,
1956                    true,
1957                ),
1958            ])
1959            .unwrap();
1960            assert_eq!(batch, expected, "mismatch for file {file}");
1961        }
1962    }
1963
1964    #[test]
1965    fn test_alltypes_schema_promotion_bytes_to_string_only() {
1966        for file in files() {
1967            let file = arrow_test_data(file);
1968            let mut promotions: HashMap<&str, &str> = HashMap::new();
1969            promotions.insert("date_string_col", "string");
1970            promotions.insert("string_col", "string");
1971            let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
1972            let batch = read_alltypes_with_reader_schema(&file, reader_schema);
1973            let expected = RecordBatch::try_from_iter_with_nullable([
1974                (
1975                    "id",
1976                    Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
1977                    true,
1978                ),
1979                (
1980                    "bool_col",
1981                    Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
1982                    true,
1983                ),
1984                (
1985                    "tinyint_col",
1986                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
1987                    true,
1988                ),
1989                (
1990                    "smallint_col",
1991                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
1992                    true,
1993                ),
1994                (
1995                    "int_col",
1996                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
1997                    true,
1998                ),
1999                (
2000                    "bigint_col",
2001                    Arc::new(Int64Array::from_iter_values((0..8).map(|x| (x % 2) * 10))) as _,
2002                    true,
2003                ),
2004                (
2005                    "float_col",
2006                    Arc::new(Float32Array::from_iter_values(
2007                        (0..8).map(|x| (x % 2) as f32 * 1.1),
2008                    )) as _,
2009                    true,
2010                ),
2011                (
2012                    "double_col",
2013                    Arc::new(Float64Array::from_iter_values(
2014                        (0..8).map(|x| (x % 2) as f64 * 10.1),
2015                    )) as _,
2016                    true,
2017                ),
2018                (
2019                    "date_string_col",
2020                    Arc::new(StringArray::from(vec![
2021                        "03/01/09", "03/01/09", "04/01/09", "04/01/09", "02/01/09", "02/01/09",
2022                        "01/01/09", "01/01/09",
2023                    ])) as _,
2024                    true,
2025                ),
2026                (
2027                    "string_col",
2028                    Arc::new(StringArray::from(
2029                        (0..8)
2030                            .map(|x| if x % 2 == 0 { "0" } else { "1" })
2031                            .collect::<Vec<_>>(),
2032                    )) as _,
2033                    true,
2034                ),
2035                (
2036                    "timestamp_col",
2037                    Arc::new(
2038                        TimestampMicrosecondArray::from_iter_values([
2039                            1235865600000000, // 2009-03-01T00:00:00.000
2040                            1235865660000000, // 2009-03-01T00:01:00.000
2041                            1238544000000000, // 2009-04-01T00:00:00.000
2042                            1238544060000000, // 2009-04-01T00:01:00.000
2043                            1233446400000000, // 2009-02-01T00:00:00.000
2044                            1233446460000000, // 2009-02-01T00:01:00.000
2045                            1230768000000000, // 2009-01-01T00:00:00.000
2046                            1230768060000000, // 2009-01-01T00:01:00.000
2047                        ])
2048                        .with_timezone("+00:00"),
2049                    ) as _,
2050                    true,
2051                ),
2052            ])
2053            .unwrap();
2054            assert_eq!(batch, expected, "mismatch for file {file}");
2055        }
2056    }
2057
2058    #[test]
2059    // TODO: avoid requiring snappy for this file
2060    #[cfg(feature = "snappy")]
2061    fn test_alltypes_illegal_promotion_bool_to_double_errors() {
2062        let file = arrow_test_data("avro/alltypes_plain.avro");
2063        let mut promotions: HashMap<&str, &str> = HashMap::new();
2064        promotions.insert("bool_col", "double"); // illegal
2065        let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
2066        let file_handle = File::open(&file).unwrap();
2067        let result = ReaderBuilder::new()
2068            .with_reader_schema(reader_schema)
2069            .build(BufReader::new(file_handle));
2070        let err = result.expect_err("expected illegal promotion to error");
2071        let msg = err.to_string();
2072        assert!(
2073            msg.contains("Illegal promotion") || msg.contains("illegal promotion"),
2074            "unexpected error: {msg}"
2075        );
2076    }
2077
2078    #[test]
2079    fn test_simple_enum_with_reader_schema_mapping() {
2080        let file = arrow_test_data("avro/simple_enum.avro");
2081        let mut remap: HashMap<&str, Vec<&str>> = HashMap::new();
2082        remap.insert("f1", vec!["d", "c", "b", "a"]);
2083        remap.insert("f2", vec!["h", "g", "f", "e"]);
2084        remap.insert("f3", vec!["k", "i", "j"]);
2085        let reader_schema = make_reader_schema_with_enum_remap(&file, &remap);
2086        let actual = read_alltypes_with_reader_schema(&file, reader_schema);
2087        let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
2088        // f1
2089        let f1_keys = Int32Array::from(vec![3, 2, 1, 0]);
2090        let f1_vals = StringArray::from(vec!["d", "c", "b", "a"]);
2091        let f1 = DictionaryArray::<Int32Type>::try_new(f1_keys, Arc::new(f1_vals)).unwrap();
2092        let mut md_f1 = HashMap::new();
2093        md_f1.insert(
2094            AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
2095            r#"["d","c","b","a"]"#.to_string(),
2096        );
2097        // New named-type metadata
2098        md_f1.insert("avro.name".to_string(), "enum1".to_string());
2099        md_f1.insert("avro.namespace".to_string(), "ns1".to_string());
2100        let f1_field = Field::new("f1", dict_type.clone(), false).with_metadata(md_f1);
2101        // f2
2102        let f2_keys = Int32Array::from(vec![1, 0, 3, 2]);
2103        let f2_vals = StringArray::from(vec!["h", "g", "f", "e"]);
2104        let f2 = DictionaryArray::<Int32Type>::try_new(f2_keys, Arc::new(f2_vals)).unwrap();
2105        let mut md_f2 = HashMap::new();
2106        md_f2.insert(
2107            AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
2108            r#"["h","g","f","e"]"#.to_string(),
2109        );
2110        // New named-type metadata
2111        md_f2.insert("avro.name".to_string(), "enum2".to_string());
2112        md_f2.insert("avro.namespace".to_string(), "ns2".to_string());
2113        let f2_field = Field::new("f2", dict_type.clone(), false).with_metadata(md_f2);
2114        // f3
2115        let f3_keys = Int32Array::from(vec![Some(2), Some(0), None, Some(1)]);
2116        let f3_vals = StringArray::from(vec!["k", "i", "j"]);
2117        let f3 = DictionaryArray::<Int32Type>::try_new(f3_keys, Arc::new(f3_vals)).unwrap();
2118        let mut md_f3 = HashMap::new();
2119        md_f3.insert(
2120            AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
2121            r#"["k","i","j"]"#.to_string(),
2122        );
2123        // New named-type metadata
2124        md_f3.insert("avro.name".to_string(), "enum3".to_string());
2125        md_f3.insert("avro.namespace".to_string(), "ns1".to_string());
2126        let f3_field = Field::new("f3", dict_type.clone(), true).with_metadata(md_f3);
2127        let expected_schema = Arc::new(Schema::new(vec![f1_field, f2_field, f3_field]));
2128        let expected = RecordBatch::try_new(
2129            expected_schema,
2130            vec![Arc::new(f1) as ArrayRef, Arc::new(f2), Arc::new(f3)],
2131        )
2132        .unwrap();
2133        assert_eq!(actual, expected);
2134    }
2135
2136    #[test]
2137    fn test_schema_store_register_lookup() {
2138        let schema_int = make_record_schema(PrimitiveType::Int);
2139        let schema_long = make_record_schema(PrimitiveType::Long);
2140        let mut store = SchemaStore::new();
2141        let fp_int = store.register(schema_int.clone()).unwrap();
2142        let fp_long = store.register(schema_long.clone()).unwrap();
2143        assert_eq!(store.lookup(&fp_int).cloned(), Some(schema_int));
2144        assert_eq!(store.lookup(&fp_long).cloned(), Some(schema_long));
2145        assert_eq!(store.fingerprint_algorithm(), FingerprintAlgorithm::Rabin);
2146    }
2147
/// Feeding the decoder a prefix built from a fingerprint the test expects to be unknown
/// must fail with an error mentioning "Unknown fingerprint".
#[test]
fn test_unknown_fingerprint_is_error() {
    let (store, fp_int, _, _, long_schema) = make_two_schema_store();
    let bogus_prefix = make_prefix(Fingerprint::Rabin(0xDEAD_BEEF_DEAD_BEEF));
    let mut decoder = make_decoder(&store, fp_int, &long_schema);
    let msg = decoder
        .decode(&bogus_prefix)
        .expect_err("decode should error")
        .to_string();
    assert!(
        msg.contains("Unknown fingerprint"),
        "unexpected message: {msg}"
    );
}
2161
/// With only the first byte of `SINGLE_OBJECT_MAGIC` available, `handle_prefix` must
/// report `Some(0)` and leave `pending_schema` unset.
#[test]
fn test_handle_prefix_incomplete_magic() {
    let (store, fp_int, _, _, long_schema) = make_two_schema_store();
    let mut decoder = make_decoder(&store, fp_int, &long_schema);
    let truncated_magic = &SINGLE_OBJECT_MAGIC[..1];
    let consumed = decoder.handle_prefix(truncated_magic).unwrap();
    assert_eq!(consumed, Some(0));
    assert!(decoder.pending_schema.is_none());
}
2171
/// Bytes that do not start with the expected magic make `handle_prefix` return `None`.
#[test]
fn test_handle_prefix_magic_mismatch() {
    let (store, fp_int, _, _, long_schema) = make_two_schema_store();
    let mut decoder = make_decoder(&store, fp_int, &long_schema);
    let not_magic: [u8; 3] = [0xFF, 0x00, 0x01];
    let outcome = decoder.handle_prefix(&not_magic).unwrap();
    assert!(outcome.is_none());
}
2180
/// A buffer holding the full magic but only the first four little-endian bytes of the
/// Rabin fingerprint makes `handle_prefix` report `Some(0)` without setting a pending schema.
#[test]
fn test_handle_prefix_incomplete_fingerprint() {
    let (store, fp_int, fp_long, _, long_schema) = make_two_schema_store();
    let mut decoder = make_decoder(&store, fp_int, &long_schema);
    // Only a Rabin fingerprint is acceptable here; any other variant aborts the test.
    let rabin_le = match fp_long {
        Fingerprint::Rabin(v) => v.to_le_bytes(),
        Fingerprint::Id(id) => panic!("expected Rabin fingerprint, got ({id})"),
        Fingerprint::Id64(id) => panic!("expected Rabin fingerprint, got ({id})"),
        #[cfg(feature = "md5")]
        Fingerprint::MD5(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
        #[cfg(feature = "sha256")]
        Fingerprint::SHA256(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
    };
    let mut truncated = SINGLE_OBJECT_MAGIC.to_vec();
    truncated.extend_from_slice(&rabin_le[..4]);
    let consumed = decoder.handle_prefix(&truncated).unwrap();
    assert_eq!(consumed, Some(0));
    assert!(decoder.pending_schema.is_none());
}
2200
/// A complete magic + Rabin fingerprint prefix is consumed in full and leaves the matching
/// fingerprint recorded in `pending_schema`.
#[test]
fn test_handle_prefix_valid_prefix_switches_schema() {
    let (store, fp_int, fp_long, _, long_schema) = make_two_schema_store();
    let mut decoder = make_decoder(&store, fp_int, &long_schema);
    // Pre-populate the decoder cache with a record decoder built from the long schema.
    let parsed_long = long_schema.schema().unwrap();
    let long_root = AvroFieldBuilder::new(&parsed_long).build().unwrap();
    let cached_decoder = RecordDecoder::try_new_with_options(long_root.data_type()).unwrap();
    let _ = decoder.cache.insert(fp_long, cached_decoder);
    // Only a Rabin fingerprint is acceptable here; any other variant aborts the test.
    let rabin_le = match fp_long {
        Fingerprint::Rabin(v) => v.to_le_bytes(),
        Fingerprint::Id(id) => panic!("expected Rabin fingerprint, got ({id})"),
        Fingerprint::Id64(id) => panic!("expected Rabin fingerprint, got ({id})"),
        #[cfg(feature = "md5")]
        Fingerprint::MD5(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
        #[cfg(feature = "sha256")]
        Fingerprint::SHA256(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
    };
    let mut prefix = SINGLE_OBJECT_MAGIC.to_vec();
    prefix.extend_from_slice(&rabin_le);
    let consumed = decoder.handle_prefix(&prefix).unwrap().unwrap();
    assert_eq!(consumed, prefix.len());
    let pending = decoder.pending_schema.as_ref();
    assert!(pending.is_some());
    assert_eq!(pending.unwrap().0, fp_long);
}
2224
/// Two concatenated messages carrying the same fingerprint decode into one two-row
/// Int32 batch.
#[test]
fn test_two_messages_same_schema() {
    let schema = make_value_schema(PrimitiveType::Int);
    let mut store = SchemaStore::new();
    let fp = store.register(schema.clone()).unwrap();
    let first = make_message(fp, 42);
    let second = make_message(fp, 11);
    let input = [first, second].concat();
    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_reader_schema(schema)
        .with_writer_schema_store(store)
        .with_active_fingerprint(fp)
        .build_decoder()
        .unwrap();
    let _ = decoder.decode(&input).unwrap();
    let batch = decoder.flush().unwrap().expect("batch");
    assert_eq!(batch.num_rows(), 2);
    let values = batch
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    for (row, want) in [42, 11].into_iter().enumerate() {
        assert_eq!(values.value(row), want);
    }
}
2252
/// Decoding an int-schema message and then a long-schema message, flushing after each,
/// produces an Int32 batch followed by an Int64 batch.
#[test]
fn test_two_messages_schema_switch() {
    let int_schema = make_value_schema(PrimitiveType::Int);
    let long_schema = make_value_schema(PrimitiveType::Long);
    let mut store = SchemaStore::new();
    let fp_int = store.register(int_schema).unwrap();
    let fp_long = store.register(long_schema).unwrap();
    let int_message = make_message(fp_int, 1);
    let long_message = make_message(fp_long, 123456789_i64);
    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_writer_schema_store(store)
        .with_active_fingerprint(fp_int)
        .build_decoder()
        .unwrap();

    // First message: decoded with the initially active (int) fingerprint.
    let _ = decoder.decode(&int_message).unwrap();
    let first_batch = decoder.flush().unwrap().expect("batch1");
    assert_eq!(first_batch.num_rows(), 1);
    let int_values = first_batch
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    assert_eq!(int_values.value(0), 1);

    // Second message: carries the long fingerprint, so the column type changes.
    let _ = decoder.decode(&long_message).unwrap();
    let second_batch = decoder.flush().unwrap().expect("batch2");
    assert_eq!(second_batch.num_rows(), 1);
    let long_values = second_batch
        .column(0)
        .as_any()
        .downcast_ref::<Int64Array>()
        .unwrap();
    assert_eq!(long_values.value(0), 123456789_i64);
}
2293
/// Two concatenated messages framed by `make_message_id` with the same `u32` schema id
/// decode into one two-row Int32 batch.
#[test]
fn test_two_messages_same_schema_id() {
    let writer_schema = make_value_schema(PrimitiveType::Int);
    let id = 100u32;
    // The store is created with the `Id` fingerprint algorithm, and the schema is
    // registered explicitly under `Fingerprint::Id(id)` via `set`.
    let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
    let _ = store
        .set(Fingerprint::Id(id), writer_schema.clone())
        .expect("set id schema");
    let input = [make_message_id(id, 21), make_message_id(id, 22)].concat();
    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_reader_schema(writer_schema)
        .with_writer_schema_store(store)
        .with_active_fingerprint(Fingerprint::Id(id))
        .build_decoder()
        .unwrap();
    let _ = decoder.decode(&input).unwrap();
    let batch = decoder.flush().unwrap().expect("batch");
    assert_eq!(batch.num_rows(), 2);
    let col = batch
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    assert_eq!(col.value(0), 21);
    assert_eq!(col.value(1), 22);
}
2325
/// A prefix built by `make_id_prefix` for an id that was never stored must make `decode`
/// fail with an error mentioning "Unknown fingerprint".
#[test]
fn test_unknown_id_fingerprint_is_error() {
    let schema = make_value_schema(PrimitiveType::Int);
    let (registered_id, missing_id) = (7u32, 9u32);
    let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
    let _ = store
        .set(Fingerprint::Id(registered_id), schema.clone())
        .expect("set id schema");
    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_reader_schema(schema)
        .with_writer_schema_store(store)
        .with_active_fingerprint(Fingerprint::Id(registered_id))
        .build_decoder()
        .unwrap();
    let unknown_prefix = make_id_prefix(missing_id, 0);
    let msg = decoder
        .decode(&unknown_prefix)
        .expect_err("decode should error")
        .to_string();
    assert!(
        msg.contains("Unknown fingerprint"),
        "unexpected message: {msg}"
    );
}
2350
/// With an `Id`-keyed store, handing `handle_prefix` an empty slice of `CONFLUENT_MAGIC`
/// must yield `Some(0)` and leave `pending_schema` unset.
#[test]
fn test_handle_prefix_id_incomplete_magic() {
    let schema = make_value_schema(PrimitiveType::Int);
    let schema_id = Fingerprint::Id(5u32);
    let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
    let _ = store
        .set(schema_id, schema.clone())
        .expect("set id schema");
    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_reader_schema(schema)
        .with_writer_schema_store(store)
        .with_active_fingerprint(schema_id)
        .build_decoder()
        .unwrap();
    // Zero bytes of the magic: an empty, and therefore incomplete, prefix.
    let empty_magic = &CONFLUENT_MAGIC[..0];
    let consumed = decoder.handle_prefix(empty_magic).unwrap();
    assert_eq!(consumed, Some(0));
    assert!(decoder.pending_schema.is_none());
}
2371
/// Two concatenated messages framed by `make_message_id64` with the same `u64` schema id
/// decode into one two-row Int32 batch.
#[test]
fn test_two_messages_same_schema_id64() {
    let writer_schema = make_value_schema(PrimitiveType::Int);
    let id = 100u64;
    // The store is created with the `Id64` fingerprint algorithm, and the schema is
    // registered explicitly under `Fingerprint::Id64(id)` via `set`.
    let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id64);
    let _ = store
        .set(Fingerprint::Id64(id), writer_schema.clone())
        .expect("set id schema");
    let input = [make_message_id64(id, 21), make_message_id64(id, 22)].concat();
    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_reader_schema(writer_schema)
        .with_writer_schema_store(store)
        .with_active_fingerprint(Fingerprint::Id64(id))
        .build_decoder()
        .unwrap();
    let _ = decoder.decode(&input).unwrap();
    let batch = decoder.flush().unwrap().expect("batch");
    assert_eq!(batch.num_rows(), 2);
    let col = batch
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    assert_eq!(col.value(0), 21);
    assert_eq!(col.value(1), 22);
}
2403
/// Table-driven check of `decode_stream`: the same string payload is decoded against a
/// matching writer schema (expected to succeed) and against a `long` schema (expected to
/// fail with "did not consume all bytes").
#[test]
fn test_decode_stream_with_schema() {
    struct TestCase<'a> {
        name: &'a str,
        schema: &'a str,
        expected_error: Option<&'a str>,
    }
    let cases = [
        TestCase {
            name: "success",
            schema: r#"{"type":"record","name":"test","fields":[{"name":"f2","type":"string"}]}"#,
            expected_error: None,
        },
        TestCase {
            name: "valid schema invalid data",
            schema: r#"{"type":"record","name":"test","fields":[{"name":"f2","type":"long"}]}"#,
            expected_error: Some("did not consume all bytes"),
        },
    ];
    // The payload appended after the prefix is identical for every case.
    let record_val = "some_string";
    for case in cases {
        let mut store = SchemaStore::new();
        let fp = store
            .register(AvroSchema::new(case.schema.to_string()))
            .unwrap();
        let mut body = make_prefix(fp);
        // Length byte: the string length shifted left by one bit, followed by the bytes.
        body.push((record_val.len() as u8) << 1);
        body.extend_from_slice(record_val.as_bytes());
        let built = ReaderBuilder::new()
            .with_batch_size(1)
            .with_writer_schema_store(store)
            .with_active_fingerprint(fp)
            .build_decoder();
        // A build failure is acceptable only when the case expects that very error.
        let decoder = match (built, case.expected_error) {
            (Ok(d), _) => d,
            (Err(e), Some(expected)) => {
                assert!(
                    e.to_string().contains(expected),
                    "Test '{}' failed at build – expected '{expected}', got '{e}'",
                    case.name
                );
                continue;
            }
            (Err(e), None) => panic!("Test '{}' failed during build: {e}", case.name),
        };
        let input = Box::pin(stream::once(async move { Bytes::from(body) }));
        let outcome: Result<Vec<RecordBatch>, ArrowError> =
            block_on(decode_stream(decoder, input).try_collect());
        match (outcome, case.expected_error) {
            (Err(e), None) => {
                panic!("Test '{}' unexpectedly failed with '{e}'", case.name);
            }
            (Ok(_), Some(expected)) => {
                panic!(
                    "Test '{}' expected failure ('{expected}') but succeeded",
                    case.name
                );
            }
            (Err(e), Some(expected)) => {
                assert!(
                    e.to_string().contains(expected),
                    "Test '{}' – expected error containing '{expected}', got '{e}'",
                    case.name
                );
            }
            (Ok(batches), None) => {
                let merged =
                    arrow::compute::concat_batches(&batches[0].schema(), &batches).unwrap();
                let want_schema =
                    Arc::new(Schema::new(vec![Field::new("f2", DataType::Utf8, false)]));
                let want_column = Arc::new(StringArray::from(vec![record_val])) as ArrayRef;
                let want_batch = RecordBatch::try_new(want_schema, vec![want_column]).unwrap();
                assert_eq!(merged, want_batch, "Test '{}'", case.name);
            }
        }
    }
}
2486
/// Sanity check for `Utf8View`: a `Utf8` field can be re-declared as `Utf8View`, and a
/// `StringViewArray` column survives a round trip through `RecordBatch`.
#[test]
fn test_utf8view_support() {
    /// Returns a copy of `field` with `Utf8` swapped for `Utf8View`; other types are
    /// cloned unchanged.
    fn with_utf8view(field: &Field) -> Field {
        if matches!(field.data_type(), DataType::Utf8) {
            Field::new(field.name(), DataType::Utf8View, field.is_nullable())
                .with_metadata(field.metadata().clone())
        } else {
            field.clone()
        }
    }

    let utf8_field = Field::new("str_field", DataType::Utf8, false);
    let converted = with_utf8view(&utf8_field);
    assert_eq!(converted.data_type(), &DataType::Utf8View);

    let view_column: ArrayRef = Arc::new(StringViewArray::from(vec!["test1", "test2"]));
    let batch = RecordBatch::try_from_iter(vec![("str_field", view_column)]).unwrap();
    assert!(batch.column(0).as_any().is::<StringViewArray>());
}
2512
2513    fn make_reader_schema_with_default_fields(
2514        path: &str,
2515        default_fields: Vec<Value>,
2516    ) -> AvroSchema {
2517        let mut root = load_writer_schema_json(path);
2518        assert_eq!(root["type"], "record", "writer schema must be a record");
2519        root.as_object_mut()
2520            .expect("schema is a JSON object")
2521            .insert("fields".to_string(), Value::Array(default_fields));
2522        AvroSchema::new(root.to_string())
2523    }
2524
/// Reads `skippable_types.avro` through a reader schema whose fields are replaced by 22
/// fields that all carry defaults, then checks that every default is materialized on
/// every row.
#[test]
fn test_schema_resolution_defaults_all_supported_types() {
    const DEFAULTED_FIELD_COUNT: usize = 22;
    let path = "test/data/skippable_types.avro";
    // Twelve NUL characters, used as the JSON default of the 12-byte `duration` fixed.
    let duration_default = "\u{0000}".repeat(12);
    let default_fields = vec![
        serde_json::json!({"name":"d_bool","type":"boolean","default":true}),
        serde_json::json!({"name":"d_int","type":"int","default":42}),
        serde_json::json!({"name":"d_long","type":"long","default":12345}),
        serde_json::json!({"name":"d_float","type":"float","default":1.5}),
        serde_json::json!({"name":"d_double","type":"double","default":2.25}),
        serde_json::json!({"name":"d_bytes","type":"bytes","default":"XYZ"}),
        serde_json::json!({"name":"d_string","type":"string","default":"hello"}),
        serde_json::json!({"name":"d_date","type":{"type":"int","logicalType":"date"},"default":0}),
        serde_json::json!({"name":"d_time_ms","type":{"type":"int","logicalType":"time-millis"},"default":1000}),
        serde_json::json!({"name":"d_time_us","type":{"type":"long","logicalType":"time-micros"},"default":2000}),
        serde_json::json!({"name":"d_ts_ms","type":{"type":"long","logicalType":"local-timestamp-millis"},"default":0}),
        serde_json::json!({"name":"d_ts_us","type":{"type":"long","logicalType":"local-timestamp-micros"},"default":0}),
        serde_json::json!({"name":"d_decimal","type":{"type":"bytes","logicalType":"decimal","precision":10,"scale":2},"default":""}),
        serde_json::json!({"name":"d_fixed","type":{"type":"fixed","name":"F4","size":4},"default":"ABCD"}),
        serde_json::json!({"name":"d_enum","type":{"type":"enum","name":"E","symbols":["A","B","C"]},"default":"A"}),
        serde_json::json!({"name":"d_duration","type":{"type":"fixed","name":"Dur","size":12,"logicalType":"duration"},"default":duration_default}),
        serde_json::json!({"name":"d_uuid","type":{"type":"string","logicalType":"uuid"},"default":"00000000-0000-0000-0000-000000000000"}),
        serde_json::json!({"name":"d_array","type":{"type":"array","items":"int"},"default":[1,2,3]}),
        serde_json::json!({"name":"d_map","type":{"type":"map","values":"long"},"default":{"a":1,"b":2}}),
        serde_json::json!({
            "name":"d_record",
            "type":{
                "type":"record",
                "name":"DefaultRec",
                "fields":[
                    {"name":"x","type":"int"},
                    {"name":"y","type":["null","string"],"default":null}
                ]
            },
            "default":{"x":7}
        }),
        serde_json::json!({"name":"d_nullable_null","type":["null","int"],"default":null}),
        serde_json::json!({"name":"d_nullable_value","type":["int","null"],"default":123}),
    ];
    let reader_schema = make_reader_schema_with_default_fields(path, default_fields);
    let actual = read_alltypes_with_reader_schema(path, reader_schema);
    let num_rows = actual.num_rows();
    assert!(num_rows > 0, "skippable_types.avro should contain rows");
    assert_eq!(
        actual.num_columns(),
        DEFAULTED_FIELD_COUNT,
        "expected exactly our defaulted fields"
    );

    // Expected columns, pushed in the same order as the reader schema fields above.
    let mut expected_columns: Vec<ArrayRef> = Vec::with_capacity(DEFAULTED_FIELD_COUNT);

    // d_bool .. d_string: primitives, bytes and string.
    let bools = BooleanArray::from_iter(std::iter::repeat_n(Some(true), num_rows));
    expected_columns.push(Arc::new(bools));
    let ints = Int32Array::from_iter_values(std::iter::repeat_n(42, num_rows));
    expected_columns.push(Arc::new(ints));
    let longs = Int64Array::from_iter_values(std::iter::repeat_n(12345, num_rows));
    expected_columns.push(Arc::new(longs));
    let floats = Float32Array::from_iter_values(std::iter::repeat_n(1.5f32, num_rows));
    expected_columns.push(Arc::new(floats));
    let doubles = Float64Array::from_iter_values(std::iter::repeat_n(2.25f64, num_rows));
    expected_columns.push(Arc::new(doubles));
    let bytes = BinaryArray::from_iter_values(std::iter::repeat_n(b"XYZ".as_ref(), num_rows));
    expected_columns.push(Arc::new(bytes));
    let strings = StringArray::from_iter_values(std::iter::repeat_n("hello", num_rows));
    expected_columns.push(Arc::new(strings));

    // d_date .. d_ts_us: date, time and timestamp logical types.
    let dates = Date32Array::from_iter_values(std::iter::repeat_n(0, num_rows));
    expected_columns.push(Arc::new(dates));
    let time_ms = Time32MillisecondArray::from_iter_values(std::iter::repeat_n(1_000, num_rows));
    expected_columns.push(Arc::new(time_ms));
    let time_us =
        Time64MicrosecondArray::from_iter_values(std::iter::repeat_n(2_000i64, num_rows));
    expected_columns.push(Arc::new(time_us));
    let ts_ms = TimestampMillisecondArray::from_iter_values(std::iter::repeat_n(0i64, num_rows));
    expected_columns.push(Arc::new(ts_ms));
    let ts_us = TimestampMicrosecondArray::from_iter_values(std::iter::repeat_n(0i64, num_rows));
    expected_columns.push(Arc::new(ts_us));

    // d_decimal: the array type depends on the `small_decimals` feature.
    #[cfg(feature = "small_decimals")]
    let decimal = Decimal64Array::from_iter_values(std::iter::repeat_n(0i64, num_rows))
        .with_precision_and_scale(10, 2)
        .unwrap();
    #[cfg(not(feature = "small_decimals"))]
    let decimal = Decimal128Array::from_iter_values(std::iter::repeat_n(0i128, num_rows))
        .with_precision_and_scale(10, 2)
        .unwrap();
    expected_columns.push(Arc::new(decimal));

    // d_fixed: four bytes per row.
    let fixed = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
        std::iter::repeat_n(Some(*b"ABCD"), num_rows),
        4,
    )
    .unwrap();
    expected_columns.push(Arc::new(fixed));

    // d_enum: dictionary whose keys all point at the first symbol.
    let enum_keys = Int32Array::from_iter_values(std::iter::repeat_n(0, num_rows));
    let enum_symbols = StringArray::from_iter_values(["A", "B", "C"]);
    let enum_column =
        DictionaryArray::<Int32Type>::try_new(enum_keys, Arc::new(enum_symbols)).unwrap();
    expected_columns.push(Arc::new(enum_column));

    // d_duration: an all-zero interval on every row.
    let zero_interval = IntervalMonthDayNanoType::make_value(0, 0, 0);
    let durations: IntervalMonthDayNanoArray =
        std::iter::repeat_n(Some(zero_interval), num_rows).collect();
    expected_columns.push(Arc::new(durations));

    // d_uuid: sixteen zero bytes per row.
    let uuids = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
        std::iter::repeat_n(Some([0u8; 16]), num_rows),
        16,
    )
    .unwrap();
    expected_columns.push(Arc::new(uuids));

    // d_array: the list [1, 2, 3] with non-nullable Int32 items.
    let list_item = Arc::new(Field::new(
        Field::LIST_FIELD_DEFAULT_NAME,
        DataType::Int32,
        false,
    ));
    let mut lists = ListBuilder::new(Int32Builder::new()).with_field(list_item);
    for _ in 0..num_rows {
        for item in [1, 2, 3] {
            lists.values().append_value(item);
        }
        lists.append(true);
    }
    expected_columns.push(Arc::new(lists.finish()));

    // d_map: the entries {"a": 1, "b": 2} with non-nullable Int64 values.
    let map_value_field = Arc::new(Field::new("value", DataType::Int64, false));
    let map_names = builder::MapFieldNames {
        entry: "entries".to_string(),
        key: "key".to_string(),
        value: "value".to_string(),
    };
    let mut maps = MapBuilder::new(Some(map_names), StringBuilder::new(), Int64Builder::new())
        .with_values_field(map_value_field);
    for _ in 0..num_rows {
        let (keys, vals) = maps.entries();
        for (key, val) in [("a", 1), ("b", 2)] {
            keys.append_value(key);
            vals.append_value(val);
        }
        maps.append(true).unwrap();
    }
    expected_columns.push(Arc::new(maps.finish()));

    // d_record: x = 7 and y = null on every row.
    let record_fields: Fields = Fields::from(vec![
        Field::new("x", DataType::Int32, false),
        Field::new("y", DataType::Utf8, true),
    ]);
    let mut records = StructBuilder::new(
        record_fields.clone(),
        vec![
            Box::new(Int32Builder::new()),
            Box::new(StringBuilder::new()),
        ],
    );
    for _ in 0..num_rows {
        records
            .field_builder::<Int32Builder>(0)
            .unwrap()
            .append_value(7);
        records
            .field_builder::<StringBuilder>(1)
            .unwrap()
            .append_null();
        records.append(true);
    }
    expected_columns.push(Arc::new(records.finish()));

    // d_nullable_null / d_nullable_value: an all-null column and a constant 123 column.
    let all_null = Int32Array::from_iter(std::iter::repeat_n(None::<i32>, num_rows));
    expected_columns.push(Arc::new(all_null));
    let all_123 = Int32Array::from_iter_values(std::iter::repeat_n(123, num_rows));
    expected_columns.push(Arc::new(all_123));

    // The expected batch reuses the schema of the actual batch, so only data is compared.
    let expected = RecordBatch::try_new(actual.schema(), expected_columns).unwrap();
    assert_eq!(
        actual, expected,
        "defaults should materialize correctly for all fields"
    );
}
2699
/// Building a reader whose schema has an enum field defaulting to a symbol outside its
/// symbol list ("Z" vs. A/B/C) must fail with an enum-related error.
#[test]
fn test_schema_resolution_default_enum_invalid_symbol_errors() {
    let path = "test/data/skippable_types.avro";
    let invalid_enum_field = serde_json::json!({
        "name":"bad_enum",
        "type":{"type":"enum","name":"E","symbols":["A","B","C"]},
        "default":"Z"
    });
    let bad_schema = make_reader_schema_with_default_fields(path, vec![invalid_enum_field]);
    let input = BufReader::new(File::open(path).unwrap());
    let err = ReaderBuilder::new()
        .with_reader_schema(bad_schema)
        .build(input)
        .expect_err("expected enum default validation to fail");
    let msg = err.to_string();
    // Match case-insensitively: "enum" plus either "symbol" or "default".
    let lowered = msg.to_lowercase();
    let names_cause = ["symbol", "default"]
        .into_iter()
        .any(|keyword| lowered.contains(keyword));
    assert!(
        lowered.contains("enum") && names_cause,
        "unexpected error: {msg}"
    );
}
2724
/// Building a reader whose schema has a 4-byte fixed field with a 3-character default
/// ("ABC") must fail with a fixed-related size/length error.
#[test]
fn test_schema_resolution_default_fixed_size_mismatch_errors() {
    let path = "test/data/skippable_types.avro";
    let invalid_fixed_field = serde_json::json!({
        "name":"bad_fixed",
        "type":{"type":"fixed","name":"F","size":4},
        "default":"ABC"
    });
    let bad_schema = make_reader_schema_with_default_fields(path, vec![invalid_fixed_field]);
    let input = BufReader::new(File::open(path).unwrap());
    let err = ReaderBuilder::new()
        .with_reader_schema(bad_schema)
        .build(input)
        .expect_err("expected fixed default validation to fail");
    let msg = err.to_string();
    // Match case-insensitively: "fixed" plus any one of the accepted mismatch phrasings.
    let lowered = msg.to_lowercase();
    let names_cause = ["size", "length", "does not match"]
        .into_iter()
        .any(|keyword| lowered.contains(keyword));
    assert!(
        lowered.contains("fixed") && names_cause,
        "unexpected error: {msg}"
    );
}
2751
/// Projecting `alltypes_plain.avro` down to `double_col` alone yields a single nullable
/// Float64 column alternating between 0.0 and 10.1 over eight rows.
#[test]
// TODO: avoid requiring snappy for this file
#[cfg(feature = "snappy")]
fn test_alltypes_skip_writer_fields_keep_double_only() {
    let file = arrow_test_data("avro/alltypes_plain.avro");
    let projection = make_reader_schema_with_selected_fields_in_order(&file, &["double_col"]);
    let batch = read_alltypes_with_reader_schema(&file, projection);
    let double_col: ArrayRef = Arc::new(Float64Array::from_iter_values(
        (0..8_i32).map(|row| f64::from(row % 2) * 10.1),
    ));
    let expected =
        RecordBatch::try_from_iter_with_nullable([("double_col", double_col, true)]).unwrap();
    assert_eq!(batch, expected);
}
2770
/// Projecting `alltypes_plain.avro` to `timestamp_col` followed by `id` returns exactly
/// those two nullable columns, in the reader schema's order.
#[test]
// TODO: avoid requiring snappy for this file
#[cfg(feature = "snappy")]
fn test_alltypes_skip_writer_fields_reorder_and_skip_many() {
    let file = arrow_test_data("avro/alltypes_plain.avro");
    let projection =
        make_reader_schema_with_selected_fields_in_order(&file, &["timestamp_col", "id"]);
    let batch = read_alltypes_with_reader_schema(&file, projection);
    let timestamps_us: [i64; 8] = [
        1235865600000000, // 2009-03-01T00:00:00.000
        1235865660000000, // 2009-03-01T00:01:00.000
        1238544000000000, // 2009-04-01T00:00:00.000
        1238544060000000, // 2009-04-01T00:01:00.000
        1233446400000000, // 2009-02-01T00:00:00.000
        1233446460000000, // 2009-02-01T00:01:00.000
        1230768000000000, // 2009-01-01T00:00:00.000
        1230768060000000, // 2009-01-01T00:01:00.000
    ];
    let timestamp_col: ArrayRef = Arc::new(
        TimestampMicrosecondArray::from_iter_values(timestamps_us).with_timezone("+00:00"),
    );
    let id_col: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1]));
    let expected = RecordBatch::try_from_iter_with_nullable([
        ("timestamp_col", timestamp_col, true),
        ("id", id_col, true),
    ])
    .unwrap();
    assert_eq!(batch, expected);
}
2806
2807    #[test]
2808    fn test_skippable_types_project_each_field_individually() {
2809        let path = "test/data/skippable_types.avro";
2810        let full = read_file(path, 1024, false);
2811        let schema_full = full.schema();
2812        let num_rows = full.num_rows();
2813        let writer_json = load_writer_schema_json(path);
2814        assert_eq!(
2815            writer_json["type"], "record",
2816            "writer schema must be a record"
2817        );
2818        let fields_json = writer_json
2819            .get("fields")
2820            .and_then(|f| f.as_array())
2821            .expect("record has fields");
2822        assert_eq!(
2823            schema_full.fields().len(),
2824            fields_json.len(),
2825            "full read column count vs writer fields"
2826        );
2827        fn rebuild_list_array_with_element(
2828            col: &ArrayRef,
2829            new_elem: Arc<Field>,
2830            is_large: bool,
2831        ) -> ArrayRef {
2832            if is_large {
2833                let list = col
2834                    .as_any()
2835                    .downcast_ref::<LargeListArray>()
2836                    .expect("expected LargeListArray");
2837                let offsets = list.offsets().clone();
2838                let values = list.values().clone();
2839                let validity = list.nulls().cloned();
2840                Arc::new(LargeListArray::try_new(new_elem, offsets, values, validity).unwrap())
2841            } else {
2842                let list = col
2843                    .as_any()
2844                    .downcast_ref::<ListArray>()
2845                    .expect("expected ListArray");
2846                let offsets = list.offsets().clone();
2847                let values = list.values().clone();
2848                let validity = list.nulls().cloned();
2849                Arc::new(ListArray::try_new(new_elem, offsets, values, validity).unwrap())
2850            }
2851        }
2852        for (idx, f) in fields_json.iter().enumerate() {
2853            let name = f
2854                .get("name")
2855                .and_then(|n| n.as_str())
2856                .unwrap_or_else(|| panic!("field at index {idx} has no name"));
2857            let reader_schema = make_reader_schema_with_selected_fields_in_order(path, &[name]);
2858            let projected = read_alltypes_with_reader_schema(path, reader_schema);
2859            assert_eq!(
2860                projected.num_columns(),
2861                1,
2862                "projected batch should contain exactly the selected column '{name}'"
2863            );
2864            assert_eq!(
2865                projected.num_rows(),
2866                num_rows,
2867                "row count mismatch for projected column '{name}'"
2868            );
2869            let col_full = full.column(idx).clone();
2870            let full_field = schema_full.field(idx).as_ref().clone();
2871            let proj_field_ref = projected.schema().field(0).clone();
2872            let proj_field = proj_field_ref.as_ref();
2873            let top_meta = proj_field.metadata().clone();
2874            let (expected_field_ref, expected_col): (Arc<Field>, ArrayRef) =
2875                match (full_field.data_type(), proj_field.data_type()) {
2876                    (&DataType::List(_), DataType::List(proj_elem)) => {
2877                        let new_col =
2878                            rebuild_list_array_with_element(&col_full, proj_elem.clone(), false);
2879                        let nf = Field::new(
2880                            full_field.name().clone(),
2881                            proj_field.data_type().clone(),
2882                            full_field.is_nullable(),
2883                        )
2884                        .with_metadata(top_meta);
2885                        (Arc::new(nf), new_col)
2886                    }
2887                    (&DataType::LargeList(_), DataType::LargeList(proj_elem)) => {
2888                        let new_col =
2889                            rebuild_list_array_with_element(&col_full, proj_elem.clone(), true);
2890                        let nf = Field::new(
2891                            full_field.name().clone(),
2892                            proj_field.data_type().clone(),
2893                            full_field.is_nullable(),
2894                        )
2895                        .with_metadata(top_meta);
2896                        (Arc::new(nf), new_col)
2897                    }
2898                    _ => {
2899                        let nf = full_field.with_metadata(top_meta);
2900                        (Arc::new(nf), col_full)
2901                    }
2902                };
2903
2904            let expected = RecordBatch::try_new(
2905                Arc::new(Schema::new(vec![expected_field_ref])),
2906                vec![expected_col],
2907            )
2908            .unwrap();
2909            assert_eq!(
2910                projected, expected,
2911                "projected column '{name}' mismatch vs full read column"
2912            );
2913        }
2914    }
2915
2916    #[test]
2917    fn test_union_fields_avro_nullable_and_general_unions() {
2918        let path = "test/data/union_fields.avro";
2919        let batch = read_file(path, 1024, false);
2920        let schema = batch.schema();
2921        let idx = schema.index_of("nullable_int_nullfirst").unwrap();
2922        let a = batch.column(idx).as_primitive::<Int32Type>();
2923        assert_eq!(a.len(), 4);
2924        assert!(a.is_null(0));
2925        assert_eq!(a.value(1), 42);
2926        assert!(a.is_null(2));
2927        assert_eq!(a.value(3), 0);
2928        let idx = schema.index_of("nullable_string_nullsecond").unwrap();
2929        let s = batch
2930            .column(idx)
2931            .as_any()
2932            .downcast_ref::<StringArray>()
2933            .expect("nullable_string_nullsecond should be Utf8");
2934        assert_eq!(s.len(), 4);
2935        assert_eq!(s.value(0), "s1");
2936        assert!(s.is_null(1));
2937        assert_eq!(s.value(2), "s3");
2938        assert!(s.is_valid(3)); // empty string, not null
2939        assert_eq!(s.value(3), "");
2940        let idx = schema.index_of("union_prim").unwrap();
2941        let u = batch
2942            .column(idx)
2943            .as_any()
2944            .downcast_ref::<UnionArray>()
2945            .expect("union_prim should be Union");
2946        let fields = match u.data_type() {
2947            DataType::Union(fields, mode) => {
2948                assert!(matches!(mode, UnionMode::Dense), "expect dense unions");
2949                fields
2950            }
2951            other => panic!("expected Union, got {other:?}"),
2952        };
2953        let tid_by_name = |name: &str| -> i8 {
2954            for (tid, f) in fields.iter() {
2955                if f.name() == name {
2956                    return tid;
2957                }
2958            }
2959            panic!("union child '{name}' not found");
2960        };
2961        let expected_type_ids = vec![
2962            tid_by_name("long"),
2963            tid_by_name("int"),
2964            tid_by_name("float"),
2965            tid_by_name("double"),
2966        ];
2967        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
2968        assert_eq!(
2969            type_ids, expected_type_ids,
2970            "branch selection for union_prim rows"
2971        );
2972        let longs = u
2973            .child(tid_by_name("long"))
2974            .as_any()
2975            .downcast_ref::<Int64Array>()
2976            .unwrap();
2977        assert_eq!(longs.len(), 1);
2978        let ints = u
2979            .child(tid_by_name("int"))
2980            .as_any()
2981            .downcast_ref::<Int32Array>()
2982            .unwrap();
2983        assert_eq!(ints.len(), 1);
2984        let floats = u
2985            .child(tid_by_name("float"))
2986            .as_any()
2987            .downcast_ref::<Float32Array>()
2988            .unwrap();
2989        assert_eq!(floats.len(), 1);
2990        let doubles = u
2991            .child(tid_by_name("double"))
2992            .as_any()
2993            .downcast_ref::<Float64Array>()
2994            .unwrap();
2995        assert_eq!(doubles.len(), 1);
2996        let idx = schema.index_of("union_bytes_vs_string").unwrap();
2997        let u = batch
2998            .column(idx)
2999            .as_any()
3000            .downcast_ref::<UnionArray>()
3001            .expect("union_bytes_vs_string should be Union");
3002        let fields = match u.data_type() {
3003            DataType::Union(fields, _) => fields,
3004            other => panic!("expected Union, got {other:?}"),
3005        };
3006        let tid_by_name = |name: &str| -> i8 {
3007            for (tid, f) in fields.iter() {
3008                if f.name() == name {
3009                    return tid;
3010                }
3011            }
3012            panic!("union child '{name}' not found");
3013        };
3014        let tid_bytes = tid_by_name("bytes");
3015        let tid_string = tid_by_name("string");
3016        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
3017        assert_eq!(
3018            type_ids,
3019            vec![tid_bytes, tid_string, tid_string, tid_bytes],
3020            "branch selection for bytes/string union"
3021        );
3022        let s_child = u
3023            .child(tid_string)
3024            .as_any()
3025            .downcast_ref::<StringArray>()
3026            .unwrap();
3027        assert_eq!(s_child.len(), 2);
3028        assert_eq!(s_child.value(0), "hello");
3029        assert_eq!(s_child.value(1), "world");
3030        let b_child = u
3031            .child(tid_bytes)
3032            .as_any()
3033            .downcast_ref::<BinaryArray>()
3034            .unwrap();
3035        assert_eq!(b_child.len(), 2);
3036        assert_eq!(b_child.value(0), &[0x00, 0xFF, 0x7F]);
3037        assert_eq!(b_child.value(1), b""); // previously: &[]
3038        let idx = schema.index_of("union_enum_records_array_map").unwrap();
3039        let u = batch
3040            .column(idx)
3041            .as_any()
3042            .downcast_ref::<UnionArray>()
3043            .expect("union_enum_records_array_map should be Union");
3044        let fields = match u.data_type() {
3045            DataType::Union(fields, _) => fields,
3046            other => panic!("expected Union, got {other:?}"),
3047        };
3048        let mut tid_enum: Option<i8> = None;
3049        let mut tid_rec_a: Option<i8> = None;
3050        let mut tid_rec_b: Option<i8> = None;
3051        let mut tid_array: Option<i8> = None;
3052        for (tid, f) in fields.iter() {
3053            match f.data_type() {
3054                DataType::Dictionary(_, _) => tid_enum = Some(tid),
3055                DataType::Struct(childs) => {
3056                    if childs.len() == 2 && childs[0].name() == "a" && childs[1].name() == "b" {
3057                        tid_rec_a = Some(tid);
3058                    } else if childs.len() == 2
3059                        && childs[0].name() == "x"
3060                        && childs[1].name() == "y"
3061                    {
3062                        tid_rec_b = Some(tid);
3063                    }
3064                }
3065                DataType::List(_) => tid_array = Some(tid),
3066                _ => {}
3067            }
3068        }
3069        let (tid_enum, tid_rec_a, tid_rec_b, tid_array) = (
3070            tid_enum.expect("enum child"),
3071            tid_rec_a.expect("RecA child"),
3072            tid_rec_b.expect("RecB child"),
3073            tid_array.expect("array<long> child"),
3074        );
3075        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
3076        assert_eq!(
3077            type_ids,
3078            vec![tid_enum, tid_rec_a, tid_rec_b, tid_array],
3079            "branch selection for complex union"
3080        );
3081        let dict = u
3082            .child(tid_enum)
3083            .as_any()
3084            .downcast_ref::<DictionaryArray<Int32Type>>()
3085            .unwrap();
3086        assert_eq!(dict.len(), 1);
3087        assert!(dict.is_valid(0));
3088        let rec_a = u
3089            .child(tid_rec_a)
3090            .as_any()
3091            .downcast_ref::<StructArray>()
3092            .unwrap();
3093        assert_eq!(rec_a.len(), 1);
3094        let a_val = rec_a
3095            .column_by_name("a")
3096            .unwrap()
3097            .as_any()
3098            .downcast_ref::<Int32Array>()
3099            .unwrap();
3100        assert_eq!(a_val.value(0), 7);
3101        let b_val = rec_a
3102            .column_by_name("b")
3103            .unwrap()
3104            .as_any()
3105            .downcast_ref::<StringArray>()
3106            .unwrap();
3107        assert_eq!(b_val.value(0), "x");
3108        // RecB row: {"x": 123456789, "y": b"\xFF\x00"}
3109        let rec_b = u
3110            .child(tid_rec_b)
3111            .as_any()
3112            .downcast_ref::<StructArray>()
3113            .unwrap();
3114        let x_val = rec_b
3115            .column_by_name("x")
3116            .unwrap()
3117            .as_any()
3118            .downcast_ref::<Int64Array>()
3119            .unwrap();
3120        assert_eq!(x_val.value(0), 123_456_789_i64);
3121        let y_val = rec_b
3122            .column_by_name("y")
3123            .unwrap()
3124            .as_any()
3125            .downcast_ref::<BinaryArray>()
3126            .unwrap();
3127        assert_eq!(y_val.value(0), &[0xFF, 0x00]);
3128        let arr = u
3129            .child(tid_array)
3130            .as_any()
3131            .downcast_ref::<ListArray>()
3132            .unwrap();
3133        assert_eq!(arr.len(), 1);
3134        let first_values = arr.value(0);
3135        let longs = first_values.as_any().downcast_ref::<Int64Array>().unwrap();
3136        assert_eq!(longs.len(), 3);
3137        assert_eq!(longs.value(0), 1);
3138        assert_eq!(longs.value(1), 2);
3139        assert_eq!(longs.value(2), 3);
3140        let idx = schema.index_of("union_date_or_fixed4").unwrap();
3141        let u = batch
3142            .column(idx)
3143            .as_any()
3144            .downcast_ref::<UnionArray>()
3145            .expect("union_date_or_fixed4 should be Union");
3146        let fields = match u.data_type() {
3147            DataType::Union(fields, _) => fields,
3148            other => panic!("expected Union, got {other:?}"),
3149        };
3150        let mut tid_date: Option<i8> = None;
3151        let mut tid_fixed: Option<i8> = None;
3152        for (tid, f) in fields.iter() {
3153            match f.data_type() {
3154                DataType::Date32 => tid_date = Some(tid),
3155                DataType::FixedSizeBinary(4) => tid_fixed = Some(tid),
3156                _ => {}
3157            }
3158        }
3159        let (tid_date, tid_fixed) = (tid_date.expect("date"), tid_fixed.expect("fixed(4)"));
3160        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
3161        assert_eq!(
3162            type_ids,
3163            vec![tid_date, tid_fixed, tid_date, tid_fixed],
3164            "branch selection for date/fixed4 union"
3165        );
3166        let dates = u
3167            .child(tid_date)
3168            .as_any()
3169            .downcast_ref::<Date32Array>()
3170            .unwrap();
3171        assert_eq!(dates.len(), 2);
3172        assert_eq!(dates.value(0), 19_000); // ~2022‑01‑15
3173        assert_eq!(dates.value(1), 0); // epoch
3174        let fixed = u
3175            .child(tid_fixed)
3176            .as_any()
3177            .downcast_ref::<FixedSizeBinaryArray>()
3178            .unwrap();
3179        assert_eq!(fixed.len(), 2);
3180        assert_eq!(fixed.value(0), b"ABCD");
3181        assert_eq!(fixed.value(1), &[0x00, 0x11, 0x22, 0x33]);
3182    }
3183
3184    #[test]
3185    fn test_union_schema_resolution_all_type_combinations() {
3186        let path = "test/data/union_fields.avro";
3187        let baseline = read_file(path, 1024, false);
3188        let baseline_schema = baseline.schema();
3189        let mut root = load_writer_schema_json(path);
3190        assert_eq!(root["type"], "record", "writer schema must be a record");
3191        let fields = root
3192            .get_mut("fields")
3193            .and_then(|f| f.as_array_mut())
3194            .expect("record has fields");
3195        fn is_named_type(obj: &Value, ty: &str, nm: &str) -> bool {
3196            obj.get("type").and_then(|v| v.as_str()) == Some(ty)
3197                && obj.get("name").and_then(|v| v.as_str()) == Some(nm)
3198        }
3199        fn is_logical(obj: &Value, prim: &str, lt: &str) -> bool {
3200            obj.get("type").and_then(|v| v.as_str()) == Some(prim)
3201                && obj.get("logicalType").and_then(|v| v.as_str()) == Some(lt)
3202        }
3203        fn find_first(arr: &[Value], pred: impl Fn(&Value) -> bool) -> Option<Value> {
3204            arr.iter().find(|v| pred(v)).cloned()
3205        }
3206        fn prim(s: &str) -> Value {
3207            Value::String(s.to_string())
3208        }
3209        for f in fields.iter_mut() {
3210            let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
3211                continue;
3212            };
3213            match name {
3214                // Flip null ordering – should not affect values
3215                "nullable_int_nullfirst" => {
3216                    f["type"] = json!(["int", "null"]);
3217                }
3218                "nullable_string_nullsecond" => {
3219                    f["type"] = json!(["null", "string"]);
3220                }
3221                "union_prim" => {
3222                    let orig = f["type"].as_array().unwrap().clone();
3223                    let long = prim("long");
3224                    let double = prim("double");
3225                    let string = prim("string");
3226                    let bytes = prim("bytes");
3227                    let boolean = prim("boolean");
3228                    assert!(orig.contains(&long));
3229                    assert!(orig.contains(&double));
3230                    assert!(orig.contains(&string));
3231                    assert!(orig.contains(&bytes));
3232                    assert!(orig.contains(&boolean));
3233                    f["type"] = json!([long, double, string, bytes, boolean]);
3234                }
3235                "union_bytes_vs_string" => {
3236                    f["type"] = json!(["string", "bytes"]);
3237                }
3238                "union_fixed_dur_decfix" => {
3239                    let orig = f["type"].as_array().unwrap().clone();
3240                    let fx8 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx8")).unwrap();
3241                    let dur12 = find_first(&orig, |o| is_named_type(o, "fixed", "Dur12")).unwrap();
3242                    let decfix16 =
3243                        find_first(&orig, |o| is_named_type(o, "fixed", "DecFix16")).unwrap();
3244                    f["type"] = json!([decfix16, dur12, fx8]);
3245                }
3246                "union_enum_records_array_map" => {
3247                    let orig = f["type"].as_array().unwrap().clone();
3248                    let enum_color = find_first(&orig, |o| {
3249                        o.get("type").and_then(|v| v.as_str()) == Some("enum")
3250                    })
3251                    .unwrap();
3252                    let rec_a = find_first(&orig, |o| is_named_type(o, "record", "RecA")).unwrap();
3253                    let rec_b = find_first(&orig, |o| is_named_type(o, "record", "RecB")).unwrap();
3254                    let arr = find_first(&orig, |o| {
3255                        o.get("type").and_then(|v| v.as_str()) == Some("array")
3256                    })
3257                    .unwrap();
3258                    let map = find_first(&orig, |o| {
3259                        o.get("type").and_then(|v| v.as_str()) == Some("map")
3260                    })
3261                    .unwrap();
3262                    f["type"] = json!([arr, map, rec_b, rec_a, enum_color]);
3263                }
3264                "union_date_or_fixed4" => {
3265                    let orig = f["type"].as_array().unwrap().clone();
3266                    let date = find_first(&orig, |o| is_logical(o, "int", "date")).unwrap();
3267                    let fx4 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx4")).unwrap();
3268                    f["type"] = json!([fx4, date]);
3269                }
3270                "union_time_millis_or_enum" => {
3271                    let orig = f["type"].as_array().unwrap().clone();
3272                    let time_ms =
3273                        find_first(&orig, |o| is_logical(o, "int", "time-millis")).unwrap();
3274                    let en = find_first(&orig, |o| {
3275                        o.get("type").and_then(|v| v.as_str()) == Some("enum")
3276                    })
3277                    .unwrap();
3278                    f["type"] = json!([en, time_ms]);
3279                }
3280                "union_time_micros_or_string" => {
3281                    let orig = f["type"].as_array().unwrap().clone();
3282                    let time_us =
3283                        find_first(&orig, |o| is_logical(o, "long", "time-micros")).unwrap();
3284                    f["type"] = json!(["string", time_us]);
3285                }
3286                "union_ts_millis_utc_or_array" => {
3287                    let orig = f["type"].as_array().unwrap().clone();
3288                    let ts_ms =
3289                        find_first(&orig, |o| is_logical(o, "long", "timestamp-millis")).unwrap();
3290                    let arr = find_first(&orig, |o| {
3291                        o.get("type").and_then(|v| v.as_str()) == Some("array")
3292                    })
3293                    .unwrap();
3294                    f["type"] = json!([arr, ts_ms]);
3295                }
3296                "union_ts_micros_local_or_bytes" => {
3297                    let orig = f["type"].as_array().unwrap().clone();
3298                    let lts_us =
3299                        find_first(&orig, |o| is_logical(o, "long", "local-timestamp-micros"))
3300                            .unwrap();
3301                    f["type"] = json!(["bytes", lts_us]);
3302                }
3303                "union_uuid_or_fixed10" => {
3304                    let orig = f["type"].as_array().unwrap().clone();
3305                    let uuid = find_first(&orig, |o| is_logical(o, "string", "uuid")).unwrap();
3306                    let fx10 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx10")).unwrap();
3307                    f["type"] = json!([fx10, uuid]);
3308                }
3309                "union_dec_bytes_or_dec_fixed" => {
3310                    let orig = f["type"].as_array().unwrap().clone();
3311                    let dec_bytes = find_first(&orig, |o| {
3312                        o.get("type").and_then(|v| v.as_str()) == Some("bytes")
3313                            && o.get("logicalType").and_then(|v| v.as_str()) == Some("decimal")
3314                    })
3315                    .unwrap();
3316                    let dec_fix = find_first(&orig, |o| {
3317                        is_named_type(o, "fixed", "DecFix20")
3318                            && o.get("logicalType").and_then(|v| v.as_str()) == Some("decimal")
3319                    })
3320                    .unwrap();
3321                    f["type"] = json!([dec_fix, dec_bytes]);
3322                }
3323                "union_null_bytes_string" => {
3324                    f["type"] = json!(["bytes", "string", "null"]);
3325                }
3326                "array_of_union" => {
3327                    let obj = f
3328                        .get_mut("type")
3329                        .expect("array type")
3330                        .as_object_mut()
3331                        .unwrap();
3332                    obj.insert("items".to_string(), json!(["string", "long"]));
3333                }
3334                "map_of_union" => {
3335                    let obj = f
3336                        .get_mut("type")
3337                        .expect("map type")
3338                        .as_object_mut()
3339                        .unwrap();
3340                    obj.insert("values".to_string(), json!(["double", "null"]));
3341                }
3342                "record_with_union_field" => {
3343                    let rec = f
3344                        .get_mut("type")
3345                        .expect("record type")
3346                        .as_object_mut()
3347                        .unwrap();
3348                    let rec_fields = rec.get_mut("fields").unwrap().as_array_mut().unwrap();
3349                    let mut found = false;
3350                    for rf in rec_fields.iter_mut() {
3351                        if rf.get("name").and_then(|v| v.as_str()) == Some("u") {
3352                            rf["type"] = json!(["string", "long"]); // rely on int→long promotion
3353                            found = true;
3354                            break;
3355                        }
3356                    }
3357                    assert!(found, "field 'u' expected in HasUnion");
3358                }
3359                "union_ts_micros_utc_or_map" => {
3360                    let orig = f["type"].as_array().unwrap().clone();
3361                    let ts_us =
3362                        find_first(&orig, |o| is_logical(o, "long", "timestamp-micros")).unwrap();
3363                    let map = find_first(&orig, |o| {
3364                        o.get("type").and_then(|v| v.as_str()) == Some("map")
3365                    })
3366                    .unwrap();
3367                    f["type"] = json!([map, ts_us]);
3368                }
3369                "union_ts_millis_local_or_string" => {
3370                    let orig = f["type"].as_array().unwrap().clone();
3371                    let lts_ms =
3372                        find_first(&orig, |o| is_logical(o, "long", "local-timestamp-millis"))
3373                            .unwrap();
3374                    f["type"] = json!(["string", lts_ms]);
3375                }
3376                "union_bool_or_string" => {
3377                    f["type"] = json!(["string", "boolean"]);
3378                }
3379                _ => {}
3380            }
3381        }
3382        let reader_schema = AvroSchema::new(root.to_string());
3383        let resolved = read_alltypes_with_reader_schema(path, reader_schema);
3384
3385        fn branch_token(dt: &DataType) -> String {
3386            match dt {
3387                DataType::Null => "null".into(),
3388                DataType::Boolean => "boolean".into(),
3389                DataType::Int32 => "int".into(),
3390                DataType::Int64 => "long".into(),
3391                DataType::Float32 => "float".into(),
3392                DataType::Float64 => "double".into(),
3393                DataType::Binary => "bytes".into(),
3394                DataType::Utf8 => "string".into(),
3395                DataType::Date32 => "date".into(),
3396                DataType::Time32(arrow_schema::TimeUnit::Millisecond) => "time-millis".into(),
3397                DataType::Time64(arrow_schema::TimeUnit::Microsecond) => "time-micros".into(),
3398                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => if tz.is_some() {
3399                    "timestamp-millis"
3400                } else {
3401                    "local-timestamp-millis"
3402                }
3403                .into(),
3404                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => if tz.is_some() {
3405                    "timestamp-micros"
3406                } else {
3407                    "local-timestamp-micros"
3408                }
3409                .into(),
3410                DataType::Interval(IntervalUnit::MonthDayNano) => "duration".into(),
3411                DataType::FixedSizeBinary(n) => format!("fixed{n}"),
3412                DataType::Dictionary(_, _) => "enum".into(),
3413                DataType::Decimal128(p, s) => format!("decimal({p},{s})"),
3414                DataType::Decimal256(p, s) => format!("decimal({p},{s})"),
3415                #[cfg(feature = "small_decimals")]
3416                DataType::Decimal64(p, s) => format!("decimal({p},{s})"),
3417                DataType::Struct(fields) => {
3418                    if fields.len() == 2 && fields[0].name() == "a" && fields[1].name() == "b" {
3419                        "record:RecA".into()
3420                    } else if fields.len() == 2
3421                        && fields[0].name() == "x"
3422                        && fields[1].name() == "y"
3423                    {
3424                        "record:RecB".into()
3425                    } else {
3426                        "record".into()
3427                    }
3428                }
3429                DataType::List(_) => "array".into(),
3430                DataType::Map(_, _) => "map".into(),
3431                other => format!("{other:?}"),
3432            }
3433        }
3434
3435        fn union_tokens(u: &UnionArray) -> (Vec<i8>, HashMap<i8, String>) {
3436            let fields = match u.data_type() {
3437                DataType::Union(fields, _) => fields,
3438                other => panic!("expected Union, got {other:?}"),
3439            };
3440            let mut dict: HashMap<i8, String> = HashMap::with_capacity(fields.len());
3441            for (tid, f) in fields.iter() {
3442                dict.insert(tid, branch_token(f.data_type()));
3443            }
3444            let ids: Vec<i8> = u.type_ids().iter().copied().collect();
3445            (ids, dict)
3446        }
3447
3448        fn expected_token(field_name: &str, writer_token: &str) -> String {
3449            match field_name {
3450                "union_prim" => match writer_token {
3451                    "int" => "long".into(),
3452                    "float" => "double".into(),
3453                    other => other.into(),
3454                },
3455                "record_with_union_field.u" => match writer_token {
3456                    "int" => "long".into(),
3457                    other => other.into(),
3458                },
3459                _ => writer_token.into(),
3460            }
3461        }
3462
3463        fn get_union<'a>(
3464            rb: &'a RecordBatch,
3465            schema: arrow_schema::SchemaRef,
3466            fname: &str,
3467        ) -> &'a UnionArray {
3468            let idx = schema.index_of(fname).unwrap();
3469            rb.column(idx)
3470                .as_any()
3471                .downcast_ref::<UnionArray>()
3472                .unwrap_or_else(|| panic!("{fname} should be a Union"))
3473        }
3474
3475        fn assert_union_equivalent(field_name: &str, u_writer: &UnionArray, u_reader: &UnionArray) {
3476            let (ids_w, dict_w) = union_tokens(u_writer);
3477            let (ids_r, dict_r) = union_tokens(u_reader);
3478            assert_eq!(
3479                ids_w.len(),
3480                ids_r.len(),
3481                "{field_name}: row count mismatch between baseline and resolved"
3482            );
3483            for (i, (id_w, id_r)) in ids_w.iter().zip(ids_r.iter()).enumerate() {
3484                let w_tok = dict_w.get(id_w).unwrap();
3485                let want = expected_token(field_name, w_tok);
3486                let got = dict_r.get(id_r).unwrap();
3487                assert_eq!(
3488                    got, &want,
3489                    "{field_name}: row {i} resolved to wrong union branch (writer={w_tok}, expected={want}, got={got})"
3490                );
3491            }
3492        }
3493
3494        for (fname, dt) in [
3495            ("nullable_int_nullfirst", DataType::Int32),
3496            ("nullable_string_nullsecond", DataType::Utf8),
3497        ] {
3498            let idx_b = baseline_schema.index_of(fname).unwrap();
3499            let idx_r = resolved.schema().index_of(fname).unwrap();
3500            let col_b = baseline.column(idx_b);
3501            let col_r = resolved.column(idx_r);
3502            assert_eq!(
3503                col_b.data_type(),
3504                &dt,
3505                "baseline {fname} should decode as non-union with nullability"
3506            );
3507            assert_eq!(
3508                col_b.as_ref(),
3509                col_r.as_ref(),
3510                "{fname}: values must be identical regardless of null-branch order"
3511            );
3512        }
3513        let union_fields = [
3514            "union_prim",
3515            "union_bytes_vs_string",
3516            "union_fixed_dur_decfix",
3517            "union_enum_records_array_map",
3518            "union_date_or_fixed4",
3519            "union_time_millis_or_enum",
3520            "union_time_micros_or_string",
3521            "union_ts_millis_utc_or_array",
3522            "union_ts_micros_local_or_bytes",
3523            "union_uuid_or_fixed10",
3524            "union_dec_bytes_or_dec_fixed",
3525            "union_null_bytes_string",
3526            "union_ts_micros_utc_or_map",
3527            "union_ts_millis_local_or_string",
3528            "union_bool_or_string",
3529        ];
3530        for fname in union_fields {
3531            let u_b = get_union(&baseline, baseline_schema.clone(), fname);
3532            let u_r = get_union(&resolved, resolved.schema(), fname);
3533            assert_union_equivalent(fname, u_b, u_r);
3534        }
3535        {
3536            let fname = "array_of_union";
3537            let idx_b = baseline_schema.index_of(fname).unwrap();
3538            let idx_r = resolved.schema().index_of(fname).unwrap();
3539            let arr_b = baseline
3540                .column(idx_b)
3541                .as_any()
3542                .downcast_ref::<ListArray>()
3543                .expect("array_of_union should be a List");
3544            let arr_r = resolved
3545                .column(idx_r)
3546                .as_any()
3547                .downcast_ref::<ListArray>()
3548                .expect("array_of_union should be a List");
3549            assert_eq!(
3550                arr_b.value_offsets(),
3551                arr_r.value_offsets(),
3552                "{fname}: list offsets changed after resolution"
3553            );
3554            let u_b = arr_b
3555                .values()
3556                .as_any()
3557                .downcast_ref::<UnionArray>()
3558                .expect("array items should be Union");
3559            let u_r = arr_r
3560                .values()
3561                .as_any()
3562                .downcast_ref::<UnionArray>()
3563                .expect("array items should be Union");
3564            let (ids_b, dict_b) = union_tokens(u_b);
3565            let (ids_r, dict_r) = union_tokens(u_r);
3566            assert_eq!(ids_b.len(), ids_r.len(), "{fname}: values length mismatch");
3567            for (i, (id_b, id_r)) in ids_b.iter().zip(ids_r.iter()).enumerate() {
3568                let w_tok = dict_b.get(id_b).unwrap();
3569                let got = dict_r.get(id_r).unwrap();
3570                assert_eq!(
3571                    got, w_tok,
3572                    "{fname}: value {i} resolved to wrong branch (writer={w_tok}, got={got})"
3573                );
3574            }
3575        }
3576        {
3577            let fname = "map_of_union";
3578            let idx_b = baseline_schema.index_of(fname).unwrap();
3579            let idx_r = resolved.schema().index_of(fname).unwrap();
3580            let map_b = baseline
3581                .column(idx_b)
3582                .as_any()
3583                .downcast_ref::<MapArray>()
3584                .expect("map_of_union should be a Map");
3585            let map_r = resolved
3586                .column(idx_r)
3587                .as_any()
3588                .downcast_ref::<MapArray>()
3589                .expect("map_of_union should be a Map");
3590            assert_eq!(
3591                map_b.value_offsets(),
3592                map_r.value_offsets(),
3593                "{fname}: map value offsets changed after resolution"
3594            );
3595            let ent_b = map_b.entries();
3596            let ent_r = map_r.entries();
3597            let val_b_any = ent_b.column(1).as_ref();
3598            let val_r_any = ent_r.column(1).as_ref();
3599            let b_union = val_b_any.as_any().downcast_ref::<UnionArray>();
3600            let r_union = val_r_any.as_any().downcast_ref::<UnionArray>();
3601            if let (Some(u_b), Some(u_r)) = (b_union, r_union) {
3602                assert_union_equivalent(fname, u_b, u_r);
3603            } else {
3604                assert_eq!(
3605                    val_b_any.data_type(),
3606                    val_r_any.data_type(),
3607                    "{fname}: value data types differ after resolution"
3608                );
3609                assert_eq!(
3610                    val_b_any, val_r_any,
3611                    "{fname}: value arrays differ after resolution (nullable value column case)"
3612                );
3613                let value_nullable = |m: &MapArray| -> bool {
3614                    match m.data_type() {
3615                        DataType::Map(entries_field, _sorted) => match entries_field.data_type() {
3616                            DataType::Struct(fields) => {
3617                                assert_eq!(fields.len(), 2, "entries struct must have 2 fields");
3618                                assert_eq!(fields[0].name(), "key");
3619                                assert_eq!(fields[1].name(), "value");
3620                                fields[1].is_nullable()
3621                            }
3622                            other => panic!("Map entries field must be Struct, got {other:?}"),
3623                        },
3624                        other => panic!("expected Map data type, got {other:?}"),
3625                    }
3626                };
3627                assert!(
3628                    value_nullable(map_b),
3629                    "{fname}: baseline Map value field should be nullable per Arrow spec"
3630                );
3631                assert!(
3632                    value_nullable(map_r),
3633                    "{fname}: resolved Map value field should be nullable per Arrow spec"
3634                );
3635            }
3636        }
3637        {
3638            let fname = "record_with_union_field";
3639            let idx_b = baseline_schema.index_of(fname).unwrap();
3640            let idx_r = resolved.schema().index_of(fname).unwrap();
3641            let rec_b = baseline
3642                .column(idx_b)
3643                .as_any()
3644                .downcast_ref::<StructArray>()
3645                .expect("record_with_union_field should be a Struct");
3646            let rec_r = resolved
3647                .column(idx_r)
3648                .as_any()
3649                .downcast_ref::<StructArray>()
3650                .expect("record_with_union_field should be a Struct");
3651            let u_b = rec_b
3652                .column_by_name("u")
3653                .unwrap()
3654                .as_any()
3655                .downcast_ref::<UnionArray>()
3656                .expect("field 'u' should be Union (baseline)");
3657            let u_r = rec_r
3658                .column_by_name("u")
3659                .unwrap()
3660                .as_any()
3661                .downcast_ref::<UnionArray>()
3662                .expect("field 'u' should be Union (resolved)");
3663            assert_union_equivalent("record_with_union_field.u", u_b, u_r);
3664        }
3665    }
3666
3667    #[test]
3668    fn test_union_fields_end_to_end_expected_arrays() {
3669        fn tid_by_name(fields: &UnionFields, want: &str) -> i8 {
3670            for (tid, f) in fields.iter() {
3671                if f.name() == want {
3672                    return tid;
3673                }
3674            }
3675            panic!("union child '{want}' not found")
3676        }
3677
3678        fn tid_by_dt(fields: &UnionFields, pred: impl Fn(&DataType) -> bool) -> i8 {
3679            for (tid, f) in fields.iter() {
3680                if pred(f.data_type()) {
3681                    return tid;
3682                }
3683            }
3684            panic!("no union child matches predicate");
3685        }
3686
3687        fn uuid16_from_str(s: &str) -> [u8; 16] {
3688            fn hex(b: u8) -> u8 {
3689                match b {
3690                    b'0'..=b'9' => b - b'0',
3691                    b'a'..=b'f' => b - b'a' + 10,
3692                    b'A'..=b'F' => b - b'A' + 10,
3693                    _ => panic!("invalid hex"),
3694                }
3695            }
3696            let mut out = [0u8; 16];
3697            let bytes = s.as_bytes();
3698            let (mut i, mut j) = (0, 0);
3699            while i < bytes.len() {
3700                if bytes[i] == b'-' {
3701                    i += 1;
3702                    continue;
3703                }
3704                let hi = hex(bytes[i]);
3705                let lo = hex(bytes[i + 1]);
3706                out[j] = (hi << 4) | lo;
3707                j += 1;
3708                i += 2;
3709            }
3710            assert_eq!(j, 16, "uuid must decode to 16 bytes");
3711            out
3712        }
3713
3714        fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
3715            match dt {
3716                DataType::Null => Arc::new(NullArray::new(0)),
3717                DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
3718                DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
3719                DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
3720                DataType::Float32 => Arc::new(arrow_array::Float32Array::from(Vec::<f32>::new())),
3721                DataType::Float64 => Arc::new(arrow_array::Float64Array::from(Vec::<f64>::new())),
3722                DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
3723                DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
3724                DataType::Date32 => Arc::new(arrow_array::Date32Array::from(Vec::<i32>::new())),
3725                DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
3726                    Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
3727                }
3728                DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
3729                    Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
3730                }
3731                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
3732                    let a = TimestampMillisecondArray::from(Vec::<i64>::new());
3733                    Arc::new(if let Some(tz) = tz {
3734                        a.with_timezone(tz.clone())
3735                    } else {
3736                        a
3737                    })
3738                }
3739                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
3740                    let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
3741                    Arc::new(if let Some(tz) = tz {
3742                        a.with_timezone(tz.clone())
3743                    } else {
3744                        a
3745                    })
3746                }
3747                DataType::Interval(IntervalUnit::MonthDayNano) => {
3748                    Arc::new(arrow_array::IntervalMonthDayNanoArray::from(Vec::<
3749                        IntervalMonthDayNano,
3750                    >::new(
3751                    )))
3752                }
3753                DataType::FixedSizeBinary(n) => Arc::new(FixedSizeBinaryArray::new_null(*n, 0)),
3754                DataType::Dictionary(k, v) => {
3755                    assert_eq!(**k, DataType::Int32, "expect int32 keys for enums");
3756                    let keys = Int32Array::from(Vec::<i32>::new());
3757                    let values = match v.as_ref() {
3758                        DataType::Utf8 => {
3759                            Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
3760                        }
3761                        other => panic!("unexpected dictionary value type {other:?}"),
3762                    };
3763                    Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
3764                }
3765                DataType::List(field) => {
3766                    let values: ArrayRef = match field.data_type() {
3767                        DataType::Int32 => {
3768                            Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
3769                        }
3770                        DataType::Int64 => {
3771                            Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
3772                        }
3773                        DataType::Utf8 => {
3774                            Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
3775                        }
3776                        DataType::Union(_, _) => {
3777                            let (uf, _) = if let DataType::Union(f, m) = field.data_type() {
3778                                (f.clone(), m)
3779                            } else {
3780                                unreachable!()
3781                            };
3782                            let children: Vec<ArrayRef> = uf
3783                                .iter()
3784                                .map(|(_, f)| empty_child_for(f.data_type()))
3785                                .collect();
3786                            Arc::new(
3787                                UnionArray::try_new(
3788                                    uf.clone(),
3789                                    ScalarBuffer::<i8>::from(Vec::<i8>::new()),
3790                                    Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
3791                                    children,
3792                                )
3793                                .unwrap(),
3794                            ) as ArrayRef
3795                        }
3796                        other => panic!("unsupported list item type: {other:?}"),
3797                    };
3798                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
3799                    Arc::new(ListArray::try_new(field.clone(), offsets, values, None).unwrap())
3800                }
3801                DataType::Map(entry_field, ordered) => {
3802                    let DataType::Struct(childs) = entry_field.data_type() else {
3803                        panic!("map entries must be struct")
3804                    };
3805                    let key_field = &childs[0];
3806                    let val_field = &childs[1];
3807                    assert_eq!(key_field.data_type(), &DataType::Utf8);
3808                    let keys = StringArray::from(Vec::<&str>::new());
3809                    let vals: ArrayRef = match val_field.data_type() {
3810                        DataType::Float64 => {
3811                            Arc::new(arrow_array::Float64Array::from(Vec::<f64>::new())) as ArrayRef
3812                        }
3813                        DataType::Int64 => {
3814                            Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
3815                        }
3816                        DataType::Utf8 => {
3817                            Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
3818                        }
3819                        DataType::Union(uf, _) => {
3820                            let ch: Vec<ArrayRef> = uf
3821                                .iter()
3822                                .map(|(_, f)| empty_child_for(f.data_type()))
3823                                .collect();
3824                            Arc::new(
3825                                UnionArray::try_new(
3826                                    uf.clone(),
3827                                    ScalarBuffer::<i8>::from(Vec::<i8>::new()),
3828                                    Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
3829                                    ch,
3830                                )
3831                                .unwrap(),
3832                            ) as ArrayRef
3833                        }
3834                        other => panic!("unsupported map value type: {other:?}"),
3835                    };
3836                    let entries = StructArray::new(
3837                        Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
3838                        vec![Arc::new(keys) as ArrayRef, vals],
3839                        None,
3840                    );
3841                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
3842                    Arc::new(MapArray::new(
3843                        entry_field.clone(),
3844                        offsets,
3845                        entries,
3846                        None,
3847                        *ordered,
3848                    ))
3849                }
3850                other => panic!("empty_child_for: unhandled type {other:?}"),
3851            }
3852        }
3853
3854        fn mk_dense_union(
3855            fields: &UnionFields,
3856            type_ids: Vec<i8>,
3857            offsets: Vec<i32>,
3858            provide: impl Fn(&Field) -> Option<ArrayRef>,
3859        ) -> ArrayRef {
3860            let children: Vec<ArrayRef> = fields
3861                .iter()
3862                .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
3863                .collect();
3864
3865            Arc::new(
3866                UnionArray::try_new(
3867                    fields.clone(),
3868                    ScalarBuffer::<i8>::from(type_ids),
3869                    Some(ScalarBuffer::<i32>::from(offsets)),
3870                    children,
3871                )
3872                .unwrap(),
3873            ) as ArrayRef
3874        }
3875
3876        // Dates / times / timestamps from the Avro content block:
3877        let date_a: i32 = 19_000;
3878        let time_ms_a: i32 = 13 * 3_600_000 + 45 * 60_000 + 30_000 + 123;
3879        let time_us_b: i64 = 23 * 3_600_000_000 + 59 * 60_000_000 + 59 * 1_000_000 + 999_999;
3880        let ts_ms_2024_01_01: i64 = 1_704_067_200_000;
3881        let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1000;
3882        // Fixed / bytes-like values:
3883        let fx8_a: [u8; 8] = *b"ABCDEFGH";
3884        let fx4_abcd: [u8; 4] = *b"ABCD";
3885        let fx4_misc: [u8; 4] = [0x00, 0x11, 0x22, 0x33];
3886        let fx10_ascii: [u8; 10] = *b"0123456789";
3887        let fx10_aa: [u8; 10] = [0xAA; 10];
3888        // Duration logical values as MonthDayNano:
3889        let dur_a = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
3890        let dur_b = IntervalMonthDayNanoType::make_value(12, 31, 999_000_000);
3891        // UUID logical values (stored as 16-byte FixedSizeBinary in Arrow):
3892        let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
3893        let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
3894        // Decimals from Avro content:
3895        let dec_b_scale2_pos: i128 = 123_456; // "1234.56" bytes-decimal -> (precision=10, scale=2)
3896        let dec_fix16_neg: i128 = -101; // "-1.01" fixed(16) decimal(10,2)
3897        let dec_fix20_s4: i128 = 1_234_567_891_234; // "123456789.1234" fixed(20) decimal(20,4)
3898        let dec_fix20_s4_neg: i128 = -123; // "-0.0123" fixed(20) decimal(20,4)
3899        let path = "test/data/union_fields.avro";
3900        let actual = read_file(path, 1024, false);
3901        let schema = actual.schema();
3902        // Helper to fetch union metadata for a column
3903        let get_union = |name: &str| -> (UnionFields, UnionMode) {
3904            let idx = schema.index_of(name).unwrap();
3905            match schema.field(idx).data_type() {
3906                DataType::Union(f, m) => (f.clone(), *m),
3907                other => panic!("{name} should be a Union, got {other:?}"),
3908            }
3909        };
3910        let mut expected_cols: Vec<ArrayRef> = Vec::with_capacity(schema.fields().len());
3911        // 1) ["null","int"]: Int32 (nullable)
3912        expected_cols.push(Arc::new(Int32Array::from(vec![
3913            None,
3914            Some(42),
3915            None,
3916            Some(0),
3917        ])));
3918        // 2) ["string","null"]: Utf8 (nullable)
3919        expected_cols.push(Arc::new(StringArray::from(vec![
3920            Some("s1"),
3921            None,
3922            Some("s3"),
3923            Some(""),
3924        ])));
3925        // 3) union_prim: ["boolean","int","long","float","double","bytes","string"]
3926        {
3927            let (uf, mode) = get_union("union_prim");
3928            assert!(matches!(mode, UnionMode::Dense));
3929            let generated_names: Vec<&str> = uf.iter().map(|(_, f)| f.name().as_str()).collect();
3930            let expected_names = vec![
3931                "boolean", "int", "long", "float", "double", "bytes", "string",
3932            ];
3933            assert_eq!(
3934                generated_names, expected_names,
3935                "Field names for union_prim are incorrect"
3936            );
3937            let tids = vec![
3938                tid_by_name(&uf, "long"),
3939                tid_by_name(&uf, "int"),
3940                tid_by_name(&uf, "float"),
3941                tid_by_name(&uf, "double"),
3942            ];
3943            let offs = vec![0, 0, 0, 0];
3944            let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
3945                "int" => Some(Arc::new(Int32Array::from(vec![-1])) as ArrayRef),
3946                "long" => Some(Arc::new(Int64Array::from(vec![1_234_567_890_123i64])) as ArrayRef),
3947                "float" => {
3948                    Some(Arc::new(arrow_array::Float32Array::from(vec![1.25f32])) as ArrayRef)
3949                }
3950                "double" => {
3951                    Some(Arc::new(arrow_array::Float64Array::from(vec![-2.5f64])) as ArrayRef)
3952                }
3953                _ => None,
3954            });
3955            expected_cols.push(arr);
3956        }
3957        // 4) union_bytes_vs_string: ["bytes","string"]
3958        {
3959            let (uf, _) = get_union("union_bytes_vs_string");
3960            let tids = vec![
3961                tid_by_name(&uf, "bytes"),
3962                tid_by_name(&uf, "string"),
3963                tid_by_name(&uf, "string"),
3964                tid_by_name(&uf, "bytes"),
3965            ];
3966            let offs = vec![0, 0, 1, 1];
3967            let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
3968                "bytes" => Some(
3969                    Arc::new(BinaryArray::from(vec![&[0x00, 0xFF, 0x7F][..], &[][..]])) as ArrayRef,
3970                ),
3971                "string" => Some(Arc::new(StringArray::from(vec!["hello", "world"])) as ArrayRef),
3972                _ => None,
3973            });
3974            expected_cols.push(arr);
3975        }
3976        // 5) union_fixed_dur_decfix: [Fx8, Dur12, DecFix16(decimal(10,2))]
3977        {
3978            let (uf, _) = get_union("union_fixed_dur_decfix");
3979            let tid_fx8 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(8)));
3980            let tid_dur = tid_by_dt(&uf, |dt| {
3981                matches!(
3982                    dt,
3983                    DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano)
3984                )
3985            });
3986            let tid_dec = tid_by_dt(&uf, |dt| match dt {
3987                #[cfg(feature = "small_decimals")]
3988                DataType::Decimal64(10, 2) => true,
3989                DataType::Decimal128(10, 2) | DataType::Decimal256(10, 2) => true,
3990                _ => false,
3991            });
3992            let tids = vec![tid_fx8, tid_dur, tid_dec, tid_dur];
3993            let offs = vec![0, 0, 0, 1];
3994            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
3995                DataType::FixedSizeBinary(8) => {
3996                    let it = [Some(fx8_a)].into_iter();
3997                    Some(Arc::new(
3998                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 8).unwrap(),
3999                    ) as ArrayRef)
4000                }
4001                DataType::Interval(IntervalUnit::MonthDayNano) => {
4002                    Some(Arc::new(arrow_array::IntervalMonthDayNanoArray::from(vec![
4003                        dur_a, dur_b,
4004                    ])) as ArrayRef)
4005                }
4006                #[cfg(feature = "small_decimals")]
4007                DataType::Decimal64(10, 2) => {
4008                    let a = arrow_array::Decimal64Array::from_iter_values([dec_fix16_neg as i64]);
4009                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4010                }
4011                DataType::Decimal128(10, 2) => {
4012                    let a = arrow_array::Decimal128Array::from_iter_values([dec_fix16_neg]);
4013                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4014                }
4015                DataType::Decimal256(10, 2) => {
4016                    let a = arrow_array::Decimal256Array::from_iter_values([i256::from_i128(
4017                        dec_fix16_neg,
4018                    )]);
4019                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4020                }
4021                _ => None,
4022            });
4023            let generated_names: Vec<&str> = uf.iter().map(|(_, f)| f.name().as_str()).collect();
4024            let expected_names = vec!["Fx8", "Dur12", "DecFix16"];
4025            assert_eq!(
4026                generated_names, expected_names,
4027                "Data type names were not generated correctly for union_fixed_dur_decfix"
4028            );
4029            expected_cols.push(arr);
4030        }
4031        // 6) union_enum_records_array_map: [enum ColorU, record RecA, record RecB, array<long>, map<string>]
4032        {
4033            let (uf, _) = get_union("union_enum_records_array_map");
4034            let tid_enum = tid_by_dt(&uf, |dt| matches!(dt, DataType::Dictionary(_, _)));
4035            let tid_reca = tid_by_dt(&uf, |dt| {
4036                if let DataType::Struct(fs) = dt {
4037                    fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b"
4038                } else {
4039                    false
4040                }
4041            });
4042            let tid_recb = tid_by_dt(&uf, |dt| {
4043                if let DataType::Struct(fs) = dt {
4044                    fs.len() == 2 && fs[0].name() == "x" && fs[1].name() == "y"
4045                } else {
4046                    false
4047                }
4048            });
4049            let tid_arr = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
4050            let tids = vec![tid_enum, tid_reca, tid_recb, tid_arr];
4051            let offs = vec![0, 0, 0, 0];
4052            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4053                DataType::Dictionary(_, _) => {
4054                    let keys = Int32Array::from(vec![0i32]); // "RED"
4055                    let values =
4056                        Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
4057                    Some(
4058                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4059                            as ArrayRef,
4060                    )
4061                }
4062                DataType::Struct(fs)
4063                    if fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b" =>
4064                {
4065                    let a = Int32Array::from(vec![7]);
4066                    let b = StringArray::from(vec!["x"]);
4067                    Some(Arc::new(StructArray::new(
4068                        fs.clone(),
4069                        vec![Arc::new(a), Arc::new(b)],
4070                        None,
4071                    )) as ArrayRef)
4072                }
4073                DataType::Struct(fs)
4074                    if fs.len() == 2 && fs[0].name() == "x" && fs[1].name() == "y" =>
4075                {
4076                    let x = Int64Array::from(vec![123_456_789i64]);
4077                    let y = BinaryArray::from(vec![&[0xFF, 0x00][..]]);
4078                    Some(Arc::new(StructArray::new(
4079                        fs.clone(),
4080                        vec![Arc::new(x), Arc::new(y)],
4081                        None,
4082                    )) as ArrayRef)
4083                }
4084                DataType::List(field) => {
4085                    let values = Int64Array::from(vec![1i64, 2, 3]);
4086                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
4087                    Some(Arc::new(
4088                        ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
4089                    ) as ArrayRef)
4090                }
4091                DataType::Map(_, _) => None,
4092                other => panic!("unexpected child {other:?}"),
4093            });
4094            expected_cols.push(arr);
4095        }
4096        // 7) union_date_or_fixed4: [date32, fixed(4)]
4097        {
4098            let (uf, _) = get_union("union_date_or_fixed4");
4099            let tid_date = tid_by_dt(&uf, |dt| matches!(dt, DataType::Date32));
4100            let tid_fx4 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(4)));
4101            let tids = vec![tid_date, tid_fx4, tid_date, tid_fx4];
4102            let offs = vec![0, 0, 1, 1];
4103            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4104                DataType::Date32 => {
4105                    Some(Arc::new(arrow_array::Date32Array::from(vec![date_a, 0])) as ArrayRef)
4106                }
4107                DataType::FixedSizeBinary(4) => {
4108                    let it = [Some(fx4_abcd), Some(fx4_misc)].into_iter();
4109                    Some(Arc::new(
4110                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
4111                    ) as ArrayRef)
4112                }
4113                _ => None,
4114            });
4115            expected_cols.push(arr);
4116        }
4117        // 8) union_time_millis_or_enum: [time-millis, enum OnOff]
4118        {
4119            let (uf, _) = get_union("union_time_millis_or_enum");
4120            let tid_ms = tid_by_dt(&uf, |dt| {
4121                matches!(dt, DataType::Time32(arrow_schema::TimeUnit::Millisecond))
4122            });
4123            let tid_en = tid_by_dt(&uf, |dt| matches!(dt, DataType::Dictionary(_, _)));
4124            let tids = vec![tid_ms, tid_en, tid_en, tid_ms];
4125            let offs = vec![0, 0, 1, 1];
4126            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4127                DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
4128                    Some(Arc::new(Time32MillisecondArray::from(vec![time_ms_a, 0])) as ArrayRef)
4129                }
4130                DataType::Dictionary(_, _) => {
4131                    let keys = Int32Array::from(vec![0i32, 1]); // "ON", "OFF"
4132                    let values = Arc::new(StringArray::from(vec!["ON", "OFF"])) as ArrayRef;
4133                    Some(
4134                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4135                            as ArrayRef,
4136                    )
4137                }
4138                _ => None,
4139            });
4140            expected_cols.push(arr);
4141        }
4142        // 9) union_time_micros_or_string: [time-micros, string]
4143        {
4144            let (uf, _) = get_union("union_time_micros_or_string");
4145            let tid_us = tid_by_dt(&uf, |dt| {
4146                matches!(dt, DataType::Time64(arrow_schema::TimeUnit::Microsecond))
4147            });
4148            let tid_s = tid_by_name(&uf, "string");
4149            let tids = vec![tid_s, tid_us, tid_s, tid_s];
4150            let offs = vec![0, 0, 1, 2];
4151            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4152                DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
4153                    Some(Arc::new(Time64MicrosecondArray::from(vec![time_us_b])) as ArrayRef)
4154                }
4155                DataType::Utf8 => {
4156                    Some(Arc::new(StringArray::from(vec!["evening", "night", ""])) as ArrayRef)
4157                }
4158                _ => None,
4159            });
4160            expected_cols.push(arr);
4161        }
4162        // 10) union_ts_millis_utc_or_array: [timestamp-millis(TZ), array<int>]
4163        {
4164            let (uf, _) = get_union("union_ts_millis_utc_or_array");
4165            let tid_ts = tid_by_dt(&uf, |dt| {
4166                matches!(
4167                    dt,
4168                    DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _)
4169                )
4170            });
4171            let tid_arr = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
4172            let tids = vec![tid_ts, tid_arr, tid_arr, tid_ts];
4173            let offs = vec![0, 0, 1, 1];
4174            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4175                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
4176                    let a = TimestampMillisecondArray::from(vec![
4177                        ts_ms_2024_01_01,
4178                        ts_ms_2024_01_01 + 86_400_000,
4179                    ]);
4180                    Some(Arc::new(if let Some(tz) = tz {
4181                        a.with_timezone(tz.clone())
4182                    } else {
4183                        a
4184                    }) as ArrayRef)
4185                }
4186                DataType::List(field) => {
4187                    let values = Int32Array::from(vec![0, 1, 2, -1, 0, 1]);
4188                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 6]));
4189                    Some(Arc::new(
4190                        ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
4191                    ) as ArrayRef)
4192                }
4193                _ => None,
4194            });
4195            expected_cols.push(arr);
4196        }
4197        // 11) union_ts_micros_local_or_bytes: [local-timestamp-micros, bytes]
4198        {
4199            let (uf, _) = get_union("union_ts_micros_local_or_bytes");
4200            let tid_lts = tid_by_dt(&uf, |dt| {
4201                matches!(
4202                    dt,
4203                    DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None)
4204                )
4205            });
4206            let tid_b = tid_by_name(&uf, "bytes");
4207            let tids = vec![tid_b, tid_lts, tid_b, tid_b];
4208            let offs = vec![0, 0, 1, 2];
4209            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4210                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) => Some(Arc::new(
4211                    TimestampMicrosecondArray::from(vec![ts_us_2024_01_01]),
4212                )
4213                    as ArrayRef),
4214                DataType::Binary => Some(Arc::new(BinaryArray::from(vec![
4215                    &b"\x11\x22\x33"[..],
4216                    &b"\x00"[..],
4217                    &b"\x10\x20\x30\x40"[..],
4218                ])) as ArrayRef),
4219                _ => None,
4220            });
4221            expected_cols.push(arr);
4222        }
4223        // 12) union_uuid_or_fixed10: [uuid(string)->fixed(16), fixed(10)]
4224        {
4225            let (uf, _) = get_union("union_uuid_or_fixed10");
4226            let tid_fx16 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(16)));
4227            let tid_fx10 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(10)));
4228            let tids = vec![tid_fx16, tid_fx10, tid_fx16, tid_fx10];
4229            let offs = vec![0, 0, 1, 1];
4230            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4231                DataType::FixedSizeBinary(16) => {
4232                    let it = [Some(uuid1), Some(uuid2)].into_iter();
4233                    Some(Arc::new(
4234                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
4235                    ) as ArrayRef)
4236                }
4237                DataType::FixedSizeBinary(10) => {
4238                    let it = [Some(fx10_ascii), Some(fx10_aa)].into_iter();
4239                    Some(Arc::new(
4240                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
4241                    ) as ArrayRef)
4242                }
4243                _ => None,
4244            });
4245            expected_cols.push(arr);
4246        }
4247        // 13) union_dec_bytes_or_dec_fixed: [bytes dec(10,2), fixed(20) dec(20,4)]
4248        {
4249            let (uf, _) = get_union("union_dec_bytes_or_dec_fixed");
4250            let tid_b10s2 = tid_by_dt(&uf, |dt| match dt {
4251                #[cfg(feature = "small_decimals")]
4252                DataType::Decimal64(10, 2) => true,
4253                DataType::Decimal128(10, 2) | DataType::Decimal256(10, 2) => true,
4254                _ => false,
4255            });
4256            let tid_f20s4 = tid_by_dt(&uf, |dt| {
4257                matches!(
4258                    dt,
4259                    DataType::Decimal128(20, 4) | DataType::Decimal256(20, 4)
4260                )
4261            });
4262            let tids = vec![tid_b10s2, tid_f20s4, tid_b10s2, tid_f20s4];
4263            let offs = vec![0, 0, 1, 1];
4264            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4265                #[cfg(feature = "small_decimals")]
4266                DataType::Decimal64(10, 2) => {
4267                    let a = Decimal64Array::from_iter_values([dec_b_scale2_pos as i64, 0i64]);
4268                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4269                }
4270                DataType::Decimal128(10, 2) => {
4271                    let a = Decimal128Array::from_iter_values([dec_b_scale2_pos, 0]);
4272                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4273                }
4274                DataType::Decimal256(10, 2) => {
4275                    let a = Decimal256Array::from_iter_values([
4276                        i256::from_i128(dec_b_scale2_pos),
4277                        i256::from(0),
4278                    ]);
4279                    Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4280                }
4281                DataType::Decimal128(20, 4) => {
4282                    let a = Decimal128Array::from_iter_values([dec_fix20_s4_neg, dec_fix20_s4]);
4283                    Some(Arc::new(a.with_precision_and_scale(20, 4).unwrap()) as ArrayRef)
4284                }
4285                DataType::Decimal256(20, 4) => {
4286                    let a = Decimal256Array::from_iter_values([
4287                        i256::from_i128(dec_fix20_s4_neg),
4288                        i256::from_i128(dec_fix20_s4),
4289                    ]);
4290                    Some(Arc::new(a.with_precision_and_scale(20, 4).unwrap()) as ArrayRef)
4291                }
4292                _ => None,
4293            });
4294            expected_cols.push(arr);
4295        }
4296        // 14) union_null_bytes_string: ["null","bytes","string"]
4297        {
4298            let (uf, _) = get_union("union_null_bytes_string");
4299            let tid_n = tid_by_name(&uf, "null");
4300            let tid_b = tid_by_name(&uf, "bytes");
4301            let tid_s = tid_by_name(&uf, "string");
4302            let tids = vec![tid_n, tid_b, tid_s, tid_s];
4303            let offs = vec![0, 0, 0, 1];
4304            let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4305                "null" => Some(Arc::new(arrow_array::NullArray::new(1)) as ArrayRef),
4306                "bytes" => Some(Arc::new(BinaryArray::from(vec![&b"\x01\x02"[..]])) as ArrayRef),
4307                "string" => Some(Arc::new(StringArray::from(vec!["text", "u"])) as ArrayRef),
4308                _ => None,
4309            });
4310            expected_cols.push(arr);
4311        }
4312        // 15) array_of_union: array<[long,string]>
4313        {
4314            let idx = schema.index_of("array_of_union").unwrap();
4315            let dt = schema.field(idx).data_type().clone();
4316            let (item_field, _) = match &dt {
4317                DataType::List(f) => (f.clone(), ()),
4318                other => panic!("array_of_union must be List, got {other:?}"),
4319            };
4320            let (uf, _) = match item_field.data_type() {
4321                DataType::Union(f, m) => (f.clone(), m),
4322                other => panic!("array_of_union items must be Union, got {other:?}"),
4323            };
4324            let tid_l = tid_by_name(&uf, "long");
4325            let tid_s = tid_by_name(&uf, "string");
4326            let type_ids = vec![tid_l, tid_s, tid_l, tid_s, tid_l, tid_l, tid_s, tid_l];
4327            let offsets = vec![0, 0, 1, 1, 2, 3, 2, 4];
4328            let values_union =
4329                mk_dense_union(&uf, type_ids, offsets, |f| match f.name().as_str() {
4330                    "long" => {
4331                        Some(Arc::new(Int64Array::from(vec![1i64, -5, 42, -1, 0])) as ArrayRef)
4332                    }
4333                    "string" => Some(Arc::new(StringArray::from(vec!["a", "", "z"])) as ArrayRef),
4334                    _ => None,
4335                });
4336            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 6, 8]));
4337            expected_cols.push(Arc::new(
4338                ListArray::try_new(item_field.clone(), list_offsets, values_union, None).unwrap(),
4339            ));
4340        }
4341        // 16) map_of_union: map<[null,double]>
4342        {
4343            let idx = schema.index_of("map_of_union").unwrap();
4344            let dt = schema.field(idx).data_type().clone();
4345            let (entry_field, ordered) = match &dt {
4346                DataType::Map(f, ordered) => (f.clone(), *ordered),
4347                other => panic!("map_of_union must be Map, got {other:?}"),
4348            };
4349            let DataType::Struct(entry_fields) = entry_field.data_type() else {
4350                panic!("map entries must be struct")
4351            };
4352            let key_field = entry_fields[0].clone();
4353            let val_field = entry_fields[1].clone();
4354            let keys = StringArray::from(vec!["a", "b", "x", "pi"]);
4355            let rounded_pi = (std::f64::consts::PI * 100_000.0).round() / 100_000.0;
4356            let values: ArrayRef = match val_field.data_type() {
4357                DataType::Union(uf, _) => {
4358                    let tid_n = tid_by_name(uf, "null");
4359                    let tid_d = tid_by_name(uf, "double");
4360                    let tids = vec![tid_n, tid_d, tid_d, tid_d];
4361                    let offs = vec![0, 0, 1, 2];
4362                    mk_dense_union(uf, tids, offs, |f| match f.name().as_str() {
4363                        "null" => Some(Arc::new(NullArray::new(1)) as ArrayRef),
4364                        "double" => Some(Arc::new(arrow_array::Float64Array::from(vec![
4365                            2.5f64, -0.5f64, rounded_pi,
4366                        ])) as ArrayRef),
4367                        _ => None,
4368                    })
4369                }
4370                DataType::Float64 => Arc::new(arrow_array::Float64Array::from(vec![
4371                    None,
4372                    Some(2.5),
4373                    Some(-0.5),
4374                    Some(rounded_pi),
4375                ])),
4376                other => panic!("unexpected map value type {other:?}"),
4377            };
4378            let entries = StructArray::new(
4379                Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4380                vec![Arc::new(keys) as ArrayRef, values],
4381                None,
4382            );
4383            let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 3, 4]));
4384            expected_cols.push(Arc::new(MapArray::new(
4385                entry_field,
4386                offsets,
4387                entries,
4388                None,
4389                ordered,
4390            )));
4391        }
4392        // 17) record_with_union_field: struct { id:int, u:[int,string] }
4393        {
4394            let idx = schema.index_of("record_with_union_field").unwrap();
4395            let DataType::Struct(rec_fields) = schema.field(idx).data_type() else {
4396                panic!("record_with_union_field should be Struct")
4397            };
4398            let id = Int32Array::from(vec![1, 2, 3, 4]);
4399            let u_field = rec_fields.iter().find(|f| f.name() == "u").unwrap();
4400            let DataType::Union(uf, _) = u_field.data_type() else {
4401                panic!("u must be Union")
4402            };
4403            let tid_i = tid_by_name(uf, "int");
4404            let tid_s = tid_by_name(uf, "string");
4405            let tids = vec![tid_s, tid_i, tid_i, tid_s];
4406            let offs = vec![0, 0, 1, 1];
4407            let u = mk_dense_union(uf, tids, offs, |f| match f.name().as_str() {
4408                "int" => Some(Arc::new(Int32Array::from(vec![99, 0])) as ArrayRef),
4409                "string" => Some(Arc::new(StringArray::from(vec!["one", "four"])) as ArrayRef),
4410                _ => None,
4411            });
4412            let rec = StructArray::new(rec_fields.clone(), vec![Arc::new(id) as ArrayRef, u], None);
4413            expected_cols.push(Arc::new(rec));
4414        }
4415        // 18) union_ts_micros_utc_or_map: [timestamp-micros(TZ), map<long>]
4416        {
4417            let (uf, _) = get_union("union_ts_micros_utc_or_map");
4418            let tid_ts = tid_by_dt(&uf, |dt| {
4419                matches!(
4420                    dt,
4421                    DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some(_))
4422                )
4423            });
4424            let tid_map = tid_by_dt(&uf, |dt| matches!(dt, DataType::Map(_, _)));
4425            let tids = vec![tid_ts, tid_map, tid_ts, tid_map];
4426            let offs = vec![0, 0, 1, 1];
4427            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4428                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
4429                    let a = TimestampMicrosecondArray::from(vec![ts_us_2024_01_01, 0i64]);
4430                    Some(Arc::new(if let Some(tz) = tz {
4431                        a.with_timezone(tz.clone())
4432                    } else {
4433                        a
4434                    }) as ArrayRef)
4435                }
4436                DataType::Map(entry_field, ordered) => {
4437                    let DataType::Struct(fs) = entry_field.data_type() else {
4438                        panic!("map entries must be struct")
4439                    };
4440                    let key_field = fs[0].clone();
4441                    let val_field = fs[1].clone();
4442                    assert_eq!(key_field.data_type(), &DataType::Utf8);
4443                    assert_eq!(val_field.data_type(), &DataType::Int64);
4444                    let keys = StringArray::from(vec!["k1", "k2", "n"]);
4445                    let vals = Int64Array::from(vec![1i64, 2, 0]);
4446                    let entries = StructArray::new(
4447                        Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4448                        vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
4449                        None,
4450                    );
4451                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
4452                    Some(Arc::new(MapArray::new(
4453                        entry_field.clone(),
4454                        offsets,
4455                        entries,
4456                        None,
4457                        *ordered,
4458                    )) as ArrayRef)
4459                }
4460                _ => None,
4461            });
4462            expected_cols.push(arr);
4463        }
4464        // 19) union_ts_millis_local_or_string: [local-timestamp-millis, string]
4465        {
4466            let (uf, _) = get_union("union_ts_millis_local_or_string");
4467            let tid_ts = tid_by_dt(&uf, |dt| {
4468                matches!(
4469                    dt,
4470                    DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None)
4471                )
4472            });
4473            let tid_s = tid_by_name(&uf, "string");
4474            let tids = vec![tid_s, tid_ts, tid_s, tid_s];
4475            let offs = vec![0, 0, 1, 2];
4476            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4477                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None) => Some(Arc::new(
4478                    TimestampMillisecondArray::from(vec![ts_ms_2024_01_01]),
4479                )
4480                    as ArrayRef),
4481                DataType::Utf8 => {
4482                    Some(
4483                        Arc::new(StringArray::from(vec!["local midnight", "done", ""])) as ArrayRef,
4484                    )
4485                }
4486                _ => None,
4487            });
4488            expected_cols.push(arr);
4489        }
4490        // 20) union_bool_or_string: ["boolean","string"]
4491        {
4492            let (uf, _) = get_union("union_bool_or_string");
4493            let tid_b = tid_by_name(&uf, "boolean");
4494            let tid_s = tid_by_name(&uf, "string");
4495            let tids = vec![tid_b, tid_s, tid_b, tid_s];
4496            let offs = vec![0, 0, 1, 1];
4497            let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4498                "boolean" => Some(Arc::new(BooleanArray::from(vec![true, false])) as ArrayRef),
4499                "string" => Some(Arc::new(StringArray::from(vec!["no", "yes"])) as ArrayRef),
4500                _ => None,
4501            });
4502            expected_cols.push(arr);
4503        }
4504        let expected = RecordBatch::try_new(schema.clone(), expected_cols).unwrap();
4505        assert_eq!(
4506            actual, expected,
4507            "full end-to-end equality for union_fields.avro"
4508        );
4509    }
4510
4511    #[test]
4512    fn test_read_zero_byte_avro_file() {
4513        let batch = read_file("test/data/zero_byte.avro", 3, false);
4514        let schema = batch.schema();
4515        assert_eq!(schema.fields().len(), 1);
4516        let field = schema.field(0);
4517        assert_eq!(field.name(), "data");
4518        assert_eq!(field.data_type(), &DataType::Binary);
4519        assert!(field.is_nullable());
4520        assert_eq!(batch.num_rows(), 3);
4521        assert_eq!(batch.num_columns(), 1);
4522        let binary_array = batch
4523            .column(0)
4524            .as_any()
4525            .downcast_ref::<BinaryArray>()
4526            .unwrap();
4527        assert!(binary_array.is_null(0));
4528        assert!(binary_array.is_valid(1));
4529        assert_eq!(binary_array.value(1), b"");
4530        assert!(binary_array.is_valid(2));
4531        assert_eq!(binary_array.value(2), b"some bytes");
4532    }
4533
4534    #[test]
4535    fn test_alltypes() {
4536        let expected = RecordBatch::try_from_iter_with_nullable([
4537            (
4538                "id",
4539                Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
4540                true,
4541            ),
4542            (
4543                "bool_col",
4544                Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
4545                true,
4546            ),
4547            (
4548                "tinyint_col",
4549                Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
4550                true,
4551            ),
4552            (
4553                "smallint_col",
4554                Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
4555                true,
4556            ),
4557            (
4558                "int_col",
4559                Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
4560                true,
4561            ),
4562            (
4563                "bigint_col",
4564                Arc::new(Int64Array::from_iter_values((0..8).map(|x| (x % 2) * 10))) as _,
4565                true,
4566            ),
4567            (
4568                "float_col",
4569                Arc::new(Float32Array::from_iter_values(
4570                    (0..8).map(|x| (x % 2) as f32 * 1.1),
4571                )) as _,
4572                true,
4573            ),
4574            (
4575                "double_col",
4576                Arc::new(Float64Array::from_iter_values(
4577                    (0..8).map(|x| (x % 2) as f64 * 10.1),
4578                )) as _,
4579                true,
4580            ),
4581            (
4582                "date_string_col",
4583                Arc::new(BinaryArray::from_iter_values([
4584                    [48, 51, 47, 48, 49, 47, 48, 57],
4585                    [48, 51, 47, 48, 49, 47, 48, 57],
4586                    [48, 52, 47, 48, 49, 47, 48, 57],
4587                    [48, 52, 47, 48, 49, 47, 48, 57],
4588                    [48, 50, 47, 48, 49, 47, 48, 57],
4589                    [48, 50, 47, 48, 49, 47, 48, 57],
4590                    [48, 49, 47, 48, 49, 47, 48, 57],
4591                    [48, 49, 47, 48, 49, 47, 48, 57],
4592                ])) as _,
4593                true,
4594            ),
4595            (
4596                "string_col",
4597                Arc::new(BinaryArray::from_iter_values((0..8).map(|x| [48 + x % 2]))) as _,
4598                true,
4599            ),
4600            (
4601                "timestamp_col",
4602                Arc::new(
4603                    TimestampMicrosecondArray::from_iter_values([
4604                        1235865600000000, // 2009-03-01T00:00:00.000
4605                        1235865660000000, // 2009-03-01T00:01:00.000
4606                        1238544000000000, // 2009-04-01T00:00:00.000
4607                        1238544060000000, // 2009-04-01T00:01:00.000
4608                        1233446400000000, // 2009-02-01T00:00:00.000
4609                        1233446460000000, // 2009-02-01T00:01:00.000
4610                        1230768000000000, // 2009-01-01T00:00:00.000
4611                        1230768060000000, // 2009-01-01T00:01:00.000
4612                    ])
4613                    .with_timezone("+00:00"),
4614                ) as _,
4615                true,
4616            ),
4617        ])
4618        .unwrap();
4619
4620        for file in files() {
4621            let file = arrow_test_data(file);
4622
4623            assert_eq!(read_file(&file, 8, false), expected);
4624            assert_eq!(read_file(&file, 3, false), expected);
4625        }
4626    }
4627
4628    #[test]
4629    // TODO: avoid requiring snappy for this file
4630    #[cfg(feature = "snappy")]
4631    fn test_alltypes_dictionary() {
4632        let file = "avro/alltypes_dictionary.avro";
4633        let expected = RecordBatch::try_from_iter_with_nullable([
4634            ("id", Arc::new(Int32Array::from(vec![0, 1])) as _, true),
4635            (
4636                "bool_col",
4637                Arc::new(BooleanArray::from(vec![Some(true), Some(false)])) as _,
4638                true,
4639            ),
4640            (
4641                "tinyint_col",
4642                Arc::new(Int32Array::from(vec![0, 1])) as _,
4643                true,
4644            ),
4645            (
4646                "smallint_col",
4647                Arc::new(Int32Array::from(vec![0, 1])) as _,
4648                true,
4649            ),
4650            ("int_col", Arc::new(Int32Array::from(vec![0, 1])) as _, true),
4651            (
4652                "bigint_col",
4653                Arc::new(Int64Array::from(vec![0, 10])) as _,
4654                true,
4655            ),
4656            (
4657                "float_col",
4658                Arc::new(Float32Array::from(vec![0.0, 1.1])) as _,
4659                true,
4660            ),
4661            (
4662                "double_col",
4663                Arc::new(Float64Array::from(vec![0.0, 10.1])) as _,
4664                true,
4665            ),
4666            (
4667                "date_string_col",
4668                Arc::new(BinaryArray::from_iter_values([b"01/01/09", b"01/01/09"])) as _,
4669                true,
4670            ),
4671            (
4672                "string_col",
4673                Arc::new(BinaryArray::from_iter_values([b"0", b"1"])) as _,
4674                true,
4675            ),
4676            (
4677                "timestamp_col",
4678                Arc::new(
4679                    TimestampMicrosecondArray::from_iter_values([
4680                        1230768000000000, // 2009-01-01T00:00:00.000
4681                        1230768060000000, // 2009-01-01T00:01:00.000
4682                    ])
4683                    .with_timezone("+00:00"),
4684                ) as _,
4685                true,
4686            ),
4687        ])
4688        .unwrap();
4689        let file_path = arrow_test_data(file);
4690        let batch_large = read_file(&file_path, 8, false);
4691        assert_eq!(
4692            batch_large, expected,
4693            "Decoded RecordBatch does not match for file {file}"
4694        );
4695        let batch_small = read_file(&file_path, 3, false);
4696        assert_eq!(
4697            batch_small, expected,
4698            "Decoded RecordBatch (batch size 3) does not match for file {file}"
4699        );
4700    }
4701
4702    #[test]
4703    fn test_alltypes_nulls_plain() {
4704        let file = "avro/alltypes_nulls_plain.avro";
4705        let expected = RecordBatch::try_from_iter_with_nullable([
4706            (
4707                "string_col",
4708                Arc::new(StringArray::from(vec![None::<&str>])) as _,
4709                true,
4710            ),
4711            ("int_col", Arc::new(Int32Array::from(vec![None])) as _, true),
4712            (
4713                "bool_col",
4714                Arc::new(BooleanArray::from(vec![None])) as _,
4715                true,
4716            ),
4717            (
4718                "bigint_col",
4719                Arc::new(Int64Array::from(vec![None])) as _,
4720                true,
4721            ),
4722            (
4723                "float_col",
4724                Arc::new(Float32Array::from(vec![None])) as _,
4725                true,
4726            ),
4727            (
4728                "double_col",
4729                Arc::new(Float64Array::from(vec![None])) as _,
4730                true,
4731            ),
4732            (
4733                "bytes_col",
4734                Arc::new(BinaryArray::from(vec![None::<&[u8]>])) as _,
4735                true,
4736            ),
4737        ])
4738        .unwrap();
4739        let file_path = arrow_test_data(file);
4740        let batch_large = read_file(&file_path, 8, false);
4741        assert_eq!(
4742            batch_large, expected,
4743            "Decoded RecordBatch does not match for file {file}"
4744        );
4745        let batch_small = read_file(&file_path, 3, false);
4746        assert_eq!(
4747            batch_small, expected,
4748            "Decoded RecordBatch (batch size 3) does not match for file {file}"
4749        );
4750    }
4751
4752    #[test]
4753    // TODO: avoid requiring snappy for this file
4754    #[cfg(feature = "snappy")]
4755    fn test_binary() {
4756        let file = arrow_test_data("avro/binary.avro");
4757        let batch = read_file(&file, 8, false);
4758        let expected = RecordBatch::try_from_iter_with_nullable([(
4759            "foo",
4760            Arc::new(BinaryArray::from_iter_values(vec![
4761                b"\x00" as &[u8],
4762                b"\x01" as &[u8],
4763                b"\x02" as &[u8],
4764                b"\x03" as &[u8],
4765                b"\x04" as &[u8],
4766                b"\x05" as &[u8],
4767                b"\x06" as &[u8],
4768                b"\x07" as &[u8],
4769                b"\x08" as &[u8],
4770                b"\t" as &[u8],
4771                b"\n" as &[u8],
4772                b"\x0b" as &[u8],
4773            ])) as Arc<dyn Array>,
4774            true,
4775        )])
4776        .unwrap();
4777        assert_eq!(batch, expected);
4778    }
4779
4780    #[test]
4781    // TODO: avoid requiring snappy for these files
4782    #[cfg(feature = "snappy")]
4783    fn test_decimal() {
4784        // Choose expected Arrow types depending on the `small_decimals` feature flag.
4785        // With `small_decimals` enabled, Decimal32/Decimal64 are used where their
4786        // precision allows; otherwise, those cases resolve to Decimal128.
4787        #[cfg(feature = "small_decimals")]
4788        let files: [(&str, DataType, HashMap<String, String>); 8] = [
4789            (
4790                "avro/fixed_length_decimal.avro",
4791                DataType::Decimal128(25, 2),
4792                HashMap::from([
4793                    (
4794                        "avro.namespace".to_string(),
4795                        "topLevelRecord.value".to_string(),
4796                    ),
4797                    ("avro.name".to_string(), "fixed".to_string()),
4798                ]),
4799            ),
4800            (
4801                "avro/fixed_length_decimal_legacy.avro",
4802                DataType::Decimal64(13, 2),
4803                HashMap::from([
4804                    (
4805                        "avro.namespace".to_string(),
4806                        "topLevelRecord.value".to_string(),
4807                    ),
4808                    ("avro.name".to_string(), "fixed".to_string()),
4809                ]),
4810            ),
4811            (
4812                "avro/int32_decimal.avro",
4813                DataType::Decimal32(4, 2),
4814                HashMap::from([
4815                    (
4816                        "avro.namespace".to_string(),
4817                        "topLevelRecord.value".to_string(),
4818                    ),
4819                    ("avro.name".to_string(), "fixed".to_string()),
4820                ]),
4821            ),
4822            (
4823                "avro/int64_decimal.avro",
4824                DataType::Decimal64(10, 2),
4825                HashMap::from([
4826                    (
4827                        "avro.namespace".to_string(),
4828                        "topLevelRecord.value".to_string(),
4829                    ),
4830                    ("avro.name".to_string(), "fixed".to_string()),
4831                ]),
4832            ),
4833            (
4834                "test/data/int256_decimal.avro",
4835                DataType::Decimal256(76, 10),
4836                HashMap::new(),
4837            ),
4838            (
4839                "test/data/fixed256_decimal.avro",
4840                DataType::Decimal256(76, 10),
4841                HashMap::from([("avro.name".to_string(), "Decimal256Fixed".to_string())]),
4842            ),
4843            (
4844                "test/data/fixed_length_decimal_legacy_32.avro",
4845                DataType::Decimal32(9, 2),
4846                HashMap::from([("avro.name".to_string(), "Decimal32FixedLegacy".to_string())]),
4847            ),
4848            (
4849                "test/data/int128_decimal.avro",
4850                DataType::Decimal128(38, 2),
4851                HashMap::new(),
4852            ),
4853        ];
4854        #[cfg(not(feature = "small_decimals"))]
4855        let files: [(&str, DataType, HashMap<String, String>); 8] = [
4856            (
4857                "avro/fixed_length_decimal.avro",
4858                DataType::Decimal128(25, 2),
4859                HashMap::from([
4860                    (
4861                        "avro.namespace".to_string(),
4862                        "topLevelRecord.value".to_string(),
4863                    ),
4864                    ("avro.name".to_string(), "fixed".to_string()),
4865                ]),
4866            ),
4867            (
4868                "avro/fixed_length_decimal_legacy.avro",
4869                DataType::Decimal128(13, 2),
4870                HashMap::from([
4871                    (
4872                        "avro.namespace".to_string(),
4873                        "topLevelRecord.value".to_string(),
4874                    ),
4875                    ("avro.name".to_string(), "fixed".to_string()),
4876                ]),
4877            ),
4878            (
4879                "avro/int32_decimal.avro",
4880                DataType::Decimal128(4, 2),
4881                HashMap::from([
4882                    (
4883                        "avro.namespace".to_string(),
4884                        "topLevelRecord.value".to_string(),
4885                    ),
4886                    ("avro.name".to_string(), "fixed".to_string()),
4887                ]),
4888            ),
4889            (
4890                "avro/int64_decimal.avro",
4891                DataType::Decimal128(10, 2),
4892                HashMap::from([
4893                    (
4894                        "avro.namespace".to_string(),
4895                        "topLevelRecord.value".to_string(),
4896                    ),
4897                    ("avro.name".to_string(), "fixed".to_string()),
4898                ]),
4899            ),
4900            (
4901                "test/data/int256_decimal.avro",
4902                DataType::Decimal256(76, 10),
4903                HashMap::new(),
4904            ),
4905            (
4906                "test/data/fixed256_decimal.avro",
4907                DataType::Decimal256(76, 10),
4908                HashMap::from([("avro.name".to_string(), "Decimal256Fixed".to_string())]),
4909            ),
4910            (
4911                "test/data/fixed_length_decimal_legacy_32.avro",
4912                DataType::Decimal128(9, 2),
4913                HashMap::from([("avro.name".to_string(), "Decimal32FixedLegacy".to_string())]),
4914            ),
4915            (
4916                "test/data/int128_decimal.avro",
4917                DataType::Decimal128(38, 2),
4918                HashMap::new(),
4919            ),
4920        ];
4921        for (file, expected_dt, mut metadata) in files {
4922            let (precision, scale) = match expected_dt {
4923                DataType::Decimal32(p, s)
4924                | DataType::Decimal64(p, s)
4925                | DataType::Decimal128(p, s)
4926                | DataType::Decimal256(p, s) => (p, s),
4927                _ => unreachable!("Unexpected decimal type in test inputs"),
4928            };
4929            assert!(scale >= 0, "test data uses non-negative scales only");
4930            let scale_u32 = scale as u32;
4931            let file_path: String = if file.starts_with("avro/") {
4932                arrow_test_data(file)
4933            } else {
4934                std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
4935                    .join(file)
4936                    .to_string_lossy()
4937                    .into_owned()
4938            };
4939            let pow10: i128 = 10i128.pow(scale_u32);
4940            let values_i128: Vec<i128> = (1..=24).map(|n| (n as i128) * pow10).collect();
4941            let build_expected = |dt: &DataType, values: &[i128]| -> ArrayRef {
4942                match *dt {
4943                    #[cfg(feature = "small_decimals")]
4944                    DataType::Decimal32(p, s) => {
4945                        let it = values.iter().map(|&v| v as i32);
4946                        Arc::new(
4947                            Decimal32Array::from_iter_values(it)
4948                                .with_precision_and_scale(p, s)
4949                                .unwrap(),
4950                        )
4951                    }
4952                    #[cfg(feature = "small_decimals")]
4953                    DataType::Decimal64(p, s) => {
4954                        let it = values.iter().map(|&v| v as i64);
4955                        Arc::new(
4956                            Decimal64Array::from_iter_values(it)
4957                                .with_precision_and_scale(p, s)
4958                                .unwrap(),
4959                        )
4960                    }
4961                    DataType::Decimal128(p, s) => {
4962                        let it = values.iter().copied();
4963                        Arc::new(
4964                            Decimal128Array::from_iter_values(it)
4965                                .with_precision_and_scale(p, s)
4966                                .unwrap(),
4967                        )
4968                    }
4969                    DataType::Decimal256(p, s) => {
4970                        let it = values.iter().map(|&v| i256::from_i128(v));
4971                        Arc::new(
4972                            Decimal256Array::from_iter_values(it)
4973                                .with_precision_and_scale(p, s)
4974                                .unwrap(),
4975                        )
4976                    }
4977                    _ => unreachable!("Unexpected decimal type in test"),
4978                }
4979            };
4980            let actual_batch = read_file(&file_path, 8, false);
4981            let actual_nullable = actual_batch.schema().field(0).is_nullable();
4982            let expected_array = build_expected(&expected_dt, &values_i128);
4983            metadata.insert("precision".to_string(), precision.to_string());
4984            metadata.insert("scale".to_string(), scale.to_string());
4985            let field =
4986                Field::new("value", expected_dt.clone(), actual_nullable).with_metadata(metadata);
4987            let expected_schema = Arc::new(Schema::new(vec![field]));
4988            let expected_batch =
4989                RecordBatch::try_new(expected_schema.clone(), vec![expected_array]).unwrap();
4990            assert_eq!(
4991                actual_batch, expected_batch,
4992                "Decoded RecordBatch does not match for {file}"
4993            );
4994            let actual_batch_small = read_file(&file_path, 3, false);
4995            assert_eq!(
4996                actual_batch_small, expected_batch,
4997                "Decoded RecordBatch does not match for {file} with batch size 3"
4998            );
4999        }
5000    }
5001
5002    #[test]
5003    fn test_read_duration_logical_types_feature_toggle() -> Result<(), ArrowError> {
5004        let file_path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
5005            .join("test/data/duration_logical_types.avro")
5006            .to_string_lossy()
5007            .into_owned();
5008
5009        let actual_batch = read_file(&file_path, 4, false);
5010
5011        let expected_batch = {
5012            #[cfg(feature = "avro_custom_types")]
5013            {
5014                let schema = Arc::new(Schema::new(vec![
5015                    Field::new(
5016                        "duration_time_nanos",
5017                        DataType::Duration(TimeUnit::Nanosecond),
5018                        false,
5019                    ),
5020                    Field::new(
5021                        "duration_time_micros",
5022                        DataType::Duration(TimeUnit::Microsecond),
5023                        false,
5024                    ),
5025                    Field::new(
5026                        "duration_time_millis",
5027                        DataType::Duration(TimeUnit::Millisecond),
5028                        false,
5029                    ),
5030                    Field::new(
5031                        "duration_time_seconds",
5032                        DataType::Duration(TimeUnit::Second),
5033                        false,
5034                    ),
5035                ]));
5036
5037                let nanos = Arc::new(PrimitiveArray::<DurationNanosecondType>::from(vec![
5038                    10, 20, 30, 40,
5039                ])) as ArrayRef;
5040                let micros = Arc::new(PrimitiveArray::<DurationMicrosecondType>::from(vec![
5041                    100, 200, 300, 400,
5042                ])) as ArrayRef;
5043                let millis = Arc::new(PrimitiveArray::<DurationMillisecondType>::from(vec![
5044                    1000, 2000, 3000, 4000,
5045                ])) as ArrayRef;
5046                let seconds = Arc::new(PrimitiveArray::<DurationSecondType>::from(vec![1, 2, 3, 4]))
5047                    as ArrayRef;
5048
5049                RecordBatch::try_new(schema, vec![nanos, micros, millis, seconds])?
5050            }
5051            #[cfg(not(feature = "avro_custom_types"))]
5052            {
5053                let schema = Arc::new(Schema::new(vec![
5054                    Field::new("duration_time_nanos", DataType::Int64, false).with_metadata(
5055                        [(
5056                            "logicalType".to_string(),
5057                            "arrow.duration-nanos".to_string(),
5058                        )]
5059                        .into(),
5060                    ),
5061                    Field::new("duration_time_micros", DataType::Int64, false).with_metadata(
5062                        [(
5063                            "logicalType".to_string(),
5064                            "arrow.duration-micros".to_string(),
5065                        )]
5066                        .into(),
5067                    ),
5068                    Field::new("duration_time_millis", DataType::Int64, false).with_metadata(
5069                        [(
5070                            "logicalType".to_string(),
5071                            "arrow.duration-millis".to_string(),
5072                        )]
5073                        .into(),
5074                    ),
5075                    Field::new("duration_time_seconds", DataType::Int64, false).with_metadata(
5076                        [(
5077                            "logicalType".to_string(),
5078                            "arrow.duration-seconds".to_string(),
5079                        )]
5080                        .into(),
5081                    ),
5082                ]));
5083
5084                let nanos =
5085                    Arc::new(PrimitiveArray::<Int64Type>::from(vec![10, 20, 30, 40])) as ArrayRef;
5086                let micros = Arc::new(PrimitiveArray::<Int64Type>::from(vec![100, 200, 300, 400]))
5087                    as ArrayRef;
5088                let millis = Arc::new(PrimitiveArray::<Int64Type>::from(vec![
5089                    1000, 2000, 3000, 4000,
5090                ])) as ArrayRef;
5091                let seconds =
5092                    Arc::new(PrimitiveArray::<Int64Type>::from(vec![1, 2, 3, 4])) as ArrayRef;
5093
5094                RecordBatch::try_new(schema, vec![nanos, micros, millis, seconds])?
5095            }
5096        };
5097
5098        assert_eq!(actual_batch, expected_batch);
5099
5100        Ok(())
5101    }
5102
5103    #[test]
5104    // TODO: avoid requiring snappy for this file
5105    #[cfg(feature = "snappy")]
5106    fn test_dict_pages_offset_zero() {
5107        let file = arrow_test_data("avro/dict-page-offset-zero.avro");
5108        let batch = read_file(&file, 32, false);
5109        let num_rows = batch.num_rows();
5110        let expected_field = Int32Array::from(vec![Some(1552); num_rows]);
5111        let expected = RecordBatch::try_from_iter_with_nullable([(
5112            "l_partkey",
5113            Arc::new(expected_field) as Arc<dyn Array>,
5114            true,
5115        )])
5116        .unwrap();
5117        assert_eq!(batch, expected);
5118    }
5119
5120    #[test]
5121    // TODO: avoid requiring snappy for this file
5122    #[cfg(feature = "snappy")]
5123    fn test_list_columns() {
5124        let file = arrow_test_data("avro/list_columns.avro");
5125        let mut int64_list_builder = ListBuilder::new(Int64Builder::new());
5126        {
5127            {
5128                let values = int64_list_builder.values();
5129                values.append_value(1);
5130                values.append_value(2);
5131                values.append_value(3);
5132            }
5133            int64_list_builder.append(true);
5134        }
5135        {
5136            {
5137                let values = int64_list_builder.values();
5138                values.append_null();
5139                values.append_value(1);
5140            }
5141            int64_list_builder.append(true);
5142        }
5143        {
5144            {
5145                let values = int64_list_builder.values();
5146                values.append_value(4);
5147            }
5148            int64_list_builder.append(true);
5149        }
5150        let int64_list = int64_list_builder.finish();
5151        let mut utf8_list_builder = ListBuilder::new(StringBuilder::new());
5152        {
5153            {
5154                let values = utf8_list_builder.values();
5155                values.append_value("abc");
5156                values.append_value("efg");
5157                values.append_value("hij");
5158            }
5159            utf8_list_builder.append(true);
5160        }
5161        {
5162            utf8_list_builder.append(false);
5163        }
5164        {
5165            {
5166                let values = utf8_list_builder.values();
5167                values.append_value("efg");
5168                values.append_null();
5169                values.append_value("hij");
5170                values.append_value("xyz");
5171            }
5172            utf8_list_builder.append(true);
5173        }
5174        let utf8_list = utf8_list_builder.finish();
5175        let expected = RecordBatch::try_from_iter_with_nullable([
5176            ("int64_list", Arc::new(int64_list) as Arc<dyn Array>, true),
5177            ("utf8_list", Arc::new(utf8_list) as Arc<dyn Array>, true),
5178        ])
5179        .unwrap();
5180        let batch = read_file(&file, 8, false);
5181        assert_eq!(batch, expected);
5182    }
5183
5184    #[test]
5185    #[cfg(feature = "snappy")]
5186    fn test_nested_lists() {
5187        use arrow_data::ArrayDataBuilder;
5188        let file = arrow_test_data("avro/nested_lists.snappy.avro");
5189        let inner_values = StringArray::from(vec![
5190            Some("a"),
5191            Some("b"),
5192            Some("c"),
5193            Some("d"),
5194            Some("a"),
5195            Some("b"),
5196            Some("c"),
5197            Some("d"),
5198            Some("e"),
5199            Some("a"),
5200            Some("b"),
5201            Some("c"),
5202            Some("d"),
5203            Some("e"),
5204            Some("f"),
5205        ]);
5206        let inner_offsets = Buffer::from_slice_ref([0, 2, 3, 3, 4, 6, 8, 8, 9, 11, 13, 14, 14, 15]);
5207        let inner_validity = [
5208            true, true, false, true, true, true, false, true, true, true, true, false, true,
5209        ];
5210        let inner_null_buffer = Buffer::from_iter(inner_validity.iter().copied());
5211        let inner_field = Field::new("item", DataType::Utf8, true);
5212        let inner_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(inner_field)))
5213            .len(13)
5214            .add_buffer(inner_offsets)
5215            .add_child_data(inner_values.to_data())
5216            .null_bit_buffer(Some(inner_null_buffer))
5217            .build()
5218            .unwrap();
5219        let inner_list_array = ListArray::from(inner_list_data);
5220        let middle_offsets = Buffer::from_slice_ref([0, 2, 4, 6, 8, 11, 13]);
5221        let middle_validity = [true; 6];
5222        let middle_null_buffer = Buffer::from_iter(middle_validity.iter().copied());
5223        let middle_field = Field::new("item", inner_list_array.data_type().clone(), true);
5224        let middle_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(middle_field)))
5225            .len(6)
5226            .add_buffer(middle_offsets)
5227            .add_child_data(inner_list_array.to_data())
5228            .null_bit_buffer(Some(middle_null_buffer))
5229            .build()
5230            .unwrap();
5231        let middle_list_array = ListArray::from(middle_list_data);
5232        let outer_offsets = Buffer::from_slice_ref([0, 2, 4, 6]);
5233        let outer_null_buffer = Buffer::from_slice_ref([0b111]); // all 3 rows valid
5234        let outer_field = Field::new("item", middle_list_array.data_type().clone(), true);
5235        let outer_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(outer_field)))
5236            .len(3)
5237            .add_buffer(outer_offsets)
5238            .add_child_data(middle_list_array.to_data())
5239            .null_bit_buffer(Some(outer_null_buffer))
5240            .build()
5241            .unwrap();
5242        let a_expected = ListArray::from(outer_list_data);
5243        let b_expected = Int32Array::from(vec![1, 1, 1]);
5244        let expected = RecordBatch::try_from_iter_with_nullable([
5245            ("a", Arc::new(a_expected) as Arc<dyn Array>, true),
5246            ("b", Arc::new(b_expected) as Arc<dyn Array>, true),
5247        ])
5248        .unwrap();
5249        let left = read_file(&file, 8, false);
5250        assert_eq!(left, expected, "Mismatch for batch size=8");
5251        let left_small = read_file(&file, 3, false);
5252        assert_eq!(left_small, expected, "Mismatch for batch size=3");
5253    }
5254
5255    #[test]
5256    fn test_simple() {
5257        let tests = [
5258            ("avro/simple_enum.avro", 4, build_expected_enum(), 2),
5259            ("avro/simple_fixed.avro", 2, build_expected_fixed(), 1),
5260        ];
5261
5262        fn build_expected_enum() -> RecordBatch {
5263            // Build the DictionaryArrays for f1, f2, f3
5264            let keys_f1 = Int32Array::from(vec![0, 1, 2, 3]);
5265            let vals_f1 = StringArray::from(vec!["a", "b", "c", "d"]);
5266            let f1_dict =
5267                DictionaryArray::<Int32Type>::try_new(keys_f1, Arc::new(vals_f1)).unwrap();
5268            let keys_f2 = Int32Array::from(vec![2, 3, 0, 1]);
5269            let vals_f2 = StringArray::from(vec!["e", "f", "g", "h"]);
5270            let f2_dict =
5271                DictionaryArray::<Int32Type>::try_new(keys_f2, Arc::new(vals_f2)).unwrap();
5272            let keys_f3 = Int32Array::from(vec![Some(1), Some(2), None, Some(0)]);
5273            let vals_f3 = StringArray::from(vec!["i", "j", "k"]);
5274            let f3_dict =
5275                DictionaryArray::<Int32Type>::try_new(keys_f3, Arc::new(vals_f3)).unwrap();
5276            let dict_type =
5277                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
5278            let mut md_f1 = HashMap::new();
5279            md_f1.insert(
5280                AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5281                r#"["a","b","c","d"]"#.to_string(),
5282            );
5283            md_f1.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum1".to_string());
5284            md_f1.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns1".to_string());
5285            let f1_field = Field::new("f1", dict_type.clone(), false).with_metadata(md_f1);
5286            let mut md_f2 = HashMap::new();
5287            md_f2.insert(
5288                AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5289                r#"["e","f","g","h"]"#.to_string(),
5290            );
5291            md_f2.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum2".to_string());
5292            md_f2.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns2".to_string());
5293            let f2_field = Field::new("f2", dict_type.clone(), false).with_metadata(md_f2);
5294            let mut md_f3 = HashMap::new();
5295            md_f3.insert(
5296                AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5297                r#"["i","j","k"]"#.to_string(),
5298            );
5299            md_f3.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum3".to_string());
5300            md_f3.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns1".to_string());
5301            let f3_field = Field::new("f3", dict_type.clone(), true).with_metadata(md_f3);
5302            let expected_schema = Arc::new(Schema::new(vec![f1_field, f2_field, f3_field]));
5303            RecordBatch::try_new(
5304                expected_schema,
5305                vec![
5306                    Arc::new(f1_dict) as Arc<dyn Array>,
5307                    Arc::new(f2_dict) as Arc<dyn Array>,
5308                    Arc::new(f3_dict) as Arc<dyn Array>,
5309                ],
5310            )
5311            .unwrap()
5312        }
5313
5314        fn build_expected_fixed() -> RecordBatch {
5315            let f1 =
5316                FixedSizeBinaryArray::try_from_iter(vec![b"abcde", b"12345"].into_iter()).unwrap();
5317            let f2 =
5318                FixedSizeBinaryArray::try_from_iter(vec![b"fghijklmno", b"1234567890"].into_iter())
5319                    .unwrap();
5320            let f3 = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
5321                vec![Some(b"ABCDEF" as &[u8]), None].into_iter(),
5322                6,
5323            )
5324            .unwrap();
5325
5326            // Add Avro named-type metadata for fixed fields
5327            let mut md_f1 = HashMap::new();
5328            md_f1.insert(
5329                crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5330                "fixed1".to_string(),
5331            );
5332            md_f1.insert(
5333                crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5334                "ns1".to_string(),
5335            );
5336
5337            let mut md_f2 = HashMap::new();
5338            md_f2.insert(
5339                crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5340                "fixed2".to_string(),
5341            );
5342            md_f2.insert(
5343                crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5344                "ns2".to_string(),
5345            );
5346
5347            let mut md_f3 = HashMap::new();
5348            md_f3.insert(
5349                crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5350                "fixed3".to_string(),
5351            );
5352            md_f3.insert(
5353                crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5354                "ns1".to_string(),
5355            );
5356
5357            let expected_schema = Arc::new(Schema::new(vec![
5358                Field::new("f1", DataType::FixedSizeBinary(5), false).with_metadata(md_f1),
5359                Field::new("f2", DataType::FixedSizeBinary(10), false).with_metadata(md_f2),
5360                Field::new("f3", DataType::FixedSizeBinary(6), true).with_metadata(md_f3),
5361            ]));
5362
5363            RecordBatch::try_new(
5364                expected_schema,
5365                vec![
5366                    Arc::new(f1) as Arc<dyn Array>,
5367                    Arc::new(f2) as Arc<dyn Array>,
5368                    Arc::new(f3) as Arc<dyn Array>,
5369                ],
5370            )
5371            .unwrap()
5372        }
5373        for (file_name, batch_size, expected, alt_batch_size) in tests {
5374            let file = arrow_test_data(file_name);
5375            let actual = read_file(&file, batch_size, false);
5376            assert_eq!(actual, expected);
5377            let actual2 = read_file(&file, alt_batch_size, false);
5378            assert_eq!(actual2, expected);
5379        }
5380    }
5381
5382    #[test]
5383    #[cfg(feature = "snappy")]
5384    fn test_single_nan() {
5385        let file = arrow_test_data("avro/single_nan.avro");
5386        let actual = read_file(&file, 1, false);
5387        use arrow_array::Float64Array;
5388        let schema = Arc::new(Schema::new(vec![Field::new(
5389            "mycol",
5390            DataType::Float64,
5391            true,
5392        )]));
5393        let col = Float64Array::from(vec![None]);
5394        let expected = RecordBatch::try_new(schema, vec![Arc::new(col)]).unwrap();
5395        assert_eq!(actual, expected);
5396        let actual2 = read_file(&file, 2, false);
5397        assert_eq!(actual2, expected);
5398    }
5399
5400    #[test]
5401    fn test_duration_uuid() {
5402        let batch = read_file("test/data/duration_uuid.avro", 4, false);
5403        let schema = batch.schema();
5404        let fields = schema.fields();
5405        assert_eq!(fields.len(), 2);
5406        assert_eq!(fields[0].name(), "duration_field");
5407        assert_eq!(
5408            fields[0].data_type(),
5409            &DataType::Interval(IntervalUnit::MonthDayNano)
5410        );
5411        assert_eq!(fields[1].name(), "uuid_field");
5412        assert_eq!(fields[1].data_type(), &DataType::FixedSizeBinary(16));
5413        assert_eq!(batch.num_rows(), 4);
5414        assert_eq!(batch.num_columns(), 2);
5415        let duration_array = batch
5416            .column(0)
5417            .as_any()
5418            .downcast_ref::<IntervalMonthDayNanoArray>()
5419            .unwrap();
5420        let expected_duration_array: IntervalMonthDayNanoArray = [
5421            Some(IntervalMonthDayNanoType::make_value(1, 15, 500_000_000)),
5422            Some(IntervalMonthDayNanoType::make_value(0, 5, 2_500_000_000)),
5423            Some(IntervalMonthDayNanoType::make_value(2, 0, 0)),
5424            Some(IntervalMonthDayNanoType::make_value(12, 31, 999_000_000)),
5425        ]
5426        .iter()
5427        .copied()
5428        .collect();
5429        assert_eq!(&expected_duration_array, duration_array);
5430        let uuid_array = batch
5431            .column(1)
5432            .as_any()
5433            .downcast_ref::<FixedSizeBinaryArray>()
5434            .unwrap();
5435        let expected_uuid_array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
5436            [
5437                Some([
5438                    0xfe, 0x7b, 0xc3, 0x0b, 0x4c, 0xe8, 0x4c, 0x5e, 0xb6, 0x7c, 0x22, 0x34, 0xa2,
5439                    0xd3, 0x8e, 0x66,
5440                ]),
5441                Some([
5442                    0xb3, 0x3f, 0x2a, 0xd7, 0x97, 0xb4, 0x4d, 0xe1, 0x8b, 0xfe, 0x94, 0x94, 0x1d,
5443                    0x60, 0x15, 0x6e,
5444                ]),
5445                Some([
5446                    0x5f, 0x74, 0x92, 0x64, 0x07, 0x4b, 0x40, 0x05, 0x84, 0xbf, 0x11, 0x5e, 0xa8,
5447                    0x4e, 0xd2, 0x0a,
5448                ]),
5449                Some([
5450                    0x08, 0x26, 0xcc, 0x06, 0xd2, 0xe3, 0x45, 0x99, 0xb4, 0xad, 0xaf, 0x5f, 0xa6,
5451                    0x90, 0x5c, 0xdb,
5452                ]),
5453            ]
5454            .into_iter(),
5455            16,
5456        )
5457        .unwrap();
5458        assert_eq!(&expected_uuid_array, uuid_array);
5459    }
5460
5461    #[test]
5462    #[cfg(feature = "snappy")]
5463    fn test_datapage_v2() {
5464        let file = arrow_test_data("avro/datapage_v2.snappy.avro");
5465        let batch = read_file(&file, 8, false);
5466        let a = StringArray::from(vec![
5467            Some("abc"),
5468            Some("abc"),
5469            Some("abc"),
5470            None,
5471            Some("abc"),
5472        ]);
5473        let b = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4), Some(5)]);
5474        let c = Float64Array::from(vec![Some(2.0), Some(3.0), Some(4.0), Some(5.0), Some(2.0)]);
5475        let d = BooleanArray::from(vec![
5476            Some(true),
5477            Some(true),
5478            Some(true),
5479            Some(false),
5480            Some(true),
5481        ]);
5482        let e_values = Int32Array::from(vec![
5483            Some(1),
5484            Some(2),
5485            Some(3),
5486            Some(1),
5487            Some(2),
5488            Some(3),
5489            Some(1),
5490            Some(2),
5491        ]);
5492        let e_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 3, 3, 3, 6, 8]));
5493        let e_validity = Some(NullBuffer::from(vec![true, false, false, true, true]));
5494        let field_e = Arc::new(Field::new("item", DataType::Int32, true));
5495        let e = ListArray::new(field_e, e_offsets, Arc::new(e_values), e_validity);
5496        let expected = RecordBatch::try_from_iter_with_nullable([
5497            ("a", Arc::new(a) as Arc<dyn Array>, true),
5498            ("b", Arc::new(b) as Arc<dyn Array>, true),
5499            ("c", Arc::new(c) as Arc<dyn Array>, true),
5500            ("d", Arc::new(d) as Arc<dyn Array>, true),
5501            ("e", Arc::new(e) as Arc<dyn Array>, true),
5502        ])
5503        .unwrap();
5504        assert_eq!(batch, expected);
5505    }
5506
5507    #[test]
5508    fn test_nested_records() {
5509        let f1_f1_1 = StringArray::from(vec!["aaa", "bbb"]);
5510        let f1_f1_2 = Int32Array::from(vec![10, 20]);
5511        let rounded_pi = (std::f64::consts::PI * 100.0).round() / 100.0;
5512        let f1_f1_3_1 = Float64Array::from(vec![rounded_pi, rounded_pi]);
5513        let f1_f1_3 = StructArray::from(vec![(
5514            Arc::new(Field::new("f1_3_1", DataType::Float64, false)),
5515            Arc::new(f1_f1_3_1) as Arc<dyn Array>,
5516        )]);
5517        // Add Avro named-type metadata to nested field f1_3 (ns3.record3)
5518        let mut f1_3_md: HashMap<String, String> = HashMap::new();
5519        f1_3_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns3".to_string());
5520        f1_3_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record3".to_string());
5521        let f1_expected = StructArray::from(vec![
5522            (
5523                Arc::new(Field::new("f1_1", DataType::Utf8, false)),
5524                Arc::new(f1_f1_1) as Arc<dyn Array>,
5525            ),
5526            (
5527                Arc::new(Field::new("f1_2", DataType::Int32, false)),
5528                Arc::new(f1_f1_2) as Arc<dyn Array>,
5529            ),
5530            (
5531                Arc::new(
5532                    Field::new(
5533                        "f1_3",
5534                        DataType::Struct(Fields::from(vec![Field::new(
5535                            "f1_3_1",
5536                            DataType::Float64,
5537                            false,
5538                        )])),
5539                        false,
5540                    )
5541                    .with_metadata(f1_3_md),
5542                ),
5543                Arc::new(f1_f1_3) as Arc<dyn Array>,
5544            ),
5545        ]);
5546        let f2_fields = vec![
5547            Field::new("f2_1", DataType::Boolean, false),
5548            Field::new("f2_2", DataType::Float32, false),
5549        ];
5550        let f2_struct_builder = StructBuilder::new(
5551            f2_fields
5552                .iter()
5553                .map(|f| Arc::new(f.clone()))
5554                .collect::<Vec<Arc<Field>>>(),
5555            vec![
5556                Box::new(BooleanBuilder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>,
5557                Box::new(Float32Builder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>,
5558            ],
5559        );
5560        let mut f2_list_builder = ListBuilder::new(f2_struct_builder);
5561        {
5562            let struct_builder = f2_list_builder.values();
5563            struct_builder.append(true);
5564            {
5565                let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
5566                b.append_value(true);
5567            }
5568            {
5569                let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
5570                b.append_value(1.2_f32);
5571            }
5572            struct_builder.append(true);
5573            {
5574                let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
5575                b.append_value(true);
5576            }
5577            {
5578                let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
5579                b.append_value(2.2_f32);
5580            }
5581            f2_list_builder.append(true);
5582        }
5583        {
5584            let struct_builder = f2_list_builder.values();
5585            struct_builder.append(true);
5586            {
5587                let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
5588                b.append_value(false);
5589            }
5590            {
5591                let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
5592                b.append_value(10.2_f32);
5593            }
5594            f2_list_builder.append(true);
5595        }
5596
5597        let list_array_with_nullable_items = f2_list_builder.finish();
5598        // Add Avro named-type metadata to f2's list item (ns4.record4)
5599        let mut f2_item_md: HashMap<String, String> = HashMap::new();
5600        f2_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record4".to_string());
5601        f2_item_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns4".to_string());
5602        let item_field = Arc::new(
5603            Field::new(
5604                "item",
5605                list_array_with_nullable_items.values().data_type().clone(),
5606                false, // items are non-nullable for f2
5607            )
5608            .with_metadata(f2_item_md),
5609        );
5610        let list_data_type = DataType::List(item_field);
5611        let f2_array_data = list_array_with_nullable_items
5612            .to_data()
5613            .into_builder()
5614            .data_type(list_data_type)
5615            .build()
5616            .unwrap();
5617        let f2_expected = ListArray::from(f2_array_data);
5618        let mut f3_struct_builder = StructBuilder::new(
5619            vec![Arc::new(Field::new("f3_1", DataType::Utf8, false))],
5620            vec![Box::new(StringBuilder::new()) as Box<dyn ArrayBuilder>],
5621        );
5622        f3_struct_builder.append(true);
5623        {
5624            let b = f3_struct_builder.field_builder::<StringBuilder>(0).unwrap();
5625            b.append_value("xyz");
5626        }
5627        f3_struct_builder.append(false);
5628        {
5629            let b = f3_struct_builder.field_builder::<StringBuilder>(0).unwrap();
5630            b.append_null();
5631        }
5632        let f3_expected = f3_struct_builder.finish();
5633        let f4_fields = [Field::new("f4_1", DataType::Int64, false)];
5634        let f4_struct_builder = StructBuilder::new(
5635            f4_fields
5636                .iter()
5637                .map(|f| Arc::new(f.clone()))
5638                .collect::<Vec<Arc<Field>>>(),
5639            vec![Box::new(Int64Builder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>],
5640        );
5641        let mut f4_list_builder = ListBuilder::new(f4_struct_builder);
5642        {
5643            let struct_builder = f4_list_builder.values();
5644            struct_builder.append(true);
5645            {
5646                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
5647                b.append_value(200);
5648            }
5649            struct_builder.append(false);
5650            {
5651                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
5652                b.append_null();
5653            }
5654            f4_list_builder.append(true);
5655        }
5656        {
5657            let struct_builder = f4_list_builder.values();
5658            struct_builder.append(false);
5659            {
5660                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
5661                b.append_null();
5662            }
5663            struct_builder.append(true);
5664            {
5665                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
5666                b.append_value(300);
5667            }
5668            f4_list_builder.append(true);
5669        }
5670        let f4_expected = f4_list_builder.finish();
5671        // Add Avro named-type metadata to f4's list item (ns6.record6), item is nullable
5672        let mut f4_item_md: HashMap<String, String> = HashMap::new();
5673        f4_item_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns6".to_string());
5674        f4_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record6".to_string());
5675        let f4_item_field = Arc::new(
5676            Field::new("item", f4_expected.values().data_type().clone(), true)
5677                .with_metadata(f4_item_md),
5678        );
5679        let f4_list_data_type = DataType::List(f4_item_field);
5680        let f4_array_data = f4_expected
5681            .to_data()
5682            .into_builder()
5683            .data_type(f4_list_data_type)
5684            .build()
5685            .unwrap();
5686        let f4_expected = ListArray::from(f4_array_data);
5687        // Build Schema with Avro named-type metadata on the top-level f1 and f3 fields
5688        let mut f1_md: HashMap<String, String> = HashMap::new();
5689        f1_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record2".to_string());
5690        f1_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns2".to_string());
5691        let mut f3_md: HashMap<String, String> = HashMap::new();
5692        f3_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns5".to_string());
5693        f3_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record5".to_string());
5694        let expected_schema = Schema::new(vec![
5695            Field::new("f1", f1_expected.data_type().clone(), false).with_metadata(f1_md),
5696            Field::new("f2", f2_expected.data_type().clone(), false),
5697            Field::new("f3", f3_expected.data_type().clone(), true).with_metadata(f3_md),
5698            Field::new("f4", f4_expected.data_type().clone(), false),
5699        ]);
5700        let expected = RecordBatch::try_new(
5701            Arc::new(expected_schema),
5702            vec![
5703                Arc::new(f1_expected) as Arc<dyn Array>,
5704                Arc::new(f2_expected) as Arc<dyn Array>,
5705                Arc::new(f3_expected) as Arc<dyn Array>,
5706                Arc::new(f4_expected) as Arc<dyn Array>,
5707            ],
5708        )
5709        .unwrap();
5710        let file = arrow_test_data("avro/nested_records.avro");
5711        let batch_large = read_file(&file, 8, false);
5712        assert_eq!(
5713            batch_large, expected,
5714            "Decoded RecordBatch does not match expected data for nested records (batch size 8)"
5715        );
5716        let batch_small = read_file(&file, 3, false);
5717        assert_eq!(
5718            batch_small, expected,
5719            "Decoded RecordBatch does not match expected data for nested records (batch size 3)"
5720        );
5721    }
5722
5723    #[test]
5724    // TODO: avoid requiring snappy for this file
5725    #[cfg(feature = "snappy")]
5726    fn test_repeated_no_annotation() {
5727        use arrow_data::ArrayDataBuilder;
5728        let file = arrow_test_data("avro/repeated_no_annotation.avro");
5729        let batch_large = read_file(&file, 8, false);
5730        // id column
5731        let id_array = Int32Array::from(vec![1, 2, 3, 4, 5, 6]);
5732        // Build the inner Struct<number:int64, kind:utf8>
5733        let number_array = Int64Array::from(vec![
5734            Some(5555555555),
5735            Some(1111111111),
5736            Some(1111111111),
5737            Some(2222222222),
5738            Some(3333333333),
5739        ]);
5740        let kind_array =
5741            StringArray::from(vec![None, Some("home"), Some("home"), None, Some("mobile")]);
5742        let phone_fields = Fields::from(vec![
5743            Field::new("number", DataType::Int64, true),
5744            Field::new("kind", DataType::Utf8, true),
5745        ]);
5746        let phone_struct_data = ArrayDataBuilder::new(DataType::Struct(phone_fields))
5747            .len(5)
5748            .child_data(vec![number_array.into_data(), kind_array.into_data()])
5749            .build()
5750            .unwrap();
5751        let phone_struct_array = StructArray::from(phone_struct_data);
5752        // Build List<item: Struct<...>> with Avro named-type metadata on the *element* field
5753        let phone_list_offsets = Buffer::from_slice_ref([0i32, 0, 0, 0, 1, 2, 5]);
5754        let phone_list_validity = Buffer::from_iter([false, false, true, true, true, true]);
5755        // The Avro schema names this inner record "phone" in namespace "topLevelRecord.phoneNumbers"
5756        let mut phone_item_md = HashMap::new();
5757        phone_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "phone".to_string());
5758        phone_item_md.insert(
5759            AVRO_NAMESPACE_METADATA_KEY.to_string(),
5760            "topLevelRecord.phoneNumbers".to_string(),
5761        );
5762        let phone_item_field = Field::new("item", phone_struct_array.data_type().clone(), true)
5763            .with_metadata(phone_item_md);
5764        let phone_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(phone_item_field)))
5765            .len(6)
5766            .add_buffer(phone_list_offsets)
5767            .null_bit_buffer(Some(phone_list_validity))
5768            .child_data(vec![phone_struct_array.into_data()])
5769            .build()
5770            .unwrap();
5771        let phone_list_array = ListArray::from(phone_list_data);
5772        // Wrap in Struct { phone: List<...> }
5773        let phone_numbers_validity = Buffer::from_iter([false, false, true, true, true, true]);
5774        let phone_numbers_field = Field::new("phone", phone_list_array.data_type().clone(), true);
5775        let phone_numbers_struct_data =
5776            ArrayDataBuilder::new(DataType::Struct(Fields::from(vec![phone_numbers_field])))
5777                .len(6)
5778                .null_bit_buffer(Some(phone_numbers_validity))
5779                .child_data(vec![phone_list_array.into_data()])
5780                .build()
5781                .unwrap();
5782        let phone_numbers_struct_array = StructArray::from(phone_numbers_struct_data);
5783        // Build the expected Schema, annotating the top-level "phoneNumbers" field with Avro name/namespace
5784        let mut phone_numbers_md = HashMap::new();
5785        phone_numbers_md.insert(
5786            AVRO_NAME_METADATA_KEY.to_string(),
5787            "phoneNumbers".to_string(),
5788        );
5789        phone_numbers_md.insert(
5790            AVRO_NAMESPACE_METADATA_KEY.to_string(),
5791            "topLevelRecord".to_string(),
5792        );
5793        let id_field = Field::new("id", DataType::Int32, true);
5794        let phone_numbers_schema_field = Field::new(
5795            "phoneNumbers",
5796            phone_numbers_struct_array.data_type().clone(),
5797            true,
5798        )
5799        .with_metadata(phone_numbers_md);
5800        let expected_schema = Schema::new(vec![id_field, phone_numbers_schema_field]);
5801        // Final expected RecordBatch (arrays already carry matching list-element metadata)
5802        let expected = RecordBatch::try_new(
5803            Arc::new(expected_schema),
5804            vec![
5805                Arc::new(id_array) as _,
5806                Arc::new(phone_numbers_struct_array) as _,
5807            ],
5808        )
5809        .unwrap();
5810        assert_eq!(batch_large, expected, "Mismatch for batch_size=8");
5811        let batch_small = read_file(&file, 3, false);
5812        assert_eq!(batch_small, expected, "Mismatch for batch_size=3");
5813    }
5814
5815    #[test]
5816    // TODO: avoid requiring snappy for this file
5817    #[cfg(feature = "snappy")]
5818    fn test_nonnullable_impala() {
5819        let file = arrow_test_data("avro/nonnullable.impala.avro");
5820        let id = Int64Array::from(vec![Some(8)]);
5821        let mut int_array_builder = ListBuilder::new(Int32Builder::new());
5822        {
5823            let vb = int_array_builder.values();
5824            vb.append_value(-1);
5825        }
5826        int_array_builder.append(true); // finalize one sub-list
5827        let int_array = int_array_builder.finish();
5828        let mut iaa_builder = ListBuilder::new(ListBuilder::new(Int32Builder::new()));
5829        {
5830            let inner_list_builder = iaa_builder.values();
5831            {
5832                let vb = inner_list_builder.values();
5833                vb.append_value(-1);
5834                vb.append_value(-2);
5835            }
5836            inner_list_builder.append(true);
5837            inner_list_builder.append(true);
5838        }
5839        iaa_builder.append(true);
5840        let int_array_array = iaa_builder.finish();
5841        let field_names = MapFieldNames {
5842            entry: "entries".to_string(),
5843            key: "key".to_string(),
5844            value: "value".to_string(),
5845        };
5846        let mut int_map_builder =
5847            MapBuilder::new(Some(field_names), StringBuilder::new(), Int32Builder::new());
5848        {
5849            let (keys, vals) = int_map_builder.entries();
5850            keys.append_value("k1");
5851            vals.append_value(-1);
5852        }
5853        int_map_builder.append(true).unwrap(); // finalize map for row 0
5854        let int_map = int_map_builder.finish();
5855        let field_names2 = MapFieldNames {
5856            entry: "entries".to_string(),
5857            key: "key".to_string(),
5858            value: "value".to_string(),
5859        };
5860        let mut ima_builder = ListBuilder::new(MapBuilder::new(
5861            Some(field_names2),
5862            StringBuilder::new(),
5863            Int32Builder::new(),
5864        ));
5865        {
5866            let map_builder = ima_builder.values();
5867            map_builder.append(true).unwrap();
5868            {
5869                let (keys, vals) = map_builder.entries();
5870                keys.append_value("k1");
5871                vals.append_value(1);
5872            }
5873            map_builder.append(true).unwrap();
5874            map_builder.append(true).unwrap();
5875            map_builder.append(true).unwrap();
5876        }
5877        ima_builder.append(true);
5878        let int_map_array_ = ima_builder.finish();
5879        // Helper metadata maps
5880        let meta_nested_struct: HashMap<String, String> = [
5881            ("avro.name", "nested_Struct"),
5882            ("avro.namespace", "topLevelRecord"),
5883        ]
5884        .into_iter()
5885        .map(|(k, v)| (k.to_string(), v.to_string()))
5886        .collect();
5887        let meta_c: HashMap<String, String> = [
5888            ("avro.name", "c"),
5889            ("avro.namespace", "topLevelRecord.nested_Struct"),
5890        ]
5891        .into_iter()
5892        .map(|(k, v)| (k.to_string(), v.to_string()))
5893        .collect();
5894        let meta_d_item_struct: HashMap<String, String> = [
5895            ("avro.name", "D"),
5896            ("avro.namespace", "topLevelRecord.nested_Struct.c"),
5897        ]
5898        .into_iter()
5899        .map(|(k, v)| (k.to_string(), v.to_string()))
5900        .collect();
5901        let meta_g_value: HashMap<String, String> = [
5902            ("avro.name", "G"),
5903            ("avro.namespace", "topLevelRecord.nested_Struct"),
5904        ]
5905        .into_iter()
5906        .map(|(k, v)| (k.to_string(), v.to_string()))
5907        .collect();
5908        let meta_h: HashMap<String, String> = [
5909            ("avro.name", "h"),
5910            ("avro.namespace", "topLevelRecord.nested_Struct.G"),
5911        ]
5912        .into_iter()
5913        .map(|(k, v)| (k.to_string(), v.to_string()))
5914        .collect();
5915        // Types used multiple times below
5916        let ef_struct_field = Arc::new(
5917            Field::new(
5918                "item",
5919                DataType::Struct(
5920                    vec![
5921                        Field::new("e", DataType::Int32, true),
5922                        Field::new("f", DataType::Utf8, true),
5923                    ]
5924                    .into(),
5925                ),
5926                true,
5927            )
5928            .with_metadata(meta_d_item_struct.clone()),
5929        );
5930        let d_inner_list_field = Arc::new(Field::new(
5931            "item",
5932            DataType::List(ef_struct_field.clone()),
5933            true,
5934        ));
5935        let d_field = Field::new("D", DataType::List(d_inner_list_field.clone()), true);
5936        // G.value.h.i : List<Float64>
5937        let i_list_field = Arc::new(Field::new("item", DataType::Float64, true));
5938        let i_field = Field::new("i", DataType::List(i_list_field.clone()), true);
5939        // G.value.h : Struct<{ i: List<Float64> }> with metadata (h)
5940        let h_field = Field::new("h", DataType::Struct(vec![i_field.clone()].into()), true)
5941            .with_metadata(meta_h.clone());
5942        // G.value : Struct<{ h: ... }> with metadata (G)
5943        let g_value_struct_field = Field::new(
5944            "value",
5945            DataType::Struct(vec![h_field.clone()].into()),
5946            true,
5947        )
5948        .with_metadata(meta_g_value.clone());
5949        // entries struct for Map G
5950        let entries_struct_field = Field::new(
5951            "entries",
5952            DataType::Struct(
5953                vec![
5954                    Field::new("key", DataType::Utf8, false),
5955                    g_value_struct_field.clone(),
5956                ]
5957                .into(),
5958            ),
5959            false,
5960        );
5961        // Top-level nested_Struct fields (include metadata on "c")
5962        let a_field = Arc::new(Field::new("a", DataType::Int32, true));
5963        let b_field = Arc::new(Field::new(
5964            "B",
5965            DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
5966            true,
5967        ));
5968        let c_field = Arc::new(
5969            Field::new("c", DataType::Struct(vec![d_field.clone()].into()), true)
5970                .with_metadata(meta_c.clone()),
5971        );
5972        let g_field = Arc::new(Field::new(
5973            "G",
5974            DataType::Map(Arc::new(entries_struct_field.clone()), false),
5975            true,
5976        ));
5977        // Now create builders that match these exact field types (so nested types carry metadata)
5978        let mut nested_sb = StructBuilder::new(
5979            vec![
5980                a_field.clone(),
5981                b_field.clone(),
5982                c_field.clone(),
5983                g_field.clone(),
5984            ],
5985            vec![
5986                Box::new(Int32Builder::new()),
5987                Box::new(ListBuilder::new(Int32Builder::new())),
5988                {
5989                    // builder for "c" with correctly typed "D" including metadata on inner list item
5990                    Box::new(StructBuilder::new(
5991                        vec![Arc::new(d_field.clone())],
5992                        vec![Box::new({
5993                            let ef_struct_builder = StructBuilder::new(
5994                                vec![
5995                                    Arc::new(Field::new("e", DataType::Int32, true)),
5996                                    Arc::new(Field::new("f", DataType::Utf8, true)),
5997                                ],
5998                                vec![
5999                                    Box::new(Int32Builder::new()),
6000                                    Box::new(StringBuilder::new()),
6001                                ],
6002                            );
6003                            // Inner list that holds Struct<e,f> with Avro named-type metadata ("D")
6004                            let list_of_ef = ListBuilder::new(ef_struct_builder)
6005                                .with_field(ef_struct_field.clone());
6006                            // Outer list for "D"
6007                            ListBuilder::new(list_of_ef)
6008                        })],
6009                    ))
6010                },
6011                {
6012                    let map_field_names = MapFieldNames {
6013                        entry: "entries".to_string(),
6014                        key: "key".to_string(),
6015                        value: "value".to_string(),
6016                    };
6017                    let i_list_builder = ListBuilder::new(Float64Builder::new());
6018                    let h_struct_builder = StructBuilder::new(
6019                        vec![Arc::new(Field::new(
6020                            "i",
6021                            DataType::List(i_list_field.clone()),
6022                            true,
6023                        ))],
6024                        vec![Box::new(i_list_builder)],
6025                    );
6026                    let g_value_builder = StructBuilder::new(
6027                        vec![Arc::new(
6028                            Field::new("h", DataType::Struct(vec![i_field.clone()].into()), true)
6029                                .with_metadata(meta_h.clone()),
6030                        )],
6031                        vec![Box::new(h_struct_builder)],
6032                    );
6033                    // Use with_values_field to attach metadata to "value" field in the map's entries
6034                    let map_builder = MapBuilder::new(
6035                        Some(map_field_names),
6036                        StringBuilder::new(),
6037                        g_value_builder,
6038                    )
6039                    .with_values_field(Arc::new(
6040                        Field::new(
6041                            "value",
6042                            DataType::Struct(vec![h_field.clone()].into()),
6043                            true,
6044                        )
6045                        .with_metadata(meta_g_value.clone()),
6046                    ));
6047
6048                    Box::new(map_builder)
6049                },
6050            ],
6051        );
6052        nested_sb.append(true);
6053        {
6054            let a_builder = nested_sb.field_builder::<Int32Builder>(0).unwrap();
6055            a_builder.append_value(-1);
6056        }
6057        {
6058            let b_builder = nested_sb
6059                .field_builder::<ListBuilder<Int32Builder>>(1)
6060                .unwrap();
6061            {
6062                let vb = b_builder.values();
6063                vb.append_value(-1);
6064            }
6065            b_builder.append(true);
6066        }
6067        {
6068            let c_struct_builder = nested_sb.field_builder::<StructBuilder>(2).unwrap();
6069            c_struct_builder.append(true);
6070            let d_list_builder = c_struct_builder
6071                .field_builder::<ListBuilder<ListBuilder<StructBuilder>>>(0)
6072                .unwrap();
6073            {
6074                let sub_list_builder = d_list_builder.values();
6075                {
6076                    let ef_struct = sub_list_builder.values();
6077                    ef_struct.append(true);
6078                    {
6079                        let e_b = ef_struct.field_builder::<Int32Builder>(0).unwrap();
6080                        e_b.append_value(-1);
6081                        let f_b = ef_struct.field_builder::<StringBuilder>(1).unwrap();
6082                        f_b.append_value("nonnullable");
6083                    }
6084                    sub_list_builder.append(true);
6085                }
6086                d_list_builder.append(true);
6087            }
6088        }
6089        {
6090            let g_map_builder = nested_sb
6091                .field_builder::<MapBuilder<StringBuilder, StructBuilder>>(3)
6092                .unwrap();
6093            g_map_builder.append(true).unwrap();
6094        }
6095        let nested_struct = nested_sb.finish();
6096        let schema = Arc::new(arrow_schema::Schema::new(vec![
6097            Field::new("ID", id.data_type().clone(), true),
6098            Field::new("Int_Array", int_array.data_type().clone(), true),
6099            Field::new("int_array_array", int_array_array.data_type().clone(), true),
6100            Field::new("Int_Map", int_map.data_type().clone(), true),
6101            Field::new("int_map_array", int_map_array_.data_type().clone(), true),
6102            Field::new("nested_Struct", nested_struct.data_type().clone(), true)
6103                .with_metadata(meta_nested_struct.clone()),
6104        ]));
6105        let expected = RecordBatch::try_new(
6106            schema,
6107            vec![
6108                Arc::new(id) as Arc<dyn Array>,
6109                Arc::new(int_array),
6110                Arc::new(int_array_array),
6111                Arc::new(int_map),
6112                Arc::new(int_map_array_),
6113                Arc::new(nested_struct),
6114            ],
6115        )
6116        .unwrap();
6117        let batch_large = read_file(&file, 8, false);
6118        assert_eq!(batch_large, expected, "Mismatch for batch_size=8");
6119        let batch_small = read_file(&file, 3, false);
6120        assert_eq!(batch_small, expected, "Mismatch for batch_size=3");
6121    }
6122
6123    #[test]
6124    fn test_nonnullable_impala_strict() {
6125        let file = arrow_test_data("avro/nonnullable.impala.avro");
6126        let err = read_file_strict(&file, 8, false).unwrap_err();
6127        assert!(err.to_string().contains(
6128            "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
6129        ));
6130    }
6131
6132    #[test]
6133    // TODO: avoid requiring snappy for this file
6134    #[cfg(feature = "snappy")]
6135    fn test_nullable_impala() {
6136        let file = arrow_test_data("avro/nullable.impala.avro");
6137        let batch1 = read_file(&file, 3, false);
6138        let batch2 = read_file(&file, 8, false);
6139        assert_eq!(batch1, batch2);
6140        let batch = batch1;
6141        assert_eq!(batch.num_rows(), 7);
6142        let id_array = batch
6143            .column(0)
6144            .as_any()
6145            .downcast_ref::<Int64Array>()
6146            .expect("id column should be an Int64Array");
6147        let expected_ids = [1, 2, 3, 4, 5, 6, 7];
6148        for (i, &expected_id) in expected_ids.iter().enumerate() {
6149            assert_eq!(id_array.value(i), expected_id, "Mismatch in id at row {i}",);
6150        }
6151        let int_array = batch
6152            .column(1)
6153            .as_any()
6154            .downcast_ref::<ListArray>()
6155            .expect("int_array column should be a ListArray");
6156        {
6157            let offsets = int_array.value_offsets();
6158            let start = offsets[0] as usize;
6159            let end = offsets[1] as usize;
6160            let values = int_array
6161                .values()
6162                .as_any()
6163                .downcast_ref::<Int32Array>()
6164                .expect("Values of int_array should be an Int32Array");
6165            let row0: Vec<Option<i32>> = (start..end).map(|i| Some(values.value(i))).collect();
6166            assert_eq!(
6167                row0,
6168                vec![Some(1), Some(2), Some(3)],
6169                "Mismatch in int_array row 0"
6170            );
6171        }
6172        let nested_struct = batch
6173            .column(5)
6174            .as_any()
6175            .downcast_ref::<StructArray>()
6176            .expect("nested_struct column should be a StructArray");
6177        let a_array = nested_struct
6178            .column_by_name("A")
6179            .expect("Field A should exist in nested_struct")
6180            .as_any()
6181            .downcast_ref::<Int32Array>()
6182            .expect("Field A should be an Int32Array");
6183        assert_eq!(a_array.value(0), 1, "Mismatch in nested_struct.A at row 0");
6184        assert!(
6185            !a_array.is_valid(1),
6186            "Expected null in nested_struct.A at row 1"
6187        );
6188        assert!(
6189            !a_array.is_valid(3),
6190            "Expected null in nested_struct.A at row 3"
6191        );
6192        assert_eq!(a_array.value(6), 7, "Mismatch in nested_struct.A at row 6");
6193    }
6194
6195    #[test]
6196    fn test_nullable_impala_strict() {
6197        let file = arrow_test_data("avro/nullable.impala.avro");
6198        let err = read_file_strict(&file, 8, false).unwrap_err();
6199        assert!(err.to_string().contains(
6200            "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
6201        ));
6202    }
6203
6204    #[test]
6205    fn test_nested_record_type_reuse() {
6206        // The .avro file has the following schema:
6207        // {
6208        // "type" : "record",
6209        // "name" : "Record",
6210        // "fields" : [ {
6211        //     "name" : "nested",
6212        //     "type" : {
6213        //     "type" : "record",
6214        //     "name" : "Nested",
6215        //     "fields" : [ {
6216        //         "name" : "nested_int",
6217        //         "type" : "int"
6218        //     } ]
6219        //     }
6220        // }, {
6221        //     "name" : "nestedRecord",
6222        //     "type" : "Nested"
6223        // }, {
6224        //     "name" : "nestedArray",
6225        //     "type" : {
6226        //     "type" : "array",
6227        //     "items" : "Nested"
6228        //     }
6229        // } ]
6230        // }
6231        let batch = read_file("test/data/nested_record_reuse.avro", 8, false);
6232        let schema = batch.schema();
6233
6234        // Verify schema structure
6235        assert_eq!(schema.fields().len(), 3);
6236        let fields = schema.fields();
6237        assert_eq!(fields[0].name(), "nested");
6238        assert_eq!(fields[1].name(), "nestedRecord");
6239        assert_eq!(fields[2].name(), "nestedArray");
6240        assert!(matches!(fields[0].data_type(), DataType::Struct(_)));
6241        assert!(matches!(fields[1].data_type(), DataType::Struct(_)));
6242        assert!(matches!(fields[2].data_type(), DataType::List(_)));
6243
6244        // Validate that the nested record type
6245        if let DataType::Struct(nested_fields) = fields[0].data_type() {
6246            assert_eq!(nested_fields.len(), 1);
6247            assert_eq!(nested_fields[0].name(), "nested_int");
6248            assert_eq!(nested_fields[0].data_type(), &DataType::Int32);
6249        }
6250
6251        // Validate that the nested record type is reused
6252        assert_eq!(fields[0].data_type(), fields[1].data_type());
6253        if let DataType::List(array_field) = fields[2].data_type() {
6254            assert_eq!(array_field.data_type(), fields[0].data_type());
6255        }
6256
6257        // Validate data
6258        assert_eq!(batch.num_rows(), 2);
6259        assert_eq!(batch.num_columns(), 3);
6260
6261        // Validate the first column (nested)
6262        let nested_col = batch
6263            .column(0)
6264            .as_any()
6265            .downcast_ref::<StructArray>()
6266            .unwrap();
6267        let nested_int_array = nested_col
6268            .column_by_name("nested_int")
6269            .unwrap()
6270            .as_any()
6271            .downcast_ref::<Int32Array>()
6272            .unwrap();
6273        assert_eq!(nested_int_array.value(0), 42);
6274        assert_eq!(nested_int_array.value(1), 99);
6275
6276        // Validate the second column (nestedRecord)
6277        let nested_record_col = batch
6278            .column(1)
6279            .as_any()
6280            .downcast_ref::<StructArray>()
6281            .unwrap();
6282        let nested_record_int_array = nested_record_col
6283            .column_by_name("nested_int")
6284            .unwrap()
6285            .as_any()
6286            .downcast_ref::<Int32Array>()
6287            .unwrap();
6288        assert_eq!(nested_record_int_array.value(0), 100);
6289        assert_eq!(nested_record_int_array.value(1), 200);
6290
6291        // Validate the third column (nestedArray)
6292        let nested_array_col = batch
6293            .column(2)
6294            .as_any()
6295            .downcast_ref::<ListArray>()
6296            .unwrap();
6297        assert_eq!(nested_array_col.len(), 2);
6298        let first_array_struct = nested_array_col.value(0);
6299        let first_array_struct_array = first_array_struct
6300            .as_any()
6301            .downcast_ref::<StructArray>()
6302            .unwrap();
6303        let first_array_int_values = first_array_struct_array
6304            .column_by_name("nested_int")
6305            .unwrap()
6306            .as_any()
6307            .downcast_ref::<Int32Array>()
6308            .unwrap();
6309        assert_eq!(first_array_int_values.len(), 3);
6310        assert_eq!(first_array_int_values.value(0), 1);
6311        assert_eq!(first_array_int_values.value(1), 2);
6312        assert_eq!(first_array_int_values.value(2), 3);
6313    }
6314
6315    #[test]
6316    fn test_enum_type_reuse() {
6317        // The .avro file has the following schema:
6318        // {
6319        //     "type" : "record",
6320        //     "name" : "Record",
6321        //     "fields" : [ {
6322        //       "name" : "status",
6323        //       "type" : {
6324        //         "type" : "enum",
6325        //         "name" : "Status",
6326        //         "symbols" : [ "ACTIVE", "INACTIVE", "PENDING" ]
6327        //       }
6328        //     }, {
6329        //       "name" : "backupStatus",
6330        //       "type" : "Status"
6331        //     }, {
6332        //       "name" : "statusHistory",
6333        //       "type" : {
6334        //         "type" : "array",
6335        //         "items" : "Status"
6336        //       }
6337        //     } ]
6338        //   }
6339        let batch = read_file("test/data/enum_reuse.avro", 8, false);
6340        let schema = batch.schema();
6341
6342        // Verify schema structure
6343        assert_eq!(schema.fields().len(), 3);
6344        let fields = schema.fields();
6345        assert_eq!(fields[0].name(), "status");
6346        assert_eq!(fields[1].name(), "backupStatus");
6347        assert_eq!(fields[2].name(), "statusHistory");
6348        assert!(matches!(fields[0].data_type(), DataType::Dictionary(_, _)));
6349        assert!(matches!(fields[1].data_type(), DataType::Dictionary(_, _)));
6350        assert!(matches!(fields[2].data_type(), DataType::List(_)));
6351
6352        if let DataType::Dictionary(key_type, value_type) = fields[0].data_type() {
6353            assert_eq!(key_type.as_ref(), &DataType::Int32);
6354            assert_eq!(value_type.as_ref(), &DataType::Utf8);
6355        }
6356
6357        // Validate that the enum types are reused
6358        assert_eq!(fields[0].data_type(), fields[1].data_type());
6359        if let DataType::List(array_field) = fields[2].data_type() {
6360            assert_eq!(array_field.data_type(), fields[0].data_type());
6361        }
6362
6363        // Validate data - should have 2 rows
6364        assert_eq!(batch.num_rows(), 2);
6365        assert_eq!(batch.num_columns(), 3);
6366
6367        // Get status enum values
6368        let status_col = batch
6369            .column(0)
6370            .as_any()
6371            .downcast_ref::<DictionaryArray<Int32Type>>()
6372            .unwrap();
6373        let status_values = status_col
6374            .values()
6375            .as_any()
6376            .downcast_ref::<StringArray>()
6377            .unwrap();
6378
6379        // First row should be "ACTIVE", second row should be "PENDING"
6380        assert_eq!(
6381            status_values.value(status_col.key(0).unwrap() as usize),
6382            "ACTIVE"
6383        );
6384        assert_eq!(
6385            status_values.value(status_col.key(1).unwrap() as usize),
6386            "PENDING"
6387        );
6388
6389        // Get backupStatus enum values (same as status)
6390        let backup_status_col = batch
6391            .column(1)
6392            .as_any()
6393            .downcast_ref::<DictionaryArray<Int32Type>>()
6394            .unwrap();
6395        let backup_status_values = backup_status_col
6396            .values()
6397            .as_any()
6398            .downcast_ref::<StringArray>()
6399            .unwrap();
6400
6401        // First row should be "INACTIVE", second row should be "ACTIVE"
6402        assert_eq!(
6403            backup_status_values.value(backup_status_col.key(0).unwrap() as usize),
6404            "INACTIVE"
6405        );
6406        assert_eq!(
6407            backup_status_values.value(backup_status_col.key(1).unwrap() as usize),
6408            "ACTIVE"
6409        );
6410
6411        // Get statusHistory array
6412        let status_history_col = batch
6413            .column(2)
6414            .as_any()
6415            .downcast_ref::<ListArray>()
6416            .unwrap();
6417        assert_eq!(status_history_col.len(), 2);
6418
6419        // Validate first row's array data
6420        let first_array_dict = status_history_col.value(0);
6421        let first_array_dict_array = first_array_dict
6422            .as_any()
6423            .downcast_ref::<DictionaryArray<Int32Type>>()
6424            .unwrap();
6425        let first_array_values = first_array_dict_array
6426            .values()
6427            .as_any()
6428            .downcast_ref::<StringArray>()
6429            .unwrap();
6430
6431        // First row: ["PENDING", "ACTIVE", "INACTIVE"]
6432        assert_eq!(first_array_dict_array.len(), 3);
6433        assert_eq!(
6434            first_array_values.value(first_array_dict_array.key(0).unwrap() as usize),
6435            "PENDING"
6436        );
6437        assert_eq!(
6438            first_array_values.value(first_array_dict_array.key(1).unwrap() as usize),
6439            "ACTIVE"
6440        );
6441        assert_eq!(
6442            first_array_values.value(first_array_dict_array.key(2).unwrap() as usize),
6443            "INACTIVE"
6444        );
6445    }
6446
6447    #[test]
6448    fn comprehensive_e2e_test() {
6449        let path = "test/data/comprehensive_e2e.avro";
6450        let batch = read_file(path, 1024, false);
6451        let schema = batch.schema();
6452
6453        #[inline]
6454        fn tid_by_name(fields: &UnionFields, want: &str) -> i8 {
6455            for (tid, f) in fields.iter() {
6456                if f.name() == want {
6457                    return tid;
6458                }
6459            }
6460            panic!("union child '{want}' not found");
6461        }
6462
6463        #[inline]
6464        fn tid_by_dt(fields: &UnionFields, pred: impl Fn(&DataType) -> bool) -> i8 {
6465            for (tid, f) in fields.iter() {
6466                if pred(f.data_type()) {
6467                    return tid;
6468                }
6469            }
6470            panic!("no union child matches predicate");
6471        }
6472
6473        fn mk_dense_union(
6474            fields: &UnionFields,
6475            type_ids: Vec<i8>,
6476            offsets: Vec<i32>,
6477            provide: impl Fn(&Field) -> Option<ArrayRef>,
6478        ) -> ArrayRef {
6479            fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
6480                match dt {
6481                    DataType::Null => Arc::new(NullArray::new(0)),
6482                    DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
6483                    DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
6484                    DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
6485                    DataType::Float32 => Arc::new(Float32Array::from(Vec::<f32>::new())),
6486                    DataType::Float64 => Arc::new(Float64Array::from(Vec::<f64>::new())),
6487                    DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
6488                    DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
6489                    DataType::Date32 => Arc::new(Date32Array::from(Vec::<i32>::new())),
6490                    DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
6491                        Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
6492                    }
6493                    DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
6494                        Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
6495                    }
6496                    DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
6497                        let a = TimestampMillisecondArray::from(Vec::<i64>::new());
6498                        Arc::new(if let Some(tz) = tz {
6499                            a.with_timezone(tz.clone())
6500                        } else {
6501                            a
6502                        })
6503                    }
6504                    DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
6505                        let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
6506                        Arc::new(if let Some(tz) = tz {
6507                            a.with_timezone(tz.clone())
6508                        } else {
6509                            a
6510                        })
6511                    }
6512                    DataType::Interval(IntervalUnit::MonthDayNano) => Arc::new(
6513                        IntervalMonthDayNanoArray::from(Vec::<IntervalMonthDayNano>::new()),
6514                    ),
6515                    DataType::FixedSizeBinary(sz) => Arc::new(
6516                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(
6517                            std::iter::empty::<Option<Vec<u8>>>(),
6518                            *sz,
6519                        )
6520                        .unwrap(),
6521                    ),
6522                    DataType::Dictionary(_, _) => {
6523                        let keys = Int32Array::from(Vec::<i32>::new());
6524                        let values = Arc::new(StringArray::from(Vec::<&str>::new()));
6525                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
6526                    }
6527                    DataType::Struct(fields) => {
6528                        let children: Vec<ArrayRef> = fields
6529                            .iter()
6530                            .map(|f| empty_child_for(f.data_type()) as ArrayRef)
6531                            .collect();
6532                        Arc::new(StructArray::new(fields.clone(), children, None))
6533                    }
6534                    DataType::List(field) => {
6535                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
6536                        Arc::new(
6537                            ListArray::try_new(
6538                                field.clone(),
6539                                offsets,
6540                                empty_child_for(field.data_type()),
6541                                None,
6542                            )
6543                            .unwrap(),
6544                        )
6545                    }
6546                    DataType::Map(entry_field, is_sorted) => {
6547                        let (key_field, val_field) = match entry_field.data_type() {
6548                            DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
6549                            other => panic!("unexpected map entries type: {other:?}"),
6550                        };
6551                        let keys = StringArray::from(Vec::<&str>::new());
6552                        let vals: ArrayRef = match val_field.data_type() {
6553                            DataType::Null => Arc::new(NullArray::new(0)) as ArrayRef,
6554                            DataType::Boolean => {
6555                                Arc::new(BooleanArray::from(Vec::<bool>::new())) as ArrayRef
6556                            }
6557                            DataType::Int32 => {
6558                                Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
6559                            }
6560                            DataType::Int64 => {
6561                                Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
6562                            }
6563                            DataType::Float32 => {
6564                                Arc::new(Float32Array::from(Vec::<f32>::new())) as ArrayRef
6565                            }
6566                            DataType::Float64 => {
6567                                Arc::new(Float64Array::from(Vec::<f64>::new())) as ArrayRef
6568                            }
6569                            DataType::Utf8 => {
6570                                Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
6571                            }
6572                            DataType::Binary => {
6573                                Arc::new(BinaryArray::from(Vec::<&[u8]>::new())) as ArrayRef
6574                            }
6575                            DataType::Union(uf, _) => {
6576                                let children: Vec<ArrayRef> = uf
6577                                    .iter()
6578                                    .map(|(_, f)| empty_child_for(f.data_type()))
6579                                    .collect();
6580                                Arc::new(
6581                                    UnionArray::try_new(
6582                                        uf.clone(),
6583                                        ScalarBuffer::<i8>::from(Vec::<i8>::new()),
6584                                        Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
6585                                        children,
6586                                    )
6587                                    .unwrap(),
6588                                ) as ArrayRef
6589                            }
6590                            other => panic!("unsupported map value type: {other:?}"),
6591                        };
6592                        let entries = StructArray::new(
6593                            Fields::from(vec![
6594                                key_field.as_ref().clone(),
6595                                val_field.as_ref().clone(),
6596                            ]),
6597                            vec![Arc::new(keys) as ArrayRef, vals],
6598                            None,
6599                        );
6600                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
6601                        Arc::new(MapArray::new(
6602                            entry_field.clone(),
6603                            offsets,
6604                            entries,
6605                            None,
6606                            *is_sorted,
6607                        ))
6608                    }
6609                    other => panic!("empty_child_for: unhandled type {other:?}"),
6610                }
6611            }
6612            let children: Vec<ArrayRef> = fields
6613                .iter()
6614                .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
6615                .collect();
6616            Arc::new(
6617                UnionArray::try_new(
6618                    fields.clone(),
6619                    ScalarBuffer::<i8>::from(type_ids),
6620                    Some(ScalarBuffer::<i32>::from(offsets)),
6621                    children,
6622                )
6623                .unwrap(),
6624            ) as ArrayRef
6625        }
6626
6627        #[inline]
6628        fn uuid16_from_str(s: &str) -> [u8; 16] {
6629            let mut out = [0u8; 16];
6630            let mut idx = 0usize;
6631            let mut hi: Option<u8> = None;
6632            for ch in s.chars() {
6633                if ch == '-' {
6634                    continue;
6635                }
6636                let v = ch.to_digit(16).expect("invalid hex digit in UUID") as u8;
6637                if let Some(h) = hi {
6638                    out[idx] = (h << 4) | v;
6639                    idx += 1;
6640                    hi = None;
6641                } else {
6642                    hi = Some(v);
6643                }
6644            }
6645            assert_eq!(idx, 16, "UUID must decode to 16 bytes");
6646            out
6647        }
6648        let date_a: i32 = 19_000; // 2022-01-08
6649        let time_ms_a: i32 = 12 * 3_600_000 + 34 * 60_000 + 56_000 + 789;
6650        let time_us_eod: i64 = 86_400_000_000 - 1;
6651        let ts_ms_2024_01_01: i64 = 1_704_067_200_000; // 2024-01-01T00:00:00Z
6652        let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1_000;
6653        let dur_small = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
6654        let dur_zero = IntervalMonthDayNanoType::make_value(0, 0, 0);
6655        let dur_large =
6656            IntervalMonthDayNanoType::make_value(12, 31, ((86_400_000 - 1) as i64) * 1_000_000);
6657        let dur_2years = IntervalMonthDayNanoType::make_value(24, 0, 0);
6658        let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
6659        let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
6660
6661        #[inline]
6662        fn push_like(
6663            reader_schema: &arrow_schema::Schema,
6664            name: &str,
6665            arr: ArrayRef,
6666            fields: &mut Vec<FieldRef>,
6667            cols: &mut Vec<ArrayRef>,
6668        ) {
6669            let src = reader_schema
6670                .field_with_name(name)
6671                .unwrap_or_else(|_| panic!("source schema missing field '{name}'"));
6672            let mut f = Field::new(name, arr.data_type().clone(), src.is_nullable());
6673            let md = src.metadata();
6674            if !md.is_empty() {
6675                f = f.with_metadata(md.clone());
6676            }
6677            fields.push(Arc::new(f));
6678            cols.push(arr);
6679        }
6680
6681        let mut fields: Vec<FieldRef> = Vec::new();
6682        let mut columns: Vec<ArrayRef> = Vec::new();
6683        push_like(
6684            schema.as_ref(),
6685            "id",
6686            Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef,
6687            &mut fields,
6688            &mut columns,
6689        );
6690        push_like(
6691            schema.as_ref(),
6692            "flag",
6693            Arc::new(BooleanArray::from(vec![true, false, true, false])) as ArrayRef,
6694            &mut fields,
6695            &mut columns,
6696        );
6697        push_like(
6698            schema.as_ref(),
6699            "ratio_f32",
6700            Arc::new(Float32Array::from(vec![1.25f32, -0.0, 3.5, 9.75])) as ArrayRef,
6701            &mut fields,
6702            &mut columns,
6703        );
6704        push_like(
6705            schema.as_ref(),
6706            "ratio_f64",
6707            Arc::new(Float64Array::from(vec![2.5f64, -1.0, 7.0, -2.25])) as ArrayRef,
6708            &mut fields,
6709            &mut columns,
6710        );
6711        push_like(
6712            schema.as_ref(),
6713            "count_i32",
6714            Arc::new(Int32Array::from(vec![7, -1, 0, 123])) as ArrayRef,
6715            &mut fields,
6716            &mut columns,
6717        );
6718        push_like(
6719            schema.as_ref(),
6720            "count_i64",
6721            Arc::new(Int64Array::from(vec![
6722                7_000_000_000i64,
6723                -2,
6724                0,
6725                -9_876_543_210i64,
6726            ])) as ArrayRef,
6727            &mut fields,
6728            &mut columns,
6729        );
6730        push_like(
6731            schema.as_ref(),
6732            "opt_i32_nullfirst",
6733            Arc::new(Int32Array::from(vec![None, Some(42), None, Some(0)])) as ArrayRef,
6734            &mut fields,
6735            &mut columns,
6736        );
6737        push_like(
6738            schema.as_ref(),
6739            "opt_str_nullsecond",
6740            Arc::new(StringArray::from(vec![
6741                Some("alpha"),
6742                None,
6743                Some("s3"),
6744                Some(""),
6745            ])) as ArrayRef,
6746            &mut fields,
6747            &mut columns,
6748        );
6749        {
6750            let uf = match schema
6751                .field_with_name("tri_union_prim")
6752                .unwrap()
6753                .data_type()
6754            {
6755                DataType::Union(f, UnionMode::Dense) => f.clone(),
6756                other => panic!("tri_union_prim should be dense union, got {other:?}"),
6757            };
6758            let tid_i = tid_by_name(&uf, "int");
6759            let tid_s = tid_by_name(&uf, "string");
6760            let tid_b = tid_by_name(&uf, "boolean");
6761            let tids = vec![tid_i, tid_s, tid_b, tid_s];
6762            let offs = vec![0, 0, 0, 1];
6763            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
6764                DataType::Int32 => Some(Arc::new(Int32Array::from(vec![0])) as ArrayRef),
6765                DataType::Utf8 => Some(Arc::new(StringArray::from(vec!["hi", ""])) as ArrayRef),
6766                DataType::Boolean => Some(Arc::new(BooleanArray::from(vec![true])) as ArrayRef),
6767                _ => None,
6768            });
6769            push_like(
6770                schema.as_ref(),
6771                "tri_union_prim",
6772                arr,
6773                &mut fields,
6774                &mut columns,
6775            );
6776        }
6777
6778        push_like(
6779            schema.as_ref(),
6780            "str_utf8",
6781            Arc::new(StringArray::from(vec!["hello", "", "world", "✓ unicode"])) as ArrayRef,
6782            &mut fields,
6783            &mut columns,
6784        );
6785        push_like(
6786            schema.as_ref(),
6787            "raw_bytes",
6788            Arc::new(BinaryArray::from(vec![
6789                b"\x00\x01".as_ref(),
6790                b"".as_ref(),
6791                b"\xFF\x00".as_ref(),
6792                b"\x10\x20\x30\x40".as_ref(),
6793            ])) as ArrayRef,
6794            &mut fields,
6795            &mut columns,
6796        );
6797        {
6798            let it = [
6799                Some(*b"0123456789ABCDEF"),
6800                Some([0u8; 16]),
6801                Some(*b"ABCDEFGHIJKLMNOP"),
6802                Some([0xAA; 16]),
6803            ]
6804            .into_iter();
6805            let arr =
6806                Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap())
6807                    as ArrayRef;
6808            push_like(
6809                schema.as_ref(),
6810                "fx16_plain",
6811                arr,
6812                &mut fields,
6813                &mut columns,
6814            );
6815        }
6816        {
6817            #[cfg(feature = "small_decimals")]
6818            let dec10_2 = Arc::new(
6819                Decimal64Array::from_iter_values([123456i64, -1, 0, 9_999_999_999i64])
6820                    .with_precision_and_scale(10, 2)
6821                    .unwrap(),
6822            ) as ArrayRef;
6823            #[cfg(not(feature = "small_decimals"))]
6824            let dec10_2 = Arc::new(
6825                Decimal128Array::from_iter_values([123456i128, -1, 0, 9_999_999_999i128])
6826                    .with_precision_and_scale(10, 2)
6827                    .unwrap(),
6828            ) as ArrayRef;
6829            push_like(
6830                schema.as_ref(),
6831                "dec_bytes_s10_2",
6832                dec10_2,
6833                &mut fields,
6834                &mut columns,
6835            );
6836        }
6837        {
6838            #[cfg(feature = "small_decimals")]
6839            let dec20_4 = Arc::new(
6840                Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
6841                    .with_precision_and_scale(20, 4)
6842                    .unwrap(),
6843            ) as ArrayRef;
6844            #[cfg(not(feature = "small_decimals"))]
6845            let dec20_4 = Arc::new(
6846                Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
6847                    .with_precision_and_scale(20, 4)
6848                    .unwrap(),
6849            ) as ArrayRef;
6850            push_like(
6851                schema.as_ref(),
6852                "dec_fix_s20_4",
6853                dec20_4,
6854                &mut fields,
6855                &mut columns,
6856            );
6857        }
6858        {
6859            let it = [Some(uuid1), Some(uuid2), Some(uuid1), Some(uuid2)].into_iter();
6860            let arr =
6861                Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap())
6862                    as ArrayRef;
6863            push_like(schema.as_ref(), "uuid_str", arr, &mut fields, &mut columns);
6864        }
6865        push_like(
6866            schema.as_ref(),
6867            "d_date",
6868            Arc::new(Date32Array::from(vec![date_a, 0, 1, 365])) as ArrayRef,
6869            &mut fields,
6870            &mut columns,
6871        );
6872        push_like(
6873            schema.as_ref(),
6874            "t_millis",
6875            Arc::new(Time32MillisecondArray::from(vec![
6876                time_ms_a,
6877                0,
6878                1,
6879                86_400_000 - 1,
6880            ])) as ArrayRef,
6881            &mut fields,
6882            &mut columns,
6883        );
6884        push_like(
6885            schema.as_ref(),
6886            "t_micros",
6887            Arc::new(Time64MicrosecondArray::from(vec![
6888                time_us_eod,
6889                0,
6890                1,
6891                1_000_000,
6892            ])) as ArrayRef,
6893            &mut fields,
6894            &mut columns,
6895        );
6896        {
6897            let a = TimestampMillisecondArray::from(vec![
6898                ts_ms_2024_01_01,
6899                -1,
6900                ts_ms_2024_01_01 + 123,
6901                0,
6902            ])
6903            .with_timezone("+00:00");
6904            push_like(
6905                schema.as_ref(),
6906                "ts_millis_utc",
6907                Arc::new(a) as ArrayRef,
6908                &mut fields,
6909                &mut columns,
6910            );
6911        }
6912        {
6913            let a = TimestampMicrosecondArray::from(vec![
6914                ts_us_2024_01_01,
6915                1,
6916                ts_us_2024_01_01 + 456,
6917                0,
6918            ])
6919            .with_timezone("+00:00");
6920            push_like(
6921                schema.as_ref(),
6922                "ts_micros_utc",
6923                Arc::new(a) as ArrayRef,
6924                &mut fields,
6925                &mut columns,
6926            );
6927        }
6928        push_like(
6929            schema.as_ref(),
6930            "ts_millis_local",
6931            Arc::new(TimestampMillisecondArray::from(vec![
6932                ts_ms_2024_01_01 + 86_400_000,
6933                0,
6934                ts_ms_2024_01_01 + 789,
6935                123_456_789,
6936            ])) as ArrayRef,
6937            &mut fields,
6938            &mut columns,
6939        );
6940        push_like(
6941            schema.as_ref(),
6942            "ts_micros_local",
6943            Arc::new(TimestampMicrosecondArray::from(vec![
6944                ts_us_2024_01_01 + 123_456,
6945                0,
6946                ts_us_2024_01_01 + 101_112,
6947                987_654_321,
6948            ])) as ArrayRef,
6949            &mut fields,
6950            &mut columns,
6951        );
6952        {
6953            let v = vec![dur_small, dur_zero, dur_large, dur_2years];
6954            push_like(
6955                schema.as_ref(),
6956                "interval_mdn",
6957                Arc::new(IntervalMonthDayNanoArray::from(v)) as ArrayRef,
6958                &mut fields,
6959                &mut columns,
6960            );
6961        }
6962        {
6963            let keys = Int32Array::from(vec![1, 2, 3, 0]); // NEW, PROCESSING, DONE, UNKNOWN
6964            let values = Arc::new(StringArray::from(vec![
6965                "UNKNOWN",
6966                "NEW",
6967                "PROCESSING",
6968                "DONE",
6969            ])) as ArrayRef;
6970            let dict = DictionaryArray::<Int32Type>::try_new(keys, values).unwrap();
6971            push_like(
6972                schema.as_ref(),
6973                "status",
6974                Arc::new(dict) as ArrayRef,
6975                &mut fields,
6976                &mut columns,
6977            );
6978        }
6979        {
6980            let list_field = match schema.field_with_name("arr_union").unwrap().data_type() {
6981                DataType::List(f) => f.clone(),
6982                other => panic!("arr_union should be List, got {other:?}"),
6983            };
6984            let uf = match list_field.data_type() {
6985                DataType::Union(f, UnionMode::Dense) => f.clone(),
6986                other => panic!("arr_union item should be union, got {other:?}"),
6987            };
6988            let tid_l = tid_by_name(&uf, "long");
6989            let tid_s = tid_by_name(&uf, "string");
6990            let tid_n = tid_by_name(&uf, "null");
6991            let type_ids = vec![
6992                tid_l, tid_s, tid_n, tid_l, tid_n, tid_s, tid_l, tid_l, tid_s, tid_n, tid_l,
6993            ];
6994            let offsets = vec![0, 0, 0, 1, 1, 1, 2, 3, 2, 2, 4];
6995            let values = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
6996                DataType::Int64 => {
6997                    Some(Arc::new(Int64Array::from(vec![1i64, -3, 0, -1, 0])) as ArrayRef)
6998                }
6999                DataType::Utf8 => {
7000                    Some(Arc::new(StringArray::from(vec!["x", "z", "end"])) as ArrayRef)
7001                }
7002                DataType::Null => Some(Arc::new(NullArray::new(3)) as ArrayRef),
7003                _ => None,
7004            });
7005            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 4, 7, 8, 11]));
7006            let arr = Arc::new(ListArray::try_new(list_field, list_offsets, values, None).unwrap())
7007                as ArrayRef;
7008            push_like(schema.as_ref(), "arr_union", arr, &mut fields, &mut columns);
7009        }
7010        {
7011            let (entry_field, entries_fields, uf, is_sorted) =
7012                match schema.field_with_name("map_union").unwrap().data_type() {
7013                    DataType::Map(entry_field, is_sorted) => {
7014                        let fs = match entry_field.data_type() {
7015                            DataType::Struct(fs) => fs.clone(),
7016                            other => panic!("map entries must be struct, got {other:?}"),
7017                        };
7018                        let val_f = fs[1].clone();
7019                        let uf = match val_f.data_type() {
7020                            DataType::Union(f, UnionMode::Dense) => f.clone(),
7021                            other => panic!("map value must be union, got {other:?}"),
7022                        };
7023                        (entry_field.clone(), fs, uf, *is_sorted)
7024                    }
7025                    other => panic!("map_union should be Map, got {other:?}"),
7026                };
7027            let keys = StringArray::from(vec!["a", "b", "c", "neg", "pi", "ok"]);
7028            let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4, 4, 6]));
7029            let tid_null = tid_by_name(&uf, "null");
7030            let tid_d = tid_by_name(&uf, "double");
7031            let tid_s = tid_by_name(&uf, "string");
7032            let type_ids = vec![tid_d, tid_null, tid_s, tid_d, tid_d, tid_s];
7033            let offsets = vec![0, 0, 0, 1, 2, 1];
7034            let pi_5dp = (std::f64::consts::PI * 100_000.0).trunc() / 100_000.0;
7035            let vals = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
7036                DataType::Float64 => {
7037                    Some(Arc::new(Float64Array::from(vec![1.5f64, -0.5, pi_5dp])) as ArrayRef)
7038                }
7039                DataType::Utf8 => {
7040                    Some(Arc::new(StringArray::from(vec!["yes", "true"])) as ArrayRef)
7041                }
7042                DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
7043                _ => None,
7044            });
7045            let entries = StructArray::new(
7046                entries_fields.clone(),
7047                vec![Arc::new(keys) as ArrayRef, vals],
7048                None,
7049            );
7050            let map =
7051                Arc::new(MapArray::new(entry_field, moff, entries, None, is_sorted)) as ArrayRef;
7052            push_like(schema.as_ref(), "map_union", map, &mut fields, &mut columns);
7053        }
7054        {
7055            let fs = match schema.field_with_name("address").unwrap().data_type() {
7056                DataType::Struct(fs) => fs.clone(),
7057                other => panic!("address should be Struct, got {other:?}"),
7058            };
7059            let street = Arc::new(StringArray::from(vec![
7060                "100 Main",
7061                "",
7062                "42 Galaxy Way",
7063                "End Ave",
7064            ])) as ArrayRef;
7065            let zip = Arc::new(Int32Array::from(vec![12345, 0, 42424, 1])) as ArrayRef;
7066            let country = Arc::new(StringArray::from(vec!["US", "CA", "US", "GB"])) as ArrayRef;
7067            let arr = Arc::new(StructArray::new(fs, vec![street, zip, country], None)) as ArrayRef;
7068            push_like(schema.as_ref(), "address", arr, &mut fields, &mut columns);
7069        }
7070        {
7071            let fs = match schema.field_with_name("maybe_auth").unwrap().data_type() {
7072                DataType::Struct(fs) => fs.clone(),
7073                other => panic!("maybe_auth should be Struct, got {other:?}"),
7074            };
7075            let user =
7076                Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])) as ArrayRef;
7077            let token_values: Vec<Option<&[u8]>> = vec![
7078                None,                           // row 1: null
7079                Some(b"\x01\x02\x03".as_ref()), // row 2: bytes
7080                None,                           // row 3: null
7081                Some(b"".as_ref()),             // row 4: empty bytes
7082            ];
7083            let token = Arc::new(BinaryArray::from(token_values)) as ArrayRef;
7084            let arr = Arc::new(StructArray::new(fs, vec![user, token], None)) as ArrayRef;
7085            push_like(
7086                schema.as_ref(),
7087                "maybe_auth",
7088                arr,
7089                &mut fields,
7090                &mut columns,
7091            );
7092        }
7093        {
7094            let uf = match schema
7095                .field_with_name("union_enum_record_array_map")
7096                .unwrap()
7097                .data_type()
7098            {
7099                DataType::Union(f, UnionMode::Dense) => f.clone(),
7100                other => panic!("union_enum_record_array_map should be union, got {other:?}"),
7101            };
7102            let mut tid_enum: Option<i8> = None;
7103            let mut tid_rec_a: Option<i8> = None;
7104            let mut tid_array: Option<i8> = None;
7105            let mut tid_map: Option<i8> = None;
7106            let mut map_entry_field: Option<FieldRef> = None;
7107            let mut map_sorted: bool = false;
7108            for (tid, f) in uf.iter() {
7109                match f.data_type() {
7110                    DataType::Dictionary(_, _) => tid_enum = Some(tid),
7111                    DataType::Struct(childs)
7112                        if childs.len() == 2
7113                            && childs[0].name() == "a"
7114                            && childs[1].name() == "b" =>
7115                    {
7116                        tid_rec_a = Some(tid)
7117                    }
7118                    DataType::List(item) if matches!(item.data_type(), DataType::Int64) => {
7119                        tid_array = Some(tid)
7120                    }
7121                    DataType::Map(ef, is_sorted) => {
7122                        tid_map = Some(tid);
7123                        map_entry_field = Some(ef.clone());
7124                        map_sorted = *is_sorted;
7125                    }
7126                    _ => {}
7127                }
7128            }
7129            let (tid_enum, tid_rec_a, tid_array, tid_map) = (
7130                tid_enum.unwrap(),
7131                tid_rec_a.unwrap(),
7132                tid_array.unwrap(),
7133                tid_map.unwrap(),
7134            );
7135            let tids = vec![tid_enum, tid_rec_a, tid_array, tid_map];
7136            let offs = vec![0, 0, 0, 0];
7137            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7138                DataType::Dictionary(_, _) => {
7139                    let keys = Int32Array::from(vec![0i32]);
7140                    let values =
7141                        Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
7142                    Some(
7143                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
7144                            as ArrayRef,
7145                    )
7146                }
7147                DataType::Struct(fs)
7148                    if fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b" =>
7149                {
7150                    let a = Int32Array::from(vec![7]);
7151                    let b = StringArray::from(vec!["rec"]);
7152                    Some(Arc::new(StructArray::new(
7153                        fs.clone(),
7154                        vec![Arc::new(a), Arc::new(b)],
7155                        None,
7156                    )) as ArrayRef)
7157                }
7158                DataType::List(field) => {
7159                    let values = Int64Array::from(vec![1i64, 2, 3]);
7160                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
7161                    Some(Arc::new(
7162                        ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
7163                    ) as ArrayRef)
7164                }
7165                DataType::Map(_, _) => {
7166                    let entry_field = map_entry_field.clone().unwrap();
7167                    let (key_field, val_field) = match entry_field.data_type() {
7168                        DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
7169                        _ => unreachable!(),
7170                    };
7171                    let keys = StringArray::from(vec!["k"]);
7172                    let vals = StringArray::from(vec!["v"]);
7173                    let entries = StructArray::new(
7174                        Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
7175                        vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
7176                        None,
7177                    );
7178                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1]));
7179                    Some(Arc::new(MapArray::new(
7180                        entry_field.clone(),
7181                        offsets,
7182                        entries,
7183                        None,
7184                        map_sorted,
7185                    )) as ArrayRef)
7186                }
7187                _ => None,
7188            });
7189            push_like(
7190                schema.as_ref(),
7191                "union_enum_record_array_map",
7192                arr,
7193                &mut fields,
7194                &mut columns,
7195            );
7196        }
7197        {
7198            let uf = match schema
7199                .field_with_name("union_date_or_fixed4")
7200                .unwrap()
7201                .data_type()
7202            {
7203                DataType::Union(f, UnionMode::Dense) => f.clone(),
7204                other => panic!("union_date_or_fixed4 should be union, got {other:?}"),
7205            };
7206            let tid_date = tid_by_dt(&uf, |dt| matches!(dt, DataType::Date32));
7207            let tid_fx4 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(4)));
7208            let tids = vec![tid_date, tid_fx4, tid_date, tid_fx4];
7209            let offs = vec![0, 0, 1, 1];
7210            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7211                DataType::Date32 => Some(Arc::new(Date32Array::from(vec![date_a, 0])) as ArrayRef),
7212                DataType::FixedSizeBinary(4) => {
7213                    let it = [Some(*b"\x00\x11\x22\x33"), Some(*b"ABCD")].into_iter();
7214                    Some(Arc::new(
7215                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
7216                    ) as ArrayRef)
7217                }
7218                _ => None,
7219            });
7220            push_like(
7221                schema.as_ref(),
7222                "union_date_or_fixed4",
7223                arr,
7224                &mut fields,
7225                &mut columns,
7226            );
7227        }
7228        {
7229            let uf = match schema
7230                .field_with_name("union_interval_or_string")
7231                .unwrap()
7232                .data_type()
7233            {
7234                DataType::Union(f, UnionMode::Dense) => f.clone(),
7235                other => panic!("union_interval_or_string should be union, got {other:?}"),
7236            };
7237            let tid_dur = tid_by_dt(&uf, |dt| {
7238                matches!(dt, DataType::Interval(IntervalUnit::MonthDayNano))
7239            });
7240            let tid_str = tid_by_dt(&uf, |dt| matches!(dt, DataType::Utf8));
7241            let tids = vec![tid_dur, tid_str, tid_dur, tid_str];
7242            let offs = vec![0, 0, 1, 1];
7243            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7244                DataType::Interval(IntervalUnit::MonthDayNano) => Some(Arc::new(
7245                    IntervalMonthDayNanoArray::from(vec![dur_small, dur_large]),
7246                )
7247                    as ArrayRef),
7248                DataType::Utf8 => Some(Arc::new(StringArray::from(vec![
7249                    "duration-as-text",
7250                    "iso-8601-period-P1Y",
7251                ])) as ArrayRef),
7252                _ => None,
7253            });
7254            push_like(
7255                schema.as_ref(),
7256                "union_interval_or_string",
7257                arr,
7258                &mut fields,
7259                &mut columns,
7260            );
7261        }
7262        {
7263            let uf = match schema
7264                .field_with_name("union_uuid_or_fixed10")
7265                .unwrap()
7266                .data_type()
7267            {
7268                DataType::Union(f, UnionMode::Dense) => f.clone(),
7269                other => panic!("union_uuid_or_fixed10 should be union, got {other:?}"),
7270            };
7271            let tid_uuid = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(16)));
7272            let tid_fx10 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(10)));
7273            let tids = vec![tid_uuid, tid_fx10, tid_uuid, tid_fx10];
7274            let offs = vec![0, 0, 1, 1];
7275            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7276                DataType::FixedSizeBinary(16) => {
7277                    let it = [Some(uuid1), Some(uuid2)].into_iter();
7278                    Some(Arc::new(
7279                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
7280                    ) as ArrayRef)
7281                }
7282                DataType::FixedSizeBinary(10) => {
7283                    let fx10_a = [0xAAu8; 10];
7284                    let fx10_b = [0x00u8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99];
7285                    let it = [Some(fx10_a), Some(fx10_b)].into_iter();
7286                    Some(Arc::new(
7287                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
7288                    ) as ArrayRef)
7289                }
7290                _ => None,
7291            });
7292            push_like(
7293                schema.as_ref(),
7294                "union_uuid_or_fixed10",
7295                arr,
7296                &mut fields,
7297                &mut columns,
7298            );
7299        }
7300        {
7301            let list_field = match schema
7302                .field_with_name("array_records_with_union")
7303                .unwrap()
7304                .data_type()
7305            {
7306                DataType::List(f) => f.clone(),
7307                other => panic!("array_records_with_union should be List, got {other:?}"),
7308            };
7309            let kv_fields = match list_field.data_type() {
7310                DataType::Struct(fs) => fs.clone(),
7311                other => panic!("array_records_with_union items must be Struct, got {other:?}"),
7312            };
7313            let val_field = kv_fields
7314                .iter()
7315                .find(|f| f.name() == "val")
7316                .unwrap()
7317                .clone();
7318            let uf = match val_field.data_type() {
7319                DataType::Union(f, UnionMode::Dense) => f.clone(),
7320                other => panic!("KV.val should be union, got {other:?}"),
7321            };
7322            let keys = Arc::new(StringArray::from(vec!["k1", "k2", "k", "k3", "x"])) as ArrayRef;
7323            let tid_null = tid_by_name(&uf, "null");
7324            let tid_i = tid_by_name(&uf, "int");
7325            let tid_l = tid_by_name(&uf, "long");
7326            let type_ids = vec![tid_i, tid_null, tid_l, tid_null, tid_i];
7327            let offsets = vec![0, 0, 0, 1, 1];
7328            let vals = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
7329                DataType::Int32 => Some(Arc::new(Int32Array::from(vec![5, -5])) as ArrayRef),
7330                DataType::Int64 => Some(Arc::new(Int64Array::from(vec![99i64])) as ArrayRef),
7331                DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
7332                _ => None,
7333            });
7334            let values_struct =
7335                Arc::new(StructArray::new(kv_fields.clone(), vec![keys, vals], None)) as ArrayRef;
7336            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5]));
7337            let arr = Arc::new(
7338                ListArray::try_new(list_field, list_offsets, values_struct, None).unwrap(),
7339            ) as ArrayRef;
7340            push_like(
7341                schema.as_ref(),
7342                "array_records_with_union",
7343                arr,
7344                &mut fields,
7345                &mut columns,
7346            );
7347        }
7348        {
7349            let uf = match schema
7350                .field_with_name("union_map_or_array_int")
7351                .unwrap()
7352                .data_type()
7353            {
7354                DataType::Union(f, UnionMode::Dense) => f.clone(),
7355                other => panic!("union_map_or_array_int should be union, got {other:?}"),
7356            };
7357            let tid_map = tid_by_dt(&uf, |dt| matches!(dt, DataType::Map(_, _)));
7358            let tid_list = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
7359            let map_child: ArrayRef = {
7360                let (entry_field, is_sorted) = match uf
7361                    .iter()
7362                    .find(|(tid, _)| *tid == tid_map)
7363                    .unwrap()
7364                    .1
7365                    .data_type()
7366                {
7367                    DataType::Map(ef, is_sorted) => (ef.clone(), *is_sorted),
7368                    _ => unreachable!(),
7369                };
7370                let (key_field, val_field) = match entry_field.data_type() {
7371                    DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
7372                    _ => unreachable!(),
7373                };
7374                let keys = StringArray::from(vec!["x", "y", "only"]);
7375                let vals = Int32Array::from(vec![1, 2, 10]);
7376                let entries = StructArray::new(
7377                    Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
7378                    vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
7379                    None,
7380                );
7381                let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
7382                Arc::new(MapArray::new(entry_field, moff, entries, None, is_sorted)) as ArrayRef
7383            };
7384            let list_child: ArrayRef = {
7385                let list_field = match uf
7386                    .iter()
7387                    .find(|(tid, _)| *tid == tid_list)
7388                    .unwrap()
7389                    .1
7390                    .data_type()
7391                {
7392                    DataType::List(f) => f.clone(),
7393                    _ => unreachable!(),
7394                };
7395                let values = Int32Array::from(vec![1, 2, 3, 0]);
7396                let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4]));
7397                Arc::new(ListArray::try_new(list_field, offsets, Arc::new(values), None).unwrap())
7398                    as ArrayRef
7399            };
7400            let tids = vec![tid_map, tid_list, tid_map, tid_list];
7401            let offs = vec![0, 0, 1, 1];
7402            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7403                DataType::Map(_, _) => Some(map_child.clone()),
7404                DataType::List(_) => Some(list_child.clone()),
7405                _ => None,
7406            });
7407            push_like(
7408                schema.as_ref(),
7409                "union_map_or_array_int",
7410                arr,
7411                &mut fields,
7412                &mut columns,
7413            );
7414        }
7415        push_like(
7416            schema.as_ref(),
7417            "renamed_with_default",
7418            Arc::new(Int32Array::from(vec![100, 42, 7, 42])) as ArrayRef,
7419            &mut fields,
7420            &mut columns,
7421        );
7422        {
7423            let fs = match schema.field_with_name("person").unwrap().data_type() {
7424                DataType::Struct(fs) => fs.clone(),
7425                other => panic!("person should be Struct, got {other:?}"),
7426            };
7427            let name =
7428                Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "Dave"])) as ArrayRef;
7429            let age = Arc::new(Int32Array::from(vec![30, 0, 25, 41])) as ArrayRef;
7430            let arr = Arc::new(StructArray::new(fs, vec![name, age], None)) as ArrayRef;
7431            push_like(schema.as_ref(), "person", arr, &mut fields, &mut columns);
7432        }
7433        let expected =
7434            RecordBatch::try_new(Arc::new(Schema::new(Fields::from(fields))), columns).unwrap();
7435        assert_eq!(
7436            expected, batch,
7437            "entire RecordBatch mismatch (schema, all columns, all rows)"
7438        );
7439    }
7440    #[test]
7441    fn comprehensive_e2e_resolution_test() {
7442        use serde_json::Value;
7443        use std::collections::HashMap;
7444
7445        // Build a reader schema that stresses Avro schema‑resolution
7446        //
7447        // Changes relative to writer schema:
7448        // * Rename fields using writer aliases:    id -> identifier, renamed_with_default -> old_count
7449        // * Promote numeric types:                 count_i32 (int) -> long, ratio_f32 (float) -> double
7450        // * Reorder many union branches (reverse), incl. nested unions
7451        // * Reorder array/map union item/value branches
7452        // * Rename nested Address field:           street -> street_name (uses alias in writer)
7453        // * Change Person type name/namespace:     com.example.Person (matches writer alias)
7454        // * Reverse top‑level field order
7455        //
7456        // Reader‑side aliases are added wherever names change (per Avro spec).
7457        fn make_comprehensive_reader_schema(path: &str) -> AvroSchema {
7458            fn set_type_string(f: &mut Value, new_ty: &str) {
7459                if let Some(ty) = f.get_mut("type") {
7460                    match ty {
7461                        Value::String(_) | Value::Object(_) => {
7462                            *ty = Value::String(new_ty.to_string());
7463                        }
7464                        Value::Array(arr) => {
7465                            for b in arr.iter_mut() {
7466                                match b {
7467                                    Value::String(s) if s != "null" => {
7468                                        *b = Value::String(new_ty.to_string());
7469                                        break;
7470                                    }
7471                                    Value::Object(_) => {
7472                                        *b = Value::String(new_ty.to_string());
7473                                        break;
7474                                    }
7475                                    _ => {}
7476                                }
7477                            }
7478                        }
7479                        _ => {}
7480                    }
7481                }
7482            }
7483            fn reverse_union_array(f: &mut Value) {
7484                if let Some(arr) = f.get_mut("type").and_then(|t| t.as_array_mut()) {
7485                    arr.reverse();
7486                }
7487            }
7488            fn reverse_items_union(f: &mut Value) {
7489                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7490                    if let Some(items) = obj.get_mut("items").and_then(|v| v.as_array_mut()) {
7491                        items.reverse();
7492                    }
7493                }
7494            }
7495            fn reverse_map_values_union(f: &mut Value) {
7496                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7497                    if let Some(values) = obj.get_mut("values").and_then(|v| v.as_array_mut()) {
7498                        values.reverse();
7499                    }
7500                }
7501            }
7502            fn reverse_nested_union_in_record(f: &mut Value, field_name: &str) {
7503                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7504                    if let Some(fields) = obj.get_mut("fields").and_then(|v| v.as_array_mut()) {
7505                        for ff in fields.iter_mut() {
7506                            if ff.get("name").and_then(|n| n.as_str()) == Some(field_name) {
7507                                if let Some(ty) = ff.get_mut("type") {
7508                                    if let Some(arr) = ty.as_array_mut() {
7509                                        arr.reverse();
7510                                    }
7511                                }
7512                            }
7513                        }
7514                    }
7515                }
7516            }
7517            fn rename_nested_field_with_alias(f: &mut Value, old: &str, new: &str) {
7518                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7519                    if let Some(fields) = obj.get_mut("fields").and_then(|v| v.as_array_mut()) {
7520                        for ff in fields.iter_mut() {
7521                            if ff.get("name").and_then(|n| n.as_str()) == Some(old) {
7522                                ff["name"] = Value::String(new.to_string());
7523                                ff["aliases"] = Value::Array(vec![Value::String(old.to_string())]);
7524                            }
7525                        }
7526                    }
7527                }
7528            }
7529            let mut root = load_writer_schema_json(path);
7530            assert_eq!(root["type"], "record", "writer schema must be a record");
7531            let fields = root
7532                .get_mut("fields")
7533                .and_then(|f| f.as_array_mut())
7534                .expect("record has fields");
7535            for f in fields.iter_mut() {
7536                let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
7537                    continue;
7538                };
7539                match name {
7540                    // Field aliasing (reader‑side aliases added)
7541                    "id" => {
7542                        f["name"] = Value::String("identifier".into());
7543                        f["aliases"] = Value::Array(vec![Value::String("id".into())]);
7544                    }
7545                    "renamed_with_default" => {
7546                        f["name"] = Value::String("old_count".into());
7547                        f["aliases"] =
7548                            Value::Array(vec![Value::String("renamed_with_default".into())]);
7549                    }
7550                    // Promotions
7551                    "count_i32" => set_type_string(f, "long"),
7552                    "ratio_f32" => set_type_string(f, "double"),
7553                    // Union reorder (exercise resolution)
7554                    "opt_str_nullsecond" => reverse_union_array(f),
7555                    "union_enum_record_array_map" => reverse_union_array(f),
7556                    "union_date_or_fixed4" => reverse_union_array(f),
7557                    "union_interval_or_string" => reverse_union_array(f),
7558                    "union_uuid_or_fixed10" => reverse_union_array(f),
7559                    "union_map_or_array_int" => reverse_union_array(f),
7560                    "maybe_auth" => reverse_nested_union_in_record(f, "token"),
7561                    // Array/Map unions
7562                    "arr_union" => reverse_items_union(f),
7563                    "map_union" => reverse_map_values_union(f),
7564                    // Nested rename using reader‑side alias
7565                    "address" => rename_nested_field_with_alias(f, "street", "street_name"),
7566                    // Type‑name alias for nested record
7567                    "person" => {
7568                        if let Some(tobj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7569                            tobj.insert("name".to_string(), Value::String("Person".into()));
7570                            tobj.insert(
7571                                "namespace".to_string(),
7572                                Value::String("com.example".into()),
7573                            );
7574                            tobj.insert(
7575                                "aliases".into(),
7576                                Value::Array(vec![
7577                                    Value::String("PersonV2".into()),
7578                                    Value::String("com.example.v2.PersonV2".into()),
7579                                ]),
7580                            );
7581                        }
7582                    }
7583                    _ => {}
7584                }
7585            }
7586            fields.reverse();
7587            AvroSchema::new(root.to_string())
7588        }
7589
7590        let path = "test/data/comprehensive_e2e.avro";
7591        let reader_schema = make_comprehensive_reader_schema(path);
7592        let batch = read_alltypes_with_reader_schema(path, reader_schema.clone());
7593
7594        const UUID_EXT_KEY: &str = "ARROW:extension:name";
7595        const UUID_LOGICAL_KEY: &str = "logicalType";
7596
7597        let uuid_md_top: Option<HashMap<String, String>> = batch
7598            .schema()
7599            .field_with_name("uuid_str")
7600            .ok()
7601            .and_then(|f| {
7602                let md = f.metadata();
7603                let has_ext = md.get(UUID_EXT_KEY).is_some();
7604                let is_uuid_logical = md
7605                    .get(UUID_LOGICAL_KEY)
7606                    .map(|v| v.trim_matches('"') == "uuid")
7607                    .unwrap_or(false);
7608                if has_ext || is_uuid_logical {
7609                    Some(md.clone())
7610                } else {
7611                    None
7612                }
7613            });
7614
7615        let uuid_md_union: Option<HashMap<String, String>> = batch
7616            .schema()
7617            .field_with_name("union_uuid_or_fixed10")
7618            .ok()
7619            .and_then(|f| match f.data_type() {
7620                DataType::Union(uf, _) => uf
7621                    .iter()
7622                    .find(|(_, child)| child.name() == "uuid")
7623                    .and_then(|(_, child)| {
7624                        let md = child.metadata();
7625                        let has_ext = md.get(UUID_EXT_KEY).is_some();
7626                        let is_uuid_logical = md
7627                            .get(UUID_LOGICAL_KEY)
7628                            .map(|v| v.trim_matches('"') == "uuid")
7629                            .unwrap_or(false);
7630                        if has_ext || is_uuid_logical {
7631                            Some(md.clone())
7632                        } else {
7633                            None
7634                        }
7635                    }),
7636                _ => None,
7637            });
7638
7639        let add_uuid_ext_top = |f: Field| -> Field {
7640            if let Some(md) = &uuid_md_top {
7641                f.with_metadata(md.clone())
7642            } else {
7643                f
7644            }
7645        };
7646        let add_uuid_ext_union = |f: Field| -> Field {
7647            if let Some(md) = &uuid_md_union {
7648                f.with_metadata(md.clone())
7649            } else {
7650                f
7651            }
7652        };
7653
7654        #[inline]
7655        fn uuid16_from_str(s: &str) -> [u8; 16] {
7656            let mut out = [0u8; 16];
7657            let mut idx = 0usize;
7658            let mut hi: Option<u8> = None;
7659            for ch in s.chars() {
7660                if ch == '-' {
7661                    continue;
7662                }
7663                let v = ch.to_digit(16).expect("invalid hex digit in UUID") as u8;
7664                if let Some(h) = hi {
7665                    out[idx] = (h << 4) | v;
7666                    idx += 1;
7667                    hi = None;
7668                } else {
7669                    hi = Some(v);
7670                }
7671            }
7672            assert_eq!(idx, 16, "UUID must decode to 16 bytes");
7673            out
7674        }
7675
7676        fn mk_dense_union(
7677            fields: &UnionFields,
7678            type_ids: Vec<i8>,
7679            offsets: Vec<i32>,
7680            provide: impl Fn(&Field) -> Option<ArrayRef>,
7681        ) -> ArrayRef {
7682            fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
7683                match dt {
7684                    DataType::Null => Arc::new(NullArray::new(0)),
7685                    DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
7686                    DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
7687                    DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
7688                    DataType::Float32 => Arc::new(Float32Array::from(Vec::<f32>::new())),
7689                    DataType::Float64 => Arc::new(Float64Array::from(Vec::<f64>::new())),
7690                    DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
7691                    DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
7692                    DataType::Date32 => Arc::new(Date32Array::from(Vec::<i32>::new())),
7693                    DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
7694                        Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
7695                    }
7696                    DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
7697                        Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
7698                    }
7699                    DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
7700                        let a = TimestampMillisecondArray::from(Vec::<i64>::new());
7701                        Arc::new(if let Some(tz) = tz {
7702                            a.with_timezone(tz.clone())
7703                        } else {
7704                            a
7705                        })
7706                    }
7707                    DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
7708                        let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
7709                        Arc::new(if let Some(tz) = tz {
7710                            a.with_timezone(tz.clone())
7711                        } else {
7712                            a
7713                        })
7714                    }
7715                    DataType::Interval(IntervalUnit::MonthDayNano) => Arc::new(
7716                        IntervalMonthDayNanoArray::from(Vec::<IntervalMonthDayNano>::new()),
7717                    ),
7718                    DataType::FixedSizeBinary(sz) => Arc::new(
7719                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(
7720                            std::iter::empty::<Option<Vec<u8>>>(),
7721                            *sz,
7722                        )
7723                        .unwrap(),
7724                    ),
7725                    DataType::Dictionary(_, _) => {
7726                        let keys = Int32Array::from(Vec::<i32>::new());
7727                        let values = Arc::new(StringArray::from(Vec::<&str>::new()));
7728                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
7729                    }
7730                    DataType::Struct(fields) => {
7731                        let children: Vec<ArrayRef> = fields
7732                            .iter()
7733                            .map(|f| empty_child_for(f.data_type()) as ArrayRef)
7734                            .collect();
7735                        Arc::new(StructArray::new(fields.clone(), children, None))
7736                    }
7737                    DataType::List(field) => {
7738                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
7739                        Arc::new(
7740                            ListArray::try_new(
7741                                field.clone(),
7742                                offsets,
7743                                empty_child_for(field.data_type()),
7744                                None,
7745                            )
7746                            .unwrap(),
7747                        )
7748                    }
7749                    DataType::Map(entry_field, is_sorted) => {
7750                        let (key_field, val_field) = match entry_field.data_type() {
7751                            DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
7752                            other => panic!("unexpected map entries type: {other:?}"),
7753                        };
7754                        let keys = StringArray::from(Vec::<&str>::new());
7755                        let vals: ArrayRef = match val_field.data_type() {
7756                            DataType::Null => Arc::new(NullArray::new(0)) as ArrayRef,
7757                            DataType::Boolean => {
7758                                Arc::new(BooleanArray::from(Vec::<bool>::new())) as ArrayRef
7759                            }
7760                            DataType::Int32 => {
7761                                Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
7762                            }
7763                            DataType::Int64 => {
7764                                Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
7765                            }
7766                            DataType::Float32 => {
7767                                Arc::new(Float32Array::from(Vec::<f32>::new())) as ArrayRef
7768                            }
7769                            DataType::Float64 => {
7770                                Arc::new(Float64Array::from(Vec::<f64>::new())) as ArrayRef
7771                            }
7772                            DataType::Utf8 => {
7773                                Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
7774                            }
7775                            DataType::Binary => {
7776                                Arc::new(BinaryArray::from(Vec::<&[u8]>::new())) as ArrayRef
7777                            }
7778                            DataType::Union(uf, _) => {
7779                                let children: Vec<ArrayRef> = uf
7780                                    .iter()
7781                                    .map(|(_, f)| empty_child_for(f.data_type()))
7782                                    .collect();
7783                                Arc::new(
7784                                    UnionArray::try_new(
7785                                        uf.clone(),
7786                                        ScalarBuffer::<i8>::from(Vec::<i8>::new()),
7787                                        Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
7788                                        children,
7789                                    )
7790                                    .unwrap(),
7791                                ) as ArrayRef
7792                            }
7793                            other => panic!("unsupported map value type: {other:?}"),
7794                        };
7795                        let entries = StructArray::new(
7796                            Fields::from(vec![
7797                                key_field.as_ref().clone(),
7798                                val_field.as_ref().clone(),
7799                            ]),
7800                            vec![Arc::new(keys) as ArrayRef, vals],
7801                            None,
7802                        );
7803                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
7804                        Arc::new(MapArray::new(
7805                            entry_field.clone(),
7806                            offsets,
7807                            entries,
7808                            None,
7809                            *is_sorted,
7810                        ))
7811                    }
7812                    other => panic!("empty_child_for: unhandled type {other:?}"),
7813                }
7814            }
7815            let children: Vec<ArrayRef> = fields
7816                .iter()
7817                .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
7818                .collect();
7819            Arc::new(
7820                UnionArray::try_new(
7821                    fields.clone(),
7822                    ScalarBuffer::<i8>::from(type_ids),
7823                    Some(ScalarBuffer::<i32>::from(offsets)),
7824                    children,
7825                )
7826                .unwrap(),
7827            ) as ArrayRef
7828        }
7829        let date_a: i32 = 19_000; // 2022-01-08
7830        let time_ms_a: i32 = 12 * 3_600_000 + 34 * 60_000 + 56_000 + 789;
7831        let time_us_eod: i64 = 86_400_000_000 - 1;
7832        let ts_ms_2024_01_01: i64 = 1_704_067_200_000; // 2024-01-01T00:00:00Z
7833        let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1_000;
7834        let dur_small = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
7835        let dur_zero = IntervalMonthDayNanoType::make_value(0, 0, 0);
7836        let dur_large =
7837            IntervalMonthDayNanoType::make_value(12, 31, ((86_400_000 - 1) as i64) * 1_000_000);
7838        let dur_2years = IntervalMonthDayNanoType::make_value(24, 0, 0);
7839        let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
7840        let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
7841        let item_name = Field::LIST_FIELD_DEFAULT_NAME;
7842        let uf_tri = UnionFields::new(
7843            vec![0, 1, 2],
7844            vec![
7845                Field::new("int", DataType::Int32, false),
7846                Field::new("string", DataType::Utf8, false),
7847                Field::new("boolean", DataType::Boolean, false),
7848            ],
7849        );
7850        let uf_arr_items = UnionFields::new(
7851            vec![0, 1, 2],
7852            vec![
7853                Field::new("null", DataType::Null, false),
7854                Field::new("string", DataType::Utf8, false),
7855                Field::new("long", DataType::Int64, false),
7856            ],
7857        );
7858        let arr_items_field = Arc::new(Field::new(
7859            item_name,
7860            DataType::Union(uf_arr_items.clone(), UnionMode::Dense),
7861            true,
7862        ));
7863        let uf_map_vals = UnionFields::new(
7864            vec![0, 1, 2],
7865            vec![
7866                Field::new("string", DataType::Utf8, false),
7867                Field::new("double", DataType::Float64, false),
7868                Field::new("null", DataType::Null, false),
7869            ],
7870        );
7871        let map_entries_field = Arc::new(Field::new(
7872            "entries",
7873            DataType::Struct(Fields::from(vec![
7874                Field::new("key", DataType::Utf8, false),
7875                Field::new(
7876                    "value",
7877                    DataType::Union(uf_map_vals.clone(), UnionMode::Dense),
7878                    true,
7879                ),
7880            ])),
7881            false,
7882        ));
7883        // Enum metadata for Color (now includes name/namespace)
7884        let mut enum_md_color = {
7885            let mut m = HashMap::<String, String>::new();
7886            m.insert(
7887                crate::schema::AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
7888                serde_json::to_string(&vec!["RED", "GREEN", "BLUE"]).unwrap(),
7889            );
7890            m
7891        };
7892        enum_md_color.insert(AVRO_NAME_METADATA_KEY.to_string(), "Color".to_string());
7893        enum_md_color.insert(
7894            AVRO_NAMESPACE_METADATA_KEY.to_string(),
7895            "org.apache.arrow.avrotests.v1.types".to_string(),
7896        );
7897        let union_rec_a_fields = Fields::from(vec![
7898            Field::new("a", DataType::Int32, false),
7899            Field::new("b", DataType::Utf8, false),
7900        ]);
7901        let union_rec_b_fields = Fields::from(vec![
7902            Field::new("x", DataType::Int64, false),
7903            Field::new("y", DataType::Binary, false),
7904        ]);
7905        let union_map_entries = Arc::new(Field::new(
7906            "entries",
7907            DataType::Struct(Fields::from(vec![
7908                Field::new("key", DataType::Utf8, false),
7909                Field::new("value", DataType::Utf8, false),
7910            ])),
7911            false,
7912        ));
7913        let rec_a_md = {
7914            let mut m = HashMap::<String, String>::new();
7915            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "RecA".to_string());
7916            m.insert(
7917                AVRO_NAMESPACE_METADATA_KEY.to_string(),
7918                "org.apache.arrow.avrotests.v1.types".to_string(),
7919            );
7920            m
7921        };
7922        let rec_b_md = {
7923            let mut m = HashMap::<String, String>::new();
7924            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "RecB".to_string());
7925            m.insert(
7926                AVRO_NAMESPACE_METADATA_KEY.to_string(),
7927                "org.apache.arrow.avrotests.v1.types".to_string(),
7928            );
7929            m
7930        };
7931        let uf_union_big = UnionFields::new(
7932            vec![0, 1, 2, 3, 4],
7933            vec![
7934                Field::new(
7935                    "map",
7936                    DataType::Map(union_map_entries.clone(), false),
7937                    false,
7938                ),
7939                Field::new(
7940                    "array",
7941                    DataType::List(Arc::new(Field::new(item_name, DataType::Int64, false))),
7942                    false,
7943                ),
7944                Field::new(
7945                    "org.apache.arrow.avrotests.v1.types.RecB",
7946                    DataType::Struct(union_rec_b_fields.clone()),
7947                    false,
7948                )
7949                .with_metadata(rec_b_md.clone()),
7950                Field::new(
7951                    "org.apache.arrow.avrotests.v1.types.RecA",
7952                    DataType::Struct(union_rec_a_fields.clone()),
7953                    false,
7954                )
7955                .with_metadata(rec_a_md.clone()),
7956                Field::new(
7957                    "org.apache.arrow.avrotests.v1.types.Color",
7958                    DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
7959                    false,
7960                )
7961                .with_metadata(enum_md_color.clone()),
7962            ],
7963        );
7964        let fx4_md = {
7965            let mut m = HashMap::<String, String>::new();
7966            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx4".to_string());
7967            m.insert(
7968                AVRO_NAMESPACE_METADATA_KEY.to_string(),
7969                "org.apache.arrow.avrotests.v1".to_string(),
7970            );
7971            m
7972        };
7973        let uf_date_fixed4 = UnionFields::new(
7974            vec![0, 1],
7975            vec![
7976                Field::new(
7977                    "org.apache.arrow.avrotests.v1.Fx4",
7978                    DataType::FixedSizeBinary(4),
7979                    false,
7980                )
7981                .with_metadata(fx4_md.clone()),
7982                Field::new("date", DataType::Date32, false),
7983            ],
7984        );
7985        let dur12u_md = {
7986            let mut m = HashMap::<String, String>::new();
7987            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Dur12U".to_string());
7988            m.insert(
7989                AVRO_NAMESPACE_METADATA_KEY.to_string(),
7990                "org.apache.arrow.avrotests.v1".to_string(),
7991            );
7992            m
7993        };
7994        let uf_dur_or_str = UnionFields::new(
7995            vec![0, 1],
7996            vec![
7997                Field::new("string", DataType::Utf8, false),
7998                Field::new(
7999                    "org.apache.arrow.avrotests.v1.Dur12U",
8000                    DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano),
8001                    false,
8002                )
8003                .with_metadata(dur12u_md.clone()),
8004            ],
8005        );
8006        let fx10_md = {
8007            let mut m = HashMap::<String, String>::new();
8008            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx10".to_string());
8009            m.insert(
8010                AVRO_NAMESPACE_METADATA_KEY.to_string(),
8011                "org.apache.arrow.avrotests.v1".to_string(),
8012            );
8013            m
8014        };
8015        let uf_uuid_or_fx10 = UnionFields::new(
8016            vec![0, 1],
8017            vec![
8018                Field::new(
8019                    "org.apache.arrow.avrotests.v1.Fx10",
8020                    DataType::FixedSizeBinary(10),
8021                    false,
8022                )
8023                .with_metadata(fx10_md.clone()),
8024                add_uuid_ext_union(Field::new("uuid", DataType::FixedSizeBinary(16), false)),
8025            ],
8026        );
8027        let uf_kv_val = UnionFields::new(
8028            vec![0, 1, 2],
8029            vec![
8030                Field::new("null", DataType::Null, false),
8031                Field::new("int", DataType::Int32, false),
8032                Field::new("long", DataType::Int64, false),
8033            ],
8034        );
8035        let kv_fields = Fields::from(vec![
8036            Field::new("key", DataType::Utf8, false),
8037            Field::new(
8038                "val",
8039                DataType::Union(uf_kv_val.clone(), UnionMode::Dense),
8040                true,
8041            ),
8042        ]);
8043        let kv_item_field = Arc::new(Field::new(
8044            item_name,
8045            DataType::Struct(kv_fields.clone()),
8046            false,
8047        ));
8048        let map_int_entries = Arc::new(Field::new(
8049            "entries",
8050            DataType::Struct(Fields::from(vec![
8051                Field::new("key", DataType::Utf8, false),
8052                Field::new("value", DataType::Int32, false),
8053            ])),
8054            false,
8055        ));
8056        let uf_map_or_array = UnionFields::new(
8057            vec![0, 1],
8058            vec![
8059                Field::new(
8060                    "array",
8061                    DataType::List(Arc::new(Field::new(item_name, DataType::Int32, false))),
8062                    false,
8063                ),
8064                Field::new("map", DataType::Map(map_int_entries.clone(), false), false),
8065            ],
8066        );
8067        let mut enum_md_status = {
8068            let mut m = HashMap::<String, String>::new();
8069            m.insert(
8070                crate::schema::AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
8071                serde_json::to_string(&vec!["UNKNOWN", "NEW", "PROCESSING", "DONE"]).unwrap(),
8072            );
8073            m
8074        };
8075        enum_md_status.insert(AVRO_NAME_METADATA_KEY.to_string(), "Status".to_string());
8076        enum_md_status.insert(
8077            AVRO_NAMESPACE_METADATA_KEY.to_string(),
8078            "org.apache.arrow.avrotests.v1.types".to_string(),
8079        );
8080        let mut dec20_md = HashMap::<String, String>::new();
8081        dec20_md.insert("precision".to_string(), "20".to_string());
8082        dec20_md.insert("scale".to_string(), "4".to_string());
8083        dec20_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "DecFix20".to_string());
8084        dec20_md.insert(
8085            AVRO_NAMESPACE_METADATA_KEY.to_string(),
8086            "org.apache.arrow.avrotests.v1.types".to_string(),
8087        );
8088        let mut dec10_md = HashMap::<String, String>::new();
8089        dec10_md.insert("precision".to_string(), "10".to_string());
8090        dec10_md.insert("scale".to_string(), "2".to_string());
8091        let fx16_top_md = {
8092            let mut m = HashMap::<String, String>::new();
8093            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx16".to_string());
8094            m.insert(
8095                AVRO_NAMESPACE_METADATA_KEY.to_string(),
8096                "org.apache.arrow.avrotests.v1.types".to_string(),
8097            );
8098            m
8099        };
8100        let dur12_top_md = {
8101            let mut m = HashMap::<String, String>::new();
8102            m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Dur12".to_string());
8103            m.insert(
8104                AVRO_NAMESPACE_METADATA_KEY.to_string(),
8105                "org.apache.arrow.avrotests.v1.types".to_string(),
8106            );
8107            m
8108        };
8109        #[cfg(feature = "small_decimals")]
8110        let dec20_dt = DataType::Decimal128(20, 4);
8111        #[cfg(not(feature = "small_decimals"))]
8112        let dec20_dt = DataType::Decimal128(20, 4);
8113        #[cfg(feature = "small_decimals")]
8114        let dec10_dt = DataType::Decimal64(10, 2);
8115        #[cfg(not(feature = "small_decimals"))]
8116        let dec10_dt = DataType::Decimal128(10, 2);
8117        let fields: Vec<FieldRef> = vec![
8118            Arc::new(Field::new(
8119                "person",
8120                DataType::Struct(Fields::from(vec![
8121                    Field::new("name", DataType::Utf8, false),
8122                    Field::new("age", DataType::Int32, false),
8123                ])),
8124                false,
8125            )),
8126            Arc::new(Field::new("old_count", DataType::Int32, false)),
8127            Arc::new(Field::new(
8128                "union_map_or_array_int",
8129                DataType::Union(uf_map_or_array.clone(), UnionMode::Dense),
8130                false,
8131            )),
8132            Arc::new(Field::new(
8133                "array_records_with_union",
8134                DataType::List(kv_item_field.clone()),
8135                false,
8136            )),
8137            Arc::new(Field::new(
8138                "union_uuid_or_fixed10",
8139                DataType::Union(uf_uuid_or_fx10.clone(), UnionMode::Dense),
8140                false,
8141            )),
8142            Arc::new(Field::new(
8143                "union_interval_or_string",
8144                DataType::Union(uf_dur_or_str.clone(), UnionMode::Dense),
8145                false,
8146            )),
8147            Arc::new(Field::new(
8148                "union_date_or_fixed4",
8149                DataType::Union(uf_date_fixed4.clone(), UnionMode::Dense),
8150                false,
8151            )),
8152            Arc::new(Field::new(
8153                "union_enum_record_array_map",
8154                DataType::Union(uf_union_big.clone(), UnionMode::Dense),
8155                false,
8156            )),
8157            Arc::new(Field::new(
8158                "maybe_auth",
8159                DataType::Struct(Fields::from(vec![
8160                    Field::new("user", DataType::Utf8, false),
8161                    Field::new("token", DataType::Binary, true), // [bytes,null] -> nullable bytes
8162                ])),
8163                false,
8164            )),
8165            Arc::new(Field::new(
8166                "address",
8167                DataType::Struct(Fields::from(vec![
8168                    Field::new("street_name", DataType::Utf8, false),
8169                    Field::new("zip", DataType::Int32, false),
8170                    Field::new("country", DataType::Utf8, false),
8171                ])),
8172                false,
8173            )),
8174            Arc::new(Field::new(
8175                "map_union",
8176                DataType::Map(map_entries_field.clone(), false),
8177                false,
8178            )),
8179            Arc::new(Field::new(
8180                "arr_union",
8181                DataType::List(arr_items_field.clone()),
8182                false,
8183            )),
8184            Arc::new(
8185                Field::new(
8186                    "status",
8187                    DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
8188                    false,
8189                )
8190                .with_metadata(enum_md_status.clone()),
8191            ),
8192            Arc::new(
8193                Field::new(
8194                    "interval_mdn",
8195                    DataType::Interval(IntervalUnit::MonthDayNano),
8196                    false,
8197                )
8198                .with_metadata(dur12_top_md.clone()),
8199            ),
8200            Arc::new(Field::new(
8201                "ts_micros_local",
8202                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None),
8203                false,
8204            )),
8205            Arc::new(Field::new(
8206                "ts_millis_local",
8207                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None),
8208                false,
8209            )),
8210            Arc::new(Field::new(
8211                "ts_micros_utc",
8212                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some("+00:00".into())),
8213                false,
8214            )),
8215            Arc::new(Field::new(
8216                "ts_millis_utc",
8217                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, Some("+00:00".into())),
8218                false,
8219            )),
8220            Arc::new(Field::new(
8221                "t_micros",
8222                DataType::Time64(arrow_schema::TimeUnit::Microsecond),
8223                false,
8224            )),
8225            Arc::new(Field::new(
8226                "t_millis",
8227                DataType::Time32(arrow_schema::TimeUnit::Millisecond),
8228                false,
8229            )),
8230            Arc::new(Field::new("d_date", DataType::Date32, false)),
8231            Arc::new(add_uuid_ext_top(Field::new(
8232                "uuid_str",
8233                DataType::FixedSizeBinary(16),
8234                false,
8235            ))),
8236            Arc::new(Field::new("dec_fix_s20_4", dec20_dt, false).with_metadata(dec20_md.clone())),
8237            Arc::new(
8238                Field::new("dec_bytes_s10_2", dec10_dt, false).with_metadata(dec10_md.clone()),
8239            ),
8240            Arc::new(
8241                Field::new("fx16_plain", DataType::FixedSizeBinary(16), false)
8242                    .with_metadata(fx16_top_md.clone()),
8243            ),
8244            Arc::new(Field::new("raw_bytes", DataType::Binary, false)),
8245            Arc::new(Field::new("str_utf8", DataType::Utf8, false)),
8246            Arc::new(Field::new(
8247                "tri_union_prim",
8248                DataType::Union(uf_tri.clone(), UnionMode::Dense),
8249                false,
8250            )),
8251            Arc::new(Field::new("opt_str_nullsecond", DataType::Utf8, true)),
8252            Arc::new(Field::new("opt_i32_nullfirst", DataType::Int32, true)),
8253            Arc::new(Field::new("count_i64", DataType::Int64, false)),
8254            Arc::new(Field::new("count_i32", DataType::Int64, false)),
8255            Arc::new(Field::new("ratio_f64", DataType::Float64, false)),
8256            Arc::new(Field::new("ratio_f32", DataType::Float64, false)),
8257            Arc::new(Field::new("flag", DataType::Boolean, false)),
8258            Arc::new(Field::new("identifier", DataType::Int64, false)),
8259        ];
8260        let expected_schema = Arc::new(arrow_schema::Schema::new(Fields::from(fields)));
8261        let mut cols: Vec<ArrayRef> = vec![
8262            Arc::new(StructArray::new(
8263                match expected_schema
8264                    .field_with_name("person")
8265                    .unwrap()
8266                    .data_type()
8267                {
8268                    DataType::Struct(fs) => fs.clone(),
8269                    _ => unreachable!(),
8270                },
8271                vec![
8272                    Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "Dave"])) as ArrayRef,
8273                    Arc::new(Int32Array::from(vec![30, 0, 25, 41])) as ArrayRef,
8274                ],
8275                None,
8276            )) as ArrayRef,
8277            Arc::new(Int32Array::from(vec![100, 42, 7, 42])) as ArrayRef,
8278        ];
8279        {
8280            let map_child: ArrayRef = {
8281                let keys = StringArray::from(vec!["x", "y", "only"]);
8282                let vals = Int32Array::from(vec![1, 2, 10]);
8283                let entries = StructArray::new(
8284                    Fields::from(vec![
8285                        Field::new("key", DataType::Utf8, false),
8286                        Field::new("value", DataType::Int32, false),
8287                    ]),
8288                    vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
8289                    None,
8290                );
8291                let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
8292                Arc::new(MapArray::new(
8293                    map_int_entries.clone(),
8294                    moff,
8295                    entries,
8296                    None,
8297                    false,
8298                )) as ArrayRef
8299            };
8300            let list_child: ArrayRef = {
8301                let values = Int32Array::from(vec![1, 2, 3, 0]);
8302                let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4]));
8303                Arc::new(
8304                    ListArray::try_new(
8305                        Arc::new(Field::new(item_name, DataType::Int32, false)),
8306                        offsets,
8307                        Arc::new(values),
8308                        None,
8309                    )
8310                    .unwrap(),
8311                ) as ArrayRef
8312            };
8313            let tids = vec![1, 0, 1, 0];
8314            let offs = vec![0, 0, 1, 1];
8315            let arr = mk_dense_union(&uf_map_or_array, tids, offs, |f| match f.name().as_str() {
8316                "array" => Some(list_child.clone()),
8317                "map" => Some(map_child.clone()),
8318                _ => None,
8319            });
8320            cols.push(arr);
8321        }
8322        {
8323            let keys = Arc::new(StringArray::from(vec!["k1", "k2", "k", "k3", "x"])) as ArrayRef;
8324            let type_ids = vec![1, 0, 2, 0, 1];
8325            let offsets = vec![0, 0, 0, 1, 1];
8326            let vals = mk_dense_union(&uf_kv_val, type_ids, offsets, |f| match f.data_type() {
8327                DataType::Int32 => Some(Arc::new(Int32Array::from(vec![5, -5])) as ArrayRef),
8328                DataType::Int64 => Some(Arc::new(Int64Array::from(vec![99i64])) as ArrayRef),
8329                DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
8330                _ => None,
8331            });
8332            let values_struct =
8333                Arc::new(StructArray::new(kv_fields.clone(), vec![keys, vals], None));
8334            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5]));
8335            let arr = Arc::new(
8336                ListArray::try_new(kv_item_field.clone(), list_offsets, values_struct, None)
8337                    .unwrap(),
8338            ) as ArrayRef;
8339            cols.push(arr);
8340        }
8341        {
8342            let type_ids = vec![1, 0, 1, 0]; // [uuid, fixed10, uuid, fixed10] but uf order = [fixed10, uuid]
8343            let offs = vec![0, 0, 1, 1];
8344            let arr = mk_dense_union(&uf_uuid_or_fx10, type_ids, offs, |f| match f.data_type() {
8345                DataType::FixedSizeBinary(16) => {
8346                    let it = [Some(uuid1), Some(uuid2)].into_iter();
8347                    Some(Arc::new(
8348                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
8349                    ) as ArrayRef)
8350                }
8351                DataType::FixedSizeBinary(10) => {
8352                    let fx10_a = [0xAAu8; 10];
8353                    let fx10_b = [0x00u8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99];
8354                    let it = [Some(fx10_a), Some(fx10_b)].into_iter();
8355                    Some(Arc::new(
8356                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
8357                    ) as ArrayRef)
8358                }
8359                _ => None,
8360            });
8361            cols.push(arr);
8362        }
8363        {
8364            let type_ids = vec![1, 0, 1, 0]; // [duration, string, duration, string] but uf order = [string, duration]
8365            let offs = vec![0, 0, 1, 1];
8366            let arr = mk_dense_union(&uf_dur_or_str, type_ids, offs, |f| match f.data_type() {
8367                DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano) => Some(Arc::new(
8368                    IntervalMonthDayNanoArray::from(vec![dur_small, dur_large]),
8369                )
8370                    as ArrayRef),
8371                DataType::Utf8 => Some(Arc::new(StringArray::from(vec![
8372                    "duration-as-text",
8373                    "iso-8601-period-P1Y",
8374                ])) as ArrayRef),
8375                _ => None,
8376            });
8377            cols.push(arr);
8378        }
8379        {
8380            let type_ids = vec![1, 0, 1, 0]; // [date, fixed, date, fixed] but uf order = [fixed, date]
8381            let offs = vec![0, 0, 1, 1];
8382            let arr = mk_dense_union(&uf_date_fixed4, type_ids, offs, |f| match f.data_type() {
8383                DataType::Date32 => Some(Arc::new(Date32Array::from(vec![date_a, 0])) as ArrayRef),
8384                DataType::FixedSizeBinary(4) => {
8385                    let it = [Some(*b"\x00\x11\x22\x33"), Some(*b"ABCD")].into_iter();
8386                    Some(Arc::new(
8387                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
8388                    ) as ArrayRef)
8389                }
8390                _ => None,
8391            });
8392            cols.push(arr);
8393        }
8394        {
8395            let tids = vec![4, 3, 1, 0]; // uf order = [map(0), array(1), RecB(2), RecA(3), enum(4)]
8396            let offs = vec![0, 0, 0, 0];
8397            let arr = mk_dense_union(&uf_union_big, tids, offs, |f| match f.data_type() {
8398                DataType::Dictionary(_, _) => {
8399                    let keys = Int32Array::from(vec![0i32]);
8400                    let values =
8401                        Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
8402                    Some(
8403                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
8404                            as ArrayRef,
8405                    )
8406                }
8407                DataType::Struct(fs) if fs == &union_rec_a_fields => {
8408                    let a = Int32Array::from(vec![7]);
8409                    let b = StringArray::from(vec!["rec"]);
8410                    Some(Arc::new(StructArray::new(
8411                        fs.clone(),
8412                        vec![Arc::new(a) as ArrayRef, Arc::new(b) as ArrayRef],
8413                        None,
8414                    )) as ArrayRef)
8415                }
8416                DataType::List(_) => {
8417                    let values = Int64Array::from(vec![1i64, 2, 3]);
8418                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
8419                    Some(Arc::new(
8420                        ListArray::try_new(
8421                            Arc::new(Field::new(item_name, DataType::Int64, false)),
8422                            offsets,
8423                            Arc::new(values),
8424                            None,
8425                        )
8426                        .unwrap(),
8427                    ) as ArrayRef)
8428                }
8429                DataType::Map(_, _) => {
8430                    let keys = StringArray::from(vec!["k"]);
8431                    let vals = StringArray::from(vec!["v"]);
8432                    let entries = StructArray::new(
8433                        Fields::from(vec![
8434                            Field::new("key", DataType::Utf8, false),
8435                            Field::new("value", DataType::Utf8, false),
8436                        ]),
8437                        vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
8438                        None,
8439                    );
8440                    let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1]));
8441                    Some(Arc::new(MapArray::new(
8442                        union_map_entries.clone(),
8443                        moff,
8444                        entries,
8445                        None,
8446                        false,
8447                    )) as ArrayRef)
8448                }
8449                _ => None,
8450            });
8451            cols.push(arr);
8452        }
8453        {
8454            let fs = match expected_schema
8455                .field_with_name("maybe_auth")
8456                .unwrap()
8457                .data_type()
8458            {
8459                DataType::Struct(fs) => fs.clone(),
8460                _ => unreachable!(),
8461            };
8462            let user =
8463                Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])) as ArrayRef;
8464            let token_values: Vec<Option<&[u8]>> = vec![
8465                None,
8466                Some(b"\x01\x02\x03".as_ref()),
8467                None,
8468                Some(b"".as_ref()),
8469            ];
8470            let token = Arc::new(BinaryArray::from(token_values)) as ArrayRef;
8471            cols.push(Arc::new(StructArray::new(fs, vec![user, token], None)) as ArrayRef);
8472        }
8473        {
8474            let fs = match expected_schema
8475                .field_with_name("address")
8476                .unwrap()
8477                .data_type()
8478            {
8479                DataType::Struct(fs) => fs.clone(),
8480                _ => unreachable!(),
8481            };
8482            let street = Arc::new(StringArray::from(vec![
8483                "100 Main",
8484                "",
8485                "42 Galaxy Way",
8486                "End Ave",
8487            ])) as ArrayRef;
8488            let zip = Arc::new(Int32Array::from(vec![12345, 0, 42424, 1])) as ArrayRef;
8489            let country = Arc::new(StringArray::from(vec!["US", "CA", "US", "GB"])) as ArrayRef;
8490            cols.push(Arc::new(StructArray::new(fs, vec![street, zip, country], None)) as ArrayRef);
8491        }
8492        {
8493            let keys = StringArray::from(vec!["a", "b", "c", "neg", "pi", "ok"]);
8494            let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4, 4, 6]));
8495            let tid_s = 0; // string
8496            let tid_d = 1; // double
8497            let tid_n = 2; // null
8498            let type_ids = vec![tid_d, tid_n, tid_s, tid_d, tid_d, tid_s];
8499            let offsets = vec![0, 0, 0, 1, 2, 1];
8500            let pi_5dp = (std::f64::consts::PI * 100_000.0).trunc() / 100_000.0;
8501            let vals = mk_dense_union(&uf_map_vals, type_ids, offsets, |f| match f.data_type() {
8502                DataType::Float64 => {
8503                    Some(Arc::new(Float64Array::from(vec![1.5f64, -0.5, pi_5dp])) as ArrayRef)
8504                }
8505                DataType::Utf8 => {
8506                    Some(Arc::new(StringArray::from(vec!["yes", "true"])) as ArrayRef)
8507                }
8508                DataType::Null => Some(Arc::new(NullArray::new(1)) as ArrayRef),
8509                _ => None,
8510            });
8511            let entries = StructArray::new(
8512                Fields::from(vec![
8513                    Field::new("key", DataType::Utf8, false),
8514                    Field::new(
8515                        "value",
8516                        DataType::Union(uf_map_vals.clone(), UnionMode::Dense),
8517                        true,
8518                    ),
8519                ]),
8520                vec![Arc::new(keys) as ArrayRef, vals],
8521                None,
8522            );
8523            let map = Arc::new(MapArray::new(
8524                map_entries_field.clone(),
8525                moff,
8526                entries,
8527                None,
8528                false,
8529            )) as ArrayRef;
8530            cols.push(map);
8531        }
8532        {
8533            let type_ids = vec![
8534                2, 1, 0, 2, 0, 1, 2, 2, 1, 0,
8535                2, // long,string,null,long,null,string,long,long,string,null,long
8536            ];
8537            let offsets = vec![0, 0, 0, 1, 1, 1, 2, 3, 2, 2, 4];
8538            let values =
8539                mk_dense_union(&uf_arr_items, type_ids, offsets, |f| match f.data_type() {
8540                    DataType::Int64 => {
8541                        Some(Arc::new(Int64Array::from(vec![1i64, -3, 0, -1, 0])) as ArrayRef)
8542                    }
8543                    DataType::Utf8 => {
8544                        Some(Arc::new(StringArray::from(vec!["x", "z", "end"])) as ArrayRef)
8545                    }
8546                    DataType::Null => Some(Arc::new(NullArray::new(3)) as ArrayRef),
8547                    _ => None,
8548                });
8549            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 4, 7, 8, 11]));
8550            let arr = Arc::new(
8551                ListArray::try_new(arr_items_field.clone(), list_offsets, values, None).unwrap(),
8552            ) as ArrayRef;
8553            cols.push(arr);
8554        }
8555        {
8556            let keys = Int32Array::from(vec![1, 2, 3, 0]); // NEW, PROCESSING, DONE, UNKNOWN
8557            let values = Arc::new(StringArray::from(vec![
8558                "UNKNOWN",
8559                "NEW",
8560                "PROCESSING",
8561                "DONE",
8562            ])) as ArrayRef;
8563            let dict = DictionaryArray::<Int32Type>::try_new(keys, values).unwrap();
8564            cols.push(Arc::new(dict) as ArrayRef);
8565        }
8566        cols.push(Arc::new(IntervalMonthDayNanoArray::from(vec![
8567            dur_small, dur_zero, dur_large, dur_2years,
8568        ])) as ArrayRef);
8569        cols.push(Arc::new(TimestampMicrosecondArray::from(vec![
8570            ts_us_2024_01_01 + 123_456,
8571            0,
8572            ts_us_2024_01_01 + 101_112,
8573            987_654_321,
8574        ])) as ArrayRef);
8575        cols.push(Arc::new(TimestampMillisecondArray::from(vec![
8576            ts_ms_2024_01_01 + 86_400_000,
8577            0,
8578            ts_ms_2024_01_01 + 789,
8579            123_456_789,
8580        ])) as ArrayRef);
8581        {
8582            let a = TimestampMicrosecondArray::from(vec![
8583                ts_us_2024_01_01,
8584                1,
8585                ts_us_2024_01_01 + 456,
8586                0,
8587            ])
8588            .with_timezone("+00:00");
8589            cols.push(Arc::new(a) as ArrayRef);
8590        }
8591        {
8592            let a = TimestampMillisecondArray::from(vec![
8593                ts_ms_2024_01_01,
8594                -1,
8595                ts_ms_2024_01_01 + 123,
8596                0,
8597            ])
8598            .with_timezone("+00:00");
8599            cols.push(Arc::new(a) as ArrayRef);
8600        }
8601        cols.push(Arc::new(Time64MicrosecondArray::from(vec![
8602            time_us_eod,
8603            0,
8604            1,
8605            1_000_000,
8606        ])) as ArrayRef);
8607        cols.push(Arc::new(Time32MillisecondArray::from(vec![
8608            time_ms_a,
8609            0,
8610            1,
8611            86_400_000 - 1,
8612        ])) as ArrayRef);
8613        cols.push(Arc::new(Date32Array::from(vec![date_a, 0, 1, 365])) as ArrayRef);
8614        {
8615            let it = [Some(uuid1), Some(uuid2), Some(uuid1), Some(uuid2)].into_iter();
8616            cols.push(Arc::new(
8617                FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
8618            ) as ArrayRef);
8619        }
8620        {
8621            #[cfg(feature = "small_decimals")]
8622            let arr = Arc::new(
8623                Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
8624                    .with_precision_and_scale(20, 4)
8625                    .unwrap(),
8626            ) as ArrayRef;
8627            #[cfg(not(feature = "small_decimals"))]
8628            let arr = Arc::new(
8629                Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
8630                    .with_precision_and_scale(20, 4)
8631                    .unwrap(),
8632            ) as ArrayRef;
8633            cols.push(arr);
8634        }
8635        {
8636            #[cfg(feature = "small_decimals")]
8637            let arr = Arc::new(
8638                Decimal64Array::from_iter_values([123456i64, -1, 0, 9_999_999_999i64])
8639                    .with_precision_and_scale(10, 2)
8640                    .unwrap(),
8641            ) as ArrayRef;
8642            #[cfg(not(feature = "small_decimals"))]
8643            let arr = Arc::new(
8644                Decimal128Array::from_iter_values([123456i128, -1, 0, 9_999_999_999i128])
8645                    .with_precision_and_scale(10, 2)
8646                    .unwrap(),
8647            ) as ArrayRef;
8648            cols.push(arr);
8649        }
8650        {
8651            let it = [
8652                Some(*b"0123456789ABCDEF"),
8653                Some([0u8; 16]),
8654                Some(*b"ABCDEFGHIJKLMNOP"),
8655                Some([0xAA; 16]),
8656            ]
8657            .into_iter();
8658            cols.push(Arc::new(
8659                FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
8660            ) as ArrayRef);
8661        }
8662        cols.push(Arc::new(BinaryArray::from(vec![
8663            b"\x00\x01".as_ref(),
8664            b"".as_ref(),
8665            b"\xFF\x00".as_ref(),
8666            b"\x10\x20\x30\x40".as_ref(),
8667        ])) as ArrayRef);
8668        cols.push(Arc::new(StringArray::from(vec!["hello", "", "world", "✓ unicode"])) as ArrayRef);
8669        {
8670            let tids = vec![0, 1, 2, 1];
8671            let offs = vec![0, 0, 0, 1];
8672            let arr = mk_dense_union(&uf_tri, tids, offs, |f| match f.data_type() {
8673                DataType::Int32 => Some(Arc::new(Int32Array::from(vec![0])) as ArrayRef),
8674                DataType::Utf8 => Some(Arc::new(StringArray::from(vec!["hi", ""])) as ArrayRef),
8675                DataType::Boolean => Some(Arc::new(BooleanArray::from(vec![true])) as ArrayRef),
8676                _ => None,
8677            });
8678            cols.push(arr);
8679        }
8680        cols.push(Arc::new(StringArray::from(vec![
8681            Some("alpha"),
8682            None,
8683            Some("s3"),
8684            Some(""),
8685        ])) as ArrayRef);
8686        cols.push(Arc::new(Int32Array::from(vec![None, Some(42), None, Some(0)])) as ArrayRef);
8687        cols.push(Arc::new(Int64Array::from(vec![
8688            7_000_000_000i64,
8689            -2,
8690            0,
8691            -9_876_543_210i64,
8692        ])) as ArrayRef);
8693        cols.push(Arc::new(Int64Array::from(vec![7i64, -1, 0, 123])) as ArrayRef);
8694        cols.push(Arc::new(Float64Array::from(vec![2.5f64, -1.0, 7.0, -2.25])) as ArrayRef);
8695        cols.push(Arc::new(Float64Array::from(vec![1.25f64, -0.0, 3.5, 9.75])) as ArrayRef);
8696        cols.push(Arc::new(BooleanArray::from(vec![true, false, true, false])) as ArrayRef);
8697        cols.push(Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef);
8698        let expected = RecordBatch::try_new(expected_schema, cols).unwrap();
8699        assert_eq!(
8700            expected, batch,
8701            "entire RecordBatch mismatch (schema, all columns, all rows)"
8702        );
8703    }
8704}