1use crate::codec::AvroFieldBuilder;
483use crate::reader::header::read_header;
484use crate::schema::{
485 AvroSchema, CONFLUENT_MAGIC, Fingerprint, FingerprintAlgorithm, SCHEMA_METADATA_KEY,
486 SINGLE_OBJECT_MAGIC, Schema, SchemaStore,
487};
488use arrow_array::{RecordBatch, RecordBatchReader};
489use arrow_schema::{ArrowError, SchemaRef};
490use block::BlockDecoder;
491use header::Header;
492use indexmap::IndexMap;
493use record::RecordDecoder;
494use std::io::BufRead;
495
496mod block;
497mod cursor;
498mod header;
499mod record;
500mod vlq;
501
502fn is_incomplete_data(err: &ArrowError) -> bool {
503 matches!(
504 err,
505 ArrowError::ParseError(msg)
506 if msg.contains("Unexpected EOF")
507 )
508}
509
/// Push-based decoder for a stream of prefixed Avro messages.
///
/// Bytes are supplied through [`Decoder::decode`]; decoded rows accumulate in the
/// active record decoder until [`Decoder::flush`] turns them into a `RecordBatch`.
#[derive(Debug)]
pub struct Decoder {
    /// Record decoder that rows are currently decoded into; also the source of `schema()`.
    active_decoder: RecordDecoder,
    /// Fingerprint that incoming prefixes are compared against; `None` when the
    /// decoder was built without a fingerprint (the builder does this for headers).
    active_fingerprint: Option<Fingerprint>,
    /// Configured number of rows per batch.
    batch_size: usize,
    /// Rows that may still be decoded before the batch counts as full; reset to
    /// `batch_size` when a batch is flushed.
    remaining_capacity: usize,
    /// Record decoders for the fingerprints that are not currently active.
    cache: IndexMap<Fingerprint, RecordDecoder>,
    /// Selects which magic bytes and fingerprint layout a message prefix must have.
    fingerprint_algorithm: FingerprintAlgorithm,
    /// Decoder chosen by a prefix whose fingerprint differs from the active one;
    /// installed once the current batch is empty or has been flushed.
    pending_schema: Option<(Fingerprint, RecordDecoder)>,
    /// `true` after a prefix has been consumed and its record body is still to be decoded.
    awaiting_body: bool,
}
643
644impl Decoder {
645 pub fn schema(&self) -> SchemaRef {
650 self.active_decoder.schema().clone()
651 }
652
653 pub fn batch_size(&self) -> usize {
655 self.batch_size
656 }
657
658 pub fn decode(&mut self, data: &[u8]) -> Result<usize, ArrowError> {
679 let mut total_consumed = 0usize;
680 while total_consumed < data.len() && self.remaining_capacity > 0 {
681 if self.awaiting_body {
682 match self.active_decoder.decode(&data[total_consumed..], 1) {
683 Ok(n) => {
684 self.remaining_capacity -= 1;
685 total_consumed += n;
686 self.awaiting_body = false;
687 continue;
688 }
689 Err(ref e) if is_incomplete_data(e) => break,
690 err => return err,
691 };
692 }
693 match self.handle_prefix(&data[total_consumed..])? {
694 Some(0) => break, Some(n) => {
696 total_consumed += n;
697 self.apply_pending_schema_if_batch_empty();
698 self.awaiting_body = true;
699 }
700 None => {
701 return Err(ArrowError::ParseError(
702 "Missing magic bytes and fingerprint".to_string(),
703 ));
704 }
705 }
706 }
707 Ok(total_consumed)
708 }
709
710 fn handle_prefix(&mut self, buf: &[u8]) -> Result<Option<usize>, ArrowError> {
715 match self.fingerprint_algorithm {
716 FingerprintAlgorithm::Rabin => {
717 self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
718 Fingerprint::Rabin(u64::from_le_bytes(bytes))
719 })
720 }
721 FingerprintAlgorithm::Id => self.handle_prefix_common(buf, &CONFLUENT_MAGIC, |bytes| {
722 Fingerprint::Id(u32::from_be_bytes(bytes))
723 }),
724 FingerprintAlgorithm::Id64 => {
725 self.handle_prefix_common(buf, &CONFLUENT_MAGIC, |bytes| {
726 Fingerprint::Id64(u64::from_be_bytes(bytes))
727 })
728 }
729 #[cfg(feature = "md5")]
730 FingerprintAlgorithm::MD5 => {
731 self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
732 Fingerprint::MD5(bytes)
733 })
734 }
735 #[cfg(feature = "sha256")]
736 FingerprintAlgorithm::SHA256 => {
737 self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
738 Fingerprint::SHA256(bytes)
739 })
740 }
741 }
742 }
743
744 fn handle_prefix_common<const MAGIC_LEN: usize, const N: usize>(
748 &mut self,
749 buf: &[u8],
750 magic: &[u8; MAGIC_LEN],
751 fingerprint_from: impl FnOnce([u8; N]) -> Fingerprint,
752 ) -> Result<Option<usize>, ArrowError> {
753 if buf.len() < MAGIC_LEN {
756 return Ok(Some(0));
757 }
758 if &buf[..MAGIC_LEN] != magic {
760 return Ok(None);
761 }
762 let consumed_fp = self.handle_fingerprint(&buf[MAGIC_LEN..], fingerprint_from)?;
764 Ok(Some(consumed_fp.map_or(0, |n| n + MAGIC_LEN)))
767 }
768
769 fn handle_fingerprint<const N: usize>(
774 &mut self,
775 buf: &[u8],
776 fingerprint_from: impl FnOnce([u8; N]) -> Fingerprint,
777 ) -> Result<Option<usize>, ArrowError> {
778 let Some(fingerprint_bytes) = buf.get(..N) else {
780 return Ok(None); };
782 let new_fingerprint = fingerprint_from(fingerprint_bytes.try_into().unwrap());
784 if self.active_fingerprint != Some(new_fingerprint) {
786 let Some(new_decoder) = self.cache.shift_remove(&new_fingerprint) else {
787 return Err(ArrowError::ParseError(format!(
788 "Unknown fingerprint: {new_fingerprint:?}"
789 )));
790 };
791 self.pending_schema = Some((new_fingerprint, new_decoder));
792 if self.remaining_capacity < self.batch_size {
795 self.remaining_capacity = 0;
796 }
797 }
798 Ok(Some(N))
799 }
800
801 fn apply_pending_schema(&mut self) {
802 if let Some((new_fingerprint, new_decoder)) = self.pending_schema.take() {
803 if let Some(old_fingerprint) = self.active_fingerprint.replace(new_fingerprint) {
804 let old_decoder = std::mem::replace(&mut self.active_decoder, new_decoder);
805 self.cache.shift_remove(&old_fingerprint);
806 self.cache.insert(old_fingerprint, old_decoder);
807 } else {
808 self.active_decoder = new_decoder;
809 }
810 }
811 }
812
813 fn apply_pending_schema_if_batch_empty(&mut self) {
814 if self.batch_is_empty() {
815 self.apply_pending_schema();
816 }
817 }
818
819 fn flush_and_reset(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
820 if self.batch_is_empty() {
821 return Ok(None);
822 }
823 let batch = self.active_decoder.flush()?;
824 self.remaining_capacity = self.batch_size;
825 Ok(Some(batch))
826 }
827
828 pub fn flush(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
835 let batch = self.flush_and_reset();
837 self.apply_pending_schema();
838 batch
839 }
840
841 pub fn capacity(&self) -> usize {
843 self.remaining_capacity
844 }
845
846 pub fn batch_is_full(&self) -> bool {
848 self.remaining_capacity == 0
849 }
850
851 pub fn batch_is_empty(&self) -> bool {
853 self.remaining_capacity == self.batch_size
854 }
855
856 fn decode_block(&mut self, data: &[u8], count: usize) -> Result<(usize, usize), ArrowError> {
860 let to_decode = std::cmp::min(count, self.remaining_capacity);
862 if to_decode == 0 {
863 return Ok((0, 0));
864 }
865 let consumed = self.active_decoder.decode(data, to_decode)?;
866 self.remaining_capacity -= to_decode;
867 Ok((consumed, to_decode))
868 }
869
870 fn flush_block(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
873 self.flush_and_reset()
874 }
875}
876
/// Builder for [`Reader`] (container files with a header) and [`Decoder`]
/// (prefixed messages resolved through a schema store).
#[derive(Debug)]
pub struct ReaderBuilder {
    /// Number of rows per batch; `Default` sets this to 1024.
    batch_size: usize,
    /// Forwarded to `AvroFieldBuilder::with_strict_mode` when building decoders.
    strict_mode: bool,
    /// Forwarded to `AvroFieldBuilder::with_utf8view` when building decoders.
    utf8_view: bool,
    /// Optional reader schema that writer schemas are resolved against.
    reader_schema: Option<AvroSchema>,
    /// Optional field indices passed to `AvroSchema::project`.
    projection: Option<Vec<usize>>,
    /// Writer schemas, keyed by fingerprint; required by `build_decoder`.
    writer_schema_store: Option<SchemaStore>,
    /// Fingerprint of the initially active schema; when unset, the first
    /// fingerprint in the store is used.
    active_fingerprint: Option<Fingerprint>,
}
949
950impl Default for ReaderBuilder {
951 fn default() -> Self {
952 Self {
953 batch_size: 1024,
954 strict_mode: false,
955 utf8_view: false,
956 reader_schema: None,
957 projection: None,
958 writer_schema_store: None,
959 active_fingerprint: None,
960 }
961 }
962}
963
964impl ReaderBuilder {
965 pub fn new() -> Self {
975 Self::default()
976 }
977
978 fn make_record_decoder(
979 &self,
980 writer_schema: &Schema,
981 reader_schema: Option<&Schema>,
982 ) -> Result<RecordDecoder, ArrowError> {
983 let mut builder = AvroFieldBuilder::new(writer_schema);
984 if let Some(reader_schema) = reader_schema {
985 builder = builder.with_reader_schema(reader_schema);
986 }
987 let root = builder
988 .with_utf8view(self.utf8_view)
989 .with_strict_mode(self.strict_mode)
990 .build()?;
991 RecordDecoder::try_new_with_options(root.data_type())
992 }
993
994 fn make_record_decoder_from_schemas(
995 &self,
996 writer_schema: &Schema,
997 reader_schema: Option<&AvroSchema>,
998 ) -> Result<RecordDecoder, ArrowError> {
999 let reader_schema_raw = reader_schema.map(|s| s.schema()).transpose()?;
1000 self.make_record_decoder(writer_schema, reader_schema_raw.as_ref())
1001 }
1002
1003 fn make_decoder_with_parts(
1004 &self,
1005 active_decoder: RecordDecoder,
1006 active_fingerprint: Option<Fingerprint>,
1007 cache: IndexMap<Fingerprint, RecordDecoder>,
1008 fingerprint_algorithm: FingerprintAlgorithm,
1009 ) -> Decoder {
1010 Decoder {
1011 batch_size: self.batch_size,
1012 remaining_capacity: self.batch_size,
1013 active_fingerprint,
1014 active_decoder,
1015 cache,
1016 fingerprint_algorithm,
1017 pending_schema: None,
1018 awaiting_body: false,
1019 }
1020 }
1021
1022 fn make_decoder(
1023 &self,
1024 header: Option<&Header>,
1025 reader_schema: Option<&AvroSchema>,
1026 ) -> Result<Decoder, ArrowError> {
1027 if let Some(hdr) = header {
1028 let writer_schema = hdr
1029 .schema()
1030 .map_err(|e| ArrowError::ExternalError(Box::new(e)))?
1031 .ok_or_else(|| {
1032 ArrowError::ParseError("No Avro schema present in file header".into())
1033 })?;
1034 let projected_reader_schema = self
1035 .projection
1036 .as_deref()
1037 .map(|projection| {
1038 let base_schema = if let Some(reader_schema) = reader_schema {
1039 reader_schema.clone()
1040 } else {
1041 let raw = hdr.get(SCHEMA_METADATA_KEY).ok_or_else(|| {
1042 ArrowError::ParseError(
1043 "No Avro schema present in file header".to_string(),
1044 )
1045 })?;
1046 let json_string = std::str::from_utf8(raw)
1047 .map_err(|e| {
1048 ArrowError::ParseError(format!(
1049 "Invalid UTF-8 in Avro schema header: {e}"
1050 ))
1051 })?
1052 .to_string();
1053 AvroSchema::new(json_string)
1054 };
1055 base_schema.project(projection)
1056 })
1057 .transpose()?;
1058 let effective_reader_schema = projected_reader_schema.as_ref().or(reader_schema);
1059 let record_decoder =
1060 self.make_record_decoder_from_schemas(&writer_schema, effective_reader_schema)?;
1061 return Ok(self.make_decoder_with_parts(
1062 record_decoder,
1063 None,
1064 IndexMap::new(),
1065 FingerprintAlgorithm::Rabin,
1066 ));
1067 }
1068 let store = self.writer_schema_store.as_ref().ok_or_else(|| {
1069 ArrowError::ParseError("Writer schema store required for raw Avro".into())
1070 })?;
1071 let fingerprints = store.fingerprints();
1072 if fingerprints.is_empty() {
1073 return Err(ArrowError::ParseError(
1074 "Writer schema store must contain at least one schema".into(),
1075 ));
1076 }
1077 let start_fingerprint = self
1078 .active_fingerprint
1079 .or_else(|| fingerprints.first().copied())
1080 .ok_or_else(|| {
1081 ArrowError::ParseError("Could not determine initial schema fingerprint".into())
1082 })?;
1083 let projection = self.projection.as_deref();
1084 let projected_reader_schema = match (projection, reader_schema) {
1085 (Some(projection), Some(reader_schema)) => Some(reader_schema.project(projection)?),
1086 _ => None,
1087 };
1088 let mut cache = IndexMap::with_capacity(fingerprints.len().saturating_sub(1));
1089 let mut active_decoder: Option<RecordDecoder> = None;
1090 for fingerprint in store.fingerprints() {
1091 let avro_schema = match store.lookup(&fingerprint) {
1092 Some(schema) => schema,
1093 None => {
1094 return Err(ArrowError::ComputeError(format!(
1095 "Fingerprint {fingerprint:?} not found in schema store",
1096 )));
1097 }
1098 };
1099 let writer_schema = avro_schema.schema()?;
1100 let record_decoder = match projection {
1101 None => self.make_record_decoder_from_schemas(&writer_schema, reader_schema)?,
1102 Some(projection) => {
1103 if let Some(ref pruned_reader_schema) = projected_reader_schema {
1104 self.make_record_decoder_from_schemas(
1105 &writer_schema,
1106 Some(pruned_reader_schema),
1107 )?
1108 } else {
1109 let derived_reader_schema = avro_schema.project(projection)?;
1110 self.make_record_decoder_from_schemas(
1111 &writer_schema,
1112 Some(&derived_reader_schema),
1113 )?
1114 }
1115 }
1116 };
1117 if fingerprint == start_fingerprint {
1118 active_decoder = Some(record_decoder);
1119 } else {
1120 cache.insert(fingerprint, record_decoder);
1121 }
1122 }
1123 let active_decoder = active_decoder.ok_or_else(|| {
1124 ArrowError::ComputeError(format!(
1125 "Initial fingerprint {start_fingerprint:?} not found in schema store"
1126 ))
1127 })?;
1128 Ok(self.make_decoder_with_parts(
1129 active_decoder,
1130 Some(start_fingerprint),
1131 cache,
1132 store.fingerprint_algorithm(),
1133 ))
1134 }
1135
1136 pub fn with_batch_size(mut self, batch_size: usize) -> Self {
1142 self.batch_size = batch_size;
1143 self
1144 }
1145
1146 pub fn with_utf8_view(mut self, utf8_view: bool) -> Self {
1152 self.utf8_view = utf8_view;
1153 self
1154 }
1155
1156 pub fn use_utf8view(&self) -> bool {
1158 self.utf8_view
1159 }
1160
1161 pub fn with_strict_mode(mut self, strict_mode: bool) -> Self {
1166 self.strict_mode = strict_mode;
1167 self
1168 }
1169
1170 pub fn with_reader_schema(mut self, schema: AvroSchema) -> Self {
1177 self.reader_schema = Some(schema);
1178 self
1179 }
1180
1181 pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
1239 self.projection = Some(projection);
1240 self
1241 }
1242
1243 pub fn with_writer_schema_store(mut self, store: SchemaStore) -> Self {
1251 self.writer_schema_store = Some(store);
1252 self
1253 }
1254
1255 pub fn with_active_fingerprint(mut self, fp: Fingerprint) -> Self {
1260 self.active_fingerprint = Some(fp);
1261 self
1262 }
1263
1264 pub fn build<R: BufRead>(self, mut reader: R) -> Result<Reader<R>, ArrowError> {
1270 let header = read_header(&mut reader)?;
1271 let decoder = self.make_decoder(Some(&header), self.reader_schema.as_ref())?;
1272 Ok(Reader {
1273 reader,
1274 header,
1275 decoder,
1276 block_decoder: BlockDecoder::default(),
1277 block_data: Vec::new(),
1278 block_count: 0,
1279 block_cursor: 0,
1280 finished: false,
1281 })
1282 }
1283
1284 pub fn build_decoder(self) -> Result<Decoder, ArrowError> {
1293 if self.writer_schema_store.is_none() {
1294 return Err(ArrowError::InvalidArgumentError(
1295 "Building a decoder requires a writer schema store".to_string(),
1296 ));
1297 }
1298 self.make_decoder(None, self.reader_schema.as_ref())
1299 }
1300}
1301
/// Reads record batches from a buffered input whose header has already been parsed.
#[derive(Debug)]
pub struct Reader<R: BufRead> {
    /// Underlying buffered input.
    reader: R,
    /// Header read from the start of the input.
    header: Header,
    /// Row decoder that accumulates records into batches.
    decoder: Decoder,
    /// Incremental decoder for the next block.
    block_decoder: BlockDecoder,
    /// Bytes of the current block, after decompression when the header names a codec.
    block_data: Vec<u8>,
    /// Records of the current block that have not been decoded yet.
    block_count: usize,
    /// Offset into `block_data` of the next undecoded byte.
    block_cursor: usize,
    /// Set once the underlying reader returns an empty buffer.
    finished: bool,
}
1322
1323impl<R: BufRead> Reader<R> {
1324 pub fn schema(&self) -> SchemaRef {
1327 self.decoder.schema()
1328 }
1329
1330 pub fn avro_header(&self) -> &Header {
1332 &self.header
1333 }
1334
1335 fn read(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
1340 'outer: while !self.finished && !self.decoder.batch_is_full() {
1341 while self.block_cursor == self.block_data.len() {
1342 let buf = self.reader.fill_buf()?;
1343 if buf.is_empty() {
1344 self.finished = true;
1345 break 'outer;
1346 }
1347 let consumed = self.block_decoder.decode(buf)?;
1349 self.reader.consume(consumed);
1350 if let Some(block) = self.block_decoder.flush() {
1351 self.block_data = if let Some(ref codec) = self.header.compression()? {
1353 codec.decompress(&block.data)?
1354 } else {
1355 block.data
1356 };
1357 self.block_count = block.count;
1358 self.block_cursor = 0;
1359 } else if consumed == 0 {
1360 return Err(ArrowError::ParseError(
1362 "Could not decode next Avro block from partial data".to_string(),
1363 ));
1364 }
1365 }
1366 if self.block_cursor < self.block_data.len() {
1368 let (consumed, records_decoded) = self
1369 .decoder
1370 .decode_block(&self.block_data[self.block_cursor..], self.block_count)?;
1371 self.block_cursor += consumed;
1372 self.block_count -= records_decoded;
1373 }
1374 }
1375 self.decoder.flush_block()
1376 }
1377}
1378
1379impl<R: BufRead> Iterator for Reader<R> {
1380 type Item = Result<RecordBatch, ArrowError>;
1381
1382 fn next(&mut self) -> Option<Self::Item> {
1383 self.read().transpose()
1384 }
1385}
1386
1387impl<R: BufRead> RecordBatchReader for Reader<R> {
1388 fn schema(&self) -> SchemaRef {
1389 self.schema()
1390 }
1391}
1392
1393#[cfg(test)]
1394mod test {
1395 use crate::codec::AvroFieldBuilder;
1396 use crate::reader::record::RecordDecoder;
1397 use crate::reader::{Decoder, Reader, ReaderBuilder};
1398 use crate::schema::{
1399 AVRO_ENUM_SYMBOLS_METADATA_KEY, AVRO_NAME_METADATA_KEY, AVRO_NAMESPACE_METADATA_KEY,
1400 AvroSchema, CONFLUENT_MAGIC, Fingerprint, FingerprintAlgorithm, PrimitiveType,
1401 SINGLE_OBJECT_MAGIC, SchemaStore,
1402 };
1403 use crate::test_util::arrow_test_data;
1404 use crate::writer::AvroWriter;
1405 use arrow_array::builder::{
1406 ArrayBuilder, BooleanBuilder, Float32Builder, Int32Builder, Int64Builder, ListBuilder,
1407 MapBuilder, StringBuilder, StructBuilder,
1408 };
1409 #[cfg(feature = "snappy")]
1410 use arrow_array::builder::{Float64Builder, MapFieldNames};
1411 use arrow_array::cast::AsArray;
1412 #[cfg(not(feature = "avro_custom_types"))]
1413 use arrow_array::types::Int64Type;
1414 #[cfg(feature = "avro_custom_types")]
1415 use arrow_array::types::{
1416 DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
1417 DurationSecondType,
1418 };
1419 use arrow_array::types::{Int32Type, IntervalMonthDayNanoType};
1420 use arrow_array::*;
1421 #[cfg(feature = "snappy")]
1422 use arrow_buffer::{Buffer, NullBuffer};
1423 use arrow_buffer::{IntervalMonthDayNano, OffsetBuffer, ScalarBuffer, i256};
1424 #[cfg(feature = "avro_custom_types")]
1425 use arrow_schema::{
1426 ArrowError, DataType, Field, FieldRef, Fields, IntervalUnit, Schema, TimeUnit, UnionFields,
1427 UnionMode,
1428 };
1429 #[cfg(not(feature = "avro_custom_types"))]
1430 use arrow_schema::{
1431 ArrowError, DataType, Field, FieldRef, Fields, IntervalUnit, Schema, UnionFields, UnionMode,
1432 };
1433 use bytes::Bytes;
1434 use futures::executor::block_on;
1435 use futures::{Stream, StreamExt, TryStreamExt, stream};
1436 use serde_json::{Value, json};
1437 use std::collections::HashMap;
1438 use std::fs::File;
1439 use std::io::{BufReader, Cursor};
1440 use std::sync::Arc;
1441
1442 fn files() -> impl Iterator<Item = &'static str> {
1443 [
1444 #[cfg(feature = "snappy")]
1446 "avro/alltypes_plain.avro",
1447 #[cfg(feature = "snappy")]
1448 "avro/alltypes_plain.snappy.avro",
1449 #[cfg(feature = "zstd")]
1450 "avro/alltypes_plain.zstandard.avro",
1451 #[cfg(feature = "bzip2")]
1452 "avro/alltypes_plain.bzip2.avro",
1453 #[cfg(feature = "xz")]
1454 "avro/alltypes_plain.xz.avro",
1455 ]
1456 .into_iter()
1457 }
1458
1459 fn read_file(path: &str, batch_size: usize, utf8_view: bool) -> RecordBatch {
1460 let file = File::open(path).unwrap();
1461 let reader = ReaderBuilder::new()
1462 .with_batch_size(batch_size)
1463 .with_utf8_view(utf8_view)
1464 .build(BufReader::new(file))
1465 .unwrap();
1466 let schema = reader.schema();
1467 let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
1468 arrow::compute::concat_batches(&schema, &batches).unwrap()
1469 }
1470
1471 fn read_file_strict(
1472 path: &str,
1473 batch_size: usize,
1474 utf8_view: bool,
1475 ) -> Result<Reader<BufReader<File>>, ArrowError> {
1476 let file = File::open(path)?;
1477 ReaderBuilder::new()
1478 .with_batch_size(batch_size)
1479 .with_utf8_view(utf8_view)
1480 .with_strict_mode(true)
1481 .build(BufReader::new(file))
1482 }
1483
    /// Wraps `decoder` in a stream that decodes a chunk of `input` and yields
    /// the flushed batch, if any.
    ///
    /// The stream fails with a `ParseError` when the decoder leaves part of the
    /// chunk unconsumed, and propagates decode and flush errors.
    fn decode_stream<S: Stream<Item = Bytes> + Unpin>(
        mut decoder: Decoder,
        mut input: S,
    ) -> impl Stream<Item = Result<RecordBatch, ArrowError>> {
        async_stream::try_stream! {
            // NOTE(review): `if let` pulls only the first item of `input`; any
            // later chunks are never read. Confirm this is intended.
            if let Some(data) = input.next().await {
                let consumed = decoder.decode(&data)?;
                // A partially consumed chunk is treated as a failure here.
                if consumed < data.len() {
                    Err(ArrowError::ParseError(
                        "did not consume all bytes".to_string(),
                    ))?;
                }
            }
            // Emit whatever rows were decoded as a single batch.
            if let Some(batch) = decoder.flush()? {
                yield batch
            }
        }
    }
1502
1503 fn make_record_schema(pt: PrimitiveType) -> AvroSchema {
1504 let js = format!(
1505 r#"{{"type":"record","name":"TestRecord","fields":[{{"name":"a","type":"{}"}}]}}"#,
1506 pt.as_ref()
1507 );
1508 AvroSchema::new(js)
1509 }
1510
1511 fn make_two_schema_store() -> (
1512 SchemaStore,
1513 Fingerprint,
1514 Fingerprint,
1515 AvroSchema,
1516 AvroSchema,
1517 ) {
1518 let schema_int = make_record_schema(PrimitiveType::Int);
1519 let schema_long = make_record_schema(PrimitiveType::Long);
1520 let mut store = SchemaStore::new();
1521 let fp_int = store
1522 .register(schema_int.clone())
1523 .expect("register int schema");
1524 let fp_long = store
1525 .register(schema_long.clone())
1526 .expect("register long schema");
1527 (store, fp_int, fp_long, schema_int, schema_long)
1528 }
1529
1530 fn make_prefix(fp: Fingerprint) -> Vec<u8> {
1531 match fp {
1532 Fingerprint::Rabin(v) => {
1533 let mut out = Vec::with_capacity(2 + 8);
1534 out.extend_from_slice(&SINGLE_OBJECT_MAGIC);
1535 out.extend_from_slice(&v.to_le_bytes());
1536 out
1537 }
1538 Fingerprint::Id(v) => {
1539 panic!("make_prefix expects a Rabin fingerprint, got ({v})");
1540 }
1541 Fingerprint::Id64(v) => {
1542 panic!("make_prefix expects a Rabin fingerprint, got ({v})");
1543 }
1544 #[cfg(feature = "md5")]
1545 Fingerprint::MD5(v) => {
1546 panic!("make_prefix expects a Rabin fingerprint, got ({v:?})");
1547 }
1548 #[cfg(feature = "sha256")]
1549 Fingerprint::SHA256(id) => {
1550 panic!("make_prefix expects a Rabin fingerprint, got ({id:?})");
1551 }
1552 }
1553 }
1554
1555 fn make_decoder(store: &SchemaStore, fp: Fingerprint, reader_schema: &AvroSchema) -> Decoder {
1556 ReaderBuilder::new()
1557 .with_batch_size(8)
1558 .with_reader_schema(reader_schema.clone())
1559 .with_writer_schema_store(store.clone())
1560 .with_active_fingerprint(fp)
1561 .build_decoder()
1562 .expect("decoder")
1563 }
1564
1565 fn make_id_prefix(id: u32, additional: usize) -> Vec<u8> {
1566 let capacity = CONFLUENT_MAGIC.len() + size_of::<u32>() + additional;
1567 let mut out = Vec::with_capacity(capacity);
1568 out.extend_from_slice(&CONFLUENT_MAGIC);
1569 out.extend_from_slice(&id.to_be_bytes());
1570 out
1571 }
1572
1573 fn make_message_id(id: u32, value: i64) -> Vec<u8> {
1574 let encoded_value = encode_zigzag(value);
1575 let mut msg = make_id_prefix(id, encoded_value.len());
1576 msg.extend_from_slice(&encoded_value);
1577 msg
1578 }
1579
1580 fn make_id64_prefix(id: u64, additional: usize) -> Vec<u8> {
1581 let capacity = CONFLUENT_MAGIC.len() + size_of::<u64>() + additional;
1582 let mut out = Vec::with_capacity(capacity);
1583 out.extend_from_slice(&CONFLUENT_MAGIC);
1584 out.extend_from_slice(&id.to_be_bytes());
1585 out
1586 }
1587
1588 fn make_message_id64(id: u64, value: i64) -> Vec<u8> {
1589 let encoded_value = encode_zigzag(value);
1590 let mut msg = make_id64_prefix(id, encoded_value.len());
1591 msg.extend_from_slice(&encoded_value);
1592 msg
1593 }
1594
1595 fn make_value_schema(pt: PrimitiveType) -> AvroSchema {
1596 let json_schema = format!(
1597 r#"{{"type":"record","name":"S","fields":[{{"name":"v","type":"{}"}}]}}"#,
1598 pt.as_ref()
1599 );
1600 AvroSchema::new(json_schema)
1601 }
1602
1603 fn encode_zigzag(value: i64) -> Vec<u8> {
1604 let mut n = ((value << 1) ^ (value >> 63)) as u64;
1605 let mut out = Vec::new();
1606 loop {
1607 if (n & !0x7F) == 0 {
1608 out.push(n as u8);
1609 break;
1610 } else {
1611 out.push(((n & 0x7F) | 0x80) as u8);
1612 n >>= 7;
1613 }
1614 }
1615 out
1616 }
1617
1618 fn make_message(fp: Fingerprint, value: i64) -> Vec<u8> {
1619 let mut msg = make_prefix(fp);
1620 msg.extend_from_slice(&encode_zigzag(value));
1621 msg
1622 }
1623
1624 fn load_writer_schema_json(path: &str) -> Value {
1625 let file = File::open(path).unwrap();
1626 let header = super::read_header(BufReader::new(file)).unwrap();
1627 let schema = header.schema().unwrap().unwrap();
1628 serde_json::to_value(&schema).unwrap()
1629 }
1630
1631 fn make_reader_schema_with_promotions(
1632 path: &str,
1633 promotions: &HashMap<&str, &str>,
1634 ) -> AvroSchema {
1635 let mut root = load_writer_schema_json(path);
1636 assert_eq!(root["type"], "record", "writer schema must be a record");
1637 let fields = root
1638 .get_mut("fields")
1639 .and_then(|f| f.as_array_mut())
1640 .expect("record has fields");
1641 for f in fields.iter_mut() {
1642 let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
1643 continue;
1644 };
1645 if let Some(new_ty) = promotions.get(name) {
1646 let ty = f.get_mut("type").expect("field has a type");
1647 match ty {
1648 Value::String(_) => {
1649 *ty = Value::String((*new_ty).to_string());
1650 }
1651 Value::Array(arr) => {
1653 for b in arr.iter_mut() {
1654 match b {
1655 Value::String(s) if s != "null" => {
1656 *b = Value::String((*new_ty).to_string());
1657 break;
1658 }
1659 Value::Object(_) => {
1660 *b = Value::String((*new_ty).to_string());
1661 break;
1662 }
1663 _ => {}
1664 }
1665 }
1666 }
1667 Value::Object(_) => {
1668 *ty = Value::String((*new_ty).to_string());
1669 }
1670 _ => {}
1671 }
1672 }
1673 }
1674 AvroSchema::new(root.to_string())
1675 }
1676
1677 fn make_reader_schema_with_enum_remap(
1678 path: &str,
1679 remap: &HashMap<&str, Vec<&str>>,
1680 ) -> AvroSchema {
1681 let mut root = load_writer_schema_json(path);
1682 assert_eq!(root["type"], "record", "writer schema must be a record");
1683 let fields = root
1684 .get_mut("fields")
1685 .and_then(|f| f.as_array_mut())
1686 .expect("record has fields");
1687
1688 fn to_symbols_array(symbols: &[&str]) -> Value {
1689 Value::Array(symbols.iter().map(|s| Value::String((*s).into())).collect())
1690 }
1691
1692 fn update_enum_symbols(ty: &mut Value, symbols: &Value) {
1693 match ty {
1694 Value::Object(map) => {
1695 if matches!(map.get("type"), Some(Value::String(t)) if t == "enum") {
1696 map.insert("symbols".to_string(), symbols.clone());
1697 }
1698 }
1699 Value::Array(arr) => {
1700 for b in arr.iter_mut() {
1701 if let Value::Object(map) = b {
1702 if matches!(map.get("type"), Some(Value::String(t)) if t == "enum") {
1703 map.insert("symbols".to_string(), symbols.clone());
1704 }
1705 }
1706 }
1707 }
1708 _ => {}
1709 }
1710 }
1711 for f in fields.iter_mut() {
1712 let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
1713 continue;
1714 };
1715 if let Some(new_symbols) = remap.get(name) {
1716 let symbols_val = to_symbols_array(new_symbols);
1717 let ty = f.get_mut("type").expect("field has a type");
1718 update_enum_symbols(ty, &symbols_val);
1719 }
1720 }
1721 AvroSchema::new(root.to_string())
1722 }
1723
1724 fn read_alltypes_with_reader_schema(path: &str, reader_schema: AvroSchema) -> RecordBatch {
1725 let file = File::open(path).unwrap();
1726 let reader = ReaderBuilder::new()
1727 .with_batch_size(1024)
1728 .with_utf8_view(false)
1729 .with_reader_schema(reader_schema)
1730 .build(BufReader::new(file))
1731 .unwrap();
1732 let schema = reader.schema();
1733 let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
1734 arrow::compute::concat_batches(&schema, &batches).unwrap()
1735 }
1736
1737 fn make_reader_schema_with_selected_fields_in_order(
1738 path: &str,
1739 selected: &[&str],
1740 ) -> AvroSchema {
1741 let mut root = load_writer_schema_json(path);
1742 assert_eq!(root["type"], "record", "writer schema must be a record");
1743 let writer_fields = root
1744 .get("fields")
1745 .and_then(|f| f.as_array())
1746 .expect("record has fields");
1747 let mut field_map: HashMap<String, Value> = HashMap::with_capacity(writer_fields.len());
1748 for f in writer_fields {
1749 if let Some(name) = f.get("name").and_then(|n| n.as_str()) {
1750 field_map.insert(name.to_string(), f.clone());
1751 }
1752 }
1753 let mut new_fields = Vec::with_capacity(selected.len());
1754 for name in selected {
1755 let f = field_map
1756 .get(*name)
1757 .unwrap_or_else(|| panic!("field '{name}' not found in writer schema"))
1758 .clone();
1759 new_fields.push(f);
1760 }
1761 root["fields"] = Value::Array(new_fields);
1762 AvroSchema::new(root.to_string())
1763 }
1764
1765 fn write_ocf(schema: &Schema, batches: &[RecordBatch]) -> Vec<u8> {
1766 let mut w = AvroWriter::new(Vec::<u8>::new(), schema.clone()).expect("writer");
1767 for b in batches {
1768 w.write(b).expect("write");
1769 }
1770 w.finish().expect("finish");
1771 w.into_inner()
1772 }
1773
1774 #[test]
1775 fn ocf_projection_no_reader_schema_reorder() -> Result<(), Box<dyn std::error::Error>> {
1776 let writer_schema = Schema::new(vec![
1778 Field::new("id", DataType::Int32, false),
1779 Field::new("name", DataType::Utf8, false),
1780 Field::new("is_active", DataType::Boolean, false),
1781 ]);
1782 let batch = RecordBatch::try_new(
1783 Arc::new(writer_schema.clone()),
1784 vec![
1785 Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef,
1786 Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
1787 Arc::new(BooleanArray::from(vec![true, false])) as ArrayRef,
1788 ],
1789 )?;
1790 let bytes = write_ocf(&writer_schema, &[batch]);
1791 let mut reader = ReaderBuilder::new()
1793 .with_projection(vec![2, 0])
1794 .build(Cursor::new(bytes))?;
1795 let out = reader.next().unwrap()?;
1796 assert_eq!(out.num_columns(), 2);
1797 assert_eq!(out.schema().field(0).name(), "is_active");
1798 assert_eq!(out.schema().field(1).name(), "id");
1799 let is_active = out.column(0).as_boolean();
1800 assert!(is_active.value(0));
1801 assert!(!is_active.value(1));
1802 let id = out.column(1).as_primitive::<Int32Type>();
1803 assert_eq!(id.value(0), 1);
1804 assert_eq!(id.value(1), 2);
1805 Ok(())
1806 }
1807
1808 #[test]
1809 fn ocf_projection_with_reader_schema_alias_and_default()
1810 -> Result<(), Box<dyn std::error::Error>> {
1811 let writer_schema = Schema::new(vec![
1813 Field::new("id", DataType::Int64, false),
1814 Field::new("name", DataType::Utf8, false),
1815 ]);
1816 let batch = RecordBatch::try_new(
1817 Arc::new(writer_schema.clone()),
1818 vec![
1819 Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef,
1820 Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
1821 ],
1822 )?;
1823 let bytes = write_ocf(&writer_schema, &[batch]);
1824 let reader_json = r#"
1828 {
1829 "type": "record",
1830 "name": "topLevelRecord",
1831 "fields": [
1832 { "name": "id", "type": "long" },
1833 { "name": "full_name", "type": ["null","string"], "aliases": ["name"], "default": null },
1834 { "name": "is_active", "type": "boolean", "default": true }
1835 ]
1836 }"#;
1837 let mut reader = ReaderBuilder::new()
1839 .with_reader_schema(AvroSchema::new(reader_json.to_string()))
1840 .with_projection(vec![1, 2])
1841 .build(Cursor::new(bytes))?;
1842 let out = reader.next().unwrap()?;
1843 assert_eq!(out.num_columns(), 2);
1844 assert_eq!(out.schema().field(0).name(), "full_name");
1845 assert_eq!(out.schema().field(1).name(), "is_active");
1846 let full_name = out.column(0).as_string::<i32>();
1847 assert_eq!(full_name.value(0), "a");
1848 assert_eq!(full_name.value(1), "b");
1849 let is_active = out.column(1).as_boolean();
1850 assert!(is_active.value(0));
1851 assert!(is_active.value(1));
1852 Ok(())
1853 }
1854
1855 #[test]
1856 fn projection_errors_out_of_bounds_and_duplicate() -> Result<(), Box<dyn std::error::Error>> {
1857 let writer_schema = Schema::new(vec![
1858 Field::new("a", DataType::Int32, false),
1859 Field::new("b", DataType::Int32, false),
1860 ]);
1861 let batch = RecordBatch::try_new(
1862 Arc::new(writer_schema.clone()),
1863 vec![
1864 Arc::new(Int32Array::from(vec![1])) as ArrayRef,
1865 Arc::new(Int32Array::from(vec![2])) as ArrayRef,
1866 ],
1867 )?;
1868 let bytes = write_ocf(&writer_schema, &[batch]);
1869 let err = ReaderBuilder::new()
1870 .with_projection(vec![2])
1871 .build(Cursor::new(bytes.clone()))
1872 .unwrap_err();
1873 assert!(matches!(err, ArrowError::AvroError(_)));
1874 assert!(err.to_string().contains("out of bounds"));
1875 let err = ReaderBuilder::new()
1876 .with_projection(vec![0, 0])
1877 .build(Cursor::new(bytes))
1878 .unwrap_err();
1879 assert!(matches!(err, ArrowError::AvroError(_)));
1880 assert!(err.to_string().contains("Duplicate projection index"));
1881 Ok(())
1882 }
1883
1884 #[test]
1885 #[cfg(feature = "snappy")]
1886 fn test_alltypes_plain_with_projection_and_reader_schema() {
1887 use std::fs::File;
1888 use std::io::BufReader;
1889 let path = arrow_test_data("avro/alltypes_plain.avro");
1890 let reader_schema = make_reader_schema_with_selected_fields_in_order(
1892 &path,
1893 &["double_col", "id", "tinyint_col"],
1894 );
1895 let file = File::open(&path).expect("open avro/alltypes_plain.avro");
1896 let reader = ReaderBuilder::new()
1897 .with_batch_size(1024)
1898 .with_reader_schema(reader_schema)
1899 .with_projection(vec![1, 2]) .build(BufReader::new(file))
1901 .expect("build reader with projection and reader schema");
1902 let schema = reader.schema();
1903 assert_eq!(schema.fields().len(), 2);
1905 assert_eq!(schema.field(0).name(), "id");
1906 assert_eq!(schema.field(1).name(), "tinyint_col");
1907 let batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>().unwrap();
1908 assert_eq!(batches.len(), 1);
1909 let batch = &batches[0];
1910 assert_eq!(batch.num_rows(), 8);
1911 assert_eq!(batch.num_columns(), 2);
1912 let expected = RecordBatch::try_from_iter_with_nullable([
1916 (
1917 "id",
1918 Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as ArrayRef,
1919 true,
1920 ),
1921 (
1922 "tinyint_col",
1923 Arc::new(Int32Array::from(vec![0, 1, 0, 1, 0, 1, 0, 1])) as ArrayRef,
1924 true,
1925 ),
1926 ])
1927 .unwrap();
1928 assert_eq!(
1929 batch, &expected,
1930 "Projected batch mismatch for alltypes_plain.avro with reader schema and projection [1, 2]"
1931 );
1932 }
1933
1934 #[test]
1935 #[cfg(feature = "snappy")]
1936 fn test_alltypes_plain_with_projection() {
1937 use std::fs::File;
1938 use std::io::BufReader;
1939 let path = arrow_test_data("avro/alltypes_plain.avro");
1940 let file = File::open(&path).expect("open avro/alltypes_plain.avro");
1941 let reader = ReaderBuilder::new()
1942 .with_batch_size(1024)
1943 .with_projection(vec![2, 0, 5])
1944 .build(BufReader::new(file))
1945 .expect("build reader with projection");
1946 let schema = reader.schema();
1947 assert_eq!(schema.fields().len(), 3);
1948 assert_eq!(schema.field(0).name(), "tinyint_col");
1949 assert_eq!(schema.field(1).name(), "id");
1950 assert_eq!(schema.field(2).name(), "bigint_col");
1951 let batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>().unwrap();
1952 assert_eq!(batches.len(), 1);
1953 let batch = &batches[0];
1954 assert_eq!(batch.num_rows(), 8);
1955 assert_eq!(batch.num_columns(), 3);
1956 let expected = RecordBatch::try_from_iter_with_nullable([
1957 (
1958 "tinyint_col",
1959 Arc::new(Int32Array::from(vec![0, 1, 0, 1, 0, 1, 0, 1])) as ArrayRef,
1960 true,
1961 ),
1962 (
1963 "id",
1964 Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as ArrayRef,
1965 true,
1966 ),
1967 (
1968 "bigint_col",
1969 Arc::new(Int64Array::from(vec![0, 10, 0, 10, 0, 10, 0, 10])) as ArrayRef,
1970 true,
1971 ),
1972 ])
1973 .unwrap();
1974 assert_eq!(
1975 batch, &expected,
1976 "Projected batch mismatch for alltypes_plain.avro with projection [2, 0, 5]"
1977 );
1978 }
1979
1980 #[test]
1981 fn writer_string_reader_nullable_with_alias() -> Result<(), Box<dyn std::error::Error>> {
1982 let writer_schema = Schema::new(vec![
1983 Field::new("id", DataType::Int64, false),
1984 Field::new("name", DataType::Utf8, false),
1985 ]);
1986 let batch = RecordBatch::try_new(
1987 Arc::new(writer_schema.clone()),
1988 vec![
1989 Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef,
1990 Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
1991 ],
1992 )?;
1993 let bytes = write_ocf(&writer_schema, &[batch]);
1994 let reader_json = r#"
1995 {
1996 "type": "record",
1997 "name": "topLevelRecord",
1998 "fields": [
1999 { "name": "id", "type": "long" },
2000 { "name": "full_name", "type": ["null","string"], "aliases": ["name"], "default": null },
2001 { "name": "is_active", "type": "boolean", "default": true }
2002 ]
2003 }"#;
2004 let mut reader = ReaderBuilder::new()
2005 .with_reader_schema(AvroSchema::new(reader_json.to_string()))
2006 .build(Cursor::new(bytes))?;
2007 let out = reader.next().unwrap()?;
2008 let full_name = out.column(1).as_string::<i32>();
2009 assert_eq!(full_name.value(0), "a");
2010 assert_eq!(full_name.value(1), "b");
2011 Ok(())
2012 }
2013
2014 #[test]
2015 fn writer_string_reader_string_null_order_second() -> Result<(), Box<dyn std::error::Error>> {
2016 let writer_schema = Schema::new(vec![Field::new("name", DataType::Utf8, false)]);
2018 let batch = RecordBatch::try_new(
2019 Arc::new(writer_schema.clone()),
2020 vec![Arc::new(StringArray::from(vec!["x", "y"])) as ArrayRef],
2021 )?;
2022 let bytes = write_ocf(&writer_schema, &[batch]);
2023
2024 let reader_json = r#"
2026 {
2027 "type":"record", "name":"topLevelRecord",
2028 "fields":[ { "name":"name", "type":["string","null"], "default":"x" } ]
2029 }"#;
2030
2031 let mut reader = ReaderBuilder::new()
2032 .with_reader_schema(AvroSchema::new(reader_json.to_string()))
2033 .build(Cursor::new(bytes))?;
2034
2035 let out = reader.next().unwrap()?;
2036 assert_eq!(out.num_rows(), 2);
2037
2038 let name = out.column(0).as_string::<i32>();
2040 assert_eq!(name.value(0), "x");
2041 assert_eq!(name.value(1), "y");
2042
2043 Ok(())
2044 }
2045
2046 #[test]
2047 fn promotion_writer_int_reader_nullable_long() -> Result<(), Box<dyn std::error::Error>> {
2048 let writer_schema = Schema::new(vec![Field::new("v", DataType::Int32, false)]);
2050 let batch = RecordBatch::try_new(
2051 Arc::new(writer_schema.clone()),
2052 vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
2053 )?;
2054 let bytes = write_ocf(&writer_schema, &[batch]);
2055
2056 let reader_json = r#"
2058 {
2059 "type":"record", "name":"topLevelRecord",
2060 "fields":[ { "name":"v", "type":["null","long"], "default": null } ]
2061 }"#;
2062
2063 let mut reader = ReaderBuilder::new()
2064 .with_reader_schema(AvroSchema::new(reader_json.to_string()))
2065 .build(Cursor::new(bytes))?;
2066
2067 let out = reader.next().unwrap()?;
2068 assert_eq!(out.num_rows(), 3);
2069
2070 let v = out
2072 .column(0)
2073 .as_primitive::<arrow_array::types::Int64Type>();
2074 assert_eq!(v.values(), &[1, 2, 3]);
2075 assert!(
2076 out.column(0).nulls().is_none(),
2077 "expected no validity bitmap for all-valid column"
2078 );
2079
2080 Ok(())
2081 }
2082
2083 #[test]
2084 fn test_alltypes_schema_promotion_mixed() {
2085 for file in files() {
2086 let file = arrow_test_data(file);
2087 let mut promotions: HashMap<&str, &str> = HashMap::new();
2088 promotions.insert("id", "long");
2089 promotions.insert("tinyint_col", "float");
2090 promotions.insert("smallint_col", "double");
2091 promotions.insert("int_col", "double");
2092 promotions.insert("bigint_col", "double");
2093 promotions.insert("float_col", "double");
2094 promotions.insert("date_string_col", "string");
2095 promotions.insert("string_col", "string");
2096 let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
2097 let batch = read_alltypes_with_reader_schema(&file, reader_schema);
2098 let expected = RecordBatch::try_from_iter_with_nullable([
2099 (
2100 "id",
2101 Arc::new(Int64Array::from(vec![4i64, 5, 6, 7, 2, 3, 0, 1])) as _,
2102 true,
2103 ),
2104 (
2105 "bool_col",
2106 Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
2107 true,
2108 ),
2109 (
2110 "tinyint_col",
2111 Arc::new(Float32Array::from_iter_values(
2112 (0..8).map(|x| (x % 2) as f32),
2113 )) as _,
2114 true,
2115 ),
2116 (
2117 "smallint_col",
2118 Arc::new(Float64Array::from_iter_values(
2119 (0..8).map(|x| (x % 2) as f64),
2120 )) as _,
2121 true,
2122 ),
2123 (
2124 "int_col",
2125 Arc::new(Float64Array::from_iter_values(
2126 (0..8).map(|x| (x % 2) as f64),
2127 )) as _,
2128 true,
2129 ),
2130 (
2131 "bigint_col",
2132 Arc::new(Float64Array::from_iter_values(
2133 (0..8).map(|x| ((x % 2) * 10) as f64),
2134 )) as _,
2135 true,
2136 ),
2137 (
2138 "float_col",
2139 Arc::new(Float64Array::from_iter_values(
2140 (0..8).map(|x| ((x % 2) as f32 * 1.1f32) as f64),
2141 )) as _,
2142 true,
2143 ),
2144 (
2145 "double_col",
2146 Arc::new(Float64Array::from_iter_values(
2147 (0..8).map(|x| (x % 2) as f64 * 10.1),
2148 )) as _,
2149 true,
2150 ),
2151 (
2152 "date_string_col",
2153 Arc::new(StringArray::from(vec![
2154 "03/01/09", "03/01/09", "04/01/09", "04/01/09", "02/01/09", "02/01/09",
2155 "01/01/09", "01/01/09",
2156 ])) as _,
2157 true,
2158 ),
2159 (
2160 "string_col",
2161 Arc::new(StringArray::from(
2162 (0..8)
2163 .map(|x| if x % 2 == 0 { "0" } else { "1" })
2164 .collect::<Vec<_>>(),
2165 )) as _,
2166 true,
2167 ),
2168 (
2169 "timestamp_col",
2170 Arc::new(
2171 TimestampMicrosecondArray::from_iter_values([
2172 1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000, ])
2181 .with_timezone("+00:00"),
2182 ) as _,
2183 true,
2184 ),
2185 ])
2186 .unwrap();
2187 assert_eq!(batch, expected, "mismatch for file {file}");
2188 }
2189 }
2190
2191 #[test]
2192 fn test_alltypes_schema_promotion_long_to_float_only() {
2193 for file in files() {
2194 let file = arrow_test_data(file);
2195 let mut promotions: HashMap<&str, &str> = HashMap::new();
2196 promotions.insert("bigint_col", "float");
2197 let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
2198 let batch = read_alltypes_with_reader_schema(&file, reader_schema);
2199 let expected = RecordBatch::try_from_iter_with_nullable([
2200 (
2201 "id",
2202 Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
2203 true,
2204 ),
2205 (
2206 "bool_col",
2207 Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
2208 true,
2209 ),
2210 (
2211 "tinyint_col",
2212 Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2213 true,
2214 ),
2215 (
2216 "smallint_col",
2217 Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2218 true,
2219 ),
2220 (
2221 "int_col",
2222 Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2223 true,
2224 ),
2225 (
2226 "bigint_col",
2227 Arc::new(Float32Array::from_iter_values(
2228 (0..8).map(|x| ((x % 2) * 10) as f32),
2229 )) as _,
2230 true,
2231 ),
2232 (
2233 "float_col",
2234 Arc::new(Float32Array::from_iter_values(
2235 (0..8).map(|x| (x % 2) as f32 * 1.1),
2236 )) as _,
2237 true,
2238 ),
2239 (
2240 "double_col",
2241 Arc::new(Float64Array::from_iter_values(
2242 (0..8).map(|x| (x % 2) as f64 * 10.1),
2243 )) as _,
2244 true,
2245 ),
2246 (
2247 "date_string_col",
2248 Arc::new(BinaryArray::from_iter_values([
2249 [48, 51, 47, 48, 49, 47, 48, 57],
2250 [48, 51, 47, 48, 49, 47, 48, 57],
2251 [48, 52, 47, 48, 49, 47, 48, 57],
2252 [48, 52, 47, 48, 49, 47, 48, 57],
2253 [48, 50, 47, 48, 49, 47, 48, 57],
2254 [48, 50, 47, 48, 49, 47, 48, 57],
2255 [48, 49, 47, 48, 49, 47, 48, 57],
2256 [48, 49, 47, 48, 49, 47, 48, 57],
2257 ])) as _,
2258 true,
2259 ),
2260 (
2261 "string_col",
2262 Arc::new(BinaryArray::from_iter_values((0..8).map(|x| [48 + x % 2]))) as _,
2263 true,
2264 ),
2265 (
2266 "timestamp_col",
2267 Arc::new(
2268 TimestampMicrosecondArray::from_iter_values([
2269 1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000, ])
2278 .with_timezone("+00:00"),
2279 ) as _,
2280 true,
2281 ),
2282 ])
2283 .unwrap();
2284 assert_eq!(batch, expected, "mismatch for file {file}");
2285 }
2286 }
2287
2288 #[test]
2289 fn test_alltypes_schema_promotion_bytes_to_string_only() {
2290 for file in files() {
2291 let file = arrow_test_data(file);
2292 let mut promotions: HashMap<&str, &str> = HashMap::new();
2293 promotions.insert("date_string_col", "string");
2294 promotions.insert("string_col", "string");
2295 let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
2296 let batch = read_alltypes_with_reader_schema(&file, reader_schema);
2297 let expected = RecordBatch::try_from_iter_with_nullable([
2298 (
2299 "id",
2300 Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
2301 true,
2302 ),
2303 (
2304 "bool_col",
2305 Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
2306 true,
2307 ),
2308 (
2309 "tinyint_col",
2310 Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2311 true,
2312 ),
2313 (
2314 "smallint_col",
2315 Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2316 true,
2317 ),
2318 (
2319 "int_col",
2320 Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
2321 true,
2322 ),
2323 (
2324 "bigint_col",
2325 Arc::new(Int64Array::from_iter_values((0..8).map(|x| (x % 2) * 10))) as _,
2326 true,
2327 ),
2328 (
2329 "float_col",
2330 Arc::new(Float32Array::from_iter_values(
2331 (0..8).map(|x| (x % 2) as f32 * 1.1),
2332 )) as _,
2333 true,
2334 ),
2335 (
2336 "double_col",
2337 Arc::new(Float64Array::from_iter_values(
2338 (0..8).map(|x| (x % 2) as f64 * 10.1),
2339 )) as _,
2340 true,
2341 ),
2342 (
2343 "date_string_col",
2344 Arc::new(StringArray::from(vec![
2345 "03/01/09", "03/01/09", "04/01/09", "04/01/09", "02/01/09", "02/01/09",
2346 "01/01/09", "01/01/09",
2347 ])) as _,
2348 true,
2349 ),
2350 (
2351 "string_col",
2352 Arc::new(StringArray::from(
2353 (0..8)
2354 .map(|x| if x % 2 == 0 { "0" } else { "1" })
2355 .collect::<Vec<_>>(),
2356 )) as _,
2357 true,
2358 ),
2359 (
2360 "timestamp_col",
2361 Arc::new(
2362 TimestampMicrosecondArray::from_iter_values([
2363 1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000, ])
2372 .with_timezone("+00:00"),
2373 ) as _,
2374 true,
2375 ),
2376 ])
2377 .unwrap();
2378 assert_eq!(batch, expected, "mismatch for file {file}");
2379 }
2380 }
2381
2382 #[test]
2383 #[cfg(feature = "snappy")]
2385 fn test_alltypes_illegal_promotion_bool_to_double_errors() {
2386 let file = arrow_test_data("avro/alltypes_plain.avro");
2387 let mut promotions: HashMap<&str, &str> = HashMap::new();
2388 promotions.insert("bool_col", "double"); let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
2390 let file_handle = File::open(&file).unwrap();
2391 let result = ReaderBuilder::new()
2392 .with_reader_schema(reader_schema)
2393 .build(BufReader::new(file_handle));
2394 let err = result.expect_err("expected illegal promotion to error");
2395 let msg = err.to_string();
2396 assert!(
2397 msg.contains("Illegal promotion") || msg.contains("illegal promotion"),
2398 "unexpected error: {msg}"
2399 );
2400 }
2401
2402 #[test]
2403 fn test_simple_enum_with_reader_schema_mapping() {
2404 let file = arrow_test_data("avro/simple_enum.avro");
2405 let mut remap: HashMap<&str, Vec<&str>> = HashMap::new();
2406 remap.insert("f1", vec!["d", "c", "b", "a"]);
2407 remap.insert("f2", vec!["h", "g", "f", "e"]);
2408 remap.insert("f3", vec!["k", "i", "j"]);
2409 let reader_schema = make_reader_schema_with_enum_remap(&file, &remap);
2410 let actual = read_alltypes_with_reader_schema(&file, reader_schema);
2411 let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
2412 let f1_keys = Int32Array::from(vec![3, 2, 1, 0]);
2414 let f1_vals = StringArray::from(vec!["d", "c", "b", "a"]);
2415 let f1 = DictionaryArray::<Int32Type>::try_new(f1_keys, Arc::new(f1_vals)).unwrap();
2416 let mut md_f1 = HashMap::new();
2417 md_f1.insert(
2418 AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
2419 r#"["d","c","b","a"]"#.to_string(),
2420 );
2421 md_f1.insert("avro.name".to_string(), "enum1".to_string());
2423 md_f1.insert("avro.namespace".to_string(), "ns1".to_string());
2424 let f1_field = Field::new("f1", dict_type.clone(), false).with_metadata(md_f1);
2425 let f2_keys = Int32Array::from(vec![1, 0, 3, 2]);
2427 let f2_vals = StringArray::from(vec!["h", "g", "f", "e"]);
2428 let f2 = DictionaryArray::<Int32Type>::try_new(f2_keys, Arc::new(f2_vals)).unwrap();
2429 let mut md_f2 = HashMap::new();
2430 md_f2.insert(
2431 AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
2432 r#"["h","g","f","e"]"#.to_string(),
2433 );
2434 md_f2.insert("avro.name".to_string(), "enum2".to_string());
2436 md_f2.insert("avro.namespace".to_string(), "ns2".to_string());
2437 let f2_field = Field::new("f2", dict_type.clone(), false).with_metadata(md_f2);
2438 let f3_keys = Int32Array::from(vec![Some(2), Some(0), None, Some(1)]);
2440 let f3_vals = StringArray::from(vec!["k", "i", "j"]);
2441 let f3 = DictionaryArray::<Int32Type>::try_new(f3_keys, Arc::new(f3_vals)).unwrap();
2442 let mut md_f3 = HashMap::new();
2443 md_f3.insert(
2444 AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
2445 r#"["k","i","j"]"#.to_string(),
2446 );
2447 md_f3.insert("avro.name".to_string(), "enum3".to_string());
2449 md_f3.insert("avro.namespace".to_string(), "ns1".to_string());
2450 let f3_field = Field::new("f3", dict_type.clone(), true).with_metadata(md_f3);
2451 let expected_schema = Arc::new(Schema::new(vec![f1_field, f2_field, f3_field]));
2452 let expected = RecordBatch::try_new(
2453 expected_schema,
2454 vec![Arc::new(f1) as ArrayRef, Arc::new(f2), Arc::new(f3)],
2455 )
2456 .unwrap();
2457 assert_eq!(actual, expected);
2458 }
2459
2460 #[test]
2461 fn test_schema_store_register_lookup() {
2462 let schema_int = make_record_schema(PrimitiveType::Int);
2463 let schema_long = make_record_schema(PrimitiveType::Long);
2464 let mut store = SchemaStore::new();
2465 let fp_int = store.register(schema_int.clone()).unwrap();
2466 let fp_long = store.register(schema_long.clone()).unwrap();
2467 assert_eq!(store.lookup(&fp_int).cloned(), Some(schema_int));
2468 assert_eq!(store.lookup(&fp_long).cloned(), Some(schema_long));
2469 assert_eq!(store.fingerprint_algorithm(), FingerprintAlgorithm::Rabin);
2470 }
2471
2472 #[test]
2473 fn test_unknown_fingerprint_is_error() {
2474 let (store, fp_int, _fp_long, _schema_int, schema_long) = make_two_schema_store();
2475 let unknown_fp = Fingerprint::Rabin(0xDEAD_BEEF_DEAD_BEEF);
2476 let prefix = make_prefix(unknown_fp);
2477 let mut decoder = make_decoder(&store, fp_int, &schema_long);
2478 let err = decoder.decode(&prefix).expect_err("decode should error");
2479 let msg = err.to_string();
2480 assert!(
2481 msg.contains("Unknown fingerprint"),
2482 "unexpected message: {msg}"
2483 );
2484 }
2485
2486 #[test]
2487 fn test_handle_prefix_incomplete_magic() {
2488 let (store, fp_int, _fp_long, _schema_int, schema_long) = make_two_schema_store();
2489 let mut decoder = make_decoder(&store, fp_int, &schema_long);
2490 let buf = &SINGLE_OBJECT_MAGIC[..1];
2491 let res = decoder.handle_prefix(buf).unwrap();
2492 assert_eq!(res, Some(0));
2493 assert!(decoder.pending_schema.is_none());
2494 }
2495
2496 #[test]
2497 fn test_handle_prefix_magic_mismatch() {
2498 let (store, fp_int, _fp_long, _schema_int, schema_long) = make_two_schema_store();
2499 let mut decoder = make_decoder(&store, fp_int, &schema_long);
2500 let buf = [0xFFu8, 0x00u8, 0x01u8];
2501 let res = decoder.handle_prefix(&buf).unwrap();
2502 assert!(res.is_none());
2503 }
2504
2505 #[test]
2506 fn test_handle_prefix_incomplete_fingerprint() {
2507 let (store, fp_int, fp_long, _schema_int, schema_long) = make_two_schema_store();
2508 let mut decoder = make_decoder(&store, fp_int, &schema_long);
2509 let long_bytes = match fp_long {
2510 Fingerprint::Rabin(v) => v.to_le_bytes(),
2511 Fingerprint::Id(id) => panic!("expected Rabin fingerprint, got ({id})"),
2512 Fingerprint::Id64(id) => panic!("expected Rabin fingerprint, got ({id})"),
2513 #[cfg(feature = "md5")]
2514 Fingerprint::MD5(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
2515 #[cfg(feature = "sha256")]
2516 Fingerprint::SHA256(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
2517 };
2518 let mut buf = Vec::from(SINGLE_OBJECT_MAGIC);
2519 buf.extend_from_slice(&long_bytes[..4]);
2520 let res = decoder.handle_prefix(&buf).unwrap();
2521 assert_eq!(res, Some(0));
2522 assert!(decoder.pending_schema.is_none());
2523 }
2524
2525 #[test]
2526 fn test_handle_prefix_valid_prefix_switches_schema() {
2527 let (store, fp_int, fp_long, _schema_int, schema_long) = make_two_schema_store();
2528 let mut decoder = make_decoder(&store, fp_int, &schema_long);
2529 let writer_schema_long = schema_long.schema().unwrap();
2530 let root_long = AvroFieldBuilder::new(&writer_schema_long).build().unwrap();
2531 let long_decoder = RecordDecoder::try_new_with_options(root_long.data_type()).unwrap();
2532 let _ = decoder.cache.insert(fp_long, long_decoder);
2533 let mut buf = Vec::from(SINGLE_OBJECT_MAGIC);
2534 match fp_long {
2535 Fingerprint::Rabin(v) => buf.extend_from_slice(&v.to_le_bytes()),
2536 Fingerprint::Id(id) => panic!("expected Rabin fingerprint, got ({id})"),
2537 Fingerprint::Id64(id) => panic!("expected Rabin fingerprint, got ({id})"),
2538 #[cfg(feature = "md5")]
2539 Fingerprint::MD5(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
2540 #[cfg(feature = "sha256")]
2541 Fingerprint::SHA256(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
2542 }
2543 let consumed = decoder.handle_prefix(&buf).unwrap().unwrap();
2544 assert_eq!(consumed, buf.len());
2545 assert!(decoder.pending_schema.is_some());
2546 assert_eq!(decoder.pending_schema.as_ref().unwrap().0, fp_long);
2547 }
2548
2549 #[test]
2550 fn test_decoder_projection_multiple_writer_schemas_no_reader_schema()
2551 -> Result<(), Box<dyn std::error::Error>> {
2552 let writer_v1 = AvroSchema::new(
2554 r#"{"type":"record","name":"E","fields":[{"name":"a","type":"int"},{"name":"b","type":"string"}]}"#
2555 .to_string(),
2556 );
2557 let writer_v2 = AvroSchema::new(
2558 r#"{"type":"record","name":"E","fields":[{"name":"a","type":"long"},{"name":"b","type":"string"},{"name":"c","type":"int"}]}"#
2559 .to_string(),
2560 );
2561 let mut store = SchemaStore::new();
2562 let fp1 = store.register(writer_v1)?;
2563 let fp2 = store.register(writer_v2)?;
2564 let mut decoder = ReaderBuilder::new()
2565 .with_writer_schema_store(store)
2566 .with_active_fingerprint(fp1)
2567 .with_batch_size(8)
2568 .with_projection(vec![1])
2569 .build_decoder()?;
2570 let mut msg1 = make_prefix(fp1);
2572 msg1.extend_from_slice(&encode_zigzag(1)); msg1.push((1u8) << 1);
2574 msg1.extend_from_slice(b"x");
2575 let mut msg2 = make_prefix(fp2);
2577 msg2.extend_from_slice(&encode_zigzag(2)); msg2.push((1u8) << 1);
2579 msg2.extend_from_slice(b"y");
2580 msg2.extend_from_slice(&encode_zigzag(7)); decoder.decode(&msg1)?;
2582 let batch1 = decoder.flush()?.expect("batch1");
2583 assert_eq!(batch1.num_columns(), 1);
2584 assert_eq!(batch1.schema().field(0).name(), "b");
2585 let b1 = batch1.column(0).as_string::<i32>();
2586 assert_eq!(b1.value(0), "x");
2587 decoder.decode(&msg2)?;
2588 let batch2 = decoder.flush()?.expect("batch2");
2589 assert_eq!(batch2.num_columns(), 1);
2590 assert_eq!(batch2.schema().field(0).name(), "b");
2591 let b2 = batch2.column(0).as_string::<i32>();
2592 assert_eq!(b2.value(0), "y");
2593 Ok(())
2594 }
2595
2596 #[test]
2597 fn test_two_messages_same_schema() {
2598 let writer_schema = make_value_schema(PrimitiveType::Int);
2599 let reader_schema = writer_schema.clone();
2600 let mut store = SchemaStore::new();
2601 let fp = store.register(writer_schema).unwrap();
2602 let msg1 = make_message(fp, 42);
2603 let msg2 = make_message(fp, 11);
2604 let input = [msg1.clone(), msg2.clone()].concat();
2605 let mut decoder = ReaderBuilder::new()
2606 .with_batch_size(8)
2607 .with_reader_schema(reader_schema.clone())
2608 .with_writer_schema_store(store)
2609 .with_active_fingerprint(fp)
2610 .build_decoder()
2611 .unwrap();
2612 let _ = decoder.decode(&input).unwrap();
2613 let batch = decoder.flush().unwrap().expect("batch");
2614 assert_eq!(batch.num_rows(), 2);
2615 let col = batch
2616 .column(0)
2617 .as_any()
2618 .downcast_ref::<Int32Array>()
2619 .unwrap();
2620 assert_eq!(col.value(0), 42);
2621 assert_eq!(col.value(1), 11);
2622 }
2623
2624 #[test]
2625 fn test_two_messages_schema_switch() {
2626 let w_int = make_value_schema(PrimitiveType::Int);
2627 let w_long = make_value_schema(PrimitiveType::Long);
2628 let mut store = SchemaStore::new();
2629 let fp_int = store.register(w_int).unwrap();
2630 let fp_long = store.register(w_long).unwrap();
2631 let msg_int = make_message(fp_int, 1);
2632 let msg_long = make_message(fp_long, 123456789_i64);
2633 let mut decoder = ReaderBuilder::new()
2634 .with_batch_size(8)
2635 .with_writer_schema_store(store)
2636 .with_active_fingerprint(fp_int)
2637 .build_decoder()
2638 .unwrap();
2639 let _ = decoder.decode(&msg_int).unwrap();
2640 let batch1 = decoder.flush().unwrap().expect("batch1");
2641 assert_eq!(batch1.num_rows(), 1);
2642 assert_eq!(
2643 batch1
2644 .column(0)
2645 .as_any()
2646 .downcast_ref::<Int32Array>()
2647 .unwrap()
2648 .value(0),
2649 1
2650 );
2651 let _ = decoder.decode(&msg_long).unwrap();
2652 let batch2 = decoder.flush().unwrap().expect("batch2");
2653 assert_eq!(batch2.num_rows(), 1);
2654 assert_eq!(
2655 batch2
2656 .column(0)
2657 .as_any()
2658 .downcast_ref::<Int64Array>()
2659 .unwrap()
2660 .value(0),
2661 123456789_i64
2662 );
2663 }
2664
2665 #[test]
2666 fn test_two_messages_same_schema_id() {
2667 let writer_schema = make_value_schema(PrimitiveType::Int);
2668 let reader_schema = writer_schema.clone();
2669 let id = 100u32;
2670 let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
2672 let _ = store
2673 .set(Fingerprint::Id(id), writer_schema.clone())
2674 .expect("set id schema");
2675 let msg1 = make_message_id(id, 21);
2676 let msg2 = make_message_id(id, 22);
2677 let input = [msg1.clone(), msg2.clone()].concat();
2678 let mut decoder = ReaderBuilder::new()
2679 .with_batch_size(8)
2680 .with_reader_schema(reader_schema)
2681 .with_writer_schema_store(store)
2682 .with_active_fingerprint(Fingerprint::Id(id))
2683 .build_decoder()
2684 .unwrap();
2685 let _ = decoder.decode(&input).unwrap();
2686 let batch = decoder.flush().unwrap().expect("batch");
2687 assert_eq!(batch.num_rows(), 2);
2688 let col = batch
2689 .column(0)
2690 .as_any()
2691 .downcast_ref::<Int32Array>()
2692 .unwrap();
2693 assert_eq!(col.value(0), 21);
2694 assert_eq!(col.value(1), 22);
2695 }
2696
2697 #[test]
2698 fn test_unknown_id_fingerprint_is_error() {
2699 let writer_schema = make_value_schema(PrimitiveType::Int);
2700 let id_known = 7u32;
2701 let id_unknown = 9u32;
2702 let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
2703 let _ = store
2704 .set(Fingerprint::Id(id_known), writer_schema.clone())
2705 .expect("set id schema");
2706 let mut decoder = ReaderBuilder::new()
2707 .with_batch_size(8)
2708 .with_reader_schema(writer_schema)
2709 .with_writer_schema_store(store)
2710 .with_active_fingerprint(Fingerprint::Id(id_known))
2711 .build_decoder()
2712 .unwrap();
2713 let prefix = make_id_prefix(id_unknown, 0);
2714 let err = decoder.decode(&prefix).expect_err("decode should error");
2715 let msg = err.to_string();
2716 assert!(
2717 msg.contains("Unknown fingerprint"),
2718 "unexpected message: {msg}"
2719 );
2720 }
2721
2722 #[test]
2723 fn test_handle_prefix_id_incomplete_magic() {
2724 let writer_schema = make_value_schema(PrimitiveType::Int);
2725 let id = 5u32;
2726 let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
2727 let _ = store
2728 .set(Fingerprint::Id(id), writer_schema.clone())
2729 .expect("set id schema");
2730 let mut decoder = ReaderBuilder::new()
2731 .with_batch_size(8)
2732 .with_reader_schema(writer_schema)
2733 .with_writer_schema_store(store)
2734 .with_active_fingerprint(Fingerprint::Id(id))
2735 .build_decoder()
2736 .unwrap();
2737 let buf = &CONFLUENT_MAGIC[..0]; let res = decoder.handle_prefix(buf).unwrap();
2739 assert_eq!(res, Some(0));
2740 assert!(decoder.pending_schema.is_none());
2741 }
2742
2743 #[test]
2744 fn test_two_messages_same_schema_id64() {
2745 let writer_schema = make_value_schema(PrimitiveType::Int);
2746 let reader_schema = writer_schema.clone();
2747 let id = 100u64;
2748 let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id64);
2750 let _ = store
2751 .set(Fingerprint::Id64(id), writer_schema.clone())
2752 .expect("set id schema");
2753 let msg1 = make_message_id64(id, 21);
2754 let msg2 = make_message_id64(id, 22);
2755 let input = [msg1.clone(), msg2.clone()].concat();
2756 let mut decoder = ReaderBuilder::new()
2757 .with_batch_size(8)
2758 .with_reader_schema(reader_schema)
2759 .with_writer_schema_store(store)
2760 .with_active_fingerprint(Fingerprint::Id64(id))
2761 .build_decoder()
2762 .unwrap();
2763 let _ = decoder.decode(&input).unwrap();
2764 let batch = decoder.flush().unwrap().expect("batch");
2765 assert_eq!(batch.num_rows(), 2);
2766 let col = batch
2767 .column(0)
2768 .as_any()
2769 .downcast_ref::<Int32Array>()
2770 .unwrap();
2771 assert_eq!(col.value(0), 21);
2772 assert_eq!(col.value(1), 22);
2773 }
2774
2775 #[test]
2776 fn test_decode_stream_with_schema() {
2777 struct TestCase<'a> {
2778 name: &'a str,
2779 schema: &'a str,
2780 expected_error: Option<&'a str>,
2781 }
2782 let tests = vec![
2783 TestCase {
2784 name: "success",
2785 schema: r#"{"type":"record","name":"test","fields":[{"name":"f2","type":"string"}]}"#,
2786 expected_error: None,
2787 },
2788 TestCase {
2789 name: "valid schema invalid data",
2790 schema: r#"{"type":"record","name":"test","fields":[{"name":"f2","type":"long"}]}"#,
2791 expected_error: Some("did not consume all bytes"),
2792 },
2793 ];
2794 for test in tests {
2795 let avro_schema = AvroSchema::new(test.schema.to_string());
2796 let mut store = SchemaStore::new();
2797 let fp = store.register(avro_schema.clone()).unwrap();
2798 let prefix = make_prefix(fp);
2799 let record_val = "some_string";
2800 let mut body = prefix;
2801 body.push((record_val.len() as u8) << 1);
2802 body.extend_from_slice(record_val.as_bytes());
2803 let decoder_res = ReaderBuilder::new()
2804 .with_batch_size(1)
2805 .with_writer_schema_store(store)
2806 .with_active_fingerprint(fp)
2807 .build_decoder();
2808 let decoder = match decoder_res {
2809 Ok(d) => d,
2810 Err(e) => {
2811 if let Some(expected) = test.expected_error {
2812 assert!(
2813 e.to_string().contains(expected),
2814 "Test '{}' failed at build – expected '{expected}', got '{e}'",
2815 test.name
2816 );
2817 continue;
2818 } else {
2819 panic!("Test '{}' failed during build: {e}", test.name);
2820 }
2821 }
2822 };
2823 let stream = Box::pin(stream::once(async { Bytes::from(body) }));
2824 let decoded_stream = decode_stream(decoder, stream);
2825 let batches_result: Result<Vec<RecordBatch>, ArrowError> =
2826 block_on(decoded_stream.try_collect());
2827 match (batches_result, test.expected_error) {
2828 (Ok(batches), None) => {
2829 let batch =
2830 arrow::compute::concat_batches(&batches[0].schema(), &batches).unwrap();
2831 let expected_field = Field::new("f2", DataType::Utf8, false);
2832 let expected_schema = Arc::new(Schema::new(vec![expected_field]));
2833 let expected_array = Arc::new(StringArray::from(vec![record_val]));
2834 let expected_batch =
2835 RecordBatch::try_new(expected_schema, vec![expected_array]).unwrap();
2836 assert_eq!(batch, expected_batch, "Test '{}'", test.name);
2837 }
2838 (Err(e), Some(expected)) => {
2839 assert!(
2840 e.to_string().contains(expected),
2841 "Test '{}' – expected error containing '{expected}', got '{e}'",
2842 test.name
2843 );
2844 }
2845 (Ok(_), Some(expected)) => {
2846 panic!(
2847 "Test '{}' expected failure ('{expected}') but succeeded",
2848 test.name
2849 );
2850 }
2851 (Err(e), None) => {
2852 panic!("Test '{}' unexpectedly failed with '{e}'", test.name);
2853 }
2854 }
2855 }
2856 }
2857
2858 #[test]
2859 fn test_utf8view_support() {
2860 struct TestHelper;
2861 impl TestHelper {
2862 fn with_utf8view(field: &Field) -> Field {
2863 match field.data_type() {
2864 DataType::Utf8 => {
2865 Field::new(field.name(), DataType::Utf8View, field.is_nullable())
2866 .with_metadata(field.metadata().clone())
2867 }
2868 _ => field.clone(),
2869 }
2870 }
2871 }
2872
2873 let field = TestHelper::with_utf8view(&Field::new("str_field", DataType::Utf8, false));
2874
2875 assert_eq!(field.data_type(), &DataType::Utf8View);
2876
2877 let array = StringViewArray::from(vec!["test1", "test2"]);
2878 let batch =
2879 RecordBatch::try_from_iter(vec![("str_field", Arc::new(array) as ArrayRef)]).unwrap();
2880
2881 assert!(batch.column(0).as_any().is::<StringViewArray>());
2882 }
2883
2884 fn make_reader_schema_with_default_fields(
2885 path: &str,
2886 default_fields: Vec<Value>,
2887 ) -> AvroSchema {
2888 let mut root = load_writer_schema_json(path);
2889 assert_eq!(root["type"], "record", "writer schema must be a record");
2890 root.as_object_mut()
2891 .expect("schema is a JSON object")
2892 .insert("fields".to_string(), Value::Array(default_fields));
2893 AvroSchema::new(root.to_string())
2894 }
2895
/// Reads `skippable_types.avro` with a reader schema made up solely of fields
/// that carry defaults, and checks that every column is filled with its
/// declared default on every row.
#[test]
fn test_schema_resolution_defaults_all_supported_types() {
    /// Number of defaulted fields declared in the reader schema below.
    const DEFAULTED_COLUMNS: usize = 22;

    /// Yields `value` exactly `rows` times.
    fn fill<T: Clone>(value: T, rows: usize) -> std::iter::RepeatN<T> {
        std::iter::repeat_n(value, rows)
    }

    let path = "test/data/skippable_types.avro";
    // Default for the 12-byte `duration` fixed: twelve NUL characters.
    let zero_duration = "\u{0000}".repeat(12);
    let default_fields = vec![
        serde_json::json!({"name":"d_bool","type":"boolean","default":true}),
        serde_json::json!({"name":"d_int","type":"int","default":42}),
        serde_json::json!({"name":"d_long","type":"long","default":12345}),
        serde_json::json!({"name":"d_float","type":"float","default":1.5}),
        serde_json::json!({"name":"d_double","type":"double","default":2.25}),
        serde_json::json!({"name":"d_bytes","type":"bytes","default":"XYZ"}),
        serde_json::json!({"name":"d_string","type":"string","default":"hello"}),
        serde_json::json!({"name":"d_date","type":{"type":"int","logicalType":"date"},"default":0}),
        serde_json::json!({"name":"d_time_ms","type":{"type":"int","logicalType":"time-millis"},"default":1000}),
        serde_json::json!({"name":"d_time_us","type":{"type":"long","logicalType":"time-micros"},"default":2000}),
        serde_json::json!({"name":"d_ts_ms","type":{"type":"long","logicalType":"local-timestamp-millis"},"default":0}),
        serde_json::json!({"name":"d_ts_us","type":{"type":"long","logicalType":"local-timestamp-micros"},"default":0}),
        serde_json::json!({"name":"d_decimal","type":{"type":"bytes","logicalType":"decimal","precision":10,"scale":2},"default":""}),
        serde_json::json!({"name":"d_fixed","type":{"type":"fixed","name":"F4","size":4},"default":"ABCD"}),
        serde_json::json!({"name":"d_enum","type":{"type":"enum","name":"E","symbols":["A","B","C"]},"default":"A"}),
        serde_json::json!({"name":"d_duration","type":{"type":"fixed","name":"Dur","size":12,"logicalType":"duration"},"default":zero_duration}),
        serde_json::json!({"name":"d_uuid","type":{"type":"string","logicalType":"uuid"},"default":"00000000-0000-0000-0000-000000000000"}),
        serde_json::json!({"name":"d_array","type":{"type":"array","items":"int"},"default":[1,2,3]}),
        serde_json::json!({"name":"d_map","type":{"type":"map","values":"long"},"default":{"a":1,"b":2}}),
        serde_json::json!({"name":"d_record","type":{
            "type":"record","name":"DefaultRec","fields":[
                {"name":"x","type":"int"},
                {"name":"y","type":["null","string"],"default":null}
            ]
        },"default":{"x":7}}),
        serde_json::json!({"name":"d_nullable_null","type":["null","int"],"default":null}),
        serde_json::json!({"name":"d_nullable_value","type":["int","null"],"default":123}),
    ];
    let reader_schema = make_reader_schema_with_default_fields(path, default_fields);
    let actual = read_alltypes_with_reader_schema(path, reader_schema);
    let num_rows = actual.num_rows();
    assert!(num_rows > 0, "skippable_types.avro should contain rows");
    assert_eq!(
        actual.num_columns(),
        DEFAULTED_COLUMNS,
        "expected exactly our defaulted fields"
    );

    // Primitive defaults.
    let bool_col: ArrayRef = Arc::new(BooleanArray::from_iter(fill(Some(true), num_rows)));
    let int_col: ArrayRef = Arc::new(Int32Array::from_iter_values(fill(42, num_rows)));
    let long_col: ArrayRef = Arc::new(Int64Array::from_iter_values(fill(12345, num_rows)));
    let float_col: ArrayRef = Arc::new(Float32Array::from_iter_values(fill(1.5f32, num_rows)));
    let double_col: ArrayRef = Arc::new(Float64Array::from_iter_values(fill(2.25f64, num_rows)));
    let bytes_col: ArrayRef = Arc::new(BinaryArray::from_iter_values(fill(
        &b"XYZ"[..],
        num_rows,
    )));
    let string_col: ArrayRef = Arc::new(StringArray::from_iter_values(fill("hello", num_rows)));

    // Date / time / timestamp defaults.
    let date_col: ArrayRef = Arc::new(Date32Array::from_iter_values(fill(0, num_rows)));
    let time_ms_col: ArrayRef = Arc::new(Time32MillisecondArray::from_iter_values(fill(
        1_000, num_rows,
    )));
    let time_us_col: ArrayRef = Arc::new(Time64MicrosecondArray::from_iter_values(fill(
        2_000i64, num_rows,
    )));
    let ts_ms_col: ArrayRef = Arc::new(TimestampMillisecondArray::from_iter_values(fill(
        0i64, num_rows,
    )));
    let ts_us_col: ArrayRef = Arc::new(TimestampMicrosecondArray::from_iter_values(fill(
        0i64, num_rows,
    )));

    // The decimal's physical width depends on the `small_decimals` feature.
    #[cfg(feature = "small_decimals")]
    let decimal_col: ArrayRef = Arc::new(
        Decimal64Array::from_iter_values(fill(0i64, num_rows))
            .with_precision_and_scale(10, 2)
            .unwrap(),
    );
    #[cfg(not(feature = "small_decimals"))]
    let decimal_col: ArrayRef = Arc::new(
        Decimal128Array::from_iter_values(fill(0i128, num_rows))
            .with_precision_and_scale(10, 2)
            .unwrap(),
    );

    let fixed_col: ArrayRef = Arc::new(
        FixedSizeBinaryArray::try_from_sparse_iter_with_size(fill(Some(*b"ABCD"), num_rows), 4)
            .unwrap(),
    );

    // Enum default "A" is key 0 of the dictionary ["A", "B", "C"].
    let enum_col: ArrayRef = Arc::new(
        DictionaryArray::<Int32Type>::try_new(
            Int32Array::from_iter_values(fill(0, num_rows)),
            Arc::new(StringArray::from_iter_values(["A", "B", "C"])),
        )
        .unwrap(),
    );

    let duration_col: ArrayRef = Arc::new(
        fill(
            Some(IntervalMonthDayNanoType::make_value(0, 0, 0)),
            num_rows,
        )
        .collect::<IntervalMonthDayNanoArray>(),
    );

    let uuid_col: ArrayRef = Arc::new(
        FixedSizeBinaryArray::try_from_sparse_iter_with_size(fill(Some([0u8; 16]), num_rows), 16)
            .unwrap(),
    );

    // Array default [1, 2, 3], with non-nullable items.
    let list_item = Arc::new(Field::new(
        Field::LIST_FIELD_DEFAULT_NAME,
        DataType::Int32,
        false,
    ));
    let mut list_builder = ListBuilder::new(Int32Builder::new()).with_field(list_item);
    for _ in 0..num_rows {
        for item in [1, 2, 3] {
            list_builder.values().append_value(item);
        }
        list_builder.append(true);
    }
    let list_col: ArrayRef = Arc::new(list_builder.finish());

    // Map default {"a": 1, "b": 2}, with non-nullable values.
    let map_value_field = Arc::new(Field::new("value", DataType::Int64, false));
    let map_names = builder::MapFieldNames {
        entry: "entries".to_string(),
        key: "key".to_string(),
        value: "value".to_string(),
    };
    let mut map_builder = MapBuilder::new(Some(map_names), StringBuilder::new(), Int64Builder::new())
        .with_values_field(map_value_field);
    for _ in 0..num_rows {
        for (key, value) in [("a", 1_i64), ("b", 2_i64)] {
            let (key_builder, value_builder) = map_builder.entries();
            key_builder.append_value(key);
            value_builder.append_value(value);
        }
        map_builder.append(true).unwrap();
    }
    let map_col: ArrayRef = Arc::new(map_builder.finish());

    // Record default {"x": 7}; "y" falls back to its own null default.
    let record_fields: Fields = Fields::from(vec![
        Field::new("x", DataType::Int32, false),
        Field::new("y", DataType::Utf8, true),
    ]);
    let mut record_builder = StructBuilder::new(
        record_fields,
        vec![
            Box::new(Int32Builder::new()),
            Box::new(StringBuilder::new()),
        ],
    );
    for _ in 0..num_rows {
        record_builder
            .field_builder::<Int32Builder>(0)
            .unwrap()
            .append_value(7);
        record_builder
            .field_builder::<StringBuilder>(1)
            .unwrap()
            .append_null();
        record_builder.append(true);
    }
    let record_col: ArrayRef = Arc::new(record_builder.finish());

    let nullable_null_col: ArrayRef =
        Arc::new(Int32Array::from_iter(fill(None::<i32>, num_rows)));
    let nullable_value_col: ArrayRef =
        Arc::new(Int32Array::from_iter_values(fill(123, num_rows)));

    // Column order must follow the field order of the reader schema above.
    let expected_columns: Vec<ArrayRef> = vec![
        bool_col,
        int_col,
        long_col,
        float_col,
        double_col,
        bytes_col,
        string_col,
        date_col,
        time_ms_col,
        time_us_col,
        ts_ms_col,
        ts_us_col,
        decimal_col,
        fixed_col,
        enum_col,
        duration_col,
        uuid_col,
        list_col,
        map_col,
        record_col,
        nullable_null_col,
        nullable_value_col,
    ];
    let expected = RecordBatch::try_new(actual.schema(), expected_columns).unwrap();
    assert_eq!(
        actual, expected,
        "defaults should materialize correctly for all fields"
    );
}
3070
/// Building a reader must fail when an enum field's default is not one of
/// the enum's declared symbols.
#[test]
fn test_schema_resolution_default_enum_invalid_symbol_errors() {
    let path = "test/data/skippable_types.avro";
    // "Z" is not among the symbols A/B/C.
    let invalid_field = serde_json::json!({
        "name":"bad_enum",
        "type":{"type":"enum","name":"E","symbols":["A","B","C"]},
        "default":"Z"
    });
    let reader_schema = make_reader_schema_with_default_fields(path, vec![invalid_field]);
    let input = BufReader::new(File::open(path).unwrap());
    let build_error = ReaderBuilder::new()
        .with_reader_schema(reader_schema)
        .build(input)
        .expect_err("expected enum default validation to fail");
    let msg = build_error.to_string();
    let normalized = msg.to_lowercase();
    let mentions_enum = normalized.contains("enum");
    let mentions_cause = normalized.contains("symbol") || normalized.contains("default");
    assert!(mentions_enum && mentions_cause, "unexpected error: {msg}");
}
3095
/// Building a reader must fail when a fixed field's default does not have
/// the declared size.
#[test]
fn test_schema_resolution_default_fixed_size_mismatch_errors() {
    let path = "test/data/skippable_types.avro";
    // Three-character default for a fixed declared with size 4.
    let invalid_field = serde_json::json!({
        "name":"bad_fixed",
        "type":{"type":"fixed","name":"F","size":4},
        "default":"ABC"
    });
    let reader_schema = make_reader_schema_with_default_fields(path, vec![invalid_field]);
    let input = BufReader::new(File::open(path).unwrap());
    let build_error = ReaderBuilder::new()
        .with_reader_schema(reader_schema)
        .build(input)
        .expect_err("expected fixed default validation to fail");
    let msg = build_error.to_string();
    let normalized = msg.to_lowercase();
    let mentions_fixed = normalized.contains("fixed");
    let mentions_cause = ["size", "length", "does not match"]
        .iter()
        .any(|needle| normalized.contains(needle));
    assert!(mentions_fixed && mentions_cause, "unexpected error: {msg}");
}
3122
/// Projects `alltypes_plain.avro` down to `double_col` alone and checks the
/// resulting single-column batch.
#[test]
#[cfg(feature = "snappy")]
fn test_alltypes_skip_writer_fields_keep_double_only() {
    let file = arrow_test_data("avro/alltypes_plain.avro");
    let reader_schema = make_reader_schema_with_selected_fields_in_order(&file, &["double_col"]);
    let batch = read_alltypes_with_reader_schema(&file, reader_schema);

    // Eight rows alternating between 0.0 and 10.1.
    let double_values = (0_i32..8).map(|row| f64::from(row % 2) * 10.1);
    let double_col: ArrayRef = Arc::new(Float64Array::from_iter_values(double_values));
    let expected =
        RecordBatch::try_from_iter_with_nullable([("double_col", double_col, true)]).unwrap();
    assert_eq!(batch, expected);
}
3141
/// Projects `alltypes_plain.avro` to `timestamp_col` followed by `id` and
/// checks that both columns come back in the reader's order.
#[test]
#[cfg(feature = "snappy")]
fn test_alltypes_skip_writer_fields_reorder_and_skip_many() {
    let file = arrow_test_data("avro/alltypes_plain.avro");
    let selected = ["timestamp_col", "id"];
    let reader_schema = make_reader_schema_with_selected_fields_in_order(&file, &selected);
    let batch = read_alltypes_with_reader_schema(&file, reader_schema);

    let timestamps_us: [i64; 8] = [
        1235865600000000,
        1235865660000000,
        1238544000000000,
        1238544060000000,
        1233446400000000,
        1233446460000000,
        1230768000000000,
        1230768060000000,
    ];
    let timestamp_col: ArrayRef = Arc::new(
        TimestampMicrosecondArray::from_iter_values(timestamps_us).with_timezone("+00:00"),
    );
    let id_col: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1]));

    let expected = RecordBatch::try_from_iter_with_nullable([
        ("timestamp_col", timestamp_col, true),
        ("id", id_col, true),
    ])
    .unwrap();
    assert_eq!(batch, expected);
}
3177
/// For every field of the writer schema in `skippable_types.avro`, reads the
/// file with a reader schema containing only that field and compares the
/// projected column against the same column from a full read.
#[test]
fn test_skippable_types_project_each_field_individually() {
    /// Re-wraps the offsets, values and validity of a list column around
    /// `new_elem` as its element field. `is_large` selects between
    /// `LargeListArray` and `ListArray`; panics if `col` is not of that type.
    fn rebuild_list_array_with_element(
        col: &ArrayRef,
        new_elem: Arc<Field>,
        is_large: bool,
    ) -> ArrayRef {
        let any = col.as_any();
        if !is_large {
            let src = any
                .downcast_ref::<ListArray>()
                .expect("expected ListArray");
            let rebuilt = ListArray::try_new(
                new_elem,
                src.offsets().clone(),
                src.values().clone(),
                src.nulls().cloned(),
            )
            .unwrap();
            return Arc::new(rebuilt);
        }
        let src = any
            .downcast_ref::<LargeListArray>()
            .expect("expected LargeListArray");
        let rebuilt = LargeListArray::try_new(
            new_elem,
            src.offsets().clone(),
            src.values().clone(),
            src.nulls().cloned(),
        )
        .unwrap();
        Arc::new(rebuilt)
    }

    let path = "test/data/skippable_types.avro";
    let full = read_file(path, 1024, false);
    let schema_full = full.schema();
    let num_rows = full.num_rows();

    let writer_json = load_writer_schema_json(path);
    assert_eq!(
        writer_json["type"], "record",
        "writer schema must be a record"
    );
    let fields_json = writer_json
        .get("fields")
        .and_then(Value::as_array)
        .expect("record has fields");
    assert_eq!(
        schema_full.fields().len(),
        fields_json.len(),
        "full read column count vs writer fields"
    );

    for (idx, field_json) in fields_json.iter().enumerate() {
        let Some(name) = field_json.get("name").and_then(Value::as_str) else {
            panic!("field at index {idx} has no name");
        };
        let reader_schema = make_reader_schema_with_selected_fields_in_order(path, &[name]);
        let projected = read_alltypes_with_reader_schema(path, reader_schema);
        assert_eq!(
            projected.num_columns(),
            1,
            "projected batch should contain exactly the selected column '{name}'"
        );
        assert_eq!(
            projected.num_rows(),
            num_rows,
            "row count mismatch for projected column '{name}'"
        );

        let col_full = full.column(idx).clone();
        let full_field = schema_full.field(idx).as_ref().clone();
        let projected_field_ref = projected.schema().field(0).clone();
        let proj_field = projected_field_ref.as_ref();
        // The expected field always carries the projected field's top-level metadata.
        let top_meta = proj_field.metadata().clone();

        // When both reads produced the same kind of list, the expected column
        // is rebuilt around the projected element field.
        let list_element = match (full_field.data_type(), proj_field.data_type()) {
            (DataType::List(_), DataType::List(elem)) => Some((elem.clone(), false)),
            (DataType::LargeList(_), DataType::LargeList(elem)) => Some((elem.clone(), true)),
            _ => None,
        };
        let (expected_field, expected_col): (Arc<Field>, ArrayRef) =
            if let Some((elem, is_large)) = list_element {
                let rebuilt = rebuild_list_array_with_element(&col_full, elem, is_large);
                let relabeled = Field::new(
                    full_field.name().clone(),
                    proj_field.data_type().clone(),
                    full_field.is_nullable(),
                )
                .with_metadata(top_meta);
                (Arc::new(relabeled), rebuilt)
            } else {
                (Arc::new(full_field.with_metadata(top_meta)), col_full)
            };

        let expected_schema = Arc::new(Schema::new(vec![expected_field]));
        let expected = RecordBatch::try_new(expected_schema, vec![expected_col]).unwrap();
        assert_eq!(
            projected, expected,
            "projected column '{name}' mismatch vs full read column"
        );
    }
}
3286
/// Reads `union_fields.avro` and checks how nullable unions and general
/// unions are decoded: nullable unions become plain nullable columns, while
/// other unions become `UnionArray`s whose per-row type ids and child values
/// match the expected branches.
#[test]
fn test_union_fields_avro_nullable_and_general_unions() {
    /// Fetches column `name` from `batch` as a `UnionArray`, panicking with
    /// `type_msg` if the column has another type.
    fn union_column<'a>(batch: &'a RecordBatch, name: &str, type_msg: &str) -> &'a UnionArray {
        let idx = batch.schema().index_of(name).unwrap();
        batch
            .column(idx)
            .as_any()
            .downcast_ref::<UnionArray>()
            .expect(type_msg)
    }

    /// Type id of the first union child whose field name is `name`.
    fn type_id_by_name(u: &UnionArray, name: &str) -> i8 {
        match u.data_type() {
            DataType::Union(fields, _) => fields
                .iter()
                .find(|(_, f)| f.name() == name)
                .map(|(tid, _)| tid)
                .unwrap_or_else(|| panic!("union child '{name}' not found")),
            other => panic!("expected Union, got {other:?}"),
        }
    }

    /// Type id of the last union child whose data type satisfies `pred`.
    fn last_type_id_where(u: &UnionArray, pred: impl Fn(&DataType) -> bool) -> Option<i8> {
        match u.data_type() {
            DataType::Union(fields, _) => fields
                .iter()
                .filter(|(_, f)| pred(f.data_type()))
                .map(|(tid, _)| tid)
                .last(),
            other => panic!("expected Union, got {other:?}"),
        }
    }

    /// True for a struct with exactly two children named `first` and `second`.
    fn is_two_field_struct(dt: &DataType, first: &str, second: &str) -> bool {
        matches!(
            dt,
            DataType::Struct(children)
                if children.len() == 2
                    && children[0].name() == first
                    && children[1].name() == second
        )
    }

    /// Per-row type ids of `u`, copied into a `Vec`.
    fn type_ids_of(u: &UnionArray) -> Vec<i8> {
        u.type_ids().iter().copied().collect()
    }

    /// Child `type_id` of `u`, downcast to the concrete array type `T`.
    fn child_as<T: 'static>(u: &UnionArray, type_id: i8) -> &T {
        u.child(type_id).as_any().downcast_ref::<T>().unwrap()
    }

    /// Column `name` of `record`, downcast to the concrete array type `T`.
    fn struct_field_as<'a, T: 'static>(record: &'a StructArray, name: &str) -> &'a T {
        record
            .column_by_name(name)
            .unwrap()
            .as_any()
            .downcast_ref::<T>()
            .unwrap()
    }

    let path = "test/data/union_fields.avro";
    let batch = read_file(path, 1024, false);
    let schema = batch.schema();

    // ["null", "int"] decodes to a plain nullable Int32 column.
    let nullable_ints = batch
        .column(schema.index_of("nullable_int_nullfirst").unwrap())
        .as_primitive::<Int32Type>();
    assert_eq!(nullable_ints.len(), 4);
    assert!(nullable_ints.is_null(0));
    assert_eq!(nullable_ints.value(1), 42);
    assert!(nullable_ints.is_null(2));
    assert_eq!(nullable_ints.value(3), 0);

    // ["string", "null"] decodes to a plain nullable Utf8 column.
    let nullable_strings = batch
        .column(schema.index_of("nullable_string_nullsecond").unwrap())
        .as_any()
        .downcast_ref::<StringArray>()
        .expect("nullable_string_nullsecond should be Utf8");
    assert_eq!(nullable_strings.len(), 4);
    assert_eq!(nullable_strings.value(0), "s1");
    assert!(nullable_strings.is_null(1));
    assert_eq!(nullable_strings.value(2), "s3");
    // An empty string is a valid value, not a null.
    assert!(nullable_strings.is_valid(3));
    assert_eq!(nullable_strings.value(3), "");

    // union_prim: a dense union where each row selects a different primitive branch.
    let prim = union_column(&batch, "union_prim", "union_prim should be Union");
    match prim.data_type() {
        DataType::Union(_, mode) => {
            assert!(matches!(mode, UnionMode::Dense), "expect dense unions");
        }
        other => panic!("expected Union, got {other:?}"),
    }
    let prim_tid = |name: &str| type_id_by_name(prim, name);
    let expected_type_ids = vec![
        prim_tid("long"),
        prim_tid("int"),
        prim_tid("float"),
        prim_tid("double"),
    ];
    let type_ids = type_ids_of(prim);
    assert_eq!(
        type_ids, expected_type_ids,
        "branch selection for union_prim rows"
    );
    assert_eq!(child_as::<Int64Array>(prim, prim_tid("long")).len(), 1);
    assert_eq!(child_as::<Int32Array>(prim, prim_tid("int")).len(), 1);
    assert_eq!(child_as::<Float32Array>(prim, prim_tid("float")).len(), 1);
    assert_eq!(child_as::<Float64Array>(prim, prim_tid("double")).len(), 1);

    // union_bytes_vs_string: bytes and string must stay in separate branches.
    let bytes_or_string = union_column(
        &batch,
        "union_bytes_vs_string",
        "union_bytes_vs_string should be Union",
    );
    let tid_bytes = type_id_by_name(bytes_or_string, "bytes");
    let tid_string = type_id_by_name(bytes_or_string, "string");
    let type_ids = type_ids_of(bytes_or_string);
    assert_eq!(
        type_ids,
        vec![tid_bytes, tid_string, tid_string, tid_bytes],
        "branch selection for bytes/string union"
    );
    let string_child = child_as::<StringArray>(bytes_or_string, tid_string);
    assert_eq!(string_child.len(), 2);
    assert_eq!(string_child.value(0), "hello");
    assert_eq!(string_child.value(1), "world");
    let bytes_child = child_as::<BinaryArray>(bytes_or_string, tid_bytes);
    assert_eq!(bytes_child.len(), 2);
    assert_eq!(bytes_child.value(0), &[0x00, 0xFF, 0x7F]);
    assert_eq!(bytes_child.value(1), b"");

    // union_enum_records_array_map: branches are identified by Arrow data type.
    let complex = union_column(
        &batch,
        "union_enum_records_array_map",
        "union_enum_records_array_map should be Union",
    );
    let tid_enum = last_type_id_where(complex, |dt| matches!(dt, DataType::Dictionary(_, _)))
        .expect("enum child");
    let tid_rec_a =
        last_type_id_where(complex, |dt| is_two_field_struct(dt, "a", "b")).expect("RecA child");
    let tid_rec_b =
        last_type_id_where(complex, |dt| is_two_field_struct(dt, "x", "y")).expect("RecB child");
    let tid_array = last_type_id_where(complex, |dt| matches!(dt, DataType::List(_)))
        .expect("array<long> child");
    let type_ids = type_ids_of(complex);
    assert_eq!(
        type_ids,
        vec![tid_enum, tid_rec_a, tid_rec_b, tid_array],
        "branch selection for complex union"
    );

    let enum_child = child_as::<DictionaryArray<Int32Type>>(complex, tid_enum);
    assert_eq!(enum_child.len(), 1);
    assert!(enum_child.is_valid(0));

    let rec_a = child_as::<StructArray>(complex, tid_rec_a);
    assert_eq!(rec_a.len(), 1);
    assert_eq!(struct_field_as::<Int32Array>(rec_a, "a").value(0), 7);
    assert_eq!(struct_field_as::<StringArray>(rec_a, "b").value(0), "x");

    let rec_b = child_as::<StructArray>(complex, tid_rec_b);
    assert_eq!(
        struct_field_as::<Int64Array>(rec_b, "x").value(0),
        123_456_789_i64
    );
    assert_eq!(
        struct_field_as::<BinaryArray>(rec_b, "y").value(0),
        &[0xFF, 0x00]
    );

    let list_child = child_as::<ListArray>(complex, tid_array);
    assert_eq!(list_child.len(), 1);
    let first_list = list_child.value(0);
    let list_items = first_list.as_any().downcast_ref::<Int64Array>().unwrap();
    assert_eq!(list_items.len(), 3);
    assert_eq!(list_items.value(0), 1);
    assert_eq!(list_items.value(1), 2);
    assert_eq!(list_items.value(2), 3);

    // union_date_or_fixed4: rows alternate between the date and fixed(4) branches.
    let date_or_fixed = union_column(
        &batch,
        "union_date_or_fixed4",
        "union_date_or_fixed4 should be Union",
    );
    let tid_date =
        last_type_id_where(date_or_fixed, |dt| matches!(dt, DataType::Date32)).expect("date");
    let tid_fixed = last_type_id_where(date_or_fixed, |dt| {
        matches!(dt, DataType::FixedSizeBinary(4))
    })
    .expect("fixed(4)");
    let type_ids = type_ids_of(date_or_fixed);
    assert_eq!(
        type_ids,
        vec![tid_date, tid_fixed, tid_date, tid_fixed],
        "branch selection for date/fixed4 union"
    );
    let date_child = child_as::<Date32Array>(date_or_fixed, tid_date);
    assert_eq!(date_child.len(), 2);
    assert_eq!(date_child.value(0), 19_000);
    assert_eq!(date_child.value(1), 0);
    let fixed_child = child_as::<FixedSizeBinaryArray>(date_or_fixed, tid_fixed);
    assert_eq!(fixed_child.len(), 2);
    assert_eq!(fixed_child.value(0), b"ABCD");
    assert_eq!(fixed_child.value(1), &[0x00, 0x11, 0x22, 0x33]);
}
3554
3555 #[test]
3556 fn test_union_schema_resolution_all_type_combinations() {
3557 let path = "test/data/union_fields.avro";
3558 let baseline = read_file(path, 1024, false);
3559 let baseline_schema = baseline.schema();
3560 let mut root = load_writer_schema_json(path);
3561 assert_eq!(root["type"], "record", "writer schema must be a record");
3562 let fields = root
3563 .get_mut("fields")
3564 .and_then(|f| f.as_array_mut())
3565 .expect("record has fields");
3566 fn is_named_type(obj: &Value, ty: &str, nm: &str) -> bool {
3567 obj.get("type").and_then(|v| v.as_str()) == Some(ty)
3568 && obj.get("name").and_then(|v| v.as_str()) == Some(nm)
3569 }
3570 fn is_logical(obj: &Value, prim: &str, lt: &str) -> bool {
3571 obj.get("type").and_then(|v| v.as_str()) == Some(prim)
3572 && obj.get("logicalType").and_then(|v| v.as_str()) == Some(lt)
3573 }
3574 fn find_first(arr: &[Value], pred: impl Fn(&Value) -> bool) -> Option<Value> {
3575 arr.iter().find(|v| pred(v)).cloned()
3576 }
3577 fn prim(s: &str) -> Value {
3578 Value::String(s.to_string())
3579 }
3580 for f in fields.iter_mut() {
3581 let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
3582 continue;
3583 };
3584 match name {
3585 "nullable_int_nullfirst" => {
3587 f["type"] = json!(["int", "null"]);
3588 }
3589 "nullable_string_nullsecond" => {
3590 f["type"] = json!(["null", "string"]);
3591 }
3592 "union_prim" => {
3593 let orig = f["type"].as_array().unwrap().clone();
3594 let long = prim("long");
3595 let double = prim("double");
3596 let string = prim("string");
3597 let bytes = prim("bytes");
3598 let boolean = prim("boolean");
3599 assert!(orig.contains(&long));
3600 assert!(orig.contains(&double));
3601 assert!(orig.contains(&string));
3602 assert!(orig.contains(&bytes));
3603 assert!(orig.contains(&boolean));
3604 f["type"] = json!([long, double, string, bytes, boolean]);
3605 }
3606 "union_bytes_vs_string" => {
3607 f["type"] = json!(["string", "bytes"]);
3608 }
3609 "union_fixed_dur_decfix" => {
3610 let orig = f["type"].as_array().unwrap().clone();
3611 let fx8 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx8")).unwrap();
3612 let dur12 = find_first(&orig, |o| is_named_type(o, "fixed", "Dur12")).unwrap();
3613 let decfix16 =
3614 find_first(&orig, |o| is_named_type(o, "fixed", "DecFix16")).unwrap();
3615 f["type"] = json!([decfix16, dur12, fx8]);
3616 }
3617 "union_enum_records_array_map" => {
3618 let orig = f["type"].as_array().unwrap().clone();
3619 let enum_color = find_first(&orig, |o| {
3620 o.get("type").and_then(|v| v.as_str()) == Some("enum")
3621 })
3622 .unwrap();
3623 let rec_a = find_first(&orig, |o| is_named_type(o, "record", "RecA")).unwrap();
3624 let rec_b = find_first(&orig, |o| is_named_type(o, "record", "RecB")).unwrap();
3625 let arr = find_first(&orig, |o| {
3626 o.get("type").and_then(|v| v.as_str()) == Some("array")
3627 })
3628 .unwrap();
3629 let map = find_first(&orig, |o| {
3630 o.get("type").and_then(|v| v.as_str()) == Some("map")
3631 })
3632 .unwrap();
3633 f["type"] = json!([arr, map, rec_b, rec_a, enum_color]);
3634 }
3635 "union_date_or_fixed4" => {
3636 let orig = f["type"].as_array().unwrap().clone();
3637 let date = find_first(&orig, |o| is_logical(o, "int", "date")).unwrap();
3638 let fx4 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx4")).unwrap();
3639 f["type"] = json!([fx4, date]);
3640 }
3641 "union_time_millis_or_enum" => {
3642 let orig = f["type"].as_array().unwrap().clone();
3643 let time_ms =
3644 find_first(&orig, |o| is_logical(o, "int", "time-millis")).unwrap();
3645 let en = find_first(&orig, |o| {
3646 o.get("type").and_then(|v| v.as_str()) == Some("enum")
3647 })
3648 .unwrap();
3649 f["type"] = json!([en, time_ms]);
3650 }
3651 "union_time_micros_or_string" => {
3652 let orig = f["type"].as_array().unwrap().clone();
3653 let time_us =
3654 find_first(&orig, |o| is_logical(o, "long", "time-micros")).unwrap();
3655 f["type"] = json!(["string", time_us]);
3656 }
3657 "union_ts_millis_utc_or_array" => {
3658 let orig = f["type"].as_array().unwrap().clone();
3659 let ts_ms =
3660 find_first(&orig, |o| is_logical(o, "long", "timestamp-millis")).unwrap();
3661 let arr = find_first(&orig, |o| {
3662 o.get("type").and_then(|v| v.as_str()) == Some("array")
3663 })
3664 .unwrap();
3665 f["type"] = json!([arr, ts_ms]);
3666 }
3667 "union_ts_micros_local_or_bytes" => {
3668 let orig = f["type"].as_array().unwrap().clone();
3669 let lts_us =
3670 find_first(&orig, |o| is_logical(o, "long", "local-timestamp-micros"))
3671 .unwrap();
3672 f["type"] = json!(["bytes", lts_us]);
3673 }
3674 "union_uuid_or_fixed10" => {
3675 let orig = f["type"].as_array().unwrap().clone();
3676 let uuid = find_first(&orig, |o| is_logical(o, "string", "uuid")).unwrap();
3677 let fx10 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx10")).unwrap();
3678 f["type"] = json!([fx10, uuid]);
3679 }
3680 "union_dec_bytes_or_dec_fixed" => {
3681 let orig = f["type"].as_array().unwrap().clone();
3682 let dec_bytes = find_first(&orig, |o| {
3683 o.get("type").and_then(|v| v.as_str()) == Some("bytes")
3684 && o.get("logicalType").and_then(|v| v.as_str()) == Some("decimal")
3685 })
3686 .unwrap();
3687 let dec_fix = find_first(&orig, |o| {
3688 is_named_type(o, "fixed", "DecFix20")
3689 && o.get("logicalType").and_then(|v| v.as_str()) == Some("decimal")
3690 })
3691 .unwrap();
3692 f["type"] = json!([dec_fix, dec_bytes]);
3693 }
3694 "union_null_bytes_string" => {
3695 f["type"] = json!(["bytes", "string", "null"]);
3696 }
3697 "array_of_union" => {
3698 let obj = f
3699 .get_mut("type")
3700 .expect("array type")
3701 .as_object_mut()
3702 .unwrap();
3703 obj.insert("items".to_string(), json!(["string", "long"]));
3704 }
3705 "map_of_union" => {
3706 let obj = f
3707 .get_mut("type")
3708 .expect("map type")
3709 .as_object_mut()
3710 .unwrap();
3711 obj.insert("values".to_string(), json!(["double", "null"]));
3712 }
3713 "record_with_union_field" => {
3714 let rec = f
3715 .get_mut("type")
3716 .expect("record type")
3717 .as_object_mut()
3718 .unwrap();
3719 let rec_fields = rec.get_mut("fields").unwrap().as_array_mut().unwrap();
3720 let mut found = false;
3721 for rf in rec_fields.iter_mut() {
3722 if rf.get("name").and_then(|v| v.as_str()) == Some("u") {
3723 rf["type"] = json!(["string", "long"]); found = true;
3725 break;
3726 }
3727 }
3728 assert!(found, "field 'u' expected in HasUnion");
3729 }
3730 "union_ts_micros_utc_or_map" => {
3731 let orig = f["type"].as_array().unwrap().clone();
3732 let ts_us =
3733 find_first(&orig, |o| is_logical(o, "long", "timestamp-micros")).unwrap();
3734 let map = find_first(&orig, |o| {
3735 o.get("type").and_then(|v| v.as_str()) == Some("map")
3736 })
3737 .unwrap();
3738 f["type"] = json!([map, ts_us]);
3739 }
3740 "union_ts_millis_local_or_string" => {
3741 let orig = f["type"].as_array().unwrap().clone();
3742 let lts_ms =
3743 find_first(&orig, |o| is_logical(o, "long", "local-timestamp-millis"))
3744 .unwrap();
3745 f["type"] = json!(["string", lts_ms]);
3746 }
3747 "union_bool_or_string" => {
3748 f["type"] = json!(["string", "boolean"]);
3749 }
3750 _ => {}
3751 }
3752 }
3753 let reader_schema = AvroSchema::new(root.to_string());
3754 let resolved = read_alltypes_with_reader_schema(path, reader_schema);
3755
3756 fn branch_token(dt: &DataType) -> String {
3757 match dt {
3758 DataType::Null => "null".into(),
3759 DataType::Boolean => "boolean".into(),
3760 DataType::Int32 => "int".into(),
3761 DataType::Int64 => "long".into(),
3762 DataType::Float32 => "float".into(),
3763 DataType::Float64 => "double".into(),
3764 DataType::Binary => "bytes".into(),
3765 DataType::Utf8 => "string".into(),
3766 DataType::Date32 => "date".into(),
3767 DataType::Time32(arrow_schema::TimeUnit::Millisecond) => "time-millis".into(),
3768 DataType::Time64(arrow_schema::TimeUnit::Microsecond) => "time-micros".into(),
3769 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => if tz.is_some() {
3770 "timestamp-millis"
3771 } else {
3772 "local-timestamp-millis"
3773 }
3774 .into(),
3775 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => if tz.is_some() {
3776 "timestamp-micros"
3777 } else {
3778 "local-timestamp-micros"
3779 }
3780 .into(),
3781 DataType::Interval(IntervalUnit::MonthDayNano) => "duration".into(),
3782 DataType::FixedSizeBinary(n) => format!("fixed{n}"),
3783 DataType::Dictionary(_, _) => "enum".into(),
3784 DataType::Decimal128(p, s) => format!("decimal({p},{s})"),
3785 DataType::Decimal256(p, s) => format!("decimal({p},{s})"),
3786 #[cfg(feature = "small_decimals")]
3787 DataType::Decimal64(p, s) => format!("decimal({p},{s})"),
3788 DataType::Struct(fields) => {
3789 if fields.len() == 2 && fields[0].name() == "a" && fields[1].name() == "b" {
3790 "record:RecA".into()
3791 } else if fields.len() == 2
3792 && fields[0].name() == "x"
3793 && fields[1].name() == "y"
3794 {
3795 "record:RecB".into()
3796 } else {
3797 "record".into()
3798 }
3799 }
3800 DataType::List(_) => "array".into(),
3801 DataType::Map(_, _) => "map".into(),
3802 other => format!("{other:?}"),
3803 }
3804 }
3805
3806 fn union_tokens(u: &UnionArray) -> (Vec<i8>, HashMap<i8, String>) {
3807 let fields = match u.data_type() {
3808 DataType::Union(fields, _) => fields,
3809 other => panic!("expected Union, got {other:?}"),
3810 };
3811 let mut dict: HashMap<i8, String> = HashMap::with_capacity(fields.len());
3812 for (tid, f) in fields.iter() {
3813 dict.insert(tid, branch_token(f.data_type()));
3814 }
3815 let ids: Vec<i8> = u.type_ids().iter().copied().collect();
3816 (ids, dict)
3817 }
3818
// Returns the branch token the resolved (reader-side) union is expected to use
// for a value the writer stored under `writer_token` in field `field_name`.
//
// Only two fields remap tokens: `union_prim` maps int -> long and float -> double,
// and `record_with_union_field.u` maps int -> long. Every other combination
// passes the writer token through unchanged.
fn expected_token(field_name: &str, writer_token: &str) -> String {
    let resolved = match (field_name, writer_token) {
        ("union_prim" | "record_with_union_field.u", "int") => "long",
        ("union_prim", "float") => "double",
        (_, unchanged) => unchanged,
    };
    resolved.to_owned()
}
3833
3834 fn get_union<'a>(
3835 rb: &'a RecordBatch,
3836 schema: arrow_schema::SchemaRef,
3837 fname: &str,
3838 ) -> &'a UnionArray {
3839 let idx = schema.index_of(fname).unwrap();
3840 rb.column(idx)
3841 .as_any()
3842 .downcast_ref::<UnionArray>()
3843 .unwrap_or_else(|| panic!("{fname} should be a Union"))
3844 }
3845
3846 fn assert_union_equivalent(field_name: &str, u_writer: &UnionArray, u_reader: &UnionArray) {
3847 let (ids_w, dict_w) = union_tokens(u_writer);
3848 let (ids_r, dict_r) = union_tokens(u_reader);
3849 assert_eq!(
3850 ids_w.len(),
3851 ids_r.len(),
3852 "{field_name}: row count mismatch between baseline and resolved"
3853 );
3854 for (i, (id_w, id_r)) in ids_w.iter().zip(ids_r.iter()).enumerate() {
3855 let w_tok = dict_w.get(id_w).unwrap();
3856 let want = expected_token(field_name, w_tok);
3857 let got = dict_r.get(id_r).unwrap();
3858 assert_eq!(
3859 got, &want,
3860 "{field_name}: row {i} resolved to wrong union branch (writer={w_tok}, expected={want}, got={got})"
3861 );
3862 }
3863 }
3864
3865 for (fname, dt) in [
3866 ("nullable_int_nullfirst", DataType::Int32),
3867 ("nullable_string_nullsecond", DataType::Utf8),
3868 ] {
3869 let idx_b = baseline_schema.index_of(fname).unwrap();
3870 let idx_r = resolved.schema().index_of(fname).unwrap();
3871 let col_b = baseline.column(idx_b);
3872 let col_r = resolved.column(idx_r);
3873 assert_eq!(
3874 col_b.data_type(),
3875 &dt,
3876 "baseline {fname} should decode as non-union with nullability"
3877 );
3878 assert_eq!(
3879 col_b.as_ref(),
3880 col_r.as_ref(),
3881 "{fname}: values must be identical regardless of null-branch order"
3882 );
3883 }
3884 let union_fields = [
3885 "union_prim",
3886 "union_bytes_vs_string",
3887 "union_fixed_dur_decfix",
3888 "union_enum_records_array_map",
3889 "union_date_or_fixed4",
3890 "union_time_millis_or_enum",
3891 "union_time_micros_or_string",
3892 "union_ts_millis_utc_or_array",
3893 "union_ts_micros_local_or_bytes",
3894 "union_uuid_or_fixed10",
3895 "union_dec_bytes_or_dec_fixed",
3896 "union_null_bytes_string",
3897 "union_ts_micros_utc_or_map",
3898 "union_ts_millis_local_or_string",
3899 "union_bool_or_string",
3900 ];
3901 for fname in union_fields {
3902 let u_b = get_union(&baseline, baseline_schema.clone(), fname);
3903 let u_r = get_union(&resolved, resolved.schema(), fname);
3904 assert_union_equivalent(fname, u_b, u_r);
3905 }
3906 {
3907 let fname = "array_of_union";
3908 let idx_b = baseline_schema.index_of(fname).unwrap();
3909 let idx_r = resolved.schema().index_of(fname).unwrap();
3910 let arr_b = baseline
3911 .column(idx_b)
3912 .as_any()
3913 .downcast_ref::<ListArray>()
3914 .expect("array_of_union should be a List");
3915 let arr_r = resolved
3916 .column(idx_r)
3917 .as_any()
3918 .downcast_ref::<ListArray>()
3919 .expect("array_of_union should be a List");
3920 assert_eq!(
3921 arr_b.value_offsets(),
3922 arr_r.value_offsets(),
3923 "{fname}: list offsets changed after resolution"
3924 );
3925 let u_b = arr_b
3926 .values()
3927 .as_any()
3928 .downcast_ref::<UnionArray>()
3929 .expect("array items should be Union");
3930 let u_r = arr_r
3931 .values()
3932 .as_any()
3933 .downcast_ref::<UnionArray>()
3934 .expect("array items should be Union");
3935 let (ids_b, dict_b) = union_tokens(u_b);
3936 let (ids_r, dict_r) = union_tokens(u_r);
3937 assert_eq!(ids_b.len(), ids_r.len(), "{fname}: values length mismatch");
3938 for (i, (id_b, id_r)) in ids_b.iter().zip(ids_r.iter()).enumerate() {
3939 let w_tok = dict_b.get(id_b).unwrap();
3940 let got = dict_r.get(id_r).unwrap();
3941 assert_eq!(
3942 got, w_tok,
3943 "{fname}: value {i} resolved to wrong branch (writer={w_tok}, got={got})"
3944 );
3945 }
3946 }
3947 {
3948 let fname = "map_of_union";
3949 let idx_b = baseline_schema.index_of(fname).unwrap();
3950 let idx_r = resolved.schema().index_of(fname).unwrap();
3951 let map_b = baseline
3952 .column(idx_b)
3953 .as_any()
3954 .downcast_ref::<MapArray>()
3955 .expect("map_of_union should be a Map");
3956 let map_r = resolved
3957 .column(idx_r)
3958 .as_any()
3959 .downcast_ref::<MapArray>()
3960 .expect("map_of_union should be a Map");
3961 assert_eq!(
3962 map_b.value_offsets(),
3963 map_r.value_offsets(),
3964 "{fname}: map value offsets changed after resolution"
3965 );
3966 let ent_b = map_b.entries();
3967 let ent_r = map_r.entries();
3968 let val_b_any = ent_b.column(1).as_ref();
3969 let val_r_any = ent_r.column(1).as_ref();
3970 let b_union = val_b_any.as_any().downcast_ref::<UnionArray>();
3971 let r_union = val_r_any.as_any().downcast_ref::<UnionArray>();
3972 if let (Some(u_b), Some(u_r)) = (b_union, r_union) {
3973 assert_union_equivalent(fname, u_b, u_r);
3974 } else {
3975 assert_eq!(
3976 val_b_any.data_type(),
3977 val_r_any.data_type(),
3978 "{fname}: value data types differ after resolution"
3979 );
3980 assert_eq!(
3981 val_b_any, val_r_any,
3982 "{fname}: value arrays differ after resolution (nullable value column case)"
3983 );
3984 let value_nullable = |m: &MapArray| -> bool {
3985 match m.data_type() {
3986 DataType::Map(entries_field, _sorted) => match entries_field.data_type() {
3987 DataType::Struct(fields) => {
3988 assert_eq!(fields.len(), 2, "entries struct must have 2 fields");
3989 assert_eq!(fields[0].name(), "key");
3990 assert_eq!(fields[1].name(), "value");
3991 fields[1].is_nullable()
3992 }
3993 other => panic!("Map entries field must be Struct, got {other:?}"),
3994 },
3995 other => panic!("expected Map data type, got {other:?}"),
3996 }
3997 };
3998 assert!(
3999 value_nullable(map_b),
4000 "{fname}: baseline Map value field should be nullable per Arrow spec"
4001 );
4002 assert!(
4003 value_nullable(map_r),
4004 "{fname}: resolved Map value field should be nullable per Arrow spec"
4005 );
4006 }
4007 }
4008 {
4009 let fname = "record_with_union_field";
4010 let idx_b = baseline_schema.index_of(fname).unwrap();
4011 let idx_r = resolved.schema().index_of(fname).unwrap();
4012 let rec_b = baseline
4013 .column(idx_b)
4014 .as_any()
4015 .downcast_ref::<StructArray>()
4016 .expect("record_with_union_field should be a Struct");
4017 let rec_r = resolved
4018 .column(idx_r)
4019 .as_any()
4020 .downcast_ref::<StructArray>()
4021 .expect("record_with_union_field should be a Struct");
4022 let u_b = rec_b
4023 .column_by_name("u")
4024 .unwrap()
4025 .as_any()
4026 .downcast_ref::<UnionArray>()
4027 .expect("field 'u' should be Union (baseline)");
4028 let u_r = rec_r
4029 .column_by_name("u")
4030 .unwrap()
4031 .as_any()
4032 .downcast_ref::<UnionArray>()
4033 .expect("field 'u' should be Union (resolved)");
4034 assert_union_equivalent("record_with_union_field.u", u_b, u_r);
4035 }
4036 }
4037
4038 #[test]
4039 fn test_union_fields_end_to_end_expected_arrays() {
4040 fn tid_by_name(fields: &UnionFields, want: &str) -> i8 {
4041 for (tid, f) in fields.iter() {
4042 if f.name() == want {
4043 return tid;
4044 }
4045 }
4046 panic!("union child '{want}' not found")
4047 }
4048
4049 fn tid_by_dt(fields: &UnionFields, pred: impl Fn(&DataType) -> bool) -> i8 {
4050 for (tid, f) in fields.iter() {
4051 if pred(f.data_type()) {
4052 return tid;
4053 }
4054 }
4055 panic!("no union child matches predicate");
4056 }
4057
// Decodes a textual UUID (hex digits, upper or lower case, with optional `-`
// separators between bytes) into its 16 raw bytes.
//
// Panics with a descriptive message when the input contains a non-hex character,
// ends in the middle of a byte, or does not decode to exactly 16 bytes. A `-` is
// only accepted between bytes; one placed between the two digits of a byte is
// rejected as invalid hex.
fn uuid16_from_str(s: &str) -> [u8; 16] {
    // Value of a single ASCII hex digit.
    fn hex(b: u8) -> u8 {
        match b {
            b'0'..=b'9' => b - b'0',
            b'a'..=b'f' => b - b'a' + 10,
            b'A'..=b'F' => b - b'A' + 10,
            _ => panic!("invalid hex"),
        }
    }

    let mut out = [0u8; 16];
    let mut filled = 0usize;
    let mut rest = s.as_bytes();
    while let Some((&first, after_first)) = rest.split_first() {
        if first == b'-' {
            rest = after_first;
            continue;
        }
        let hi = hex(first);
        // A lone trailing digit is malformed input; report it explicitly rather
        // than failing on an out-of-bounds index.
        let Some((&second, after_pair)) = after_first.split_first() else {
            panic!("uuid has a dangling hex digit");
        };
        let lo = hex(second);
        // Check before writing so over-long input reports the length problem
        // instead of an out-of-bounds index on `out`.
        assert!(filled < out.len(), "uuid must decode to 16 bytes");
        out[filled] = (hi << 4) | lo;
        filled += 1;
        rest = after_pair;
    }
    assert_eq!(filled, 16, "uuid must decode to 16 bytes");
    out
}
4084
4085 fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
4086 match dt {
4087 DataType::Null => Arc::new(NullArray::new(0)),
4088 DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
4089 DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
4090 DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
4091 DataType::Float32 => Arc::new(arrow_array::Float32Array::from(Vec::<f32>::new())),
4092 DataType::Float64 => Arc::new(arrow_array::Float64Array::from(Vec::<f64>::new())),
4093 DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
4094 DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
4095 DataType::Date32 => Arc::new(arrow_array::Date32Array::from(Vec::<i32>::new())),
4096 DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
4097 Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
4098 }
4099 DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
4100 Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
4101 }
4102 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
4103 let a = TimestampMillisecondArray::from(Vec::<i64>::new());
4104 Arc::new(if let Some(tz) = tz {
4105 a.with_timezone(tz.clone())
4106 } else {
4107 a
4108 })
4109 }
4110 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
4111 let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
4112 Arc::new(if let Some(tz) = tz {
4113 a.with_timezone(tz.clone())
4114 } else {
4115 a
4116 })
4117 }
4118 DataType::Interval(IntervalUnit::MonthDayNano) => {
4119 Arc::new(arrow_array::IntervalMonthDayNanoArray::from(Vec::<
4120 IntervalMonthDayNano,
4121 >::new(
4122 )))
4123 }
4124 DataType::FixedSizeBinary(n) => Arc::new(FixedSizeBinaryArray::new_null(*n, 0)),
4125 DataType::Dictionary(k, v) => {
4126 assert_eq!(**k, DataType::Int32, "expect int32 keys for enums");
4127 let keys = Int32Array::from(Vec::<i32>::new());
4128 let values = match v.as_ref() {
4129 DataType::Utf8 => {
4130 Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
4131 }
4132 other => panic!("unexpected dictionary value type {other:?}"),
4133 };
4134 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4135 }
4136 DataType::List(field) => {
4137 let values: ArrayRef = match field.data_type() {
4138 DataType::Int32 => {
4139 Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
4140 }
4141 DataType::Int64 => {
4142 Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
4143 }
4144 DataType::Utf8 => {
4145 Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
4146 }
4147 DataType::Union(_, _) => {
4148 let (uf, _) = if let DataType::Union(f, m) = field.data_type() {
4149 (f.clone(), m)
4150 } else {
4151 unreachable!()
4152 };
4153 let children: Vec<ArrayRef> = uf
4154 .iter()
4155 .map(|(_, f)| empty_child_for(f.data_type()))
4156 .collect();
4157 Arc::new(
4158 UnionArray::try_new(
4159 uf.clone(),
4160 ScalarBuffer::<i8>::from(Vec::<i8>::new()),
4161 Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
4162 children,
4163 )
4164 .unwrap(),
4165 ) as ArrayRef
4166 }
4167 other => panic!("unsupported list item type: {other:?}"),
4168 };
4169 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
4170 Arc::new(ListArray::try_new(field.clone(), offsets, values, None).unwrap())
4171 }
4172 DataType::Map(entry_field, ordered) => {
4173 let DataType::Struct(childs) = entry_field.data_type() else {
4174 panic!("map entries must be struct")
4175 };
4176 let key_field = &childs[0];
4177 let val_field = &childs[1];
4178 assert_eq!(key_field.data_type(), &DataType::Utf8);
4179 let keys = StringArray::from(Vec::<&str>::new());
4180 let vals: ArrayRef = match val_field.data_type() {
4181 DataType::Float64 => {
4182 Arc::new(arrow_array::Float64Array::from(Vec::<f64>::new())) as ArrayRef
4183 }
4184 DataType::Int64 => {
4185 Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
4186 }
4187 DataType::Utf8 => {
4188 Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
4189 }
4190 DataType::Union(uf, _) => {
4191 let ch: Vec<ArrayRef> = uf
4192 .iter()
4193 .map(|(_, f)| empty_child_for(f.data_type()))
4194 .collect();
4195 Arc::new(
4196 UnionArray::try_new(
4197 uf.clone(),
4198 ScalarBuffer::<i8>::from(Vec::<i8>::new()),
4199 Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
4200 ch,
4201 )
4202 .unwrap(),
4203 ) as ArrayRef
4204 }
4205 other => panic!("unsupported map value type: {other:?}"),
4206 };
4207 let entries = StructArray::new(
4208 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4209 vec![Arc::new(keys) as ArrayRef, vals],
4210 None,
4211 );
4212 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
4213 Arc::new(MapArray::new(
4214 entry_field.clone(),
4215 offsets,
4216 entries,
4217 None,
4218 *ordered,
4219 ))
4220 }
4221 other => panic!("empty_child_for: unhandled type {other:?}"),
4222 }
4223 }
4224
4225 fn mk_dense_union(
4226 fields: &UnionFields,
4227 type_ids: Vec<i8>,
4228 offsets: Vec<i32>,
4229 provide: impl Fn(&Field) -> Option<ArrayRef>,
4230 ) -> ArrayRef {
4231 let children: Vec<ArrayRef> = fields
4232 .iter()
4233 .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
4234 .collect();
4235
4236 Arc::new(
4237 UnionArray::try_new(
4238 fields.clone(),
4239 ScalarBuffer::<i8>::from(type_ids),
4240 Some(ScalarBuffer::<i32>::from(offsets)),
4241 children,
4242 )
4243 .unwrap(),
4244 ) as ArrayRef
4245 }
4246
4247 let date_a: i32 = 19_000;
4249 let time_ms_a: i32 = 13 * 3_600_000 + 45 * 60_000 + 30_000 + 123;
4250 let time_us_b: i64 = 23 * 3_600_000_000 + 59 * 60_000_000 + 59 * 1_000_000 + 999_999;
4251 let ts_ms_2024_01_01: i64 = 1_704_067_200_000;
4252 let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1000;
4253 let fx8_a: [u8; 8] = *b"ABCDEFGH";
4255 let fx4_abcd: [u8; 4] = *b"ABCD";
4256 let fx4_misc: [u8; 4] = [0x00, 0x11, 0x22, 0x33];
4257 let fx10_ascii: [u8; 10] = *b"0123456789";
4258 let fx10_aa: [u8; 10] = [0xAA; 10];
4259 let dur_a = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
4261 let dur_b = IntervalMonthDayNanoType::make_value(12, 31, 999_000_000);
4262 let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
4264 let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
4265 let dec_b_scale2_pos: i128 = 123_456; let dec_fix16_neg: i128 = -101; let dec_fix20_s4: i128 = 1_234_567_891_234; let dec_fix20_s4_neg: i128 = -123; let path = "test/data/union_fields.avro";
4271 let actual = read_file(path, 1024, false);
4272 let schema = actual.schema();
4273 let get_union = |name: &str| -> (UnionFields, UnionMode) {
4275 let idx = schema.index_of(name).unwrap();
4276 match schema.field(idx).data_type() {
4277 DataType::Union(f, m) => (f.clone(), *m),
4278 other => panic!("{name} should be a Union, got {other:?}"),
4279 }
4280 };
4281 let mut expected_cols: Vec<ArrayRef> = Vec::with_capacity(schema.fields().len());
4282 expected_cols.push(Arc::new(Int32Array::from(vec![
4284 None,
4285 Some(42),
4286 None,
4287 Some(0),
4288 ])));
4289 expected_cols.push(Arc::new(StringArray::from(vec![
4291 Some("s1"),
4292 None,
4293 Some("s3"),
4294 Some(""),
4295 ])));
4296 {
4298 let (uf, mode) = get_union("union_prim");
4299 assert!(matches!(mode, UnionMode::Dense));
4300 let generated_names: Vec<&str> = uf.iter().map(|(_, f)| f.name().as_str()).collect();
4301 let expected_names = vec![
4302 "boolean", "int", "long", "float", "double", "bytes", "string",
4303 ];
4304 assert_eq!(
4305 generated_names, expected_names,
4306 "Field names for union_prim are incorrect"
4307 );
4308 let tids = vec![
4309 tid_by_name(&uf, "long"),
4310 tid_by_name(&uf, "int"),
4311 tid_by_name(&uf, "float"),
4312 tid_by_name(&uf, "double"),
4313 ];
4314 let offs = vec![0, 0, 0, 0];
4315 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4316 "int" => Some(Arc::new(Int32Array::from(vec![-1])) as ArrayRef),
4317 "long" => Some(Arc::new(Int64Array::from(vec![1_234_567_890_123i64])) as ArrayRef),
4318 "float" => {
4319 Some(Arc::new(arrow_array::Float32Array::from(vec![1.25f32])) as ArrayRef)
4320 }
4321 "double" => {
4322 Some(Arc::new(arrow_array::Float64Array::from(vec![-2.5f64])) as ArrayRef)
4323 }
4324 _ => None,
4325 });
4326 expected_cols.push(arr);
4327 }
4328 {
4330 let (uf, _) = get_union("union_bytes_vs_string");
4331 let tids = vec![
4332 tid_by_name(&uf, "bytes"),
4333 tid_by_name(&uf, "string"),
4334 tid_by_name(&uf, "string"),
4335 tid_by_name(&uf, "bytes"),
4336 ];
4337 let offs = vec![0, 0, 1, 1];
4338 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4339 "bytes" => Some(
4340 Arc::new(BinaryArray::from(vec![&[0x00, 0xFF, 0x7F][..], &[][..]])) as ArrayRef,
4341 ),
4342 "string" => Some(Arc::new(StringArray::from(vec!["hello", "world"])) as ArrayRef),
4343 _ => None,
4344 });
4345 expected_cols.push(arr);
4346 }
4347 {
4349 let (uf, _) = get_union("union_fixed_dur_decfix");
4350 let tid_fx8 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(8)));
4351 let tid_dur = tid_by_dt(&uf, |dt| {
4352 matches!(
4353 dt,
4354 DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano)
4355 )
4356 });
4357 let tid_dec = tid_by_dt(&uf, |dt| match dt {
4358 #[cfg(feature = "small_decimals")]
4359 DataType::Decimal64(10, 2) => true,
4360 DataType::Decimal128(10, 2) | DataType::Decimal256(10, 2) => true,
4361 _ => false,
4362 });
4363 let tids = vec![tid_fx8, tid_dur, tid_dec, tid_dur];
4364 let offs = vec![0, 0, 0, 1];
4365 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4366 DataType::FixedSizeBinary(8) => {
4367 let it = [Some(fx8_a)].into_iter();
4368 Some(Arc::new(
4369 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 8).unwrap(),
4370 ) as ArrayRef)
4371 }
4372 DataType::Interval(IntervalUnit::MonthDayNano) => {
4373 Some(Arc::new(arrow_array::IntervalMonthDayNanoArray::from(vec![
4374 dur_a, dur_b,
4375 ])) as ArrayRef)
4376 }
4377 #[cfg(feature = "small_decimals")]
4378 DataType::Decimal64(10, 2) => {
4379 let a = arrow_array::Decimal64Array::from_iter_values([dec_fix16_neg as i64]);
4380 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4381 }
4382 DataType::Decimal128(10, 2) => {
4383 let a = arrow_array::Decimal128Array::from_iter_values([dec_fix16_neg]);
4384 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4385 }
4386 DataType::Decimal256(10, 2) => {
4387 let a = arrow_array::Decimal256Array::from_iter_values([i256::from_i128(
4388 dec_fix16_neg,
4389 )]);
4390 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4391 }
4392 _ => None,
4393 });
4394 let generated_names: Vec<&str> = uf.iter().map(|(_, f)| f.name().as_str()).collect();
4395 let expected_names = vec!["Fx8", "Dur12", "DecFix16"];
4396 assert_eq!(
4397 generated_names, expected_names,
4398 "Data type names were not generated correctly for union_fixed_dur_decfix"
4399 );
4400 expected_cols.push(arr);
4401 }
4402 {
4404 let (uf, _) = get_union("union_enum_records_array_map");
4405 let tid_enum = tid_by_dt(&uf, |dt| matches!(dt, DataType::Dictionary(_, _)));
4406 let tid_reca = tid_by_dt(&uf, |dt| {
4407 if let DataType::Struct(fs) = dt {
4408 fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b"
4409 } else {
4410 false
4411 }
4412 });
4413 let tid_recb = tid_by_dt(&uf, |dt| {
4414 if let DataType::Struct(fs) = dt {
4415 fs.len() == 2 && fs[0].name() == "x" && fs[1].name() == "y"
4416 } else {
4417 false
4418 }
4419 });
4420 let tid_arr = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
4421 let tids = vec![tid_enum, tid_reca, tid_recb, tid_arr];
4422 let offs = vec![0, 0, 0, 0];
4423 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4424 DataType::Dictionary(_, _) => {
4425 let keys = Int32Array::from(vec![0i32]); let values =
4427 Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
4428 Some(
4429 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4430 as ArrayRef,
4431 )
4432 }
4433 DataType::Struct(fs)
4434 if fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b" =>
4435 {
4436 let a = Int32Array::from(vec![7]);
4437 let b = StringArray::from(vec!["x"]);
4438 Some(Arc::new(StructArray::new(
4439 fs.clone(),
4440 vec![Arc::new(a), Arc::new(b)],
4441 None,
4442 )) as ArrayRef)
4443 }
4444 DataType::Struct(fs)
4445 if fs.len() == 2 && fs[0].name() == "x" && fs[1].name() == "y" =>
4446 {
4447 let x = Int64Array::from(vec![123_456_789i64]);
4448 let y = BinaryArray::from(vec![&[0xFF, 0x00][..]]);
4449 Some(Arc::new(StructArray::new(
4450 fs.clone(),
4451 vec![Arc::new(x), Arc::new(y)],
4452 None,
4453 )) as ArrayRef)
4454 }
4455 DataType::List(field) => {
4456 let values = Int64Array::from(vec![1i64, 2, 3]);
4457 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
4458 Some(Arc::new(
4459 ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
4460 ) as ArrayRef)
4461 }
4462 DataType::Map(_, _) => None,
4463 other => panic!("unexpected child {other:?}"),
4464 });
4465 expected_cols.push(arr);
4466 }
4467 {
4469 let (uf, _) = get_union("union_date_or_fixed4");
4470 let tid_date = tid_by_dt(&uf, |dt| matches!(dt, DataType::Date32));
4471 let tid_fx4 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(4)));
4472 let tids = vec![tid_date, tid_fx4, tid_date, tid_fx4];
4473 let offs = vec![0, 0, 1, 1];
4474 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4475 DataType::Date32 => {
4476 Some(Arc::new(arrow_array::Date32Array::from(vec![date_a, 0])) as ArrayRef)
4477 }
4478 DataType::FixedSizeBinary(4) => {
4479 let it = [Some(fx4_abcd), Some(fx4_misc)].into_iter();
4480 Some(Arc::new(
4481 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
4482 ) as ArrayRef)
4483 }
4484 _ => None,
4485 });
4486 expected_cols.push(arr);
4487 }
4488 {
4490 let (uf, _) = get_union("union_time_millis_or_enum");
4491 let tid_ms = tid_by_dt(&uf, |dt| {
4492 matches!(dt, DataType::Time32(arrow_schema::TimeUnit::Millisecond))
4493 });
4494 let tid_en = tid_by_dt(&uf, |dt| matches!(dt, DataType::Dictionary(_, _)));
4495 let tids = vec![tid_ms, tid_en, tid_en, tid_ms];
4496 let offs = vec![0, 0, 1, 1];
4497 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4498 DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
4499 Some(Arc::new(Time32MillisecondArray::from(vec![time_ms_a, 0])) as ArrayRef)
4500 }
4501 DataType::Dictionary(_, _) => {
4502 let keys = Int32Array::from(vec![0i32, 1]); let values = Arc::new(StringArray::from(vec!["ON", "OFF"])) as ArrayRef;
4504 Some(
4505 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4506 as ArrayRef,
4507 )
4508 }
4509 _ => None,
4510 });
4511 expected_cols.push(arr);
4512 }
4513 {
4515 let (uf, _) = get_union("union_time_micros_or_string");
4516 let tid_us = tid_by_dt(&uf, |dt| {
4517 matches!(dt, DataType::Time64(arrow_schema::TimeUnit::Microsecond))
4518 });
4519 let tid_s = tid_by_name(&uf, "string");
4520 let tids = vec![tid_s, tid_us, tid_s, tid_s];
4521 let offs = vec![0, 0, 1, 2];
4522 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4523 DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
4524 Some(Arc::new(Time64MicrosecondArray::from(vec![time_us_b])) as ArrayRef)
4525 }
4526 DataType::Utf8 => {
4527 Some(Arc::new(StringArray::from(vec!["evening", "night", ""])) as ArrayRef)
4528 }
4529 _ => None,
4530 });
4531 expected_cols.push(arr);
4532 }
4533 {
4535 let (uf, _) = get_union("union_ts_millis_utc_or_array");
4536 let tid_ts = tid_by_dt(&uf, |dt| {
4537 matches!(
4538 dt,
4539 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _)
4540 )
4541 });
4542 let tid_arr = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
4543 let tids = vec![tid_ts, tid_arr, tid_arr, tid_ts];
4544 let offs = vec![0, 0, 1, 1];
4545 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4546 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
4547 let a = TimestampMillisecondArray::from(vec![
4548 ts_ms_2024_01_01,
4549 ts_ms_2024_01_01 + 86_400_000,
4550 ]);
4551 Some(Arc::new(if let Some(tz) = tz {
4552 a.with_timezone(tz.clone())
4553 } else {
4554 a
4555 }) as ArrayRef)
4556 }
4557 DataType::List(field) => {
4558 let values = Int32Array::from(vec![0, 1, 2, -1, 0, 1]);
4559 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 6]));
4560 Some(Arc::new(
4561 ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
4562 ) as ArrayRef)
4563 }
4564 _ => None,
4565 });
4566 expected_cols.push(arr);
4567 }
4568 {
4570 let (uf, _) = get_union("union_ts_micros_local_or_bytes");
4571 let tid_lts = tid_by_dt(&uf, |dt| {
4572 matches!(
4573 dt,
4574 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None)
4575 )
4576 });
4577 let tid_b = tid_by_name(&uf, "bytes");
4578 let tids = vec![tid_b, tid_lts, tid_b, tid_b];
4579 let offs = vec![0, 0, 1, 2];
4580 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4581 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) => Some(Arc::new(
4582 TimestampMicrosecondArray::from(vec![ts_us_2024_01_01]),
4583 )
4584 as ArrayRef),
4585 DataType::Binary => Some(Arc::new(BinaryArray::from(vec![
4586 &b"\x11\x22\x33"[..],
4587 &b"\x00"[..],
4588 &b"\x10\x20\x30\x40"[..],
4589 ])) as ArrayRef),
4590 _ => None,
4591 });
4592 expected_cols.push(arr);
4593 }
4594 {
4596 let (uf, _) = get_union("union_uuid_or_fixed10");
4597 let tid_fx16 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(16)));
4598 let tid_fx10 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(10)));
4599 let tids = vec![tid_fx16, tid_fx10, tid_fx16, tid_fx10];
4600 let offs = vec![0, 0, 1, 1];
4601 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4602 DataType::FixedSizeBinary(16) => {
4603 let it = [Some(uuid1), Some(uuid2)].into_iter();
4604 Some(Arc::new(
4605 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
4606 ) as ArrayRef)
4607 }
4608 DataType::FixedSizeBinary(10) => {
4609 let it = [Some(fx10_ascii), Some(fx10_aa)].into_iter();
4610 Some(Arc::new(
4611 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
4612 ) as ArrayRef)
4613 }
4614 _ => None,
4615 });
4616 expected_cols.push(arr);
4617 }
4618 {
4620 let (uf, _) = get_union("union_dec_bytes_or_dec_fixed");
4621 let tid_b10s2 = tid_by_dt(&uf, |dt| match dt {
4622 #[cfg(feature = "small_decimals")]
4623 DataType::Decimal64(10, 2) => true,
4624 DataType::Decimal128(10, 2) | DataType::Decimal256(10, 2) => true,
4625 _ => false,
4626 });
4627 let tid_f20s4 = tid_by_dt(&uf, |dt| {
4628 matches!(
4629 dt,
4630 DataType::Decimal128(20, 4) | DataType::Decimal256(20, 4)
4631 )
4632 });
4633 let tids = vec![tid_b10s2, tid_f20s4, tid_b10s2, tid_f20s4];
4634 let offs = vec![0, 0, 1, 1];
4635 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4636 #[cfg(feature = "small_decimals")]
4637 DataType::Decimal64(10, 2) => {
4638 let a = Decimal64Array::from_iter_values([dec_b_scale2_pos as i64, 0i64]);
4639 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4640 }
4641 DataType::Decimal128(10, 2) => {
4642 let a = Decimal128Array::from_iter_values([dec_b_scale2_pos, 0]);
4643 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4644 }
4645 DataType::Decimal256(10, 2) => {
4646 let a = Decimal256Array::from_iter_values([
4647 i256::from_i128(dec_b_scale2_pos),
4648 i256::from(0),
4649 ]);
4650 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4651 }
4652 DataType::Decimal128(20, 4) => {
4653 let a = Decimal128Array::from_iter_values([dec_fix20_s4_neg, dec_fix20_s4]);
4654 Some(Arc::new(a.with_precision_and_scale(20, 4).unwrap()) as ArrayRef)
4655 }
4656 DataType::Decimal256(20, 4) => {
4657 let a = Decimal256Array::from_iter_values([
4658 i256::from_i128(dec_fix20_s4_neg),
4659 i256::from_i128(dec_fix20_s4),
4660 ]);
4661 Some(Arc::new(a.with_precision_and_scale(20, 4).unwrap()) as ArrayRef)
4662 }
4663 _ => None,
4664 });
4665 expected_cols.push(arr);
4666 }
4667 {
4669 let (uf, _) = get_union("union_null_bytes_string");
4670 let tid_n = tid_by_name(&uf, "null");
4671 let tid_b = tid_by_name(&uf, "bytes");
4672 let tid_s = tid_by_name(&uf, "string");
4673 let tids = vec![tid_n, tid_b, tid_s, tid_s];
4674 let offs = vec![0, 0, 0, 1];
4675 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4676 "null" => Some(Arc::new(arrow_array::NullArray::new(1)) as ArrayRef),
4677 "bytes" => Some(Arc::new(BinaryArray::from(vec![&b"\x01\x02"[..]])) as ArrayRef),
4678 "string" => Some(Arc::new(StringArray::from(vec!["text", "u"])) as ArrayRef),
4679 _ => None,
4680 });
4681 expected_cols.push(arr);
4682 }
4683 {
4685 let idx = schema.index_of("array_of_union").unwrap();
4686 let dt = schema.field(idx).data_type().clone();
4687 let (item_field, _) = match &dt {
4688 DataType::List(f) => (f.clone(), ()),
4689 other => panic!("array_of_union must be List, got {other:?}"),
4690 };
4691 let (uf, _) = match item_field.data_type() {
4692 DataType::Union(f, m) => (f.clone(), m),
4693 other => panic!("array_of_union items must be Union, got {other:?}"),
4694 };
4695 let tid_l = tid_by_name(&uf, "long");
4696 let tid_s = tid_by_name(&uf, "string");
4697 let type_ids = vec![tid_l, tid_s, tid_l, tid_s, tid_l, tid_l, tid_s, tid_l];
4698 let offsets = vec![0, 0, 1, 1, 2, 3, 2, 4];
4699 let values_union =
4700 mk_dense_union(&uf, type_ids, offsets, |f| match f.name().as_str() {
4701 "long" => {
4702 Some(Arc::new(Int64Array::from(vec![1i64, -5, 42, -1, 0])) as ArrayRef)
4703 }
4704 "string" => Some(Arc::new(StringArray::from(vec!["a", "", "z"])) as ArrayRef),
4705 _ => None,
4706 });
4707 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 6, 8]));
4708 expected_cols.push(Arc::new(
4709 ListArray::try_new(item_field.clone(), list_offsets, values_union, None).unwrap(),
4710 ));
4711 }
4712 {
4714 let idx = schema.index_of("map_of_union").unwrap();
4715 let dt = schema.field(idx).data_type().clone();
4716 let (entry_field, ordered) = match &dt {
4717 DataType::Map(f, ordered) => (f.clone(), *ordered),
4718 other => panic!("map_of_union must be Map, got {other:?}"),
4719 };
4720 let DataType::Struct(entry_fields) = entry_field.data_type() else {
4721 panic!("map entries must be struct")
4722 };
4723 let key_field = entry_fields[0].clone();
4724 let val_field = entry_fields[1].clone();
4725 let keys = StringArray::from(vec!["a", "b", "x", "pi"]);
4726 let rounded_pi = (std::f64::consts::PI * 100_000.0).round() / 100_000.0;
4727 let values: ArrayRef = match val_field.data_type() {
4728 DataType::Union(uf, _) => {
4729 let tid_n = tid_by_name(uf, "null");
4730 let tid_d = tid_by_name(uf, "double");
4731 let tids = vec![tid_n, tid_d, tid_d, tid_d];
4732 let offs = vec![0, 0, 1, 2];
4733 mk_dense_union(uf, tids, offs, |f| match f.name().as_str() {
4734 "null" => Some(Arc::new(NullArray::new(1)) as ArrayRef),
4735 "double" => Some(Arc::new(arrow_array::Float64Array::from(vec![
4736 2.5f64, -0.5f64, rounded_pi,
4737 ])) as ArrayRef),
4738 _ => None,
4739 })
4740 }
4741 DataType::Float64 => Arc::new(arrow_array::Float64Array::from(vec![
4742 None,
4743 Some(2.5),
4744 Some(-0.5),
4745 Some(rounded_pi),
4746 ])),
4747 other => panic!("unexpected map value type {other:?}"),
4748 };
4749 let entries = StructArray::new(
4750 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4751 vec![Arc::new(keys) as ArrayRef, values],
4752 None,
4753 );
4754 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 3, 4]));
4755 expected_cols.push(Arc::new(MapArray::new(
4756 entry_field,
4757 offsets,
4758 entries,
4759 None,
4760 ordered,
4761 )));
4762 }
4763 {
4765 let idx = schema.index_of("record_with_union_field").unwrap();
4766 let DataType::Struct(rec_fields) = schema.field(idx).data_type() else {
4767 panic!("record_with_union_field should be Struct")
4768 };
4769 let id = Int32Array::from(vec![1, 2, 3, 4]);
4770 let u_field = rec_fields.iter().find(|f| f.name() == "u").unwrap();
4771 let DataType::Union(uf, _) = u_field.data_type() else {
4772 panic!("u must be Union")
4773 };
4774 let tid_i = tid_by_name(uf, "int");
4775 let tid_s = tid_by_name(uf, "string");
4776 let tids = vec![tid_s, tid_i, tid_i, tid_s];
4777 let offs = vec![0, 0, 1, 1];
4778 let u = mk_dense_union(uf, tids, offs, |f| match f.name().as_str() {
4779 "int" => Some(Arc::new(Int32Array::from(vec![99, 0])) as ArrayRef),
4780 "string" => Some(Arc::new(StringArray::from(vec!["one", "four"])) as ArrayRef),
4781 _ => None,
4782 });
4783 let rec = StructArray::new(rec_fields.clone(), vec![Arc::new(id) as ArrayRef, u], None);
4784 expected_cols.push(Arc::new(rec));
4785 }
4786 {
4788 let (uf, _) = get_union("union_ts_micros_utc_or_map");
4789 let tid_ts = tid_by_dt(&uf, |dt| {
4790 matches!(
4791 dt,
4792 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some(_))
4793 )
4794 });
4795 let tid_map = tid_by_dt(&uf, |dt| matches!(dt, DataType::Map(_, _)));
4796 let tids = vec![tid_ts, tid_map, tid_ts, tid_map];
4797 let offs = vec![0, 0, 1, 1];
4798 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4799 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
4800 let a = TimestampMicrosecondArray::from(vec![ts_us_2024_01_01, 0i64]);
4801 Some(Arc::new(if let Some(tz) = tz {
4802 a.with_timezone(tz.clone())
4803 } else {
4804 a
4805 }) as ArrayRef)
4806 }
4807 DataType::Map(entry_field, ordered) => {
4808 let DataType::Struct(fs) = entry_field.data_type() else {
4809 panic!("map entries must be struct")
4810 };
4811 let key_field = fs[0].clone();
4812 let val_field = fs[1].clone();
4813 assert_eq!(key_field.data_type(), &DataType::Utf8);
4814 assert_eq!(val_field.data_type(), &DataType::Int64);
4815 let keys = StringArray::from(vec!["k1", "k2", "n"]);
4816 let vals = Int64Array::from(vec![1i64, 2, 0]);
4817 let entries = StructArray::new(
4818 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4819 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
4820 None,
4821 );
4822 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
4823 Some(Arc::new(MapArray::new(
4824 entry_field.clone(),
4825 offsets,
4826 entries,
4827 None,
4828 *ordered,
4829 )) as ArrayRef)
4830 }
4831 _ => None,
4832 });
4833 expected_cols.push(arr);
4834 }
4835 {
4837 let (uf, _) = get_union("union_ts_millis_local_or_string");
4838 let tid_ts = tid_by_dt(&uf, |dt| {
4839 matches!(
4840 dt,
4841 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None)
4842 )
4843 });
4844 let tid_s = tid_by_name(&uf, "string");
4845 let tids = vec![tid_s, tid_ts, tid_s, tid_s];
4846 let offs = vec![0, 0, 1, 2];
4847 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4848 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None) => Some(Arc::new(
4849 TimestampMillisecondArray::from(vec![ts_ms_2024_01_01]),
4850 )
4851 as ArrayRef),
4852 DataType::Utf8 => {
4853 Some(
4854 Arc::new(StringArray::from(vec!["local midnight", "done", ""])) as ArrayRef,
4855 )
4856 }
4857 _ => None,
4858 });
4859 expected_cols.push(arr);
4860 }
4861 {
4863 let (uf, _) = get_union("union_bool_or_string");
4864 let tid_b = tid_by_name(&uf, "boolean");
4865 let tid_s = tid_by_name(&uf, "string");
4866 let tids = vec![tid_b, tid_s, tid_b, tid_s];
4867 let offs = vec![0, 0, 1, 1];
4868 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4869 "boolean" => Some(Arc::new(BooleanArray::from(vec![true, false])) as ArrayRef),
4870 "string" => Some(Arc::new(StringArray::from(vec!["no", "yes"])) as ArrayRef),
4871 _ => None,
4872 });
4873 expected_cols.push(arr);
4874 }
4875 let expected = RecordBatch::try_new(schema.clone(), expected_cols).unwrap();
4876 assert_eq!(
4877 actual, expected,
4878 "full end-to-end equality for union_fields.avro"
4879 );
4880 }
4881
4882 #[test]
4883 fn test_read_zero_byte_avro_file() {
4884 let batch = read_file("test/data/zero_byte.avro", 3, false);
4885 let schema = batch.schema();
4886 assert_eq!(schema.fields().len(), 1);
4887 let field = schema.field(0);
4888 assert_eq!(field.name(), "data");
4889 assert_eq!(field.data_type(), &DataType::Binary);
4890 assert!(field.is_nullable());
4891 assert_eq!(batch.num_rows(), 3);
4892 assert_eq!(batch.num_columns(), 1);
4893 let binary_array = batch
4894 .column(0)
4895 .as_any()
4896 .downcast_ref::<BinaryArray>()
4897 .unwrap();
4898 assert!(binary_array.is_null(0));
4899 assert!(binary_array.is_valid(1));
4900 assert_eq!(binary_array.value(1), b"");
4901 assert!(binary_array.is_valid(2));
4902 assert_eq!(binary_array.value(2), b"some bytes");
4903 }
4904
4905 #[test]
4906 fn test_alltypes() {
4907 let expected = RecordBatch::try_from_iter_with_nullable([
4908 (
4909 "id",
4910 Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
4911 true,
4912 ),
4913 (
4914 "bool_col",
4915 Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
4916 true,
4917 ),
4918 (
4919 "tinyint_col",
4920 Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
4921 true,
4922 ),
4923 (
4924 "smallint_col",
4925 Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
4926 true,
4927 ),
4928 (
4929 "int_col",
4930 Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
4931 true,
4932 ),
4933 (
4934 "bigint_col",
4935 Arc::new(Int64Array::from_iter_values((0..8).map(|x| (x % 2) * 10))) as _,
4936 true,
4937 ),
4938 (
4939 "float_col",
4940 Arc::new(Float32Array::from_iter_values(
4941 (0..8).map(|x| (x % 2) as f32 * 1.1),
4942 )) as _,
4943 true,
4944 ),
4945 (
4946 "double_col",
4947 Arc::new(Float64Array::from_iter_values(
4948 (0..8).map(|x| (x % 2) as f64 * 10.1),
4949 )) as _,
4950 true,
4951 ),
4952 (
4953 "date_string_col",
4954 Arc::new(BinaryArray::from_iter_values([
4955 [48, 51, 47, 48, 49, 47, 48, 57],
4956 [48, 51, 47, 48, 49, 47, 48, 57],
4957 [48, 52, 47, 48, 49, 47, 48, 57],
4958 [48, 52, 47, 48, 49, 47, 48, 57],
4959 [48, 50, 47, 48, 49, 47, 48, 57],
4960 [48, 50, 47, 48, 49, 47, 48, 57],
4961 [48, 49, 47, 48, 49, 47, 48, 57],
4962 [48, 49, 47, 48, 49, 47, 48, 57],
4963 ])) as _,
4964 true,
4965 ),
4966 (
4967 "string_col",
4968 Arc::new(BinaryArray::from_iter_values((0..8).map(|x| [48 + x % 2]))) as _,
4969 true,
4970 ),
4971 (
4972 "timestamp_col",
4973 Arc::new(
4974 TimestampMicrosecondArray::from_iter_values([
4975 1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000, ])
4984 .with_timezone("+00:00"),
4985 ) as _,
4986 true,
4987 ),
4988 ])
4989 .unwrap();
4990
4991 for file in files() {
4992 let file = arrow_test_data(file);
4993
4994 assert_eq!(read_file(&file, 8, false), expected);
4995 assert_eq!(read_file(&file, 3, false), expected);
4996 }
4997 }
4998
4999 #[test]
5000 #[cfg(feature = "snappy")]
5002 fn test_alltypes_dictionary() {
5003 let file = "avro/alltypes_dictionary.avro";
5004 let expected = RecordBatch::try_from_iter_with_nullable([
5005 ("id", Arc::new(Int32Array::from(vec![0, 1])) as _, true),
5006 (
5007 "bool_col",
5008 Arc::new(BooleanArray::from(vec![Some(true), Some(false)])) as _,
5009 true,
5010 ),
5011 (
5012 "tinyint_col",
5013 Arc::new(Int32Array::from(vec![0, 1])) as _,
5014 true,
5015 ),
5016 (
5017 "smallint_col",
5018 Arc::new(Int32Array::from(vec![0, 1])) as _,
5019 true,
5020 ),
5021 ("int_col", Arc::new(Int32Array::from(vec![0, 1])) as _, true),
5022 (
5023 "bigint_col",
5024 Arc::new(Int64Array::from(vec![0, 10])) as _,
5025 true,
5026 ),
5027 (
5028 "float_col",
5029 Arc::new(Float32Array::from(vec![0.0, 1.1])) as _,
5030 true,
5031 ),
5032 (
5033 "double_col",
5034 Arc::new(Float64Array::from(vec![0.0, 10.1])) as _,
5035 true,
5036 ),
5037 (
5038 "date_string_col",
5039 Arc::new(BinaryArray::from_iter_values([b"01/01/09", b"01/01/09"])) as _,
5040 true,
5041 ),
5042 (
5043 "string_col",
5044 Arc::new(BinaryArray::from_iter_values([b"0", b"1"])) as _,
5045 true,
5046 ),
5047 (
5048 "timestamp_col",
5049 Arc::new(
5050 TimestampMicrosecondArray::from_iter_values([
5051 1230768000000000, 1230768060000000, ])
5054 .with_timezone("+00:00"),
5055 ) as _,
5056 true,
5057 ),
5058 ])
5059 .unwrap();
5060 let file_path = arrow_test_data(file);
5061 let batch_large = read_file(&file_path, 8, false);
5062 assert_eq!(
5063 batch_large, expected,
5064 "Decoded RecordBatch does not match for file {file}"
5065 );
5066 let batch_small = read_file(&file_path, 3, false);
5067 assert_eq!(
5068 batch_small, expected,
5069 "Decoded RecordBatch (batch size 3) does not match for file {file}"
5070 );
5071 }
5072
5073 #[test]
5074 fn test_alltypes_nulls_plain() {
5075 let file = "avro/alltypes_nulls_plain.avro";
5076 let expected = RecordBatch::try_from_iter_with_nullable([
5077 (
5078 "string_col",
5079 Arc::new(StringArray::from(vec![None::<&str>])) as _,
5080 true,
5081 ),
5082 ("int_col", Arc::new(Int32Array::from(vec![None])) as _, true),
5083 (
5084 "bool_col",
5085 Arc::new(BooleanArray::from(vec![None])) as _,
5086 true,
5087 ),
5088 (
5089 "bigint_col",
5090 Arc::new(Int64Array::from(vec![None])) as _,
5091 true,
5092 ),
5093 (
5094 "float_col",
5095 Arc::new(Float32Array::from(vec![None])) as _,
5096 true,
5097 ),
5098 (
5099 "double_col",
5100 Arc::new(Float64Array::from(vec![None])) as _,
5101 true,
5102 ),
5103 (
5104 "bytes_col",
5105 Arc::new(BinaryArray::from(vec![None::<&[u8]>])) as _,
5106 true,
5107 ),
5108 ])
5109 .unwrap();
5110 let file_path = arrow_test_data(file);
5111 let batch_large = read_file(&file_path, 8, false);
5112 assert_eq!(
5113 batch_large, expected,
5114 "Decoded RecordBatch does not match for file {file}"
5115 );
5116 let batch_small = read_file(&file_path, 3, false);
5117 assert_eq!(
5118 batch_small, expected,
5119 "Decoded RecordBatch (batch size 3) does not match for file {file}"
5120 );
5121 }
5122
5123 #[test]
5124 #[cfg(feature = "snappy")]
5126 fn test_binary() {
5127 let file = arrow_test_data("avro/binary.avro");
5128 let batch = read_file(&file, 8, false);
5129 let expected = RecordBatch::try_from_iter_with_nullable([(
5130 "foo",
5131 Arc::new(BinaryArray::from_iter_values(vec![
5132 b"\x00" as &[u8],
5133 b"\x01" as &[u8],
5134 b"\x02" as &[u8],
5135 b"\x03" as &[u8],
5136 b"\x04" as &[u8],
5137 b"\x05" as &[u8],
5138 b"\x06" as &[u8],
5139 b"\x07" as &[u8],
5140 b"\x08" as &[u8],
5141 b"\t" as &[u8],
5142 b"\n" as &[u8],
5143 b"\x0b" as &[u8],
5144 ])) as Arc<dyn Array>,
5145 true,
5146 )])
5147 .unwrap();
5148 assert_eq!(batch, expected);
5149 }
5150
5151 #[test]
5152 #[cfg(feature = "snappy")]
5154 fn test_decimal() {
5155 #[cfg(feature = "small_decimals")]
5159 let files: [(&str, DataType, HashMap<String, String>); 8] = [
5160 (
5161 "avro/fixed_length_decimal.avro",
5162 DataType::Decimal128(25, 2),
5163 HashMap::from([
5164 (
5165 "avro.namespace".to_string(),
5166 "topLevelRecord.value".to_string(),
5167 ),
5168 ("avro.name".to_string(), "fixed".to_string()),
5169 ]),
5170 ),
5171 (
5172 "avro/fixed_length_decimal_legacy.avro",
5173 DataType::Decimal64(13, 2),
5174 HashMap::from([
5175 (
5176 "avro.namespace".to_string(),
5177 "topLevelRecord.value".to_string(),
5178 ),
5179 ("avro.name".to_string(), "fixed".to_string()),
5180 ]),
5181 ),
5182 (
5183 "avro/int32_decimal.avro",
5184 DataType::Decimal32(4, 2),
5185 HashMap::from([
5186 (
5187 "avro.namespace".to_string(),
5188 "topLevelRecord.value".to_string(),
5189 ),
5190 ("avro.name".to_string(), "fixed".to_string()),
5191 ]),
5192 ),
5193 (
5194 "avro/int64_decimal.avro",
5195 DataType::Decimal64(10, 2),
5196 HashMap::from([
5197 (
5198 "avro.namespace".to_string(),
5199 "topLevelRecord.value".to_string(),
5200 ),
5201 ("avro.name".to_string(), "fixed".to_string()),
5202 ]),
5203 ),
5204 (
5205 "test/data/int256_decimal.avro",
5206 DataType::Decimal256(76, 10),
5207 HashMap::new(),
5208 ),
5209 (
5210 "test/data/fixed256_decimal.avro",
5211 DataType::Decimal256(76, 10),
5212 HashMap::from([("avro.name".to_string(), "Decimal256Fixed".to_string())]),
5213 ),
5214 (
5215 "test/data/fixed_length_decimal_legacy_32.avro",
5216 DataType::Decimal32(9, 2),
5217 HashMap::from([("avro.name".to_string(), "Decimal32FixedLegacy".to_string())]),
5218 ),
5219 (
5220 "test/data/int128_decimal.avro",
5221 DataType::Decimal128(38, 2),
5222 HashMap::new(),
5223 ),
5224 ];
5225 #[cfg(not(feature = "small_decimals"))]
5226 let files: [(&str, DataType, HashMap<String, String>); 8] = [
5227 (
5228 "avro/fixed_length_decimal.avro",
5229 DataType::Decimal128(25, 2),
5230 HashMap::from([
5231 (
5232 "avro.namespace".to_string(),
5233 "topLevelRecord.value".to_string(),
5234 ),
5235 ("avro.name".to_string(), "fixed".to_string()),
5236 ]),
5237 ),
5238 (
5239 "avro/fixed_length_decimal_legacy.avro",
5240 DataType::Decimal128(13, 2),
5241 HashMap::from([
5242 (
5243 "avro.namespace".to_string(),
5244 "topLevelRecord.value".to_string(),
5245 ),
5246 ("avro.name".to_string(), "fixed".to_string()),
5247 ]),
5248 ),
5249 (
5250 "avro/int32_decimal.avro",
5251 DataType::Decimal128(4, 2),
5252 HashMap::from([
5253 (
5254 "avro.namespace".to_string(),
5255 "topLevelRecord.value".to_string(),
5256 ),
5257 ("avro.name".to_string(), "fixed".to_string()),
5258 ]),
5259 ),
5260 (
5261 "avro/int64_decimal.avro",
5262 DataType::Decimal128(10, 2),
5263 HashMap::from([
5264 (
5265 "avro.namespace".to_string(),
5266 "topLevelRecord.value".to_string(),
5267 ),
5268 ("avro.name".to_string(), "fixed".to_string()),
5269 ]),
5270 ),
5271 (
5272 "test/data/int256_decimal.avro",
5273 DataType::Decimal256(76, 10),
5274 HashMap::new(),
5275 ),
5276 (
5277 "test/data/fixed256_decimal.avro",
5278 DataType::Decimal256(76, 10),
5279 HashMap::from([("avro.name".to_string(), "Decimal256Fixed".to_string())]),
5280 ),
5281 (
5282 "test/data/fixed_length_decimal_legacy_32.avro",
5283 DataType::Decimal128(9, 2),
5284 HashMap::from([("avro.name".to_string(), "Decimal32FixedLegacy".to_string())]),
5285 ),
5286 (
5287 "test/data/int128_decimal.avro",
5288 DataType::Decimal128(38, 2),
5289 HashMap::new(),
5290 ),
5291 ];
5292 for (file, expected_dt, mut metadata) in files {
5293 let (precision, scale) = match expected_dt {
5294 DataType::Decimal32(p, s)
5295 | DataType::Decimal64(p, s)
5296 | DataType::Decimal128(p, s)
5297 | DataType::Decimal256(p, s) => (p, s),
5298 _ => unreachable!("Unexpected decimal type in test inputs"),
5299 };
5300 assert!(scale >= 0, "test data uses non-negative scales only");
5301 let scale_u32 = scale as u32;
5302 let file_path: String = if file.starts_with("avro/") {
5303 arrow_test_data(file)
5304 } else {
5305 std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
5306 .join(file)
5307 .to_string_lossy()
5308 .into_owned()
5309 };
5310 let pow10: i128 = 10i128.pow(scale_u32);
5311 let values_i128: Vec<i128> = (1..=24).map(|n| (n as i128) * pow10).collect();
5312 let build_expected = |dt: &DataType, values: &[i128]| -> ArrayRef {
5313 match *dt {
5314 #[cfg(feature = "small_decimals")]
5315 DataType::Decimal32(p, s) => {
5316 let it = values.iter().map(|&v| v as i32);
5317 Arc::new(
5318 Decimal32Array::from_iter_values(it)
5319 .with_precision_and_scale(p, s)
5320 .unwrap(),
5321 )
5322 }
5323 #[cfg(feature = "small_decimals")]
5324 DataType::Decimal64(p, s) => {
5325 let it = values.iter().map(|&v| v as i64);
5326 Arc::new(
5327 Decimal64Array::from_iter_values(it)
5328 .with_precision_and_scale(p, s)
5329 .unwrap(),
5330 )
5331 }
5332 DataType::Decimal128(p, s) => {
5333 let it = values.iter().copied();
5334 Arc::new(
5335 Decimal128Array::from_iter_values(it)
5336 .with_precision_and_scale(p, s)
5337 .unwrap(),
5338 )
5339 }
5340 DataType::Decimal256(p, s) => {
5341 let it = values.iter().map(|&v| i256::from_i128(v));
5342 Arc::new(
5343 Decimal256Array::from_iter_values(it)
5344 .with_precision_and_scale(p, s)
5345 .unwrap(),
5346 )
5347 }
5348 _ => unreachable!("Unexpected decimal type in test"),
5349 }
5350 };
5351 let actual_batch = read_file(&file_path, 8, false);
5352 let actual_nullable = actual_batch.schema().field(0).is_nullable();
5353 let expected_array = build_expected(&expected_dt, &values_i128);
5354 metadata.insert("precision".to_string(), precision.to_string());
5355 metadata.insert("scale".to_string(), scale.to_string());
5356 let field =
5357 Field::new("value", expected_dt.clone(), actual_nullable).with_metadata(metadata);
5358 let expected_schema = Arc::new(Schema::new(vec![field]));
5359 let expected_batch =
5360 RecordBatch::try_new(expected_schema.clone(), vec![expected_array]).unwrap();
5361 assert_eq!(
5362 actual_batch, expected_batch,
5363 "Decoded RecordBatch does not match for {file}"
5364 );
5365 let actual_batch_small = read_file(&file_path, 3, false);
5366 assert_eq!(
5367 actual_batch_small, expected_batch,
5368 "Decoded RecordBatch does not match for {file} with batch size 3"
5369 );
5370 }
5371 }
5372
5373 #[test]
5374 fn test_read_duration_logical_types_feature_toggle() -> Result<(), ArrowError> {
5375 let file_path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
5376 .join("test/data/duration_logical_types.avro")
5377 .to_string_lossy()
5378 .into_owned();
5379
5380 let actual_batch = read_file(&file_path, 4, false);
5381
5382 let expected_batch = {
5383 #[cfg(feature = "avro_custom_types")]
5384 {
5385 let schema = Arc::new(Schema::new(vec![
5386 Field::new(
5387 "duration_time_nanos",
5388 DataType::Duration(TimeUnit::Nanosecond),
5389 false,
5390 ),
5391 Field::new(
5392 "duration_time_micros",
5393 DataType::Duration(TimeUnit::Microsecond),
5394 false,
5395 ),
5396 Field::new(
5397 "duration_time_millis",
5398 DataType::Duration(TimeUnit::Millisecond),
5399 false,
5400 ),
5401 Field::new(
5402 "duration_time_seconds",
5403 DataType::Duration(TimeUnit::Second),
5404 false,
5405 ),
5406 ]));
5407
5408 let nanos = Arc::new(PrimitiveArray::<DurationNanosecondType>::from(vec![
5409 10, 20, 30, 40,
5410 ])) as ArrayRef;
5411 let micros = Arc::new(PrimitiveArray::<DurationMicrosecondType>::from(vec![
5412 100, 200, 300, 400,
5413 ])) as ArrayRef;
5414 let millis = Arc::new(PrimitiveArray::<DurationMillisecondType>::from(vec![
5415 1000, 2000, 3000, 4000,
5416 ])) as ArrayRef;
5417 let seconds = Arc::new(PrimitiveArray::<DurationSecondType>::from(vec![1, 2, 3, 4]))
5418 as ArrayRef;
5419
5420 RecordBatch::try_new(schema, vec![nanos, micros, millis, seconds])?
5421 }
5422 #[cfg(not(feature = "avro_custom_types"))]
5423 {
5424 let schema = Arc::new(Schema::new(vec![
5425 Field::new("duration_time_nanos", DataType::Int64, false).with_metadata(
5426 [(
5427 "logicalType".to_string(),
5428 "arrow.duration-nanos".to_string(),
5429 )]
5430 .into(),
5431 ),
5432 Field::new("duration_time_micros", DataType::Int64, false).with_metadata(
5433 [(
5434 "logicalType".to_string(),
5435 "arrow.duration-micros".to_string(),
5436 )]
5437 .into(),
5438 ),
5439 Field::new("duration_time_millis", DataType::Int64, false).with_metadata(
5440 [(
5441 "logicalType".to_string(),
5442 "arrow.duration-millis".to_string(),
5443 )]
5444 .into(),
5445 ),
5446 Field::new("duration_time_seconds", DataType::Int64, false).with_metadata(
5447 [(
5448 "logicalType".to_string(),
5449 "arrow.duration-seconds".to_string(),
5450 )]
5451 .into(),
5452 ),
5453 ]));
5454
5455 let nanos =
5456 Arc::new(PrimitiveArray::<Int64Type>::from(vec![10, 20, 30, 40])) as ArrayRef;
5457 let micros = Arc::new(PrimitiveArray::<Int64Type>::from(vec![100, 200, 300, 400]))
5458 as ArrayRef;
5459 let millis = Arc::new(PrimitiveArray::<Int64Type>::from(vec![
5460 1000, 2000, 3000, 4000,
5461 ])) as ArrayRef;
5462 let seconds =
5463 Arc::new(PrimitiveArray::<Int64Type>::from(vec![1, 2, 3, 4])) as ArrayRef;
5464
5465 RecordBatch::try_new(schema, vec![nanos, micros, millis, seconds])?
5466 }
5467 };
5468
5469 assert_eq!(actual_batch, expected_batch);
5470
5471 Ok(())
5472 }
5473
5474 #[test]
5475 #[cfg(feature = "snappy")]
5477 fn test_dict_pages_offset_zero() {
5478 let file = arrow_test_data("avro/dict-page-offset-zero.avro");
5479 let batch = read_file(&file, 32, false);
5480 let num_rows = batch.num_rows();
5481 let expected_field = Int32Array::from(vec![Some(1552); num_rows]);
5482 let expected = RecordBatch::try_from_iter_with_nullable([(
5483 "l_partkey",
5484 Arc::new(expected_field) as Arc<dyn Array>,
5485 true,
5486 )])
5487 .unwrap();
5488 assert_eq!(batch, expected);
5489 }
5490
5491 #[test]
5492 #[cfg(feature = "snappy")]
5494 fn test_list_columns() {
5495 let file = arrow_test_data("avro/list_columns.avro");
5496 let mut int64_list_builder = ListBuilder::new(Int64Builder::new());
5497 {
5498 {
5499 let values = int64_list_builder.values();
5500 values.append_value(1);
5501 values.append_value(2);
5502 values.append_value(3);
5503 }
5504 int64_list_builder.append(true);
5505 }
5506 {
5507 {
5508 let values = int64_list_builder.values();
5509 values.append_null();
5510 values.append_value(1);
5511 }
5512 int64_list_builder.append(true);
5513 }
5514 {
5515 {
5516 let values = int64_list_builder.values();
5517 values.append_value(4);
5518 }
5519 int64_list_builder.append(true);
5520 }
5521 let int64_list = int64_list_builder.finish();
5522 let mut utf8_list_builder = ListBuilder::new(StringBuilder::new());
5523 {
5524 {
5525 let values = utf8_list_builder.values();
5526 values.append_value("abc");
5527 values.append_value("efg");
5528 values.append_value("hij");
5529 }
5530 utf8_list_builder.append(true);
5531 }
5532 {
5533 utf8_list_builder.append(false);
5534 }
5535 {
5536 {
5537 let values = utf8_list_builder.values();
5538 values.append_value("efg");
5539 values.append_null();
5540 values.append_value("hij");
5541 values.append_value("xyz");
5542 }
5543 utf8_list_builder.append(true);
5544 }
5545 let utf8_list = utf8_list_builder.finish();
5546 let expected = RecordBatch::try_from_iter_with_nullable([
5547 ("int64_list", Arc::new(int64_list) as Arc<dyn Array>, true),
5548 ("utf8_list", Arc::new(utf8_list) as Arc<dyn Array>, true),
5549 ])
5550 .unwrap();
5551 let batch = read_file(&file, 8, false);
5552 assert_eq!(batch, expected);
5553 }
5554
5555 #[test]
5556 #[cfg(feature = "snappy")]
5557 fn test_nested_lists() {
5558 use arrow_data::ArrayDataBuilder;
5559 let file = arrow_test_data("avro/nested_lists.snappy.avro");
5560 let inner_values = StringArray::from(vec![
5561 Some("a"),
5562 Some("b"),
5563 Some("c"),
5564 Some("d"),
5565 Some("a"),
5566 Some("b"),
5567 Some("c"),
5568 Some("d"),
5569 Some("e"),
5570 Some("a"),
5571 Some("b"),
5572 Some("c"),
5573 Some("d"),
5574 Some("e"),
5575 Some("f"),
5576 ]);
5577 let inner_offsets = Buffer::from_slice_ref([0, 2, 3, 3, 4, 6, 8, 8, 9, 11, 13, 14, 14, 15]);
5578 let inner_validity = [
5579 true, true, false, true, true, true, false, true, true, true, true, false, true,
5580 ];
5581 let inner_null_buffer = Buffer::from_iter(inner_validity.iter().copied());
5582 let inner_field = Field::new("item", DataType::Utf8, true);
5583 let inner_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(inner_field)))
5584 .len(13)
5585 .add_buffer(inner_offsets)
5586 .add_child_data(inner_values.to_data())
5587 .null_bit_buffer(Some(inner_null_buffer))
5588 .build()
5589 .unwrap();
5590 let inner_list_array = ListArray::from(inner_list_data);
5591 let middle_offsets = Buffer::from_slice_ref([0, 2, 4, 6, 8, 11, 13]);
5592 let middle_validity = [true; 6];
5593 let middle_null_buffer = Buffer::from_iter(middle_validity.iter().copied());
5594 let middle_field = Field::new("item", inner_list_array.data_type().clone(), true);
5595 let middle_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(middle_field)))
5596 .len(6)
5597 .add_buffer(middle_offsets)
5598 .add_child_data(inner_list_array.to_data())
5599 .null_bit_buffer(Some(middle_null_buffer))
5600 .build()
5601 .unwrap();
5602 let middle_list_array = ListArray::from(middle_list_data);
5603 let outer_offsets = Buffer::from_slice_ref([0, 2, 4, 6]);
5604 let outer_null_buffer = Buffer::from_slice_ref([0b111]); let outer_field = Field::new("item", middle_list_array.data_type().clone(), true);
5606 let outer_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(outer_field)))
5607 .len(3)
5608 .add_buffer(outer_offsets)
5609 .add_child_data(middle_list_array.to_data())
5610 .null_bit_buffer(Some(outer_null_buffer))
5611 .build()
5612 .unwrap();
5613 let a_expected = ListArray::from(outer_list_data);
5614 let b_expected = Int32Array::from(vec![1, 1, 1]);
5615 let expected = RecordBatch::try_from_iter_with_nullable([
5616 ("a", Arc::new(a_expected) as Arc<dyn Array>, true),
5617 ("b", Arc::new(b_expected) as Arc<dyn Array>, true),
5618 ])
5619 .unwrap();
5620 let left = read_file(&file, 8, false);
5621 assert_eq!(left, expected, "Mismatch for batch size=8");
5622 let left_small = read_file(&file, 3, false);
5623 assert_eq!(left_small, expected, "Mismatch for batch size=3");
5624 }
5625
5626 #[test]
5627 fn test_simple() {
5628 let tests = [
5629 ("avro/simple_enum.avro", 4, build_expected_enum(), 2),
5630 ("avro/simple_fixed.avro", 2, build_expected_fixed(), 1),
5631 ];
5632
5633 fn build_expected_enum() -> RecordBatch {
5634 let keys_f1 = Int32Array::from(vec![0, 1, 2, 3]);
5636 let vals_f1 = StringArray::from(vec!["a", "b", "c", "d"]);
5637 let f1_dict =
5638 DictionaryArray::<Int32Type>::try_new(keys_f1, Arc::new(vals_f1)).unwrap();
5639 let keys_f2 = Int32Array::from(vec![2, 3, 0, 1]);
5640 let vals_f2 = StringArray::from(vec!["e", "f", "g", "h"]);
5641 let f2_dict =
5642 DictionaryArray::<Int32Type>::try_new(keys_f2, Arc::new(vals_f2)).unwrap();
5643 let keys_f3 = Int32Array::from(vec![Some(1), Some(2), None, Some(0)]);
5644 let vals_f3 = StringArray::from(vec!["i", "j", "k"]);
5645 let f3_dict =
5646 DictionaryArray::<Int32Type>::try_new(keys_f3, Arc::new(vals_f3)).unwrap();
5647 let dict_type =
5648 DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
5649 let mut md_f1 = HashMap::new();
5650 md_f1.insert(
5651 AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5652 r#"["a","b","c","d"]"#.to_string(),
5653 );
5654 md_f1.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum1".to_string());
5655 md_f1.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns1".to_string());
5656 let f1_field = Field::new("f1", dict_type.clone(), false).with_metadata(md_f1);
5657 let mut md_f2 = HashMap::new();
5658 md_f2.insert(
5659 AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5660 r#"["e","f","g","h"]"#.to_string(),
5661 );
5662 md_f2.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum2".to_string());
5663 md_f2.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns2".to_string());
5664 let f2_field = Field::new("f2", dict_type.clone(), false).with_metadata(md_f2);
5665 let mut md_f3 = HashMap::new();
5666 md_f3.insert(
5667 AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5668 r#"["i","j","k"]"#.to_string(),
5669 );
5670 md_f3.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum3".to_string());
5671 md_f3.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns1".to_string());
5672 let f3_field = Field::new("f3", dict_type.clone(), true).with_metadata(md_f3);
5673 let expected_schema = Arc::new(Schema::new(vec![f1_field, f2_field, f3_field]));
5674 RecordBatch::try_new(
5675 expected_schema,
5676 vec![
5677 Arc::new(f1_dict) as Arc<dyn Array>,
5678 Arc::new(f2_dict) as Arc<dyn Array>,
5679 Arc::new(f3_dict) as Arc<dyn Array>,
5680 ],
5681 )
5682 .unwrap()
5683 }
5684
5685 fn build_expected_fixed() -> RecordBatch {
5686 let f1 =
5687 FixedSizeBinaryArray::try_from_iter(vec![b"abcde", b"12345"].into_iter()).unwrap();
5688 let f2 =
5689 FixedSizeBinaryArray::try_from_iter(vec![b"fghijklmno", b"1234567890"].into_iter())
5690 .unwrap();
5691 let f3 = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
5692 vec![Some(b"ABCDEF" as &[u8]), None].into_iter(),
5693 6,
5694 )
5695 .unwrap();
5696
5697 let mut md_f1 = HashMap::new();
5699 md_f1.insert(
5700 crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5701 "fixed1".to_string(),
5702 );
5703 md_f1.insert(
5704 crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5705 "ns1".to_string(),
5706 );
5707
5708 let mut md_f2 = HashMap::new();
5709 md_f2.insert(
5710 crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5711 "fixed2".to_string(),
5712 );
5713 md_f2.insert(
5714 crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5715 "ns2".to_string(),
5716 );
5717
5718 let mut md_f3 = HashMap::new();
5719 md_f3.insert(
5720 crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5721 "fixed3".to_string(),
5722 );
5723 md_f3.insert(
5724 crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5725 "ns1".to_string(),
5726 );
5727
5728 let expected_schema = Arc::new(Schema::new(vec![
5729 Field::new("f1", DataType::FixedSizeBinary(5), false).with_metadata(md_f1),
5730 Field::new("f2", DataType::FixedSizeBinary(10), false).with_metadata(md_f2),
5731 Field::new("f3", DataType::FixedSizeBinary(6), true).with_metadata(md_f3),
5732 ]));
5733
5734 RecordBatch::try_new(
5735 expected_schema,
5736 vec![
5737 Arc::new(f1) as Arc<dyn Array>,
5738 Arc::new(f2) as Arc<dyn Array>,
5739 Arc::new(f3) as Arc<dyn Array>,
5740 ],
5741 )
5742 .unwrap()
5743 }
5744 for (file_name, batch_size, expected, alt_batch_size) in tests {
5745 let file = arrow_test_data(file_name);
5746 let actual = read_file(&file, batch_size, false);
5747 assert_eq!(actual, expected);
5748 let actual2 = read_file(&file, alt_batch_size, false);
5749 assert_eq!(actual2, expected);
5750 }
5751 }
5752
5753 #[test]
5754 #[cfg(feature = "snappy")]
5755 fn test_single_nan() {
5756 let file = arrow_test_data("avro/single_nan.avro");
5757 let actual = read_file(&file, 1, false);
5758 use arrow_array::Float64Array;
5759 let schema = Arc::new(Schema::new(vec![Field::new(
5760 "mycol",
5761 DataType::Float64,
5762 true,
5763 )]));
5764 let col = Float64Array::from(vec![None]);
5765 let expected = RecordBatch::try_new(schema, vec![Arc::new(col)]).unwrap();
5766 assert_eq!(actual, expected);
5767 let actual2 = read_file(&file, 2, false);
5768 assert_eq!(actual2, expected);
5769 }
5770
#[test]
fn test_duration_uuid() {
    // Read the fixture with a batch size of 4, then verify the schema and every
    // decoded value of both columns.
    let batch = read_file("test/data/duration_uuid.avro", 4, false);

    let schema = batch.schema();
    let fields = schema.fields();
    assert_eq!(fields.len(), 2);
    let (duration_field, uuid_field) = (&fields[0], &fields[1]);
    assert_eq!(duration_field.name(), "duration_field");
    assert_eq!(
        duration_field.data_type(),
        &DataType::Interval(IntervalUnit::MonthDayNano)
    );
    assert_eq!(uuid_field.name(), "uuid_field");
    assert_eq!(uuid_field.data_type(), &DataType::FixedSizeBinary(16));
    assert_eq!(batch.num_rows(), 4);
    assert_eq!(batch.num_columns(), 2);

    // Column 0: intervals, listed as (months, days, nanoseconds).
    let durations = batch
        .column(0)
        .as_any()
        .downcast_ref::<IntervalMonthDayNanoArray>()
        .unwrap();
    let duration_rows: [(i32, i32, i64); 4] = [
        (1, 15, 500_000_000),
        (0, 5, 2_500_000_000),
        (2, 0, 0),
        (12, 31, 999_000_000),
    ];
    let expected_durations: IntervalMonthDayNanoArray = duration_rows
        .iter()
        .map(|&(months, days, nanos)| {
            Some(IntervalMonthDayNanoType::make_value(months, days, nanos))
        })
        .collect();
    assert_eq!(&expected_durations, durations);

    // Column 1: four 16-byte values.
    let uuids = batch
        .column(1)
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .unwrap();
    let uuid_rows: [[u8; 16]; 4] = [
        [
            0xfe, 0x7b, 0xc3, 0x0b, 0x4c, 0xe8, 0x4c, 0x5e, 0xb6, 0x7c, 0x22, 0x34, 0xa2, 0xd3,
            0x8e, 0x66,
        ],
        [
            0xb3, 0x3f, 0x2a, 0xd7, 0x97, 0xb4, 0x4d, 0xe1, 0x8b, 0xfe, 0x94, 0x94, 0x1d, 0x60,
            0x15, 0x6e,
        ],
        [
            0x5f, 0x74, 0x92, 0x64, 0x07, 0x4b, 0x40, 0x05, 0x84, 0xbf, 0x11, 0x5e, 0xa8, 0x4e,
            0xd2, 0x0a,
        ],
        [
            0x08, 0x26, 0xcc, 0x06, 0xd2, 0xe3, 0x45, 0x99, 0xb4, 0xad, 0xaf, 0x5f, 0xa6, 0x90,
            0x5c, 0xdb,
        ],
    ];
    let expected_uuids =
        FixedSizeBinaryArray::try_from_sparse_iter_with_size(uuid_rows.iter().map(Some), 16)
            .unwrap();
    assert_eq!(&expected_uuids, uuids);
}
5831
#[test]
#[cfg(feature = "snappy")]
fn test_datapage_v2() {
    // Expected columns "a".."d": five rows each; only "a" contains a null (row 3).
    let col_a = StringArray::from(vec![
        Some("abc"),
        Some("abc"),
        Some("abc"),
        None,
        Some("abc"),
    ]);
    let col_b: Int32Array = (1_i32..=5).map(Some).collect();
    let col_c = Float64Array::from(vec![Some(2.0), Some(3.0), Some(4.0), Some(5.0), Some(2.0)]);
    let col_d = BooleanArray::from(vec![
        Some(true),
        Some(true),
        Some(true),
        Some(false),
        Some(true),
    ]);

    // Expected column "e": a list of Int32 where rows 1 and 2 are null lists.
    let e_items: Int32Array = [1_i32, 2, 3, 1, 2, 3, 1, 2]
        .iter()
        .copied()
        .map(Some)
        .collect();
    let e_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 3, 3, 3, 6, 8]));
    let e_nulls = NullBuffer::from(vec![true, false, false, true, true]);
    let col_e = ListArray::new(
        Arc::new(Field::new("item", DataType::Int32, true)),
        e_offsets,
        Arc::new(e_items),
        Some(e_nulls),
    );

    let expected = RecordBatch::try_from_iter_with_nullable([
        ("a", Arc::new(col_a) as Arc<dyn Array>, true),
        ("b", Arc::new(col_b) as Arc<dyn Array>, true),
        ("c", Arc::new(col_c) as Arc<dyn Array>, true),
        ("d", Arc::new(col_d) as Arc<dyn Array>, true),
        ("e", Arc::new(col_e) as Arc<dyn Array>, true),
    ])
    .unwrap();

    let file = arrow_test_data("avro/datapage_v2.snappy.avro");
    let batch = read_file(&file, 8, false);
    assert_eq!(batch, expected);
}
5877
#[test]
fn test_nested_records() {
    // Builds the name/namespace metadata map attached to a record-typed field.
    let record_md = |name: &str, namespace: &str| -> HashMap<String, String> {
        [
            (AVRO_NAME_METADATA_KEY.to_string(), name.to_string()),
            (AVRO_NAMESPACE_METADATA_KEY.to_string(), namespace.to_string()),
        ]
        .into_iter()
        .collect()
    };

    // ---- f1: struct { f1_1: Utf8, f1_2: Int32, f1_3: struct { f1_3_1: Float64 } } ----
    // Pi rounded to two decimal places.
    let pi_2dp = (std::f64::consts::PI * 100.0).round() / 100.0;
    let f1_3_1_field = Field::new("f1_3_1", DataType::Float64, false);
    let f1_3_values = StructArray::from(vec![(
        Arc::new(f1_3_1_field.clone()),
        Arc::new(Float64Array::from(vec![pi_2dp, pi_2dp])) as Arc<dyn Array>,
    )]);
    let f1_3_field = Field::new(
        "f1_3",
        DataType::Struct(Fields::from(vec![f1_3_1_field])),
        false,
    )
    .with_metadata(record_md("record3", "ns3"));
    let f1_expected = StructArray::from(vec![
        (
            Arc::new(Field::new("f1_1", DataType::Utf8, false)),
            Arc::new(StringArray::from(vec!["aaa", "bbb"])) as Arc<dyn Array>,
        ),
        (
            Arc::new(Field::new("f1_2", DataType::Int32, false)),
            Arc::new(Int32Array::from(vec![10, 20])) as Arc<dyn Array>,
        ),
        (Arc::new(f1_3_field), Arc::new(f1_3_values) as Arc<dyn Array>),
    ]);

    // ---- f2: list of struct { f2_1: Boolean, f2_2: Float32 } ----
    let f2_struct_builder = StructBuilder::new(
        vec![
            Arc::new(Field::new("f2_1", DataType::Boolean, false)),
            Arc::new(Field::new("f2_2", DataType::Float32, false)),
        ],
        vec![
            Box::new(BooleanBuilder::new()) as Box<dyn ArrayBuilder>,
            Box::new(Float32Builder::new()) as Box<dyn ArrayBuilder>,
        ],
    );
    let mut f2_list_builder = ListBuilder::new(f2_struct_builder);
    // One inner vector per output row; each tuple is one struct element.
    for row in [
        vec![(true, 1.2_f32), (true, 2.2_f32)],
        vec![(false, 10.2_f32)],
    ] {
        let structs = f2_list_builder.values();
        for (flag, number) in row {
            structs.append(true);
            structs
                .field_builder::<BooleanBuilder>(0)
                .unwrap()
                .append_value(flag);
            structs
                .field_builder::<Float32Builder>(1)
                .unwrap()
                .append_value(number);
        }
        f2_list_builder.append(true);
    }
    let f2_built = f2_list_builder.finish();
    // Swap in a non-nullable item field that carries the record metadata.
    let f2_item_field = Arc::new(
        Field::new("item", f2_built.values().data_type().clone(), false)
            .with_metadata(record_md("record4", "ns4")),
    );
    let f2_expected = ListArray::from(
        f2_built
            .to_data()
            .into_builder()
            .data_type(DataType::List(f2_item_field))
            .build()
            .unwrap(),
    );

    // ---- f3: nullable struct { f3_1: Utf8 }; second row is a null struct ----
    let mut f3_builder = StructBuilder::new(
        vec![Arc::new(Field::new("f3_1", DataType::Utf8, false))],
        vec![Box::new(StringBuilder::new()) as Box<dyn ArrayBuilder>],
    );
    f3_builder.append(true);
    f3_builder
        .field_builder::<StringBuilder>(0)
        .unwrap()
        .append_value("xyz");
    f3_builder.append(false);
    f3_builder
        .field_builder::<StringBuilder>(0)
        .unwrap()
        .append_null();
    let f3_expected = f3_builder.finish();

    // ---- f4: list of nullable struct { f4_1: Int64 } ----
    let f4_struct_builder = StructBuilder::new(
        vec![Arc::new(Field::new("f4_1", DataType::Int64, false))],
        vec![Box::new(Int64Builder::new()) as Box<dyn ArrayBuilder>],
    );
    let mut f4_list_builder = ListBuilder::new(f4_struct_builder);
    // `None` stands for a null struct element (with a null child slot).
    for row in [[Some(200_i64), None], [None, Some(300_i64)]] {
        let structs = f4_list_builder.values();
        for entry in row {
            structs.append(entry.is_some());
            structs
                .field_builder::<Int64Builder>(0)
                .unwrap()
                .append_option(entry);
        }
        f4_list_builder.append(true);
    }
    let f4_built = f4_list_builder.finish();
    // Swap in a nullable item field that carries the record metadata.
    let f4_item_field = Arc::new(
        Field::new("item", f4_built.values().data_type().clone(), true)
            .with_metadata(record_md("record6", "ns6")),
    );
    let f4_expected = ListArray::from(
        f4_built
            .to_data()
            .into_builder()
            .data_type(DataType::List(f4_item_field))
            .build()
            .unwrap(),
    );

    // ---- Assemble the expected batch ----
    let expected_schema = Schema::new(vec![
        Field::new("f1", f1_expected.data_type().clone(), false)
            .with_metadata(record_md("record2", "ns2")),
        Field::new("f2", f2_expected.data_type().clone(), false),
        Field::new("f3", f3_expected.data_type().clone(), true)
            .with_metadata(record_md("record5", "ns5")),
        Field::new("f4", f4_expected.data_type().clone(), false),
    ]);
    let expected = RecordBatch::try_new(
        Arc::new(expected_schema),
        vec![
            Arc::new(f1_expected) as Arc<dyn Array>,
            Arc::new(f2_expected) as Arc<dyn Array>,
            Arc::new(f3_expected) as Arc<dyn Array>,
            Arc::new(f4_expected) as Arc<dyn Array>,
        ],
    )
    .unwrap();

    // ---- Decode with two batch sizes and compare ----
    let file = arrow_test_data("avro/nested_records.avro");
    let decoded_with_8 = read_file(&file, 8, false);
    assert_eq!(
        decoded_with_8, expected,
        "Decoded RecordBatch does not match expected data for nested records (batch size 8)"
    );
    let decoded_with_3 = read_file(&file, 3, false);
    assert_eq!(
        decoded_with_3, expected,
        "Decoded RecordBatch does not match expected data for nested records (batch size 3)"
    );
}
6093
#[test]
#[cfg(feature = "snappy")]
fn test_repeated_no_annotation() {
    use arrow_data::ArrayDataBuilder;

    // Builds the name/namespace metadata map attached to a record-typed field.
    let record_md = |name: &str, namespace: &str| -> HashMap<String, String> {
        [
            (AVRO_NAME_METADATA_KEY.to_string(), name.to_string()),
            (AVRO_NAMESPACE_METADATA_KEY.to_string(), namespace.to_string()),
        ]
        .into_iter()
        .collect()
    };
    // Validity bitmap used by both the phone list and its enclosing struct:
    // the first two of the six rows are null.
    let row_validity = [false, false, true, true, true, true];

    let ids = Int32Array::from(vec![1, 2, 3, 4, 5, 6]);

    // Flattened phone structs: five elements in total.
    let numbers = Int64Array::from(vec![
        Some(5555555555),
        Some(1111111111),
        Some(1111111111),
        Some(2222222222),
        Some(3333333333),
    ]);
    let kinds = StringArray::from(vec![None, Some("home"), Some("home"), None, Some("mobile")]);
    let phone_structs = StructArray::from(
        ArrayDataBuilder::new(DataType::Struct(Fields::from(vec![
            Field::new("number", DataType::Int64, true),
            Field::new("kind", DataType::Utf8, true),
        ])))
        .len(5)
        .child_data(vec![numbers.into_data(), kinds.into_data()])
        .build()
        .unwrap(),
    );

    // The list of phone structs, one (possibly null or empty) list per row.
    let phone_item = Field::new("item", phone_structs.data_type().clone(), true)
        .with_metadata(record_md("phone", "topLevelRecord.phoneNumbers"));
    let phone_lists = ListArray::from(
        ArrayDataBuilder::new(DataType::List(Arc::new(phone_item)))
            .len(6)
            .add_buffer(Buffer::from_slice_ref([0i32, 0, 0, 0, 1, 2, 5]))
            .null_bit_buffer(Some(Buffer::from_iter(row_validity)))
            .child_data(vec![phone_structs.into_data()])
            .build()
            .unwrap(),
    );

    // The enclosing "phoneNumbers" struct with a single "phone" child.
    let phone_child = Field::new("phone", phone_lists.data_type().clone(), true);
    let phone_numbers = StructArray::from(
        ArrayDataBuilder::new(DataType::Struct(Fields::from(vec![phone_child])))
            .len(6)
            .null_bit_buffer(Some(Buffer::from_iter(row_validity)))
            .child_data(vec![phone_lists.into_data()])
            .build()
            .unwrap(),
    );

    let expected_schema = Schema::new(vec![
        Field::new("id", DataType::Int32, true),
        Field::new("phoneNumbers", phone_numbers.data_type().clone(), true)
            .with_metadata(record_md("phoneNumbers", "topLevelRecord")),
    ]);
    let expected = RecordBatch::try_new(
        Arc::new(expected_schema),
        vec![Arc::new(ids) as _, Arc::new(phone_numbers) as _],
    )
    .unwrap();

    // Decode with two batch sizes and compare.
    let file = arrow_test_data("avro/repeated_no_annotation.avro");
    let decoded_with_8 = read_file(&file, 8, false);
    assert_eq!(decoded_with_8, expected, "Mismatch for batch_size=8");
    let decoded_with_3 = read_file(&file, 3, false);
    assert_eq!(decoded_with_3, expected, "Mismatch for batch_size=3");
}
6185
#[test]
#[cfg(feature = "snappy")]
fn test_nonnullable_impala() {
    // Field names shared by every map builder in this test.
    let map_names = || MapFieldNames {
        entry: "entries".to_string(),
        key: "key".to_string(),
        value: "value".to_string(),
    };
    // Builds the name/namespace metadata map attached to a record-typed field.
    let avro_meta = |name: &str, namespace: &str| -> HashMap<String, String> {
        [("avro.name", name), ("avro.namespace", namespace)]
            .into_iter()
            .map(|(k, v)| (k.to_string(), v.to_string()))
            .collect()
    };

    // ---- ID: [8] ----
    let id = Int64Array::from(vec![Some(8)]);

    // ---- Int_Array: [[-1]] ----
    let mut int_array_builder = ListBuilder::new(Int32Builder::new());
    int_array_builder.values().append_value(-1);
    int_array_builder.append(true);
    let int_array = int_array_builder.finish();

    // ---- int_array_array: [[[-1, -2], []]] ----
    let mut int_array_array_builder = ListBuilder::new(ListBuilder::new(Int32Builder::new()));
    {
        let inner_lists = int_array_array_builder.values();
        {
            let items = inner_lists.values();
            items.append_value(-1);
            items.append_value(-2);
        }
        inner_lists.append(true);
        inner_lists.append(true);
    }
    int_array_array_builder.append(true);
    let int_array_array = int_array_array_builder.finish();

    // ---- Int_Map: [{"k1": -1}] ----
    let mut int_map_builder =
        MapBuilder::new(Some(map_names()), StringBuilder::new(), Int32Builder::new());
    {
        let (keys, vals) = int_map_builder.entries();
        keys.append_value("k1");
        vals.append_value(-1);
    }
    int_map_builder.append(true).unwrap();
    let int_map = int_map_builder.finish();

    // ---- int_map_array: [[{}, {"k1": 1}, {}, {}]] ----
    let mut int_map_array_builder = ListBuilder::new(MapBuilder::new(
        Some(map_names()),
        StringBuilder::new(),
        Int32Builder::new(),
    ));
    {
        let maps = int_map_array_builder.values();
        maps.append(true).unwrap();
        {
            let (keys, vals) = maps.entries();
            keys.append_value("k1");
            vals.append_value(1);
        }
        maps.append(true).unwrap();
        maps.append(true).unwrap();
        maps.append(true).unwrap();
    }
    int_map_array_builder.append(true);
    let int_map_array_ = int_map_array_builder.finish();

    // ---- Field definitions for nested_Struct { a, B, c { D }, G } ----
    // c.D is a list of lists of struct { e, f }.
    let ef_struct_field = Arc::new(
        Field::new(
            "item",
            DataType::Struct(
                vec![
                    Field::new("e", DataType::Int32, true),
                    Field::new("f", DataType::Utf8, true),
                ]
                .into(),
            ),
            true,
        )
        .with_metadata(avro_meta("D", "topLevelRecord.nested_Struct.c")),
    );
    let d_inner_list_field = Arc::new(Field::new(
        "item",
        DataType::List(ef_struct_field.clone()),
        true,
    ));
    let d_field = Field::new("D", DataType::List(d_inner_list_field), true);

    // G is a map from Utf8 to struct { h: struct { i: list<Float64> } }.
    let i_list_field = Arc::new(Field::new("item", DataType::Float64, true));
    let i_field = Field::new("i", DataType::List(i_list_field), true);
    let h_field = Field::new("h", DataType::Struct(vec![i_field.clone()].into()), true)
        .with_metadata(avro_meta("h", "topLevelRecord.nested_Struct.G"));
    let g_value_struct_field = Field::new(
        "value",
        DataType::Struct(vec![h_field.clone()].into()),
        true,
    )
    .with_metadata(avro_meta("G", "topLevelRecord.nested_Struct"));
    let entries_struct_field = Field::new(
        "entries",
        DataType::Struct(
            vec![
                Field::new("key", DataType::Utf8, false),
                g_value_struct_field.clone(),
            ]
            .into(),
        ),
        false,
    );

    let a_field = Arc::new(Field::new("a", DataType::Int32, true));
    let b_field = Arc::new(Field::new(
        "B",
        DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
        true,
    ));
    let c_field = Arc::new(
        Field::new("c", DataType::Struct(vec![d_field.clone()].into()), true)
            .with_metadata(avro_meta("c", "topLevelRecord.nested_Struct")),
    );
    let g_field = Arc::new(Field::new(
        "G",
        DataType::Map(Arc::new(entries_struct_field), false),
        true,
    ));

    // ---- Child builders for nested_Struct ----
    // Builder for c: struct { D: list<list<struct { e, f }>> }.
    let ef_struct_builder = StructBuilder::new(
        vec![
            Arc::new(Field::new("e", DataType::Int32, true)),
            Arc::new(Field::new("f", DataType::Utf8, true)),
        ],
        vec![
            Box::new(Int32Builder::new()),
            Box::new(StringBuilder::new()),
        ],
    );
    // The innermost list needs the metadata-bearing item field; the outer list
    // keeps the builder's default item field.
    let d_builder =
        ListBuilder::new(ListBuilder::new(ef_struct_builder).with_field(ef_struct_field.clone()));
    let c_builder = StructBuilder::new(vec![Arc::new(d_field)], vec![Box::new(d_builder)]);

    // Builder for G: map<Utf8, struct { h: struct { i: list<Float64> } }>.
    let h_struct_builder = StructBuilder::new(
        vec![Arc::new(i_field)],
        vec![Box::new(ListBuilder::new(Float64Builder::new()))],
    );
    let g_value_builder =
        StructBuilder::new(vec![Arc::new(h_field)], vec![Box::new(h_struct_builder)]);
    let g_builder = MapBuilder::new(Some(map_names()), StringBuilder::new(), g_value_builder)
        .with_values_field(Arc::new(g_value_struct_field));

    let mut nested_sb = StructBuilder::new(
        vec![a_field, b_field, c_field, g_field],
        vec![
            Box::new(Int32Builder::new()),
            Box::new(ListBuilder::new(Int32Builder::new())),
            Box::new(c_builder),
            Box::new(g_builder),
        ],
    );

    // ---- Populate the single nested_Struct row ----
    nested_sb.append(true);
    // a = -1
    nested_sb
        .field_builder::<Int32Builder>(0)
        .unwrap()
        .append_value(-1);
    // B = [-1]
    {
        let b_list = nested_sb
            .field_builder::<ListBuilder<Int32Builder>>(1)
            .unwrap();
        b_list.values().append_value(-1);
        b_list.append(true);
    }
    // c = { D: [[{ e: -1, f: "nonnullable" }]] }
    {
        let c_struct = nested_sb.field_builder::<StructBuilder>(2).unwrap();
        c_struct.append(true);
        let d_lists = c_struct
            .field_builder::<ListBuilder<ListBuilder<StructBuilder>>>(0)
            .unwrap();
        let inner_lists = d_lists.values();
        let ef_struct = inner_lists.values();
        ef_struct.append(true);
        ef_struct
            .field_builder::<Int32Builder>(0)
            .unwrap()
            .append_value(-1);
        ef_struct
            .field_builder::<StringBuilder>(1)
            .unwrap()
            .append_value("nonnullable");
        inner_lists.append(true);
        d_lists.append(true);
    }
    // G = {} (an empty, non-null map)
    nested_sb
        .field_builder::<MapBuilder<StringBuilder, StructBuilder>>(3)
        .unwrap()
        .append(true)
        .unwrap();
    let nested_struct = nested_sb.finish();

    // ---- Assemble the expected batch ----
    let schema = Arc::new(arrow_schema::Schema::new(vec![
        Field::new("ID", id.data_type().clone(), true),
        Field::new("Int_Array", int_array.data_type().clone(), true),
        Field::new("int_array_array", int_array_array.data_type().clone(), true),
        Field::new("Int_Map", int_map.data_type().clone(), true),
        Field::new("int_map_array", int_map_array_.data_type().clone(), true),
        Field::new("nested_Struct", nested_struct.data_type().clone(), true)
            .with_metadata(avro_meta("nested_Struct", "topLevelRecord")),
    ]));
    let expected = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(id) as Arc<dyn Array>,
            Arc::new(int_array),
            Arc::new(int_array_array),
            Arc::new(int_map),
            Arc::new(int_map_array_),
            Arc::new(nested_struct),
        ],
    )
    .unwrap();

    // ---- Decode with two batch sizes and compare ----
    let file = arrow_test_data("avro/nonnullable.impala.avro");
    let decoded_with_8 = read_file(&file, 8, false);
    assert_eq!(decoded_with_8, expected, "Mismatch for batch_size=8");
    let decoded_with_3 = read_file(&file, 3, false);
    assert_eq!(decoded_with_3, expected, "Mismatch for batch_size=3");
}
6493
#[test]
fn test_nonnullable_impala_strict() {
    // Reading this fixture through `read_file_strict` must fail with the
    // strict-mode union error.
    let path = arrow_test_data("avro/nonnullable.impala.avro");
    let message = read_file_strict(&path, 8, false)
        .unwrap_err()
        .to_string();
    assert!(message.contains(
        "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
    ));
}
6502
#[test]
#[cfg(feature = "snappy")]
fn test_nullable_impala() {
    let file = arrow_test_data("avro/nullable.impala.avro");

    // The decoded batch must be the same for both batch sizes.
    let batch = read_file(&file, 3, false);
    let batch_alt = read_file(&file, 8, false);
    assert_eq!(batch, batch_alt);
    assert_eq!(batch.num_rows(), 7);

    // Column 0: ids 1 through 7.
    let ids = batch
        .column(0)
        .as_any()
        .downcast_ref::<Int64Array>()
        .expect("id column should be an Int64Array");
    for (i, expected_id) in (1_i64..=7).enumerate() {
        assert_eq!(ids.value(i), expected_id, "Mismatch in id at row {i}",);
    }

    // Column 1: only the first list is inspected; it must be [1, 2, 3].
    let int_lists = batch
        .column(1)
        .as_any()
        .downcast_ref::<ListArray>()
        .expect("int_array column should be a ListArray");
    let offsets = int_lists.value_offsets();
    let (first, last) = (offsets[0] as usize, offsets[1] as usize);
    let int_values = int_lists
        .values()
        .as_any()
        .downcast_ref::<Int32Array>()
        .expect("Values of int_array should be an Int32Array");
    let row0: Vec<Option<i32>> = (first..last)
        .map(|idx| Some(int_values.value(idx)))
        .collect();
    assert_eq!(
        row0,
        vec![Some(1), Some(2), Some(3)],
        "Mismatch in int_array row 0"
    );

    // Column 5: spot-check field "A" of the nested struct.
    let nested = batch
        .column(5)
        .as_any()
        .downcast_ref::<StructArray>()
        .expect("nested_struct column should be a StructArray");
    let field_a = nested
        .column_by_name("A")
        .expect("Field A should exist in nested_struct")
        .as_any()
        .downcast_ref::<Int32Array>()
        .expect("Field A should be an Int32Array");
    assert_eq!(field_a.value(0), 1, "Mismatch in nested_struct.A at row 0");
    assert!(
        !field_a.is_valid(1),
        "Expected null in nested_struct.A at row 1"
    );
    assert!(
        !field_a.is_valid(3),
        "Expected null in nested_struct.A at row 3"
    );
    assert_eq!(field_a.value(6), 7, "Mismatch in nested_struct.A at row 6");
}
6565
#[test]
fn test_nullable_impala_strict() {
    // Reading this fixture through `read_file_strict` must fail with the
    // strict-mode union error.
    let path = arrow_test_data("avro/nullable.impala.avro");
    let message = read_file_strict(&path, 8, false)
        .unwrap_err()
        .to_string();
    assert!(message.contains(
        "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
    ));
}
6574
#[test]
fn test_nested_record_type_reuse() {
    // Returns the "nested_int" child of a struct array as an Int32Array.
    fn nested_ints(structs: &StructArray) -> &Int32Array {
        structs
            .column_by_name("nested_int")
            .unwrap()
            .as_any()
            .downcast_ref::<Int32Array>()
            .unwrap()
    }

    let batch = read_file("test/data/nested_record_reuse.avro", 8, false);
    let schema = batch.schema();

    // ---- Schema: three fields that all use the same struct type ----
    assert_eq!(schema.fields().len(), 3);
    let fields = schema.fields();
    assert_eq!(fields[0].name(), "nested");
    assert_eq!(fields[1].name(), "nestedRecord");
    assert_eq!(fields[2].name(), "nestedArray");

    let nested_type = fields[0].data_type();
    let nested_record_type = fields[1].data_type();
    let nested_array_type = fields[2].data_type();
    assert!(matches!(nested_type, DataType::Struct(_)));
    assert!(matches!(nested_record_type, DataType::Struct(_)));
    assert!(matches!(nested_array_type, DataType::List(_)));

    if let DataType::Struct(members) = nested_type {
        assert_eq!(members.len(), 1);
        assert_eq!(members[0].name(), "nested_int");
        assert_eq!(members[0].data_type(), &DataType::Int32);
    }

    assert_eq!(nested_type, nested_record_type);
    if let DataType::List(item_field) = nested_array_type {
        assert_eq!(item_field.data_type(), nested_type);
    }

    // ---- Data ----
    assert_eq!(batch.num_rows(), 2);
    assert_eq!(batch.num_columns(), 3);

    let nested_col = batch
        .column(0)
        .as_any()
        .downcast_ref::<StructArray>()
        .unwrap();
    let nested_values = nested_ints(nested_col);
    assert_eq!(nested_values.value(0), 42);
    assert_eq!(nested_values.value(1), 99);

    let nested_record_col = batch
        .column(1)
        .as_any()
        .downcast_ref::<StructArray>()
        .unwrap();
    let nested_record_values = nested_ints(nested_record_col);
    assert_eq!(nested_record_values.value(0), 100);
    assert_eq!(nested_record_values.value(1), 200);

    // Only the first list of the array column is inspected.
    let nested_array_col = batch
        .column(2)
        .as_any()
        .downcast_ref::<ListArray>()
        .unwrap();
    assert_eq!(nested_array_col.len(), 2);
    let first_list = nested_array_col.value(0);
    let first_list_structs = first_list.as_any().downcast_ref::<StructArray>().unwrap();
    let first_list_values = nested_ints(first_list_structs);
    assert_eq!(first_list_values.len(), 3);
    assert_eq!(first_list_values.value(0), 1);
    assert_eq!(first_list_values.value(1), 2);
    assert_eq!(first_list_values.value(2), 3);
}
6685
#[test]
fn test_enum_type_reuse() {
    // Downcasts a column to an Int32-keyed dictionary array.
    fn as_dictionary(column: &dyn Array) -> &DictionaryArray<Int32Type> {
        column
            .as_any()
            .downcast_ref::<DictionaryArray<Int32Type>>()
            .unwrap()
    }

    // Resolves the dictionary key at `row` to its string value.
    fn symbol_at(dict: &DictionaryArray<Int32Type>, row: usize) -> &str {
        let symbols = dict
            .values()
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        symbols.value(dict.key(row).unwrap() as usize)
    }

    let batch = read_file("test/data/enum_reuse.avro", 8, false);
    let schema = batch.schema();

    // ---- Schema: three fields that all use the same dictionary type ----
    assert_eq!(schema.fields().len(), 3);
    let fields = schema.fields();
    assert_eq!(fields[0].name(), "status");
    assert_eq!(fields[1].name(), "backupStatus");
    assert_eq!(fields[2].name(), "statusHistory");

    let status_type = fields[0].data_type();
    let backup_status_type = fields[1].data_type();
    let history_type = fields[2].data_type();
    assert!(matches!(status_type, DataType::Dictionary(_, _)));
    assert!(matches!(backup_status_type, DataType::Dictionary(_, _)));
    assert!(matches!(history_type, DataType::List(_)));

    if let DataType::Dictionary(key_type, value_type) = status_type {
        assert_eq!(key_type.as_ref(), &DataType::Int32);
        assert_eq!(value_type.as_ref(), &DataType::Utf8);
    }

    assert_eq!(status_type, backup_status_type);
    if let DataType::List(item_field) = history_type {
        assert_eq!(item_field.data_type(), status_type);
    }

    // ---- Data ----
    assert_eq!(batch.num_rows(), 2);
    assert_eq!(batch.num_columns(), 3);

    let status_col = as_dictionary(batch.column(0).as_ref());
    assert_eq!(symbol_at(status_col, 0), "ACTIVE");
    assert_eq!(symbol_at(status_col, 1), "PENDING");

    let backup_status_col = as_dictionary(batch.column(1).as_ref());
    assert_eq!(symbol_at(backup_status_col, 0), "INACTIVE");
    assert_eq!(symbol_at(backup_status_col, 1), "ACTIVE");

    // Only the first list of the history column is inspected.
    let status_history_col = batch
        .column(2)
        .as_any()
        .downcast_ref::<ListArray>()
        .unwrap();
    assert_eq!(status_history_col.len(), 2);

    let first_history = status_history_col.value(0);
    let first_history_dict = as_dictionary(first_history.as_ref());
    assert_eq!(first_history_dict.len(), 3);
    assert_eq!(symbol_at(first_history_dict, 0), "PENDING");
    assert_eq!(symbol_at(first_history_dict, 1), "ACTIVE");
    assert_eq!(symbol_at(first_history_dict, 2), "INACTIVE");
}
6817
    // End-to-end check of the reader against the `comprehensive_e2e.avro` fixture.
    //
    // The file is decoded with `read_file`, then an expected `RecordBatch` is assembled column by
    // column (in the order of the `push_like` calls below) and compared with the decoded batch in
    // a single `assert_eq!`. Field nullability and metadata of the expected schema are copied from
    // the decoded schema (see `push_like`), while data types and values are hard-coded here.
    #[test]
    fn comprehensive_e2e_test() {
        let path = "test/data/comprehensive_e2e.avro";
        // NOTE(review): the meaning of `1024` and `false` is defined by `read_file`, which lives
        // outside this block — presumably a batch size and a reader option; confirm there.
        let batch = read_file(path, 1024, false);
        let schema = batch.schema();

        // Returns the type id of the first union child whose field name equals `want`.
        // Panics when no child has that name.
        #[inline]
        fn tid_by_name(fields: &UnionFields, want: &str) -> i8 {
            for (tid, f) in fields.iter() {
                if f.name() == want {
                    return tid;
                }
            }
            panic!("union child '{want}' not found");
        }

        // Returns the type id of the first union child whose data type satisfies `pred`.
        // Panics when no child matches.
        #[inline]
        fn tid_by_dt(fields: &UnionFields, pred: impl Fn(&DataType) -> bool) -> i8 {
            for (tid, f) in fields.iter() {
                if pred(f.data_type()) {
                    return tid;
                }
            }
            panic!("no union child matches predicate");
        }

        // Builds a `UnionArray` (offsets are supplied, i.e. dense layout) over `fields`.
        //
        // * `type_ids` - per-row type id selecting the child.
        // * `offsets`  - per-row index into the selected child.
        // * `provide`  - returns the populated child array for a field, or `None` to fall back to
        //                a zero-length child of the field's data type.
        //
        // Panics if `UnionArray::try_new` rejects the inputs or if a fallback child type is not
        // handled by `empty_child_for`.
        fn mk_dense_union(
            fields: &UnionFields,
            type_ids: Vec<i8>,
            offsets: Vec<i32>,
            provide: impl Fn(&Field) -> Option<ArrayRef>,
        ) -> ArrayRef {
            // Produces a zero-length array of data type `dt`; recursive for struct and list
            // children. Panics on any data type that has no arm below.
            fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
                match dt {
                    DataType::Null => Arc::new(NullArray::new(0)),
                    DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
                    DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
                    DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
                    DataType::Float32 => Arc::new(Float32Array::from(Vec::<f32>::new())),
                    DataType::Float64 => Arc::new(Float64Array::from(Vec::<f64>::new())),
                    DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
                    DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
                    DataType::Date32 => Arc::new(Date32Array::from(Vec::<i32>::new())),
                    DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
                        Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
                    }
                    DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
                        Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
                    }
                    // Timestamps keep the timezone of the requested type, when it has one.
                    DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
                        let a = TimestampMillisecondArray::from(Vec::<i64>::new());
                        Arc::new(if let Some(tz) = tz {
                            a.with_timezone(tz.clone())
                        } else {
                            a
                        })
                    }
                    DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
                        let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
                        Arc::new(if let Some(tz) = tz {
                            a.with_timezone(tz.clone())
                        } else {
                            a
                        })
                    }
                    DataType::Interval(IntervalUnit::MonthDayNano) => Arc::new(
                        IntervalMonthDayNanoArray::from(Vec::<IntervalMonthDayNano>::new()),
                    ),
                    DataType::FixedSizeBinary(sz) => Arc::new(
                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(
                            std::iter::empty::<Option<Vec<u8>>>(),
                            *sz,
                        )
                        .unwrap(),
                    ),
                    // Any dictionary type is materialised as Int32 keys over Utf8 values; the
                    // key/value types of `dt` itself are not inspected.
                    DataType::Dictionary(_, _) => {
                        let keys = Int32Array::from(Vec::<i32>::new());
                        let values = Arc::new(StringArray::from(Vec::<&str>::new()));
                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
                    }
                    DataType::Struct(fields) => {
                        let children: Vec<ArrayRef> = fields
                            .iter()
                            .map(|f| empty_child_for(f.data_type()) as ArrayRef)
                            .collect();
                        Arc::new(StructArray::new(fields.clone(), children, None))
                    }
                    DataType::List(field) => {
                        // A single offset `[0]` describes a list array with zero rows.
                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
                        Arc::new(
                            ListArray::try_new(
                                field.clone(),
                                offsets,
                                empty_child_for(field.data_type()),
                                None,
                            )
                            .unwrap(),
                        )
                    }
                    DataType::Map(entry_field, is_sorted) => {
                        let (key_field, val_field) = match entry_field.data_type() {
                            DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
                            other => panic!("unexpected map entries type: {other:?}"),
                        };
                        // Keys are always built as an empty Utf8 array.
                        let keys = StringArray::from(Vec::<&str>::new());
                        // Map values support a smaller, separately listed set of types than the
                        // outer match, plus `Union`, which the outer match does not handle.
                        let vals: ArrayRef = match val_field.data_type() {
                            DataType::Null => Arc::new(NullArray::new(0)) as ArrayRef,
                            DataType::Boolean => {
                                Arc::new(BooleanArray::from(Vec::<bool>::new())) as ArrayRef
                            }
                            DataType::Int32 => {
                                Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
                            }
                            DataType::Int64 => {
                                Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
                            }
                            DataType::Float32 => {
                                Arc::new(Float32Array::from(Vec::<f32>::new())) as ArrayRef
                            }
                            DataType::Float64 => {
                                Arc::new(Float64Array::from(Vec::<f64>::new())) as ArrayRef
                            }
                            DataType::Utf8 => {
                                Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
                            }
                            DataType::Binary => {
                                Arc::new(BinaryArray::from(Vec::<&[u8]>::new())) as ArrayRef
                            }
                            DataType::Union(uf, _) => {
                                // Zero-row union with offsets: every child is empty as well.
                                let children: Vec<ArrayRef> = uf
                                    .iter()
                                    .map(|(_, f)| empty_child_for(f.data_type()))
                                    .collect();
                                Arc::new(
                                    UnionArray::try_new(
                                        uf.clone(),
                                        ScalarBuffer::<i8>::from(Vec::<i8>::new()),
                                        Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
                                        children,
                                    )
                                    .unwrap(),
                                ) as ArrayRef
                            }
                            other => panic!("unsupported map value type: {other:?}"),
                        };
                        let entries = StructArray::new(
                            Fields::from(vec![
                                key_field.as_ref().clone(),
                                val_field.as_ref().clone(),
                            ]),
                            vec![Arc::new(keys) as ArrayRef, vals],
                            None,
                        );
                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
                        Arc::new(MapArray::new(
                            entry_field.clone(),
                            offsets,
                            entries,
                            None,
                            *is_sorted,
                        ))
                    }
                    other => panic!("empty_child_for: unhandled type {other:?}"),
                }
            }
            // One child per union field, in field order: the provided array when `provide`
            // returns one, otherwise an empty placeholder of the right type.
            let children: Vec<ArrayRef> = fields
                .iter()
                .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
                .collect();
            Arc::new(
                UnionArray::try_new(
                    fields.clone(),
                    ScalarBuffer::<i8>::from(type_ids),
                    Some(ScalarBuffer::<i32>::from(offsets)),
                    children,
                )
                .unwrap(),
            ) as ArrayRef
        }

        // Decodes a textual UUID into its 16 raw bytes.
        //
        // '-' characters are skipped; every other character must be a hex digit (panics
        // otherwise). Digits are combined pairwise, high nibble first. Panics unless exactly
        // 16 bytes are produced; a dangling final nibble is dropped without error.
        #[inline]
        fn uuid16_from_str(s: &str) -> [u8; 16] {
            let mut out = [0u8; 16];
            let mut idx = 0usize;
            // Holds the high nibble until its low-nibble partner arrives.
            let mut hi: Option<u8> = None;
            for ch in s.chars() {
                if ch == '-' {
                    continue;
                }
                let v = ch.to_digit(16).expect("invalid hex digit in UUID") as u8;
                if let Some(h) = hi {
                    out[idx] = (h << 4) | v;
                    idx += 1;
                    hi = None;
                } else {
                    hi = Some(v);
                }
            }
            assert_eq!(idx, 16, "UUID must decode to 16 bytes");
            out
        }
        // Shared scalar values reused by several expected columns below.
        // Used as a Date32 value (19_000 days after the Unix epoch).
        let date_a: i32 = 19_000;
        // 12:34:56.789 expressed in milliseconds since midnight.
        let time_ms_a: i32 = 12 * 3_600_000 + 34 * 60_000 + 56_000 + 789;
        // One microsecond before a full day (86_400_000_000 us) has elapsed.
        let time_us_eod: i64 = 86_400_000_000 - 1;
        // Epoch milliseconds for 2024-01-01 (per the variable name), and the same in microseconds.
        let ts_ms_2024_01_01: i64 = 1_704_067_200_000;
        let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1_000;
        // Interval values as (months, days, nanoseconds).
        let dur_small = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
        let dur_zero = IntervalMonthDayNanoType::make_value(0, 0, 0);
        // 12 months, 31 days and one millisecond short of a day, converted to nanoseconds.
        let dur_large =
            IntervalMonthDayNanoType::make_value(12, 31, ((86_400_000 - 1) as i64) * 1_000_000);
        let dur_2years = IntervalMonthDayNanoType::make_value(24, 0, 0);
        let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
        let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");

        // Appends one expected column.
        //
        // The new field takes its name from `name`, its data type from `arr`, and its
        // nullability and (non-empty) metadata from the same-named field of `reader_schema`.
        // Panics when `reader_schema` has no field called `name`.
        #[inline]
        fn push_like(
            reader_schema: &arrow_schema::Schema,
            name: &str,
            arr: ArrayRef,
            fields: &mut Vec<FieldRef>,
            cols: &mut Vec<ArrayRef>,
        ) {
            let src = reader_schema
                .field_with_name(name)
                .unwrap_or_else(|_| panic!("source schema missing field '{name}'"));
            let mut f = Field::new(name, arr.data_type().clone(), src.is_nullable());
            let md = src.metadata();
            if !md.is_empty() {
                f = f.with_metadata(md.clone());
            }
            fields.push(Arc::new(f));
            cols.push(arr);
        }

        // Expected schema fields and columns, kept in lock-step; every column has four rows.
        let mut fields: Vec<FieldRef> = Vec::new();
        let mut columns: Vec<ArrayRef> = Vec::new();
        // --- Primitive columns ---
        push_like(
            schema.as_ref(),
            "id",
            Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        push_like(
            schema.as_ref(),
            "flag",
            Arc::new(BooleanArray::from(vec![true, false, true, false])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        push_like(
            schema.as_ref(),
            "ratio_f32",
            Arc::new(Float32Array::from(vec![1.25f32, -0.0, 3.5, 9.75])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        push_like(
            schema.as_ref(),
            "ratio_f64",
            Arc::new(Float64Array::from(vec![2.5f64, -1.0, 7.0, -2.25])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        push_like(
            schema.as_ref(),
            "count_i32",
            Arc::new(Int32Array::from(vec![7, -1, 0, 123])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        push_like(
            schema.as_ref(),
            "count_i64",
            Arc::new(Int64Array::from(vec![
                7_000_000_000i64,
                -2,
                0,
                -9_876_543_210i64,
            ])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        // --- Optional (nullable) columns ---
        push_like(
            schema.as_ref(),
            "opt_i32_nullfirst",
            Arc::new(Int32Array::from(vec![None, Some(42), None, Some(0)])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        push_like(
            schema.as_ref(),
            "opt_str_nullsecond",
            Arc::new(StringArray::from(vec![
                Some("alpha"),
                None,
                Some("s3"),
                Some(""),
            ])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        // --- Three-way union of int / string / boolean ---
        {
            let uf = match schema
                .field_with_name("tri_union_prim")
                .unwrap()
                .data_type()
            {
                DataType::Union(f, UnionMode::Dense) => f.clone(),
                other => panic!("tri_union_prim should be dense union, got {other:?}"),
            };
            let tid_i = tid_by_name(&uf, "int");
            let tid_s = tid_by_name(&uf, "string");
            let tid_b = tid_by_name(&uf, "boolean");
            // Rows resolve to: int 0, string "hi", boolean true, string "".
            let tids = vec![tid_i, tid_s, tid_b, tid_s];
            let offs = vec![0, 0, 0, 1];
            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
                DataType::Int32 => Some(Arc::new(Int32Array::from(vec![0])) as ArrayRef),
                DataType::Utf8 => Some(Arc::new(StringArray::from(vec!["hi", ""])) as ArrayRef),
                DataType::Boolean => Some(Arc::new(BooleanArray::from(vec![true])) as ArrayRef),
                _ => None,
            });
            push_like(
                schema.as_ref(),
                "tri_union_prim",
                arr,
                &mut fields,
                &mut columns,
            );
        }

        // --- String, bytes and fixed-size binary columns ---
        push_like(
            schema.as_ref(),
            "str_utf8",
            Arc::new(StringArray::from(vec!["hello", "", "world", "✓ unicode"])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        push_like(
            schema.as_ref(),
            "raw_bytes",
            Arc::new(BinaryArray::from(vec![
                b"\x00\x01".as_ref(),
                b"".as_ref(),
                b"\xFF\x00".as_ref(),
                b"\x10\x20\x30\x40".as_ref(),
            ])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        {
            let it = [
                Some(*b"0123456789ABCDEF"),
                Some([0u8; 16]),
                Some(*b"ABCDEFGHIJKLMNOP"),
                Some([0xAA; 16]),
            ]
            .into_iter();
            let arr =
                Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap())
                    as ArrayRef;
            push_like(
                schema.as_ref(),
                "fx16_plain",
                arr,
                &mut fields,
                &mut columns,
            );
        }
        // --- Decimal columns ---
        {
            // Precision 10 / scale 2: a Decimal64 array when the `small_decimals` feature is
            // enabled, a Decimal128 array otherwise.
            #[cfg(feature = "small_decimals")]
            let dec10_2 = Arc::new(
                Decimal64Array::from_iter_values([123456i64, -1, 0, 9_999_999_999i64])
                    .with_precision_and_scale(10, 2)
                    .unwrap(),
            ) as ArrayRef;
            #[cfg(not(feature = "small_decimals"))]
            let dec10_2 = Arc::new(
                Decimal128Array::from_iter_values([123456i128, -1, 0, 9_999_999_999i128])
                    .with_precision_and_scale(10, 2)
                    .unwrap(),
            ) as ArrayRef;
            push_like(
                schema.as_ref(),
                "dec_bytes_s10_2",
                dec10_2,
                &mut fields,
                &mut columns,
            );
        }
        {
            // NOTE(review): both cfg branches below build the same Decimal128 array
            // (precision 20 / scale 4), so the feature gate has no effect on this column.
            #[cfg(feature = "small_decimals")]
            let dec20_4 = Arc::new(
                Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
                    .with_precision_and_scale(20, 4)
                    .unwrap(),
            ) as ArrayRef;
            #[cfg(not(feature = "small_decimals"))]
            let dec20_4 = Arc::new(
                Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
                    .with_precision_and_scale(20, 4)
                    .unwrap(),
            ) as ArrayRef;
            push_like(
                schema.as_ref(),
                "dec_fix_s20_4",
                dec20_4,
                &mut fields,
                &mut columns,
            );
        }
        // --- UUID column, expected as 16-byte fixed-size binary ---
        {
            let it = [Some(uuid1), Some(uuid2), Some(uuid1), Some(uuid2)].into_iter();
            let arr =
                Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap())
                    as ArrayRef;
            push_like(schema.as_ref(), "uuid_str", arr, &mut fields, &mut columns);
        }
        // --- Date, time and timestamp columns ---
        push_like(
            schema.as_ref(),
            "d_date",
            Arc::new(Date32Array::from(vec![date_a, 0, 1, 365])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        push_like(
            schema.as_ref(),
            "t_millis",
            Arc::new(Time32MillisecondArray::from(vec![
                time_ms_a,
                0,
                1,
                86_400_000 - 1,
            ])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        push_like(
            schema.as_ref(),
            "t_micros",
            Arc::new(Time64MicrosecondArray::from(vec![
                time_us_eod,
                0,
                1,
                1_000_000,
            ])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        // The two `*_utc` timestamp columns carry the "+00:00" timezone.
        {
            let a = TimestampMillisecondArray::from(vec![
                ts_ms_2024_01_01,
                -1,
                ts_ms_2024_01_01 + 123,
                0,
            ])
            .with_timezone("+00:00");
            push_like(
                schema.as_ref(),
                "ts_millis_utc",
                Arc::new(a) as ArrayRef,
                &mut fields,
                &mut columns,
            );
        }
        {
            let a = TimestampMicrosecondArray::from(vec![
                ts_us_2024_01_01,
                1,
                ts_us_2024_01_01 + 456,
                0,
            ])
            .with_timezone("+00:00");
            push_like(
                schema.as_ref(),
                "ts_micros_utc",
                Arc::new(a) as ArrayRef,
                &mut fields,
                &mut columns,
            );
        }
        // The two `*_local` timestamp columns are built without any timezone.
        push_like(
            schema.as_ref(),
            "ts_millis_local",
            Arc::new(TimestampMillisecondArray::from(vec![
                ts_ms_2024_01_01 + 86_400_000,
                0,
                ts_ms_2024_01_01 + 789,
                123_456_789,
            ])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        push_like(
            schema.as_ref(),
            "ts_micros_local",
            Arc::new(TimestampMicrosecondArray::from(vec![
                ts_us_2024_01_01 + 123_456,
                0,
                ts_us_2024_01_01 + 101_112,
                987_654_321,
            ])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        // --- Interval column ---
        {
            let v = vec![dur_small, dur_zero, dur_large, dur_2years];
            push_like(
                schema.as_ref(),
                "interval_mdn",
                Arc::new(IntervalMonthDayNanoArray::from(v)) as ArrayRef,
                &mut fields,
                &mut columns,
            );
        }
        // --- Dictionary-encoded `status` column ---
        {
            // Keys index into `values`: rows are NEW, PROCESSING, DONE, UNKNOWN.
            let keys = Int32Array::from(vec![1, 2, 3, 0]);
            let values = Arc::new(StringArray::from(vec![
                "UNKNOWN",
                "NEW",
                "PROCESSING",
                "DONE",
            ])) as ArrayRef;
            let dict = DictionaryArray::<Int32Type>::try_new(keys, values).unwrap();
            push_like(
                schema.as_ref(),
                "status",
                Arc::new(dict) as ArrayRef,
                &mut fields,
                &mut columns,
            );
        }
        // --- List whose items are a union of long / string / null ---
        {
            let list_field = match schema.field_with_name("arr_union").unwrap().data_type() {
                DataType::List(f) => f.clone(),
                other => panic!("arr_union should be List, got {other:?}"),
            };
            let uf = match list_field.data_type() {
                DataType::Union(f, UnionMode::Dense) => f.clone(),
                other => panic!("arr_union item should be union, got {other:?}"),
            };
            let tid_l = tid_by_name(&uf, "long");
            let tid_s = tid_by_name(&uf, "string");
            let tid_n = tid_by_name(&uf, "null");
            // 11 flattened items across all rows; each offset indexes into the child chosen by
            // the type id at the same position (5 longs, 3 strings, 3 nulls).
            let type_ids = vec![
                tid_l, tid_s, tid_n, tid_l, tid_n, tid_s, tid_l, tid_l, tid_s, tid_n, tid_l,
            ];
            let offsets = vec![0, 0, 0, 1, 1, 1, 2, 3, 2, 2, 4];
            let values = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
                DataType::Int64 => {
                    Some(Arc::new(Int64Array::from(vec![1i64, -3, 0, -1, 0])) as ArrayRef)
                }
                DataType::Utf8 => {
                    Some(Arc::new(StringArray::from(vec!["x", "z", "end"])) as ArrayRef)
                }
                DataType::Null => Some(Arc::new(NullArray::new(3)) as ArrayRef),
                _ => None,
            });
            // Row lengths are 4, 3, 1 and 3 items.
            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 4, 7, 8, 11]));
            let arr = Arc::new(ListArray::try_new(list_field, list_offsets, values, None).unwrap())
                as ArrayRef;
            push_like(schema.as_ref(), "arr_union", arr, &mut fields, &mut columns);
        }
        // --- Map whose values are a union of null / double / string ---
        {
            // Pull the entries field, its struct fields, the value union's fields and the
            // sortedness flag out of the decoded schema so the expected array reuses them.
            let (entry_field, entries_fields, uf, is_sorted) =
                match schema.field_with_name("map_union").unwrap().data_type() {
                    DataType::Map(entry_field, is_sorted) => {
                        let fs = match entry_field.data_type() {
                            DataType::Struct(fs) => fs.clone(),
                            other => panic!("map entries must be struct, got {other:?}"),
                        };
                        let val_f = fs[1].clone();
                        let uf = match val_f.data_type() {
                            DataType::Union(f, UnionMode::Dense) => f.clone(),
                            other => panic!("map value must be union, got {other:?}"),
                        };
                        (entry_field.clone(), fs, uf, *is_sorted)
                    }
                    other => panic!("map_union should be Map, got {other:?}"),
                };
            let keys = StringArray::from(vec!["a", "b", "c", "neg", "pi", "ok"]);
            // Row entry counts are 3, 1, 0 and 2.
            let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4, 4, 6]));
            let tid_null = tid_by_name(&uf, "null");
            let tid_d = tid_by_name(&uf, "double");
            let tid_s = tid_by_name(&uf, "string");
            let type_ids = vec![tid_d, tid_null, tid_s, tid_d, tid_d, tid_s];
            let offsets = vec![0, 0, 0, 1, 2, 1];
            // Pi truncated (not rounded) to five decimal places.
            let pi_5dp = (std::f64::consts::PI * 100_000.0).trunc() / 100_000.0;
            let vals = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
                DataType::Float64 => {
                    Some(Arc::new(Float64Array::from(vec![1.5f64, -0.5, pi_5dp])) as ArrayRef)
                }
                DataType::Utf8 => {
                    Some(Arc::new(StringArray::from(vec!["yes", "true"])) as ArrayRef)
                }
                // Only offset 0 of the null child is referenced by `offsets` above.
                DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
                _ => None,
            });
            let entries = StructArray::new(
                entries_fields.clone(),
                vec![Arc::new(keys) as ArrayRef, vals],
                None,
            );
            let map =
                Arc::new(MapArray::new(entry_field, moff, entries, None, is_sorted)) as ArrayRef;
            push_like(schema.as_ref(), "map_union", map, &mut fields, &mut columns);
        }
        // --- Struct columns ---
        {
            let fs = match schema.field_with_name("address").unwrap().data_type() {
                DataType::Struct(fs) => fs.clone(),
                other => panic!("address should be Struct, got {other:?}"),
            };
            let street = Arc::new(StringArray::from(vec![
                "100 Main",
                "",
                "42 Galaxy Way",
                "End Ave",
            ])) as ArrayRef;
            let zip = Arc::new(Int32Array::from(vec![12345, 0, 42424, 1])) as ArrayRef;
            let country = Arc::new(StringArray::from(vec!["US", "CA", "US", "GB"])) as ArrayRef;
            let arr = Arc::new(StructArray::new(fs, vec![street, zip, country], None)) as ArrayRef;
            push_like(schema.as_ref(), "address", arr, &mut fields, &mut columns);
        }
        {
            let fs = match schema.field_with_name("maybe_auth").unwrap().data_type() {
                DataType::Struct(fs) => fs.clone(),
                other => panic!("maybe_auth should be Struct, got {other:?}"),
            };
            let user =
                Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])) as ArrayRef;
            // The token child mixes nulls, a non-empty value and an empty (non-null) value.
            let token_values: Vec<Option<&[u8]>> = vec![
                None,
                Some(b"\x01\x02\x03".as_ref()),
                None,
                Some(b"".as_ref()),
            ];
            let token = Arc::new(BinaryArray::from(token_values)) as ArrayRef;
            let arr = Arc::new(StructArray::new(fs, vec![user, token], None)) as ArrayRef;
            push_like(
                schema.as_ref(),
                "maybe_auth",
                arr,
                &mut fields,
                &mut columns,
            );
        }
        // --- Union of dictionary / struct {a, b} / list / map children ---
        {
            let uf = match schema
                .field_with_name("union_enum_record_array_map")
                .unwrap()
                .data_type()
            {
                DataType::Union(f, UnionMode::Dense) => f.clone(),
                other => panic!("union_enum_record_array_map should be union, got {other:?}"),
            };
            // Identify each child's type id by the shape of its data type rather than by name.
            let mut tid_enum: Option<i8> = None;
            let mut tid_rec_a: Option<i8> = None;
            let mut tid_array: Option<i8> = None;
            let mut tid_map: Option<i8> = None;
            let mut map_entry_field: Option<FieldRef> = None;
            let mut map_sorted: bool = false;
            for (tid, f) in uf.iter() {
                match f.data_type() {
                    DataType::Dictionary(_, _) => tid_enum = Some(tid),
                    DataType::Struct(childs)
                        if childs.len() == 2
                            && childs[0].name() == "a"
                            && childs[1].name() == "b" =>
                    {
                        tid_rec_a = Some(tid)
                    }
                    DataType::List(item) if matches!(item.data_type(), DataType::Int64) => {
                        tid_array = Some(tid)
                    }
                    DataType::Map(ef, is_sorted) => {
                        tid_map = Some(tid);
                        map_entry_field = Some(ef.clone());
                        map_sorted = *is_sorted;
                    }
                    _ => {}
                }
            }
            // Each of the four shapes must have been found, otherwise the unwraps panic.
            let (tid_enum, tid_rec_a, tid_array, tid_map) = (
                tid_enum.unwrap(),
                tid_rec_a.unwrap(),
                tid_array.unwrap(),
                tid_map.unwrap(),
            );
            // One row per branch, each pointing at element 0 of its child.
            let tids = vec![tid_enum, tid_rec_a, tid_array, tid_map];
            let offs = vec![0, 0, 0, 0];
            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
                DataType::Dictionary(_, _) => {
                    // Key 0 selects "RED".
                    let keys = Int32Array::from(vec![0i32]);
                    let values =
                        Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
                    Some(
                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
                            as ArrayRef,
                    )
                }
                DataType::Struct(fs)
                    if fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b" =>
                {
                    let a = Int32Array::from(vec![7]);
                    let b = StringArray::from(vec!["rec"]);
                    Some(Arc::new(StructArray::new(
                        fs.clone(),
                        vec![Arc::new(a), Arc::new(b)],
                        None,
                    )) as ArrayRef)
                }
                // NOTE(review): this arm accepts any list child, whereas the scan above only
                // records lists with Int64 items; the values built here are Int64.
                DataType::List(field) => {
                    let values = Int64Array::from(vec![1i64, 2, 3]);
                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
                    Some(Arc::new(
                        ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
                    ) as ArrayRef)
                }
                DataType::Map(_, _) => {
                    // Reuses the entries field and sortedness captured during the scan above.
                    let entry_field = map_entry_field.clone().unwrap();
                    let (key_field, val_field) = match entry_field.data_type() {
                        DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
                        _ => unreachable!(),
                    };
                    let keys = StringArray::from(vec!["k"]);
                    let vals = StringArray::from(vec!["v"]);
                    let entries = StructArray::new(
                        Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
                        vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
                        None,
                    );
                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1]));
                    Some(Arc::new(MapArray::new(
                        entry_field.clone(),
                        offsets,
                        entries,
                        None,
                        map_sorted,
                    )) as ArrayRef)
                }
                _ => None,
            });
            push_like(
                schema.as_ref(),
                "union_enum_record_array_map",
                arr,
                &mut fields,
                &mut columns,
            );
        }
        // --- Union of Date32 / FixedSizeBinary(4); rows alternate between the two ---
        {
            let uf = match schema
                .field_with_name("union_date_or_fixed4")
                .unwrap()
                .data_type()
            {
                DataType::Union(f, UnionMode::Dense) => f.clone(),
                other => panic!("union_date_or_fixed4 should be union, got {other:?}"),
            };
            let tid_date = tid_by_dt(&uf, |dt| matches!(dt, DataType::Date32));
            let tid_fx4 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(4)));
            let tids = vec![tid_date, tid_fx4, tid_date, tid_fx4];
            let offs = vec![0, 0, 1, 1];
            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
                DataType::Date32 => Some(Arc::new(Date32Array::from(vec![date_a, 0])) as ArrayRef),
                DataType::FixedSizeBinary(4) => {
                    let it = [Some(*b"\x00\x11\x22\x33"), Some(*b"ABCD")].into_iter();
                    Some(Arc::new(
                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
                    ) as ArrayRef)
                }
                _ => None,
            });
            push_like(
                schema.as_ref(),
                "union_date_or_fixed4",
                arr,
                &mut fields,
                &mut columns,
            );
        }
        // --- Union of Interval(MonthDayNano) / Utf8; rows alternate between the two ---
        {
            let uf = match schema
                .field_with_name("union_interval_or_string")
                .unwrap()
                .data_type()
            {
                DataType::Union(f, UnionMode::Dense) => f.clone(),
                other => panic!("union_interval_or_string should be union, got {other:?}"),
            };
            let tid_dur = tid_by_dt(&uf, |dt| {
                matches!(dt, DataType::Interval(IntervalUnit::MonthDayNano))
            });
            let tid_str = tid_by_dt(&uf, |dt| matches!(dt, DataType::Utf8));
            let tids = vec![tid_dur, tid_str, tid_dur, tid_str];
            let offs = vec![0, 0, 1, 1];
            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
                DataType::Interval(IntervalUnit::MonthDayNano) => Some(Arc::new(
                    IntervalMonthDayNanoArray::from(vec![dur_small, dur_large]),
                )
                    as ArrayRef),
                DataType::Utf8 => Some(Arc::new(StringArray::from(vec![
                    "duration-as-text",
                    "iso-8601-period-P1Y",
                ])) as ArrayRef),
                _ => None,
            });
            push_like(
                schema.as_ref(),
                "union_interval_or_string",
                arr,
                &mut fields,
                &mut columns,
            );
        }
        // --- Union of FixedSizeBinary(16) / FixedSizeBinary(10); rows alternate ---
        {
            let uf = match schema
                .field_with_name("union_uuid_or_fixed10")
                .unwrap()
                .data_type()
            {
                DataType::Union(f, UnionMode::Dense) => f.clone(),
                other => panic!("union_uuid_or_fixed10 should be union, got {other:?}"),
            };
            // The two children are told apart purely by their byte width.
            let tid_uuid = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(16)));
            let tid_fx10 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(10)));
            let tids = vec![tid_uuid, tid_fx10, tid_uuid, tid_fx10];
            let offs = vec![0, 0, 1, 1];
            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
                DataType::FixedSizeBinary(16) => {
                    let it = [Some(uuid1), Some(uuid2)].into_iter();
                    Some(Arc::new(
                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
                    ) as ArrayRef)
                }
                DataType::FixedSizeBinary(10) => {
                    let fx10_a = [0xAAu8; 10];
                    let fx10_b = [0x00u8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99];
                    let it = [Some(fx10_a), Some(fx10_b)].into_iter();
                    Some(Arc::new(
                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
                    ) as ArrayRef)
                }
                _ => None,
            });
            push_like(
                schema.as_ref(),
                "union_uuid_or_fixed10",
                arr,
                &mut fields,
                &mut columns,
            );
        }
        // --- List of structs whose `val` field is a union of null / int / long ---
        {
            let list_field = match schema
                .field_with_name("array_records_with_union")
                .unwrap()
                .data_type()
            {
                DataType::List(f) => f.clone(),
                other => panic!("array_records_with_union should be List, got {other:?}"),
            };
            let kv_fields = match list_field.data_type() {
                DataType::Struct(fs) => fs.clone(),
                other => panic!("array_records_with_union items must be Struct, got {other:?}"),
            };
            let val_field = kv_fields
                .iter()
                .find(|f| f.name() == "val")
                .unwrap()
                .clone();
            let uf = match val_field.data_type() {
                DataType::Union(f, UnionMode::Dense) => f.clone(),
                other => panic!("KV.val should be union, got {other:?}"),
            };
            // Five flattened struct items across all rows.
            let keys = Arc::new(StringArray::from(vec!["k1", "k2", "k", "k3", "x"])) as ArrayRef;
            let tid_null = tid_by_name(&uf, "null");
            let tid_i = tid_by_name(&uf, "int");
            let tid_l = tid_by_name(&uf, "long");
            let type_ids = vec![tid_i, tid_null, tid_l, tid_null, tid_i];
            let offsets = vec![0, 0, 0, 1, 1];
            let vals = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
                DataType::Int32 => Some(Arc::new(Int32Array::from(vec![5, -5])) as ArrayRef),
                DataType::Int64 => Some(Arc::new(Int64Array::from(vec![99i64])) as ArrayRef),
                DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
                _ => None,
            });
            let values_struct =
                Arc::new(StructArray::new(kv_fields.clone(), vec![keys, vals], None)) as ArrayRef;
            // Row lengths are 2, 1, 1 and 1 items.
            let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5]));
            let arr = Arc::new(
                ListArray::try_new(list_field, list_offsets, values_struct, None).unwrap(),
            ) as ArrayRef;
            push_like(
                schema.as_ref(),
                "array_records_with_union",
                arr,
                &mut fields,
                &mut columns,
            );
        }
        // --- Union of Map / List children; rows alternate map, list, map, list ---
        {
            let uf = match schema
                .field_with_name("union_map_or_array_int")
                .unwrap()
                .data_type()
            {
                DataType::Union(f, UnionMode::Dense) => f.clone(),
                other => panic!("union_map_or_array_int should be union, got {other:?}"),
            };
            let tid_map = tid_by_dt(&uf, |dt| matches!(dt, DataType::Map(_, _)));
            let tid_list = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
            // Map child with two rows: {"x": 1, "y": 2} and {"only": 10}.
            let map_child: ArrayRef = {
                let (entry_field, is_sorted) = match uf
                    .iter()
                    .find(|(tid, _)| *tid == tid_map)
                    .unwrap()
                    .1
                    .data_type()
                {
                    DataType::Map(ef, is_sorted) => (ef.clone(), *is_sorted),
                    _ => unreachable!(),
                };
                let (key_field, val_field) = match entry_field.data_type() {
                    DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
                    _ => unreachable!(),
                };
                let keys = StringArray::from(vec!["x", "y", "only"]);
                let vals = Int32Array::from(vec![1, 2, 10]);
                let entries = StructArray::new(
                    Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
                    vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
                    None,
                );
                let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
                Arc::new(MapArray::new(entry_field, moff, entries, None, is_sorted)) as ArrayRef
            };
            // List child with two rows: [1, 2, 3] and [0].
            let list_child: ArrayRef = {
                let list_field = match uf
                    .iter()
                    .find(|(tid, _)| *tid == tid_list)
                    .unwrap()
                    .1
                    .data_type()
                {
                    DataType::List(f) => f.clone(),
                    _ => unreachable!(),
                };
                let values = Int32Array::from(vec![1, 2, 3, 0]);
                let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4]));
                Arc::new(ListArray::try_new(list_field, offsets, Arc::new(values), None).unwrap())
                    as ArrayRef
            };
            let tids = vec![tid_map, tid_list, tid_map, tid_list];
            let offs = vec![0, 0, 1, 1];
            let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
                DataType::Map(_, _) => Some(map_child.clone()),
                DataType::List(_) => Some(list_child.clone()),
                _ => None,
            });
            push_like(
                schema.as_ref(),
                "union_map_or_array_int",
                arr,
                &mut fields,
                &mut columns,
            );
        }
        // --- Trailing scalar and struct columns ---
        push_like(
            schema.as_ref(),
            "renamed_with_default",
            Arc::new(Int32Array::from(vec![100, 42, 7, 42])) as ArrayRef,
            &mut fields,
            &mut columns,
        );
        {
            let fs = match schema.field_with_name("person").unwrap().data_type() {
                DataType::Struct(fs) => fs.clone(),
                other => panic!("person should be Struct, got {other:?}"),
            };
            let name =
                Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "Dave"])) as ArrayRef;
            let age = Arc::new(Int32Array::from(vec![30, 0, 25, 41])) as ArrayRef;
            let arr = Arc::new(StructArray::new(fs, vec![name, age], None)) as ArrayRef;
            push_like(schema.as_ref(), "person", arr, &mut fields, &mut columns);
        }
        // Assemble the expected batch and compare it with the decoded one in a single assertion.
        let expected =
            RecordBatch::try_new(Arc::new(Schema::new(Fields::from(fields))), columns).unwrap();
        assert_eq!(
            expected, batch,
            "entire RecordBatch mismatch (schema, all columns, all rows)"
        );
    }
7811 #[test]
7812 fn comprehensive_e2e_resolution_test() {
7813 use serde_json::Value;
7814 use std::collections::HashMap;
7815
7816 fn make_comprehensive_reader_schema(path: &str) -> AvroSchema {
7829 fn set_type_string(f: &mut Value, new_ty: &str) {
7830 if let Some(ty) = f.get_mut("type") {
7831 match ty {
7832 Value::String(_) | Value::Object(_) => {
7833 *ty = Value::String(new_ty.to_string());
7834 }
7835 Value::Array(arr) => {
7836 for b in arr.iter_mut() {
7837 match b {
7838 Value::String(s) if s != "null" => {
7839 *b = Value::String(new_ty.to_string());
7840 break;
7841 }
7842 Value::Object(_) => {
7843 *b = Value::String(new_ty.to_string());
7844 break;
7845 }
7846 _ => {}
7847 }
7848 }
7849 }
7850 _ => {}
7851 }
7852 }
7853 }
7854 fn reverse_union_array(f: &mut Value) {
7855 if let Some(arr) = f.get_mut("type").and_then(|t| t.as_array_mut()) {
7856 arr.reverse();
7857 }
7858 }
7859 fn reverse_items_union(f: &mut Value) {
7860 if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7861 if let Some(items) = obj.get_mut("items").and_then(|v| v.as_array_mut()) {
7862 items.reverse();
7863 }
7864 }
7865 }
7866 fn reverse_map_values_union(f: &mut Value) {
7867 if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7868 if let Some(values) = obj.get_mut("values").and_then(|v| v.as_array_mut()) {
7869 values.reverse();
7870 }
7871 }
7872 }
7873 fn reverse_nested_union_in_record(f: &mut Value, field_name: &str) {
7874 if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7875 if let Some(fields) = obj.get_mut("fields").and_then(|v| v.as_array_mut()) {
7876 for ff in fields.iter_mut() {
7877 if ff.get("name").and_then(|n| n.as_str()) == Some(field_name) {
7878 if let Some(ty) = ff.get_mut("type") {
7879 if let Some(arr) = ty.as_array_mut() {
7880 arr.reverse();
7881 }
7882 }
7883 }
7884 }
7885 }
7886 }
7887 }
7888 fn rename_nested_field_with_alias(f: &mut Value, old: &str, new: &str) {
7889 if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7890 if let Some(fields) = obj.get_mut("fields").and_then(|v| v.as_array_mut()) {
7891 for ff in fields.iter_mut() {
7892 if ff.get("name").and_then(|n| n.as_str()) == Some(old) {
7893 ff["name"] = Value::String(new.to_string());
7894 ff["aliases"] = Value::Array(vec![Value::String(old.to_string())]);
7895 }
7896 }
7897 }
7898 }
7899 }
7900 let mut root = load_writer_schema_json(path);
7901 assert_eq!(root["type"], "record", "writer schema must be a record");
7902 let fields = root
7903 .get_mut("fields")
7904 .and_then(|f| f.as_array_mut())
7905 .expect("record has fields");
7906 for f in fields.iter_mut() {
7907 let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
7908 continue;
7909 };
7910 match name {
7911 "id" => {
7913 f["name"] = Value::String("identifier".into());
7914 f["aliases"] = Value::Array(vec![Value::String("id".into())]);
7915 }
7916 "renamed_with_default" => {
7917 f["name"] = Value::String("old_count".into());
7918 f["aliases"] =
7919 Value::Array(vec![Value::String("renamed_with_default".into())]);
7920 }
7921 "count_i32" => set_type_string(f, "long"),
7923 "ratio_f32" => set_type_string(f, "double"),
7924 "opt_str_nullsecond" => reverse_union_array(f),
7926 "union_enum_record_array_map" => reverse_union_array(f),
7927 "union_date_or_fixed4" => reverse_union_array(f),
7928 "union_interval_or_string" => reverse_union_array(f),
7929 "union_uuid_or_fixed10" => reverse_union_array(f),
7930 "union_map_or_array_int" => reverse_union_array(f),
7931 "maybe_auth" => reverse_nested_union_in_record(f, "token"),
7932 "arr_union" => reverse_items_union(f),
7934 "map_union" => reverse_map_values_union(f),
7935 "address" => rename_nested_field_with_alias(f, "street", "street_name"),
7937 "person" => {
7939 if let Some(tobj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7940 tobj.insert("name".to_string(), Value::String("Person".into()));
7941 tobj.insert(
7942 "namespace".to_string(),
7943 Value::String("com.example".into()),
7944 );
7945 tobj.insert(
7946 "aliases".into(),
7947 Value::Array(vec![
7948 Value::String("PersonV2".into()),
7949 Value::String("com.example.v2.PersonV2".into()),
7950 ]),
7951 );
7952 }
7953 }
7954 _ => {}
7955 }
7956 }
7957 fields.reverse();
7958 AvroSchema::new(root.to_string())
7959 }
7960
7961 let path = "test/data/comprehensive_e2e.avro";
7962 let reader_schema = make_comprehensive_reader_schema(path);
7963 let batch = read_alltypes_with_reader_schema(path, reader_schema.clone());
7964
7965 const UUID_EXT_KEY: &str = "ARROW:extension:name";
7966 const UUID_LOGICAL_KEY: &str = "logicalType";
7967
7968 let uuid_md_top: Option<HashMap<String, String>> = batch
7969 .schema()
7970 .field_with_name("uuid_str")
7971 .ok()
7972 .and_then(|f| {
7973 let md = f.metadata();
7974 let has_ext = md.get(UUID_EXT_KEY).is_some();
7975 let is_uuid_logical = md
7976 .get(UUID_LOGICAL_KEY)
7977 .map(|v| v.trim_matches('"') == "uuid")
7978 .unwrap_or(false);
7979 if has_ext || is_uuid_logical {
7980 Some(md.clone())
7981 } else {
7982 None
7983 }
7984 });
7985
7986 let uuid_md_union: Option<HashMap<String, String>> = batch
7987 .schema()
7988 .field_with_name("union_uuid_or_fixed10")
7989 .ok()
7990 .and_then(|f| match f.data_type() {
7991 DataType::Union(uf, _) => uf
7992 .iter()
7993 .find(|(_, child)| child.name() == "uuid")
7994 .and_then(|(_, child)| {
7995 let md = child.metadata();
7996 let has_ext = md.get(UUID_EXT_KEY).is_some();
7997 let is_uuid_logical = md
7998 .get(UUID_LOGICAL_KEY)
7999 .map(|v| v.trim_matches('"') == "uuid")
8000 .unwrap_or(false);
8001 if has_ext || is_uuid_logical {
8002 Some(md.clone())
8003 } else {
8004 None
8005 }
8006 }),
8007 _ => None,
8008 });
8009
8010 let add_uuid_ext_top = |f: Field| -> Field {
8011 if let Some(md) = &uuid_md_top {
8012 f.with_metadata(md.clone())
8013 } else {
8014 f
8015 }
8016 };
8017 let add_uuid_ext_union = |f: Field| -> Field {
8018 if let Some(md) = &uuid_md_union {
8019 f.with_metadata(md.clone())
8020 } else {
8021 f
8022 }
8023 };
8024
// Decodes a textual UUID into its 16 raw bytes.
//
// Hyphens are skipped wherever they appear; every other character must be a hex digit
// (either case). Digits are consumed most-significant nibble first.
//
// Panics if a non-hex character is encountered or if the input does not contain exactly
// 32 hex digits. A trailing unpaired digit is rejected rather than silently dropped, and
// over-long input fails with the same message instead of an index-out-of-bounds panic.
fn uuid16_from_str(s: &str) -> [u8; 16] {
    let mut out = [0u8; 16];
    let mut nibbles = 0usize;
    for ch in s.chars().filter(|&c| c != '-') {
        // `to_digit(16)` yields 0..=15, so narrowing to `u8` cannot truncate.
        let v = ch.to_digit(16).expect("invalid hex digit in UUID") as u8;
        assert!(nibbles < 32, "UUID must decode to 16 bytes");
        // The high nibble lands in the zeroed byte first; the low nibble then shifts it up.
        let byte = &mut out[nibbles / 2];
        *byte = (*byte << 4) | v;
        nibbles += 1;
    }
    assert_eq!(nibbles, 32, "UUID must decode to 16 bytes");
    out
}
8046
8047 fn mk_dense_union(
8048 fields: &UnionFields,
8049 type_ids: Vec<i8>,
8050 offsets: Vec<i32>,
8051 provide: impl Fn(&Field) -> Option<ArrayRef>,
8052 ) -> ArrayRef {
8053 fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
8054 match dt {
8055 DataType::Null => Arc::new(NullArray::new(0)),
8056 DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
8057 DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
8058 DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
8059 DataType::Float32 => Arc::new(Float32Array::from(Vec::<f32>::new())),
8060 DataType::Float64 => Arc::new(Float64Array::from(Vec::<f64>::new())),
8061 DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
8062 DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
8063 DataType::Date32 => Arc::new(Date32Array::from(Vec::<i32>::new())),
8064 DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
8065 Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
8066 }
8067 DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
8068 Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
8069 }
8070 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
8071 let a = TimestampMillisecondArray::from(Vec::<i64>::new());
8072 Arc::new(if let Some(tz) = tz {
8073 a.with_timezone(tz.clone())
8074 } else {
8075 a
8076 })
8077 }
8078 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
8079 let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
8080 Arc::new(if let Some(tz) = tz {
8081 a.with_timezone(tz.clone())
8082 } else {
8083 a
8084 })
8085 }
8086 DataType::Interval(IntervalUnit::MonthDayNano) => Arc::new(
8087 IntervalMonthDayNanoArray::from(Vec::<IntervalMonthDayNano>::new()),
8088 ),
8089 DataType::FixedSizeBinary(sz) => Arc::new(
8090 FixedSizeBinaryArray::try_from_sparse_iter_with_size(
8091 std::iter::empty::<Option<Vec<u8>>>(),
8092 *sz,
8093 )
8094 .unwrap(),
8095 ),
8096 DataType::Dictionary(_, _) => {
8097 let keys = Int32Array::from(Vec::<i32>::new());
8098 let values = Arc::new(StringArray::from(Vec::<&str>::new()));
8099 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
8100 }
8101 DataType::Struct(fields) => {
8102 let children: Vec<ArrayRef> = fields
8103 .iter()
8104 .map(|f| empty_child_for(f.data_type()) as ArrayRef)
8105 .collect();
8106 Arc::new(StructArray::new(fields.clone(), children, None))
8107 }
8108 DataType::List(field) => {
8109 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
8110 Arc::new(
8111 ListArray::try_new(
8112 field.clone(),
8113 offsets,
8114 empty_child_for(field.data_type()),
8115 None,
8116 )
8117 .unwrap(),
8118 )
8119 }
8120 DataType::Map(entry_field, is_sorted) => {
8121 let (key_field, val_field) = match entry_field.data_type() {
8122 DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
8123 other => panic!("unexpected map entries type: {other:?}"),
8124 };
8125 let keys = StringArray::from(Vec::<&str>::new());
8126 let vals: ArrayRef = match val_field.data_type() {
8127 DataType::Null => Arc::new(NullArray::new(0)) as ArrayRef,
8128 DataType::Boolean => {
8129 Arc::new(BooleanArray::from(Vec::<bool>::new())) as ArrayRef
8130 }
8131 DataType::Int32 => {
8132 Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
8133 }
8134 DataType::Int64 => {
8135 Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
8136 }
8137 DataType::Float32 => {
8138 Arc::new(Float32Array::from(Vec::<f32>::new())) as ArrayRef
8139 }
8140 DataType::Float64 => {
8141 Arc::new(Float64Array::from(Vec::<f64>::new())) as ArrayRef
8142 }
8143 DataType::Utf8 => {
8144 Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
8145 }
8146 DataType::Binary => {
8147 Arc::new(BinaryArray::from(Vec::<&[u8]>::new())) as ArrayRef
8148 }
8149 DataType::Union(uf, _) => {
8150 let children: Vec<ArrayRef> = uf
8151 .iter()
8152 .map(|(_, f)| empty_child_for(f.data_type()))
8153 .collect();
8154 Arc::new(
8155 UnionArray::try_new(
8156 uf.clone(),
8157 ScalarBuffer::<i8>::from(Vec::<i8>::new()),
8158 Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
8159 children,
8160 )
8161 .unwrap(),
8162 ) as ArrayRef
8163 }
8164 other => panic!("unsupported map value type: {other:?}"),
8165 };
8166 let entries = StructArray::new(
8167 Fields::from(vec![
8168 key_field.as_ref().clone(),
8169 val_field.as_ref().clone(),
8170 ]),
8171 vec![Arc::new(keys) as ArrayRef, vals],
8172 None,
8173 );
8174 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
8175 Arc::new(MapArray::new(
8176 entry_field.clone(),
8177 offsets,
8178 entries,
8179 None,
8180 *is_sorted,
8181 ))
8182 }
8183 other => panic!("empty_child_for: unhandled type {other:?}"),
8184 }
8185 }
8186 let children: Vec<ArrayRef> = fields
8187 .iter()
8188 .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
8189 .collect();
8190 Arc::new(
8191 UnionArray::try_new(
8192 fields.clone(),
8193 ScalarBuffer::<i8>::from(type_ids),
8194 Some(ScalarBuffer::<i32>::from(offsets)),
8195 children,
8196 )
8197 .unwrap(),
8198 ) as ArrayRef
8199 }
8200 let date_a: i32 = 19_000; let time_ms_a: i32 = 12 * 3_600_000 + 34 * 60_000 + 56_000 + 789;
8202 let time_us_eod: i64 = 86_400_000_000 - 1;
8203 let ts_ms_2024_01_01: i64 = 1_704_067_200_000; let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1_000;
8205 let dur_small = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
8206 let dur_zero = IntervalMonthDayNanoType::make_value(0, 0, 0);
8207 let dur_large =
8208 IntervalMonthDayNanoType::make_value(12, 31, ((86_400_000 - 1) as i64) * 1_000_000);
8209 let dur_2years = IntervalMonthDayNanoType::make_value(24, 0, 0);
8210 let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
8211 let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
8212 let item_name = Field::LIST_FIELD_DEFAULT_NAME;
8213 let uf_tri = UnionFields::try_new(
8214 vec![0, 1, 2],
8215 vec![
8216 Field::new("int", DataType::Int32, false),
8217 Field::new("string", DataType::Utf8, false),
8218 Field::new("boolean", DataType::Boolean, false),
8219 ],
8220 )
8221 .unwrap();
8222 let uf_arr_items = UnionFields::try_new(
8223 vec![0, 1, 2],
8224 vec![
8225 Field::new("null", DataType::Null, false),
8226 Field::new("string", DataType::Utf8, false),
8227 Field::new("long", DataType::Int64, false),
8228 ],
8229 )
8230 .unwrap();
8231 let arr_items_field = Arc::new(Field::new(
8232 item_name,
8233 DataType::Union(uf_arr_items.clone(), UnionMode::Dense),
8234 true,
8235 ));
8236 let uf_map_vals = UnionFields::try_new(
8237 vec![0, 1, 2],
8238 vec![
8239 Field::new("string", DataType::Utf8, false),
8240 Field::new("double", DataType::Float64, false),
8241 Field::new("null", DataType::Null, false),
8242 ],
8243 )
8244 .unwrap();
8245 let map_entries_field = Arc::new(Field::new(
8246 "entries",
8247 DataType::Struct(Fields::from(vec![
8248 Field::new("key", DataType::Utf8, false),
8249 Field::new(
8250 "value",
8251 DataType::Union(uf_map_vals.clone(), UnionMode::Dense),
8252 true,
8253 ),
8254 ])),
8255 false,
8256 ));
8257 let mut enum_md_color = {
8259 let mut m = HashMap::<String, String>::new();
8260 m.insert(
8261 crate::schema::AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
8262 serde_json::to_string(&vec!["RED", "GREEN", "BLUE"]).unwrap(),
8263 );
8264 m
8265 };
8266 enum_md_color.insert(AVRO_NAME_METADATA_KEY.to_string(), "Color".to_string());
8267 enum_md_color.insert(
8268 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8269 "org.apache.arrow.avrotests.v1.types".to_string(),
8270 );
8271 let union_rec_a_fields = Fields::from(vec![
8272 Field::new("a", DataType::Int32, false),
8273 Field::new("b", DataType::Utf8, false),
8274 ]);
8275 let union_rec_b_fields = Fields::from(vec![
8276 Field::new("x", DataType::Int64, false),
8277 Field::new("y", DataType::Binary, false),
8278 ]);
8279 let union_map_entries = Arc::new(Field::new(
8280 "entries",
8281 DataType::Struct(Fields::from(vec![
8282 Field::new("key", DataType::Utf8, false),
8283 Field::new("value", DataType::Utf8, false),
8284 ])),
8285 false,
8286 ));
8287 let rec_a_md = {
8288 let mut m = HashMap::<String, String>::new();
8289 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "RecA".to_string());
8290 m.insert(
8291 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8292 "org.apache.arrow.avrotests.v1.types".to_string(),
8293 );
8294 m
8295 };
8296 let rec_b_md = {
8297 let mut m = HashMap::<String, String>::new();
8298 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "RecB".to_string());
8299 m.insert(
8300 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8301 "org.apache.arrow.avrotests.v1.types".to_string(),
8302 );
8303 m
8304 };
8305 let uf_union_big = UnionFields::try_new(
8306 vec![0, 1, 2, 3, 4],
8307 vec![
8308 Field::new(
8309 "map",
8310 DataType::Map(union_map_entries.clone(), false),
8311 false,
8312 ),
8313 Field::new(
8314 "array",
8315 DataType::List(Arc::new(Field::new(item_name, DataType::Int64, false))),
8316 false,
8317 ),
8318 Field::new(
8319 "org.apache.arrow.avrotests.v1.types.RecB",
8320 DataType::Struct(union_rec_b_fields.clone()),
8321 false,
8322 )
8323 .with_metadata(rec_b_md.clone()),
8324 Field::new(
8325 "org.apache.arrow.avrotests.v1.types.RecA",
8326 DataType::Struct(union_rec_a_fields.clone()),
8327 false,
8328 )
8329 .with_metadata(rec_a_md.clone()),
8330 Field::new(
8331 "org.apache.arrow.avrotests.v1.types.Color",
8332 DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
8333 false,
8334 )
8335 .with_metadata(enum_md_color.clone()),
8336 ],
8337 )
8338 .unwrap();
8339 let fx4_md = {
8340 let mut m = HashMap::<String, String>::new();
8341 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx4".to_string());
8342 m.insert(
8343 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8344 "org.apache.arrow.avrotests.v1".to_string(),
8345 );
8346 m
8347 };
8348 let uf_date_fixed4 = UnionFields::try_new(
8349 vec![0, 1],
8350 vec![
8351 Field::new(
8352 "org.apache.arrow.avrotests.v1.Fx4",
8353 DataType::FixedSizeBinary(4),
8354 false,
8355 )
8356 .with_metadata(fx4_md.clone()),
8357 Field::new("date", DataType::Date32, false),
8358 ],
8359 )
8360 .unwrap();
8361 let dur12u_md = {
8362 let mut m = HashMap::<String, String>::new();
8363 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Dur12U".to_string());
8364 m.insert(
8365 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8366 "org.apache.arrow.avrotests.v1".to_string(),
8367 );
8368 m
8369 };
8370 let uf_dur_or_str = UnionFields::try_new(
8371 vec![0, 1],
8372 vec![
8373 Field::new("string", DataType::Utf8, false),
8374 Field::new(
8375 "org.apache.arrow.avrotests.v1.Dur12U",
8376 DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano),
8377 false,
8378 )
8379 .with_metadata(dur12u_md.clone()),
8380 ],
8381 )
8382 .unwrap();
8383 let fx10_md = {
8384 let mut m = HashMap::<String, String>::new();
8385 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx10".to_string());
8386 m.insert(
8387 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8388 "org.apache.arrow.avrotests.v1".to_string(),
8389 );
8390 m
8391 };
8392 let uf_uuid_or_fx10 = UnionFields::try_new(
8393 vec![0, 1],
8394 vec![
8395 Field::new(
8396 "org.apache.arrow.avrotests.v1.Fx10",
8397 DataType::FixedSizeBinary(10),
8398 false,
8399 )
8400 .with_metadata(fx10_md.clone()),
8401 add_uuid_ext_union(Field::new("uuid", DataType::FixedSizeBinary(16), false)),
8402 ],
8403 )
8404 .unwrap();
8405 let uf_kv_val = UnionFields::try_new(
8406 vec![0, 1, 2],
8407 vec![
8408 Field::new("null", DataType::Null, false),
8409 Field::new("int", DataType::Int32, false),
8410 Field::new("long", DataType::Int64, false),
8411 ],
8412 )
8413 .unwrap();
8414 let kv_fields = Fields::from(vec![
8415 Field::new("key", DataType::Utf8, false),
8416 Field::new(
8417 "val",
8418 DataType::Union(uf_kv_val.clone(), UnionMode::Dense),
8419 true,
8420 ),
8421 ]);
8422 let kv_item_field = Arc::new(Field::new(
8423 item_name,
8424 DataType::Struct(kv_fields.clone()),
8425 false,
8426 ));
8427 let map_int_entries = Arc::new(Field::new(
8428 "entries",
8429 DataType::Struct(Fields::from(vec![
8430 Field::new("key", DataType::Utf8, false),
8431 Field::new("value", DataType::Int32, false),
8432 ])),
8433 false,
8434 ));
8435 let uf_map_or_array = UnionFields::try_new(
8436 vec![0, 1],
8437 vec![
8438 Field::new(
8439 "array",
8440 DataType::List(Arc::new(Field::new(item_name, DataType::Int32, false))),
8441 false,
8442 ),
8443 Field::new("map", DataType::Map(map_int_entries.clone(), false), false),
8444 ],
8445 )
8446 .unwrap();
8447 let mut enum_md_status = {
8448 let mut m = HashMap::<String, String>::new();
8449 m.insert(
8450 crate::schema::AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
8451 serde_json::to_string(&vec!["UNKNOWN", "NEW", "PROCESSING", "DONE"]).unwrap(),
8452 );
8453 m
8454 };
8455 enum_md_status.insert(AVRO_NAME_METADATA_KEY.to_string(), "Status".to_string());
8456 enum_md_status.insert(
8457 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8458 "org.apache.arrow.avrotests.v1.types".to_string(),
8459 );
8460 let mut dec20_md = HashMap::<String, String>::new();
8461 dec20_md.insert("precision".to_string(), "20".to_string());
8462 dec20_md.insert("scale".to_string(), "4".to_string());
8463 dec20_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "DecFix20".to_string());
8464 dec20_md.insert(
8465 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8466 "org.apache.arrow.avrotests.v1.types".to_string(),
8467 );
8468 let mut dec10_md = HashMap::<String, String>::new();
8469 dec10_md.insert("precision".to_string(), "10".to_string());
8470 dec10_md.insert("scale".to_string(), "2".to_string());
8471 let fx16_top_md = {
8472 let mut m = HashMap::<String, String>::new();
8473 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx16".to_string());
8474 m.insert(
8475 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8476 "org.apache.arrow.avrotests.v1.types".to_string(),
8477 );
8478 m
8479 };
8480 let dur12_top_md = {
8481 let mut m = HashMap::<String, String>::new();
8482 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Dur12".to_string());
8483 m.insert(
8484 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8485 "org.apache.arrow.avrotests.v1.types".to_string(),
8486 );
8487 m
8488 };
8489 #[cfg(feature = "small_decimals")]
8490 let dec20_dt = DataType::Decimal128(20, 4);
8491 #[cfg(not(feature = "small_decimals"))]
8492 let dec20_dt = DataType::Decimal128(20, 4);
8493 #[cfg(feature = "small_decimals")]
8494 let dec10_dt = DataType::Decimal64(10, 2);
8495 #[cfg(not(feature = "small_decimals"))]
8496 let dec10_dt = DataType::Decimal128(10, 2);
8497 let fields: Vec<FieldRef> = vec![
8498 Arc::new(Field::new(
8499 "person",
8500 DataType::Struct(Fields::from(vec![
8501 Field::new("name", DataType::Utf8, false),
8502 Field::new("age", DataType::Int32, false),
8503 ])),
8504 false,
8505 )),
8506 Arc::new(Field::new("old_count", DataType::Int32, false)),
8507 Arc::new(Field::new(
8508 "union_map_or_array_int",
8509 DataType::Union(uf_map_or_array.clone(), UnionMode::Dense),
8510 false,
8511 )),
8512 Arc::new(Field::new(
8513 "array_records_with_union",
8514 DataType::List(kv_item_field.clone()),
8515 false,
8516 )),
8517 Arc::new(Field::new(
8518 "union_uuid_or_fixed10",
8519 DataType::Union(uf_uuid_or_fx10.clone(), UnionMode::Dense),
8520 false,
8521 )),
8522 Arc::new(Field::new(
8523 "union_interval_or_string",
8524 DataType::Union(uf_dur_or_str.clone(), UnionMode::Dense),
8525 false,
8526 )),
8527 Arc::new(Field::new(
8528 "union_date_or_fixed4",
8529 DataType::Union(uf_date_fixed4.clone(), UnionMode::Dense),
8530 false,
8531 )),
8532 Arc::new(Field::new(
8533 "union_enum_record_array_map",
8534 DataType::Union(uf_union_big.clone(), UnionMode::Dense),
8535 false,
8536 )),
8537 Arc::new(Field::new(
8538 "maybe_auth",
8539 DataType::Struct(Fields::from(vec![
8540 Field::new("user", DataType::Utf8, false),
8541 Field::new("token", DataType::Binary, true), ])),
8543 false,
8544 )),
8545 Arc::new(Field::new(
8546 "address",
8547 DataType::Struct(Fields::from(vec![
8548 Field::new("street_name", DataType::Utf8, false),
8549 Field::new("zip", DataType::Int32, false),
8550 Field::new("country", DataType::Utf8, false),
8551 ])),
8552 false,
8553 )),
8554 Arc::new(Field::new(
8555 "map_union",
8556 DataType::Map(map_entries_field.clone(), false),
8557 false,
8558 )),
8559 Arc::new(Field::new(
8560 "arr_union",
8561 DataType::List(arr_items_field.clone()),
8562 false,
8563 )),
8564 Arc::new(
8565 Field::new(
8566 "status",
8567 DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
8568 false,
8569 )
8570 .with_metadata(enum_md_status.clone()),
8571 ),
8572 Arc::new(
8573 Field::new(
8574 "interval_mdn",
8575 DataType::Interval(IntervalUnit::MonthDayNano),
8576 false,
8577 )
8578 .with_metadata(dur12_top_md.clone()),
8579 ),
8580 Arc::new(Field::new(
8581 "ts_micros_local",
8582 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None),
8583 false,
8584 )),
8585 Arc::new(Field::new(
8586 "ts_millis_local",
8587 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None),
8588 false,
8589 )),
8590 Arc::new(Field::new(
8591 "ts_micros_utc",
8592 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some("+00:00".into())),
8593 false,
8594 )),
8595 Arc::new(Field::new(
8596 "ts_millis_utc",
8597 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, Some("+00:00".into())),
8598 false,
8599 )),
8600 Arc::new(Field::new(
8601 "t_micros",
8602 DataType::Time64(arrow_schema::TimeUnit::Microsecond),
8603 false,
8604 )),
8605 Arc::new(Field::new(
8606 "t_millis",
8607 DataType::Time32(arrow_schema::TimeUnit::Millisecond),
8608 false,
8609 )),
8610 Arc::new(Field::new("d_date", DataType::Date32, false)),
8611 Arc::new(add_uuid_ext_top(Field::new(
8612 "uuid_str",
8613 DataType::FixedSizeBinary(16),
8614 false,
8615 ))),
8616 Arc::new(Field::new("dec_fix_s20_4", dec20_dt, false).with_metadata(dec20_md.clone())),
8617 Arc::new(
8618 Field::new("dec_bytes_s10_2", dec10_dt, false).with_metadata(dec10_md.clone()),
8619 ),
8620 Arc::new(
8621 Field::new("fx16_plain", DataType::FixedSizeBinary(16), false)
8622 .with_metadata(fx16_top_md.clone()),
8623 ),
8624 Arc::new(Field::new("raw_bytes", DataType::Binary, false)),
8625 Arc::new(Field::new("str_utf8", DataType::Utf8, false)),
8626 Arc::new(Field::new(
8627 "tri_union_prim",
8628 DataType::Union(uf_tri.clone(), UnionMode::Dense),
8629 false,
8630 )),
8631 Arc::new(Field::new("opt_str_nullsecond", DataType::Utf8, true)),
8632 Arc::new(Field::new("opt_i32_nullfirst", DataType::Int32, true)),
8633 Arc::new(Field::new("count_i64", DataType::Int64, false)),
8634 Arc::new(Field::new("count_i32", DataType::Int64, false)),
8635 Arc::new(Field::new("ratio_f64", DataType::Float64, false)),
8636 Arc::new(Field::new("ratio_f32", DataType::Float64, false)),
8637 Arc::new(Field::new("flag", DataType::Boolean, false)),
8638 Arc::new(Field::new("identifier", DataType::Int64, false)),
8639 ];
8640 let expected_schema = Arc::new(arrow_schema::Schema::new(Fields::from(fields)));
8641 let mut cols: Vec<ArrayRef> = vec![
8642 Arc::new(StructArray::new(
8643 match expected_schema
8644 .field_with_name("person")
8645 .unwrap()
8646 .data_type()
8647 {
8648 DataType::Struct(fs) => fs.clone(),
8649 _ => unreachable!(),
8650 },
8651 vec![
8652 Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "Dave"])) as ArrayRef,
8653 Arc::new(Int32Array::from(vec![30, 0, 25, 41])) as ArrayRef,
8654 ],
8655 None,
8656 )) as ArrayRef,
8657 Arc::new(Int32Array::from(vec![100, 42, 7, 42])) as ArrayRef,
8658 ];
8659 {
8660 let map_child: ArrayRef = {
8661 let keys = StringArray::from(vec!["x", "y", "only"]);
8662 let vals = Int32Array::from(vec![1, 2, 10]);
8663 let entries = StructArray::new(
8664 Fields::from(vec![
8665 Field::new("key", DataType::Utf8, false),
8666 Field::new("value", DataType::Int32, false),
8667 ]),
8668 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
8669 None,
8670 );
8671 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
8672 Arc::new(MapArray::new(
8673 map_int_entries.clone(),
8674 moff,
8675 entries,
8676 None,
8677 false,
8678 )) as ArrayRef
8679 };
8680 let list_child: ArrayRef = {
8681 let values = Int32Array::from(vec![1, 2, 3, 0]);
8682 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4]));
8683 Arc::new(
8684 ListArray::try_new(
8685 Arc::new(Field::new(item_name, DataType::Int32, false)),
8686 offsets,
8687 Arc::new(values),
8688 None,
8689 )
8690 .unwrap(),
8691 ) as ArrayRef
8692 };
8693 let tids = vec![1, 0, 1, 0];
8694 let offs = vec![0, 0, 1, 1];
8695 let arr = mk_dense_union(&uf_map_or_array, tids, offs, |f| match f.name().as_str() {
8696 "array" => Some(list_child.clone()),
8697 "map" => Some(map_child.clone()),
8698 _ => None,
8699 });
8700 cols.push(arr);
8701 }
8702 {
8703 let keys = Arc::new(StringArray::from(vec!["k1", "k2", "k", "k3", "x"])) as ArrayRef;
8704 let type_ids = vec![1, 0, 2, 0, 1];
8705 let offsets = vec![0, 0, 0, 1, 1];
8706 let vals = mk_dense_union(&uf_kv_val, type_ids, offsets, |f| match f.data_type() {
8707 DataType::Int32 => Some(Arc::new(Int32Array::from(vec![5, -5])) as ArrayRef),
8708 DataType::Int64 => Some(Arc::new(Int64Array::from(vec![99i64])) as ArrayRef),
8709 DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
8710 _ => None,
8711 });
8712 let values_struct =
8713 Arc::new(StructArray::new(kv_fields.clone(), vec![keys, vals], None));
8714 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5]));
8715 let arr = Arc::new(
8716 ListArray::try_new(kv_item_field.clone(), list_offsets, values_struct, None)
8717 .unwrap(),
8718 ) as ArrayRef;
8719 cols.push(arr);
8720 }
8721 {
8722 let type_ids = vec![1, 0, 1, 0]; let offs = vec![0, 0, 1, 1];
8724 let arr = mk_dense_union(&uf_uuid_or_fx10, type_ids, offs, |f| match f.data_type() {
8725 DataType::FixedSizeBinary(16) => {
8726 let it = [Some(uuid1), Some(uuid2)].into_iter();
8727 Some(Arc::new(
8728 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
8729 ) as ArrayRef)
8730 }
8731 DataType::FixedSizeBinary(10) => {
8732 let fx10_a = [0xAAu8; 10];
8733 let fx10_b = [0x00u8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99];
8734 let it = [Some(fx10_a), Some(fx10_b)].into_iter();
8735 Some(Arc::new(
8736 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
8737 ) as ArrayRef)
8738 }
8739 _ => None,
8740 });
8741 cols.push(arr);
8742 }
8743 {
8744 let type_ids = vec![1, 0, 1, 0]; let offs = vec![0, 0, 1, 1];
8746 let arr = mk_dense_union(&uf_dur_or_str, type_ids, offs, |f| match f.data_type() {
8747 DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano) => Some(Arc::new(
8748 IntervalMonthDayNanoArray::from(vec![dur_small, dur_large]),
8749 )
8750 as ArrayRef),
8751 DataType::Utf8 => Some(Arc::new(StringArray::from(vec![
8752 "duration-as-text",
8753 "iso-8601-period-P1Y",
8754 ])) as ArrayRef),
8755 _ => None,
8756 });
8757 cols.push(arr);
8758 }
8759 {
8760 let type_ids = vec![1, 0, 1, 0]; let offs = vec![0, 0, 1, 1];
8762 let arr = mk_dense_union(&uf_date_fixed4, type_ids, offs, |f| match f.data_type() {
8763 DataType::Date32 => Some(Arc::new(Date32Array::from(vec![date_a, 0])) as ArrayRef),
8764 DataType::FixedSizeBinary(4) => {
8765 let it = [Some(*b"\x00\x11\x22\x33"), Some(*b"ABCD")].into_iter();
8766 Some(Arc::new(
8767 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
8768 ) as ArrayRef)
8769 }
8770 _ => None,
8771 });
8772 cols.push(arr);
8773 }
8774 {
8775 let tids = vec![4, 3, 1, 0]; let offs = vec![0, 0, 0, 0];
8777 let arr = mk_dense_union(&uf_union_big, tids, offs, |f| match f.data_type() {
8778 DataType::Dictionary(_, _) => {
8779 let keys = Int32Array::from(vec![0i32]);
8780 let values =
8781 Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
8782 Some(
8783 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
8784 as ArrayRef,
8785 )
8786 }
8787 DataType::Struct(fs) if fs == &union_rec_a_fields => {
8788 let a = Int32Array::from(vec![7]);
8789 let b = StringArray::from(vec!["rec"]);
8790 Some(Arc::new(StructArray::new(
8791 fs.clone(),
8792 vec![Arc::new(a) as ArrayRef, Arc::new(b) as ArrayRef],
8793 None,
8794 )) as ArrayRef)
8795 }
8796 DataType::List(_) => {
8797 let values = Int64Array::from(vec![1i64, 2, 3]);
8798 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
8799 Some(Arc::new(
8800 ListArray::try_new(
8801 Arc::new(Field::new(item_name, DataType::Int64, false)),
8802 offsets,
8803 Arc::new(values),
8804 None,
8805 )
8806 .unwrap(),
8807 ) as ArrayRef)
8808 }
8809 DataType::Map(_, _) => {
8810 let keys = StringArray::from(vec!["k"]);
8811 let vals = StringArray::from(vec!["v"]);
8812 let entries = StructArray::new(
8813 Fields::from(vec![
8814 Field::new("key", DataType::Utf8, false),
8815 Field::new("value", DataType::Utf8, false),
8816 ]),
8817 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
8818 None,
8819 );
8820 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1]));
8821 Some(Arc::new(MapArray::new(
8822 union_map_entries.clone(),
8823 moff,
8824 entries,
8825 None,
8826 false,
8827 )) as ArrayRef)
8828 }
8829 _ => None,
8830 });
8831 cols.push(arr);
8832 }
8833 {
8834 let fs = match expected_schema
8835 .field_with_name("maybe_auth")
8836 .unwrap()
8837 .data_type()
8838 {
8839 DataType::Struct(fs) => fs.clone(),
8840 _ => unreachable!(),
8841 };
8842 let user =
8843 Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])) as ArrayRef;
8844 let token_values: Vec<Option<&[u8]>> = vec![
8845 None,
8846 Some(b"\x01\x02\x03".as_ref()),
8847 None,
8848 Some(b"".as_ref()),
8849 ];
8850 let token = Arc::new(BinaryArray::from(token_values)) as ArrayRef;
8851 cols.push(Arc::new(StructArray::new(fs, vec![user, token], None)) as ArrayRef);
8852 }
8853 {
8854 let fs = match expected_schema
8855 .field_with_name("address")
8856 .unwrap()
8857 .data_type()
8858 {
8859 DataType::Struct(fs) => fs.clone(),
8860 _ => unreachable!(),
8861 };
8862 let street = Arc::new(StringArray::from(vec![
8863 "100 Main",
8864 "",
8865 "42 Galaxy Way",
8866 "End Ave",
8867 ])) as ArrayRef;
8868 let zip = Arc::new(Int32Array::from(vec![12345, 0, 42424, 1])) as ArrayRef;
8869 let country = Arc::new(StringArray::from(vec!["US", "CA", "US", "GB"])) as ArrayRef;
8870 cols.push(Arc::new(StructArray::new(fs, vec![street, zip, country], None)) as ArrayRef);
8871 }
8872 {
8873 let keys = StringArray::from(vec!["a", "b", "c", "neg", "pi", "ok"]);
8874 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4, 4, 6]));
8875 let tid_s = 0; let tid_d = 1; let tid_n = 2; let type_ids = vec![tid_d, tid_n, tid_s, tid_d, tid_d, tid_s];
8879 let offsets = vec![0, 0, 0, 1, 2, 1];
8880 let pi_5dp = (std::f64::consts::PI * 100_000.0).trunc() / 100_000.0;
8881 let vals = mk_dense_union(&uf_map_vals, type_ids, offsets, |f| match f.data_type() {
8882 DataType::Float64 => {
8883 Some(Arc::new(Float64Array::from(vec![1.5f64, -0.5, pi_5dp])) as ArrayRef)
8884 }
8885 DataType::Utf8 => {
8886 Some(Arc::new(StringArray::from(vec!["yes", "true"])) as ArrayRef)
8887 }
8888 DataType::Null => Some(Arc::new(NullArray::new(1)) as ArrayRef),
8889 _ => None,
8890 });
8891 let entries = StructArray::new(
8892 Fields::from(vec![
8893 Field::new("key", DataType::Utf8, false),
8894 Field::new(
8895 "value",
8896 DataType::Union(uf_map_vals.clone(), UnionMode::Dense),
8897 true,
8898 ),
8899 ]),
8900 vec![Arc::new(keys) as ArrayRef, vals],
8901 None,
8902 );
8903 let map = Arc::new(MapArray::new(
8904 map_entries_field.clone(),
8905 moff,
8906 entries,
8907 None,
8908 false,
8909 )) as ArrayRef;
8910 cols.push(map);
8911 }
8912 {
8913 let type_ids = vec![
8914 2, 1, 0, 2, 0, 1, 2, 2, 1, 0,
8915 2, ];
8917 let offsets = vec![0, 0, 0, 1, 1, 1, 2, 3, 2, 2, 4];
8918 let values =
8919 mk_dense_union(&uf_arr_items, type_ids, offsets, |f| match f.data_type() {
8920 DataType::Int64 => {
8921 Some(Arc::new(Int64Array::from(vec![1i64, -3, 0, -1, 0])) as ArrayRef)
8922 }
8923 DataType::Utf8 => {
8924 Some(Arc::new(StringArray::from(vec!["x", "z", "end"])) as ArrayRef)
8925 }
8926 DataType::Null => Some(Arc::new(NullArray::new(3)) as ArrayRef),
8927 _ => None,
8928 });
8929 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 4, 7, 8, 11]));
8930 let arr = Arc::new(
8931 ListArray::try_new(arr_items_field.clone(), list_offsets, values, None).unwrap(),
8932 ) as ArrayRef;
8933 cols.push(arr);
8934 }
8935 {
8936 let keys = Int32Array::from(vec![1, 2, 3, 0]); let values = Arc::new(StringArray::from(vec![
8938 "UNKNOWN",
8939 "NEW",
8940 "PROCESSING",
8941 "DONE",
8942 ])) as ArrayRef;
8943 let dict = DictionaryArray::<Int32Type>::try_new(keys, values).unwrap();
8944 cols.push(Arc::new(dict) as ArrayRef);
8945 }
8946 cols.push(Arc::new(IntervalMonthDayNanoArray::from(vec![
8947 dur_small, dur_zero, dur_large, dur_2years,
8948 ])) as ArrayRef);
8949 cols.push(Arc::new(TimestampMicrosecondArray::from(vec![
8950 ts_us_2024_01_01 + 123_456,
8951 0,
8952 ts_us_2024_01_01 + 101_112,
8953 987_654_321,
8954 ])) as ArrayRef);
8955 cols.push(Arc::new(TimestampMillisecondArray::from(vec![
8956 ts_ms_2024_01_01 + 86_400_000,
8957 0,
8958 ts_ms_2024_01_01 + 789,
8959 123_456_789,
8960 ])) as ArrayRef);
8961 {
8962 let a = TimestampMicrosecondArray::from(vec![
8963 ts_us_2024_01_01,
8964 1,
8965 ts_us_2024_01_01 + 456,
8966 0,
8967 ])
8968 .with_timezone("+00:00");
8969 cols.push(Arc::new(a) as ArrayRef);
8970 }
8971 {
8972 let a = TimestampMillisecondArray::from(vec![
8973 ts_ms_2024_01_01,
8974 -1,
8975 ts_ms_2024_01_01 + 123,
8976 0,
8977 ])
8978 .with_timezone("+00:00");
8979 cols.push(Arc::new(a) as ArrayRef);
8980 }
8981 cols.push(Arc::new(Time64MicrosecondArray::from(vec![
8982 time_us_eod,
8983 0,
8984 1,
8985 1_000_000,
8986 ])) as ArrayRef);
8987 cols.push(Arc::new(Time32MillisecondArray::from(vec![
8988 time_ms_a,
8989 0,
8990 1,
8991 86_400_000 - 1,
8992 ])) as ArrayRef);
8993 cols.push(Arc::new(Date32Array::from(vec![date_a, 0, 1, 365])) as ArrayRef);
8994 {
8995 let it = [Some(uuid1), Some(uuid2), Some(uuid1), Some(uuid2)].into_iter();
8996 cols.push(Arc::new(
8997 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
8998 ) as ArrayRef);
8999 }
9000 {
9001 #[cfg(feature = "small_decimals")]
9002 let arr = Arc::new(
9003 Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
9004 .with_precision_and_scale(20, 4)
9005 .unwrap(),
9006 ) as ArrayRef;
9007 #[cfg(not(feature = "small_decimals"))]
9008 let arr = Arc::new(
9009 Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
9010 .with_precision_and_scale(20, 4)
9011 .unwrap(),
9012 ) as ArrayRef;
9013 cols.push(arr);
9014 }
9015 {
9016 #[cfg(feature = "small_decimals")]
9017 let arr = Arc::new(
9018 Decimal64Array::from_iter_values([123456i64, -1, 0, 9_999_999_999i64])
9019 .with_precision_and_scale(10, 2)
9020 .unwrap(),
9021 ) as ArrayRef;
9022 #[cfg(not(feature = "small_decimals"))]
9023 let arr = Arc::new(
9024 Decimal128Array::from_iter_values([123456i128, -1, 0, 9_999_999_999i128])
9025 .with_precision_and_scale(10, 2)
9026 .unwrap(),
9027 ) as ArrayRef;
9028 cols.push(arr);
9029 }
9030 {
9031 let it = [
9032 Some(*b"0123456789ABCDEF"),
9033 Some([0u8; 16]),
9034 Some(*b"ABCDEFGHIJKLMNOP"),
9035 Some([0xAA; 16]),
9036 ]
9037 .into_iter();
9038 cols.push(Arc::new(
9039 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
9040 ) as ArrayRef);
9041 }
9042 cols.push(Arc::new(BinaryArray::from(vec![
9043 b"\x00\x01".as_ref(),
9044 b"".as_ref(),
9045 b"\xFF\x00".as_ref(),
9046 b"\x10\x20\x30\x40".as_ref(),
9047 ])) as ArrayRef);
9048 cols.push(Arc::new(StringArray::from(vec!["hello", "", "world", "✓ unicode"])) as ArrayRef);
9049 {
9050 let tids = vec![0, 1, 2, 1];
9051 let offs = vec![0, 0, 0, 1];
9052 let arr = mk_dense_union(&uf_tri, tids, offs, |f| match f.data_type() {
9053 DataType::Int32 => Some(Arc::new(Int32Array::from(vec![0])) as ArrayRef),
9054 DataType::Utf8 => Some(Arc::new(StringArray::from(vec!["hi", ""])) as ArrayRef),
9055 DataType::Boolean => Some(Arc::new(BooleanArray::from(vec![true])) as ArrayRef),
9056 _ => None,
9057 });
9058 cols.push(arr);
9059 }
9060 cols.push(Arc::new(StringArray::from(vec![
9061 Some("alpha"),
9062 None,
9063 Some("s3"),
9064 Some(""),
9065 ])) as ArrayRef);
9066 cols.push(Arc::new(Int32Array::from(vec![None, Some(42), None, Some(0)])) as ArrayRef);
9067 cols.push(Arc::new(Int64Array::from(vec![
9068 7_000_000_000i64,
9069 -2,
9070 0,
9071 -9_876_543_210i64,
9072 ])) as ArrayRef);
9073 cols.push(Arc::new(Int64Array::from(vec![7i64, -1, 0, 123])) as ArrayRef);
9074 cols.push(Arc::new(Float64Array::from(vec![2.5f64, -1.0, 7.0, -2.25])) as ArrayRef);
9075 cols.push(Arc::new(Float64Array::from(vec![1.25f64, -0.0, 3.5, 9.75])) as ArrayRef);
9076 cols.push(Arc::new(BooleanArray::from(vec![true, false, true, false])) as ArrayRef);
9077 cols.push(Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef);
9078 let expected = RecordBatch::try_new(expected_schema, cols).unwrap();
9079 assert_eq!(
9080 expected, batch,
9081 "entire RecordBatch mismatch (schema, all columns, all rows)"
9082 );
9083 }
9084}