1use crate::codec::AvroFieldBuilder;
483use crate::reader::header::read_header;
484use crate::schema::{
485 AvroSchema, CONFLUENT_MAGIC, Fingerprint, FingerprintAlgorithm, SINGLE_OBJECT_MAGIC, Schema,
486 SchemaStore,
487};
488use arrow_array::{RecordBatch, RecordBatchReader};
489use arrow_schema::{ArrowError, SchemaRef};
490use block::BlockDecoder;
491use header::Header;
492use indexmap::IndexMap;
493use record::RecordDecoder;
494use std::io::BufRead;
495
496mod block;
497mod cursor;
498mod header;
499mod record;
500mod vlq;
501
502fn is_incomplete_data(err: &ArrowError) -> bool {
503 matches!(
504 err,
505 ArrowError::ParseError(msg)
506 if msg.contains("Unexpected EOF")
507 )
508}
509
510#[derive(Debug)]
633pub struct Decoder {
634 active_decoder: RecordDecoder,
635 active_fingerprint: Option<Fingerprint>,
636 batch_size: usize,
637 remaining_capacity: usize,
638 cache: IndexMap<Fingerprint, RecordDecoder>,
639 fingerprint_algorithm: FingerprintAlgorithm,
640 pending_schema: Option<(Fingerprint, RecordDecoder)>,
641 awaiting_body: bool,
642}
643
644impl Decoder {
645 pub fn schema(&self) -> SchemaRef {
650 self.active_decoder.schema().clone()
651 }
652
653 pub fn batch_size(&self) -> usize {
655 self.batch_size
656 }
657
658 pub fn decode(&mut self, data: &[u8]) -> Result<usize, ArrowError> {
679 let mut total_consumed = 0usize;
680 while total_consumed < data.len() && self.remaining_capacity > 0 {
681 if self.awaiting_body {
682 match self.active_decoder.decode(&data[total_consumed..], 1) {
683 Ok(n) => {
684 self.remaining_capacity -= 1;
685 total_consumed += n;
686 self.awaiting_body = false;
687 continue;
688 }
689 Err(ref e) if is_incomplete_data(e) => break,
690 err => return err,
691 };
692 }
693 match self.handle_prefix(&data[total_consumed..])? {
694 Some(0) => break, Some(n) => {
696 total_consumed += n;
697 self.apply_pending_schema_if_batch_empty();
698 self.awaiting_body = true;
699 }
700 None => {
701 return Err(ArrowError::ParseError(
702 "Missing magic bytes and fingerprint".to_string(),
703 ));
704 }
705 }
706 }
707 Ok(total_consumed)
708 }
709
710 fn handle_prefix(&mut self, buf: &[u8]) -> Result<Option<usize>, ArrowError> {
715 match self.fingerprint_algorithm {
716 FingerprintAlgorithm::Rabin => {
717 self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
718 Fingerprint::Rabin(u64::from_le_bytes(bytes))
719 })
720 }
721 FingerprintAlgorithm::Id => self.handle_prefix_common(buf, &CONFLUENT_MAGIC, |bytes| {
722 Fingerprint::Id(u32::from_be_bytes(bytes))
723 }),
724 FingerprintAlgorithm::Id64 => {
725 self.handle_prefix_common(buf, &CONFLUENT_MAGIC, |bytes| {
726 Fingerprint::Id64(u64::from_be_bytes(bytes))
727 })
728 }
729 #[cfg(feature = "md5")]
730 FingerprintAlgorithm::MD5 => {
731 self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
732 Fingerprint::MD5(bytes)
733 })
734 }
735 #[cfg(feature = "sha256")]
736 FingerprintAlgorithm::SHA256 => {
737 self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
738 Fingerprint::SHA256(bytes)
739 })
740 }
741 }
742 }
743
744 fn handle_prefix_common<const MAGIC_LEN: usize, const N: usize>(
748 &mut self,
749 buf: &[u8],
750 magic: &[u8; MAGIC_LEN],
751 fingerprint_from: impl FnOnce([u8; N]) -> Fingerprint,
752 ) -> Result<Option<usize>, ArrowError> {
753 if buf.len() < MAGIC_LEN {
756 return Ok(Some(0));
757 }
758 if &buf[..MAGIC_LEN] != magic {
760 return Ok(None);
761 }
762 let consumed_fp = self.handle_fingerprint(&buf[MAGIC_LEN..], fingerprint_from)?;
764 Ok(Some(consumed_fp.map_or(0, |n| n + MAGIC_LEN)))
767 }
768
769 fn handle_fingerprint<const N: usize>(
774 &mut self,
775 buf: &[u8],
776 fingerprint_from: impl FnOnce([u8; N]) -> Fingerprint,
777 ) -> Result<Option<usize>, ArrowError> {
778 let Some(fingerprint_bytes) = buf.get(..N) else {
780 return Ok(None); };
782 let new_fingerprint = fingerprint_from(fingerprint_bytes.try_into().unwrap());
784 if self.active_fingerprint != Some(new_fingerprint) {
786 let Some(new_decoder) = self.cache.shift_remove(&new_fingerprint) else {
787 return Err(ArrowError::ParseError(format!(
788 "Unknown fingerprint: {new_fingerprint:?}"
789 )));
790 };
791 self.pending_schema = Some((new_fingerprint, new_decoder));
792 if self.remaining_capacity < self.batch_size {
795 self.remaining_capacity = 0;
796 }
797 }
798 Ok(Some(N))
799 }
800
801 fn apply_pending_schema(&mut self) {
802 if let Some((new_fingerprint, new_decoder)) = self.pending_schema.take() {
803 if let Some(old_fingerprint) = self.active_fingerprint.replace(new_fingerprint) {
804 let old_decoder = std::mem::replace(&mut self.active_decoder, new_decoder);
805 self.cache.shift_remove(&old_fingerprint);
806 self.cache.insert(old_fingerprint, old_decoder);
807 } else {
808 self.active_decoder = new_decoder;
809 }
810 }
811 }
812
813 fn apply_pending_schema_if_batch_empty(&mut self) {
814 if self.batch_is_empty() {
815 self.apply_pending_schema();
816 }
817 }
818
819 fn flush_and_reset(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
820 if self.batch_is_empty() {
821 return Ok(None);
822 }
823 let batch = self.active_decoder.flush()?;
824 self.remaining_capacity = self.batch_size;
825 Ok(Some(batch))
826 }
827
828 pub fn flush(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
835 let batch = self.flush_and_reset();
837 self.apply_pending_schema();
838 batch
839 }
840
841 pub fn capacity(&self) -> usize {
843 self.remaining_capacity
844 }
845
846 pub fn batch_is_full(&self) -> bool {
848 self.remaining_capacity == 0
849 }
850
851 pub fn batch_is_empty(&self) -> bool {
853 self.remaining_capacity == self.batch_size
854 }
855
856 fn decode_block(&mut self, data: &[u8], count: usize) -> Result<(usize, usize), ArrowError> {
860 let to_decode = std::cmp::min(count, self.remaining_capacity);
862 if to_decode == 0 {
863 return Ok((0, 0));
864 }
865 let consumed = self.active_decoder.decode(data, to_decode)?;
866 self.remaining_capacity -= to_decode;
867 Ok((consumed, to_decode))
868 }
869
870 fn flush_block(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
873 self.flush_and_reset()
874 }
875}
876
877#[derive(Debug)]
929pub struct ReaderBuilder {
930 batch_size: usize,
931 strict_mode: bool,
932 utf8_view: bool,
933 reader_schema: Option<AvroSchema>,
934 writer_schema_store: Option<SchemaStore>,
935 active_fingerprint: Option<Fingerprint>,
936}
937
938impl Default for ReaderBuilder {
939 fn default() -> Self {
940 Self {
941 batch_size: 1024,
942 strict_mode: false,
943 utf8_view: false,
944 reader_schema: None,
945 writer_schema_store: None,
946 active_fingerprint: None,
947 }
948 }
949}
950
951impl ReaderBuilder {
952 pub fn new() -> Self {
961 Self::default()
962 }
963
964 fn make_record_decoder(
965 &self,
966 writer_schema: &Schema,
967 reader_schema: Option<&Schema>,
968 ) -> Result<RecordDecoder, ArrowError> {
969 let mut builder = AvroFieldBuilder::new(writer_schema);
970 if let Some(reader_schema) = reader_schema {
971 builder = builder.with_reader_schema(reader_schema);
972 }
973 let root = builder
974 .with_utf8view(self.utf8_view)
975 .with_strict_mode(self.strict_mode)
976 .build()?;
977 RecordDecoder::try_new_with_options(root.data_type())
978 }
979
980 fn make_record_decoder_from_schemas(
981 &self,
982 writer_schema: &Schema,
983 reader_schema: Option<&AvroSchema>,
984 ) -> Result<RecordDecoder, ArrowError> {
985 let reader_schema_raw = reader_schema.map(|s| s.schema()).transpose()?;
986 self.make_record_decoder(writer_schema, reader_schema_raw.as_ref())
987 }
988
989 fn make_decoder_with_parts(
990 &self,
991 active_decoder: RecordDecoder,
992 active_fingerprint: Option<Fingerprint>,
993 cache: IndexMap<Fingerprint, RecordDecoder>,
994 fingerprint_algorithm: FingerprintAlgorithm,
995 ) -> Decoder {
996 Decoder {
997 batch_size: self.batch_size,
998 remaining_capacity: self.batch_size,
999 active_fingerprint,
1000 active_decoder,
1001 cache,
1002 fingerprint_algorithm,
1003 pending_schema: None,
1004 awaiting_body: false,
1005 }
1006 }
1007
1008 fn make_decoder(
1009 &self,
1010 header: Option<&Header>,
1011 reader_schema: Option<&AvroSchema>,
1012 ) -> Result<Decoder, ArrowError> {
1013 if let Some(hdr) = header {
1014 let writer_schema = hdr
1015 .schema()
1016 .map_err(|e| ArrowError::ExternalError(Box::new(e)))?
1017 .ok_or_else(|| {
1018 ArrowError::ParseError("No Avro schema present in file header".into())
1019 })?;
1020 let record_decoder =
1021 self.make_record_decoder_from_schemas(&writer_schema, reader_schema)?;
1022 return Ok(self.make_decoder_with_parts(
1023 record_decoder,
1024 None,
1025 IndexMap::new(),
1026 FingerprintAlgorithm::Rabin,
1027 ));
1028 }
1029 let store = self.writer_schema_store.as_ref().ok_or_else(|| {
1030 ArrowError::ParseError("Writer schema store required for raw Avro".into())
1031 })?;
1032 let fingerprints = store.fingerprints();
1033 if fingerprints.is_empty() {
1034 return Err(ArrowError::ParseError(
1035 "Writer schema store must contain at least one schema".into(),
1036 ));
1037 }
1038 let start_fingerprint = self
1039 .active_fingerprint
1040 .or_else(|| fingerprints.first().copied())
1041 .ok_or_else(|| {
1042 ArrowError::ParseError("Could not determine initial schema fingerprint".into())
1043 })?;
1044 let mut cache = IndexMap::with_capacity(fingerprints.len().saturating_sub(1));
1045 let mut active_decoder: Option<RecordDecoder> = None;
1046 for fingerprint in store.fingerprints() {
1047 let avro_schema = match store.lookup(&fingerprint) {
1048 Some(schema) => schema,
1049 None => {
1050 return Err(ArrowError::ComputeError(format!(
1051 "Fingerprint {fingerprint:?} not found in schema store",
1052 )));
1053 }
1054 };
1055 let writer_schema = avro_schema.schema()?;
1056 let record_decoder =
1057 self.make_record_decoder_from_schemas(&writer_schema, reader_schema)?;
1058 if fingerprint == start_fingerprint {
1059 active_decoder = Some(record_decoder);
1060 } else {
1061 cache.insert(fingerprint, record_decoder);
1062 }
1063 }
1064 let active_decoder = active_decoder.ok_or_else(|| {
1065 ArrowError::ComputeError(format!(
1066 "Initial fingerprint {start_fingerprint:?} not found in schema store"
1067 ))
1068 })?;
1069 Ok(self.make_decoder_with_parts(
1070 active_decoder,
1071 Some(start_fingerprint),
1072 cache,
1073 store.fingerprint_algorithm(),
1074 ))
1075 }
1076
1077 pub fn with_batch_size(mut self, batch_size: usize) -> Self {
1083 self.batch_size = batch_size;
1084 self
1085 }
1086
1087 pub fn with_utf8_view(mut self, utf8_view: bool) -> Self {
1093 self.utf8_view = utf8_view;
1094 self
1095 }
1096
1097 pub fn use_utf8view(&self) -> bool {
1099 self.utf8_view
1100 }
1101
1102 pub fn with_strict_mode(mut self, strict_mode: bool) -> Self {
1107 self.strict_mode = strict_mode;
1108 self
1109 }
1110
1111 pub fn with_reader_schema(mut self, schema: AvroSchema) -> Self {
1118 self.reader_schema = Some(schema);
1119 self
1120 }
1121
1122 pub fn with_writer_schema_store(mut self, store: SchemaStore) -> Self {
1130 self.writer_schema_store = Some(store);
1131 self
1132 }
1133
1134 pub fn with_active_fingerprint(mut self, fp: Fingerprint) -> Self {
1139 self.active_fingerprint = Some(fp);
1140 self
1141 }
1142
1143 pub fn build<R: BufRead>(self, mut reader: R) -> Result<Reader<R>, ArrowError> {
1149 let header = read_header(&mut reader)?;
1150 let decoder = self.make_decoder(Some(&header), self.reader_schema.as_ref())?;
1151 Ok(Reader {
1152 reader,
1153 header,
1154 decoder,
1155 block_decoder: BlockDecoder::default(),
1156 block_data: Vec::new(),
1157 block_count: 0,
1158 block_cursor: 0,
1159 finished: false,
1160 })
1161 }
1162
1163 pub fn build_decoder(self) -> Result<Decoder, ArrowError> {
1172 if self.writer_schema_store.is_none() {
1173 return Err(ArrowError::InvalidArgumentError(
1174 "Building a decoder requires a writer schema store".to_string(),
1175 ));
1176 }
1177 self.make_decoder(None, self.reader_schema.as_ref())
1178 }
1179}
1180
1181#[derive(Debug)]
1191pub struct Reader<R: BufRead> {
1192 reader: R,
1193 header: Header,
1194 decoder: Decoder,
1195 block_decoder: BlockDecoder,
1196 block_data: Vec<u8>,
1197 block_count: usize,
1198 block_cursor: usize,
1199 finished: bool,
1200}
1201
1202impl<R: BufRead> Reader<R> {
1203 pub fn schema(&self) -> SchemaRef {
1206 self.decoder.schema()
1207 }
1208
1209 pub fn avro_header(&self) -> &Header {
1211 &self.header
1212 }
1213
1214 fn read(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
1219 'outer: while !self.finished && !self.decoder.batch_is_full() {
1220 while self.block_cursor == self.block_data.len() {
1221 let buf = self.reader.fill_buf()?;
1222 if buf.is_empty() {
1223 self.finished = true;
1224 break 'outer;
1225 }
1226 let consumed = self.block_decoder.decode(buf)?;
1228 self.reader.consume(consumed);
1229 if let Some(block) = self.block_decoder.flush() {
1230 self.block_data = if let Some(ref codec) = self.header.compression()? {
1232 codec.decompress(&block.data)?
1233 } else {
1234 block.data
1235 };
1236 self.block_count = block.count;
1237 self.block_cursor = 0;
1238 } else if consumed == 0 {
1239 return Err(ArrowError::ParseError(
1241 "Could not decode next Avro block from partial data".to_string(),
1242 ));
1243 }
1244 }
1245 if self.block_cursor < self.block_data.len() {
1247 let (consumed, records_decoded) = self
1248 .decoder
1249 .decode_block(&self.block_data[self.block_cursor..], self.block_count)?;
1250 self.block_cursor += consumed;
1251 self.block_count -= records_decoded;
1252 }
1253 }
1254 self.decoder.flush_block()
1255 }
1256}
1257
1258impl<R: BufRead> Iterator for Reader<R> {
1259 type Item = Result<RecordBatch, ArrowError>;
1260
1261 fn next(&mut self) -> Option<Self::Item> {
1262 self.read().transpose()
1263 }
1264}
1265
1266impl<R: BufRead> RecordBatchReader for Reader<R> {
1267 fn schema(&self) -> SchemaRef {
1268 self.schema()
1269 }
1270}
1271
1272#[cfg(test)]
1273mod test {
1274 use crate::codec::AvroFieldBuilder;
1275 use crate::reader::record::RecordDecoder;
1276 use crate::reader::{Decoder, Reader, ReaderBuilder};
1277 use crate::schema::{
1278 AVRO_ENUM_SYMBOLS_METADATA_KEY, AVRO_NAME_METADATA_KEY, AVRO_NAMESPACE_METADATA_KEY,
1279 AvroSchema, CONFLUENT_MAGIC, Fingerprint, FingerprintAlgorithm, PrimitiveType,
1280 SINGLE_OBJECT_MAGIC, SchemaStore,
1281 };
1282 use crate::test_util::arrow_test_data;
1283 use crate::writer::AvroWriter;
1284 use arrow_array::builder::{
1285 ArrayBuilder, BooleanBuilder, Float32Builder, Int32Builder, Int64Builder, ListBuilder,
1286 MapBuilder, StringBuilder, StructBuilder,
1287 };
1288 #[cfg(feature = "snappy")]
1289 use arrow_array::builder::{Float64Builder, MapFieldNames};
1290 use arrow_array::cast::AsArray;
1291 #[cfg(not(feature = "avro_custom_types"))]
1292 use arrow_array::types::Int64Type;
1293 #[cfg(feature = "avro_custom_types")]
1294 use arrow_array::types::{
1295 DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
1296 DurationSecondType,
1297 };
1298 use arrow_array::types::{Int32Type, IntervalMonthDayNanoType};
1299 use arrow_array::*;
1300 #[cfg(feature = "snappy")]
1301 use arrow_buffer::{Buffer, NullBuffer};
1302 use arrow_buffer::{IntervalMonthDayNano, OffsetBuffer, ScalarBuffer, i256};
1303 #[cfg(feature = "avro_custom_types")]
1304 use arrow_schema::{
1305 ArrowError, DataType, Field, FieldRef, Fields, IntervalUnit, Schema, TimeUnit, UnionFields,
1306 UnionMode,
1307 };
1308 #[cfg(not(feature = "avro_custom_types"))]
1309 use arrow_schema::{
1310 ArrowError, DataType, Field, FieldRef, Fields, IntervalUnit, Schema, UnionFields, UnionMode,
1311 };
1312 use bytes::Bytes;
1313 use futures::executor::block_on;
1314 use futures::{Stream, StreamExt, TryStreamExt, stream};
1315 use serde_json::{Value, json};
1316 use std::collections::HashMap;
1317 use std::fs::File;
1318 use std::io::{BufReader, Cursor};
1319 use std::sync::Arc;
1320
/// Lists the `alltypes_plain` test files that the enabled compression
/// features allow to be read.
///
/// NOTE(review): the uncompressed file is gated on `snappy` as well; the
/// reason is not visible here.
fn files() -> impl Iterator<Item = &'static str> {
    [
        #[cfg(feature = "snappy")]
        "avro/alltypes_plain.avro",
        #[cfg(feature = "snappy")]
        "avro/alltypes_plain.snappy.avro",
        #[cfg(feature = "zstd")]
        "avro/alltypes_plain.zstandard.avro",
        #[cfg(feature = "bzip2")]
        "avro/alltypes_plain.bzip2.avro",
        #[cfg(feature = "xz")]
        "avro/alltypes_plain.xz.avro",
    ]
    .into_iter()
}
1337
1338 fn read_file(path: &str, batch_size: usize, utf8_view: bool) -> RecordBatch {
1339 let file = File::open(path).unwrap();
1340 let reader = ReaderBuilder::new()
1341 .with_batch_size(batch_size)
1342 .with_utf8_view(utf8_view)
1343 .build(BufReader::new(file))
1344 .unwrap();
1345 let schema = reader.schema();
1346 let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
1347 arrow::compute::concat_batches(&schema, &batches).unwrap()
1348 }
1349
1350 fn read_file_strict(
1351 path: &str,
1352 batch_size: usize,
1353 utf8_view: bool,
1354 ) -> Result<Reader<BufReader<File>>, ArrowError> {
1355 let file = File::open(path)?;
1356 ReaderBuilder::new()
1357 .with_batch_size(batch_size)
1358 .with_utf8_view(utf8_view)
1359 .with_strict_mode(true)
1360 .build(BufReader::new(file))
1361 }
1362
1363 fn decode_stream<S: Stream<Item = Bytes> + Unpin>(
1364 mut decoder: Decoder,
1365 mut input: S,
1366 ) -> impl Stream<Item = Result<RecordBatch, ArrowError>> {
1367 async_stream::try_stream! {
1368 if let Some(data) = input.next().await {
1369 let consumed = decoder.decode(&data)?;
1370 if consumed < data.len() {
1371 Err(ArrowError::ParseError(
1372 "did not consume all bytes".to_string(),
1373 ))?;
1374 }
1375 }
1376 if let Some(batch) = decoder.flush()? {
1377 yield batch
1378 }
1379 }
1380 }
1381
1382 fn make_record_schema(pt: PrimitiveType) -> AvroSchema {
1383 let js = format!(
1384 r#"{{"type":"record","name":"TestRecord","fields":[{{"name":"a","type":"{}"}}]}}"#,
1385 pt.as_ref()
1386 );
1387 AvroSchema::new(js)
1388 }
1389
1390 fn make_two_schema_store() -> (
1391 SchemaStore,
1392 Fingerprint,
1393 Fingerprint,
1394 AvroSchema,
1395 AvroSchema,
1396 ) {
1397 let schema_int = make_record_schema(PrimitiveType::Int);
1398 let schema_long = make_record_schema(PrimitiveType::Long);
1399 let mut store = SchemaStore::new();
1400 let fp_int = store
1401 .register(schema_int.clone())
1402 .expect("register int schema");
1403 let fp_long = store
1404 .register(schema_long.clone())
1405 .expect("register long schema");
1406 (store, fp_int, fp_long, schema_int, schema_long)
1407 }
1408
1409 fn make_prefix(fp: Fingerprint) -> Vec<u8> {
1410 match fp {
1411 Fingerprint::Rabin(v) => {
1412 let mut out = Vec::with_capacity(2 + 8);
1413 out.extend_from_slice(&SINGLE_OBJECT_MAGIC);
1414 out.extend_from_slice(&v.to_le_bytes());
1415 out
1416 }
1417 Fingerprint::Id(v) => {
1418 panic!("make_prefix expects a Rabin fingerprint, got ({v})");
1419 }
1420 Fingerprint::Id64(v) => {
1421 panic!("make_prefix expects a Rabin fingerprint, got ({v})");
1422 }
1423 #[cfg(feature = "md5")]
1424 Fingerprint::MD5(v) => {
1425 panic!("make_prefix expects a Rabin fingerprint, got ({v:?})");
1426 }
1427 #[cfg(feature = "sha256")]
1428 Fingerprint::SHA256(id) => {
1429 panic!("make_prefix expects a Rabin fingerprint, got ({id:?})");
1430 }
1431 }
1432 }
1433
1434 fn make_decoder(store: &SchemaStore, fp: Fingerprint, reader_schema: &AvroSchema) -> Decoder {
1435 ReaderBuilder::new()
1436 .with_batch_size(8)
1437 .with_reader_schema(reader_schema.clone())
1438 .with_writer_schema_store(store.clone())
1439 .with_active_fingerprint(fp)
1440 .build_decoder()
1441 .expect("decoder")
1442 }
1443
1444 fn make_id_prefix(id: u32, additional: usize) -> Vec<u8> {
1445 let capacity = CONFLUENT_MAGIC.len() + size_of::<u32>() + additional;
1446 let mut out = Vec::with_capacity(capacity);
1447 out.extend_from_slice(&CONFLUENT_MAGIC);
1448 out.extend_from_slice(&id.to_be_bytes());
1449 out
1450 }
1451
1452 fn make_message_id(id: u32, value: i64) -> Vec<u8> {
1453 let encoded_value = encode_zigzag(value);
1454 let mut msg = make_id_prefix(id, encoded_value.len());
1455 msg.extend_from_slice(&encoded_value);
1456 msg
1457 }
1458
1459 fn make_id64_prefix(id: u64, additional: usize) -> Vec<u8> {
1460 let capacity = CONFLUENT_MAGIC.len() + size_of::<u64>() + additional;
1461 let mut out = Vec::with_capacity(capacity);
1462 out.extend_from_slice(&CONFLUENT_MAGIC);
1463 out.extend_from_slice(&id.to_be_bytes());
1464 out
1465 }
1466
1467 fn make_message_id64(id: u64, value: i64) -> Vec<u8> {
1468 let encoded_value = encode_zigzag(value);
1469 let mut msg = make_id64_prefix(id, encoded_value.len());
1470 msg.extend_from_slice(&encoded_value);
1471 msg
1472 }
1473
1474 fn make_value_schema(pt: PrimitiveType) -> AvroSchema {
1475 let json_schema = format!(
1476 r#"{{"type":"record","name":"S","fields":[{{"name":"v","type":"{}"}}]}}"#,
1477 pt.as_ref()
1478 );
1479 AvroSchema::new(json_schema)
1480 }
1481
/// Encodes `value` as a zigzag-mapped, variable-length integer.
///
/// The value is first zigzag-mapped to an unsigned number, then emitted seven
/// bits at a time, least significant group first, with the high bit set on
/// every byte except the last.
fn encode_zigzag(value: i64) -> Vec<u8> {
    let mut remaining = ((value << 1) ^ (value >> 63)) as u64;
    let mut bytes = Vec::new();
    while remaining >= 0x80 {
        // More groups follow: emit the low seven bits with the continuation bit.
        bytes.push(((remaining as u8) & 0x7F) | 0x80);
        remaining >>= 7;
    }
    bytes.push(remaining as u8);
    bytes
}
1496
1497 fn make_message(fp: Fingerprint, value: i64) -> Vec<u8> {
1498 let mut msg = make_prefix(fp);
1499 msg.extend_from_slice(&encode_zigzag(value));
1500 msg
1501 }
1502
1503 fn load_writer_schema_json(path: &str) -> Value {
1504 let file = File::open(path).unwrap();
1505 let header = super::read_header(BufReader::new(file)).unwrap();
1506 let schema = header.schema().unwrap().unwrap();
1507 serde_json::to_value(&schema).unwrap()
1508 }
1509
1510 fn make_reader_schema_with_promotions(
1511 path: &str,
1512 promotions: &HashMap<&str, &str>,
1513 ) -> AvroSchema {
1514 let mut root = load_writer_schema_json(path);
1515 assert_eq!(root["type"], "record", "writer schema must be a record");
1516 let fields = root
1517 .get_mut("fields")
1518 .and_then(|f| f.as_array_mut())
1519 .expect("record has fields");
1520 for f in fields.iter_mut() {
1521 let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
1522 continue;
1523 };
1524 if let Some(new_ty) = promotions.get(name) {
1525 let ty = f.get_mut("type").expect("field has a type");
1526 match ty {
1527 Value::String(_) => {
1528 *ty = Value::String((*new_ty).to_string());
1529 }
1530 Value::Array(arr) => {
1532 for b in arr.iter_mut() {
1533 match b {
1534 Value::String(s) if s != "null" => {
1535 *b = Value::String((*new_ty).to_string());
1536 break;
1537 }
1538 Value::Object(_) => {
1539 *b = Value::String((*new_ty).to_string());
1540 break;
1541 }
1542 _ => {}
1543 }
1544 }
1545 }
1546 Value::Object(_) => {
1547 *ty = Value::String((*new_ty).to_string());
1548 }
1549 _ => {}
1550 }
1551 }
1552 }
1553 AvroSchema::new(root.to_string())
1554 }
1555
1556 fn make_reader_schema_with_enum_remap(
1557 path: &str,
1558 remap: &HashMap<&str, Vec<&str>>,
1559 ) -> AvroSchema {
1560 let mut root = load_writer_schema_json(path);
1561 assert_eq!(root["type"], "record", "writer schema must be a record");
1562 let fields = root
1563 .get_mut("fields")
1564 .and_then(|f| f.as_array_mut())
1565 .expect("record has fields");
1566
1567 fn to_symbols_array(symbols: &[&str]) -> Value {
1568 Value::Array(symbols.iter().map(|s| Value::String((*s).into())).collect())
1569 }
1570
1571 fn update_enum_symbols(ty: &mut Value, symbols: &Value) {
1572 match ty {
1573 Value::Object(map) => {
1574 if matches!(map.get("type"), Some(Value::String(t)) if t == "enum") {
1575 map.insert("symbols".to_string(), symbols.clone());
1576 }
1577 }
1578 Value::Array(arr) => {
1579 for b in arr.iter_mut() {
1580 if let Value::Object(map) = b {
1581 if matches!(map.get("type"), Some(Value::String(t)) if t == "enum") {
1582 map.insert("symbols".to_string(), symbols.clone());
1583 }
1584 }
1585 }
1586 }
1587 _ => {}
1588 }
1589 }
1590 for f in fields.iter_mut() {
1591 let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
1592 continue;
1593 };
1594 if let Some(new_symbols) = remap.get(name) {
1595 let symbols_val = to_symbols_array(new_symbols);
1596 let ty = f.get_mut("type").expect("field has a type");
1597 update_enum_symbols(ty, &symbols_val);
1598 }
1599 }
1600 AvroSchema::new(root.to_string())
1601 }
1602
1603 fn read_alltypes_with_reader_schema(path: &str, reader_schema: AvroSchema) -> RecordBatch {
1604 let file = File::open(path).unwrap();
1605 let reader = ReaderBuilder::new()
1606 .with_batch_size(1024)
1607 .with_utf8_view(false)
1608 .with_reader_schema(reader_schema)
1609 .build(BufReader::new(file))
1610 .unwrap();
1611 let schema = reader.schema();
1612 let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
1613 arrow::compute::concat_batches(&schema, &batches).unwrap()
1614 }
1615
1616 fn make_reader_schema_with_selected_fields_in_order(
1617 path: &str,
1618 selected: &[&str],
1619 ) -> AvroSchema {
1620 let mut root = load_writer_schema_json(path);
1621 assert_eq!(root["type"], "record", "writer schema must be a record");
1622 let writer_fields = root
1623 .get("fields")
1624 .and_then(|f| f.as_array())
1625 .expect("record has fields");
1626 let mut field_map: HashMap<String, Value> = HashMap::with_capacity(writer_fields.len());
1627 for f in writer_fields {
1628 if let Some(name) = f.get("name").and_then(|n| n.as_str()) {
1629 field_map.insert(name.to_string(), f.clone());
1630 }
1631 }
1632 let mut new_fields = Vec::with_capacity(selected.len());
1633 for name in selected {
1634 let f = field_map
1635 .get(*name)
1636 .unwrap_or_else(|| panic!("field '{name}' not found in writer schema"))
1637 .clone();
1638 new_fields.push(f);
1639 }
1640 root["fields"] = Value::Array(new_fields);
1641 AvroSchema::new(root.to_string())
1642 }
1643
1644 fn write_ocf(schema: &Schema, batches: &[RecordBatch]) -> Vec<u8> {
1645 let mut w = AvroWriter::new(Vec::<u8>::new(), schema.clone()).expect("writer");
1646 for b in batches {
1647 w.write(b).expect("write");
1648 }
1649 w.finish().expect("finish");
1650 w.into_inner()
1651 }
1652
/// A non-null writer string field must be readable through a nullable reader
/// field that refers to it by alias.
#[test]
fn writer_string_reader_nullable_with_alias() -> Result<(), Box<dyn std::error::Error>> {
    let writer_schema = Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("name", DataType::Utf8, false),
    ]);
    let ids: ArrayRef = Arc::new(Int64Array::from(vec![1, 2]));
    let names: ArrayRef = Arc::new(StringArray::from(vec!["a", "b"]));
    let batch = RecordBatch::try_new(Arc::new(writer_schema.clone()), vec![ids, names])?;
    let bytes = write_ocf(&writer_schema, &[batch]);
    let reader_json = r#"
    {
      "type": "record",
      "name": "topLevelRecord",
      "fields": [
        { "name": "id", "type": "long" },
        { "name": "full_name", "type": ["null","string"], "aliases": ["name"], "default": null },
        { "name": "is_active", "type": "boolean", "default": true }
      ]
    }"#;
    let reader_schema = AvroSchema::new(reader_json.to_string());
    let mut reader = ReaderBuilder::new()
        .with_reader_schema(reader_schema)
        .build(Cursor::new(bytes))?;
    let out = reader.next().unwrap()?;
    // The aliased column must carry the values that were written as `name`.
    let full_name = out.column(1).as_string::<i32>();
    assert_eq!(full_name.value(0), "a");
    assert_eq!(full_name.value(1), "b");

    Ok(())
}
1689
/// A non-null writer string field must be readable through a reader union
/// that lists `string` before `null`.
#[test]
fn writer_string_reader_string_null_order_second() -> Result<(), Box<dyn std::error::Error>> {
    let writer_schema = Schema::new(vec![Field::new("name", DataType::Utf8, false)]);
    let names: ArrayRef = Arc::new(StringArray::from(vec!["x", "y"]));
    let batch = RecordBatch::try_new(Arc::new(writer_schema.clone()), vec![names])?;
    let bytes = write_ocf(&writer_schema, &[batch]);

    let reader_json = r#"
    {
      "type":"record", "name":"topLevelRecord",
      "fields":[ { "name":"name", "type":["string","null"], "default":"x" } ]
    }"#;
    let reader_schema = AvroSchema::new(reader_json.to_string());

    let mut reader = ReaderBuilder::new()
        .with_reader_schema(reader_schema)
        .build(Cursor::new(bytes))?;

    let out = reader.next().unwrap()?;
    assert_eq!(out.num_rows(), 2);

    let name = out.column(0).as_string::<i32>();
    assert_eq!(name.value(0), "x");
    assert_eq!(name.value(1), "y");

    Ok(())
}
1721
/// A writer `int` field must be promoted to `long` when the reader declares
/// a nullable `long`, without producing a validity bitmap.
#[test]
fn promotion_writer_int_reader_nullable_long() -> Result<(), Box<dyn std::error::Error>> {
    let writer_schema = Schema::new(vec![Field::new("v", DataType::Int32, false)]);
    let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let batch = RecordBatch::try_new(Arc::new(writer_schema.clone()), vec![values])?;
    let bytes = write_ocf(&writer_schema, &[batch]);

    let reader_json = r#"
    {
      "type":"record", "name":"topLevelRecord",
      "fields":[ { "name":"v", "type":["null","long"], "default": null } ]
    }"#;
    let reader_schema = AvroSchema::new(reader_json.to_string());

    let mut reader = ReaderBuilder::new()
        .with_reader_schema(reader_schema)
        .build(Cursor::new(bytes))?;

    let out = reader.next().unwrap()?;
    assert_eq!(out.num_rows(), 3);

    let column = out.column(0);
    let v = column.as_primitive::<arrow_array::types::Int64Type>();
    assert_eq!(v.values(), &[1, 2, 3]);
    assert!(
        column.nulls().is_none(),
        "expected no validity bitmap for all-valid column"
    );

    Ok(())
}
1758
/// Reads every available `alltypes_plain` file with a reader schema that
/// promotes several columns and compares the result with the expected batch.
#[test]
fn test_alltypes_schema_promotion_mixed() {
    for relative in files() {
        let file = arrow_test_data(relative);
        let promotions: HashMap<&str, &str> = HashMap::from([
            ("id", "long"),
            ("tinyint_col", "float"),
            ("smallint_col", "double"),
            ("int_col", "double"),
            ("bigint_col", "double"),
            ("float_col", "double"),
            ("date_string_col", "string"),
            ("string_col", "string"),
        ]);
        let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
        let batch = read_alltypes_with_reader_schema(&file, reader_schema);

        let id: ArrayRef = Arc::new(Int64Array::from(vec![4i64, 5, 6, 7, 2, 3, 0, 1]));
        let bool_col: ArrayRef =
            Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0))));
        let tinyint_col: ArrayRef = Arc::new(Float32Array::from_iter_values(
            (0..8).map(|x| (x % 2) as f32),
        ));
        let smallint_col: ArrayRef = Arc::new(Float64Array::from_iter_values(
            (0..8).map(|x| (x % 2) as f64),
        ));
        let int_col: ArrayRef = Arc::new(Float64Array::from_iter_values(
            (0..8).map(|x| (x % 2) as f64),
        ));
        let bigint_col: ArrayRef = Arc::new(Float64Array::from_iter_values(
            (0..8).map(|x| ((x % 2) * 10) as f64),
        ));
        // The file stores this column as `float`, so the expected value is
        // the f32 product widened to f64.
        let float_col: ArrayRef = Arc::new(Float64Array::from_iter_values(
            (0..8).map(|x| ((x % 2) as f32 * 1.1f32) as f64),
        ));
        let double_col: ArrayRef = Arc::new(Float64Array::from_iter_values(
            (0..8).map(|x| (x % 2) as f64 * 10.1),
        ));
        let date_string_col: ArrayRef = Arc::new(StringArray::from(vec![
            "03/01/09", "03/01/09", "04/01/09", "04/01/09", "02/01/09", "02/01/09", "01/01/09",
            "01/01/09",
        ]));
        let string_col: ArrayRef = Arc::new(StringArray::from(
            (0..8)
                .map(|x| if x % 2 == 0 { "0" } else { "1" })
                .collect::<Vec<_>>(),
        ));
        let timestamp_col: ArrayRef = Arc::new(
            TimestampMicrosecondArray::from_iter_values([
                1235865600000000,
                1235865660000000,
                1238544000000000,
                1238544060000000,
                1233446400000000,
                1233446460000000,
                1230768000000000,
                1230768060000000,
            ])
            .with_timezone("+00:00"),
        );

        let expected = RecordBatch::try_from_iter_with_nullable([
            ("id", id, true),
            ("bool_col", bool_col, true),
            ("tinyint_col", tinyint_col, true),
            ("smallint_col", smallint_col, true),
            ("int_col", int_col, true),
            ("bigint_col", bigint_col, true),
            ("float_col", float_col, true),
            ("double_col", double_col, true),
            ("date_string_col", date_string_col, true),
            ("string_col", string_col, true),
            ("timestamp_col", timestamp_col, true),
        ])
        .unwrap();
        assert_eq!(batch, expected, "mismatch for file {file}");
    }
}
1866
#[test]
fn test_alltypes_schema_promotion_long_to_float_only() {
    // Only `bigint_col` is promoted (to "float"); every other column is
    // expected to come back exactly as it does without a promotion.
    for name in files() {
        let file = arrow_test_data(name);
        let promotions: HashMap<&str, &str> = HashMap::from([("bigint_col", "float")]);
        let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
        let batch = read_alltypes_with_reader_schema(&file, reader_schema);

        // Expected columns, built one by one in schema order.
        let id: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1]));
        let bool_col: ArrayRef =
            Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0))));
        let tinyint_col: ArrayRef =
            Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2)));
        let smallint_col: ArrayRef =
            Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2)));
        let int_col: ArrayRef = Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2)));
        // The promoted column: values arrive as f32 instead of i64.
        let bigint_col: ArrayRef = Arc::new(Float32Array::from_iter_values(
            (0..8).map(|x| ((x % 2) * 10) as f32),
        ));
        let float_col: ArrayRef = Arc::new(Float32Array::from_iter_values(
            (0..8).map(|x| (x % 2) as f32 * 1.1),
        ));
        let double_col: ArrayRef = Arc::new(Float64Array::from_iter_values(
            (0..8).map(|x| (x % 2) as f64 * 10.1),
        ));
        // Binary columns hold the raw ASCII bytes of the date / digit strings.
        let date_string_col: ArrayRef = Arc::new(BinaryArray::from_iter_values([
            b"03/01/09",
            b"03/01/09",
            b"04/01/09",
            b"04/01/09",
            b"02/01/09",
            b"02/01/09",
            b"01/01/09",
            b"01/01/09",
        ]));
        let string_col: ArrayRef = Arc::new(BinaryArray::from_iter_values(
            (0..8).map(|x| [b'0' + x % 2]),
        ));
        let timestamp_col: ArrayRef = Arc::new(
            TimestampMicrosecondArray::from_iter_values([
                1235865600000000,
                1235865660000000,
                1238544000000000,
                1238544060000000,
                1233446400000000,
                1233446460000000,
                1230768000000000,
                1230768060000000,
            ])
            .with_timezone("+00:00"),
        );

        let expected = RecordBatch::try_from_iter_with_nullable([
            ("id", id, true),
            ("bool_col", bool_col, true),
            ("tinyint_col", tinyint_col, true),
            ("smallint_col", smallint_col, true),
            ("int_col", int_col, true),
            ("bigint_col", bigint_col, true),
            ("float_col", float_col, true),
            ("double_col", double_col, true),
            ("date_string_col", date_string_col, true),
            ("string_col", string_col, true),
            ("timestamp_col", timestamp_col, true),
        ])
        .unwrap();
        assert_eq!(batch, expected, "mismatch for file {file}");
    }
}
1963
#[test]
fn test_alltypes_schema_promotion_bytes_to_string_only() {
    // Both byte-valued columns are promoted to "string"; all remaining
    // columns are expected unchanged.
    for name in files() {
        let file = arrow_test_data(name);
        let promotions: HashMap<&str, &str> =
            HashMap::from([("date_string_col", "string"), ("string_col", "string")]);
        let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
        let batch = read_alltypes_with_reader_schema(&file, reader_schema);

        // Expected columns, built one by one in schema order.
        let id: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1]));
        let bool_col: ArrayRef =
            Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0))));
        let tinyint_col: ArrayRef =
            Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2)));
        let smallint_col: ArrayRef =
            Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2)));
        let int_col: ArrayRef = Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2)));
        let bigint_col: ArrayRef =
            Arc::new(Int64Array::from_iter_values((0..8).map(|x| (x % 2) * 10)));
        let float_col: ArrayRef = Arc::new(Float32Array::from_iter_values(
            (0..8).map(|x| (x % 2) as f32 * 1.1),
        ));
        let double_col: ArrayRef = Arc::new(Float64Array::from_iter_values(
            (0..8).map(|x| (x % 2) as f64 * 10.1),
        ));
        // The two promoted columns: UTF-8 strings rather than binary.
        let date_string_col: ArrayRef = Arc::new(StringArray::from(vec![
            "03/01/09", "03/01/09", "04/01/09", "04/01/09", "02/01/09", "02/01/09",
            "01/01/09", "01/01/09",
        ]));
        let digits = (0..8)
            .map(|x| if x % 2 == 0 { "0" } else { "1" })
            .collect::<Vec<_>>();
        let string_col: ArrayRef = Arc::new(StringArray::from(digits));
        let timestamp_col: ArrayRef = Arc::new(
            TimestampMicrosecondArray::from_iter_values([
                1235865600000000,
                1235865660000000,
                1238544000000000,
                1238544060000000,
                1233446400000000,
                1233446460000000,
                1230768000000000,
                1230768060000000,
            ])
            .with_timezone("+00:00"),
        );

        let expected = RecordBatch::try_from_iter_with_nullable([
            ("id", id, true),
            ("bool_col", bool_col, true),
            ("tinyint_col", tinyint_col, true),
            ("smallint_col", smallint_col, true),
            ("int_col", int_col, true),
            ("bigint_col", bigint_col, true),
            ("float_col", float_col, true),
            ("double_col", double_col, true),
            ("date_string_col", date_string_col, true),
            ("string_col", string_col, true),
            ("timestamp_col", timestamp_col, true),
        ])
        .unwrap();
        assert_eq!(batch, expected, "mismatch for file {file}");
    }
}
2057
#[test]
#[cfg(feature = "snappy")]
fn test_alltypes_illegal_promotion_bool_to_double_errors() {
    // Requesting "double" for `bool_col` must make building the reader fail.
    let file = arrow_test_data("avro/alltypes_plain.avro");
    let promotions: HashMap<&str, &str> = HashMap::from([("bool_col", "double")]);
    let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
    let input = BufReader::new(File::open(&file).unwrap());
    let msg = ReaderBuilder::new()
        .with_reader_schema(reader_schema)
        .build(input)
        .expect_err("expected illegal promotion to error")
        .to_string();
    // Accept either capitalisation of the error text.
    let mentions_promotion = ["Illegal promotion", "illegal promotion"]
        .iter()
        .any(|needle| msg.contains(*needle));
    assert!(mentions_promotion, "unexpected error: {msg}");
}
2077
#[test]
fn test_simple_enum_with_reader_schema_mapping() {
    // The reader schema re-orders the enum symbols; the decoded dictionary
    // keys must follow the reader's symbol order.
    let file = arrow_test_data("avro/simple_enum.avro");
    let remap: HashMap<&str, Vec<&str>> = HashMap::from([
        ("f1", vec!["d", "c", "b", "a"]),
        ("f2", vec!["h", "g", "f", "e"]),
        ("f3", vec!["k", "i", "j"]),
    ]);
    let reader_schema = make_reader_schema_with_enum_remap(&file, &remap);
    let actual = read_alltypes_with_reader_schema(&file, reader_schema);

    let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
    // Field metadata expected on each enum column: symbol list, name, namespace.
    let enum_metadata = |symbols_json: &str, name: &str, namespace: &str| {
        HashMap::from([
            (
                AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
                symbols_json.to_string(),
            ),
            ("avro.name".to_string(), name.to_string()),
            ("avro.namespace".to_string(), namespace.to_string()),
        ])
    };

    let f1 = DictionaryArray::<Int32Type>::try_new(
        Int32Array::from(vec![3, 2, 1, 0]),
        Arc::new(StringArray::from(vec!["d", "c", "b", "a"])),
    )
    .unwrap();
    let f1_field = Field::new("f1", dict_type.clone(), false)
        .with_metadata(enum_metadata(r#"["d","c","b","a"]"#, "enum1", "ns1"));

    let f2 = DictionaryArray::<Int32Type>::try_new(
        Int32Array::from(vec![1, 0, 3, 2]),
        Arc::new(StringArray::from(vec!["h", "g", "f", "e"])),
    )
    .unwrap();
    let f2_field = Field::new("f2", dict_type.clone(), false)
        .with_metadata(enum_metadata(r#"["h","g","f","e"]"#, "enum2", "ns2"));

    // f3 is the only nullable column and contains one null key.
    let f3 = DictionaryArray::<Int32Type>::try_new(
        Int32Array::from(vec![Some(2), Some(0), None, Some(1)]),
        Arc::new(StringArray::from(vec!["k", "i", "j"])),
    )
    .unwrap();
    let f3_field = Field::new("f3", dict_type.clone(), true)
        .with_metadata(enum_metadata(r#"["k","i","j"]"#, "enum3", "ns1"));

    let expected_schema = Arc::new(Schema::new(vec![f1_field, f2_field, f3_field]));
    let columns = vec![Arc::new(f1) as ArrayRef, Arc::new(f2), Arc::new(f3)];
    let expected = RecordBatch::try_new(expected_schema, columns).unwrap();
    assert_eq!(actual, expected);
}
2135
#[test]
fn test_schema_store_register_lookup() {
    // Registered schemas must be retrievable through the fingerprints that
    // `register` handed back.
    let mut store = SchemaStore::new();
    let int_schema = make_record_schema(PrimitiveType::Int);
    let long_schema = make_record_schema(PrimitiveType::Long);
    let int_fp = store.register(int_schema.clone()).unwrap();
    let long_fp = store.register(long_schema.clone()).unwrap();

    let found_int = store.lookup(&int_fp).cloned();
    let found_long = store.lookup(&long_fp).cloned();
    assert_eq!(found_int, Some(int_schema));
    assert_eq!(found_long, Some(long_schema));
    // A store created with `new()` reports the Rabin algorithm.
    assert_eq!(store.fingerprint_algorithm(), FingerprintAlgorithm::Rabin);
}
2147
#[test]
fn test_unknown_fingerprint_is_error() {
    // A prefix carrying a fingerprint that was never registered must be rejected.
    let (store, fp_int, _, _, schema_long) = make_two_schema_store();
    let prefix = make_prefix(Fingerprint::Rabin(0xDEAD_BEEF_DEAD_BEEF));
    let mut decoder = make_decoder(&store, fp_int, &schema_long);
    let msg = decoder
        .decode(&prefix)
        .expect_err("decode should error")
        .to_string();
    assert!(
        msg.contains("Unknown fingerprint"),
        "unexpected message: {msg}"
    );
}
2161
#[test]
fn test_handle_prefix_incomplete_magic() {
    // Only the first magic byte is supplied: nothing may be consumed and no
    // schema switch may be staged.
    let (store, fp_int, _, _, schema_long) = make_two_schema_store();
    let mut decoder = make_decoder(&store, fp_int, &schema_long);
    let partial_magic = &SINGLE_OBJECT_MAGIC[..1];
    let consumed = decoder.handle_prefix(partial_magic).unwrap();
    assert_eq!(consumed, Some(0));
    assert!(decoder.pending_schema.is_none());
}
2171
#[test]
fn test_handle_prefix_magic_mismatch() {
    // Bytes that do not start with the single-object magic yield `None`.
    let (store, fp_int, _, _, schema_long) = make_two_schema_store();
    let mut decoder = make_decoder(&store, fp_int, &schema_long);
    let not_magic: [u8; 3] = [0xFF, 0x00, 0x01];
    let outcome = decoder.handle_prefix(&not_magic).unwrap();
    assert!(outcome.is_none());
}
2180
#[test]
fn test_handle_prefix_incomplete_fingerprint() {
    // Full magic followed by only half of the 8-byte fingerprint: the prefix
    // is incomplete, so nothing is consumed and no schema is staged.
    let (store, fp_int, fp_long, _, schema_long) = make_two_schema_store();
    let mut decoder = make_decoder(&store, fp_int, &schema_long);
    let fingerprint_bytes = match fp_long {
        Fingerprint::Rabin(v) => v.to_le_bytes(),
        Fingerprint::Id(id) => panic!("expected Rabin fingerprint, got ({id})"),
        Fingerprint::Id64(id) => panic!("expected Rabin fingerprint, got ({id})"),
        #[cfg(feature = "md5")]
        Fingerprint::MD5(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
        #[cfg(feature = "sha256")]
        Fingerprint::SHA256(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
    };
    let truncated: Vec<u8> = [&SINGLE_OBJECT_MAGIC[..], &fingerprint_bytes[..4]].concat();
    let consumed = decoder.handle_prefix(&truncated).unwrap();
    assert_eq!(consumed, Some(0));
    assert!(decoder.pending_schema.is_none());
}
2200
#[test]
fn test_handle_prefix_valid_prefix_switches_schema() {
    // A complete prefix for a cached, non-active fingerprint must be consumed
    // entirely and stage that fingerprint as the pending schema.
    let (store, fp_int, fp_long, _, schema_long) = make_two_schema_store();
    let mut decoder = make_decoder(&store, fp_int, &schema_long);

    // Seed the decoder cache with a record decoder for the long schema.
    let long_writer_schema = schema_long.schema().unwrap();
    let long_root = AvroFieldBuilder::new(&long_writer_schema).build().unwrap();
    let cached_decoder = RecordDecoder::try_new_with_options(long_root.data_type()).unwrap();
    let _ = decoder.cache.insert(fp_long, cached_decoder);

    // Build the prefix: magic bytes followed by the little-endian fingerprint.
    let mut prefix = Vec::from(SINGLE_OBJECT_MAGIC);
    match fp_long {
        Fingerprint::Rabin(v) => prefix.extend_from_slice(&v.to_le_bytes()),
        Fingerprint::Id(id) => panic!("expected Rabin fingerprint, got ({id})"),
        Fingerprint::Id64(id) => panic!("expected Rabin fingerprint, got ({id})"),
        #[cfg(feature = "md5")]
        Fingerprint::MD5(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
        #[cfg(feature = "sha256")]
        Fingerprint::SHA256(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
    }

    let consumed = decoder.handle_prefix(&prefix).unwrap().unwrap();
    assert_eq!(consumed, prefix.len());
    assert!(decoder.pending_schema.is_some());
    let staged = decoder.pending_schema.as_ref().unwrap();
    assert_eq!(staged.0, fp_long);
}
2224
#[test]
fn test_two_messages_same_schema() {
    // Two back-to-back messages with the same fingerprint decode into one batch.
    let reader_schema = make_value_schema(PrimitiveType::Int);
    let mut store = SchemaStore::new();
    let fp = store.register(reader_schema.clone()).unwrap();
    let input = [make_message(fp, 42), make_message(fp, 11)].concat();

    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_reader_schema(reader_schema)
        .with_writer_schema_store(store)
        .with_active_fingerprint(fp)
        .build_decoder()
        .unwrap();
    let _ = decoder.decode(&input).unwrap();

    let batch = decoder.flush().unwrap().expect("batch");
    assert_eq!(batch.num_rows(), 2);
    let values = batch
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    assert_eq!(values.value(0), 42);
    assert_eq!(values.value(1), 11);
}
2252
#[test]
fn test_two_messages_schema_switch() {
    // The second message carries a different fingerprint; after flushing the
    // first batch, the decoder must produce an Int64 column for it.
    let int_schema = make_value_schema(PrimitiveType::Int);
    let long_schema = make_value_schema(PrimitiveType::Long);
    let mut store = SchemaStore::new();
    let fp_int = store.register(int_schema).unwrap();
    let fp_long = store.register(long_schema).unwrap();
    let int_message = make_message(fp_int, 1);
    let long_message = make_message(fp_long, 123456789_i64);

    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_writer_schema_store(store)
        .with_active_fingerprint(fp_int)
        .build_decoder()
        .unwrap();

    // First message: int schema.
    let _ = decoder.decode(&int_message).unwrap();
    let batch1 = decoder.flush().unwrap().expect("batch1");
    assert_eq!(batch1.num_rows(), 1);
    let ints = batch1
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    assert_eq!(ints.value(0), 1);

    // Second message: long schema.
    let _ = decoder.decode(&long_message).unwrap();
    let batch2 = decoder.flush().unwrap().expect("batch2");
    assert_eq!(batch2.num_rows(), 1);
    let longs = batch2
        .column(0)
        .as_any()
        .downcast_ref::<Int64Array>()
        .unwrap();
    assert_eq!(longs.value(0), 123456789_i64);
}
2293
#[test]
fn test_two_messages_same_schema_id() {
    // Same as the Rabin variant, but the schema is keyed by a 32-bit id.
    let id = 100u32;
    let reader_schema = make_value_schema(PrimitiveType::Int);
    let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
    let _ = store
        .set(Fingerprint::Id(id), reader_schema.clone())
        .expect("set id schema");
    let input = [make_message_id(id, 21), make_message_id(id, 22)].concat();

    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_reader_schema(reader_schema)
        .with_writer_schema_store(store)
        .with_active_fingerprint(Fingerprint::Id(id))
        .build_decoder()
        .unwrap();
    let _ = decoder.decode(&input).unwrap();

    let batch = decoder.flush().unwrap().expect("batch");
    assert_eq!(batch.num_rows(), 2);
    let values = batch
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    assert_eq!(values.value(0), 21);
    assert_eq!(values.value(1), 22);
}
2325
#[test]
fn test_unknown_id_fingerprint_is_error() {
    // Only id 7 is stored; a prefix naming id 9 must be rejected.
    let id_known = 7u32;
    let id_unknown = 9u32;
    let schema = make_value_schema(PrimitiveType::Int);
    let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
    let _ = store
        .set(Fingerprint::Id(id_known), schema.clone())
        .expect("set id schema");

    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_reader_schema(schema)
        .with_writer_schema_store(store)
        .with_active_fingerprint(Fingerprint::Id(id_known))
        .build_decoder()
        .unwrap();

    let prefix = make_id_prefix(id_unknown, 0);
    let msg = decoder
        .decode(&prefix)
        .expect_err("decode should error")
        .to_string();
    assert!(
        msg.contains("Unknown fingerprint"),
        "unexpected message: {msg}"
    );
}
2350
#[test]
fn test_handle_prefix_id_incomplete_magic() {
    // An empty slice of the Confluent magic is treated as an incomplete
    // prefix: nothing consumed, no schema staged.
    let id = 5u32;
    let schema = make_value_schema(PrimitiveType::Int);
    let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
    let _ = store
        .set(Fingerprint::Id(id), schema.clone())
        .expect("set id schema");

    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_reader_schema(schema)
        .with_writer_schema_store(store)
        .with_active_fingerprint(Fingerprint::Id(id))
        .build_decoder()
        .unwrap();

    let empty_magic = &CONFLUENT_MAGIC[..0];
    let consumed = decoder.handle_prefix(empty_magic).unwrap();
    assert_eq!(consumed, Some(0));
    assert!(decoder.pending_schema.is_none());
}
2371
#[test]
fn test_two_messages_same_schema_id64() {
    // Same as the 32-bit id variant, but the schema is keyed by a 64-bit id.
    let id = 100u64;
    let reader_schema = make_value_schema(PrimitiveType::Int);
    let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id64);
    let _ = store
        .set(Fingerprint::Id64(id), reader_schema.clone())
        .expect("set id schema");
    let input = [make_message_id64(id, 21), make_message_id64(id, 22)].concat();

    let mut decoder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_reader_schema(reader_schema)
        .with_writer_schema_store(store)
        .with_active_fingerprint(Fingerprint::Id64(id))
        .build_decoder()
        .unwrap();
    let _ = decoder.decode(&input).unwrap();

    let batch = decoder.flush().unwrap().expect("batch");
    assert_eq!(batch.num_rows(), 2);
    let values = batch
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    assert_eq!(values.value(0), 21);
    assert_eq!(values.value(1), 22);
}
2403
#[test]
fn test_decode_stream_with_schema() {
    /// One table-driven scenario: a writer schema plus the error text (if any)
    /// that decoding the fixed string payload is expected to produce.
    struct TestCase<'a> {
        name: &'a str,
        schema: &'a str,
        expected_error: Option<&'a str>,
    }

    let record_val = "some_string";
    let cases = [
        TestCase {
            name: "success",
            schema: r#"{"type":"record","name":"test","fields":[{"name":"f2","type":"string"}]}"#,
            expected_error: None,
        },
        TestCase {
            name: "valid schema invalid data",
            schema: r#"{"type":"record","name":"test","fields":[{"name":"f2","type":"long"}]}"#,
            expected_error: Some("did not consume all bytes"),
        },
    ];

    for case in cases {
        let mut store = SchemaStore::new();
        let fp = store
            .register(AvroSchema::new(case.schema.to_string()))
            .unwrap();

        // Message = prefix, then the length byte (len << 1) and the raw string bytes.
        let mut body = make_prefix(fp);
        body.push((record_val.len() as u8) << 1);
        body.extend_from_slice(record_val.as_bytes());

        let built = ReaderBuilder::new()
            .with_batch_size(1)
            .with_writer_schema_store(store)
            .with_active_fingerprint(fp)
            .build_decoder();
        let decoder = match (built, case.expected_error) {
            (Ok(decoder), _) => decoder,
            (Err(e), Some(expected)) => {
                // A build failure is acceptable only if it is the expected error.
                assert!(
                    e.to_string().contains(expected),
                    "Test '{}' failed at build – expected '{expected}', got '{e}'",
                    case.name
                );
                continue;
            }
            (Err(e), None) => panic!("Test '{}' failed during build: {e}", case.name),
        };

        let stream = Box::pin(stream::once(async { Bytes::from(body) }));
        let collected: Result<Vec<RecordBatch>, ArrowError> =
            block_on(decode_stream(decoder, stream).try_collect());

        match (collected, case.expected_error) {
            (Err(e), None) => {
                panic!("Test '{}' unexpectedly failed with '{e}'", case.name);
            }
            (Ok(_), Some(expected)) => {
                panic!(
                    "Test '{}' expected failure ('{expected}') but succeeded",
                    case.name
                );
            }
            (Err(e), Some(expected)) => {
                assert!(
                    e.to_string().contains(expected),
                    "Test '{}' – expected error containing '{expected}', got '{e}'",
                    case.name
                );
            }
            (Ok(batches), None) => {
                let batch =
                    arrow::compute::concat_batches(&batches[0].schema(), &batches).unwrap();
                let expected_schema = Arc::new(Schema::new(vec![Field::new(
                    "f2",
                    DataType::Utf8,
                    false,
                )]));
                let expected_array: ArrayRef = Arc::new(StringArray::from(vec![record_val]));
                let expected_batch =
                    RecordBatch::try_new(expected_schema, vec![expected_array]).unwrap();
                assert_eq!(batch, expected_batch, "Test '{}'", case.name);
            }
        }
    }
}
2486
#[test]
fn test_utf8view_support() {
    /// Returns a copy of `field` with `Utf8` swapped for `Utf8View`
    /// (name, nullability and metadata preserved); other types are cloned as-is.
    fn with_utf8view(field: &Field) -> Field {
        if matches!(field.data_type(), DataType::Utf8) {
            Field::new(field.name(), DataType::Utf8View, field.is_nullable())
                .with_metadata(field.metadata().clone())
        } else {
            field.clone()
        }
    }

    let utf8_field = Field::new("str_field", DataType::Utf8, false);
    let field = with_utf8view(&utf8_field);
    assert_eq!(field.data_type(), &DataType::Utf8View);

    // A StringViewArray column must survive a round trip through RecordBatch.
    let column: ArrayRef = Arc::new(StringViewArray::from(vec!["test1", "test2"]));
    let batch = RecordBatch::try_from_iter(vec![("str_field", column)]).unwrap();
    assert!(batch.column(0).as_any().is::<StringViewArray>());
}
2512
2513 fn make_reader_schema_with_default_fields(
2514 path: &str,
2515 default_fields: Vec<Value>,
2516 ) -> AvroSchema {
2517 let mut root = load_writer_schema_json(path);
2518 assert_eq!(root["type"], "record", "writer schema must be a record");
2519 root.as_object_mut()
2520 .expect("schema is a JSON object")
2521 .insert("fields".to_string(), Value::Array(default_fields));
2522 AvroSchema::new(root.to_string())
2523 }
2524
#[test]
fn test_schema_resolution_defaults_all_supported_types() {
    use std::iter::repeat_n;

    // The reader schema consists solely of fields with defaults, so every
    // output column must be the field's default repeated once per row.
    let path = "test/data/skippable_types.avro";
    let duration_default = "\u{0000}".repeat(12);
    let default_fields = vec![
        serde_json::json!({"name":"d_bool","type":"boolean","default":true}),
        serde_json::json!({"name":"d_int","type":"int","default":42}),
        serde_json::json!({"name":"d_long","type":"long","default":12345}),
        serde_json::json!({"name":"d_float","type":"float","default":1.5}),
        serde_json::json!({"name":"d_double","type":"double","default":2.25}),
        serde_json::json!({"name":"d_bytes","type":"bytes","default":"XYZ"}),
        serde_json::json!({"name":"d_string","type":"string","default":"hello"}),
        serde_json::json!({"name":"d_date","type":{"type":"int","logicalType":"date"},"default":0}),
        serde_json::json!({"name":"d_time_ms","type":{"type":"int","logicalType":"time-millis"},"default":1000}),
        serde_json::json!({"name":"d_time_us","type":{"type":"long","logicalType":"time-micros"},"default":2000}),
        serde_json::json!({"name":"d_ts_ms","type":{"type":"long","logicalType":"local-timestamp-millis"},"default":0}),
        serde_json::json!({"name":"d_ts_us","type":{"type":"long","logicalType":"local-timestamp-micros"},"default":0}),
        serde_json::json!({"name":"d_decimal","type":{"type":"bytes","logicalType":"decimal","precision":10,"scale":2},"default":""}),
        serde_json::json!({"name":"d_fixed","type":{"type":"fixed","name":"F4","size":4},"default":"ABCD"}),
        serde_json::json!({"name":"d_enum","type":{"type":"enum","name":"E","symbols":["A","B","C"]},"default":"A"}),
        serde_json::json!({"name":"d_duration","type":{"type":"fixed","name":"Dur","size":12,"logicalType":"duration"},"default":duration_default}),
        serde_json::json!({"name":"d_uuid","type":{"type":"string","logicalType":"uuid"},"default":"00000000-0000-0000-0000-000000000000"}),
        serde_json::json!({"name":"d_array","type":{"type":"array","items":"int"},"default":[1,2,3]}),
        serde_json::json!({"name":"d_map","type":{"type":"map","values":"long"},"default":{"a":1,"b":2}}),
        serde_json::json!({"name":"d_record","type":{
            "type":"record","name":"DefaultRec","fields":[
                {"name":"x","type":"int"},
                {"name":"y","type":["null","string"],"default":null}
            ]
        },"default":{"x":7}}),
        serde_json::json!({"name":"d_nullable_null","type":["null","int"],"default":null}),
        serde_json::json!({"name":"d_nullable_value","type":["int","null"],"default":123}),
    ];
    let reader_schema = make_reader_schema_with_default_fields(path, default_fields);
    let actual = read_alltypes_with_reader_schema(path, reader_schema);
    let num_rows = actual.num_rows();
    assert!(num_rows > 0, "skippable_types.avro should contain rows");
    assert_eq!(
        actual.num_columns(),
        22,
        "expected exactly our defaulted fields"
    );

    // --- primitives ---
    let d_bool: ArrayRef = Arc::new(BooleanArray::from_iter(repeat_n(Some(true), num_rows)));
    let d_int: ArrayRef = Arc::new(Int32Array::from_iter_values(repeat_n(42, num_rows)));
    let d_long: ArrayRef = Arc::new(Int64Array::from_iter_values(repeat_n(12345, num_rows)));
    let d_float: ArrayRef =
        Arc::new(Float32Array::from_iter_values(repeat_n(1.5f32, num_rows)));
    let d_double: ArrayRef =
        Arc::new(Float64Array::from_iter_values(repeat_n(2.25f64, num_rows)));
    let d_bytes: ArrayRef = Arc::new(BinaryArray::from_iter_values(repeat_n(
        b"XYZ".as_ref(),
        num_rows,
    )));
    let d_string: ArrayRef =
        Arc::new(StringArray::from_iter_values(repeat_n("hello", num_rows)));

    // --- date / time logical types ---
    let d_date: ArrayRef = Arc::new(Date32Array::from_iter_values(repeat_n(0, num_rows)));
    let d_time_ms: ArrayRef = Arc::new(Time32MillisecondArray::from_iter_values(repeat_n(
        1_000, num_rows,
    )));
    let d_time_us: ArrayRef = Arc::new(Time64MicrosecondArray::from_iter_values(repeat_n(
        2_000i64, num_rows,
    )));
    let d_ts_ms: ArrayRef = Arc::new(TimestampMillisecondArray::from_iter_values(repeat_n(
        0i64, num_rows,
    )));
    let d_ts_us: ArrayRef = Arc::new(TimestampMicrosecondArray::from_iter_values(repeat_n(
        0i64, num_rows,
    )));

    // --- decimal: backing array type depends on the `small_decimals` feature ---
    #[cfg(feature = "small_decimals")]
    let decimal = Decimal64Array::from_iter_values(repeat_n(0i64, num_rows))
        .with_precision_and_scale(10, 2)
        .unwrap();
    #[cfg(not(feature = "small_decimals"))]
    let decimal = Decimal128Array::from_iter_values(repeat_n(0i128, num_rows))
        .with_precision_and_scale(10, 2)
        .unwrap();
    let d_decimal: ArrayRef = Arc::new(decimal);

    // --- fixed, enum, duration, uuid ---
    let d_fixed: ArrayRef = Arc::new(
        FixedSizeBinaryArray::try_from_sparse_iter_with_size(
            repeat_n(Some(*b"ABCD"), num_rows),
            4,
        )
        .unwrap(),
    );
    let d_enum: ArrayRef = Arc::new(
        DictionaryArray::<Int32Type>::try_new(
            Int32Array::from_iter_values(repeat_n(0, num_rows)),
            Arc::new(StringArray::from_iter_values(["A", "B", "C"])),
        )
        .unwrap(),
    );
    let duration_arr: IntervalMonthDayNanoArray = repeat_n(
        Some(IntervalMonthDayNanoType::make_value(0, 0, 0)),
        num_rows,
    )
    .collect();
    let d_duration: ArrayRef = Arc::new(duration_arr);
    let d_uuid: ArrayRef = Arc::new(
        FixedSizeBinaryArray::try_from_sparse_iter_with_size(
            repeat_n(Some([0u8; 16]), num_rows),
            16,
        )
        .unwrap(),
    );

    // --- array default [1, 2, 3] ---
    let item_field = Arc::new(Field::new(
        Field::LIST_FIELD_DEFAULT_NAME,
        DataType::Int32,
        false,
    ));
    let mut list_builder = ListBuilder::new(Int32Builder::new()).with_field(item_field);
    for _ in 0..num_rows {
        for item in [1, 2, 3] {
            list_builder.values().append_value(item);
        }
        list_builder.append(true);
    }
    let d_array: ArrayRef = Arc::new(list_builder.finish());

    // --- map default {"a": 1, "b": 2} ---
    let values_field = Arc::new(Field::new("value", DataType::Int64, false));
    let map_field_names = builder::MapFieldNames {
        entry: "entries".to_string(),
        key: "key".to_string(),
        value: "value".to_string(),
    };
    let mut map_builder = MapBuilder::new(
        Some(map_field_names),
        StringBuilder::new(),
        Int64Builder::new(),
    )
    .with_values_field(values_field);
    for _ in 0..num_rows {
        let (keys, vals) = map_builder.entries();
        for (key, val) in [("a", 1), ("b", 2)] {
            keys.append_value(key);
            vals.append_value(val);
        }
        map_builder.append(true).unwrap();
    }
    let d_map: ArrayRef = Arc::new(map_builder.finish());

    // --- record default {"x": 7}; `y` falls back to its own null default ---
    let rec_fields: Fields = Fields::from(vec![
        Field::new("x", DataType::Int32, false),
        Field::new("y", DataType::Utf8, true),
    ]);
    let mut struct_builder = StructBuilder::new(
        rec_fields.clone(),
        vec![
            Box::new(Int32Builder::new()),
            Box::new(StringBuilder::new()),
        ],
    );
    for _ in 0..num_rows {
        struct_builder
            .field_builder::<Int32Builder>(0)
            .unwrap()
            .append_value(7);
        struct_builder
            .field_builder::<StringBuilder>(1)
            .unwrap()
            .append_null();
        struct_builder.append(true);
    }
    let d_record: ArrayRef = Arc::new(struct_builder.finish());

    // --- nullable unions: one defaulting to null, one to 123 ---
    let d_nullable_null: ArrayRef =
        Arc::new(Int32Array::from_iter(repeat_n(None::<i32>, num_rows)));
    let d_nullable_value: ArrayRef =
        Arc::new(Int32Array::from_iter_values(repeat_n(123, num_rows)));

    // Column order mirrors the reader schema's field order.
    let arrays: Vec<ArrayRef> = vec![
        d_bool,
        d_int,
        d_long,
        d_float,
        d_double,
        d_bytes,
        d_string,
        d_date,
        d_time_ms,
        d_time_us,
        d_ts_ms,
        d_ts_us,
        d_decimal,
        d_fixed,
        d_enum,
        d_duration,
        d_uuid,
        d_array,
        d_map,
        d_record,
        d_nullable_null,
        d_nullable_value,
    ];
    let expected = RecordBatch::try_new(actual.schema(), arrays).unwrap();
    assert_eq!(
        actual, expected,
        "defaults should materialize correctly for all fields"
    );
}
2699
#[test]
fn test_schema_resolution_default_enum_invalid_symbol_errors() {
    // "Z" is not among the enum's symbols, so building the reader must fail.
    let path = "test/data/skippable_types.avro";
    let bad_field = serde_json::json!({
        "name":"bad_enum",
        "type":{"type":"enum","name":"E","symbols":["A","B","C"]},
        "default":"Z"
    });
    let bad_schema = make_reader_schema_with_default_fields(path, vec![bad_field]);
    let input = BufReader::new(File::open(path).unwrap());
    let msg = ReaderBuilder::new()
        .with_reader_schema(bad_schema)
        .build(input)
        .expect_err("expected enum default validation to fail")
        .to_string();
    // Match loosely (case-insensitive) so the exact wording can evolve.
    let lower_msg = msg.to_lowercase();
    let mentions_enum = lower_msg.contains("enum");
    let mentions_cause = lower_msg.contains("symbol") || lower_msg.contains("default");
    assert!(mentions_enum && mentions_cause, "unexpected error: {msg}");
}
2724
#[test]
fn test_schema_resolution_default_fixed_size_mismatch_errors() {
    // The default "ABC" has 3 bytes but the fixed type declares size 4, so
    // building the reader must fail.
    let path = "test/data/skippable_types.avro";
    let bad_field = serde_json::json!({
        "name":"bad_fixed",
        "type":{"type":"fixed","name":"F","size":4},
        "default":"ABC"
    });
    let bad_schema = make_reader_schema_with_default_fields(path, vec![bad_field]);
    let input = BufReader::new(File::open(path).unwrap());
    let msg = ReaderBuilder::new()
        .with_reader_schema(bad_schema)
        .build(input)
        .expect_err("expected fixed default validation to fail")
        .to_string();
    // Match loosely (case-insensitive) so the exact wording can evolve.
    let lower_msg = msg.to_lowercase();
    let mentions_fixed = lower_msg.contains("fixed");
    let mentions_cause = ["size", "length", "does not match"]
        .iter()
        .any(|needle| lower_msg.contains(*needle));
    assert!(mentions_fixed && mentions_cause, "unexpected error: {msg}");
}
2751
#[test]
#[cfg(feature = "snappy")]
fn test_alltypes_skip_writer_fields_keep_double_only() {
    // The reader schema selects a single writer field; the batch must
    // contain just that column.
    let file = arrow_test_data("avro/alltypes_plain.avro");
    let selected = ["double_col"];
    let reader_schema = make_reader_schema_with_selected_fields_in_order(&file, &selected);
    let batch = read_alltypes_with_reader_schema(&file, reader_schema);

    let double_col: ArrayRef = Arc::new(Float64Array::from_iter_values(
        (0..8).map(|x| (x % 2) as f64 * 10.1),
    ));
    let expected =
        RecordBatch::try_from_iter_with_nullable([("double_col", double_col, true)]).unwrap();
    assert_eq!(batch, expected);
}
2770
/// Projects `alltypes_plain.avro` to `timestamp_col` followed by `id` (the
/// order requested by the reader schema) and compares against expected arrays.
///
/// Only compiled when the `snappy` feature is enabled.
#[test]
#[cfg(feature = "snappy")]
fn test_alltypes_skip_writer_fields_reorder_and_skip_many() {
    let avro_path = arrow_test_data("avro/alltypes_plain.avro");
    let projection =
        make_reader_schema_with_selected_fields_in_order(&avro_path, &["timestamp_col", "id"]);
    let actual = read_alltypes_with_reader_schema(&avro_path, projection);

    // Microsecond timestamps tagged with the "+00:00" timezone.
    let timestamps: ArrayRef = Arc::new(
        TimestampMicrosecondArray::from_iter_values([
            1235865600000000,
            1235865660000000,
            1238544000000000,
            1238544060000000,
            1233446400000000,
            1233446460000000,
            1230768000000000,
            1230768060000000,
        ])
        .with_timezone("+00:00"),
    );
    let ids: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1]));

    let expected = RecordBatch::try_from_iter_with_nullable([
        ("timestamp_col", timestamps, true),
        ("id", ids, true),
    ])
    .unwrap();

    assert_eq!(actual, expected);
}
2806
/// For every field of the writer schema in `skippable_types.avro`, reads the
/// file with a reader schema that selects only that field and checks that the
/// one-column result equals the corresponding column of a full read.
///
/// For list columns the expected array is re-wrapped around the projected
/// element field, so the comparison uses the projected element definition.
#[test]
fn test_skippable_types_project_each_field_individually() {
    /// Rebuilds a `ListArray` column around `elem`, reusing offsets, values and nulls.
    fn relabel_list(col: &ArrayRef, elem: Arc<Field>) -> ArrayRef {
        let src = col
            .as_any()
            .downcast_ref::<ListArray>()
            .expect("expected ListArray");
        let rebuilt = ListArray::try_new(
            elem,
            src.offsets().clone(),
            src.values().clone(),
            src.nulls().cloned(),
        );
        Arc::new(rebuilt.unwrap())
    }

    /// Same as `relabel_list`, for `LargeListArray` columns.
    fn relabel_large_list(col: &ArrayRef, elem: Arc<Field>) -> ArrayRef {
        let src = col
            .as_any()
            .downcast_ref::<LargeListArray>()
            .expect("expected LargeListArray");
        let rebuilt = LargeListArray::try_new(
            elem,
            src.offsets().clone(),
            src.values().clone(),
            src.nulls().cloned(),
        );
        Arc::new(rebuilt.unwrap())
    }

    let path = "test/data/skippable_types.avro";
    let full = read_file(path, 1024, false);
    let schema_full = full.schema();
    let num_rows = full.num_rows();

    let writer_json = load_writer_schema_json(path);
    assert_eq!(
        writer_json["type"], "record",
        "writer schema must be a record"
    );
    let fields_json = writer_json
        .get("fields")
        .and_then(|f| f.as_array())
        .expect("record has fields");
    assert_eq!(
        schema_full.fields().len(),
        fields_json.len(),
        "full read column count vs writer fields"
    );

    for (idx, f) in fields_json.iter().enumerate() {
        let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
            panic!("field at index {idx} has no name");
        };
        let projected = read_alltypes_with_reader_schema(
            path,
            make_reader_schema_with_selected_fields_in_order(path, &[name]),
        );
        assert_eq!(
            projected.num_columns(),
            1,
            "projected batch should contain exactly the selected column '{name}'"
        );
        assert_eq!(
            projected.num_rows(),
            num_rows,
            "row count mismatch for projected column '{name}'"
        );

        let col_full = full.column(idx).clone();
        let full_field = schema_full.field(idx).as_ref().clone();
        let proj_field_ref = projected.schema().field(0).clone();
        let proj_field = proj_field_ref.as_ref();
        // The expected field always carries the projected field's metadata.
        let top_meta = proj_field.metadata().clone();

        // `Some` only when both sides are the same flavour of list.
        let relabeled: Option<ArrayRef> = match (full_field.data_type(), proj_field.data_type()) {
            (&DataType::List(_), DataType::List(proj_elem)) => {
                Some(relabel_list(&col_full, proj_elem.clone()))
            }
            (&DataType::LargeList(_), DataType::LargeList(proj_elem)) => {
                Some(relabel_large_list(&col_full, proj_elem.clone()))
            }
            _ => None,
        };

        let (expected_field, expected_col) = match relabeled {
            Some(new_col) => {
                let nf = Field::new(
                    full_field.name().clone(),
                    proj_field.data_type().clone(),
                    full_field.is_nullable(),
                )
                .with_metadata(top_meta);
                (nf, new_col)
            }
            None => (full_field.with_metadata(top_meta), col_full),
        };

        let expected = RecordBatch::try_new(
            Arc::new(Schema::new(vec![Arc::new(expected_field)])),
            vec![expected_col],
        )
        .unwrap();
        assert_eq!(
            projected, expected,
            "projected column '{name}' mismatch vs full read column"
        );
    }
}
2915
/// Reads `union_fields.avro` and spot-checks several columns:
/// two columns that decode to plain nullable arrays, and four columns that
/// decode to `UnionArray`s, verifying the per-row branch selection and the
/// values stored in each union child.
#[test]
fn test_union_fields_avro_nullable_and_general_unions() {
    /// Looks up `column` in `rb` and downcasts it to a union, panicking with `msg`.
    fn union_column<'a>(rb: &'a RecordBatch, column: &str, msg: &str) -> &'a UnionArray {
        let idx = rb.schema().index_of(column).unwrap();
        rb.column(idx)
            .as_any()
            .downcast_ref::<UnionArray>()
            .expect(msg)
    }

    /// Returns the union's field list.
    fn union_fields(u: &UnionArray) -> &UnionFields {
        match u.data_type() {
            DataType::Union(fields, _) => fields,
            other => panic!("expected Union, got {other:?}"),
        }
    }

    /// Type id of the first union child whose field name equals `name`.
    fn tid_by_name(fields: &UnionFields, name: &str) -> i8 {
        fields
            .iter()
            .find(|(_, f)| f.name() == name)
            .map(|(tid, _)| tid)
            .unwrap_or_else(|| panic!("union child '{name}' not found"))
    }

    /// Per-row type ids copied into a `Vec`.
    fn type_ids_of(u: &UnionArray) -> Vec<i8> {
        u.type_ids().iter().copied().collect()
    }

    /// Union child `tid` downcast to the concrete array type `T`.
    fn child_as<T: 'static>(u: &UnionArray, tid: i8) -> &T {
        u.child(tid).as_any().downcast_ref::<T>().unwrap()
    }

    /// Struct member `member` downcast to the concrete array type `T`.
    fn member_as<'a, T: 'static>(rec: &'a StructArray, member: &str) -> &'a T {
        rec.column_by_name(member)
            .unwrap()
            .as_any()
            .downcast_ref::<T>()
            .unwrap()
    }

    let path = "test/data/union_fields.avro";
    let batch = read_file(path, 1024, false);
    let schema = batch.schema();

    // --- nullable_int_nullfirst: read as a nullable Int32 column ---
    let nullable_ints = batch
        .column(schema.index_of("nullable_int_nullfirst").unwrap())
        .as_primitive::<Int32Type>();
    assert_eq!(nullable_ints.len(), 4);
    assert!(nullable_ints.is_null(0));
    assert_eq!(nullable_ints.value(1), 42);
    assert!(nullable_ints.is_null(2));
    assert_eq!(nullable_ints.value(3), 0);

    // --- nullable_string_nullsecond: read as a nullable Utf8 column ---
    let nullable_strs = batch
        .column(schema.index_of("nullable_string_nullsecond").unwrap())
        .as_any()
        .downcast_ref::<StringArray>()
        .expect("nullable_string_nullsecond should be Utf8");
    assert_eq!(nullable_strs.len(), 4);
    assert_eq!(nullable_strs.value(0), "s1");
    assert!(nullable_strs.is_null(1));
    assert_eq!(nullable_strs.value(2), "s3");
    // An empty string is a valid value, not a null.
    assert!(nullable_strs.is_valid(3));
    assert_eq!(nullable_strs.value(3), "");

    // --- union_prim: one row per primitive branch, dense layout ---
    let prim = union_column(&batch, "union_prim", "union_prim should be Union");
    let prim_fields = union_fields(prim);
    if let DataType::Union(_, mode) = prim.data_type() {
        assert!(matches!(mode, UnionMode::Dense), "expect dense unions");
    }
    let expected_type_ids = vec![
        tid_by_name(prim_fields, "long"),
        tid_by_name(prim_fields, "int"),
        tid_by_name(prim_fields, "float"),
        tid_by_name(prim_fields, "double"),
    ];
    let type_ids = type_ids_of(prim);
    assert_eq!(
        type_ids, expected_type_ids,
        "branch selection for union_prim rows"
    );
    assert_eq!(
        child_as::<Int64Array>(prim, tid_by_name(prim_fields, "long")).len(),
        1
    );
    assert_eq!(
        child_as::<Int32Array>(prim, tid_by_name(prim_fields, "int")).len(),
        1
    );
    assert_eq!(
        child_as::<Float32Array>(prim, tid_by_name(prim_fields, "float")).len(),
        1
    );
    assert_eq!(
        child_as::<Float64Array>(prim, tid_by_name(prim_fields, "double")).len(),
        1
    );

    // --- union_bytes_vs_string ---
    let bytes_or_str = union_column(
        &batch,
        "union_bytes_vs_string",
        "union_bytes_vs_string should be Union",
    );
    let bs_fields = union_fields(bytes_or_str);
    let tid_bytes = tid_by_name(bs_fields, "bytes");
    let tid_string = tid_by_name(bs_fields, "string");
    let type_ids = type_ids_of(bytes_or_str);
    assert_eq!(
        type_ids,
        vec![tid_bytes, tid_string, tid_string, tid_bytes],
        "branch selection for bytes/string union"
    );
    let s_child = child_as::<StringArray>(bytes_or_str, tid_string);
    assert_eq!(s_child.len(), 2);
    assert_eq!(s_child.value(0), "hello");
    assert_eq!(s_child.value(1), "world");
    let b_child = child_as::<BinaryArray>(bytes_or_str, tid_bytes);
    assert_eq!(b_child.len(), 2);
    assert_eq!(b_child.value(0), &[0x00, 0xFF, 0x7F]);
    assert_eq!(b_child.value(1), b"");

    // --- union_enum_records_array_map ---
    let complex = union_column(
        &batch,
        "union_enum_records_array_map",
        "union_enum_records_array_map should be Union",
    );
    // Children are identified by data type; the two records are told apart by
    // their member names. A later matching child overrides an earlier one.
    let mut tid_enum: Option<i8> = None;
    let mut tid_rec_a: Option<i8> = None;
    let mut tid_rec_b: Option<i8> = None;
    let mut tid_array: Option<i8> = None;
    for (tid, f) in union_fields(complex).iter() {
        match f.data_type() {
            DataType::Dictionary(_, _) => tid_enum = Some(tid),
            DataType::List(_) => tid_array = Some(tid),
            DataType::Struct(members) if members.len() == 2 => {
                match (members[0].name().as_str(), members[1].name().as_str()) {
                    ("a", "b") => tid_rec_a = Some(tid),
                    ("x", "y") => tid_rec_b = Some(tid),
                    _ => {}
                }
            }
            _ => {}
        }
    }
    let tid_enum = tid_enum.expect("enum child");
    let tid_rec_a = tid_rec_a.expect("RecA child");
    let tid_rec_b = tid_rec_b.expect("RecB child");
    let tid_array = tid_array.expect("array<long> child");
    let type_ids = type_ids_of(complex);
    assert_eq!(
        type_ids,
        vec![tid_enum, tid_rec_a, tid_rec_b, tid_array],
        "branch selection for complex union"
    );

    let dict = child_as::<DictionaryArray<Int32Type>>(complex, tid_enum);
    assert_eq!(dict.len(), 1);
    assert!(dict.is_valid(0));

    let rec_a = child_as::<StructArray>(complex, tid_rec_a);
    assert_eq!(rec_a.len(), 1);
    assert_eq!(member_as::<Int32Array>(rec_a, "a").value(0), 7);
    assert_eq!(member_as::<StringArray>(rec_a, "b").value(0), "x");

    let rec_b = child_as::<StructArray>(complex, tid_rec_b);
    assert_eq!(
        member_as::<Int64Array>(rec_b, "x").value(0),
        123_456_789_i64
    );
    assert_eq!(
        member_as::<BinaryArray>(rec_b, "y").value(0),
        &[0xFF, 0x00]
    );

    let arr = child_as::<ListArray>(complex, tid_array);
    assert_eq!(arr.len(), 1);
    let first_values = arr.value(0);
    let longs = first_values.as_any().downcast_ref::<Int64Array>().unwrap();
    assert_eq!(longs.len(), 3);
    for (pos, want) in [1_i64, 2, 3].iter().enumerate() {
        assert_eq!(longs.value(pos), *want);
    }

    // --- union_date_or_fixed4 ---
    let date_or_fixed = union_column(
        &batch,
        "union_date_or_fixed4",
        "union_date_or_fixed4 should be Union",
    );
    let mut tid_date: Option<i8> = None;
    let mut tid_fixed: Option<i8> = None;
    for (tid, f) in union_fields(date_or_fixed).iter() {
        let slot = match f.data_type() {
            DataType::Date32 => &mut tid_date,
            DataType::FixedSizeBinary(4) => &mut tid_fixed,
            _ => continue,
        };
        *slot = Some(tid);
    }
    let tid_date = tid_date.expect("date");
    let tid_fixed = tid_fixed.expect("fixed(4)");
    let type_ids = type_ids_of(date_or_fixed);
    assert_eq!(
        type_ids,
        vec![tid_date, tid_fixed, tid_date, tid_fixed],
        "branch selection for date/fixed4 union"
    );
    let dates = child_as::<Date32Array>(date_or_fixed, tid_date);
    assert_eq!(dates.len(), 2);
    assert_eq!(dates.value(0), 19_000);
    assert_eq!(dates.value(1), 0);
    let fixed = child_as::<FixedSizeBinaryArray>(date_or_fixed, tid_fixed);
    assert_eq!(fixed.len(), 2);
    assert_eq!(fixed.value(0), b"ABCD");
    assert_eq!(fixed.value(1), &[0x00, 0x11, 0x22, 0x33]);
}
3183
/// Reads `union_fields.avro` twice: once with the writer schema (baseline) and
/// once with a reader schema derived from it in which the union branches of
/// each field are rewritten (mostly reordered). It then checks that every row
/// of every union resolves to the branch named by `expected_token`, which maps
/// writer `int`/`float` rows of `union_prim` to `long`/`double` and writer
/// `int` rows of `record_with_union_field.u` to `long`.
#[test]
fn test_union_schema_resolution_all_type_combinations() {
    // ---------- helpers for editing the writer schema JSON ----------

    /// True when `obj["type"]` is the string `ty`.
    fn type_is(obj: &Value, ty: &str) -> bool {
        obj.get("type").and_then(|v| v.as_str()) == Some(ty)
    }
    /// True for a named type (`fixed`, `record`, ...) with the given name.
    fn is_named_type(obj: &Value, ty: &str, nm: &str) -> bool {
        type_is(obj, ty) && obj.get("name").and_then(|v| v.as_str()) == Some(nm)
    }
    /// True for primitive `prim` annotated with logical type `lt`.
    fn is_logical(obj: &Value, prim: &str, lt: &str) -> bool {
        type_is(obj, prim) && obj.get("logicalType").and_then(|v| v.as_str()) == Some(lt)
    }
    /// Clone of the first branch satisfying `pred`; panics when none matches.
    fn pick(branches: &[Value], pred: impl Fn(&Value) -> bool) -> Value {
        branches.iter().find(|v| pred(v)).cloned().unwrap()
    }
    /// Clone of the branch list of a field whose `type` is a JSON array.
    fn branches_of(field: &Value) -> Vec<Value> {
        field["type"].as_array().unwrap().clone()
    }

    // ---------- helpers for comparing decoded unions ----------

    /// Short, order-independent label for a union child's Arrow data type.
    fn branch_token(dt: &DataType) -> String {
        use arrow_schema::TimeUnit;
        let token: &str = match dt {
            DataType::Null => "null",
            DataType::Boolean => "boolean",
            DataType::Int32 => "int",
            DataType::Int64 => "long",
            DataType::Float32 => "float",
            DataType::Float64 => "double",
            DataType::Binary => "bytes",
            DataType::Utf8 => "string",
            DataType::Date32 => "date",
            DataType::Time32(TimeUnit::Millisecond) => "time-millis",
            DataType::Time64(TimeUnit::Microsecond) => "time-micros",
            // A timezone distinguishes the UTC flavour from the local one.
            DataType::Timestamp(TimeUnit::Millisecond, Some(_)) => "timestamp-millis",
            DataType::Timestamp(TimeUnit::Millisecond, None) => "local-timestamp-millis",
            DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => "timestamp-micros",
            DataType::Timestamp(TimeUnit::Microsecond, None) => "local-timestamp-micros",
            DataType::Interval(IntervalUnit::MonthDayNano) => "duration",
            DataType::FixedSizeBinary(n) => return format!("fixed{n}"),
            DataType::Dictionary(_, _) => "enum",
            DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => {
                return format!("decimal({p},{s})")
            }
            #[cfg(feature = "small_decimals")]
            DataType::Decimal64(p, s) => return format!("decimal({p},{s})"),
            DataType::Struct(fields) => {
                let names: Vec<&str> = fields.iter().map(|f| f.name().as_str()).collect();
                match names.as_slice() {
                    ["a", "b"] => "record:RecA",
                    ["x", "y"] => "record:RecB",
                    _ => "record",
                }
            }
            DataType::List(_) => "array",
            DataType::Map(_, _) => "map",
            other => return format!("{other:?}"),
        };
        token.to_string()
    }

    /// Per-row type ids plus a type-id -> branch-token lookup for `u`.
    fn union_tokens(u: &UnionArray) -> (Vec<i8>, HashMap<i8, String>) {
        let other = u.data_type();
        let DataType::Union(fields, _) = other else {
            panic!("expected Union, got {other:?}")
        };
        let dict: HashMap<i8, String> = fields
            .iter()
            .map(|(tid, f)| (tid, branch_token(f.data_type())))
            .collect();
        let ids: Vec<i8> = u.type_ids().iter().copied().collect();
        (ids, dict)
    }

    /// Branch token the reader is expected to pick for a writer branch.
    fn expected_token(field_name: &str, writer_token: &str) -> String {
        let resolved = match (field_name, writer_token) {
            ("union_prim", "int") | ("record_with_union_field.u", "int") => "long",
            ("union_prim", "float") => "double",
            (_, unchanged) => unchanged,
        };
        resolved.into()
    }

    /// Column `fname` of `rb`, downcast to a union.
    fn get_union<'a>(rb: &'a RecordBatch, fname: &str) -> &'a UnionArray {
        let idx = rb.schema().index_of(fname).unwrap();
        rb.column(idx)
            .as_any()
            .downcast_ref::<UnionArray>()
            .unwrap_or_else(|| panic!("{fname} should be a Union"))
    }

    /// Column `fname` from the baseline and the resolved batch, in that order.
    fn column_pair<'a>(
        baseline: &'a RecordBatch,
        resolved: &'a RecordBatch,
        fname: &str,
    ) -> (&'a ArrayRef, &'a ArrayRef) {
        let idx_b = baseline.schema().index_of(fname).unwrap();
        let idx_r = resolved.schema().index_of(fname).unwrap();
        (baseline.column(idx_b), resolved.column(idx_r))
    }

    /// Asserts row-by-row that the reader union chose the expected branch.
    fn assert_union_equivalent(field_name: &str, u_writer: &UnionArray, u_reader: &UnionArray) {
        let (ids_w, dict_w) = union_tokens(u_writer);
        let (ids_r, dict_r) = union_tokens(u_reader);
        assert_eq!(
            ids_w.len(),
            ids_r.len(),
            "{field_name}: row count mismatch between baseline and resolved"
        );
        let paired_ids = ids_w.iter().zip(ids_r.iter());
        for (i, (id_w, id_r)) in paired_ids.enumerate() {
            let w_tok = dict_w.get(id_w).unwrap();
            let want = expected_token(field_name, w_tok);
            let got = dict_r.get(id_r).unwrap();
            assert_eq!(
                got, &want,
                "{field_name}: row {i} resolved to wrong union branch (writer={w_tok}, expected={want}, got={got})"
            );
        }
    }

    /// Nullability of the `value` member of a map's entries struct.
    fn value_nullable(m: &MapArray) -> bool {
        let entries_field = match m.data_type() {
            DataType::Map(entries_field, _sorted) => entries_field,
            other => panic!("expected Map data type, got {other:?}"),
        };
        match entries_field.data_type() {
            DataType::Struct(fields) => {
                assert_eq!(fields.len(), 2, "entries struct must have 2 fields");
                assert_eq!(fields[0].name(), "key");
                assert_eq!(fields[1].name(), "value");
                fields[1].is_nullable()
            }
            other => panic!("Map entries field must be Struct, got {other:?}"),
        }
    }

    // ---------- baseline read and reader-schema construction ----------

    let path = "test/data/union_fields.avro";
    let baseline = read_file(path, 1024, false);
    let mut root = load_writer_schema_json(path);
    assert_eq!(root["type"], "record", "writer schema must be a record");
    let fields = root
        .get_mut("fields")
        .and_then(|f| f.as_array_mut())
        .expect("record has fields");

    for f in fields.iter_mut() {
        let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
            continue;
        };
        match name {
            "nullable_int_nullfirst" => f["type"] = json!(["int", "null"]),
            "nullable_string_nullsecond" => f["type"] = json!(["null", "string"]),
            "union_bytes_vs_string" => f["type"] = json!(["string", "bytes"]),
            "union_null_bytes_string" => f["type"] = json!(["bytes", "string", "null"]),
            "union_bool_or_string" => f["type"] = json!(["string", "boolean"]),
            "union_prim" => {
                let orig = branches_of(f);
                let wanted: Vec<Value> = ["long", "double", "string", "bytes", "boolean"]
                    .iter()
                    .map(|p| Value::String(p.to_string()))
                    .collect();
                // Every reader branch must already exist in the writer union.
                for branch in &wanted {
                    assert!(orig.contains(branch));
                }
                f["type"] = Value::Array(wanted);
            }
            "union_fixed_dur_decfix" => {
                let orig = branches_of(f);
                let fx8 = pick(&orig, |o| is_named_type(o, "fixed", "Fx8"));
                let dur12 = pick(&orig, |o| is_named_type(o, "fixed", "Dur12"));
                let decfix16 = pick(&orig, |o| is_named_type(o, "fixed", "DecFix16"));
                f["type"] = json!([decfix16, dur12, fx8]);
            }
            "union_enum_records_array_map" => {
                let orig = branches_of(f);
                let enum_color = pick(&orig, |o| type_is(o, "enum"));
                let rec_a = pick(&orig, |o| is_named_type(o, "record", "RecA"));
                let rec_b = pick(&orig, |o| is_named_type(o, "record", "RecB"));
                let arr = pick(&orig, |o| type_is(o, "array"));
                let map = pick(&orig, |o| type_is(o, "map"));
                f["type"] = json!([arr, map, rec_b, rec_a, enum_color]);
            }
            "union_date_or_fixed4" => {
                let orig = branches_of(f);
                let date = pick(&orig, |o| is_logical(o, "int", "date"));
                let fx4 = pick(&orig, |o| is_named_type(o, "fixed", "Fx4"));
                f["type"] = json!([fx4, date]);
            }
            "union_time_millis_or_enum" => {
                let orig = branches_of(f);
                let time_ms = pick(&orig, |o| is_logical(o, "int", "time-millis"));
                let en = pick(&orig, |o| type_is(o, "enum"));
                f["type"] = json!([en, time_ms]);
            }
            "union_time_micros_or_string" => {
                let orig = branches_of(f);
                let time_us = pick(&orig, |o| is_logical(o, "long", "time-micros"));
                f["type"] = json!(["string", time_us]);
            }
            "union_ts_millis_utc_or_array" => {
                let orig = branches_of(f);
                let ts_ms = pick(&orig, |o| is_logical(o, "long", "timestamp-millis"));
                let arr = pick(&orig, |o| type_is(o, "array"));
                f["type"] = json!([arr, ts_ms]);
            }
            "union_ts_micros_local_or_bytes" => {
                let orig = branches_of(f);
                let lts_us = pick(&orig, |o| is_logical(o, "long", "local-timestamp-micros"));
                f["type"] = json!(["bytes", lts_us]);
            }
            "union_uuid_or_fixed10" => {
                let orig = branches_of(f);
                let uuid = pick(&orig, |o| is_logical(o, "string", "uuid"));
                let fx10 = pick(&orig, |o| is_named_type(o, "fixed", "Fx10"));
                f["type"] = json!([fx10, uuid]);
            }
            "union_dec_bytes_or_dec_fixed" => {
                let orig = branches_of(f);
                let dec_bytes = pick(&orig, |o| is_logical(o, "bytes", "decimal"));
                let dec_fix = pick(&orig, |o| {
                    is_named_type(o, "fixed", "DecFix20") && is_logical(o, "fixed", "decimal")
                });
                f["type"] = json!([dec_fix, dec_bytes]);
            }
            "union_ts_micros_utc_or_map" => {
                let orig = branches_of(f);
                let ts_us = pick(&orig, |o| is_logical(o, "long", "timestamp-micros"));
                let map = pick(&orig, |o| type_is(o, "map"));
                f["type"] = json!([map, ts_us]);
            }
            "union_ts_millis_local_or_string" => {
                let orig = branches_of(f);
                let lts_ms = pick(&orig, |o| is_logical(o, "long", "local-timestamp-millis"));
                f["type"] = json!(["string", lts_ms]);
            }
            "array_of_union" => {
                let array_ty = f
                    .get_mut("type")
                    .expect("array type")
                    .as_object_mut()
                    .unwrap();
                array_ty.insert("items".to_string(), json!(["string", "long"]));
            }
            "map_of_union" => {
                let map_ty = f
                    .get_mut("type")
                    .expect("map type")
                    .as_object_mut()
                    .unwrap();
                map_ty.insert("values".to_string(), json!(["double", "null"]));
            }
            "record_with_union_field" => {
                let rec_fields = f
                    .get_mut("type")
                    .expect("record type")
                    .as_object_mut()
                    .unwrap()
                    .get_mut("fields")
                    .unwrap()
                    .as_array_mut()
                    .unwrap();
                // Only the first member named "u" is rewritten.
                let u_member = rec_fields
                    .iter_mut()
                    .find(|rf| rf.get("name").and_then(|v| v.as_str()) == Some("u"));
                let found = match u_member {
                    Some(rf) => {
                        rf["type"] = json!(["string", "long"]);
                        true
                    }
                    None => false,
                };
                assert!(found, "field 'u' expected in HasUnion");
            }
            _ => {}
        }
    }

    let reader_schema = AvroSchema::new(root.to_string());
    let resolved = read_alltypes_with_reader_schema(path, reader_schema);

    // ---------- nullable (non-union) columns ----------
    for (fname, dt) in [
        ("nullable_int_nullfirst", DataType::Int32),
        ("nullable_string_nullsecond", DataType::Utf8),
    ] {
        let (col_b, col_r) = column_pair(&baseline, &resolved, fname);
        assert_eq!(
            col_b.data_type(),
            &dt,
            "baseline {fname} should decode as non-union with nullability"
        );
        assert_eq!(
            col_b.as_ref(),
            col_r.as_ref(),
            "{fname}: values must be identical regardless of null-branch order"
        );
    }

    // ---------- top-level union columns ----------
    let union_fields = [
        "union_prim",
        "union_bytes_vs_string",
        "union_fixed_dur_decfix",
        "union_enum_records_array_map",
        "union_date_or_fixed4",
        "union_time_millis_or_enum",
        "union_time_micros_or_string",
        "union_ts_millis_utc_or_array",
        "union_ts_micros_local_or_bytes",
        "union_uuid_or_fixed10",
        "union_dec_bytes_or_dec_fixed",
        "union_null_bytes_string",
        "union_ts_micros_utc_or_map",
        "union_ts_millis_local_or_string",
        "union_bool_or_string",
    ];
    for fname in union_fields {
        let u_b = get_union(&baseline, fname);
        let u_r = get_union(&resolved, fname);
        assert_union_equivalent(fname, u_b, u_r);
    }

    // ---------- union nested in a list ----------
    {
        let fname = "array_of_union";
        let (col_b, col_r) = column_pair(&baseline, &resolved, fname);
        let arr_b = col_b
            .as_any()
            .downcast_ref::<ListArray>()
            .expect("array_of_union should be a List");
        let arr_r = col_r
            .as_any()
            .downcast_ref::<ListArray>()
            .expect("array_of_union should be a List");
        assert_eq!(
            arr_b.value_offsets(),
            arr_r.value_offsets(),
            "{fname}: list offsets changed after resolution"
        );
        let u_b = arr_b
            .values()
            .as_any()
            .downcast_ref::<UnionArray>()
            .expect("array items should be Union");
        let u_r = arr_r
            .values()
            .as_any()
            .downcast_ref::<UnionArray>()
            .expect("array items should be Union");
        let (ids_b, dict_b) = union_tokens(u_b);
        let (ids_r, dict_r) = union_tokens(u_r);
        assert_eq!(ids_b.len(), ids_r.len(), "{fname}: values length mismatch");
        // Here the reader branch must carry exactly the writer's token.
        let paired_ids = ids_b.iter().zip(ids_r.iter());
        for (i, (id_b, id_r)) in paired_ids.enumerate() {
            let w_tok = dict_b.get(id_b).unwrap();
            let got = dict_r.get(id_r).unwrap();
            assert_eq!(
                got, w_tok,
                "{fname}: value {i} resolved to wrong branch (writer={w_tok}, got={got})"
            );
        }
    }

    // ---------- union (or nullable value) nested in a map ----------
    {
        let fname = "map_of_union";
        let (col_b, col_r) = column_pair(&baseline, &resolved, fname);
        let map_b = col_b
            .as_any()
            .downcast_ref::<MapArray>()
            .expect("map_of_union should be a Map");
        let map_r = col_r
            .as_any()
            .downcast_ref::<MapArray>()
            .expect("map_of_union should be a Map");
        assert_eq!(
            map_b.value_offsets(),
            map_r.value_offsets(),
            "{fname}: map value offsets changed after resolution"
        );
        let ent_b = map_b.entries();
        let ent_r = map_r.entries();
        let val_b_any = ent_b.column(1).as_ref();
        let val_r_any = ent_r.column(1).as_ref();
        let b_union = val_b_any.as_any().downcast_ref::<UnionArray>();
        let r_union = val_r_any.as_any().downcast_ref::<UnionArray>();
        match (b_union, r_union) {
            (Some(u_b), Some(u_r)) => assert_union_equivalent(fname, u_b, u_r),
            _ => {
                // At least one side is not a union: the value arrays must
                // match exactly and both map value fields must be nullable.
                assert_eq!(
                    val_b_any.data_type(),
                    val_r_any.data_type(),
                    "{fname}: value data types differ after resolution"
                );
                assert_eq!(
                    val_b_any, val_r_any,
                    "{fname}: value arrays differ after resolution (nullable value column case)"
                );
                assert!(
                    value_nullable(map_b),
                    "{fname}: baseline Map value field should be nullable per Arrow spec"
                );
                assert!(
                    value_nullable(map_r),
                    "{fname}: resolved Map value field should be nullable per Arrow spec"
                );
            }
        }
    }

    // ---------- union nested in a record ----------
    {
        let fname = "record_with_union_field";
        let (col_b, col_r) = column_pair(&baseline, &resolved, fname);
        let rec_b = col_b
            .as_any()
            .downcast_ref::<StructArray>()
            .expect("record_with_union_field should be a Struct");
        let rec_r = col_r
            .as_any()
            .downcast_ref::<StructArray>()
            .expect("record_with_union_field should be a Struct");
        let u_b = rec_b
            .column_by_name("u")
            .unwrap()
            .as_any()
            .downcast_ref::<UnionArray>()
            .expect("field 'u' should be Union (baseline)");
        let u_r = rec_r
            .column_by_name("u")
            .unwrap()
            .as_any()
            .downcast_ref::<UnionArray>()
            .expect("field 'u' should be Union (resolved)");
        assert_union_equivalent("record_with_union_field.u", u_b, u_r);
    }
}
3666
3667 #[test]
3668 fn test_union_fields_end_to_end_expected_arrays() {
3669 fn tid_by_name(fields: &UnionFields, want: &str) -> i8 {
3670 for (tid, f) in fields.iter() {
3671 if f.name() == want {
3672 return tid;
3673 }
3674 }
3675 panic!("union child '{want}' not found")
3676 }
3677
3678 fn tid_by_dt(fields: &UnionFields, pred: impl Fn(&DataType) -> bool) -> i8 {
3679 for (tid, f) in fields.iter() {
3680 if pred(f.data_type()) {
3681 return tid;
3682 }
3683 }
3684 panic!("no union child matches predicate");
3685 }
3686
3687 fn uuid16_from_str(s: &str) -> [u8; 16] {
3688 fn hex(b: u8) -> u8 {
3689 match b {
3690 b'0'..=b'9' => b - b'0',
3691 b'a'..=b'f' => b - b'a' + 10,
3692 b'A'..=b'F' => b - b'A' + 10,
3693 _ => panic!("invalid hex"),
3694 }
3695 }
3696 let mut out = [0u8; 16];
3697 let bytes = s.as_bytes();
3698 let (mut i, mut j) = (0, 0);
3699 while i < bytes.len() {
3700 if bytes[i] == b'-' {
3701 i += 1;
3702 continue;
3703 }
3704 let hi = hex(bytes[i]);
3705 let lo = hex(bytes[i + 1]);
3706 out[j] = (hi << 4) | lo;
3707 j += 1;
3708 i += 2;
3709 }
3710 assert_eq!(j, 16, "uuid must decode to 16 bytes");
3711 out
3712 }
3713
3714 fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
3715 match dt {
3716 DataType::Null => Arc::new(NullArray::new(0)),
3717 DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
3718 DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
3719 DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
3720 DataType::Float32 => Arc::new(arrow_array::Float32Array::from(Vec::<f32>::new())),
3721 DataType::Float64 => Arc::new(arrow_array::Float64Array::from(Vec::<f64>::new())),
3722 DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
3723 DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
3724 DataType::Date32 => Arc::new(arrow_array::Date32Array::from(Vec::<i32>::new())),
3725 DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
3726 Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
3727 }
3728 DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
3729 Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
3730 }
3731 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
3732 let a = TimestampMillisecondArray::from(Vec::<i64>::new());
3733 Arc::new(if let Some(tz) = tz {
3734 a.with_timezone(tz.clone())
3735 } else {
3736 a
3737 })
3738 }
3739 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
3740 let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
3741 Arc::new(if let Some(tz) = tz {
3742 a.with_timezone(tz.clone())
3743 } else {
3744 a
3745 })
3746 }
3747 DataType::Interval(IntervalUnit::MonthDayNano) => {
3748 Arc::new(arrow_array::IntervalMonthDayNanoArray::from(Vec::<
3749 IntervalMonthDayNano,
3750 >::new(
3751 )))
3752 }
3753 DataType::FixedSizeBinary(n) => Arc::new(FixedSizeBinaryArray::new_null(*n, 0)),
3754 DataType::Dictionary(k, v) => {
3755 assert_eq!(**k, DataType::Int32, "expect int32 keys for enums");
3756 let keys = Int32Array::from(Vec::<i32>::new());
3757 let values = match v.as_ref() {
3758 DataType::Utf8 => {
3759 Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
3760 }
3761 other => panic!("unexpected dictionary value type {other:?}"),
3762 };
3763 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
3764 }
3765 DataType::List(field) => {
3766 let values: ArrayRef = match field.data_type() {
3767 DataType::Int32 => {
3768 Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
3769 }
3770 DataType::Int64 => {
3771 Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
3772 }
3773 DataType::Utf8 => {
3774 Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
3775 }
3776 DataType::Union(_, _) => {
3777 let (uf, _) = if let DataType::Union(f, m) = field.data_type() {
3778 (f.clone(), m)
3779 } else {
3780 unreachable!()
3781 };
3782 let children: Vec<ArrayRef> = uf
3783 .iter()
3784 .map(|(_, f)| empty_child_for(f.data_type()))
3785 .collect();
3786 Arc::new(
3787 UnionArray::try_new(
3788 uf.clone(),
3789 ScalarBuffer::<i8>::from(Vec::<i8>::new()),
3790 Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
3791 children,
3792 )
3793 .unwrap(),
3794 ) as ArrayRef
3795 }
3796 other => panic!("unsupported list item type: {other:?}"),
3797 };
3798 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
3799 Arc::new(ListArray::try_new(field.clone(), offsets, values, None).unwrap())
3800 }
3801 DataType::Map(entry_field, ordered) => {
3802 let DataType::Struct(childs) = entry_field.data_type() else {
3803 panic!("map entries must be struct")
3804 };
3805 let key_field = &childs[0];
3806 let val_field = &childs[1];
3807 assert_eq!(key_field.data_type(), &DataType::Utf8);
3808 let keys = StringArray::from(Vec::<&str>::new());
3809 let vals: ArrayRef = match val_field.data_type() {
3810 DataType::Float64 => {
3811 Arc::new(arrow_array::Float64Array::from(Vec::<f64>::new())) as ArrayRef
3812 }
3813 DataType::Int64 => {
3814 Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
3815 }
3816 DataType::Utf8 => {
3817 Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
3818 }
3819 DataType::Union(uf, _) => {
3820 let ch: Vec<ArrayRef> = uf
3821 .iter()
3822 .map(|(_, f)| empty_child_for(f.data_type()))
3823 .collect();
3824 Arc::new(
3825 UnionArray::try_new(
3826 uf.clone(),
3827 ScalarBuffer::<i8>::from(Vec::<i8>::new()),
3828 Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
3829 ch,
3830 )
3831 .unwrap(),
3832 ) as ArrayRef
3833 }
3834 other => panic!("unsupported map value type: {other:?}"),
3835 };
3836 let entries = StructArray::new(
3837 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
3838 vec![Arc::new(keys) as ArrayRef, vals],
3839 None,
3840 );
3841 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
3842 Arc::new(MapArray::new(
3843 entry_field.clone(),
3844 offsets,
3845 entries,
3846 None,
3847 *ordered,
3848 ))
3849 }
3850 other => panic!("empty_child_for: unhandled type {other:?}"),
3851 }
3852 }
3853
3854 fn mk_dense_union(
3855 fields: &UnionFields,
3856 type_ids: Vec<i8>,
3857 offsets: Vec<i32>,
3858 provide: impl Fn(&Field) -> Option<ArrayRef>,
3859 ) -> ArrayRef {
3860 let children: Vec<ArrayRef> = fields
3861 .iter()
3862 .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
3863 .collect();
3864
3865 Arc::new(
3866 UnionArray::try_new(
3867 fields.clone(),
3868 ScalarBuffer::<i8>::from(type_ids),
3869 Some(ScalarBuffer::<i32>::from(offsets)),
3870 children,
3871 )
3872 .unwrap(),
3873 ) as ArrayRef
3874 }
3875
3876 let date_a: i32 = 19_000;
3878 let time_ms_a: i32 = 13 * 3_600_000 + 45 * 60_000 + 30_000 + 123;
3879 let time_us_b: i64 = 23 * 3_600_000_000 + 59 * 60_000_000 + 59 * 1_000_000 + 999_999;
3880 let ts_ms_2024_01_01: i64 = 1_704_067_200_000;
3881 let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1000;
3882 let fx8_a: [u8; 8] = *b"ABCDEFGH";
3884 let fx4_abcd: [u8; 4] = *b"ABCD";
3885 let fx4_misc: [u8; 4] = [0x00, 0x11, 0x22, 0x33];
3886 let fx10_ascii: [u8; 10] = *b"0123456789";
3887 let fx10_aa: [u8; 10] = [0xAA; 10];
3888 let dur_a = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
3890 let dur_b = IntervalMonthDayNanoType::make_value(12, 31, 999_000_000);
3891 let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
3893 let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
3894 let dec_b_scale2_pos: i128 = 123_456; let dec_fix16_neg: i128 = -101; let dec_fix20_s4: i128 = 1_234_567_891_234; let dec_fix20_s4_neg: i128 = -123; let path = "test/data/union_fields.avro";
3900 let actual = read_file(path, 1024, false);
3901 let schema = actual.schema();
3902 let get_union = |name: &str| -> (UnionFields, UnionMode) {
3904 let idx = schema.index_of(name).unwrap();
3905 match schema.field(idx).data_type() {
3906 DataType::Union(f, m) => (f.clone(), *m),
3907 other => panic!("{name} should be a Union, got {other:?}"),
3908 }
3909 };
3910 let mut expected_cols: Vec<ArrayRef> = Vec::with_capacity(schema.fields().len());
3911 expected_cols.push(Arc::new(Int32Array::from(vec![
3913 None,
3914 Some(42),
3915 None,
3916 Some(0),
3917 ])));
3918 expected_cols.push(Arc::new(StringArray::from(vec![
3920 Some("s1"),
3921 None,
3922 Some("s3"),
3923 Some(""),
3924 ])));
3925 {
3927 let (uf, mode) = get_union("union_prim");
3928 assert!(matches!(mode, UnionMode::Dense));
3929 let generated_names: Vec<&str> = uf.iter().map(|(_, f)| f.name().as_str()).collect();
3930 let expected_names = vec![
3931 "boolean", "int", "long", "float", "double", "bytes", "string",
3932 ];
3933 assert_eq!(
3934 generated_names, expected_names,
3935 "Field names for union_prim are incorrect"
3936 );
3937 let tids = vec![
3938 tid_by_name(&uf, "long"),
3939 tid_by_name(&uf, "int"),
3940 tid_by_name(&uf, "float"),
3941 tid_by_name(&uf, "double"),
3942 ];
3943 let offs = vec![0, 0, 0, 0];
3944 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
3945 "int" => Some(Arc::new(Int32Array::from(vec![-1])) as ArrayRef),
3946 "long" => Some(Arc::new(Int64Array::from(vec![1_234_567_890_123i64])) as ArrayRef),
3947 "float" => {
3948 Some(Arc::new(arrow_array::Float32Array::from(vec![1.25f32])) as ArrayRef)
3949 }
3950 "double" => {
3951 Some(Arc::new(arrow_array::Float64Array::from(vec![-2.5f64])) as ArrayRef)
3952 }
3953 _ => None,
3954 });
3955 expected_cols.push(arr);
3956 }
3957 {
3959 let (uf, _) = get_union("union_bytes_vs_string");
3960 let tids = vec![
3961 tid_by_name(&uf, "bytes"),
3962 tid_by_name(&uf, "string"),
3963 tid_by_name(&uf, "string"),
3964 tid_by_name(&uf, "bytes"),
3965 ];
3966 let offs = vec![0, 0, 1, 1];
3967 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
3968 "bytes" => Some(
3969 Arc::new(BinaryArray::from(vec![&[0x00, 0xFF, 0x7F][..], &[][..]])) as ArrayRef,
3970 ),
3971 "string" => Some(Arc::new(StringArray::from(vec!["hello", "world"])) as ArrayRef),
3972 _ => None,
3973 });
3974 expected_cols.push(arr);
3975 }
3976 {
3978 let (uf, _) = get_union("union_fixed_dur_decfix");
3979 let tid_fx8 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(8)));
3980 let tid_dur = tid_by_dt(&uf, |dt| {
3981 matches!(
3982 dt,
3983 DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano)
3984 )
3985 });
3986 let tid_dec = tid_by_dt(&uf, |dt| match dt {
3987 #[cfg(feature = "small_decimals")]
3988 DataType::Decimal64(10, 2) => true,
3989 DataType::Decimal128(10, 2) | DataType::Decimal256(10, 2) => true,
3990 _ => false,
3991 });
3992 let tids = vec![tid_fx8, tid_dur, tid_dec, tid_dur];
3993 let offs = vec![0, 0, 0, 1];
3994 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
3995 DataType::FixedSizeBinary(8) => {
3996 let it = [Some(fx8_a)].into_iter();
3997 Some(Arc::new(
3998 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 8).unwrap(),
3999 ) as ArrayRef)
4000 }
4001 DataType::Interval(IntervalUnit::MonthDayNano) => {
4002 Some(Arc::new(arrow_array::IntervalMonthDayNanoArray::from(vec![
4003 dur_a, dur_b,
4004 ])) as ArrayRef)
4005 }
4006 #[cfg(feature = "small_decimals")]
4007 DataType::Decimal64(10, 2) => {
4008 let a = arrow_array::Decimal64Array::from_iter_values([dec_fix16_neg as i64]);
4009 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4010 }
4011 DataType::Decimal128(10, 2) => {
4012 let a = arrow_array::Decimal128Array::from_iter_values([dec_fix16_neg]);
4013 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4014 }
4015 DataType::Decimal256(10, 2) => {
4016 let a = arrow_array::Decimal256Array::from_iter_values([i256::from_i128(
4017 dec_fix16_neg,
4018 )]);
4019 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4020 }
4021 _ => None,
4022 });
4023 let generated_names: Vec<&str> = uf.iter().map(|(_, f)| f.name().as_str()).collect();
4024 let expected_names = vec!["Fx8", "Dur12", "DecFix16"];
4025 assert_eq!(
4026 generated_names, expected_names,
4027 "Data type names were not generated correctly for union_fixed_dur_decfix"
4028 );
4029 expected_cols.push(arr);
4030 }
4031 {
4033 let (uf, _) = get_union("union_enum_records_array_map");
4034 let tid_enum = tid_by_dt(&uf, |dt| matches!(dt, DataType::Dictionary(_, _)));
4035 let tid_reca = tid_by_dt(&uf, |dt| {
4036 if let DataType::Struct(fs) = dt {
4037 fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b"
4038 } else {
4039 false
4040 }
4041 });
4042 let tid_recb = tid_by_dt(&uf, |dt| {
4043 if let DataType::Struct(fs) = dt {
4044 fs.len() == 2 && fs[0].name() == "x" && fs[1].name() == "y"
4045 } else {
4046 false
4047 }
4048 });
4049 let tid_arr = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
4050 let tids = vec![tid_enum, tid_reca, tid_recb, tid_arr];
4051 let offs = vec![0, 0, 0, 0];
4052 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4053 DataType::Dictionary(_, _) => {
4054 let keys = Int32Array::from(vec![0i32]); let values =
4056 Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
4057 Some(
4058 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4059 as ArrayRef,
4060 )
4061 }
4062 DataType::Struct(fs)
4063 if fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b" =>
4064 {
4065 let a = Int32Array::from(vec![7]);
4066 let b = StringArray::from(vec!["x"]);
4067 Some(Arc::new(StructArray::new(
4068 fs.clone(),
4069 vec![Arc::new(a), Arc::new(b)],
4070 None,
4071 )) as ArrayRef)
4072 }
4073 DataType::Struct(fs)
4074 if fs.len() == 2 && fs[0].name() == "x" && fs[1].name() == "y" =>
4075 {
4076 let x = Int64Array::from(vec![123_456_789i64]);
4077 let y = BinaryArray::from(vec![&[0xFF, 0x00][..]]);
4078 Some(Arc::new(StructArray::new(
4079 fs.clone(),
4080 vec![Arc::new(x), Arc::new(y)],
4081 None,
4082 )) as ArrayRef)
4083 }
4084 DataType::List(field) => {
4085 let values = Int64Array::from(vec![1i64, 2, 3]);
4086 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
4087 Some(Arc::new(
4088 ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
4089 ) as ArrayRef)
4090 }
4091 DataType::Map(_, _) => None,
4092 other => panic!("unexpected child {other:?}"),
4093 });
4094 expected_cols.push(arr);
4095 }
4096 {
4098 let (uf, _) = get_union("union_date_or_fixed4");
4099 let tid_date = tid_by_dt(&uf, |dt| matches!(dt, DataType::Date32));
4100 let tid_fx4 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(4)));
4101 let tids = vec![tid_date, tid_fx4, tid_date, tid_fx4];
4102 let offs = vec![0, 0, 1, 1];
4103 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4104 DataType::Date32 => {
4105 Some(Arc::new(arrow_array::Date32Array::from(vec![date_a, 0])) as ArrayRef)
4106 }
4107 DataType::FixedSizeBinary(4) => {
4108 let it = [Some(fx4_abcd), Some(fx4_misc)].into_iter();
4109 Some(Arc::new(
4110 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
4111 ) as ArrayRef)
4112 }
4113 _ => None,
4114 });
4115 expected_cols.push(arr);
4116 }
4117 {
4119 let (uf, _) = get_union("union_time_millis_or_enum");
4120 let tid_ms = tid_by_dt(&uf, |dt| {
4121 matches!(dt, DataType::Time32(arrow_schema::TimeUnit::Millisecond))
4122 });
4123 let tid_en = tid_by_dt(&uf, |dt| matches!(dt, DataType::Dictionary(_, _)));
4124 let tids = vec![tid_ms, tid_en, tid_en, tid_ms];
4125 let offs = vec![0, 0, 1, 1];
4126 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4127 DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
4128 Some(Arc::new(Time32MillisecondArray::from(vec![time_ms_a, 0])) as ArrayRef)
4129 }
4130 DataType::Dictionary(_, _) => {
4131 let keys = Int32Array::from(vec![0i32, 1]); let values = Arc::new(StringArray::from(vec!["ON", "OFF"])) as ArrayRef;
4133 Some(
4134 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4135 as ArrayRef,
4136 )
4137 }
4138 _ => None,
4139 });
4140 expected_cols.push(arr);
4141 }
4142 {
4144 let (uf, _) = get_union("union_time_micros_or_string");
4145 let tid_us = tid_by_dt(&uf, |dt| {
4146 matches!(dt, DataType::Time64(arrow_schema::TimeUnit::Microsecond))
4147 });
4148 let tid_s = tid_by_name(&uf, "string");
4149 let tids = vec![tid_s, tid_us, tid_s, tid_s];
4150 let offs = vec![0, 0, 1, 2];
4151 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4152 DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
4153 Some(Arc::new(Time64MicrosecondArray::from(vec![time_us_b])) as ArrayRef)
4154 }
4155 DataType::Utf8 => {
4156 Some(Arc::new(StringArray::from(vec!["evening", "night", ""])) as ArrayRef)
4157 }
4158 _ => None,
4159 });
4160 expected_cols.push(arr);
4161 }
4162 {
4164 let (uf, _) = get_union("union_ts_millis_utc_or_array");
4165 let tid_ts = tid_by_dt(&uf, |dt| {
4166 matches!(
4167 dt,
4168 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _)
4169 )
4170 });
4171 let tid_arr = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
4172 let tids = vec![tid_ts, tid_arr, tid_arr, tid_ts];
4173 let offs = vec![0, 0, 1, 1];
4174 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4175 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
4176 let a = TimestampMillisecondArray::from(vec![
4177 ts_ms_2024_01_01,
4178 ts_ms_2024_01_01 + 86_400_000,
4179 ]);
4180 Some(Arc::new(if let Some(tz) = tz {
4181 a.with_timezone(tz.clone())
4182 } else {
4183 a
4184 }) as ArrayRef)
4185 }
4186 DataType::List(field) => {
4187 let values = Int32Array::from(vec![0, 1, 2, -1, 0, 1]);
4188 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 6]));
4189 Some(Arc::new(
4190 ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
4191 ) as ArrayRef)
4192 }
4193 _ => None,
4194 });
4195 expected_cols.push(arr);
4196 }
4197 {
4199 let (uf, _) = get_union("union_ts_micros_local_or_bytes");
4200 let tid_lts = tid_by_dt(&uf, |dt| {
4201 matches!(
4202 dt,
4203 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None)
4204 )
4205 });
4206 let tid_b = tid_by_name(&uf, "bytes");
4207 let tids = vec![tid_b, tid_lts, tid_b, tid_b];
4208 let offs = vec![0, 0, 1, 2];
4209 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4210 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) => Some(Arc::new(
4211 TimestampMicrosecondArray::from(vec![ts_us_2024_01_01]),
4212 )
4213 as ArrayRef),
4214 DataType::Binary => Some(Arc::new(BinaryArray::from(vec![
4215 &b"\x11\x22\x33"[..],
4216 &b"\x00"[..],
4217 &b"\x10\x20\x30\x40"[..],
4218 ])) as ArrayRef),
4219 _ => None,
4220 });
4221 expected_cols.push(arr);
4222 }
4223 {
4225 let (uf, _) = get_union("union_uuid_or_fixed10");
4226 let tid_fx16 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(16)));
4227 let tid_fx10 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(10)));
4228 let tids = vec![tid_fx16, tid_fx10, tid_fx16, tid_fx10];
4229 let offs = vec![0, 0, 1, 1];
4230 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4231 DataType::FixedSizeBinary(16) => {
4232 let it = [Some(uuid1), Some(uuid2)].into_iter();
4233 Some(Arc::new(
4234 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
4235 ) as ArrayRef)
4236 }
4237 DataType::FixedSizeBinary(10) => {
4238 let it = [Some(fx10_ascii), Some(fx10_aa)].into_iter();
4239 Some(Arc::new(
4240 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
4241 ) as ArrayRef)
4242 }
4243 _ => None,
4244 });
4245 expected_cols.push(arr);
4246 }
4247 {
4249 let (uf, _) = get_union("union_dec_bytes_or_dec_fixed");
4250 let tid_b10s2 = tid_by_dt(&uf, |dt| match dt {
4251 #[cfg(feature = "small_decimals")]
4252 DataType::Decimal64(10, 2) => true,
4253 DataType::Decimal128(10, 2) | DataType::Decimal256(10, 2) => true,
4254 _ => false,
4255 });
4256 let tid_f20s4 = tid_by_dt(&uf, |dt| {
4257 matches!(
4258 dt,
4259 DataType::Decimal128(20, 4) | DataType::Decimal256(20, 4)
4260 )
4261 });
4262 let tids = vec![tid_b10s2, tid_f20s4, tid_b10s2, tid_f20s4];
4263 let offs = vec![0, 0, 1, 1];
4264 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4265 #[cfg(feature = "small_decimals")]
4266 DataType::Decimal64(10, 2) => {
4267 let a = Decimal64Array::from_iter_values([dec_b_scale2_pos as i64, 0i64]);
4268 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4269 }
4270 DataType::Decimal128(10, 2) => {
4271 let a = Decimal128Array::from_iter_values([dec_b_scale2_pos, 0]);
4272 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4273 }
4274 DataType::Decimal256(10, 2) => {
4275 let a = Decimal256Array::from_iter_values([
4276 i256::from_i128(dec_b_scale2_pos),
4277 i256::from(0),
4278 ]);
4279 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4280 }
4281 DataType::Decimal128(20, 4) => {
4282 let a = Decimal128Array::from_iter_values([dec_fix20_s4_neg, dec_fix20_s4]);
4283 Some(Arc::new(a.with_precision_and_scale(20, 4).unwrap()) as ArrayRef)
4284 }
4285 DataType::Decimal256(20, 4) => {
4286 let a = Decimal256Array::from_iter_values([
4287 i256::from_i128(dec_fix20_s4_neg),
4288 i256::from_i128(dec_fix20_s4),
4289 ]);
4290 Some(Arc::new(a.with_precision_and_scale(20, 4).unwrap()) as ArrayRef)
4291 }
4292 _ => None,
4293 });
4294 expected_cols.push(arr);
4295 }
4296 {
4298 let (uf, _) = get_union("union_null_bytes_string");
4299 let tid_n = tid_by_name(&uf, "null");
4300 let tid_b = tid_by_name(&uf, "bytes");
4301 let tid_s = tid_by_name(&uf, "string");
4302 let tids = vec![tid_n, tid_b, tid_s, tid_s];
4303 let offs = vec![0, 0, 0, 1];
4304 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4305 "null" => Some(Arc::new(arrow_array::NullArray::new(1)) as ArrayRef),
4306 "bytes" => Some(Arc::new(BinaryArray::from(vec![&b"\x01\x02"[..]])) as ArrayRef),
4307 "string" => Some(Arc::new(StringArray::from(vec!["text", "u"])) as ArrayRef),
4308 _ => None,
4309 });
4310 expected_cols.push(arr);
4311 }
4312 {
4314 let idx = schema.index_of("array_of_union").unwrap();
4315 let dt = schema.field(idx).data_type().clone();
4316 let (item_field, _) = match &dt {
4317 DataType::List(f) => (f.clone(), ()),
4318 other => panic!("array_of_union must be List, got {other:?}"),
4319 };
4320 let (uf, _) = match item_field.data_type() {
4321 DataType::Union(f, m) => (f.clone(), m),
4322 other => panic!("array_of_union items must be Union, got {other:?}"),
4323 };
4324 let tid_l = tid_by_name(&uf, "long");
4325 let tid_s = tid_by_name(&uf, "string");
4326 let type_ids = vec![tid_l, tid_s, tid_l, tid_s, tid_l, tid_l, tid_s, tid_l];
4327 let offsets = vec![0, 0, 1, 1, 2, 3, 2, 4];
4328 let values_union =
4329 mk_dense_union(&uf, type_ids, offsets, |f| match f.name().as_str() {
4330 "long" => {
4331 Some(Arc::new(Int64Array::from(vec![1i64, -5, 42, -1, 0])) as ArrayRef)
4332 }
4333 "string" => Some(Arc::new(StringArray::from(vec!["a", "", "z"])) as ArrayRef),
4334 _ => None,
4335 });
4336 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 6, 8]));
4337 expected_cols.push(Arc::new(
4338 ListArray::try_new(item_field.clone(), list_offsets, values_union, None).unwrap(),
4339 ));
4340 }
4341 {
4343 let idx = schema.index_of("map_of_union").unwrap();
4344 let dt = schema.field(idx).data_type().clone();
4345 let (entry_field, ordered) = match &dt {
4346 DataType::Map(f, ordered) => (f.clone(), *ordered),
4347 other => panic!("map_of_union must be Map, got {other:?}"),
4348 };
4349 let DataType::Struct(entry_fields) = entry_field.data_type() else {
4350 panic!("map entries must be struct")
4351 };
4352 let key_field = entry_fields[0].clone();
4353 let val_field = entry_fields[1].clone();
4354 let keys = StringArray::from(vec!["a", "b", "x", "pi"]);
4355 let rounded_pi = (std::f64::consts::PI * 100_000.0).round() / 100_000.0;
4356 let values: ArrayRef = match val_field.data_type() {
4357 DataType::Union(uf, _) => {
4358 let tid_n = tid_by_name(uf, "null");
4359 let tid_d = tid_by_name(uf, "double");
4360 let tids = vec![tid_n, tid_d, tid_d, tid_d];
4361 let offs = vec![0, 0, 1, 2];
4362 mk_dense_union(uf, tids, offs, |f| match f.name().as_str() {
4363 "null" => Some(Arc::new(NullArray::new(1)) as ArrayRef),
4364 "double" => Some(Arc::new(arrow_array::Float64Array::from(vec![
4365 2.5f64, -0.5f64, rounded_pi,
4366 ])) as ArrayRef),
4367 _ => None,
4368 })
4369 }
4370 DataType::Float64 => Arc::new(arrow_array::Float64Array::from(vec![
4371 None,
4372 Some(2.5),
4373 Some(-0.5),
4374 Some(rounded_pi),
4375 ])),
4376 other => panic!("unexpected map value type {other:?}"),
4377 };
4378 let entries = StructArray::new(
4379 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4380 vec![Arc::new(keys) as ArrayRef, values],
4381 None,
4382 );
4383 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 3, 4]));
4384 expected_cols.push(Arc::new(MapArray::new(
4385 entry_field,
4386 offsets,
4387 entries,
4388 None,
4389 ordered,
4390 )));
4391 }
4392 {
4394 let idx = schema.index_of("record_with_union_field").unwrap();
4395 let DataType::Struct(rec_fields) = schema.field(idx).data_type() else {
4396 panic!("record_with_union_field should be Struct")
4397 };
4398 let id = Int32Array::from(vec![1, 2, 3, 4]);
4399 let u_field = rec_fields.iter().find(|f| f.name() == "u").unwrap();
4400 let DataType::Union(uf, _) = u_field.data_type() else {
4401 panic!("u must be Union")
4402 };
4403 let tid_i = tid_by_name(uf, "int");
4404 let tid_s = tid_by_name(uf, "string");
4405 let tids = vec![tid_s, tid_i, tid_i, tid_s];
4406 let offs = vec![0, 0, 1, 1];
4407 let u = mk_dense_union(uf, tids, offs, |f| match f.name().as_str() {
4408 "int" => Some(Arc::new(Int32Array::from(vec![99, 0])) as ArrayRef),
4409 "string" => Some(Arc::new(StringArray::from(vec!["one", "four"])) as ArrayRef),
4410 _ => None,
4411 });
4412 let rec = StructArray::new(rec_fields.clone(), vec![Arc::new(id) as ArrayRef, u], None);
4413 expected_cols.push(Arc::new(rec));
4414 }
4415 {
4417 let (uf, _) = get_union("union_ts_micros_utc_or_map");
4418 let tid_ts = tid_by_dt(&uf, |dt| {
4419 matches!(
4420 dt,
4421 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some(_))
4422 )
4423 });
4424 let tid_map = tid_by_dt(&uf, |dt| matches!(dt, DataType::Map(_, _)));
4425 let tids = vec![tid_ts, tid_map, tid_ts, tid_map];
4426 let offs = vec![0, 0, 1, 1];
4427 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4428 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
4429 let a = TimestampMicrosecondArray::from(vec![ts_us_2024_01_01, 0i64]);
4430 Some(Arc::new(if let Some(tz) = tz {
4431 a.with_timezone(tz.clone())
4432 } else {
4433 a
4434 }) as ArrayRef)
4435 }
4436 DataType::Map(entry_field, ordered) => {
4437 let DataType::Struct(fs) = entry_field.data_type() else {
4438 panic!("map entries must be struct")
4439 };
4440 let key_field = fs[0].clone();
4441 let val_field = fs[1].clone();
4442 assert_eq!(key_field.data_type(), &DataType::Utf8);
4443 assert_eq!(val_field.data_type(), &DataType::Int64);
4444 let keys = StringArray::from(vec!["k1", "k2", "n"]);
4445 let vals = Int64Array::from(vec![1i64, 2, 0]);
4446 let entries = StructArray::new(
4447 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4448 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
4449 None,
4450 );
4451 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
4452 Some(Arc::new(MapArray::new(
4453 entry_field.clone(),
4454 offsets,
4455 entries,
4456 None,
4457 *ordered,
4458 )) as ArrayRef)
4459 }
4460 _ => None,
4461 });
4462 expected_cols.push(arr);
4463 }
4464 {
4466 let (uf, _) = get_union("union_ts_millis_local_or_string");
4467 let tid_ts = tid_by_dt(&uf, |dt| {
4468 matches!(
4469 dt,
4470 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None)
4471 )
4472 });
4473 let tid_s = tid_by_name(&uf, "string");
4474 let tids = vec![tid_s, tid_ts, tid_s, tid_s];
4475 let offs = vec![0, 0, 1, 2];
4476 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4477 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None) => Some(Arc::new(
4478 TimestampMillisecondArray::from(vec![ts_ms_2024_01_01]),
4479 )
4480 as ArrayRef),
4481 DataType::Utf8 => {
4482 Some(
4483 Arc::new(StringArray::from(vec!["local midnight", "done", ""])) as ArrayRef,
4484 )
4485 }
4486 _ => None,
4487 });
4488 expected_cols.push(arr);
4489 }
4490 {
4492 let (uf, _) = get_union("union_bool_or_string");
4493 let tid_b = tid_by_name(&uf, "boolean");
4494 let tid_s = tid_by_name(&uf, "string");
4495 let tids = vec![tid_b, tid_s, tid_b, tid_s];
4496 let offs = vec![0, 0, 1, 1];
4497 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4498 "boolean" => Some(Arc::new(BooleanArray::from(vec![true, false])) as ArrayRef),
4499 "string" => Some(Arc::new(StringArray::from(vec!["no", "yes"])) as ArrayRef),
4500 _ => None,
4501 });
4502 expected_cols.push(arr);
4503 }
4504 let expected = RecordBatch::try_new(schema.clone(), expected_cols).unwrap();
4505 assert_eq!(
4506 actual, expected,
4507 "full end-to-end equality for union_fields.avro"
4508 );
4509 }
4510
/// Reads `test/data/zero_byte.avro` and checks that its single nullable binary
/// column `data` decodes to a null, an empty value and a non-empty value.
#[test]
fn test_read_zero_byte_avro_file() {
    let batch = read_file("test/data/zero_byte.avro", 3, false);

    // Schema: exactly one nullable Binary field named "data".
    let schema = batch.schema();
    assert_eq!(schema.fields().len(), 1);
    let data_field = schema.field(0);
    assert_eq!(data_field.name(), "data");
    assert_eq!(data_field.data_type(), &DataType::Binary);
    assert!(data_field.is_nullable());

    // Shape: three rows in one column.
    assert_eq!(batch.num_rows(), 3);
    assert_eq!(batch.num_columns(), 1);

    let column = batch.column(0);
    let values = column.as_any().downcast_ref::<BinaryArray>().unwrap();

    // Row 0 is null; rows 1 and 2 are valid, row 1 being a zero-length value.
    assert!(values.is_null(0));
    for (row, want) in [(1usize, &b""[..]), (2, &b"some bytes"[..])] {
        assert!(values.is_valid(row));
        assert_eq!(values.value(row), want);
    }
}
4533
/// Decodes every file returned by `files()` with batch sizes 8 and 3 and checks
/// each result against the same eight-row expected batch.
#[test]
fn test_alltypes() {
    let id: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1]));
    let bool_col: ArrayRef =
        Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0))));
    // The three narrow integer columns all alternate 0, 1, 0, 1, ...
    let tinyint_col: ArrayRef = Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2)));
    let smallint_col: ArrayRef = Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2)));
    let int_col: ArrayRef = Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2)));
    let bigint_col: ArrayRef =
        Arc::new(Int64Array::from_iter_values((0..8).map(|x| (x % 2) * 10)));
    let float_col: ArrayRef = Arc::new(Float32Array::from_iter_values(
        (0..8).map(|x| (x % 2) as f32 * 1.1),
    ));
    let double_col: ArrayRef = Arc::new(Float64Array::from_iter_values(
        (0..8).map(|x| (x % 2) as f64 * 10.1),
    ));
    // Stored as raw bytes, not strings.
    let date_string_col: ArrayRef = Arc::new(BinaryArray::from_iter_values([
        b"03/01/09",
        b"03/01/09",
        b"04/01/09",
        b"04/01/09",
        b"02/01/09",
        b"02/01/09",
        b"01/01/09",
        b"01/01/09",
    ]));
    // Alternating single-byte values "0" and "1".
    let string_col: ArrayRef =
        Arc::new(BinaryArray::from_iter_values((0..8).map(|x| [b'0' + x % 2])));
    let timestamp_col: ArrayRef = Arc::new(
        TimestampMicrosecondArray::from_iter_values([
            1235865600000000,
            1235865660000000,
            1238544000000000,
            1238544060000000,
            1233446400000000,
            1233446460000000,
            1230768000000000,
            1230768060000000,
        ])
        .with_timezone("+00:00"),
    );

    let expected = RecordBatch::try_from_iter_with_nullable([
        ("id", id, true),
        ("bool_col", bool_col, true),
        ("tinyint_col", tinyint_col, true),
        ("smallint_col", smallint_col, true),
        ("int_col", int_col, true),
        ("bigint_col", bigint_col, true),
        ("float_col", float_col, true),
        ("double_col", double_col, true),
        ("date_string_col", date_string_col, true),
        ("string_col", string_col, true),
        ("timestamp_col", timestamp_col, true),
    ])
    .unwrap();

    for file in files() {
        let path = arrow_test_data(file);

        // Batch size 8 and batch size 3 must both reproduce the expected batch.
        for batch_size in [8, 3] {
            assert_eq!(read_file(&path, batch_size, false), expected);
        }
    }
}
4627
/// Decodes `avro/alltypes_dictionary.avro` with batch sizes 8 and 3 and checks
/// both results against the same two-row expected batch.
#[test]
#[cfg(feature = "snappy")]
fn test_alltypes_dictionary() {
    let file = "avro/alltypes_dictionary.avro";

    let id: ArrayRef = Arc::new(Int32Array::from(vec![0, 1]));
    let bool_col: ArrayRef = Arc::new(BooleanArray::from(vec![Some(true), Some(false)]));
    let tinyint_col: ArrayRef = Arc::new(Int32Array::from(vec![0, 1]));
    let smallint_col: ArrayRef = Arc::new(Int32Array::from(vec![0, 1]));
    let int_col: ArrayRef = Arc::new(Int32Array::from(vec![0, 1]));
    let bigint_col: ArrayRef = Arc::new(Int64Array::from(vec![0, 10]));
    let float_col: ArrayRef = Arc::new(Float32Array::from(vec![0.0, 1.1]));
    let double_col: ArrayRef = Arc::new(Float64Array::from(vec![0.0, 10.1]));
    // Both text-like columns are expected as raw bytes.
    let date_string_col: ArrayRef =
        Arc::new(BinaryArray::from_iter_values([b"01/01/09", b"01/01/09"]));
    let string_col: ArrayRef = Arc::new(BinaryArray::from_iter_values([b"0", b"1"]));
    let timestamp_col: ArrayRef = Arc::new(
        TimestampMicrosecondArray::from_iter_values([1230768000000000, 1230768060000000])
            .with_timezone("+00:00"),
    );

    let expected = RecordBatch::try_from_iter_with_nullable([
        ("id", id, true),
        ("bool_col", bool_col, true),
        ("tinyint_col", tinyint_col, true),
        ("smallint_col", smallint_col, true),
        ("int_col", int_col, true),
        ("bigint_col", bigint_col, true),
        ("float_col", float_col, true),
        ("double_col", double_col, true),
        ("date_string_col", date_string_col, true),
        ("string_col", string_col, true),
        ("timestamp_col", timestamp_col, true),
    ])
    .unwrap();

    let file_path = arrow_test_data(file);

    let decoded_with_8 = read_file(&file_path, 8, false);
    assert_eq!(
        decoded_with_8, expected,
        "Decoded RecordBatch does not match for file {file}"
    );

    let decoded_with_3 = read_file(&file_path, 3, false);
    assert_eq!(
        decoded_with_3, expected,
        "Decoded RecordBatch (batch size 3) does not match for file {file}"
    );
}
4701
/// Decodes `avro/alltypes_nulls_plain.avro` with batch sizes 8 and 3 and checks
/// that every column holds exactly one null value.
#[test]
fn test_alltypes_nulls_plain() {
    let file = "avro/alltypes_nulls_plain.avro";

    // One row, null in every column.
    let string_col: ArrayRef = Arc::new(StringArray::from(vec![None::<&str>]));
    let int_col: ArrayRef = Arc::new(Int32Array::from(vec![None]));
    let bool_col: ArrayRef = Arc::new(BooleanArray::from(vec![None]));
    let bigint_col: ArrayRef = Arc::new(Int64Array::from(vec![None]));
    let float_col: ArrayRef = Arc::new(Float32Array::from(vec![None]));
    let double_col: ArrayRef = Arc::new(Float64Array::from(vec![None]));
    let bytes_col: ArrayRef = Arc::new(BinaryArray::from(vec![None::<&[u8]>]));

    let expected = RecordBatch::try_from_iter_with_nullable([
        ("string_col", string_col, true),
        ("int_col", int_col, true),
        ("bool_col", bool_col, true),
        ("bigint_col", bigint_col, true),
        ("float_col", float_col, true),
        ("double_col", double_col, true),
        ("bytes_col", bytes_col, true),
    ])
    .unwrap();

    let file_path = arrow_test_data(file);

    let decoded_with_8 = read_file(&file_path, 8, false);
    assert_eq!(
        decoded_with_8, expected,
        "Decoded RecordBatch does not match for file {file}"
    );

    let decoded_with_3 = read_file(&file_path, 3, false);
    assert_eq!(
        decoded_with_3, expected,
        "Decoded RecordBatch (batch size 3) does not match for file {file}"
    );
}
4751
4752 #[test]
4753 #[cfg(feature = "snappy")]
4755 fn test_binary() {
4756 let file = arrow_test_data("avro/binary.avro");
4757 let batch = read_file(&file, 8, false);
4758 let expected = RecordBatch::try_from_iter_with_nullable([(
4759 "foo",
4760 Arc::new(BinaryArray::from_iter_values(vec![
4761 b"\x00" as &[u8],
4762 b"\x01" as &[u8],
4763 b"\x02" as &[u8],
4764 b"\x03" as &[u8],
4765 b"\x04" as &[u8],
4766 b"\x05" as &[u8],
4767 b"\x06" as &[u8],
4768 b"\x07" as &[u8],
4769 b"\x08" as &[u8],
4770 b"\t" as &[u8],
4771 b"\n" as &[u8],
4772 b"\x0b" as &[u8],
4773 ])) as Arc<dyn Array>,
4774 true,
4775 )])
4776 .unwrap();
4777 assert_eq!(batch, expected);
4778 }
4779
4780 #[test]
4781 #[cfg(feature = "snappy")]
4783 fn test_decimal() {
4784 #[cfg(feature = "small_decimals")]
4788 let files: [(&str, DataType, HashMap<String, String>); 8] = [
4789 (
4790 "avro/fixed_length_decimal.avro",
4791 DataType::Decimal128(25, 2),
4792 HashMap::from([
4793 (
4794 "avro.namespace".to_string(),
4795 "topLevelRecord.value".to_string(),
4796 ),
4797 ("avro.name".to_string(), "fixed".to_string()),
4798 ]),
4799 ),
4800 (
4801 "avro/fixed_length_decimal_legacy.avro",
4802 DataType::Decimal64(13, 2),
4803 HashMap::from([
4804 (
4805 "avro.namespace".to_string(),
4806 "topLevelRecord.value".to_string(),
4807 ),
4808 ("avro.name".to_string(), "fixed".to_string()),
4809 ]),
4810 ),
4811 (
4812 "avro/int32_decimal.avro",
4813 DataType::Decimal32(4, 2),
4814 HashMap::from([
4815 (
4816 "avro.namespace".to_string(),
4817 "topLevelRecord.value".to_string(),
4818 ),
4819 ("avro.name".to_string(), "fixed".to_string()),
4820 ]),
4821 ),
4822 (
4823 "avro/int64_decimal.avro",
4824 DataType::Decimal64(10, 2),
4825 HashMap::from([
4826 (
4827 "avro.namespace".to_string(),
4828 "topLevelRecord.value".to_string(),
4829 ),
4830 ("avro.name".to_string(), "fixed".to_string()),
4831 ]),
4832 ),
4833 (
4834 "test/data/int256_decimal.avro",
4835 DataType::Decimal256(76, 10),
4836 HashMap::new(),
4837 ),
4838 (
4839 "test/data/fixed256_decimal.avro",
4840 DataType::Decimal256(76, 10),
4841 HashMap::from([("avro.name".to_string(), "Decimal256Fixed".to_string())]),
4842 ),
4843 (
4844 "test/data/fixed_length_decimal_legacy_32.avro",
4845 DataType::Decimal32(9, 2),
4846 HashMap::from([("avro.name".to_string(), "Decimal32FixedLegacy".to_string())]),
4847 ),
4848 (
4849 "test/data/int128_decimal.avro",
4850 DataType::Decimal128(38, 2),
4851 HashMap::new(),
4852 ),
4853 ];
4854 #[cfg(not(feature = "small_decimals"))]
4855 let files: [(&str, DataType, HashMap<String, String>); 8] = [
4856 (
4857 "avro/fixed_length_decimal.avro",
4858 DataType::Decimal128(25, 2),
4859 HashMap::from([
4860 (
4861 "avro.namespace".to_string(),
4862 "topLevelRecord.value".to_string(),
4863 ),
4864 ("avro.name".to_string(), "fixed".to_string()),
4865 ]),
4866 ),
4867 (
4868 "avro/fixed_length_decimal_legacy.avro",
4869 DataType::Decimal128(13, 2),
4870 HashMap::from([
4871 (
4872 "avro.namespace".to_string(),
4873 "topLevelRecord.value".to_string(),
4874 ),
4875 ("avro.name".to_string(), "fixed".to_string()),
4876 ]),
4877 ),
4878 (
4879 "avro/int32_decimal.avro",
4880 DataType::Decimal128(4, 2),
4881 HashMap::from([
4882 (
4883 "avro.namespace".to_string(),
4884 "topLevelRecord.value".to_string(),
4885 ),
4886 ("avro.name".to_string(), "fixed".to_string()),
4887 ]),
4888 ),
4889 (
4890 "avro/int64_decimal.avro",
4891 DataType::Decimal128(10, 2),
4892 HashMap::from([
4893 (
4894 "avro.namespace".to_string(),
4895 "topLevelRecord.value".to_string(),
4896 ),
4897 ("avro.name".to_string(), "fixed".to_string()),
4898 ]),
4899 ),
4900 (
4901 "test/data/int256_decimal.avro",
4902 DataType::Decimal256(76, 10),
4903 HashMap::new(),
4904 ),
4905 (
4906 "test/data/fixed256_decimal.avro",
4907 DataType::Decimal256(76, 10),
4908 HashMap::from([("avro.name".to_string(), "Decimal256Fixed".to_string())]),
4909 ),
4910 (
4911 "test/data/fixed_length_decimal_legacy_32.avro",
4912 DataType::Decimal128(9, 2),
4913 HashMap::from([("avro.name".to_string(), "Decimal32FixedLegacy".to_string())]),
4914 ),
4915 (
4916 "test/data/int128_decimal.avro",
4917 DataType::Decimal128(38, 2),
4918 HashMap::new(),
4919 ),
4920 ];
4921 for (file, expected_dt, mut metadata) in files {
4922 let (precision, scale) = match expected_dt {
4923 DataType::Decimal32(p, s)
4924 | DataType::Decimal64(p, s)
4925 | DataType::Decimal128(p, s)
4926 | DataType::Decimal256(p, s) => (p, s),
4927 _ => unreachable!("Unexpected decimal type in test inputs"),
4928 };
4929 assert!(scale >= 0, "test data uses non-negative scales only");
4930 let scale_u32 = scale as u32;
4931 let file_path: String = if file.starts_with("avro/") {
4932 arrow_test_data(file)
4933 } else {
4934 std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
4935 .join(file)
4936 .to_string_lossy()
4937 .into_owned()
4938 };
4939 let pow10: i128 = 10i128.pow(scale_u32);
4940 let values_i128: Vec<i128> = (1..=24).map(|n| (n as i128) * pow10).collect();
4941 let build_expected = |dt: &DataType, values: &[i128]| -> ArrayRef {
4942 match *dt {
4943 #[cfg(feature = "small_decimals")]
4944 DataType::Decimal32(p, s) => {
4945 let it = values.iter().map(|&v| v as i32);
4946 Arc::new(
4947 Decimal32Array::from_iter_values(it)
4948 .with_precision_and_scale(p, s)
4949 .unwrap(),
4950 )
4951 }
4952 #[cfg(feature = "small_decimals")]
4953 DataType::Decimal64(p, s) => {
4954 let it = values.iter().map(|&v| v as i64);
4955 Arc::new(
4956 Decimal64Array::from_iter_values(it)
4957 .with_precision_and_scale(p, s)
4958 .unwrap(),
4959 )
4960 }
4961 DataType::Decimal128(p, s) => {
4962 let it = values.iter().copied();
4963 Arc::new(
4964 Decimal128Array::from_iter_values(it)
4965 .with_precision_and_scale(p, s)
4966 .unwrap(),
4967 )
4968 }
4969 DataType::Decimal256(p, s) => {
4970 let it = values.iter().map(|&v| i256::from_i128(v));
4971 Arc::new(
4972 Decimal256Array::from_iter_values(it)
4973 .with_precision_and_scale(p, s)
4974 .unwrap(),
4975 )
4976 }
4977 _ => unreachable!("Unexpected decimal type in test"),
4978 }
4979 };
4980 let actual_batch = read_file(&file_path, 8, false);
4981 let actual_nullable = actual_batch.schema().field(0).is_nullable();
4982 let expected_array = build_expected(&expected_dt, &values_i128);
4983 metadata.insert("precision".to_string(), precision.to_string());
4984 metadata.insert("scale".to_string(), scale.to_string());
4985 let field =
4986 Field::new("value", expected_dt.clone(), actual_nullable).with_metadata(metadata);
4987 let expected_schema = Arc::new(Schema::new(vec![field]));
4988 let expected_batch =
4989 RecordBatch::try_new(expected_schema.clone(), vec![expected_array]).unwrap();
4990 assert_eq!(
4991 actual_batch, expected_batch,
4992 "Decoded RecordBatch does not match for {file}"
4993 );
4994 let actual_batch_small = read_file(&file_path, 3, false);
4995 assert_eq!(
4996 actual_batch_small, expected_batch,
4997 "Decoded RecordBatch does not match for {file} with batch size 3"
4998 );
4999 }
5000 }
5001
5002 #[test]
5003 fn test_read_duration_logical_types_feature_toggle() -> Result<(), ArrowError> {
5004 let file_path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
5005 .join("test/data/duration_logical_types.avro")
5006 .to_string_lossy()
5007 .into_owned();
5008
5009 let actual_batch = read_file(&file_path, 4, false);
5010
5011 let expected_batch = {
5012 #[cfg(feature = "avro_custom_types")]
5013 {
5014 let schema = Arc::new(Schema::new(vec![
5015 Field::new(
5016 "duration_time_nanos",
5017 DataType::Duration(TimeUnit::Nanosecond),
5018 false,
5019 ),
5020 Field::new(
5021 "duration_time_micros",
5022 DataType::Duration(TimeUnit::Microsecond),
5023 false,
5024 ),
5025 Field::new(
5026 "duration_time_millis",
5027 DataType::Duration(TimeUnit::Millisecond),
5028 false,
5029 ),
5030 Field::new(
5031 "duration_time_seconds",
5032 DataType::Duration(TimeUnit::Second),
5033 false,
5034 ),
5035 ]));
5036
5037 let nanos = Arc::new(PrimitiveArray::<DurationNanosecondType>::from(vec![
5038 10, 20, 30, 40,
5039 ])) as ArrayRef;
5040 let micros = Arc::new(PrimitiveArray::<DurationMicrosecondType>::from(vec![
5041 100, 200, 300, 400,
5042 ])) as ArrayRef;
5043 let millis = Arc::new(PrimitiveArray::<DurationMillisecondType>::from(vec![
5044 1000, 2000, 3000, 4000,
5045 ])) as ArrayRef;
5046 let seconds = Arc::new(PrimitiveArray::<DurationSecondType>::from(vec![1, 2, 3, 4]))
5047 as ArrayRef;
5048
5049 RecordBatch::try_new(schema, vec![nanos, micros, millis, seconds])?
5050 }
5051 #[cfg(not(feature = "avro_custom_types"))]
5052 {
5053 let schema = Arc::new(Schema::new(vec![
5054 Field::new("duration_time_nanos", DataType::Int64, false).with_metadata(
5055 [(
5056 "logicalType".to_string(),
5057 "arrow.duration-nanos".to_string(),
5058 )]
5059 .into(),
5060 ),
5061 Field::new("duration_time_micros", DataType::Int64, false).with_metadata(
5062 [(
5063 "logicalType".to_string(),
5064 "arrow.duration-micros".to_string(),
5065 )]
5066 .into(),
5067 ),
5068 Field::new("duration_time_millis", DataType::Int64, false).with_metadata(
5069 [(
5070 "logicalType".to_string(),
5071 "arrow.duration-millis".to_string(),
5072 )]
5073 .into(),
5074 ),
5075 Field::new("duration_time_seconds", DataType::Int64, false).with_metadata(
5076 [(
5077 "logicalType".to_string(),
5078 "arrow.duration-seconds".to_string(),
5079 )]
5080 .into(),
5081 ),
5082 ]));
5083
5084 let nanos =
5085 Arc::new(PrimitiveArray::<Int64Type>::from(vec![10, 20, 30, 40])) as ArrayRef;
5086 let micros = Arc::new(PrimitiveArray::<Int64Type>::from(vec![100, 200, 300, 400]))
5087 as ArrayRef;
5088 let millis = Arc::new(PrimitiveArray::<Int64Type>::from(vec![
5089 1000, 2000, 3000, 4000,
5090 ])) as ArrayRef;
5091 let seconds =
5092 Arc::new(PrimitiveArray::<Int64Type>::from(vec![1, 2, 3, 4])) as ArrayRef;
5093
5094 RecordBatch::try_new(schema, vec![nanos, micros, millis, seconds])?
5095 }
5096 };
5097
5098 assert_eq!(actual_batch, expected_batch);
5099
5100 Ok(())
5101 }
5102
5103 #[test]
5104 #[cfg(feature = "snappy")]
5106 fn test_dict_pages_offset_zero() {
5107 let file = arrow_test_data("avro/dict-page-offset-zero.avro");
5108 let batch = read_file(&file, 32, false);
5109 let num_rows = batch.num_rows();
5110 let expected_field = Int32Array::from(vec![Some(1552); num_rows]);
5111 let expected = RecordBatch::try_from_iter_with_nullable([(
5112 "l_partkey",
5113 Arc::new(expected_field) as Arc<dyn Array>,
5114 true,
5115 )])
5116 .unwrap();
5117 assert_eq!(batch, expected);
5118 }
5119
5120 #[test]
5121 #[cfg(feature = "snappy")]
5123 fn test_list_columns() {
5124 let file = arrow_test_data("avro/list_columns.avro");
5125 let mut int64_list_builder = ListBuilder::new(Int64Builder::new());
5126 {
5127 {
5128 let values = int64_list_builder.values();
5129 values.append_value(1);
5130 values.append_value(2);
5131 values.append_value(3);
5132 }
5133 int64_list_builder.append(true);
5134 }
5135 {
5136 {
5137 let values = int64_list_builder.values();
5138 values.append_null();
5139 values.append_value(1);
5140 }
5141 int64_list_builder.append(true);
5142 }
5143 {
5144 {
5145 let values = int64_list_builder.values();
5146 values.append_value(4);
5147 }
5148 int64_list_builder.append(true);
5149 }
5150 let int64_list = int64_list_builder.finish();
5151 let mut utf8_list_builder = ListBuilder::new(StringBuilder::new());
5152 {
5153 {
5154 let values = utf8_list_builder.values();
5155 values.append_value("abc");
5156 values.append_value("efg");
5157 values.append_value("hij");
5158 }
5159 utf8_list_builder.append(true);
5160 }
5161 {
5162 utf8_list_builder.append(false);
5163 }
5164 {
5165 {
5166 let values = utf8_list_builder.values();
5167 values.append_value("efg");
5168 values.append_null();
5169 values.append_value("hij");
5170 values.append_value("xyz");
5171 }
5172 utf8_list_builder.append(true);
5173 }
5174 let utf8_list = utf8_list_builder.finish();
5175 let expected = RecordBatch::try_from_iter_with_nullable([
5176 ("int64_list", Arc::new(int64_list) as Arc<dyn Array>, true),
5177 ("utf8_list", Arc::new(utf8_list) as Arc<dyn Array>, true),
5178 ])
5179 .unwrap();
5180 let batch = read_file(&file, 8, false);
5181 assert_eq!(batch, expected);
5182 }
5183
5184 #[test]
5185 #[cfg(feature = "snappy")]
5186 fn test_nested_lists() {
5187 use arrow_data::ArrayDataBuilder;
5188 let file = arrow_test_data("avro/nested_lists.snappy.avro");
5189 let inner_values = StringArray::from(vec![
5190 Some("a"),
5191 Some("b"),
5192 Some("c"),
5193 Some("d"),
5194 Some("a"),
5195 Some("b"),
5196 Some("c"),
5197 Some("d"),
5198 Some("e"),
5199 Some("a"),
5200 Some("b"),
5201 Some("c"),
5202 Some("d"),
5203 Some("e"),
5204 Some("f"),
5205 ]);
5206 let inner_offsets = Buffer::from_slice_ref([0, 2, 3, 3, 4, 6, 8, 8, 9, 11, 13, 14, 14, 15]);
5207 let inner_validity = [
5208 true, true, false, true, true, true, false, true, true, true, true, false, true,
5209 ];
5210 let inner_null_buffer = Buffer::from_iter(inner_validity.iter().copied());
5211 let inner_field = Field::new("item", DataType::Utf8, true);
5212 let inner_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(inner_field)))
5213 .len(13)
5214 .add_buffer(inner_offsets)
5215 .add_child_data(inner_values.to_data())
5216 .null_bit_buffer(Some(inner_null_buffer))
5217 .build()
5218 .unwrap();
5219 let inner_list_array = ListArray::from(inner_list_data);
5220 let middle_offsets = Buffer::from_slice_ref([0, 2, 4, 6, 8, 11, 13]);
5221 let middle_validity = [true; 6];
5222 let middle_null_buffer = Buffer::from_iter(middle_validity.iter().copied());
5223 let middle_field = Field::new("item", inner_list_array.data_type().clone(), true);
5224 let middle_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(middle_field)))
5225 .len(6)
5226 .add_buffer(middle_offsets)
5227 .add_child_data(inner_list_array.to_data())
5228 .null_bit_buffer(Some(middle_null_buffer))
5229 .build()
5230 .unwrap();
5231 let middle_list_array = ListArray::from(middle_list_data);
5232 let outer_offsets = Buffer::from_slice_ref([0, 2, 4, 6]);
5233 let outer_null_buffer = Buffer::from_slice_ref([0b111]); let outer_field = Field::new("item", middle_list_array.data_type().clone(), true);
5235 let outer_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(outer_field)))
5236 .len(3)
5237 .add_buffer(outer_offsets)
5238 .add_child_data(middle_list_array.to_data())
5239 .null_bit_buffer(Some(outer_null_buffer))
5240 .build()
5241 .unwrap();
5242 let a_expected = ListArray::from(outer_list_data);
5243 let b_expected = Int32Array::from(vec![1, 1, 1]);
5244 let expected = RecordBatch::try_from_iter_with_nullable([
5245 ("a", Arc::new(a_expected) as Arc<dyn Array>, true),
5246 ("b", Arc::new(b_expected) as Arc<dyn Array>, true),
5247 ])
5248 .unwrap();
5249 let left = read_file(&file, 8, false);
5250 assert_eq!(left, expected, "Mismatch for batch size=8");
5251 let left_small = read_file(&file, 3, false);
5252 assert_eq!(left_small, expected, "Mismatch for batch size=3");
5253 }
5254
5255 #[test]
5256 fn test_simple() {
5257 let tests = [
5258 ("avro/simple_enum.avro", 4, build_expected_enum(), 2),
5259 ("avro/simple_fixed.avro", 2, build_expected_fixed(), 1),
5260 ];
5261
5262 fn build_expected_enum() -> RecordBatch {
5263 let keys_f1 = Int32Array::from(vec![0, 1, 2, 3]);
5265 let vals_f1 = StringArray::from(vec!["a", "b", "c", "d"]);
5266 let f1_dict =
5267 DictionaryArray::<Int32Type>::try_new(keys_f1, Arc::new(vals_f1)).unwrap();
5268 let keys_f2 = Int32Array::from(vec![2, 3, 0, 1]);
5269 let vals_f2 = StringArray::from(vec!["e", "f", "g", "h"]);
5270 let f2_dict =
5271 DictionaryArray::<Int32Type>::try_new(keys_f2, Arc::new(vals_f2)).unwrap();
5272 let keys_f3 = Int32Array::from(vec![Some(1), Some(2), None, Some(0)]);
5273 let vals_f3 = StringArray::from(vec!["i", "j", "k"]);
5274 let f3_dict =
5275 DictionaryArray::<Int32Type>::try_new(keys_f3, Arc::new(vals_f3)).unwrap();
5276 let dict_type =
5277 DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
5278 let mut md_f1 = HashMap::new();
5279 md_f1.insert(
5280 AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5281 r#"["a","b","c","d"]"#.to_string(),
5282 );
5283 md_f1.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum1".to_string());
5284 md_f1.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns1".to_string());
5285 let f1_field = Field::new("f1", dict_type.clone(), false).with_metadata(md_f1);
5286 let mut md_f2 = HashMap::new();
5287 md_f2.insert(
5288 AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5289 r#"["e","f","g","h"]"#.to_string(),
5290 );
5291 md_f2.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum2".to_string());
5292 md_f2.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns2".to_string());
5293 let f2_field = Field::new("f2", dict_type.clone(), false).with_metadata(md_f2);
5294 let mut md_f3 = HashMap::new();
5295 md_f3.insert(
5296 AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
5297 r#"["i","j","k"]"#.to_string(),
5298 );
5299 md_f3.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum3".to_string());
5300 md_f3.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns1".to_string());
5301 let f3_field = Field::new("f3", dict_type.clone(), true).with_metadata(md_f3);
5302 let expected_schema = Arc::new(Schema::new(vec![f1_field, f2_field, f3_field]));
5303 RecordBatch::try_new(
5304 expected_schema,
5305 vec![
5306 Arc::new(f1_dict) as Arc<dyn Array>,
5307 Arc::new(f2_dict) as Arc<dyn Array>,
5308 Arc::new(f3_dict) as Arc<dyn Array>,
5309 ],
5310 )
5311 .unwrap()
5312 }
5313
5314 fn build_expected_fixed() -> RecordBatch {
5315 let f1 =
5316 FixedSizeBinaryArray::try_from_iter(vec![b"abcde", b"12345"].into_iter()).unwrap();
5317 let f2 =
5318 FixedSizeBinaryArray::try_from_iter(vec![b"fghijklmno", b"1234567890"].into_iter())
5319 .unwrap();
5320 let f3 = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
5321 vec![Some(b"ABCDEF" as &[u8]), None].into_iter(),
5322 6,
5323 )
5324 .unwrap();
5325
5326 let mut md_f1 = HashMap::new();
5328 md_f1.insert(
5329 crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5330 "fixed1".to_string(),
5331 );
5332 md_f1.insert(
5333 crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5334 "ns1".to_string(),
5335 );
5336
5337 let mut md_f2 = HashMap::new();
5338 md_f2.insert(
5339 crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5340 "fixed2".to_string(),
5341 );
5342 md_f2.insert(
5343 crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5344 "ns2".to_string(),
5345 );
5346
5347 let mut md_f3 = HashMap::new();
5348 md_f3.insert(
5349 crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
5350 "fixed3".to_string(),
5351 );
5352 md_f3.insert(
5353 crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
5354 "ns1".to_string(),
5355 );
5356
5357 let expected_schema = Arc::new(Schema::new(vec![
5358 Field::new("f1", DataType::FixedSizeBinary(5), false).with_metadata(md_f1),
5359 Field::new("f2", DataType::FixedSizeBinary(10), false).with_metadata(md_f2),
5360 Field::new("f3", DataType::FixedSizeBinary(6), true).with_metadata(md_f3),
5361 ]));
5362
5363 RecordBatch::try_new(
5364 expected_schema,
5365 vec![
5366 Arc::new(f1) as Arc<dyn Array>,
5367 Arc::new(f2) as Arc<dyn Array>,
5368 Arc::new(f3) as Arc<dyn Array>,
5369 ],
5370 )
5371 .unwrap()
5372 }
5373 for (file_name, batch_size, expected, alt_batch_size) in tests {
5374 let file = arrow_test_data(file_name);
5375 let actual = read_file(&file, batch_size, false);
5376 assert_eq!(actual, expected);
5377 let actual2 = read_file(&file, alt_batch_size, false);
5378 assert_eq!(actual2, expected);
5379 }
5380 }
5381
5382 #[test]
5383 #[cfg(feature = "snappy")]
5384 fn test_single_nan() {
5385 let file = arrow_test_data("avro/single_nan.avro");
5386 let actual = read_file(&file, 1, false);
5387 use arrow_array::Float64Array;
5388 let schema = Arc::new(Schema::new(vec![Field::new(
5389 "mycol",
5390 DataType::Float64,
5391 true,
5392 )]));
5393 let col = Float64Array::from(vec![None]);
5394 let expected = RecordBatch::try_new(schema, vec![Arc::new(col)]).unwrap();
5395 assert_eq!(actual, expected);
5396 let actual2 = read_file(&file, 2, false);
5397 assert_eq!(actual2, expected);
5398 }
5399
5400 #[test]
5401 fn test_duration_uuid() {
5402 let batch = read_file("test/data/duration_uuid.avro", 4, false);
5403 let schema = batch.schema();
5404 let fields = schema.fields();
5405 assert_eq!(fields.len(), 2);
5406 assert_eq!(fields[0].name(), "duration_field");
5407 assert_eq!(
5408 fields[0].data_type(),
5409 &DataType::Interval(IntervalUnit::MonthDayNano)
5410 );
5411 assert_eq!(fields[1].name(), "uuid_field");
5412 assert_eq!(fields[1].data_type(), &DataType::FixedSizeBinary(16));
5413 assert_eq!(batch.num_rows(), 4);
5414 assert_eq!(batch.num_columns(), 2);
5415 let duration_array = batch
5416 .column(0)
5417 .as_any()
5418 .downcast_ref::<IntervalMonthDayNanoArray>()
5419 .unwrap();
5420 let expected_duration_array: IntervalMonthDayNanoArray = [
5421 Some(IntervalMonthDayNanoType::make_value(1, 15, 500_000_000)),
5422 Some(IntervalMonthDayNanoType::make_value(0, 5, 2_500_000_000)),
5423 Some(IntervalMonthDayNanoType::make_value(2, 0, 0)),
5424 Some(IntervalMonthDayNanoType::make_value(12, 31, 999_000_000)),
5425 ]
5426 .iter()
5427 .copied()
5428 .collect();
5429 assert_eq!(&expected_duration_array, duration_array);
5430 let uuid_array = batch
5431 .column(1)
5432 .as_any()
5433 .downcast_ref::<FixedSizeBinaryArray>()
5434 .unwrap();
5435 let expected_uuid_array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
5436 [
5437 Some([
5438 0xfe, 0x7b, 0xc3, 0x0b, 0x4c, 0xe8, 0x4c, 0x5e, 0xb6, 0x7c, 0x22, 0x34, 0xa2,
5439 0xd3, 0x8e, 0x66,
5440 ]),
5441 Some([
5442 0xb3, 0x3f, 0x2a, 0xd7, 0x97, 0xb4, 0x4d, 0xe1, 0x8b, 0xfe, 0x94, 0x94, 0x1d,
5443 0x60, 0x15, 0x6e,
5444 ]),
5445 Some([
5446 0x5f, 0x74, 0x92, 0x64, 0x07, 0x4b, 0x40, 0x05, 0x84, 0xbf, 0x11, 0x5e, 0xa8,
5447 0x4e, 0xd2, 0x0a,
5448 ]),
5449 Some([
5450 0x08, 0x26, 0xcc, 0x06, 0xd2, 0xe3, 0x45, 0x99, 0xb4, 0xad, 0xaf, 0x5f, 0xa6,
5451 0x90, 0x5c, 0xdb,
5452 ]),
5453 ]
5454 .into_iter(),
5455 16,
5456 )
5457 .unwrap();
5458 assert_eq!(&expected_uuid_array, uuid_array);
5459 }
5460
5461 #[test]
5462 #[cfg(feature = "snappy")]
5463 fn test_datapage_v2() {
5464 let file = arrow_test_data("avro/datapage_v2.snappy.avro");
5465 let batch = read_file(&file, 8, false);
5466 let a = StringArray::from(vec![
5467 Some("abc"),
5468 Some("abc"),
5469 Some("abc"),
5470 None,
5471 Some("abc"),
5472 ]);
5473 let b = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4), Some(5)]);
5474 let c = Float64Array::from(vec![Some(2.0), Some(3.0), Some(4.0), Some(5.0), Some(2.0)]);
5475 let d = BooleanArray::from(vec![
5476 Some(true),
5477 Some(true),
5478 Some(true),
5479 Some(false),
5480 Some(true),
5481 ]);
5482 let e_values = Int32Array::from(vec![
5483 Some(1),
5484 Some(2),
5485 Some(3),
5486 Some(1),
5487 Some(2),
5488 Some(3),
5489 Some(1),
5490 Some(2),
5491 ]);
5492 let e_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 3, 3, 3, 6, 8]));
5493 let e_validity = Some(NullBuffer::from(vec![true, false, false, true, true]));
5494 let field_e = Arc::new(Field::new("item", DataType::Int32, true));
5495 let e = ListArray::new(field_e, e_offsets, Arc::new(e_values), e_validity);
5496 let expected = RecordBatch::try_from_iter_with_nullable([
5497 ("a", Arc::new(a) as Arc<dyn Array>, true),
5498 ("b", Arc::new(b) as Arc<dyn Array>, true),
5499 ("c", Arc::new(c) as Arc<dyn Array>, true),
5500 ("d", Arc::new(d) as Arc<dyn Array>, true),
5501 ("e", Arc::new(e) as Arc<dyn Array>, true),
5502 ])
5503 .unwrap();
5504 assert_eq!(batch, expected);
5505 }
5506
5507 #[test]
5508 fn test_nested_records() {
5509 let f1_f1_1 = StringArray::from(vec!["aaa", "bbb"]);
5510 let f1_f1_2 = Int32Array::from(vec![10, 20]);
5511 let rounded_pi = (std::f64::consts::PI * 100.0).round() / 100.0;
5512 let f1_f1_3_1 = Float64Array::from(vec![rounded_pi, rounded_pi]);
5513 let f1_f1_3 = StructArray::from(vec![(
5514 Arc::new(Field::new("f1_3_1", DataType::Float64, false)),
5515 Arc::new(f1_f1_3_1) as Arc<dyn Array>,
5516 )]);
5517 let mut f1_3_md: HashMap<String, String> = HashMap::new();
5519 f1_3_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns3".to_string());
5520 f1_3_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record3".to_string());
5521 let f1_expected = StructArray::from(vec![
5522 (
5523 Arc::new(Field::new("f1_1", DataType::Utf8, false)),
5524 Arc::new(f1_f1_1) as Arc<dyn Array>,
5525 ),
5526 (
5527 Arc::new(Field::new("f1_2", DataType::Int32, false)),
5528 Arc::new(f1_f1_2) as Arc<dyn Array>,
5529 ),
5530 (
5531 Arc::new(
5532 Field::new(
5533 "f1_3",
5534 DataType::Struct(Fields::from(vec![Field::new(
5535 "f1_3_1",
5536 DataType::Float64,
5537 false,
5538 )])),
5539 false,
5540 )
5541 .with_metadata(f1_3_md),
5542 ),
5543 Arc::new(f1_f1_3) as Arc<dyn Array>,
5544 ),
5545 ]);
5546 let f2_fields = vec![
5547 Field::new("f2_1", DataType::Boolean, false),
5548 Field::new("f2_2", DataType::Float32, false),
5549 ];
5550 let f2_struct_builder = StructBuilder::new(
5551 f2_fields
5552 .iter()
5553 .map(|f| Arc::new(f.clone()))
5554 .collect::<Vec<Arc<Field>>>(),
5555 vec![
5556 Box::new(BooleanBuilder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>,
5557 Box::new(Float32Builder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>,
5558 ],
5559 );
5560 let mut f2_list_builder = ListBuilder::new(f2_struct_builder);
5561 {
5562 let struct_builder = f2_list_builder.values();
5563 struct_builder.append(true);
5564 {
5565 let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
5566 b.append_value(true);
5567 }
5568 {
5569 let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
5570 b.append_value(1.2_f32);
5571 }
5572 struct_builder.append(true);
5573 {
5574 let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
5575 b.append_value(true);
5576 }
5577 {
5578 let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
5579 b.append_value(2.2_f32);
5580 }
5581 f2_list_builder.append(true);
5582 }
5583 {
5584 let struct_builder = f2_list_builder.values();
5585 struct_builder.append(true);
5586 {
5587 let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
5588 b.append_value(false);
5589 }
5590 {
5591 let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
5592 b.append_value(10.2_f32);
5593 }
5594 f2_list_builder.append(true);
5595 }
5596
5597 let list_array_with_nullable_items = f2_list_builder.finish();
5598 let mut f2_item_md: HashMap<String, String> = HashMap::new();
5600 f2_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record4".to_string());
5601 f2_item_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns4".to_string());
5602 let item_field = Arc::new(
5603 Field::new(
5604 "item",
5605 list_array_with_nullable_items.values().data_type().clone(),
5606 false, )
5608 .with_metadata(f2_item_md),
5609 );
5610 let list_data_type = DataType::List(item_field);
5611 let f2_array_data = list_array_with_nullable_items
5612 .to_data()
5613 .into_builder()
5614 .data_type(list_data_type)
5615 .build()
5616 .unwrap();
5617 let f2_expected = ListArray::from(f2_array_data);
5618 let mut f3_struct_builder = StructBuilder::new(
5619 vec![Arc::new(Field::new("f3_1", DataType::Utf8, false))],
5620 vec![Box::new(StringBuilder::new()) as Box<dyn ArrayBuilder>],
5621 );
5622 f3_struct_builder.append(true);
5623 {
5624 let b = f3_struct_builder.field_builder::<StringBuilder>(0).unwrap();
5625 b.append_value("xyz");
5626 }
5627 f3_struct_builder.append(false);
5628 {
5629 let b = f3_struct_builder.field_builder::<StringBuilder>(0).unwrap();
5630 b.append_null();
5631 }
5632 let f3_expected = f3_struct_builder.finish();
5633 let f4_fields = [Field::new("f4_1", DataType::Int64, false)];
5634 let f4_struct_builder = StructBuilder::new(
5635 f4_fields
5636 .iter()
5637 .map(|f| Arc::new(f.clone()))
5638 .collect::<Vec<Arc<Field>>>(),
5639 vec![Box::new(Int64Builder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>],
5640 );
5641 let mut f4_list_builder = ListBuilder::new(f4_struct_builder);
5642 {
5643 let struct_builder = f4_list_builder.values();
5644 struct_builder.append(true);
5645 {
5646 let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
5647 b.append_value(200);
5648 }
5649 struct_builder.append(false);
5650 {
5651 let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
5652 b.append_null();
5653 }
5654 f4_list_builder.append(true);
5655 }
5656 {
5657 let struct_builder = f4_list_builder.values();
5658 struct_builder.append(false);
5659 {
5660 let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
5661 b.append_null();
5662 }
5663 struct_builder.append(true);
5664 {
5665 let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
5666 b.append_value(300);
5667 }
5668 f4_list_builder.append(true);
5669 }
5670 let f4_expected = f4_list_builder.finish();
5671 let mut f4_item_md: HashMap<String, String> = HashMap::new();
5673 f4_item_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns6".to_string());
5674 f4_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record6".to_string());
5675 let f4_item_field = Arc::new(
5676 Field::new("item", f4_expected.values().data_type().clone(), true)
5677 .with_metadata(f4_item_md),
5678 );
5679 let f4_list_data_type = DataType::List(f4_item_field);
5680 let f4_array_data = f4_expected
5681 .to_data()
5682 .into_builder()
5683 .data_type(f4_list_data_type)
5684 .build()
5685 .unwrap();
5686 let f4_expected = ListArray::from(f4_array_data);
5687 let mut f1_md: HashMap<String, String> = HashMap::new();
5689 f1_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record2".to_string());
5690 f1_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns2".to_string());
5691 let mut f3_md: HashMap<String, String> = HashMap::new();
5692 f3_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns5".to_string());
5693 f3_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record5".to_string());
5694 let expected_schema = Schema::new(vec![
5695 Field::new("f1", f1_expected.data_type().clone(), false).with_metadata(f1_md),
5696 Field::new("f2", f2_expected.data_type().clone(), false),
5697 Field::new("f3", f3_expected.data_type().clone(), true).with_metadata(f3_md),
5698 Field::new("f4", f4_expected.data_type().clone(), false),
5699 ]);
5700 let expected = RecordBatch::try_new(
5701 Arc::new(expected_schema),
5702 vec![
5703 Arc::new(f1_expected) as Arc<dyn Array>,
5704 Arc::new(f2_expected) as Arc<dyn Array>,
5705 Arc::new(f3_expected) as Arc<dyn Array>,
5706 Arc::new(f4_expected) as Arc<dyn Array>,
5707 ],
5708 )
5709 .unwrap();
5710 let file = arrow_test_data("avro/nested_records.avro");
5711 let batch_large = read_file(&file, 8, false);
5712 assert_eq!(
5713 batch_large, expected,
5714 "Decoded RecordBatch does not match expected data for nested records (batch size 8)"
5715 );
5716 let batch_small = read_file(&file, 3, false);
5717 assert_eq!(
5718 batch_small, expected,
5719 "Decoded RecordBatch does not match expected data for nested records (batch size 3)"
5720 );
5721 }
5722
5723 #[test]
5724 #[cfg(feature = "snappy")]
5726 fn test_repeated_no_annotation() {
5727 use arrow_data::ArrayDataBuilder;
5728 let file = arrow_test_data("avro/repeated_no_annotation.avro");
5729 let batch_large = read_file(&file, 8, false);
5730 let id_array = Int32Array::from(vec![1, 2, 3, 4, 5, 6]);
5732 let number_array = Int64Array::from(vec![
5734 Some(5555555555),
5735 Some(1111111111),
5736 Some(1111111111),
5737 Some(2222222222),
5738 Some(3333333333),
5739 ]);
5740 let kind_array =
5741 StringArray::from(vec![None, Some("home"), Some("home"), None, Some("mobile")]);
5742 let phone_fields = Fields::from(vec![
5743 Field::new("number", DataType::Int64, true),
5744 Field::new("kind", DataType::Utf8, true),
5745 ]);
5746 let phone_struct_data = ArrayDataBuilder::new(DataType::Struct(phone_fields))
5747 .len(5)
5748 .child_data(vec![number_array.into_data(), kind_array.into_data()])
5749 .build()
5750 .unwrap();
5751 let phone_struct_array = StructArray::from(phone_struct_data);
5752 let phone_list_offsets = Buffer::from_slice_ref([0i32, 0, 0, 0, 1, 2, 5]);
5754 let phone_list_validity = Buffer::from_iter([false, false, true, true, true, true]);
5755 let mut phone_item_md = HashMap::new();
5757 phone_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "phone".to_string());
5758 phone_item_md.insert(
5759 AVRO_NAMESPACE_METADATA_KEY.to_string(),
5760 "topLevelRecord.phoneNumbers".to_string(),
5761 );
5762 let phone_item_field = Field::new("item", phone_struct_array.data_type().clone(), true)
5763 .with_metadata(phone_item_md);
5764 let phone_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(phone_item_field)))
5765 .len(6)
5766 .add_buffer(phone_list_offsets)
5767 .null_bit_buffer(Some(phone_list_validity))
5768 .child_data(vec![phone_struct_array.into_data()])
5769 .build()
5770 .unwrap();
5771 let phone_list_array = ListArray::from(phone_list_data);
5772 let phone_numbers_validity = Buffer::from_iter([false, false, true, true, true, true]);
5774 let phone_numbers_field = Field::new("phone", phone_list_array.data_type().clone(), true);
5775 let phone_numbers_struct_data =
5776 ArrayDataBuilder::new(DataType::Struct(Fields::from(vec![phone_numbers_field])))
5777 .len(6)
5778 .null_bit_buffer(Some(phone_numbers_validity))
5779 .child_data(vec![phone_list_array.into_data()])
5780 .build()
5781 .unwrap();
5782 let phone_numbers_struct_array = StructArray::from(phone_numbers_struct_data);
5783 let mut phone_numbers_md = HashMap::new();
5785 phone_numbers_md.insert(
5786 AVRO_NAME_METADATA_KEY.to_string(),
5787 "phoneNumbers".to_string(),
5788 );
5789 phone_numbers_md.insert(
5790 AVRO_NAMESPACE_METADATA_KEY.to_string(),
5791 "topLevelRecord".to_string(),
5792 );
5793 let id_field = Field::new("id", DataType::Int32, true);
5794 let phone_numbers_schema_field = Field::new(
5795 "phoneNumbers",
5796 phone_numbers_struct_array.data_type().clone(),
5797 true,
5798 )
5799 .with_metadata(phone_numbers_md);
5800 let expected_schema = Schema::new(vec![id_field, phone_numbers_schema_field]);
5801 let expected = RecordBatch::try_new(
5803 Arc::new(expected_schema),
5804 vec![
5805 Arc::new(id_array) as _,
5806 Arc::new(phone_numbers_struct_array) as _,
5807 ],
5808 )
5809 .unwrap();
5810 assert_eq!(batch_large, expected, "Mismatch for batch_size=8");
5811 let batch_small = read_file(&file, 3, false);
5812 assert_eq!(batch_small, expected, "Mismatch for batch_size=3");
5813 }
5814
    // Decodes `avro/nonnullable.impala.avro` and compares it against a single-row batch
    // assembled by hand with Arrow builders. The expected row has six columns: `ID`,
    // `Int_Array`, `int_array_array`, `Int_Map`, `int_map_array` and `nested_Struct`.
    // The comparison is performed for batch sizes 8 and 3.
    #[test]
    #[cfg(feature = "snappy")]
    fn test_nonnullable_impala() {
        let file = arrow_test_data("avro/nonnullable.impala.avro");
        // Column `ID`: one row holding 8.
        let id = Int64Array::from(vec![Some(8)]);
        // Column `Int_Array`: one list row containing [-1].
        let mut int_array_builder = ListBuilder::new(Int32Builder::new());
        {
            let vb = int_array_builder.values();
            vb.append_value(-1);
        }
        int_array_builder.append(true);
        let int_array = int_array_builder.finish();
        // Column `int_array_array`: one row containing two inner lists, [-1, -2] and [].
        let mut iaa_builder = ListBuilder::new(ListBuilder::new(Int32Builder::new()));
        {
            let inner_list_builder = iaa_builder.values();
            {
                let vb = inner_list_builder.values();
                vb.append_value(-1);
                vb.append_value(-2);
            }
            // First append closes [-1, -2]; second append closes an empty inner list.
            inner_list_builder.append(true);
            inner_list_builder.append(true);
        }
        iaa_builder.append(true);
        let int_array_array = iaa_builder.finish();
        // Column `Int_Map`: one map row with the single entry "k1" -> -1.
        let field_names = MapFieldNames {
            entry: "entries".to_string(),
            key: "key".to_string(),
            value: "value".to_string(),
        };
        let mut int_map_builder =
            MapBuilder::new(Some(field_names), StringBuilder::new(), Int32Builder::new());
        {
            let (keys, vals) = int_map_builder.entries();
            keys.append_value("k1");
            vals.append_value(-1);
        }
        int_map_builder.append(true).unwrap();
        let int_map = int_map_builder.finish();
        // Column `int_map_array`: one list row holding four maps; only the second
        // map has an entry ("k1" -> 1), the other three are empty.
        let field_names2 = MapFieldNames {
            entry: "entries".to_string(),
            key: "key".to_string(),
            value: "value".to_string(),
        };
        let mut ima_builder = ListBuilder::new(MapBuilder::new(
            Some(field_names2),
            StringBuilder::new(),
            Int32Builder::new(),
        ));
        {
            let map_builder = ima_builder.values();
            // Empty map closed before any entry is written.
            map_builder.append(true).unwrap();
            {
                let (keys, vals) = map_builder.entries();
                keys.append_value("k1");
                vals.append_value(1);
            }
            // Closes the {"k1": 1} map, followed by two more empty maps.
            map_builder.append(true).unwrap();
            map_builder.append(true).unwrap();
            map_builder.append(true).unwrap();
        }
        ima_builder.append(true);
        let int_map_array_ = ima_builder.finish();
        // Avro name/namespace metadata attached to the record-typed fields below.
        let meta_nested_struct: HashMap<String, String> = [
            ("avro.name", "nested_Struct"),
            ("avro.namespace", "topLevelRecord"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();
        let meta_c: HashMap<String, String> = [
            ("avro.name", "c"),
            ("avro.namespace", "topLevelRecord.nested_Struct"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();
        let meta_d_item_struct: HashMap<String, String> = [
            ("avro.name", "D"),
            ("avro.namespace", "topLevelRecord.nested_Struct.c"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();
        let meta_g_value: HashMap<String, String> = [
            ("avro.name", "G"),
            ("avro.namespace", "topLevelRecord.nested_Struct"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();
        let meta_h: HashMap<String, String> = [
            ("avro.name", "h"),
            ("avro.namespace", "topLevelRecord.nested_Struct.G"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();
        // Field definitions for `nested_Struct.c.D`: list<list<struct{e, f}>>, where the
        // innermost struct item carries the `D` metadata.
        let ef_struct_field = Arc::new(
            Field::new(
                "item",
                DataType::Struct(
                    vec![
                        Field::new("e", DataType::Int32, true),
                        Field::new("f", DataType::Utf8, true),
                    ]
                    .into(),
                ),
                true,
            )
            .with_metadata(meta_d_item_struct.clone()),
        );
        let d_inner_list_field = Arc::new(Field::new(
            "item",
            DataType::List(ef_struct_field.clone()),
            true,
        ));
        let d_field = Field::new("D", DataType::List(d_inner_list_field.clone()), true);
        // Field definitions for `nested_Struct.G`: map<utf8, struct{h: struct{i: list<f64>}}>.
        let i_list_field = Arc::new(Field::new("item", DataType::Float64, true));
        let i_field = Field::new("i", DataType::List(i_list_field.clone()), true);
        let h_field = Field::new("h", DataType::Struct(vec![i_field.clone()].into()), true)
            .with_metadata(meta_h.clone());
        let g_value_struct_field = Field::new(
            "value",
            DataType::Struct(vec![h_field.clone()].into()),
            true,
        )
        .with_metadata(meta_g_value.clone());
        let entries_struct_field = Field::new(
            "entries",
            DataType::Struct(
                vec![
                    Field::new("key", DataType::Utf8, false),
                    g_value_struct_field.clone(),
                ]
                .into(),
            ),
            false,
        );
        // The four children of `nested_Struct`: a, B, c and G.
        let a_field = Arc::new(Field::new("a", DataType::Int32, true));
        let b_field = Arc::new(Field::new(
            "B",
            DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
            true,
        ));
        let c_field = Arc::new(
            Field::new("c", DataType::Struct(vec![d_field.clone()].into()), true)
                .with_metadata(meta_c.clone()),
        );
        let g_field = Arc::new(Field::new(
            "G",
            DataType::Map(Arc::new(entries_struct_field.clone()), false),
            true,
        ));
        // Struct builder for `nested_Struct`; the child builders are listed in the same
        // order as the fields (a, B, c, G).
        let mut nested_sb = StructBuilder::new(
            vec![
                a_field.clone(),
                b_field.clone(),
                c_field.clone(),
                g_field.clone(),
            ],
            vec![
                Box::new(Int32Builder::new()),
                Box::new(ListBuilder::new(Int32Builder::new())),
                {
                    // Builder for `c`: a struct with the single child `D`.
                    Box::new(StructBuilder::new(
                        vec![Arc::new(d_field.clone())],
                        vec![Box::new({
                            let ef_struct_builder = StructBuilder::new(
                                vec![
                                    Arc::new(Field::new("e", DataType::Int32, true)),
                                    Arc::new(Field::new("f", DataType::Utf8, true)),
                                ],
                                vec![
                                    Box::new(Int32Builder::new()),
                                    Box::new(StringBuilder::new()),
                                ],
                            );
                            // The inner list uses `ef_struct_field` so its item field
                            // carries the `D` metadata defined above.
                            let list_of_ef = ListBuilder::new(ef_struct_builder)
                                .with_field(ef_struct_field.clone());
                            ListBuilder::new(list_of_ef)
                        })],
                    ))
                },
                {
                    // Builder for `G`: a map whose value struct nests `h` and then `i`.
                    let map_field_names = MapFieldNames {
                        entry: "entries".to_string(),
                        key: "key".to_string(),
                        value: "value".to_string(),
                    };
                    let i_list_builder = ListBuilder::new(Float64Builder::new());
                    let h_struct_builder = StructBuilder::new(
                        vec![Arc::new(Field::new(
                            "i",
                            DataType::List(i_list_field.clone()),
                            true,
                        ))],
                        vec![Box::new(i_list_builder)],
                    );
                    let g_value_builder = StructBuilder::new(
                        vec![Arc::new(
                            Field::new("h", DataType::Struct(vec![i_field.clone()].into()), true)
                                .with_metadata(meta_h.clone()),
                        )],
                        vec![Box::new(h_struct_builder)],
                    );
                    // The values field is set explicitly so the map value carries the
                    // `G` metadata, mirroring `g_value_struct_field`.
                    let map_builder = MapBuilder::new(
                        Some(map_field_names),
                        StringBuilder::new(),
                        g_value_builder,
                    )
                    .with_values_field(Arc::new(
                        Field::new(
                            "value",
                            DataType::Struct(vec![h_field.clone()].into()),
                            true,
                        )
                        .with_metadata(meta_g_value.clone()),
                    ));

                    Box::new(map_builder)
                },
            ],
        );
        // Populate the single `nested_Struct` row.
        nested_sb.append(true);
        {
            // a = -1
            let a_builder = nested_sb.field_builder::<Int32Builder>(0).unwrap();
            a_builder.append_value(-1);
        }
        {
            // B = [-1]
            let b_builder = nested_sb
                .field_builder::<ListBuilder<Int32Builder>>(1)
                .unwrap();
            {
                let vb = b_builder.values();
                vb.append_value(-1);
            }
            b_builder.append(true);
        }
        {
            // c.D = [[{e: -1, f: "nonnullable"}]]
            let c_struct_builder = nested_sb.field_builder::<StructBuilder>(2).unwrap();
            c_struct_builder.append(true);
            let d_list_builder = c_struct_builder
                .field_builder::<ListBuilder<ListBuilder<StructBuilder>>>(0)
                .unwrap();
            {
                let sub_list_builder = d_list_builder.values();
                {
                    let ef_struct = sub_list_builder.values();
                    ef_struct.append(true);
                    {
                        let e_b = ef_struct.field_builder::<Int32Builder>(0).unwrap();
                        e_b.append_value(-1);
                        let f_b = ef_struct.field_builder::<StringBuilder>(1).unwrap();
                        f_b.append_value("nonnullable");
                    }
                    sub_list_builder.append(true);
                }
                d_list_builder.append(true);
            }
        }
        {
            // G = {} (a valid map with no entries)
            let g_map_builder = nested_sb
                .field_builder::<MapBuilder<StringBuilder, StructBuilder>>(3)
                .unwrap();
            g_map_builder.append(true).unwrap();
        }
        let nested_struct = nested_sb.finish();
        // Every top-level field is declared nullable; only `nested_Struct` carries metadata.
        let schema = Arc::new(arrow_schema::Schema::new(vec![
            Field::new("ID", id.data_type().clone(), true),
            Field::new("Int_Array", int_array.data_type().clone(), true),
            Field::new("int_array_array", int_array_array.data_type().clone(), true),
            Field::new("Int_Map", int_map.data_type().clone(), true),
            Field::new("int_map_array", int_map_array_.data_type().clone(), true),
            Field::new("nested_Struct", nested_struct.data_type().clone(), true)
                .with_metadata(meta_nested_struct.clone()),
        ]));
        let expected = RecordBatch::try_new(
            schema,
            vec![
                Arc::new(id) as Arc<dyn Array>,
                Arc::new(int_array),
                Arc::new(int_array_array),
                Arc::new(int_map),
                Arc::new(int_map_array_),
                Arc::new(nested_struct),
            ],
        )
        .unwrap();
        // The decoded batch must equal the expected batch for both batch sizes.
        let batch_large = read_file(&file, 8, false);
        assert_eq!(batch_large, expected, "Mismatch for batch_size=8");
        let batch_small = read_file(&file, 3, false);
        assert_eq!(batch_small, expected, "Mismatch for batch_size=3");
    }
6122
6123 #[test]
6124 fn test_nonnullable_impala_strict() {
6125 let file = arrow_test_data("avro/nonnullable.impala.avro");
6126 let err = read_file_strict(&file, 8, false).unwrap_err();
6127 assert!(err.to_string().contains(
6128 "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
6129 ));
6130 }
6131
6132 #[test]
6133 #[cfg(feature = "snappy")]
6135 fn test_nullable_impala() {
6136 let file = arrow_test_data("avro/nullable.impala.avro");
6137 let batch1 = read_file(&file, 3, false);
6138 let batch2 = read_file(&file, 8, false);
6139 assert_eq!(batch1, batch2);
6140 let batch = batch1;
6141 assert_eq!(batch.num_rows(), 7);
6142 let id_array = batch
6143 .column(0)
6144 .as_any()
6145 .downcast_ref::<Int64Array>()
6146 .expect("id column should be an Int64Array");
6147 let expected_ids = [1, 2, 3, 4, 5, 6, 7];
6148 for (i, &expected_id) in expected_ids.iter().enumerate() {
6149 assert_eq!(id_array.value(i), expected_id, "Mismatch in id at row {i}",);
6150 }
6151 let int_array = batch
6152 .column(1)
6153 .as_any()
6154 .downcast_ref::<ListArray>()
6155 .expect("int_array column should be a ListArray");
6156 {
6157 let offsets = int_array.value_offsets();
6158 let start = offsets[0] as usize;
6159 let end = offsets[1] as usize;
6160 let values = int_array
6161 .values()
6162 .as_any()
6163 .downcast_ref::<Int32Array>()
6164 .expect("Values of int_array should be an Int32Array");
6165 let row0: Vec<Option<i32>> = (start..end).map(|i| Some(values.value(i))).collect();
6166 assert_eq!(
6167 row0,
6168 vec![Some(1), Some(2), Some(3)],
6169 "Mismatch in int_array row 0"
6170 );
6171 }
6172 let nested_struct = batch
6173 .column(5)
6174 .as_any()
6175 .downcast_ref::<StructArray>()
6176 .expect("nested_struct column should be a StructArray");
6177 let a_array = nested_struct
6178 .column_by_name("A")
6179 .expect("Field A should exist in nested_struct")
6180 .as_any()
6181 .downcast_ref::<Int32Array>()
6182 .expect("Field A should be an Int32Array");
6183 assert_eq!(a_array.value(0), 1, "Mismatch in nested_struct.A at row 0");
6184 assert!(
6185 !a_array.is_valid(1),
6186 "Expected null in nested_struct.A at row 1"
6187 );
6188 assert!(
6189 !a_array.is_valid(3),
6190 "Expected null in nested_struct.A at row 3"
6191 );
6192 assert_eq!(a_array.value(6), 7, "Mismatch in nested_struct.A at row 6");
6193 }
6194
6195 #[test]
6196 fn test_nullable_impala_strict() {
6197 let file = arrow_test_data("avro/nullable.impala.avro");
6198 let err = read_file_strict(&file, 8, false).unwrap_err();
6199 assert!(err.to_string().contains(
6200 "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
6201 ));
6202 }
6203
6204 #[test]
6205 fn test_nested_record_type_reuse() {
6206 let batch = read_file("test/data/nested_record_reuse.avro", 8, false);
6232 let schema = batch.schema();
6233
6234 assert_eq!(schema.fields().len(), 3);
6236 let fields = schema.fields();
6237 assert_eq!(fields[0].name(), "nested");
6238 assert_eq!(fields[1].name(), "nestedRecord");
6239 assert_eq!(fields[2].name(), "nestedArray");
6240 assert!(matches!(fields[0].data_type(), DataType::Struct(_)));
6241 assert!(matches!(fields[1].data_type(), DataType::Struct(_)));
6242 assert!(matches!(fields[2].data_type(), DataType::List(_)));
6243
6244 if let DataType::Struct(nested_fields) = fields[0].data_type() {
6246 assert_eq!(nested_fields.len(), 1);
6247 assert_eq!(nested_fields[0].name(), "nested_int");
6248 assert_eq!(nested_fields[0].data_type(), &DataType::Int32);
6249 }
6250
6251 assert_eq!(fields[0].data_type(), fields[1].data_type());
6253 if let DataType::List(array_field) = fields[2].data_type() {
6254 assert_eq!(array_field.data_type(), fields[0].data_type());
6255 }
6256
6257 assert_eq!(batch.num_rows(), 2);
6259 assert_eq!(batch.num_columns(), 3);
6260
6261 let nested_col = batch
6263 .column(0)
6264 .as_any()
6265 .downcast_ref::<StructArray>()
6266 .unwrap();
6267 let nested_int_array = nested_col
6268 .column_by_name("nested_int")
6269 .unwrap()
6270 .as_any()
6271 .downcast_ref::<Int32Array>()
6272 .unwrap();
6273 assert_eq!(nested_int_array.value(0), 42);
6274 assert_eq!(nested_int_array.value(1), 99);
6275
6276 let nested_record_col = batch
6278 .column(1)
6279 .as_any()
6280 .downcast_ref::<StructArray>()
6281 .unwrap();
6282 let nested_record_int_array = nested_record_col
6283 .column_by_name("nested_int")
6284 .unwrap()
6285 .as_any()
6286 .downcast_ref::<Int32Array>()
6287 .unwrap();
6288 assert_eq!(nested_record_int_array.value(0), 100);
6289 assert_eq!(nested_record_int_array.value(1), 200);
6290
6291 let nested_array_col = batch
6293 .column(2)
6294 .as_any()
6295 .downcast_ref::<ListArray>()
6296 .unwrap();
6297 assert_eq!(nested_array_col.len(), 2);
6298 let first_array_struct = nested_array_col.value(0);
6299 let first_array_struct_array = first_array_struct
6300 .as_any()
6301 .downcast_ref::<StructArray>()
6302 .unwrap();
6303 let first_array_int_values = first_array_struct_array
6304 .column_by_name("nested_int")
6305 .unwrap()
6306 .as_any()
6307 .downcast_ref::<Int32Array>()
6308 .unwrap();
6309 assert_eq!(first_array_int_values.len(), 3);
6310 assert_eq!(first_array_int_values.value(0), 1);
6311 assert_eq!(first_array_int_values.value(1), 2);
6312 assert_eq!(first_array_int_values.value(2), 3);
6313 }
6314
6315 #[test]
6316 fn test_enum_type_reuse() {
6317 let batch = read_file("test/data/enum_reuse.avro", 8, false);
6340 let schema = batch.schema();
6341
6342 assert_eq!(schema.fields().len(), 3);
6344 let fields = schema.fields();
6345 assert_eq!(fields[0].name(), "status");
6346 assert_eq!(fields[1].name(), "backupStatus");
6347 assert_eq!(fields[2].name(), "statusHistory");
6348 assert!(matches!(fields[0].data_type(), DataType::Dictionary(_, _)));
6349 assert!(matches!(fields[1].data_type(), DataType::Dictionary(_, _)));
6350 assert!(matches!(fields[2].data_type(), DataType::List(_)));
6351
6352 if let DataType::Dictionary(key_type, value_type) = fields[0].data_type() {
6353 assert_eq!(key_type.as_ref(), &DataType::Int32);
6354 assert_eq!(value_type.as_ref(), &DataType::Utf8);
6355 }
6356
6357 assert_eq!(fields[0].data_type(), fields[1].data_type());
6359 if let DataType::List(array_field) = fields[2].data_type() {
6360 assert_eq!(array_field.data_type(), fields[0].data_type());
6361 }
6362
6363 assert_eq!(batch.num_rows(), 2);
6365 assert_eq!(batch.num_columns(), 3);
6366
6367 let status_col = batch
6369 .column(0)
6370 .as_any()
6371 .downcast_ref::<DictionaryArray<Int32Type>>()
6372 .unwrap();
6373 let status_values = status_col
6374 .values()
6375 .as_any()
6376 .downcast_ref::<StringArray>()
6377 .unwrap();
6378
6379 assert_eq!(
6381 status_values.value(status_col.key(0).unwrap() as usize),
6382 "ACTIVE"
6383 );
6384 assert_eq!(
6385 status_values.value(status_col.key(1).unwrap() as usize),
6386 "PENDING"
6387 );
6388
6389 let backup_status_col = batch
6391 .column(1)
6392 .as_any()
6393 .downcast_ref::<DictionaryArray<Int32Type>>()
6394 .unwrap();
6395 let backup_status_values = backup_status_col
6396 .values()
6397 .as_any()
6398 .downcast_ref::<StringArray>()
6399 .unwrap();
6400
6401 assert_eq!(
6403 backup_status_values.value(backup_status_col.key(0).unwrap() as usize),
6404 "INACTIVE"
6405 );
6406 assert_eq!(
6407 backup_status_values.value(backup_status_col.key(1).unwrap() as usize),
6408 "ACTIVE"
6409 );
6410
6411 let status_history_col = batch
6413 .column(2)
6414 .as_any()
6415 .downcast_ref::<ListArray>()
6416 .unwrap();
6417 assert_eq!(status_history_col.len(), 2);
6418
6419 let first_array_dict = status_history_col.value(0);
6421 let first_array_dict_array = first_array_dict
6422 .as_any()
6423 .downcast_ref::<DictionaryArray<Int32Type>>()
6424 .unwrap();
6425 let first_array_values = first_array_dict_array
6426 .values()
6427 .as_any()
6428 .downcast_ref::<StringArray>()
6429 .unwrap();
6430
6431 assert_eq!(first_array_dict_array.len(), 3);
6433 assert_eq!(
6434 first_array_values.value(first_array_dict_array.key(0).unwrap() as usize),
6435 "PENDING"
6436 );
6437 assert_eq!(
6438 first_array_values.value(first_array_dict_array.key(1).unwrap() as usize),
6439 "ACTIVE"
6440 );
6441 assert_eq!(
6442 first_array_values.value(first_array_dict_array.key(2).unwrap() as usize),
6443 "INACTIVE"
6444 );
6445 }
6446
6447 #[test]
6448 fn comprehensive_e2e_test() {
6449 let path = "test/data/comprehensive_e2e.avro";
6450 let batch = read_file(path, 1024, false);
6451 let schema = batch.schema();
6452
6453 #[inline]
6454 fn tid_by_name(fields: &UnionFields, want: &str) -> i8 {
6455 for (tid, f) in fields.iter() {
6456 if f.name() == want {
6457 return tid;
6458 }
6459 }
6460 panic!("union child '{want}' not found");
6461 }
6462
6463 #[inline]
6464 fn tid_by_dt(fields: &UnionFields, pred: impl Fn(&DataType) -> bool) -> i8 {
6465 for (tid, f) in fields.iter() {
6466 if pred(f.data_type()) {
6467 return tid;
6468 }
6469 }
6470 panic!("no union child matches predicate");
6471 }
6472
6473 fn mk_dense_union(
6474 fields: &UnionFields,
6475 type_ids: Vec<i8>,
6476 offsets: Vec<i32>,
6477 provide: impl Fn(&Field) -> Option<ArrayRef>,
6478 ) -> ArrayRef {
6479 fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
6480 match dt {
6481 DataType::Null => Arc::new(NullArray::new(0)),
6482 DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
6483 DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
6484 DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
6485 DataType::Float32 => Arc::new(Float32Array::from(Vec::<f32>::new())),
6486 DataType::Float64 => Arc::new(Float64Array::from(Vec::<f64>::new())),
6487 DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
6488 DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
6489 DataType::Date32 => Arc::new(Date32Array::from(Vec::<i32>::new())),
6490 DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
6491 Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
6492 }
6493 DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
6494 Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
6495 }
6496 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
6497 let a = TimestampMillisecondArray::from(Vec::<i64>::new());
6498 Arc::new(if let Some(tz) = tz {
6499 a.with_timezone(tz.clone())
6500 } else {
6501 a
6502 })
6503 }
6504 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
6505 let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
6506 Arc::new(if let Some(tz) = tz {
6507 a.with_timezone(tz.clone())
6508 } else {
6509 a
6510 })
6511 }
6512 DataType::Interval(IntervalUnit::MonthDayNano) => Arc::new(
6513 IntervalMonthDayNanoArray::from(Vec::<IntervalMonthDayNano>::new()),
6514 ),
6515 DataType::FixedSizeBinary(sz) => Arc::new(
6516 FixedSizeBinaryArray::try_from_sparse_iter_with_size(
6517 std::iter::empty::<Option<Vec<u8>>>(),
6518 *sz,
6519 )
6520 .unwrap(),
6521 ),
6522 DataType::Dictionary(_, _) => {
6523 let keys = Int32Array::from(Vec::<i32>::new());
6524 let values = Arc::new(StringArray::from(Vec::<&str>::new()));
6525 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
6526 }
6527 DataType::Struct(fields) => {
6528 let children: Vec<ArrayRef> = fields
6529 .iter()
6530 .map(|f| empty_child_for(f.data_type()) as ArrayRef)
6531 .collect();
6532 Arc::new(StructArray::new(fields.clone(), children, None))
6533 }
6534 DataType::List(field) => {
6535 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
6536 Arc::new(
6537 ListArray::try_new(
6538 field.clone(),
6539 offsets,
6540 empty_child_for(field.data_type()),
6541 None,
6542 )
6543 .unwrap(),
6544 )
6545 }
6546 DataType::Map(entry_field, is_sorted) => {
6547 let (key_field, val_field) = match entry_field.data_type() {
6548 DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
6549 other => panic!("unexpected map entries type: {other:?}"),
6550 };
6551 let keys = StringArray::from(Vec::<&str>::new());
6552 let vals: ArrayRef = match val_field.data_type() {
6553 DataType::Null => Arc::new(NullArray::new(0)) as ArrayRef,
6554 DataType::Boolean => {
6555 Arc::new(BooleanArray::from(Vec::<bool>::new())) as ArrayRef
6556 }
6557 DataType::Int32 => {
6558 Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
6559 }
6560 DataType::Int64 => {
6561 Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
6562 }
6563 DataType::Float32 => {
6564 Arc::new(Float32Array::from(Vec::<f32>::new())) as ArrayRef
6565 }
6566 DataType::Float64 => {
6567 Arc::new(Float64Array::from(Vec::<f64>::new())) as ArrayRef
6568 }
6569 DataType::Utf8 => {
6570 Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
6571 }
6572 DataType::Binary => {
6573 Arc::new(BinaryArray::from(Vec::<&[u8]>::new())) as ArrayRef
6574 }
6575 DataType::Union(uf, _) => {
6576 let children: Vec<ArrayRef> = uf
6577 .iter()
6578 .map(|(_, f)| empty_child_for(f.data_type()))
6579 .collect();
6580 Arc::new(
6581 UnionArray::try_new(
6582 uf.clone(),
6583 ScalarBuffer::<i8>::from(Vec::<i8>::new()),
6584 Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
6585 children,
6586 )
6587 .unwrap(),
6588 ) as ArrayRef
6589 }
6590 other => panic!("unsupported map value type: {other:?}"),
6591 };
6592 let entries = StructArray::new(
6593 Fields::from(vec![
6594 key_field.as_ref().clone(),
6595 val_field.as_ref().clone(),
6596 ]),
6597 vec![Arc::new(keys) as ArrayRef, vals],
6598 None,
6599 );
6600 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
6601 Arc::new(MapArray::new(
6602 entry_field.clone(),
6603 offsets,
6604 entries,
6605 None,
6606 *is_sorted,
6607 ))
6608 }
6609 other => panic!("empty_child_for: unhandled type {other:?}"),
6610 }
6611 }
6612 let children: Vec<ArrayRef> = fields
6613 .iter()
6614 .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
6615 .collect();
6616 Arc::new(
6617 UnionArray::try_new(
6618 fields.clone(),
6619 ScalarBuffer::<i8>::from(type_ids),
6620 Some(ScalarBuffer::<i32>::from(offsets)),
6621 children,
6622 )
6623 .unwrap(),
6624 ) as ArrayRef
6625 }
6626
6627 #[inline]
6628 fn uuid16_from_str(s: &str) -> [u8; 16] {
6629 let mut out = [0u8; 16];
6630 let mut idx = 0usize;
6631 let mut hi: Option<u8> = None;
6632 for ch in s.chars() {
6633 if ch == '-' {
6634 continue;
6635 }
6636 let v = ch.to_digit(16).expect("invalid hex digit in UUID") as u8;
6637 if let Some(h) = hi {
6638 out[idx] = (h << 4) | v;
6639 idx += 1;
6640 hi = None;
6641 } else {
6642 hi = Some(v);
6643 }
6644 }
6645 assert_eq!(idx, 16, "UUID must decode to 16 bytes");
6646 out
6647 }
6648 let date_a: i32 = 19_000; let time_ms_a: i32 = 12 * 3_600_000 + 34 * 60_000 + 56_000 + 789;
6650 let time_us_eod: i64 = 86_400_000_000 - 1;
6651 let ts_ms_2024_01_01: i64 = 1_704_067_200_000; let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1_000;
6653 let dur_small = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
6654 let dur_zero = IntervalMonthDayNanoType::make_value(0, 0, 0);
6655 let dur_large =
6656 IntervalMonthDayNanoType::make_value(12, 31, ((86_400_000 - 1) as i64) * 1_000_000);
6657 let dur_2years = IntervalMonthDayNanoType::make_value(24, 0, 0);
6658 let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
6659 let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
6660
6661 #[inline]
6662 fn push_like(
6663 reader_schema: &arrow_schema::Schema,
6664 name: &str,
6665 arr: ArrayRef,
6666 fields: &mut Vec<FieldRef>,
6667 cols: &mut Vec<ArrayRef>,
6668 ) {
6669 let src = reader_schema
6670 .field_with_name(name)
6671 .unwrap_or_else(|_| panic!("source schema missing field '{name}'"));
6672 let mut f = Field::new(name, arr.data_type().clone(), src.is_nullable());
6673 let md = src.metadata();
6674 if !md.is_empty() {
6675 f = f.with_metadata(md.clone());
6676 }
6677 fields.push(Arc::new(f));
6678 cols.push(arr);
6679 }
6680
6681 let mut fields: Vec<FieldRef> = Vec::new();
6682 let mut columns: Vec<ArrayRef> = Vec::new();
6683 push_like(
6684 schema.as_ref(),
6685 "id",
6686 Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef,
6687 &mut fields,
6688 &mut columns,
6689 );
6690 push_like(
6691 schema.as_ref(),
6692 "flag",
6693 Arc::new(BooleanArray::from(vec![true, false, true, false])) as ArrayRef,
6694 &mut fields,
6695 &mut columns,
6696 );
6697 push_like(
6698 schema.as_ref(),
6699 "ratio_f32",
6700 Arc::new(Float32Array::from(vec![1.25f32, -0.0, 3.5, 9.75])) as ArrayRef,
6701 &mut fields,
6702 &mut columns,
6703 );
6704 push_like(
6705 schema.as_ref(),
6706 "ratio_f64",
6707 Arc::new(Float64Array::from(vec![2.5f64, -1.0, 7.0, -2.25])) as ArrayRef,
6708 &mut fields,
6709 &mut columns,
6710 );
6711 push_like(
6712 schema.as_ref(),
6713 "count_i32",
6714 Arc::new(Int32Array::from(vec![7, -1, 0, 123])) as ArrayRef,
6715 &mut fields,
6716 &mut columns,
6717 );
6718 push_like(
6719 schema.as_ref(),
6720 "count_i64",
6721 Arc::new(Int64Array::from(vec![
6722 7_000_000_000i64,
6723 -2,
6724 0,
6725 -9_876_543_210i64,
6726 ])) as ArrayRef,
6727 &mut fields,
6728 &mut columns,
6729 );
6730 push_like(
6731 schema.as_ref(),
6732 "opt_i32_nullfirst",
6733 Arc::new(Int32Array::from(vec![None, Some(42), None, Some(0)])) as ArrayRef,
6734 &mut fields,
6735 &mut columns,
6736 );
6737 push_like(
6738 schema.as_ref(),
6739 "opt_str_nullsecond",
6740 Arc::new(StringArray::from(vec![
6741 Some("alpha"),
6742 None,
6743 Some("s3"),
6744 Some(""),
6745 ])) as ArrayRef,
6746 &mut fields,
6747 &mut columns,
6748 );
6749 {
6750 let uf = match schema
6751 .field_with_name("tri_union_prim")
6752 .unwrap()
6753 .data_type()
6754 {
6755 DataType::Union(f, UnionMode::Dense) => f.clone(),
6756 other => panic!("tri_union_prim should be dense union, got {other:?}"),
6757 };
6758 let tid_i = tid_by_name(&uf, "int");
6759 let tid_s = tid_by_name(&uf, "string");
6760 let tid_b = tid_by_name(&uf, "boolean");
6761 let tids = vec![tid_i, tid_s, tid_b, tid_s];
6762 let offs = vec![0, 0, 0, 1];
6763 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
6764 DataType::Int32 => Some(Arc::new(Int32Array::from(vec![0])) as ArrayRef),
6765 DataType::Utf8 => Some(Arc::new(StringArray::from(vec!["hi", ""])) as ArrayRef),
6766 DataType::Boolean => Some(Arc::new(BooleanArray::from(vec![true])) as ArrayRef),
6767 _ => None,
6768 });
6769 push_like(
6770 schema.as_ref(),
6771 "tri_union_prim",
6772 arr,
6773 &mut fields,
6774 &mut columns,
6775 );
6776 }
6777
6778 push_like(
6779 schema.as_ref(),
6780 "str_utf8",
6781 Arc::new(StringArray::from(vec!["hello", "", "world", "✓ unicode"])) as ArrayRef,
6782 &mut fields,
6783 &mut columns,
6784 );
6785 push_like(
6786 schema.as_ref(),
6787 "raw_bytes",
6788 Arc::new(BinaryArray::from(vec![
6789 b"\x00\x01".as_ref(),
6790 b"".as_ref(),
6791 b"\xFF\x00".as_ref(),
6792 b"\x10\x20\x30\x40".as_ref(),
6793 ])) as ArrayRef,
6794 &mut fields,
6795 &mut columns,
6796 );
6797 {
6798 let it = [
6799 Some(*b"0123456789ABCDEF"),
6800 Some([0u8; 16]),
6801 Some(*b"ABCDEFGHIJKLMNOP"),
6802 Some([0xAA; 16]),
6803 ]
6804 .into_iter();
6805 let arr =
6806 Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap())
6807 as ArrayRef;
6808 push_like(
6809 schema.as_ref(),
6810 "fx16_plain",
6811 arr,
6812 &mut fields,
6813 &mut columns,
6814 );
6815 }
6816 {
6817 #[cfg(feature = "small_decimals")]
6818 let dec10_2 = Arc::new(
6819 Decimal64Array::from_iter_values([123456i64, -1, 0, 9_999_999_999i64])
6820 .with_precision_and_scale(10, 2)
6821 .unwrap(),
6822 ) as ArrayRef;
6823 #[cfg(not(feature = "small_decimals"))]
6824 let dec10_2 = Arc::new(
6825 Decimal128Array::from_iter_values([123456i128, -1, 0, 9_999_999_999i128])
6826 .with_precision_and_scale(10, 2)
6827 .unwrap(),
6828 ) as ArrayRef;
6829 push_like(
6830 schema.as_ref(),
6831 "dec_bytes_s10_2",
6832 dec10_2,
6833 &mut fields,
6834 &mut columns,
6835 );
6836 }
6837 {
6838 #[cfg(feature = "small_decimals")]
6839 let dec20_4 = Arc::new(
6840 Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
6841 .with_precision_and_scale(20, 4)
6842 .unwrap(),
6843 ) as ArrayRef;
6844 #[cfg(not(feature = "small_decimals"))]
6845 let dec20_4 = Arc::new(
6846 Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
6847 .with_precision_and_scale(20, 4)
6848 .unwrap(),
6849 ) as ArrayRef;
6850 push_like(
6851 schema.as_ref(),
6852 "dec_fix_s20_4",
6853 dec20_4,
6854 &mut fields,
6855 &mut columns,
6856 );
6857 }
6858 {
6859 let it = [Some(uuid1), Some(uuid2), Some(uuid1), Some(uuid2)].into_iter();
6860 let arr =
6861 Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap())
6862 as ArrayRef;
6863 push_like(schema.as_ref(), "uuid_str", arr, &mut fields, &mut columns);
6864 }
6865 push_like(
6866 schema.as_ref(),
6867 "d_date",
6868 Arc::new(Date32Array::from(vec![date_a, 0, 1, 365])) as ArrayRef,
6869 &mut fields,
6870 &mut columns,
6871 );
6872 push_like(
6873 schema.as_ref(),
6874 "t_millis",
6875 Arc::new(Time32MillisecondArray::from(vec![
6876 time_ms_a,
6877 0,
6878 1,
6879 86_400_000 - 1,
6880 ])) as ArrayRef,
6881 &mut fields,
6882 &mut columns,
6883 );
6884 push_like(
6885 schema.as_ref(),
6886 "t_micros",
6887 Arc::new(Time64MicrosecondArray::from(vec![
6888 time_us_eod,
6889 0,
6890 1,
6891 1_000_000,
6892 ])) as ArrayRef,
6893 &mut fields,
6894 &mut columns,
6895 );
6896 {
6897 let a = TimestampMillisecondArray::from(vec![
6898 ts_ms_2024_01_01,
6899 -1,
6900 ts_ms_2024_01_01 + 123,
6901 0,
6902 ])
6903 .with_timezone("+00:00");
6904 push_like(
6905 schema.as_ref(),
6906 "ts_millis_utc",
6907 Arc::new(a) as ArrayRef,
6908 &mut fields,
6909 &mut columns,
6910 );
6911 }
6912 {
6913 let a = TimestampMicrosecondArray::from(vec![
6914 ts_us_2024_01_01,
6915 1,
6916 ts_us_2024_01_01 + 456,
6917 0,
6918 ])
6919 .with_timezone("+00:00");
6920 push_like(
6921 schema.as_ref(),
6922 "ts_micros_utc",
6923 Arc::new(a) as ArrayRef,
6924 &mut fields,
6925 &mut columns,
6926 );
6927 }
6928 push_like(
6929 schema.as_ref(),
6930 "ts_millis_local",
6931 Arc::new(TimestampMillisecondArray::from(vec![
6932 ts_ms_2024_01_01 + 86_400_000,
6933 0,
6934 ts_ms_2024_01_01 + 789,
6935 123_456_789,
6936 ])) as ArrayRef,
6937 &mut fields,
6938 &mut columns,
6939 );
6940 push_like(
6941 schema.as_ref(),
6942 "ts_micros_local",
6943 Arc::new(TimestampMicrosecondArray::from(vec![
6944 ts_us_2024_01_01 + 123_456,
6945 0,
6946 ts_us_2024_01_01 + 101_112,
6947 987_654_321,
6948 ])) as ArrayRef,
6949 &mut fields,
6950 &mut columns,
6951 );
6952 {
6953 let v = vec![dur_small, dur_zero, dur_large, dur_2years];
6954 push_like(
6955 schema.as_ref(),
6956 "interval_mdn",
6957 Arc::new(IntervalMonthDayNanoArray::from(v)) as ArrayRef,
6958 &mut fields,
6959 &mut columns,
6960 );
6961 }
6962 {
6963 let keys = Int32Array::from(vec![1, 2, 3, 0]); let values = Arc::new(StringArray::from(vec![
6965 "UNKNOWN",
6966 "NEW",
6967 "PROCESSING",
6968 "DONE",
6969 ])) as ArrayRef;
6970 let dict = DictionaryArray::<Int32Type>::try_new(keys, values).unwrap();
6971 push_like(
6972 schema.as_ref(),
6973 "status",
6974 Arc::new(dict) as ArrayRef,
6975 &mut fields,
6976 &mut columns,
6977 );
6978 }
6979 {
6980 let list_field = match schema.field_with_name("arr_union").unwrap().data_type() {
6981 DataType::List(f) => f.clone(),
6982 other => panic!("arr_union should be List, got {other:?}"),
6983 };
6984 let uf = match list_field.data_type() {
6985 DataType::Union(f, UnionMode::Dense) => f.clone(),
6986 other => panic!("arr_union item should be union, got {other:?}"),
6987 };
6988 let tid_l = tid_by_name(&uf, "long");
6989 let tid_s = tid_by_name(&uf, "string");
6990 let tid_n = tid_by_name(&uf, "null");
6991 let type_ids = vec![
6992 tid_l, tid_s, tid_n, tid_l, tid_n, tid_s, tid_l, tid_l, tid_s, tid_n, tid_l,
6993 ];
6994 let offsets = vec![0, 0, 0, 1, 1, 1, 2, 3, 2, 2, 4];
6995 let values = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
6996 DataType::Int64 => {
6997 Some(Arc::new(Int64Array::from(vec![1i64, -3, 0, -1, 0])) as ArrayRef)
6998 }
6999 DataType::Utf8 => {
7000 Some(Arc::new(StringArray::from(vec!["x", "z", "end"])) as ArrayRef)
7001 }
7002 DataType::Null => Some(Arc::new(NullArray::new(3)) as ArrayRef),
7003 _ => None,
7004 });
7005 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 4, 7, 8, 11]));
7006 let arr = Arc::new(ListArray::try_new(list_field, list_offsets, values, None).unwrap())
7007 as ArrayRef;
7008 push_like(schema.as_ref(), "arr_union", arr, &mut fields, &mut columns);
7009 }
7010 {
7011 let (entry_field, entries_fields, uf, is_sorted) =
7012 match schema.field_with_name("map_union").unwrap().data_type() {
7013 DataType::Map(entry_field, is_sorted) => {
7014 let fs = match entry_field.data_type() {
7015 DataType::Struct(fs) => fs.clone(),
7016 other => panic!("map entries must be struct, got {other:?}"),
7017 };
7018 let val_f = fs[1].clone();
7019 let uf = match val_f.data_type() {
7020 DataType::Union(f, UnionMode::Dense) => f.clone(),
7021 other => panic!("map value must be union, got {other:?}"),
7022 };
7023 (entry_field.clone(), fs, uf, *is_sorted)
7024 }
7025 other => panic!("map_union should be Map, got {other:?}"),
7026 };
7027 let keys = StringArray::from(vec!["a", "b", "c", "neg", "pi", "ok"]);
7028 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4, 4, 6]));
7029 let tid_null = tid_by_name(&uf, "null");
7030 let tid_d = tid_by_name(&uf, "double");
7031 let tid_s = tid_by_name(&uf, "string");
7032 let type_ids = vec![tid_d, tid_null, tid_s, tid_d, tid_d, tid_s];
7033 let offsets = vec![0, 0, 0, 1, 2, 1];
7034 let pi_5dp = (std::f64::consts::PI * 100_000.0).trunc() / 100_000.0;
7035 let vals = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
7036 DataType::Float64 => {
7037 Some(Arc::new(Float64Array::from(vec![1.5f64, -0.5, pi_5dp])) as ArrayRef)
7038 }
7039 DataType::Utf8 => {
7040 Some(Arc::new(StringArray::from(vec!["yes", "true"])) as ArrayRef)
7041 }
7042 DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
7043 _ => None,
7044 });
7045 let entries = StructArray::new(
7046 entries_fields.clone(),
7047 vec![Arc::new(keys) as ArrayRef, vals],
7048 None,
7049 );
7050 let map =
7051 Arc::new(MapArray::new(entry_field, moff, entries, None, is_sorted)) as ArrayRef;
7052 push_like(schema.as_ref(), "map_union", map, &mut fields, &mut columns);
7053 }
7054 {
7055 let fs = match schema.field_with_name("address").unwrap().data_type() {
7056 DataType::Struct(fs) => fs.clone(),
7057 other => panic!("address should be Struct, got {other:?}"),
7058 };
7059 let street = Arc::new(StringArray::from(vec![
7060 "100 Main",
7061 "",
7062 "42 Galaxy Way",
7063 "End Ave",
7064 ])) as ArrayRef;
7065 let zip = Arc::new(Int32Array::from(vec![12345, 0, 42424, 1])) as ArrayRef;
7066 let country = Arc::new(StringArray::from(vec!["US", "CA", "US", "GB"])) as ArrayRef;
7067 let arr = Arc::new(StructArray::new(fs, vec![street, zip, country], None)) as ArrayRef;
7068 push_like(schema.as_ref(), "address", arr, &mut fields, &mut columns);
7069 }
7070 {
7071 let fs = match schema.field_with_name("maybe_auth").unwrap().data_type() {
7072 DataType::Struct(fs) => fs.clone(),
7073 other => panic!("maybe_auth should be Struct, got {other:?}"),
7074 };
7075 let user =
7076 Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])) as ArrayRef;
7077 let token_values: Vec<Option<&[u8]>> = vec![
7078 None, Some(b"\x01\x02\x03".as_ref()), None, Some(b"".as_ref()), ];
7083 let token = Arc::new(BinaryArray::from(token_values)) as ArrayRef;
7084 let arr = Arc::new(StructArray::new(fs, vec![user, token], None)) as ArrayRef;
7085 push_like(
7086 schema.as_ref(),
7087 "maybe_auth",
7088 arr,
7089 &mut fields,
7090 &mut columns,
7091 );
7092 }
7093 {
7094 let uf = match schema
7095 .field_with_name("union_enum_record_array_map")
7096 .unwrap()
7097 .data_type()
7098 {
7099 DataType::Union(f, UnionMode::Dense) => f.clone(),
7100 other => panic!("union_enum_record_array_map should be union, got {other:?}"),
7101 };
7102 let mut tid_enum: Option<i8> = None;
7103 let mut tid_rec_a: Option<i8> = None;
7104 let mut tid_array: Option<i8> = None;
7105 let mut tid_map: Option<i8> = None;
7106 let mut map_entry_field: Option<FieldRef> = None;
7107 let mut map_sorted: bool = false;
7108 for (tid, f) in uf.iter() {
7109 match f.data_type() {
7110 DataType::Dictionary(_, _) => tid_enum = Some(tid),
7111 DataType::Struct(childs)
7112 if childs.len() == 2
7113 && childs[0].name() == "a"
7114 && childs[1].name() == "b" =>
7115 {
7116 tid_rec_a = Some(tid)
7117 }
7118 DataType::List(item) if matches!(item.data_type(), DataType::Int64) => {
7119 tid_array = Some(tid)
7120 }
7121 DataType::Map(ef, is_sorted) => {
7122 tid_map = Some(tid);
7123 map_entry_field = Some(ef.clone());
7124 map_sorted = *is_sorted;
7125 }
7126 _ => {}
7127 }
7128 }
7129 let (tid_enum, tid_rec_a, tid_array, tid_map) = (
7130 tid_enum.unwrap(),
7131 tid_rec_a.unwrap(),
7132 tid_array.unwrap(),
7133 tid_map.unwrap(),
7134 );
7135 let tids = vec![tid_enum, tid_rec_a, tid_array, tid_map];
7136 let offs = vec![0, 0, 0, 0];
7137 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7138 DataType::Dictionary(_, _) => {
7139 let keys = Int32Array::from(vec![0i32]);
7140 let values =
7141 Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
7142 Some(
7143 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
7144 as ArrayRef,
7145 )
7146 }
7147 DataType::Struct(fs)
7148 if fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b" =>
7149 {
7150 let a = Int32Array::from(vec![7]);
7151 let b = StringArray::from(vec!["rec"]);
7152 Some(Arc::new(StructArray::new(
7153 fs.clone(),
7154 vec![Arc::new(a), Arc::new(b)],
7155 None,
7156 )) as ArrayRef)
7157 }
7158 DataType::List(field) => {
7159 let values = Int64Array::from(vec![1i64, 2, 3]);
7160 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
7161 Some(Arc::new(
7162 ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
7163 ) as ArrayRef)
7164 }
7165 DataType::Map(_, _) => {
7166 let entry_field = map_entry_field.clone().unwrap();
7167 let (key_field, val_field) = match entry_field.data_type() {
7168 DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
7169 _ => unreachable!(),
7170 };
7171 let keys = StringArray::from(vec!["k"]);
7172 let vals = StringArray::from(vec!["v"]);
7173 let entries = StructArray::new(
7174 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
7175 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
7176 None,
7177 );
7178 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1]));
7179 Some(Arc::new(MapArray::new(
7180 entry_field.clone(),
7181 offsets,
7182 entries,
7183 None,
7184 map_sorted,
7185 )) as ArrayRef)
7186 }
7187 _ => None,
7188 });
7189 push_like(
7190 schema.as_ref(),
7191 "union_enum_record_array_map",
7192 arr,
7193 &mut fields,
7194 &mut columns,
7195 );
7196 }
7197 {
7198 let uf = match schema
7199 .field_with_name("union_date_or_fixed4")
7200 .unwrap()
7201 .data_type()
7202 {
7203 DataType::Union(f, UnionMode::Dense) => f.clone(),
7204 other => panic!("union_date_or_fixed4 should be union, got {other:?}"),
7205 };
7206 let tid_date = tid_by_dt(&uf, |dt| matches!(dt, DataType::Date32));
7207 let tid_fx4 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(4)));
7208 let tids = vec![tid_date, tid_fx4, tid_date, tid_fx4];
7209 let offs = vec![0, 0, 1, 1];
7210 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7211 DataType::Date32 => Some(Arc::new(Date32Array::from(vec![date_a, 0])) as ArrayRef),
7212 DataType::FixedSizeBinary(4) => {
7213 let it = [Some(*b"\x00\x11\x22\x33"), Some(*b"ABCD")].into_iter();
7214 Some(Arc::new(
7215 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
7216 ) as ArrayRef)
7217 }
7218 _ => None,
7219 });
7220 push_like(
7221 schema.as_ref(),
7222 "union_date_or_fixed4",
7223 arr,
7224 &mut fields,
7225 &mut columns,
7226 );
7227 }
7228 {
7229 let uf = match schema
7230 .field_with_name("union_interval_or_string")
7231 .unwrap()
7232 .data_type()
7233 {
7234 DataType::Union(f, UnionMode::Dense) => f.clone(),
7235 other => panic!("union_interval_or_string should be union, got {other:?}"),
7236 };
7237 let tid_dur = tid_by_dt(&uf, |dt| {
7238 matches!(dt, DataType::Interval(IntervalUnit::MonthDayNano))
7239 });
7240 let tid_str = tid_by_dt(&uf, |dt| matches!(dt, DataType::Utf8));
7241 let tids = vec![tid_dur, tid_str, tid_dur, tid_str];
7242 let offs = vec![0, 0, 1, 1];
7243 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7244 DataType::Interval(IntervalUnit::MonthDayNano) => Some(Arc::new(
7245 IntervalMonthDayNanoArray::from(vec![dur_small, dur_large]),
7246 )
7247 as ArrayRef),
7248 DataType::Utf8 => Some(Arc::new(StringArray::from(vec![
7249 "duration-as-text",
7250 "iso-8601-period-P1Y",
7251 ])) as ArrayRef),
7252 _ => None,
7253 });
7254 push_like(
7255 schema.as_ref(),
7256 "union_interval_or_string",
7257 arr,
7258 &mut fields,
7259 &mut columns,
7260 );
7261 }
7262 {
7263 let uf = match schema
7264 .field_with_name("union_uuid_or_fixed10")
7265 .unwrap()
7266 .data_type()
7267 {
7268 DataType::Union(f, UnionMode::Dense) => f.clone(),
7269 other => panic!("union_uuid_or_fixed10 should be union, got {other:?}"),
7270 };
7271 let tid_uuid = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(16)));
7272 let tid_fx10 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(10)));
7273 let tids = vec![tid_uuid, tid_fx10, tid_uuid, tid_fx10];
7274 let offs = vec![0, 0, 1, 1];
7275 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7276 DataType::FixedSizeBinary(16) => {
7277 let it = [Some(uuid1), Some(uuid2)].into_iter();
7278 Some(Arc::new(
7279 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
7280 ) as ArrayRef)
7281 }
7282 DataType::FixedSizeBinary(10) => {
7283 let fx10_a = [0xAAu8; 10];
7284 let fx10_b = [0x00u8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99];
7285 let it = [Some(fx10_a), Some(fx10_b)].into_iter();
7286 Some(Arc::new(
7287 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
7288 ) as ArrayRef)
7289 }
7290 _ => None,
7291 });
7292 push_like(
7293 schema.as_ref(),
7294 "union_uuid_or_fixed10",
7295 arr,
7296 &mut fields,
7297 &mut columns,
7298 );
7299 }
7300 {
7301 let list_field = match schema
7302 .field_with_name("array_records_with_union")
7303 .unwrap()
7304 .data_type()
7305 {
7306 DataType::List(f) => f.clone(),
7307 other => panic!("array_records_with_union should be List, got {other:?}"),
7308 };
7309 let kv_fields = match list_field.data_type() {
7310 DataType::Struct(fs) => fs.clone(),
7311 other => panic!("array_records_with_union items must be Struct, got {other:?}"),
7312 };
7313 let val_field = kv_fields
7314 .iter()
7315 .find(|f| f.name() == "val")
7316 .unwrap()
7317 .clone();
7318 let uf = match val_field.data_type() {
7319 DataType::Union(f, UnionMode::Dense) => f.clone(),
7320 other => panic!("KV.val should be union, got {other:?}"),
7321 };
7322 let keys = Arc::new(StringArray::from(vec!["k1", "k2", "k", "k3", "x"])) as ArrayRef;
7323 let tid_null = tid_by_name(&uf, "null");
7324 let tid_i = tid_by_name(&uf, "int");
7325 let tid_l = tid_by_name(&uf, "long");
7326 let type_ids = vec![tid_i, tid_null, tid_l, tid_null, tid_i];
7327 let offsets = vec![0, 0, 0, 1, 1];
7328 let vals = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
7329 DataType::Int32 => Some(Arc::new(Int32Array::from(vec![5, -5])) as ArrayRef),
7330 DataType::Int64 => Some(Arc::new(Int64Array::from(vec![99i64])) as ArrayRef),
7331 DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
7332 _ => None,
7333 });
7334 let values_struct =
7335 Arc::new(StructArray::new(kv_fields.clone(), vec![keys, vals], None)) as ArrayRef;
7336 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5]));
7337 let arr = Arc::new(
7338 ListArray::try_new(list_field, list_offsets, values_struct, None).unwrap(),
7339 ) as ArrayRef;
7340 push_like(
7341 schema.as_ref(),
7342 "array_records_with_union",
7343 arr,
7344 &mut fields,
7345 &mut columns,
7346 );
7347 }
7348 {
7349 let uf = match schema
7350 .field_with_name("union_map_or_array_int")
7351 .unwrap()
7352 .data_type()
7353 {
7354 DataType::Union(f, UnionMode::Dense) => f.clone(),
7355 other => panic!("union_map_or_array_int should be union, got {other:?}"),
7356 };
7357 let tid_map = tid_by_dt(&uf, |dt| matches!(dt, DataType::Map(_, _)));
7358 let tid_list = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
7359 let map_child: ArrayRef = {
7360 let (entry_field, is_sorted) = match uf
7361 .iter()
7362 .find(|(tid, _)| *tid == tid_map)
7363 .unwrap()
7364 .1
7365 .data_type()
7366 {
7367 DataType::Map(ef, is_sorted) => (ef.clone(), *is_sorted),
7368 _ => unreachable!(),
7369 };
7370 let (key_field, val_field) = match entry_field.data_type() {
7371 DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
7372 _ => unreachable!(),
7373 };
7374 let keys = StringArray::from(vec!["x", "y", "only"]);
7375 let vals = Int32Array::from(vec![1, 2, 10]);
7376 let entries = StructArray::new(
7377 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
7378 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
7379 None,
7380 );
7381 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
7382 Arc::new(MapArray::new(entry_field, moff, entries, None, is_sorted)) as ArrayRef
7383 };
7384 let list_child: ArrayRef = {
7385 let list_field = match uf
7386 .iter()
7387 .find(|(tid, _)| *tid == tid_list)
7388 .unwrap()
7389 .1
7390 .data_type()
7391 {
7392 DataType::List(f) => f.clone(),
7393 _ => unreachable!(),
7394 };
7395 let values = Int32Array::from(vec![1, 2, 3, 0]);
7396 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4]));
7397 Arc::new(ListArray::try_new(list_field, offsets, Arc::new(values), None).unwrap())
7398 as ArrayRef
7399 };
7400 let tids = vec![tid_map, tid_list, tid_map, tid_list];
7401 let offs = vec![0, 0, 1, 1];
7402 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7403 DataType::Map(_, _) => Some(map_child.clone()),
7404 DataType::List(_) => Some(list_child.clone()),
7405 _ => None,
7406 });
7407 push_like(
7408 schema.as_ref(),
7409 "union_map_or_array_int",
7410 arr,
7411 &mut fields,
7412 &mut columns,
7413 );
7414 }
7415 push_like(
7416 schema.as_ref(),
7417 "renamed_with_default",
7418 Arc::new(Int32Array::from(vec![100, 42, 7, 42])) as ArrayRef,
7419 &mut fields,
7420 &mut columns,
7421 );
7422 {
7423 let fs = match schema.field_with_name("person").unwrap().data_type() {
7424 DataType::Struct(fs) => fs.clone(),
7425 other => panic!("person should be Struct, got {other:?}"),
7426 };
7427 let name =
7428 Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "Dave"])) as ArrayRef;
7429 let age = Arc::new(Int32Array::from(vec![30, 0, 25, 41])) as ArrayRef;
7430 let arr = Arc::new(StructArray::new(fs, vec![name, age], None)) as ArrayRef;
7431 push_like(schema.as_ref(), "person", arr, &mut fields, &mut columns);
7432 }
7433 let expected =
7434 RecordBatch::try_new(Arc::new(Schema::new(Fields::from(fields))), columns).unwrap();
7435 assert_eq!(
7436 expected, batch,
7437 "entire RecordBatch mismatch (schema, all columns, all rows)"
7438 );
7439 }
7440 #[test]
7441 fn comprehensive_e2e_resolution_test() {
7442 use serde_json::Value;
7443 use std::collections::HashMap;
7444
7445 fn make_comprehensive_reader_schema(path: &str) -> AvroSchema {
7458 fn set_type_string(f: &mut Value, new_ty: &str) {
7459 if let Some(ty) = f.get_mut("type") {
7460 match ty {
7461 Value::String(_) | Value::Object(_) => {
7462 *ty = Value::String(new_ty.to_string());
7463 }
7464 Value::Array(arr) => {
7465 for b in arr.iter_mut() {
7466 match b {
7467 Value::String(s) if s != "null" => {
7468 *b = Value::String(new_ty.to_string());
7469 break;
7470 }
7471 Value::Object(_) => {
7472 *b = Value::String(new_ty.to_string());
7473 break;
7474 }
7475 _ => {}
7476 }
7477 }
7478 }
7479 _ => {}
7480 }
7481 }
7482 }
7483 fn reverse_union_array(f: &mut Value) {
7484 if let Some(arr) = f.get_mut("type").and_then(|t| t.as_array_mut()) {
7485 arr.reverse();
7486 }
7487 }
7488 fn reverse_items_union(f: &mut Value) {
7489 if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7490 if let Some(items) = obj.get_mut("items").and_then(|v| v.as_array_mut()) {
7491 items.reverse();
7492 }
7493 }
7494 }
7495 fn reverse_map_values_union(f: &mut Value) {
7496 if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7497 if let Some(values) = obj.get_mut("values").and_then(|v| v.as_array_mut()) {
7498 values.reverse();
7499 }
7500 }
7501 }
7502 fn reverse_nested_union_in_record(f: &mut Value, field_name: &str) {
7503 if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7504 if let Some(fields) = obj.get_mut("fields").and_then(|v| v.as_array_mut()) {
7505 for ff in fields.iter_mut() {
7506 if ff.get("name").and_then(|n| n.as_str()) == Some(field_name) {
7507 if let Some(ty) = ff.get_mut("type") {
7508 if let Some(arr) = ty.as_array_mut() {
7509 arr.reverse();
7510 }
7511 }
7512 }
7513 }
7514 }
7515 }
7516 }
7517 fn rename_nested_field_with_alias(f: &mut Value, old: &str, new: &str) {
7518 if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7519 if let Some(fields) = obj.get_mut("fields").and_then(|v| v.as_array_mut()) {
7520 for ff in fields.iter_mut() {
7521 if ff.get("name").and_then(|n| n.as_str()) == Some(old) {
7522 ff["name"] = Value::String(new.to_string());
7523 ff["aliases"] = Value::Array(vec![Value::String(old.to_string())]);
7524 }
7525 }
7526 }
7527 }
7528 }
7529 let mut root = load_writer_schema_json(path);
7530 assert_eq!(root["type"], "record", "writer schema must be a record");
7531 let fields = root
7532 .get_mut("fields")
7533 .and_then(|f| f.as_array_mut())
7534 .expect("record has fields");
7535 for f in fields.iter_mut() {
7536 let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
7537 continue;
7538 };
7539 match name {
7540 "id" => {
7542 f["name"] = Value::String("identifier".into());
7543 f["aliases"] = Value::Array(vec![Value::String("id".into())]);
7544 }
7545 "renamed_with_default" => {
7546 f["name"] = Value::String("old_count".into());
7547 f["aliases"] =
7548 Value::Array(vec![Value::String("renamed_with_default".into())]);
7549 }
7550 "count_i32" => set_type_string(f, "long"),
7552 "ratio_f32" => set_type_string(f, "double"),
7553 "opt_str_nullsecond" => reverse_union_array(f),
7555 "union_enum_record_array_map" => reverse_union_array(f),
7556 "union_date_or_fixed4" => reverse_union_array(f),
7557 "union_interval_or_string" => reverse_union_array(f),
7558 "union_uuid_or_fixed10" => reverse_union_array(f),
7559 "union_map_or_array_int" => reverse_union_array(f),
7560 "maybe_auth" => reverse_nested_union_in_record(f, "token"),
7561 "arr_union" => reverse_items_union(f),
7563 "map_union" => reverse_map_values_union(f),
7564 "address" => rename_nested_field_with_alias(f, "street", "street_name"),
7566 "person" => {
7568 if let Some(tobj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
7569 tobj.insert("name".to_string(), Value::String("Person".into()));
7570 tobj.insert(
7571 "namespace".to_string(),
7572 Value::String("com.example".into()),
7573 );
7574 tobj.insert(
7575 "aliases".into(),
7576 Value::Array(vec![
7577 Value::String("PersonV2".into()),
7578 Value::String("com.example.v2.PersonV2".into()),
7579 ]),
7580 );
7581 }
7582 }
7583 _ => {}
7584 }
7585 }
7586 fields.reverse();
7587 AvroSchema::new(root.to_string())
7588 }
7589
7590 let path = "test/data/comprehensive_e2e.avro";
7591 let reader_schema = make_comprehensive_reader_schema(path);
7592 let batch = read_alltypes_with_reader_schema(path, reader_schema.clone());
7593
7594 const UUID_EXT_KEY: &str = "ARROW:extension:name";
7595 const UUID_LOGICAL_KEY: &str = "logicalType";
7596
7597 let uuid_md_top: Option<HashMap<String, String>> = batch
7598 .schema()
7599 .field_with_name("uuid_str")
7600 .ok()
7601 .and_then(|f| {
7602 let md = f.metadata();
7603 let has_ext = md.get(UUID_EXT_KEY).is_some();
7604 let is_uuid_logical = md
7605 .get(UUID_LOGICAL_KEY)
7606 .map(|v| v.trim_matches('"') == "uuid")
7607 .unwrap_or(false);
7608 if has_ext || is_uuid_logical {
7609 Some(md.clone())
7610 } else {
7611 None
7612 }
7613 });
7614
7615 let uuid_md_union: Option<HashMap<String, String>> = batch
7616 .schema()
7617 .field_with_name("union_uuid_or_fixed10")
7618 .ok()
7619 .and_then(|f| match f.data_type() {
7620 DataType::Union(uf, _) => uf
7621 .iter()
7622 .find(|(_, child)| child.name() == "uuid")
7623 .and_then(|(_, child)| {
7624 let md = child.metadata();
7625 let has_ext = md.get(UUID_EXT_KEY).is_some();
7626 let is_uuid_logical = md
7627 .get(UUID_LOGICAL_KEY)
7628 .map(|v| v.trim_matches('"') == "uuid")
7629 .unwrap_or(false);
7630 if has_ext || is_uuid_logical {
7631 Some(md.clone())
7632 } else {
7633 None
7634 }
7635 }),
7636 _ => None,
7637 });
7638
7639 let add_uuid_ext_top = |f: Field| -> Field {
7640 if let Some(md) = &uuid_md_top {
7641 f.with_metadata(md.clone())
7642 } else {
7643 f
7644 }
7645 };
7646 let add_uuid_ext_union = |f: Field| -> Field {
7647 if let Some(md) = &uuid_md_union {
7648 f.with_metadata(md.clone())
7649 } else {
7650 f
7651 }
7652 };
7653
/// Decodes the textual form of a UUID into its 16 raw bytes.
///
/// Hyphens are skipped wherever they appear; every other character must be a hexadecimal
/// digit (either case). Digits are consumed in pairs, high nibble first.
///
/// # Panics
/// Panics if a character is neither `-` nor a hex digit, if the digit count is odd, or if
/// the digits do not decode to exactly 16 bytes.
#[inline]
fn uuid16_from_str(s: &str) -> [u8; 16] {
    let mut out = [0u8; 16];
    let mut idx = 0usize;
    // High nibble of the byte currently being assembled, if one is pending.
    let mut hi: Option<u8> = None;
    for ch in s.chars().filter(|&c| c != '-') {
        // `to_digit(16)` yields a value below 16, so the narrowing cast is lossless.
        let v = ch.to_digit(16).expect("invalid hex digit in UUID") as u8;
        match hi.take() {
            Some(h) => {
                // Fail with the intended message instead of an index-out-of-bounds panic.
                assert!(idx < out.len(), "UUID must decode to 16 bytes");
                out[idx] = (h << 4) | v;
                idx += 1;
            }
            None => hi = Some(v),
        }
    }
    // A dangling nibble would otherwise be dropped silently after 16 complete bytes.
    assert!(
        hi.is_none(),
        "UUID must contain an even number of hex digits"
    );
    assert_eq!(idx, 16, "UUID must decode to 16 bytes");
    out
}
7675
7676 fn mk_dense_union(
7677 fields: &UnionFields,
7678 type_ids: Vec<i8>,
7679 offsets: Vec<i32>,
7680 provide: impl Fn(&Field) -> Option<ArrayRef>,
7681 ) -> ArrayRef {
7682 fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
7683 match dt {
7684 DataType::Null => Arc::new(NullArray::new(0)),
7685 DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
7686 DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
7687 DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
7688 DataType::Float32 => Arc::new(Float32Array::from(Vec::<f32>::new())),
7689 DataType::Float64 => Arc::new(Float64Array::from(Vec::<f64>::new())),
7690 DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
7691 DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
7692 DataType::Date32 => Arc::new(Date32Array::from(Vec::<i32>::new())),
7693 DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
7694 Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
7695 }
7696 DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
7697 Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
7698 }
7699 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
7700 let a = TimestampMillisecondArray::from(Vec::<i64>::new());
7701 Arc::new(if let Some(tz) = tz {
7702 a.with_timezone(tz.clone())
7703 } else {
7704 a
7705 })
7706 }
7707 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
7708 let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
7709 Arc::new(if let Some(tz) = tz {
7710 a.with_timezone(tz.clone())
7711 } else {
7712 a
7713 })
7714 }
7715 DataType::Interval(IntervalUnit::MonthDayNano) => Arc::new(
7716 IntervalMonthDayNanoArray::from(Vec::<IntervalMonthDayNano>::new()),
7717 ),
7718 DataType::FixedSizeBinary(sz) => Arc::new(
7719 FixedSizeBinaryArray::try_from_sparse_iter_with_size(
7720 std::iter::empty::<Option<Vec<u8>>>(),
7721 *sz,
7722 )
7723 .unwrap(),
7724 ),
7725 DataType::Dictionary(_, _) => {
7726 let keys = Int32Array::from(Vec::<i32>::new());
7727 let values = Arc::new(StringArray::from(Vec::<&str>::new()));
7728 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
7729 }
7730 DataType::Struct(fields) => {
7731 let children: Vec<ArrayRef> = fields
7732 .iter()
7733 .map(|f| empty_child_for(f.data_type()) as ArrayRef)
7734 .collect();
7735 Arc::new(StructArray::new(fields.clone(), children, None))
7736 }
7737 DataType::List(field) => {
7738 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
7739 Arc::new(
7740 ListArray::try_new(
7741 field.clone(),
7742 offsets,
7743 empty_child_for(field.data_type()),
7744 None,
7745 )
7746 .unwrap(),
7747 )
7748 }
7749 DataType::Map(entry_field, is_sorted) => {
7750 let (key_field, val_field) = match entry_field.data_type() {
7751 DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
7752 other => panic!("unexpected map entries type: {other:?}"),
7753 };
7754 let keys = StringArray::from(Vec::<&str>::new());
7755 let vals: ArrayRef = match val_field.data_type() {
7756 DataType::Null => Arc::new(NullArray::new(0)) as ArrayRef,
7757 DataType::Boolean => {
7758 Arc::new(BooleanArray::from(Vec::<bool>::new())) as ArrayRef
7759 }
7760 DataType::Int32 => {
7761 Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
7762 }
7763 DataType::Int64 => {
7764 Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
7765 }
7766 DataType::Float32 => {
7767 Arc::new(Float32Array::from(Vec::<f32>::new())) as ArrayRef
7768 }
7769 DataType::Float64 => {
7770 Arc::new(Float64Array::from(Vec::<f64>::new())) as ArrayRef
7771 }
7772 DataType::Utf8 => {
7773 Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
7774 }
7775 DataType::Binary => {
7776 Arc::new(BinaryArray::from(Vec::<&[u8]>::new())) as ArrayRef
7777 }
7778 DataType::Union(uf, _) => {
7779 let children: Vec<ArrayRef> = uf
7780 .iter()
7781 .map(|(_, f)| empty_child_for(f.data_type()))
7782 .collect();
7783 Arc::new(
7784 UnionArray::try_new(
7785 uf.clone(),
7786 ScalarBuffer::<i8>::from(Vec::<i8>::new()),
7787 Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
7788 children,
7789 )
7790 .unwrap(),
7791 ) as ArrayRef
7792 }
7793 other => panic!("unsupported map value type: {other:?}"),
7794 };
7795 let entries = StructArray::new(
7796 Fields::from(vec![
7797 key_field.as_ref().clone(),
7798 val_field.as_ref().clone(),
7799 ]),
7800 vec![Arc::new(keys) as ArrayRef, vals],
7801 None,
7802 );
7803 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
7804 Arc::new(MapArray::new(
7805 entry_field.clone(),
7806 offsets,
7807 entries,
7808 None,
7809 *is_sorted,
7810 ))
7811 }
7812 other => panic!("empty_child_for: unhandled type {other:?}"),
7813 }
7814 }
7815 let children: Vec<ArrayRef> = fields
7816 .iter()
7817 .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
7818 .collect();
7819 Arc::new(
7820 UnionArray::try_new(
7821 fields.clone(),
7822 ScalarBuffer::<i8>::from(type_ids),
7823 Some(ScalarBuffer::<i32>::from(offsets)),
7824 children,
7825 )
7826 .unwrap(),
7827 ) as ArrayRef
7828 }
7829 let date_a: i32 = 19_000; let time_ms_a: i32 = 12 * 3_600_000 + 34 * 60_000 + 56_000 + 789;
7831 let time_us_eod: i64 = 86_400_000_000 - 1;
7832 let ts_ms_2024_01_01: i64 = 1_704_067_200_000; let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1_000;
7834 let dur_small = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
7835 let dur_zero = IntervalMonthDayNanoType::make_value(0, 0, 0);
7836 let dur_large =
7837 IntervalMonthDayNanoType::make_value(12, 31, ((86_400_000 - 1) as i64) * 1_000_000);
7838 let dur_2years = IntervalMonthDayNanoType::make_value(24, 0, 0);
7839 let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
7840 let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
7841 let item_name = Field::LIST_FIELD_DEFAULT_NAME;
7842 let uf_tri = UnionFields::new(
7843 vec![0, 1, 2],
7844 vec![
7845 Field::new("int", DataType::Int32, false),
7846 Field::new("string", DataType::Utf8, false),
7847 Field::new("boolean", DataType::Boolean, false),
7848 ],
7849 );
7850 let uf_arr_items = UnionFields::new(
7851 vec![0, 1, 2],
7852 vec![
7853 Field::new("null", DataType::Null, false),
7854 Field::new("string", DataType::Utf8, false),
7855 Field::new("long", DataType::Int64, false),
7856 ],
7857 );
7858 let arr_items_field = Arc::new(Field::new(
7859 item_name,
7860 DataType::Union(uf_arr_items.clone(), UnionMode::Dense),
7861 true,
7862 ));
7863 let uf_map_vals = UnionFields::new(
7864 vec![0, 1, 2],
7865 vec![
7866 Field::new("string", DataType::Utf8, false),
7867 Field::new("double", DataType::Float64, false),
7868 Field::new("null", DataType::Null, false),
7869 ],
7870 );
7871 let map_entries_field = Arc::new(Field::new(
7872 "entries",
7873 DataType::Struct(Fields::from(vec![
7874 Field::new("key", DataType::Utf8, false),
7875 Field::new(
7876 "value",
7877 DataType::Union(uf_map_vals.clone(), UnionMode::Dense),
7878 true,
7879 ),
7880 ])),
7881 false,
7882 ));
7883 let mut enum_md_color = {
7885 let mut m = HashMap::<String, String>::new();
7886 m.insert(
7887 crate::schema::AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
7888 serde_json::to_string(&vec!["RED", "GREEN", "BLUE"]).unwrap(),
7889 );
7890 m
7891 };
7892 enum_md_color.insert(AVRO_NAME_METADATA_KEY.to_string(), "Color".to_string());
7893 enum_md_color.insert(
7894 AVRO_NAMESPACE_METADATA_KEY.to_string(),
7895 "org.apache.arrow.avrotests.v1.types".to_string(),
7896 );
7897 let union_rec_a_fields = Fields::from(vec![
7898 Field::new("a", DataType::Int32, false),
7899 Field::new("b", DataType::Utf8, false),
7900 ]);
7901 let union_rec_b_fields = Fields::from(vec![
7902 Field::new("x", DataType::Int64, false),
7903 Field::new("y", DataType::Binary, false),
7904 ]);
7905 let union_map_entries = Arc::new(Field::new(
7906 "entries",
7907 DataType::Struct(Fields::from(vec![
7908 Field::new("key", DataType::Utf8, false),
7909 Field::new("value", DataType::Utf8, false),
7910 ])),
7911 false,
7912 ));
7913 let rec_a_md = {
7914 let mut m = HashMap::<String, String>::new();
7915 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "RecA".to_string());
7916 m.insert(
7917 AVRO_NAMESPACE_METADATA_KEY.to_string(),
7918 "org.apache.arrow.avrotests.v1.types".to_string(),
7919 );
7920 m
7921 };
7922 let rec_b_md = {
7923 let mut m = HashMap::<String, String>::new();
7924 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "RecB".to_string());
7925 m.insert(
7926 AVRO_NAMESPACE_METADATA_KEY.to_string(),
7927 "org.apache.arrow.avrotests.v1.types".to_string(),
7928 );
7929 m
7930 };
7931 let uf_union_big = UnionFields::new(
7932 vec![0, 1, 2, 3, 4],
7933 vec![
7934 Field::new(
7935 "map",
7936 DataType::Map(union_map_entries.clone(), false),
7937 false,
7938 ),
7939 Field::new(
7940 "array",
7941 DataType::List(Arc::new(Field::new(item_name, DataType::Int64, false))),
7942 false,
7943 ),
7944 Field::new(
7945 "org.apache.arrow.avrotests.v1.types.RecB",
7946 DataType::Struct(union_rec_b_fields.clone()),
7947 false,
7948 )
7949 .with_metadata(rec_b_md.clone()),
7950 Field::new(
7951 "org.apache.arrow.avrotests.v1.types.RecA",
7952 DataType::Struct(union_rec_a_fields.clone()),
7953 false,
7954 )
7955 .with_metadata(rec_a_md.clone()),
7956 Field::new(
7957 "org.apache.arrow.avrotests.v1.types.Color",
7958 DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
7959 false,
7960 )
7961 .with_metadata(enum_md_color.clone()),
7962 ],
7963 );
7964 let fx4_md = {
7965 let mut m = HashMap::<String, String>::new();
7966 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx4".to_string());
7967 m.insert(
7968 AVRO_NAMESPACE_METADATA_KEY.to_string(),
7969 "org.apache.arrow.avrotests.v1".to_string(),
7970 );
7971 m
7972 };
7973 let uf_date_fixed4 = UnionFields::new(
7974 vec![0, 1],
7975 vec![
7976 Field::new(
7977 "org.apache.arrow.avrotests.v1.Fx4",
7978 DataType::FixedSizeBinary(4),
7979 false,
7980 )
7981 .with_metadata(fx4_md.clone()),
7982 Field::new("date", DataType::Date32, false),
7983 ],
7984 );
7985 let dur12u_md = {
7986 let mut m = HashMap::<String, String>::new();
7987 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Dur12U".to_string());
7988 m.insert(
7989 AVRO_NAMESPACE_METADATA_KEY.to_string(),
7990 "org.apache.arrow.avrotests.v1".to_string(),
7991 );
7992 m
7993 };
7994 let uf_dur_or_str = UnionFields::new(
7995 vec![0, 1],
7996 vec![
7997 Field::new("string", DataType::Utf8, false),
7998 Field::new(
7999 "org.apache.arrow.avrotests.v1.Dur12U",
8000 DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano),
8001 false,
8002 )
8003 .with_metadata(dur12u_md.clone()),
8004 ],
8005 );
8006 let fx10_md = {
8007 let mut m = HashMap::<String, String>::new();
8008 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx10".to_string());
8009 m.insert(
8010 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8011 "org.apache.arrow.avrotests.v1".to_string(),
8012 );
8013 m
8014 };
8015 let uf_uuid_or_fx10 = UnionFields::new(
8016 vec![0, 1],
8017 vec![
8018 Field::new(
8019 "org.apache.arrow.avrotests.v1.Fx10",
8020 DataType::FixedSizeBinary(10),
8021 false,
8022 )
8023 .with_metadata(fx10_md.clone()),
8024 add_uuid_ext_union(Field::new("uuid", DataType::FixedSizeBinary(16), false)),
8025 ],
8026 );
8027 let uf_kv_val = UnionFields::new(
8028 vec![0, 1, 2],
8029 vec![
8030 Field::new("null", DataType::Null, false),
8031 Field::new("int", DataType::Int32, false),
8032 Field::new("long", DataType::Int64, false),
8033 ],
8034 );
8035 let kv_fields = Fields::from(vec![
8036 Field::new("key", DataType::Utf8, false),
8037 Field::new(
8038 "val",
8039 DataType::Union(uf_kv_val.clone(), UnionMode::Dense),
8040 true,
8041 ),
8042 ]);
8043 let kv_item_field = Arc::new(Field::new(
8044 item_name,
8045 DataType::Struct(kv_fields.clone()),
8046 false,
8047 ));
8048 let map_int_entries = Arc::new(Field::new(
8049 "entries",
8050 DataType::Struct(Fields::from(vec![
8051 Field::new("key", DataType::Utf8, false),
8052 Field::new("value", DataType::Int32, false),
8053 ])),
8054 false,
8055 ));
8056 let uf_map_or_array = UnionFields::new(
8057 vec![0, 1],
8058 vec![
8059 Field::new(
8060 "array",
8061 DataType::List(Arc::new(Field::new(item_name, DataType::Int32, false))),
8062 false,
8063 ),
8064 Field::new("map", DataType::Map(map_int_entries.clone(), false), false),
8065 ],
8066 );
8067 let mut enum_md_status = {
8068 let mut m = HashMap::<String, String>::new();
8069 m.insert(
8070 crate::schema::AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
8071 serde_json::to_string(&vec!["UNKNOWN", "NEW", "PROCESSING", "DONE"]).unwrap(),
8072 );
8073 m
8074 };
8075 enum_md_status.insert(AVRO_NAME_METADATA_KEY.to_string(), "Status".to_string());
8076 enum_md_status.insert(
8077 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8078 "org.apache.arrow.avrotests.v1.types".to_string(),
8079 );
8080 let mut dec20_md = HashMap::<String, String>::new();
8081 dec20_md.insert("precision".to_string(), "20".to_string());
8082 dec20_md.insert("scale".to_string(), "4".to_string());
8083 dec20_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "DecFix20".to_string());
8084 dec20_md.insert(
8085 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8086 "org.apache.arrow.avrotests.v1.types".to_string(),
8087 );
8088 let mut dec10_md = HashMap::<String, String>::new();
8089 dec10_md.insert("precision".to_string(), "10".to_string());
8090 dec10_md.insert("scale".to_string(), "2".to_string());
8091 let fx16_top_md = {
8092 let mut m = HashMap::<String, String>::new();
8093 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx16".to_string());
8094 m.insert(
8095 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8096 "org.apache.arrow.avrotests.v1.types".to_string(),
8097 );
8098 m
8099 };
8100 let dur12_top_md = {
8101 let mut m = HashMap::<String, String>::new();
8102 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Dur12".to_string());
8103 m.insert(
8104 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8105 "org.apache.arrow.avrotests.v1.types".to_string(),
8106 );
8107 m
8108 };
8109 #[cfg(feature = "small_decimals")]
8110 let dec20_dt = DataType::Decimal128(20, 4);
8111 #[cfg(not(feature = "small_decimals"))]
8112 let dec20_dt = DataType::Decimal128(20, 4);
8113 #[cfg(feature = "small_decimals")]
8114 let dec10_dt = DataType::Decimal64(10, 2);
8115 #[cfg(not(feature = "small_decimals"))]
8116 let dec10_dt = DataType::Decimal128(10, 2);
8117 let fields: Vec<FieldRef> = vec![
8118 Arc::new(Field::new(
8119 "person",
8120 DataType::Struct(Fields::from(vec![
8121 Field::new("name", DataType::Utf8, false),
8122 Field::new("age", DataType::Int32, false),
8123 ])),
8124 false,
8125 )),
8126 Arc::new(Field::new("old_count", DataType::Int32, false)),
8127 Arc::new(Field::new(
8128 "union_map_or_array_int",
8129 DataType::Union(uf_map_or_array.clone(), UnionMode::Dense),
8130 false,
8131 )),
8132 Arc::new(Field::new(
8133 "array_records_with_union",
8134 DataType::List(kv_item_field.clone()),
8135 false,
8136 )),
8137 Arc::new(Field::new(
8138 "union_uuid_or_fixed10",
8139 DataType::Union(uf_uuid_or_fx10.clone(), UnionMode::Dense),
8140 false,
8141 )),
8142 Arc::new(Field::new(
8143 "union_interval_or_string",
8144 DataType::Union(uf_dur_or_str.clone(), UnionMode::Dense),
8145 false,
8146 )),
8147 Arc::new(Field::new(
8148 "union_date_or_fixed4",
8149 DataType::Union(uf_date_fixed4.clone(), UnionMode::Dense),
8150 false,
8151 )),
8152 Arc::new(Field::new(
8153 "union_enum_record_array_map",
8154 DataType::Union(uf_union_big.clone(), UnionMode::Dense),
8155 false,
8156 )),
8157 Arc::new(Field::new(
8158 "maybe_auth",
8159 DataType::Struct(Fields::from(vec![
8160 Field::new("user", DataType::Utf8, false),
8161 Field::new("token", DataType::Binary, true), ])),
8163 false,
8164 )),
8165 Arc::new(Field::new(
8166 "address",
8167 DataType::Struct(Fields::from(vec![
8168 Field::new("street_name", DataType::Utf8, false),
8169 Field::new("zip", DataType::Int32, false),
8170 Field::new("country", DataType::Utf8, false),
8171 ])),
8172 false,
8173 )),
8174 Arc::new(Field::new(
8175 "map_union",
8176 DataType::Map(map_entries_field.clone(), false),
8177 false,
8178 )),
8179 Arc::new(Field::new(
8180 "arr_union",
8181 DataType::List(arr_items_field.clone()),
8182 false,
8183 )),
8184 Arc::new(
8185 Field::new(
8186 "status",
8187 DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
8188 false,
8189 )
8190 .with_metadata(enum_md_status.clone()),
8191 ),
8192 Arc::new(
8193 Field::new(
8194 "interval_mdn",
8195 DataType::Interval(IntervalUnit::MonthDayNano),
8196 false,
8197 )
8198 .with_metadata(dur12_top_md.clone()),
8199 ),
8200 Arc::new(Field::new(
8201 "ts_micros_local",
8202 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None),
8203 false,
8204 )),
8205 Arc::new(Field::new(
8206 "ts_millis_local",
8207 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None),
8208 false,
8209 )),
8210 Arc::new(Field::new(
8211 "ts_micros_utc",
8212 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some("+00:00".into())),
8213 false,
8214 )),
8215 Arc::new(Field::new(
8216 "ts_millis_utc",
8217 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, Some("+00:00".into())),
8218 false,
8219 )),
8220 Arc::new(Field::new(
8221 "t_micros",
8222 DataType::Time64(arrow_schema::TimeUnit::Microsecond),
8223 false,
8224 )),
8225 Arc::new(Field::new(
8226 "t_millis",
8227 DataType::Time32(arrow_schema::TimeUnit::Millisecond),
8228 false,
8229 )),
8230 Arc::new(Field::new("d_date", DataType::Date32, false)),
8231 Arc::new(add_uuid_ext_top(Field::new(
8232 "uuid_str",
8233 DataType::FixedSizeBinary(16),
8234 false,
8235 ))),
8236 Arc::new(Field::new("dec_fix_s20_4", dec20_dt, false).with_metadata(dec20_md.clone())),
8237 Arc::new(
8238 Field::new("dec_bytes_s10_2", dec10_dt, false).with_metadata(dec10_md.clone()),
8239 ),
8240 Arc::new(
8241 Field::new("fx16_plain", DataType::FixedSizeBinary(16), false)
8242 .with_metadata(fx16_top_md.clone()),
8243 ),
8244 Arc::new(Field::new("raw_bytes", DataType::Binary, false)),
8245 Arc::new(Field::new("str_utf8", DataType::Utf8, false)),
8246 Arc::new(Field::new(
8247 "tri_union_prim",
8248 DataType::Union(uf_tri.clone(), UnionMode::Dense),
8249 false,
8250 )),
8251 Arc::new(Field::new("opt_str_nullsecond", DataType::Utf8, true)),
8252 Arc::new(Field::new("opt_i32_nullfirst", DataType::Int32, true)),
8253 Arc::new(Field::new("count_i64", DataType::Int64, false)),
8254 Arc::new(Field::new("count_i32", DataType::Int64, false)),
8255 Arc::new(Field::new("ratio_f64", DataType::Float64, false)),
8256 Arc::new(Field::new("ratio_f32", DataType::Float64, false)),
8257 Arc::new(Field::new("flag", DataType::Boolean, false)),
8258 Arc::new(Field::new("identifier", DataType::Int64, false)),
8259 ];
8260 let expected_schema = Arc::new(arrow_schema::Schema::new(Fields::from(fields)));
8261 let mut cols: Vec<ArrayRef> = vec![
8262 Arc::new(StructArray::new(
8263 match expected_schema
8264 .field_with_name("person")
8265 .unwrap()
8266 .data_type()
8267 {
8268 DataType::Struct(fs) => fs.clone(),
8269 _ => unreachable!(),
8270 },
8271 vec![
8272 Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "Dave"])) as ArrayRef,
8273 Arc::new(Int32Array::from(vec![30, 0, 25, 41])) as ArrayRef,
8274 ],
8275 None,
8276 )) as ArrayRef,
8277 Arc::new(Int32Array::from(vec![100, 42, 7, 42])) as ArrayRef,
8278 ];
8279 {
8280 let map_child: ArrayRef = {
8281 let keys = StringArray::from(vec!["x", "y", "only"]);
8282 let vals = Int32Array::from(vec![1, 2, 10]);
8283 let entries = StructArray::new(
8284 Fields::from(vec![
8285 Field::new("key", DataType::Utf8, false),
8286 Field::new("value", DataType::Int32, false),
8287 ]),
8288 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
8289 None,
8290 );
8291 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
8292 Arc::new(MapArray::new(
8293 map_int_entries.clone(),
8294 moff,
8295 entries,
8296 None,
8297 false,
8298 )) as ArrayRef
8299 };
8300 let list_child: ArrayRef = {
8301 let values = Int32Array::from(vec![1, 2, 3, 0]);
8302 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4]));
8303 Arc::new(
8304 ListArray::try_new(
8305 Arc::new(Field::new(item_name, DataType::Int32, false)),
8306 offsets,
8307 Arc::new(values),
8308 None,
8309 )
8310 .unwrap(),
8311 ) as ArrayRef
8312 };
8313 let tids = vec![1, 0, 1, 0];
8314 let offs = vec![0, 0, 1, 1];
8315 let arr = mk_dense_union(&uf_map_or_array, tids, offs, |f| match f.name().as_str() {
8316 "array" => Some(list_child.clone()),
8317 "map" => Some(map_child.clone()),
8318 _ => None,
8319 });
8320 cols.push(arr);
8321 }
8322 {
8323 let keys = Arc::new(StringArray::from(vec!["k1", "k2", "k", "k3", "x"])) as ArrayRef;
8324 let type_ids = vec![1, 0, 2, 0, 1];
8325 let offsets = vec![0, 0, 0, 1, 1];
8326 let vals = mk_dense_union(&uf_kv_val, type_ids, offsets, |f| match f.data_type() {
8327 DataType::Int32 => Some(Arc::new(Int32Array::from(vec![5, -5])) as ArrayRef),
8328 DataType::Int64 => Some(Arc::new(Int64Array::from(vec![99i64])) as ArrayRef),
8329 DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
8330 _ => None,
8331 });
8332 let values_struct =
8333 Arc::new(StructArray::new(kv_fields.clone(), vec![keys, vals], None));
8334 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5]));
8335 let arr = Arc::new(
8336 ListArray::try_new(kv_item_field.clone(), list_offsets, values_struct, None)
8337 .unwrap(),
8338 ) as ArrayRef;
8339 cols.push(arr);
8340 }
8341 {
8342 let type_ids = vec![1, 0, 1, 0]; let offs = vec![0, 0, 1, 1];
8344 let arr = mk_dense_union(&uf_uuid_or_fx10, type_ids, offs, |f| match f.data_type() {
8345 DataType::FixedSizeBinary(16) => {
8346 let it = [Some(uuid1), Some(uuid2)].into_iter();
8347 Some(Arc::new(
8348 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
8349 ) as ArrayRef)
8350 }
8351 DataType::FixedSizeBinary(10) => {
8352 let fx10_a = [0xAAu8; 10];
8353 let fx10_b = [0x00u8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99];
8354 let it = [Some(fx10_a), Some(fx10_b)].into_iter();
8355 Some(Arc::new(
8356 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
8357 ) as ArrayRef)
8358 }
8359 _ => None,
8360 });
8361 cols.push(arr);
8362 }
8363 {
8364 let type_ids = vec![1, 0, 1, 0]; let offs = vec![0, 0, 1, 1];
8366 let arr = mk_dense_union(&uf_dur_or_str, type_ids, offs, |f| match f.data_type() {
8367 DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano) => Some(Arc::new(
8368 IntervalMonthDayNanoArray::from(vec![dur_small, dur_large]),
8369 )
8370 as ArrayRef),
8371 DataType::Utf8 => Some(Arc::new(StringArray::from(vec![
8372 "duration-as-text",
8373 "iso-8601-period-P1Y",
8374 ])) as ArrayRef),
8375 _ => None,
8376 });
8377 cols.push(arr);
8378 }
8379 {
8380 let type_ids = vec![1, 0, 1, 0]; let offs = vec![0, 0, 1, 1];
8382 let arr = mk_dense_union(&uf_date_fixed4, type_ids, offs, |f| match f.data_type() {
8383 DataType::Date32 => Some(Arc::new(Date32Array::from(vec![date_a, 0])) as ArrayRef),
8384 DataType::FixedSizeBinary(4) => {
8385 let it = [Some(*b"\x00\x11\x22\x33"), Some(*b"ABCD")].into_iter();
8386 Some(Arc::new(
8387 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
8388 ) as ArrayRef)
8389 }
8390 _ => None,
8391 });
8392 cols.push(arr);
8393 }
8394 {
8395 let tids = vec![4, 3, 1, 0]; let offs = vec![0, 0, 0, 0];
8397 let arr = mk_dense_union(&uf_union_big, tids, offs, |f| match f.data_type() {
8398 DataType::Dictionary(_, _) => {
8399 let keys = Int32Array::from(vec![0i32]);
8400 let values =
8401 Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
8402 Some(
8403 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
8404 as ArrayRef,
8405 )
8406 }
8407 DataType::Struct(fs) if fs == &union_rec_a_fields => {
8408 let a = Int32Array::from(vec![7]);
8409 let b = StringArray::from(vec!["rec"]);
8410 Some(Arc::new(StructArray::new(
8411 fs.clone(),
8412 vec![Arc::new(a) as ArrayRef, Arc::new(b) as ArrayRef],
8413 None,
8414 )) as ArrayRef)
8415 }
8416 DataType::List(_) => {
8417 let values = Int64Array::from(vec![1i64, 2, 3]);
8418 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
8419 Some(Arc::new(
8420 ListArray::try_new(
8421 Arc::new(Field::new(item_name, DataType::Int64, false)),
8422 offsets,
8423 Arc::new(values),
8424 None,
8425 )
8426 .unwrap(),
8427 ) as ArrayRef)
8428 }
8429 DataType::Map(_, _) => {
8430 let keys = StringArray::from(vec!["k"]);
8431 let vals = StringArray::from(vec!["v"]);
8432 let entries = StructArray::new(
8433 Fields::from(vec![
8434 Field::new("key", DataType::Utf8, false),
8435 Field::new("value", DataType::Utf8, false),
8436 ]),
8437 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
8438 None,
8439 );
8440 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1]));
8441 Some(Arc::new(MapArray::new(
8442 union_map_entries.clone(),
8443 moff,
8444 entries,
8445 None,
8446 false,
8447 )) as ArrayRef)
8448 }
8449 _ => None,
8450 });
8451 cols.push(arr);
8452 }
8453 {
8454 let fs = match expected_schema
8455 .field_with_name("maybe_auth")
8456 .unwrap()
8457 .data_type()
8458 {
8459 DataType::Struct(fs) => fs.clone(),
8460 _ => unreachable!(),
8461 };
8462 let user =
8463 Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])) as ArrayRef;
8464 let token_values: Vec<Option<&[u8]>> = vec![
8465 None,
8466 Some(b"\x01\x02\x03".as_ref()),
8467 None,
8468 Some(b"".as_ref()),
8469 ];
8470 let token = Arc::new(BinaryArray::from(token_values)) as ArrayRef;
8471 cols.push(Arc::new(StructArray::new(fs, vec![user, token], None)) as ArrayRef);
8472 }
8473 {
8474 let fs = match expected_schema
8475 .field_with_name("address")
8476 .unwrap()
8477 .data_type()
8478 {
8479 DataType::Struct(fs) => fs.clone(),
8480 _ => unreachable!(),
8481 };
8482 let street = Arc::new(StringArray::from(vec![
8483 "100 Main",
8484 "",
8485 "42 Galaxy Way",
8486 "End Ave",
8487 ])) as ArrayRef;
8488 let zip = Arc::new(Int32Array::from(vec![12345, 0, 42424, 1])) as ArrayRef;
8489 let country = Arc::new(StringArray::from(vec!["US", "CA", "US", "GB"])) as ArrayRef;
8490 cols.push(Arc::new(StructArray::new(fs, vec![street, zip, country], None)) as ArrayRef);
8491 }
8492 {
8493 let keys = StringArray::from(vec!["a", "b", "c", "neg", "pi", "ok"]);
8494 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4, 4, 6]));
8495 let tid_s = 0; let tid_d = 1; let tid_n = 2; let type_ids = vec![tid_d, tid_n, tid_s, tid_d, tid_d, tid_s];
8499 let offsets = vec![0, 0, 0, 1, 2, 1];
8500 let pi_5dp = (std::f64::consts::PI * 100_000.0).trunc() / 100_000.0;
8501 let vals = mk_dense_union(&uf_map_vals, type_ids, offsets, |f| match f.data_type() {
8502 DataType::Float64 => {
8503 Some(Arc::new(Float64Array::from(vec![1.5f64, -0.5, pi_5dp])) as ArrayRef)
8504 }
8505 DataType::Utf8 => {
8506 Some(Arc::new(StringArray::from(vec!["yes", "true"])) as ArrayRef)
8507 }
8508 DataType::Null => Some(Arc::new(NullArray::new(1)) as ArrayRef),
8509 _ => None,
8510 });
8511 let entries = StructArray::new(
8512 Fields::from(vec![
8513 Field::new("key", DataType::Utf8, false),
8514 Field::new(
8515 "value",
8516 DataType::Union(uf_map_vals.clone(), UnionMode::Dense),
8517 true,
8518 ),
8519 ]),
8520 vec![Arc::new(keys) as ArrayRef, vals],
8521 None,
8522 );
8523 let map = Arc::new(MapArray::new(
8524 map_entries_field.clone(),
8525 moff,
8526 entries,
8527 None,
8528 false,
8529 )) as ArrayRef;
8530 cols.push(map);
8531 }
8532 {
8533 let type_ids = vec![
8534 2, 1, 0, 2, 0, 1, 2, 2, 1, 0,
8535 2, ];
8537 let offsets = vec![0, 0, 0, 1, 1, 1, 2, 3, 2, 2, 4];
8538 let values =
8539 mk_dense_union(&uf_arr_items, type_ids, offsets, |f| match f.data_type() {
8540 DataType::Int64 => {
8541 Some(Arc::new(Int64Array::from(vec![1i64, -3, 0, -1, 0])) as ArrayRef)
8542 }
8543 DataType::Utf8 => {
8544 Some(Arc::new(StringArray::from(vec!["x", "z", "end"])) as ArrayRef)
8545 }
8546 DataType::Null => Some(Arc::new(NullArray::new(3)) as ArrayRef),
8547 _ => None,
8548 });
8549 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 4, 7, 8, 11]));
8550 let arr = Arc::new(
8551 ListArray::try_new(arr_items_field.clone(), list_offsets, values, None).unwrap(),
8552 ) as ArrayRef;
8553 cols.push(arr);
8554 }
8555 {
8556 let keys = Int32Array::from(vec![1, 2, 3, 0]); let values = Arc::new(StringArray::from(vec![
8558 "UNKNOWN",
8559 "NEW",
8560 "PROCESSING",
8561 "DONE",
8562 ])) as ArrayRef;
8563 let dict = DictionaryArray::<Int32Type>::try_new(keys, values).unwrap();
8564 cols.push(Arc::new(dict) as ArrayRef);
8565 }
8566 cols.push(Arc::new(IntervalMonthDayNanoArray::from(vec![
8567 dur_small, dur_zero, dur_large, dur_2years,
8568 ])) as ArrayRef);
8569 cols.push(Arc::new(TimestampMicrosecondArray::from(vec![
8570 ts_us_2024_01_01 + 123_456,
8571 0,
8572 ts_us_2024_01_01 + 101_112,
8573 987_654_321,
8574 ])) as ArrayRef);
8575 cols.push(Arc::new(TimestampMillisecondArray::from(vec![
8576 ts_ms_2024_01_01 + 86_400_000,
8577 0,
8578 ts_ms_2024_01_01 + 789,
8579 123_456_789,
8580 ])) as ArrayRef);
8581 {
8582 let a = TimestampMicrosecondArray::from(vec![
8583 ts_us_2024_01_01,
8584 1,
8585 ts_us_2024_01_01 + 456,
8586 0,
8587 ])
8588 .with_timezone("+00:00");
8589 cols.push(Arc::new(a) as ArrayRef);
8590 }
8591 {
8592 let a = TimestampMillisecondArray::from(vec![
8593 ts_ms_2024_01_01,
8594 -1,
8595 ts_ms_2024_01_01 + 123,
8596 0,
8597 ])
8598 .with_timezone("+00:00");
8599 cols.push(Arc::new(a) as ArrayRef);
8600 }
8601 cols.push(Arc::new(Time64MicrosecondArray::from(vec![
8602 time_us_eod,
8603 0,
8604 1,
8605 1_000_000,
8606 ])) as ArrayRef);
8607 cols.push(Arc::new(Time32MillisecondArray::from(vec![
8608 time_ms_a,
8609 0,
8610 1,
8611 86_400_000 - 1,
8612 ])) as ArrayRef);
8613 cols.push(Arc::new(Date32Array::from(vec![date_a, 0, 1, 365])) as ArrayRef);
8614 {
8615 let it = [Some(uuid1), Some(uuid2), Some(uuid1), Some(uuid2)].into_iter();
8616 cols.push(Arc::new(
8617 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
8618 ) as ArrayRef);
8619 }
8620 {
8621 #[cfg(feature = "small_decimals")]
8622 let arr = Arc::new(
8623 Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
8624 .with_precision_and_scale(20, 4)
8625 .unwrap(),
8626 ) as ArrayRef;
8627 #[cfg(not(feature = "small_decimals"))]
8628 let arr = Arc::new(
8629 Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
8630 .with_precision_and_scale(20, 4)
8631 .unwrap(),
8632 ) as ArrayRef;
8633 cols.push(arr);
8634 }
8635 {
8636 #[cfg(feature = "small_decimals")]
8637 let arr = Arc::new(
8638 Decimal64Array::from_iter_values([123456i64, -1, 0, 9_999_999_999i64])
8639 .with_precision_and_scale(10, 2)
8640 .unwrap(),
8641 ) as ArrayRef;
8642 #[cfg(not(feature = "small_decimals"))]
8643 let arr = Arc::new(
8644 Decimal128Array::from_iter_values([123456i128, -1, 0, 9_999_999_999i128])
8645 .with_precision_and_scale(10, 2)
8646 .unwrap(),
8647 ) as ArrayRef;
8648 cols.push(arr);
8649 }
8650 {
8651 let it = [
8652 Some(*b"0123456789ABCDEF"),
8653 Some([0u8; 16]),
8654 Some(*b"ABCDEFGHIJKLMNOP"),
8655 Some([0xAA; 16]),
8656 ]
8657 .into_iter();
8658 cols.push(Arc::new(
8659 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
8660 ) as ArrayRef);
8661 }
8662 cols.push(Arc::new(BinaryArray::from(vec![
8663 b"\x00\x01".as_ref(),
8664 b"".as_ref(),
8665 b"\xFF\x00".as_ref(),
8666 b"\x10\x20\x30\x40".as_ref(),
8667 ])) as ArrayRef);
8668 cols.push(Arc::new(StringArray::from(vec!["hello", "", "world", "✓ unicode"])) as ArrayRef);
8669 {
8670 let tids = vec![0, 1, 2, 1];
8671 let offs = vec![0, 0, 0, 1];
8672 let arr = mk_dense_union(&uf_tri, tids, offs, |f| match f.data_type() {
8673 DataType::Int32 => Some(Arc::new(Int32Array::from(vec![0])) as ArrayRef),
8674 DataType::Utf8 => Some(Arc::new(StringArray::from(vec!["hi", ""])) as ArrayRef),
8675 DataType::Boolean => Some(Arc::new(BooleanArray::from(vec![true])) as ArrayRef),
8676 _ => None,
8677 });
8678 cols.push(arr);
8679 }
8680 cols.push(Arc::new(StringArray::from(vec![
8681 Some("alpha"),
8682 None,
8683 Some("s3"),
8684 Some(""),
8685 ])) as ArrayRef);
8686 cols.push(Arc::new(Int32Array::from(vec![None, Some(42), None, Some(0)])) as ArrayRef);
8687 cols.push(Arc::new(Int64Array::from(vec![
8688 7_000_000_000i64,
8689 -2,
8690 0,
8691 -9_876_543_210i64,
8692 ])) as ArrayRef);
8693 cols.push(Arc::new(Int64Array::from(vec![7i64, -1, 0, 123])) as ArrayRef);
8694 cols.push(Arc::new(Float64Array::from(vec![2.5f64, -1.0, 7.0, -2.25])) as ArrayRef);
8695 cols.push(Arc::new(Float64Array::from(vec![1.25f64, -0.0, 3.5, 9.75])) as ArrayRef);
8696 cols.push(Arc::new(BooleanArray::from(vec![true, false, true, false])) as ArrayRef);
8697 cols.push(Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef);
8698 let expected = RecordBatch::try_new(expected_schema, cols).unwrap();
8699 assert_eq!(
8700 expected, batch,
8701 "entire RecordBatch mismatch (schema, all columns, all rows)"
8702 );
8703 }
8704}