1use std::collections::HashMap;
19use std::fmt;
20use std::hash::Hash;
21use std::sync::Arc;
22
23use crate::error::ArrowError;
24use crate::field::Field;
25use crate::{DataType, FieldRef, Fields};
26
/// Incrementally assembles a [`Schema`] from a list of fields plus string
/// key/value metadata.
///
/// Call [`SchemaBuilder::finish`] to turn the accumulated state into a [`Schema`].
#[derive(Debug, Default)]
pub struct SchemaBuilder {
    /// Fields collected so far, in the order they will appear in the schema.
    fields: Vec<FieldRef>,
    /// Schema-level key/value metadata collected so far.
    metadata: HashMap<String, String>,
}
33
34impl SchemaBuilder {
35 pub fn new() -> Self {
37 Self::default()
38 }
39
40 pub fn with_capacity(capacity: usize) -> Self {
42 Self {
43 fields: Vec::with_capacity(capacity),
44 metadata: Default::default(),
45 }
46 }
47
48 pub fn push(&mut self, field: impl Into<FieldRef>) {
50 self.fields.push(field.into())
51 }
52
53 pub fn remove(&mut self, idx: usize) -> FieldRef {
59 self.fields.remove(idx)
60 }
61
62 pub fn field(&mut self, idx: usize) -> &FieldRef {
68 &mut self.fields[idx]
69 }
70
71 pub fn field_mut(&mut self, idx: usize) -> &mut FieldRef {
77 &mut self.fields[idx]
78 }
79
80 pub fn metadata(&mut self) -> &HashMap<String, String> {
82 &self.metadata
83 }
84
85 pub fn metadata_mut(&mut self) -> &mut HashMap<String, String> {
87 &mut self.metadata
88 }
89
90 pub fn reverse(&mut self) {
92 self.fields.reverse();
93 }
94
95 pub fn try_merge(&mut self, field: &FieldRef) -> Result<(), ArrowError> {
99 let existing = self.fields.iter_mut().find(|f| f.name() == field.name());
101 match existing {
102 Some(e) if Arc::ptr_eq(e, field) => {} Some(e) => match Arc::get_mut(e) {
104 Some(e) => e.try_merge(field.as_ref())?,
105 None => {
106 let mut t = e.as_ref().clone();
107 t.try_merge(field)?;
108 *e = Arc::new(t)
109 }
110 },
111 None => self.fields.push(field.clone()),
112 }
113 Ok(())
114 }
115
116 pub fn finish(self) -> Schema {
118 Schema {
119 fields: self.fields.into(),
120 metadata: self.metadata,
121 }
122 }
123}
124
125impl From<&Fields> for SchemaBuilder {
126 fn from(value: &Fields) -> Self {
127 Self {
128 fields: value.to_vec(),
129 metadata: Default::default(),
130 }
131 }
132}
133
134impl From<Fields> for SchemaBuilder {
135 fn from(value: Fields) -> Self {
136 Self {
137 fields: value.to_vec(),
138 metadata: Default::default(),
139 }
140 }
141}
142
143impl From<&Schema> for SchemaBuilder {
144 fn from(value: &Schema) -> Self {
145 Self::from(value.clone())
146 }
147}
148
149impl From<Schema> for SchemaBuilder {
150 fn from(value: Schema) -> Self {
151 Self {
152 fields: value.fields.to_vec(),
153 metadata: value.metadata,
154 }
155 }
156}
157
158impl Extend<FieldRef> for SchemaBuilder {
159 fn extend<T: IntoIterator<Item = FieldRef>>(&mut self, iter: T) {
160 let iter = iter.into_iter();
161 self.fields.reserve(iter.size_hint().0);
162 for f in iter {
163 self.push(f)
164 }
165 }
166}
167
168impl Extend<Field> for SchemaBuilder {
169 fn extend<T: IntoIterator<Item = Field>>(&mut self, iter: T) {
170 let iter = iter.into_iter();
171 self.fields.reserve(iter.size_hint().0);
172 for f in iter {
173 self.push(f)
174 }
175 }
176}
177
/// A reference-counted, shareable handle to a [`Schema`].
pub type SchemaRef = Arc<Schema>;
180
/// An ordered collection of fields together with schema-level string
/// key/value metadata.
///
/// Serialization support is derived only when the `serde` feature is enabled.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Schema {
    /// The fields of this schema, in order.
    pub fields: Fields,
    /// Key/value pairs attached to the schema as a whole.
    pub metadata: HashMap<String, String>,
}
193
194impl Schema {
195 pub fn empty() -> Self {
197 Self {
198 fields: Default::default(),
199 metadata: HashMap::new(),
200 }
201 }
202
203 pub fn new(fields: impl Into<Fields>) -> Self {
215 Self::new_with_metadata(fields, HashMap::new())
216 }
217
218 #[inline]
236 pub fn new_with_metadata(fields: impl Into<Fields>, metadata: HashMap<String, String>) -> Self {
237 Self {
238 fields: fields.into(),
239 metadata,
240 }
241 }
242
243 pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
245 self.metadata = metadata;
246 self
247 }
248
249 pub fn project(&self, indices: &[usize]) -> Result<Schema, ArrowError> {
252 let new_fields = indices
253 .iter()
254 .map(|i| {
255 self.fields.get(*i).cloned().ok_or_else(|| {
256 ArrowError::SchemaError(format!(
257 "project index {} out of bounds, max field {}",
258 i,
259 self.fields().len()
260 ))
261 })
262 })
263 .collect::<Result<Vec<_>, _>>()?;
264 Ok(Self::new_with_metadata(new_fields, self.metadata.clone()))
265 }
266
267 pub fn try_merge(schemas: impl IntoIterator<Item = Self>) -> Result<Self, ArrowError> {
296 let mut out_meta = HashMap::new();
297 let mut out_fields = SchemaBuilder::new();
298 for schema in schemas {
299 let Schema { metadata, fields } = schema;
300
301 for (key, value) in metadata.into_iter() {
303 if let Some(old_val) = out_meta.get(&key) {
304 if old_val != &value {
305 return Err(ArrowError::SchemaError(format!(
306 "Fail to merge schema due to conflicting metadata. \
307 Key '{key}' has different values '{old_val}' and '{value}'"
308 )));
309 }
310 }
311 out_meta.insert(key, value);
312 }
313
314 fields.iter().try_for_each(|x| out_fields.try_merge(x))?
316 }
317
318 Ok(out_fields.finish().with_metadata(out_meta))
319 }
320
321 #[inline]
323 pub const fn fields(&self) -> &Fields {
324 &self.fields
325 }
326
327 #[inline]
364 pub fn flattened_fields(&self) -> Vec<&Field> {
365 self.fields.iter().flat_map(|f| f.fields()).collect()
366 }
367
368 pub fn field(&self, i: usize) -> &Field {
375 &self.fields[i]
376 }
377
378 pub fn field_with_name(&self, name: &str) -> Result<&Field, ArrowError> {
380 Ok(&self.fields[self.index_of(name)?])
381 }
382
383 #[deprecated(
386 since = "54.0.0",
387 note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it."
388 )]
389 pub fn fields_with_dict_id(&self, dict_id: i64) -> Vec<&Field> {
390 #[allow(deprecated)]
391 self.fields
392 .iter()
393 .flat_map(|f| f.fields_with_dict_id(dict_id))
394 .collect()
395 }
396
397 pub fn index_of(&self, name: &str) -> Result<usize, ArrowError> {
399 let (idx, _) = self.fields().find(name).ok_or_else(|| {
400 let valid_fields: Vec<_> = self.fields.iter().map(|f| f.name()).collect();
401 ArrowError::SchemaError(format!(
402 "Unable to get field named \"{name}\". Valid fields: {valid_fields:?}"
403 ))
404 })?;
405 Ok(idx)
406 }
407
408 #[inline]
410 pub const fn metadata(&self) -> &HashMap<String, String> {
411 &self.metadata
412 }
413
414 pub fn normalize(&self, separator: &str, max_level: Option<usize>) -> Result<Self, ArrowError> {
458 let max_level = match max_level.unwrap_or(usize::MAX) {
459 0 => usize::MAX,
460 val => val,
461 };
462 let mut stack: Vec<(usize, Vec<&str>, &FieldRef)> = self
463 .fields()
464 .iter()
465 .rev()
466 .map(|f| {
467 let name_vec: Vec<&str> = vec![f.name()];
468 (0, name_vec, f)
469 })
470 .collect();
471 let mut fields: Vec<FieldRef> = Vec::new();
472
473 while let Some((depth, name, field_ref)) = stack.pop() {
474 match field_ref.data_type() {
475 DataType::Struct(ff) if depth < max_level => {
476 for fff in ff.into_iter().rev() {
478 let mut name = name.clone();
479 name.push(separator);
480 name.push(fff.name());
481 stack.push((depth + 1, name, fff))
482 }
483 }
484 _ => {
485 let updated_field = Field::new(
486 name.concat(),
487 field_ref.data_type().clone(),
488 field_ref.is_nullable(),
489 );
490 fields.push(Arc::new(updated_field));
491 }
492 }
493 }
494 Ok(Schema::new(fields))
495 }
496
497 pub fn column_with_name(&self, name: &str) -> Option<(usize, &Field)> {
500 let (idx, field) = self.fields.find(name)?;
501 Some((idx, field.as_ref()))
502 }
503
504 pub fn contains(&self, other: &Schema) -> bool {
511 self.fields.contains(&other.fields)
513 && other
514 .metadata
515 .iter()
516 .all(|(k, v1)| self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default())
517 }
518}
519
520impl fmt::Display for Schema {
521 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
522 f.write_str(
523 &self
524 .fields
525 .iter()
526 .map(|c| c.to_string())
527 .collect::<Vec<String>>()
528 .join(", "),
529 )
530 }
531}
532
533#[allow(clippy::derived_hash_with_manual_eq)]
535impl Hash for Schema {
536 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
537 self.fields.hash(state);
538
539 let mut keys: Vec<&String> = self.metadata.keys().collect();
541 keys.sort();
542 for k in keys {
543 k.hash(state);
544 self.metadata.get(k).expect("key valid").hash(state);
545 }
546 }
547}
548
/// Lets a `Schema` be passed wherever an `impl AsRef<Schema>` is expected.
impl AsRef<Schema> for Schema {
    /// Returns `self` unchanged.
    fn as_ref(&self) -> &Schema {
        self
    }
}
554
555#[cfg(test)]
556mod tests {
557 use crate::datatype::DataType;
558 use crate::{TimeUnit, UnionMode};
559
560 use super::*;
561
562 #[test]
563 #[expect(clippy::needless_borrows_for_generic_args)] fn test_schema_as_ref() {
565 fn accept_ref(_: impl AsRef<Schema>) {}
566
567 let schema = Schema::new(vec![
568 Field::new("name", DataType::Utf8, false),
569 Field::new("address", DataType::Utf8, false),
570 Field::new("priority", DataType::UInt8, false),
571 ]);
572
573 accept_ref(schema.clone());
574 accept_ref(&schema.clone());
575 accept_ref(&&schema.clone());
576 accept_ref(Arc::new(schema.clone()));
577 accept_ref(&Arc::new(schema.clone()));
578 accept_ref(&&Arc::new(schema.clone()));
579 }
580
581 #[test]
582 #[cfg(feature = "serde")]
583 fn test_ser_de_metadata() {
584 let schema = Schema::new(vec![
586 Field::new("name", DataType::Utf8, false),
587 Field::new("address", DataType::Utf8, false),
588 Field::new("priority", DataType::UInt8, false),
589 ]);
590
591 let json = serde_json::to_string(&schema).unwrap();
592 let de_schema = serde_json::from_str(&json).unwrap();
593
594 assert_eq!(schema, de_schema);
595
596 let schema =
598 schema.with_metadata([("key".to_owned(), "val".to_owned())].into_iter().collect());
599 let json = serde_json::to_string(&schema).unwrap();
600 let de_schema = serde_json::from_str(&json).unwrap();
601
602 assert_eq!(schema, de_schema);
603 }
604
605 #[test]
606 fn test_projection() {
607 let mut metadata = HashMap::new();
608 metadata.insert("meta".to_string(), "data".to_string());
609
610 let schema = Schema::new(vec![
611 Field::new("name", DataType::Utf8, false),
612 Field::new("address", DataType::Utf8, false),
613 Field::new("priority", DataType::UInt8, false),
614 ])
615 .with_metadata(metadata);
616
617 let projected: Schema = schema.project(&[0, 2]).unwrap();
618
619 assert_eq!(projected.fields().len(), 2);
620 assert_eq!(projected.fields()[0].name(), "name");
621 assert_eq!(projected.fields()[1].name(), "priority");
622 assert_eq!(projected.metadata.get("meta").unwrap(), "data")
623 }
624
625 #[test]
626 fn test_oob_projection() {
627 let mut metadata = HashMap::new();
628 metadata.insert("meta".to_string(), "data".to_string());
629
630 let schema = Schema::new(vec![
631 Field::new("name", DataType::Utf8, false),
632 Field::new("address", DataType::Utf8, false),
633 Field::new("priority", DataType::UInt8, false),
634 ])
635 .with_metadata(metadata);
636
637 let projected = schema.project(&[0, 3]);
638
639 assert!(projected.is_err());
640 if let Err(e) = projected {
641 assert_eq!(
642 e.to_string(),
643 "Schema error: project index 3 out of bounds, max field 3".to_string()
644 )
645 }
646 }
647
648 #[test]
649 fn test_schema_contains() {
650 let mut metadata1 = HashMap::new();
651 metadata1.insert("meta".to_string(), "data".to_string());
652
653 let schema1 = Schema::new(vec![
654 Field::new("name", DataType::Utf8, false),
655 Field::new("address", DataType::Utf8, false),
656 Field::new("priority", DataType::UInt8, false),
657 ])
658 .with_metadata(metadata1.clone());
659
660 let mut metadata2 = HashMap::new();
661 metadata2.insert("meta".to_string(), "data".to_string());
662 metadata2.insert("meta2".to_string(), "data".to_string());
663 let schema2 = Schema::new(vec![
664 Field::new("name", DataType::Utf8, false),
665 Field::new("address", DataType::Utf8, false),
666 Field::new("priority", DataType::UInt8, false),
667 ])
668 .with_metadata(metadata2);
669
670 assert!(schema1.contains(&schema1));
672 assert!(schema2.contains(&schema2));
673
674 assert!(!schema1.contains(&schema2));
675 assert!(schema2.contains(&schema1));
676 }
677
678 #[test]
679 fn schema_equality() {
680 let schema1 = Schema::new(vec![
681 Field::new("c1", DataType::Utf8, false),
682 Field::new("c2", DataType::Float64, true),
683 Field::new("c3", DataType::LargeBinary, true),
684 ]);
685 let schema2 = Schema::new(vec![
686 Field::new("c1", DataType::Utf8, false),
687 Field::new("c2", DataType::Float64, true),
688 Field::new("c3", DataType::LargeBinary, true),
689 ]);
690
691 assert_eq!(schema1, schema2);
692
693 let schema3 = Schema::new(vec![
694 Field::new("c1", DataType::Utf8, false),
695 Field::new("c2", DataType::Float32, true),
696 ]);
697 let schema4 = Schema::new(vec![
698 Field::new("C1", DataType::Utf8, false),
699 Field::new("C2", DataType::Float64, true),
700 ]);
701
702 assert_ne!(schema1, schema3);
703 assert_ne!(schema1, schema4);
704 assert_ne!(schema2, schema3);
705 assert_ne!(schema2, schema4);
706 assert_ne!(schema3, schema4);
707
708 let f = Field::new("c1", DataType::Utf8, false).with_metadata(
709 [("foo".to_string(), "bar".to_string())]
710 .iter()
711 .cloned()
712 .collect(),
713 );
714 let schema5 = Schema::new(vec![
715 f,
716 Field::new("c2", DataType::Float64, true),
717 Field::new("c3", DataType::LargeBinary, true),
718 ]);
719 assert_ne!(schema1, schema5);
720 }
721
722 #[test]
723 fn create_schema_string() {
724 let schema = person_schema();
725 assert_eq!(
726 schema.to_string(),
727 "Field { \"first_name\": Utf8, metadata: {\"k\": \"v\"} }, \
728 Field { \"last_name\": Utf8 }, \
729 Field { \"address\": Struct(\"street\": Utf8, \"zip\": UInt16) }, \
730 Field { \"interests\": nullable Dictionary(Int32, Utf8), dict_id: 123, dict_is_ordered }"
731 )
732 }
733
734 #[test]
735 fn schema_field_accessors() {
736 let schema = person_schema();
737
738 assert_eq!(schema.fields().len(), 4);
740
741 let first_name = &schema.fields()[0];
743 assert_eq!(first_name.name(), "first_name");
744 assert_eq!(first_name.data_type(), &DataType::Utf8);
745 assert!(!first_name.is_nullable());
746 #[allow(deprecated)]
747 let dict_id = first_name.dict_id();
748 assert_eq!(dict_id, None);
749 assert_eq!(first_name.dict_is_ordered(), None);
750
751 let metadata = first_name.metadata();
752 assert!(!metadata.is_empty());
753 let md = &metadata;
754 assert_eq!(md.len(), 1);
755 let key = md.get("k");
756 assert!(key.is_some());
757 assert_eq!(key.unwrap(), "v");
758
759 let interests = &schema.fields()[3];
760 assert_eq!(interests.name(), "interests");
761 assert_eq!(
762 interests.data_type(),
763 &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
764 );
765 #[allow(deprecated)]
766 let dict_id = interests.dict_id();
767 assert_eq!(dict_id, Some(123));
768 assert_eq!(interests.dict_is_ordered(), Some(true));
769 }
770
771 #[test]
772 #[should_panic(
773 expected = "Unable to get field named \\\"nickname\\\". Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]"
774 )]
775 fn schema_index_of() {
776 let schema = person_schema();
777 assert_eq!(schema.index_of("first_name").unwrap(), 0);
778 assert_eq!(schema.index_of("last_name").unwrap(), 1);
779 schema.index_of("nickname").unwrap();
780 }
781
782 #[test]
783 fn normalize_simple() {
784 let schema = Schema::new(vec![
785 Field::new(
786 "a",
787 DataType::Struct(Fields::from(vec![
788 Arc::new(Field::new("animals", DataType::Utf8, true)),
789 Arc::new(Field::new("n_legs", DataType::Int64, true)),
790 Arc::new(Field::new("year", DataType::Int64, true)),
791 ])),
792 false,
793 ),
794 Field::new("month", DataType::Int64, true),
795 ])
796 .normalize(".", Some(0))
797 .expect("valid normalization");
798
799 let expected = Schema::new(vec![
800 Field::new("a.animals", DataType::Utf8, true),
801 Field::new("a.n_legs", DataType::Int64, true),
802 Field::new("a.year", DataType::Int64, true),
803 Field::new("month", DataType::Int64, true),
804 ]);
805
806 assert_eq!(schema, expected);
807
808 let schema = Schema::new(vec![
810 Field::new(
811 "a",
812 DataType::Struct(Fields::from(vec![
813 Arc::new(Field::new("animals", DataType::Utf8, true)),
814 Arc::new(Field::new("n_legs", DataType::Int64, true)),
815 Arc::new(Field::new("year", DataType::Int64, true)),
816 ])),
817 false,
818 ),
819 Field::new("month", DataType::Int64, true),
820 ])
821 .normalize(".", None)
822 .expect("valid normalization");
823
824 assert_eq!(schema, expected);
825 }
826
827 #[test]
828 fn normalize_nested() {
829 let a = Arc::new(Field::new("a", DataType::Utf8, true));
830 let b = Arc::new(Field::new("b", DataType::Int64, false));
831 let c = Arc::new(Field::new("c", DataType::Int64, true));
832
833 let d = Arc::new(Field::new("d", DataType::Utf8, true));
834 let e = Arc::new(Field::new("e", DataType::Int64, false));
835 let f = Arc::new(Field::new("f", DataType::Int64, true));
836
837 let one = Arc::new(Field::new(
838 "1",
839 DataType::Struct(Fields::from(vec![a.clone(), b.clone(), c.clone()])),
840 false,
841 ));
842 let two = Arc::new(Field::new(
843 "2",
844 DataType::Struct(Fields::from(vec![d.clone(), e.clone(), f.clone()])),
845 true,
846 ));
847
848 let exclamation = Arc::new(Field::new(
849 "!",
850 DataType::Struct(Fields::from(vec![one, two])),
851 false,
852 ));
853
854 let normalize_all = Schema::new(vec![exclamation.clone()])
855 .normalize(".", Some(0))
856 .expect("valid normalization");
857
858 let expected = Schema::new(vec![
859 Field::new("!.1.a", DataType::Utf8, true),
860 Field::new("!.1.b", DataType::Int64, false),
861 Field::new("!.1.c", DataType::Int64, true),
862 Field::new("!.2.d", DataType::Utf8, true),
863 Field::new("!.2.e", DataType::Int64, false),
864 Field::new("!.2.f", DataType::Int64, true),
865 ]);
866
867 assert_eq!(normalize_all, expected);
868
869 let normalize_depth_one = Schema::new(vec![exclamation])
870 .normalize(".", Some(1))
871 .expect("valid normalization");
872
873 let expected = Schema::new(vec![
874 Field::new("!.1", DataType::Struct(Fields::from(vec![a, b, c])), false),
875 Field::new("!.2", DataType::Struct(Fields::from(vec![d, e, f])), true),
876 ]);
877
878 assert_eq!(normalize_depth_one, expected);
879 }
880
881 #[test]
882 fn normalize_list() {
883 let a = Arc::new(Field::new("a", DataType::Utf8, true));
885 let b = Arc::new(Field::new("b", DataType::Int64, false));
886 let c = Arc::new(Field::new("c", DataType::Int64, true));
887 let d = Arc::new(Field::new("d", DataType::Utf8, true));
888 let e = Arc::new(Field::new("e", DataType::Int64, false));
889 let f = Arc::new(Field::new("f", DataType::Int64, true));
890
891 let one = Arc::new(Field::new(
892 "1",
893 DataType::Struct(Fields::from(vec![a.clone(), b.clone(), c.clone()])),
894 true,
895 ));
896
897 let two = Arc::new(Field::new(
898 "2",
899 DataType::List(Arc::new(Field::new_list_field(
900 DataType::Struct(Fields::from(vec![d.clone(), e.clone(), f.clone()])),
901 true,
902 ))),
903 false,
904 ));
905
906 let exclamation = Arc::new(Field::new(
907 "!",
908 DataType::Struct(Fields::from(vec![one.clone(), two.clone()])),
909 false,
910 ));
911
912 let normalize_all = Schema::new(vec![exclamation.clone()])
913 .normalize(".", None)
914 .expect("valid normalization");
915
916 let expected = Schema::new(vec![
918 Field::new("!.1.a", DataType::Utf8, true),
919 Field::new("!.1.b", DataType::Int64, false),
920 Field::new("!.1.c", DataType::Int64, true),
921 Field::new(
922 "!.2",
923 DataType::List(Arc::new(Field::new_list_field(
924 DataType::Struct(Fields::from(vec![d.clone(), e.clone(), f.clone()])),
925 true,
926 ))),
927 false,
928 ),
929 ]);
930
931 assert_eq!(normalize_all, expected);
932 assert_eq!(normalize_all.fields().len(), 4);
933
934 let two = Arc::new(Field::new(
936 "2",
937 DataType::FixedSizeList(
938 Arc::new(Field::new_fixed_size_list(
939 "3",
940 Arc::new(Field::new_list_field(
941 DataType::Struct(Fields::from(vec![d.clone(), e.clone(), f.clone()])),
942 true,
943 )),
944 1,
945 true,
946 )),
947 1,
948 ),
949 false,
950 ));
951
952 let exclamation = Arc::new(Field::new(
953 "!",
954 DataType::Struct(Fields::from(vec![one.clone(), two])),
955 false,
956 ));
957
958 let normalize_all = Schema::new(vec![exclamation.clone()])
959 .normalize(".", None)
960 .expect("valid normalization");
961
962 let expected = Schema::new(vec![
964 Field::new("!.1.a", DataType::Utf8, true),
965 Field::new("!.1.b", DataType::Int64, false),
966 Field::new("!.1.c", DataType::Int64, true),
967 Field::new(
968 "!.2",
969 DataType::FixedSizeList(
970 Arc::new(Field::new_fixed_size_list(
971 "3",
972 Arc::new(Field::new_list_field(
973 DataType::Struct(Fields::from(vec![d.clone(), e.clone(), f.clone()])),
974 true,
975 )),
976 1,
977 true,
978 )),
979 1,
980 ),
981 false,
982 ),
983 ]);
984
985 assert_eq!(normalize_all, expected);
986 assert_eq!(normalize_all.fields().len(), 4);
987
988 let two = Arc::new(Field::new(
990 "2",
991 DataType::FixedSizeList(
992 Arc::new(Field::new_large_list(
993 "3",
994 Arc::new(Field::new_list_field(
995 DataType::Struct(Fields::from(vec![d.clone(), e.clone(), f.clone()])),
996 true,
997 )),
998 true,
999 )),
1000 1,
1001 ),
1002 false,
1003 ));
1004
1005 let exclamation = Arc::new(Field::new(
1006 "!",
1007 DataType::Struct(Fields::from(vec![one.clone(), two])),
1008 false,
1009 ));
1010
1011 let normalize_all = Schema::new(vec![exclamation.clone()])
1012 .normalize(".", None)
1013 .expect("valid normalization");
1014
1015 let expected = Schema::new(vec![
1017 Field::new("!.1.a", DataType::Utf8, true),
1018 Field::new("!.1.b", DataType::Int64, false),
1019 Field::new("!.1.c", DataType::Int64, true),
1020 Field::new(
1021 "!.2",
1022 DataType::FixedSizeList(
1023 Arc::new(Field::new_large_list(
1024 "3",
1025 Arc::new(Field::new_list_field(
1026 DataType::Struct(Fields::from(vec![d.clone(), e.clone(), f.clone()])),
1027 true,
1028 )),
1029 true,
1030 )),
1031 1,
1032 ),
1033 false,
1034 ),
1035 ]);
1036
1037 assert_eq!(normalize_all, expected);
1038 assert_eq!(normalize_all.fields().len(), 4);
1039 }
1040
1041 #[test]
1042 fn normalize_deep_nested() {
1043 let a = Arc::new(Field::new("a", DataType::Utf8, true));
1045 let b = Arc::new(Field::new("b", DataType::Int64, false));
1046 let c = Arc::new(Field::new("c", DataType::Int64, true));
1047 let d = Arc::new(Field::new("d", DataType::Utf8, true));
1048 let e = Arc::new(Field::new("e", DataType::Int64, false));
1049 let f = Arc::new(Field::new("f", DataType::Int64, true));
1050
1051 let one = Arc::new(Field::new(
1052 "1",
1053 DataType::Struct(Fields::from(vec![a.clone(), b.clone(), c.clone()])),
1054 true,
1055 ));
1056
1057 let two = Arc::new(Field::new(
1058 "2",
1059 DataType::List(Arc::new(Field::new_list_field(
1060 DataType::Struct(Fields::from(vec![d.clone(), e.clone(), f.clone()])),
1061 true,
1062 ))),
1063 false,
1064 ));
1065
1066 let l10 = Arc::new(Field::new(
1067 "l10",
1068 DataType::List(Arc::new(Field::new_list_field(
1069 DataType::Struct(Fields::from(vec![one, two])),
1070 true,
1071 ))),
1072 false,
1073 ));
1074
1075 let l9 = Arc::new(Field::new(
1076 "l9",
1077 DataType::List(Arc::new(Field::new_list_field(
1078 DataType::Struct(Fields::from(vec![l10])),
1079 true,
1080 ))),
1081 false,
1082 ));
1083
1084 let l8 = Arc::new(Field::new(
1085 "l8",
1086 DataType::List(Arc::new(Field::new_list_field(
1087 DataType::Struct(Fields::from(vec![l9])),
1088 true,
1089 ))),
1090 false,
1091 ));
1092 let l7 = Arc::new(Field::new(
1093 "l7",
1094 DataType::List(Arc::new(Field::new_list_field(
1095 DataType::Struct(Fields::from(vec![l8])),
1096 true,
1097 ))),
1098 false,
1099 ));
1100 let l6 = Arc::new(Field::new(
1101 "l6",
1102 DataType::List(Arc::new(Field::new_list_field(
1103 DataType::Struct(Fields::from(vec![l7])),
1104 true,
1105 ))),
1106 false,
1107 ));
1108 let l5 = Arc::new(Field::new(
1109 "l5",
1110 DataType::List(Arc::new(Field::new_list_field(
1111 DataType::Struct(Fields::from(vec![l6])),
1112 true,
1113 ))),
1114 false,
1115 ));
1116 let l4 = Arc::new(Field::new(
1117 "l4",
1118 DataType::List(Arc::new(Field::new_list_field(
1119 DataType::Struct(Fields::from(vec![l5])),
1120 true,
1121 ))),
1122 false,
1123 ));
1124 let l3 = Arc::new(Field::new(
1125 "l3",
1126 DataType::List(Arc::new(Field::new_list_field(
1127 DataType::Struct(Fields::from(vec![l4])),
1128 true,
1129 ))),
1130 false,
1131 ));
1132 let l2 = Arc::new(Field::new(
1133 "l2",
1134 DataType::List(Arc::new(Field::new_list_field(
1135 DataType::Struct(Fields::from(vec![l3])),
1136 true,
1137 ))),
1138 false,
1139 ));
1140 let l1 = Arc::new(Field::new(
1141 "l1",
1142 DataType::List(Arc::new(Field::new_list_field(
1143 DataType::Struct(Fields::from(vec![l2])),
1144 true,
1145 ))),
1146 false,
1147 ));
1148
1149 let normalize_all = Schema::new(vec![l1])
1150 .normalize(".", None)
1151 .expect("valid normalization");
1152
1153 assert_eq!(normalize_all.fields().len(), 1);
1154 }
1155
1156 #[test]
1157 fn normalize_dictionary() {
1158 let a = Arc::new(Field::new("a", DataType::Utf8, true));
1159 let b = Arc::new(Field::new("b", DataType::Int64, false));
1160
1161 let one = Arc::new(Field::new(
1162 "1",
1163 DataType::Dictionary(
1164 Box::new(DataType::Int32),
1165 Box::new(DataType::Struct(Fields::from(vec![a.clone(), b.clone()]))),
1166 ),
1167 false,
1168 ));
1169
1170 let normalize_all = Schema::new(vec![one.clone()])
1171 .normalize(".", None)
1172 .expect("valid normalization");
1173
1174 let expected = Schema::new(vec![Field::new(
1175 "1",
1176 DataType::Dictionary(
1177 Box::new(DataType::Int32),
1178 Box::new(DataType::Struct(Fields::from(vec![a.clone(), b.clone()]))),
1179 ),
1180 false,
1181 )]);
1182
1183 assert_eq!(normalize_all, expected);
1184 }
1185
1186 #[test]
1187 #[should_panic(
1188 expected = "Unable to get field named \\\"nickname\\\". Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]"
1189 )]
1190 fn schema_field_with_name() {
1191 let schema = person_schema();
1192 assert_eq!(
1193 schema.field_with_name("first_name").unwrap().name(),
1194 "first_name"
1195 );
1196 assert_eq!(
1197 schema.field_with_name("last_name").unwrap().name(),
1198 "last_name"
1199 );
1200 schema.field_with_name("nickname").unwrap();
1201 }
1202
1203 #[test]
1204 fn schema_field_with_dict_id() {
1205 let schema = person_schema();
1206
1207 #[allow(deprecated)]
1208 let fields_dict_123: Vec<_> = schema
1209 .fields_with_dict_id(123)
1210 .iter()
1211 .map(|f| f.name())
1212 .collect();
1213 assert_eq!(fields_dict_123, vec!["interests"]);
1214
1215 #[allow(deprecated)]
1216 let is_empty = schema.fields_with_dict_id(456).is_empty();
1217 assert!(is_empty);
1218 }
1219
1220 fn person_schema() -> Schema {
1221 let kv_array = [("k".to_string(), "v".to_string())];
1222 let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
1223 let first_name =
1224 Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);
1225
1226 Schema::new(vec![
1227 first_name,
1228 Field::new("last_name", DataType::Utf8, false),
1229 Field::new(
1230 "address",
1231 DataType::Struct(Fields::from(vec![
1232 Field::new("street", DataType::Utf8, false),
1233 Field::new("zip", DataType::UInt16, false),
1234 ])),
1235 false,
1236 ),
1237 #[allow(deprecated)]
1238 Field::new_dict(
1239 "interests",
1240 DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
1241 true,
1242 123,
1243 true,
1244 ),
1245 ])
1246 }
1247
1248 #[test]
1249 fn test_try_merge_field_with_metadata() {
1250 let metadata1: HashMap<String, String> = [("foo".to_string(), "bar".to_string())]
1252 .iter()
1253 .cloned()
1254 .collect();
1255 let f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata1);
1256
1257 let metadata2: HashMap<String, String> = [("foo".to_string(), "baz".to_string())]
1258 .iter()
1259 .cloned()
1260 .collect();
1261 let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2);
1262
1263 assert!(Schema::try_merge(vec![Schema::new(vec![f1]), Schema::new(vec![f2])]).is_err());
1264
1265 let mut f1 = Field::new("first_name", DataType::Utf8, false);
1267 let metadata2: HashMap<String, String> = [("missing".to_string(), "value".to_string())]
1268 .iter()
1269 .cloned()
1270 .collect();
1271 let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2);
1272
1273 assert!(f1.try_merge(&f2).is_ok());
1274 assert!(!f1.metadata().is_empty());
1275 assert_eq!(f1.metadata(), f2.metadata());
1276
1277 let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(
1279 [("foo".to_string(), "bar".to_string())]
1280 .iter()
1281 .cloned()
1282 .collect(),
1283 );
1284 let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(
1285 [("foo2".to_string(), "bar2".to_string())]
1286 .iter()
1287 .cloned()
1288 .collect(),
1289 );
1290
1291 assert!(f1.try_merge(&f2).is_ok());
1292 assert!(!f1.metadata().is_empty());
1293 assert_eq!(
1294 f1.metadata().clone(),
1295 [
1296 ("foo".to_string(), "bar".to_string()),
1297 ("foo2".to_string(), "bar2".to_string())
1298 ]
1299 .iter()
1300 .cloned()
1301 .collect()
1302 );
1303
1304 let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(
1306 [("foo".to_string(), "bar".to_string())]
1307 .iter()
1308 .cloned()
1309 .collect(),
1310 );
1311 let f2 = Field::new("first_name", DataType::Utf8, false);
1312 assert!(f1.try_merge(&f2).is_ok());
1313 assert!(!f1.metadata().is_empty());
1314 assert_eq!(
1315 f1.metadata().clone(),
1316 [("foo".to_string(), "bar".to_string())]
1317 .iter()
1318 .cloned()
1319 .collect()
1320 );
1321
1322 let mut f1 = Field::new("first_name", DataType::Utf8, false);
1324 let f2 = Field::new("first_name", DataType::Utf8, false);
1325 assert!(f1.try_merge(&f2).is_ok());
1326 assert!(f1.metadata().is_empty());
1327 }
1328
1329 #[test]
1330 fn test_schema_merge() {
1331 let merged = Schema::try_merge(vec![
1332 Schema::new(vec![
1333 Field::new("first_name", DataType::Utf8, false),
1334 Field::new("last_name", DataType::Utf8, false),
1335 Field::new(
1336 "address",
1337 DataType::Struct(vec![Field::new("zip", DataType::UInt16, false)].into()),
1338 false,
1339 ),
1340 ]),
1341 Schema::new_with_metadata(
1342 vec![
1343 Field::new("last_name", DataType::Utf8, true),
1345 Field::new(
1346 "address",
1347 DataType::Struct(Fields::from(vec![
1348 Field::new("street", DataType::Utf8, false),
1350 Field::new("zip", DataType::UInt16, true),
1352 ])),
1353 false,
1354 ),
1355 Field::new("number", DataType::Utf8, true),
1357 ],
1358 [("foo".to_string(), "bar".to_string())]
1359 .iter()
1360 .cloned()
1361 .collect::<HashMap<String, String>>(),
1362 ),
1363 ])
1364 .unwrap();
1365
1366 assert_eq!(
1367 merged,
1368 Schema::new_with_metadata(
1369 vec![
1370 Field::new("first_name", DataType::Utf8, false),
1371 Field::new("last_name", DataType::Utf8, true),
1372 Field::new(
1373 "address",
1374 DataType::Struct(Fields::from(vec![
1375 Field::new("zip", DataType::UInt16, true),
1376 Field::new("street", DataType::Utf8, false),
1377 ])),
1378 false,
1379 ),
1380 Field::new("number", DataType::Utf8, true),
1381 ],
1382 [("foo".to_string(), "bar".to_string())]
1383 .iter()
1384 .cloned()
1385 .collect::<HashMap<String, String>>()
1386 )
1387 );
1388
1389 assert_eq!(
1391 Schema::try_merge(vec![
1392 Schema::new(vec![Field::new_union(
1393 "c1",
1394 vec![0, 1],
1395 vec![
1396 Field::new("c11", DataType::Utf8, true),
1397 Field::new("c12", DataType::Utf8, true),
1398 ],
1399 UnionMode::Dense
1400 ),]),
1401 Schema::new(vec![Field::new_union(
1402 "c1",
1403 vec![1, 2],
1404 vec![
1405 Field::new("c12", DataType::Utf8, true),
1406 Field::new("c13", DataType::Time64(TimeUnit::Second), true),
1407 ],
1408 UnionMode::Dense
1409 ),])
1410 ])
1411 .unwrap(),
1412 Schema::new(vec![Field::new_union(
1413 "c1",
1414 vec![0, 1, 2],
1415 vec![
1416 Field::new("c11", DataType::Utf8, true),
1417 Field::new("c12", DataType::Utf8, true),
1418 Field::new("c13", DataType::Time64(TimeUnit::Second), true),
1419 ],
1420 UnionMode::Dense
1421 ),]),
1422 );
1423
1424 assert!(
1426 Schema::try_merge(vec![
1427 Schema::new(vec![
1428 Field::new("first_name", DataType::Utf8, false),
1429 Field::new("last_name", DataType::Utf8, false),
1430 ]),
1431 Schema::new(vec![Field::new("last_name", DataType::Int64, false),])
1432 ])
1433 .is_err()
1434 );
1435
1436 let res = Schema::try_merge(vec![
1438 Schema::new_with_metadata(
1439 vec![Field::new("first_name", DataType::Utf8, false)],
1440 [("foo".to_string(), "bar".to_string())]
1441 .iter()
1442 .cloned()
1443 .collect::<HashMap<String, String>>(),
1444 ),
1445 Schema::new_with_metadata(
1446 vec![Field::new("last_name", DataType::Utf8, false)],
1447 [("foo".to_string(), "baz".to_string())]
1448 .iter()
1449 .cloned()
1450 .collect::<HashMap<String, String>>(),
1451 ),
1452 ])
1453 .unwrap_err();
1454
1455 let expected = "Fail to merge schema due to conflicting metadata. Key 'foo' has different values 'bar' and 'baz'";
1456 assert!(
1457 res.to_string().contains(expected),
1458 "Could not find expected string '{expected}' in '{res}'"
1459 );
1460 }
1461
1462 #[test]
1463 fn test_schema_builder_change_field() {
1464 let mut builder = SchemaBuilder::new();
1465 builder.push(Field::new("a", DataType::Int32, false));
1466 builder.push(Field::new("b", DataType::Utf8, false));
1467 *builder.field_mut(1) = Arc::new(Field::new("c", DataType::Int32, false));
1468 assert_eq!(
1469 builder.fields,
1470 vec![
1471 Arc::new(Field::new("a", DataType::Int32, false)),
1472 Arc::new(Field::new("c", DataType::Int32, false))
1473 ]
1474 );
1475 }
1476
1477 #[test]
1478 fn test_schema_builder_reverse() {
1479 let mut builder = SchemaBuilder::new();
1480 builder.push(Field::new("a", DataType::Int32, false));
1481 builder.push(Field::new("b", DataType::Utf8, true));
1482 builder.reverse();
1483 assert_eq!(
1484 builder.fields,
1485 vec![
1486 Arc::new(Field::new("b", DataType::Utf8, true)),
1487 Arc::new(Field::new("a", DataType::Int32, false))
1488 ]
1489 );
1490 }
1491
1492 #[test]
1493 fn test_schema_builder_metadata() {
1494 let mut metadata = HashMap::with_capacity(1);
1495 metadata.insert("key".to_string(), "value".to_string());
1496
1497 let fields = vec![Field::new("test", DataType::Int8, true)];
1498 let mut builder: SchemaBuilder = Schema::new(fields).with_metadata(metadata).into();
1499 builder.metadata_mut().insert("k".into(), "v".into());
1500 let out = builder.finish();
1501 assert_eq!(out.metadata.len(), 2);
1502 assert_eq!(out.metadata["k"], "v");
1503 assert_eq!(out.metadata["key"], "value");
1504 }
1505}