arrow_schema/
field.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::error::ArrowError;
19use std::cmp::Ordering;
20use std::collections::HashMap;
21use std::hash::{Hash, Hasher};
22use std::sync::Arc;
23
24use crate::datatype::DataType;
25#[cfg(feature = "canonical_extension_types")]
26use crate::extension::CanonicalExtensionType;
27use crate::schema::SchemaBuilder;
28use crate::{
29    extension::{ExtensionType, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
30    Fields, UnionFields, UnionMode,
31};
32
/// A reference-counted [`Field`], i.e. a [`Field`] wrapped in an [`Arc`]
pub type FieldRef = Arc<Field>;
35
/// Describes a single column in a [`Schema`](super::Schema).
///
/// A [`Schema`](super::Schema) is an ordered collection of
/// [`Field`] objects.
///
/// Equality, ordering and hashing are implemented by hand in this file and
/// only take `name`, `data_type`, `nullable` and `metadata` into account;
/// `dict_id` and `dict_is_ordered` do not participate.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Field {
    /// Name of the field, see [`Field::name`].
    name: String,
    /// Data type of the field, see [`Field::data_type`].
    data_type: DataType,
    /// Whether the field supports null values, see [`Field::is_nullable`].
    nullable: bool,
    /// Dictionary id. The `dict_id` accessor only reports it when
    /// `data_type` is a dictionary type.
    #[deprecated(
        since = "54.0.0",
        note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it."
    )]
    dict_id: i64,
    /// Whether the dictionary is ordered. [`Field::dict_is_ordered`] only
    /// reports it when `data_type` is a dictionary type.
    dict_is_ordered: bool,
    /// A map of key-value pairs containing additional custom meta data.
    metadata: HashMap<String, String>,
}
55
56// Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered`
57// into comparison. However, these properties are only used in IPC context
58// for matching dictionary encoded data. They are not necessary to be same
59// to consider schema equality. For example, in C++ `Field` implementation,
60// it doesn't contain these dictionary properties too.
61impl PartialEq for Field {
62    fn eq(&self, other: &Self) -> bool {
63        self.name == other.name
64            && self.data_type == other.data_type
65            && self.nullable == other.nullable
66            && self.metadata == other.metadata
67    }
68}
69
// `Eq` is a marker trait without methods; the comparison itself is the
// hand-written `PartialEq` implementation for `Field`.
impl Eq for Field {}
71
72impl PartialOrd for Field {
73    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
74        Some(self.cmp(other))
75    }
76}
77
78impl Ord for Field {
79    fn cmp(&self, other: &Self) -> Ordering {
80        self.name
81            .cmp(other.name())
82            .then_with(|| self.data_type.cmp(other.data_type()))
83            .then_with(|| self.nullable.cmp(&other.nullable))
84            .then_with(|| {
85                // ensure deterministic key order
86                let mut keys: Vec<&String> =
87                    self.metadata.keys().chain(other.metadata.keys()).collect();
88                keys.sort();
89                for k in keys {
90                    match (self.metadata.get(k), other.metadata.get(k)) {
91                        (None, None) => {}
92                        (Some(_), None) => {
93                            return Ordering::Less;
94                        }
95                        (None, Some(_)) => {
96                            return Ordering::Greater;
97                        }
98                        (Some(v1), Some(v2)) => match v1.cmp(v2) {
99                            Ordering::Equal => {}
100                            other => {
101                                return other;
102                            }
103                        },
104                    }
105                }
106
107                Ordering::Equal
108            })
109    }
110}
111
112impl Hash for Field {
113    fn hash<H: Hasher>(&self, state: &mut H) {
114        self.name.hash(state);
115        self.data_type.hash(state);
116        self.nullable.hash(state);
117
118        // ensure deterministic key order
119        let mut keys: Vec<&String> = self.metadata.keys().collect();
120        keys.sort();
121        for k in keys {
122            k.hash(state);
123            self.metadata.get(k).expect("key valid").hash(state);
124        }
125    }
126}
127
128impl Field {
    /// Default list member field name
    ///
    /// This is the name [`Field::new_list_field`] gives to the fields it creates.
    pub const LIST_FIELD_DEFAULT_NAME: &'static str = "item";
131
132    /// Creates a new field with the given name, type, and nullability
133    pub fn new(name: impl Into<String>, data_type: DataType, nullable: bool) -> Self {
134        #[allow(deprecated)]
135        Field {
136            name: name.into(),
137            data_type,
138            nullable,
139            dict_id: 0,
140            dict_is_ordered: false,
141            metadata: HashMap::default(),
142        }
143    }
144
145    /// Creates a new `Field` suitable for [`DataType::List`] and
146    /// [`DataType::LargeList`]
147    ///
148    /// While not required, this method follows the convention of naming the
149    /// `Field` `"item"`.
150    ///
151    /// # Example
152    /// ```
153    /// # use arrow_schema::{Field, DataType};
154    /// assert_eq!(
155    ///   Field::new("item", DataType::Int32, true),
156    ///   Field::new_list_field(DataType::Int32, true)
157    /// );
158    /// ```
159    pub fn new_list_field(data_type: DataType, nullable: bool) -> Self {
160        Self::new(Self::LIST_FIELD_DEFAULT_NAME, data_type, nullable)
161    }
162
163    /// Creates a new field that has additional dictionary information
164    #[deprecated(
165        since = "54.0.0",
166        note = "The ability to preserve dictionary IDs will be removed. With the dict_id field disappearing this function signature will change by removing the dict_id parameter."
167    )]
168    pub fn new_dict(
169        name: impl Into<String>,
170        data_type: DataType,
171        nullable: bool,
172        dict_id: i64,
173        dict_is_ordered: bool,
174    ) -> Self {
175        #[allow(deprecated)]
176        Field {
177            name: name.into(),
178            data_type,
179            nullable,
180            dict_id,
181            dict_is_ordered,
182            metadata: HashMap::default(),
183        }
184    }
185
186    /// Create a new [`Field`] with [`DataType::Dictionary`]
187    ///
188    /// Use [`Self::new_dict`] for more advanced dictionary options
189    ///
190    /// # Panics
191    ///
192    /// Panics if [`!key.is_dictionary_key_type`][DataType::is_dictionary_key_type]
193    pub fn new_dictionary(
194        name: impl Into<String>,
195        key: DataType,
196        value: DataType,
197        nullable: bool,
198    ) -> Self {
199        assert!(
200            key.is_dictionary_key_type(),
201            "{key} is not a valid dictionary key"
202        );
203        let data_type = DataType::Dictionary(Box::new(key), Box::new(value));
204        Self::new(name, data_type, nullable)
205    }
206
207    /// Create a new [`Field`] with [`DataType::Struct`]
208    ///
209    /// - `name`: the name of the [`DataType::Struct`] field
210    /// - `fields`: the description of each struct element
211    /// - `nullable`: if the [`DataType::Struct`] array is nullable
212    pub fn new_struct(name: impl Into<String>, fields: impl Into<Fields>, nullable: bool) -> Self {
213        Self::new(name, DataType::Struct(fields.into()), nullable)
214    }
215
216    /// Create a new [`Field`] with [`DataType::List`]
217    ///
218    /// - `name`: the name of the [`DataType::List`] field
219    /// - `value`: the description of each list element
220    /// - `nullable`: if the [`DataType::List`] array is nullable
221    pub fn new_list(name: impl Into<String>, value: impl Into<FieldRef>, nullable: bool) -> Self {
222        Self::new(name, DataType::List(value.into()), nullable)
223    }
224
225    /// Create a new [`Field`] with [`DataType::LargeList`]
226    ///
227    /// - `name`: the name of the [`DataType::LargeList`] field
228    /// - `value`: the description of each list element
229    /// - `nullable`: if the [`DataType::LargeList`] array is nullable
230    pub fn new_large_list(
231        name: impl Into<String>,
232        value: impl Into<FieldRef>,
233        nullable: bool,
234    ) -> Self {
235        Self::new(name, DataType::LargeList(value.into()), nullable)
236    }
237
238    /// Create a new [`Field`] with [`DataType::FixedSizeList`]
239    ///
240    /// - `name`: the name of the [`DataType::FixedSizeList`] field
241    /// - `value`: the description of each list element
242    /// - `size`: the size of the fixed size list
243    /// - `nullable`: if the [`DataType::FixedSizeList`] array is nullable
244    pub fn new_fixed_size_list(
245        name: impl Into<String>,
246        value: impl Into<FieldRef>,
247        size: i32,
248        nullable: bool,
249    ) -> Self {
250        Self::new(name, DataType::FixedSizeList(value.into(), size), nullable)
251    }
252
253    /// Create a new [`Field`] with [`DataType::Map`]
254    ///
255    /// - `name`: the name of the [`DataType::Map`] field
256    /// - `entries`: the name of the inner [`DataType::Struct`] field
257    /// - `keys`: the map keys
258    /// - `values`: the map values
259    /// - `sorted`: if the [`DataType::Map`] array is sorted
260    /// - `nullable`: if the [`DataType::Map`] array is nullable
261    pub fn new_map(
262        name: impl Into<String>,
263        entries: impl Into<String>,
264        keys: impl Into<FieldRef>,
265        values: impl Into<FieldRef>,
266        sorted: bool,
267        nullable: bool,
268    ) -> Self {
269        let data_type = DataType::Map(
270            Arc::new(Field::new(
271                entries.into(),
272                DataType::Struct(Fields::from([keys.into(), values.into()])),
273                false, // The inner map field is always non-nullable (#1697),
274            )),
275            sorted,
276        );
277        Self::new(name, data_type, nullable)
278    }
279
280    /// Create a new [`Field`] with [`DataType::Union`]
281    ///
282    /// - `name`: the name of the [`DataType::Union`] field
283    /// - `type_ids`: the union type ids
284    /// - `fields`: the union fields
285    /// - `mode`: the union mode
286    pub fn new_union<S, F, T>(name: S, type_ids: T, fields: F, mode: UnionMode) -> Self
287    where
288        S: Into<String>,
289        F: IntoIterator,
290        F::Item: Into<FieldRef>,
291        T: IntoIterator<Item = i8>,
292    {
293        Self::new(
294            name,
295            DataType::Union(UnionFields::new(type_ids, fields), mode),
296            false, // Unions cannot be nullable
297        )
298    }
299
    /// Sets the `Field`'s optional custom metadata.
    ///
    /// The map passed in replaces the stored one as a whole; entries of the
    /// previous map are not merged into it.
    #[inline]
    pub fn set_metadata(&mut self, metadata: HashMap<String, String>) {
        self.metadata = metadata;
    }
305
306    /// Sets the metadata of this `Field` to be `metadata` and returns self
307    pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
308        self.set_metadata(metadata);
309        self
310    }
311
    /// Returns an immutable reference to the `Field`'s optional custom metadata.
    ///
    /// The map is empty for fields created through [`Field::new`] until
    /// metadata has been set.
    #[inline]
    pub const fn metadata(&self) -> &HashMap<String, String> {
        &self.metadata
    }
317
    /// Returns an immutable reference to the `Field`'s name.
    ///
    /// Use [`Field::with_name`] to obtain a field with a different name.
    #[inline]
    pub const fn name(&self) -> &String {
        &self.name
    }
323
324    /// Set the name of the [`Field`] and returns self.
325    ///
326    /// ```
327    /// # use arrow_schema::*;
328    /// let field = Field::new("c1", DataType::Int64, false)
329    ///    .with_name("c2");
330    ///
331    /// assert_eq!(field.name(), "c2");
332    /// ```
333    pub fn with_name(mut self, name: impl Into<String>) -> Self {
334        self.name = name.into();
335        self
336    }
337
    /// Returns an immutable reference to the [`Field`]'s [`DataType`].
    ///
    /// Use [`Field::with_data_type`] to obtain a field with a different type.
    #[inline]
    pub const fn data_type(&self) -> &DataType {
        &self.data_type
    }
343
344    /// Set [`DataType`] of the [`Field`] and returns self.
345    ///
346    /// ```
347    /// # use arrow_schema::*;
348    /// let field = Field::new("c1", DataType::Int64, false)
349    ///    .with_data_type(DataType::Utf8);
350    ///
351    /// assert_eq!(field.data_type(), &DataType::Utf8);
352    /// ```
353    pub fn with_data_type(mut self, data_type: DataType) -> Self {
354        self.data_type = data_type;
355        self
356    }
357
358    /// Returns the extension type name of this [`Field`], if set.
359    ///
360    /// This returns the value of [`EXTENSION_TYPE_NAME_KEY`], if set in
361    /// [`Field::metadata`]. If the key is missing, there is no extension type
362    /// name and this returns `None`.
363    ///
364    /// # Example
365    ///
366    /// ```
367    /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_NAME_KEY, Field};
368    ///
369    /// let field = Field::new("", DataType::Null, false);
370    /// assert_eq!(field.extension_type_name(), None);
371    ///
372    /// let field = Field::new("", DataType::Null, false).with_metadata(
373    ///    [(EXTENSION_TYPE_NAME_KEY.to_owned(), "example".to_owned())]
374    ///        .into_iter()
375    ///        .collect(),
376    /// );
377    /// assert_eq!(field.extension_type_name(), Some("example"));
378    /// ```
379    pub fn extension_type_name(&self) -> Option<&str> {
380        self.metadata()
381            .get(EXTENSION_TYPE_NAME_KEY)
382            .map(String::as_ref)
383    }
384
385    /// Returns the extension type metadata of this [`Field`], if set.
386    ///
387    /// This returns the value of [`EXTENSION_TYPE_METADATA_KEY`], if set in
388    /// [`Field::metadata`]. If the key is missing, there is no extension type
389    /// metadata and this returns `None`.
390    ///
391    /// # Example
392    ///
393    /// ```
394    /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_METADATA_KEY, Field};
395    ///
396    /// let field = Field::new("", DataType::Null, false);
397    /// assert_eq!(field.extension_type_metadata(), None);
398    ///
399    /// let field = Field::new("", DataType::Null, false).with_metadata(
400    ///    [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "example".to_owned())]
401    ///        .into_iter()
402    ///        .collect(),
403    /// );
404    /// assert_eq!(field.extension_type_metadata(), Some("example"));
405    /// ```
406    pub fn extension_type_metadata(&self) -> Option<&str> {
407        self.metadata()
408            .get(EXTENSION_TYPE_METADATA_KEY)
409            .map(String::as_ref)
410    }
411
412    /// Returns an instance of the given [`ExtensionType`] of this [`Field`],
413    /// if set in the [`Field::metadata`].
414    ///
415    /// # Error
416    ///
417    /// Returns an error if
418    /// - this field does not have the name of this extension type
419    ///   ([`ExtensionType::NAME`]) in the [`Field::metadata`] (mismatch or
420    ///   missing)
421    /// - the deserialization of the metadata
422    ///   ([`ExtensionType::deserialize_metadata`]) fails
423    /// - the construction of the extension type ([`ExtensionType::try_new`])
424    ///   fail (for example when the [`Field::data_type`] is not supported by
425    ///   the extension type ([`ExtensionType::supports_data_type`]))
426    pub fn try_extension_type<E: ExtensionType>(&self) -> Result<E, ArrowError> {
427        // Check the extension name in the metadata
428        match self.extension_type_name() {
429            // It should match the name of the given extension type
430            Some(name) if name == E::NAME => {
431                // Deserialize the metadata and try to construct the extension
432                // type
433                E::deserialize_metadata(self.extension_type_metadata())
434                    .and_then(|metadata| E::try_new(self.data_type(), metadata))
435            }
436            // Name mismatch
437            Some(name) => Err(ArrowError::InvalidArgumentError(format!(
438                "Field extension type name mismatch, expected {}, found {name}",
439                E::NAME
440            ))),
441            // Name missing
442            None => Err(ArrowError::InvalidArgumentError(
443                "Field extension type name missing".to_owned(),
444            )),
445        }
446    }
447
448    /// Returns an instance of the given [`ExtensionType`] of this [`Field`],
449    /// panics if this [`Field`] does not have this extension type.
450    ///
451    /// # Panic
452    ///
453    /// This calls [`Field::try_extension_type`] and panics when it returns an
454    /// error.
455    pub fn extension_type<E: ExtensionType>(&self) -> E {
456        self.try_extension_type::<E>()
457            .unwrap_or_else(|e| panic!("{e}"))
458    }
459
460    /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`]
461    /// and [`ExtensionType::metadata`] of the given [`ExtensionType`], if the
462    /// given extension type supports the [`Field::data_type`] of this field
463    /// ([`ExtensionType::supports_data_type`]).
464    ///
465    /// If the given extension type defines no metadata, a previously set
466    /// value of [`EXTENSION_TYPE_METADATA_KEY`] is cleared.
467    ///
468    /// # Error
469    ///
470    /// This functions returns an error if the data type of this field does not
471    /// match any of the supported storage types of the given extension type.
472    pub fn try_with_extension_type<E: ExtensionType>(
473        &mut self,
474        extension_type: E,
475    ) -> Result<(), ArrowError> {
476        // Make sure the data type of this field is supported
477        extension_type.supports_data_type(&self.data_type)?;
478
479        self.metadata
480            .insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned());
481        match extension_type.serialize_metadata() {
482            Some(metadata) => self
483                .metadata
484                .insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), metadata),
485            // If this extension type has no metadata, we make sure to
486            // clear previously set metadata.
487            None => self.metadata.remove(EXTENSION_TYPE_METADATA_KEY),
488        };
489
490        Ok(())
491    }
492
493    /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`]
494    /// and [`ExtensionType::metadata`] of the given [`ExtensionType`].
495    ///
496    /// # Panics
497    ///
498    /// This calls [`Field::try_with_extension_type`] and panics when it
499    /// returns an error.
500    pub fn with_extension_type<E: ExtensionType>(mut self, extension_type: E) -> Self {
501        self.try_with_extension_type(extension_type)
502            .unwrap_or_else(|e| panic!("{e}"));
503        self
504    }
505
506    /// Returns the [`CanonicalExtensionType`] of this [`Field`], if set.
507    ///
508    /// # Error
509    ///
510    /// Returns an error if
511    /// - this field does have a canonical extension type (mismatch or missing)
512    /// - the canonical extension is not supported
513    /// - the construction of the extension type fails
514    #[cfg(feature = "canonical_extension_types")]
515    pub fn try_canonical_extension_type(&self) -> Result<CanonicalExtensionType, ArrowError> {
516        CanonicalExtensionType::try_from(self)
517    }
518
    /// Indicates whether this [`Field`] supports null values.
    ///
    /// Use [`Field::with_nullable`] to obtain a field with a different flag.
    #[inline]
    pub const fn is_nullable(&self) -> bool {
        self.nullable
    }
524
525    /// Set `nullable` of the [`Field`] and returns self.
526    ///
527    /// ```
528    /// # use arrow_schema::*;
529    /// let field = Field::new("c1", DataType::Int64, false)
530    ///    .with_nullable(true);
531    ///
532    /// assert_eq!(field.is_nullable(), true);
533    /// ```
534    pub fn with_nullable(mut self, nullable: bool) -> Self {
535        self.nullable = nullable;
536        self
537    }
538
539    /// Returns a (flattened) [`Vec`] containing all child [`Field`]s
540    /// within `self` contained within this field (including `self`)
541    pub(crate) fn fields(&self) -> Vec<&Field> {
542        let mut collected_fields = vec![self];
543        collected_fields.append(&mut Field::_fields(&self.data_type));
544
545        collected_fields
546    }
547
548    fn _fields(dt: &DataType) -> Vec<&Field> {
549        match dt {
550            DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(),
551            DataType::Union(fields, _) => fields.iter().flat_map(|(_, f)| f.fields()).collect(),
552            DataType::List(field)
553            | DataType::LargeList(field)
554            | DataType::FixedSizeList(field, _)
555            | DataType::Map(field, _) => field.fields(),
556            DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref()),
557            DataType::RunEndEncoded(_, field) => field.fields(),
558            _ => vec![],
559        }
560    }
561
562    /// Returns a vector containing all (potentially nested) `Field` instances selected by the
563    /// dictionary ID they use
564    #[inline]
565    #[deprecated(
566        since = "54.0.0",
567        note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it."
568    )]
569    pub(crate) fn fields_with_dict_id(&self, id: i64) -> Vec<&Field> {
570        self.fields()
571            .into_iter()
572            .filter(|&field| {
573                #[allow(deprecated)]
574                let matching_dict_id = field.dict_id == id;
575                matches!(field.data_type(), DataType::Dictionary(_, _)) && matching_dict_id
576            })
577            .collect()
578    }
579
580    /// Returns the dictionary ID, if this is a dictionary type.
581    #[inline]
582    #[deprecated(
583        since = "54.0.0",
584        note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it."
585    )]
586    pub const fn dict_id(&self) -> Option<i64> {
587        match self.data_type {
588            #[allow(deprecated)]
589            DataType::Dictionary(_, _) => Some(self.dict_id),
590            _ => None,
591        }
592    }
593
594    /// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type.
595    ///
596    /// # Example
597    /// ```
598    /// # use arrow_schema::{DataType, Field};
599    /// // non dictionaries do not have a dict is ordered flat
600    /// let field = Field::new("c1", DataType::Int64, false);
601    /// assert_eq!(field.dict_is_ordered(), None);
602    /// // by default dictionary is not ordered
603    /// let field = Field::new("c1", DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), false);
604    /// assert_eq!(field.dict_is_ordered(), Some(false));
605    /// let field = field.with_dict_is_ordered(true);
606    /// assert_eq!(field.dict_is_ordered(), Some(true));
607    /// ```
608    #[inline]
609    pub const fn dict_is_ordered(&self) -> Option<bool> {
610        match self.data_type {
611            DataType::Dictionary(_, _) => Some(self.dict_is_ordered),
612            _ => None,
613        }
614    }
615
616    /// Set the is ordered field for this `Field`, if it is a dictionary.
617    ///
618    /// Does nothing if this is not a dictionary type.
619    ///
620    /// See [`Field::dict_is_ordered`] for more information.
621    pub fn with_dict_is_ordered(mut self, dict_is_ordered: bool) -> Self {
622        if matches!(self.data_type, DataType::Dictionary(_, _)) {
623            self.dict_is_ordered = dict_is_ordered;
624        };
625        self
626    }
627
628    /// Merge this field into self if it is compatible.
629    ///
630    /// Struct fields are merged recursively.
631    ///
632    /// NOTE: `self` may be updated to a partial / unexpected state in case of merge failure.
633    ///
634    /// Example:
635    ///
636    /// ```
637    /// # use arrow_schema::*;
638    /// let mut field = Field::new("c1", DataType::Int64, false);
639    /// assert!(field.try_merge(&Field::new("c1", DataType::Int64, true)).is_ok());
640    /// assert!(field.is_nullable());
641    /// ```
642    pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> {
643        #[allow(deprecated)]
644        if from.dict_id != self.dict_id {
645            return Err(ArrowError::SchemaError(format!(
646                "Fail to merge schema field '{}' because from dict_id = {} does not match {}",
647                self.name, from.dict_id, self.dict_id
648            )));
649        }
650        if from.dict_is_ordered != self.dict_is_ordered {
651            return Err(ArrowError::SchemaError(format!(
652                "Fail to merge schema field '{}' because from dict_is_ordered = {} does not match {}",
653                self.name, from.dict_is_ordered, self.dict_is_ordered
654            )));
655        }
656        // merge metadata
657        match (self.metadata().is_empty(), from.metadata().is_empty()) {
658            (false, false) => {
659                let mut merged = self.metadata().clone();
660                for (key, from_value) in from.metadata() {
661                    if let Some(self_value) = self.metadata.get(key) {
662                        if self_value != from_value {
663                            return Err(ArrowError::SchemaError(format!(
664                                "Fail to merge field '{}' due to conflicting metadata data value for key {}.
665                                    From value = {} does not match {}", self.name, key, from_value, self_value),
666                            ));
667                        }
668                    } else {
669                        merged.insert(key.clone(), from_value.clone());
670                    }
671                }
672                self.set_metadata(merged);
673            }
674            (true, false) => {
675                self.set_metadata(from.metadata().clone());
676            }
677            _ => {}
678        }
679        match &mut self.data_type {
680            DataType::Struct(nested_fields) => match &from.data_type {
681                DataType::Struct(from_nested_fields) => {
682                    let mut builder = SchemaBuilder::new();
683                    nested_fields.iter().chain(from_nested_fields).try_for_each(|f| builder.try_merge(f))?;
684                    *nested_fields = builder.finish().fields;
685                }
686                _ => {
687                    return Err(ArrowError::SchemaError(
688                        format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::Struct",
689                            self.name, from.data_type)
690                ))}
691            },
692            DataType::Union(nested_fields, _) => match &from.data_type {
693                DataType::Union(from_nested_fields, _) => {
694                    nested_fields.try_merge(from_nested_fields)?
695                }
696                _ => {
697                    return Err(ArrowError::SchemaError(
698                        format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::Union",
699                            self.name, from.data_type)
700                    ));
701                }
702            },
703            DataType::List(field) => match &from.data_type {
704                DataType::List(from_field) => {
705                    let mut f = (**field).clone();
706                    f.try_merge(from_field)?;
707                    (*field) = Arc::new(f);
708                },
709                _ => {
710                    return Err(ArrowError::SchemaError(
711                        format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::List",
712                            self.name, from.data_type)
713                ))}
714            },
715            DataType::LargeList(field) => match &from.data_type {
716                DataType::LargeList(from_field) => {
717                    let mut f = (**field).clone();
718                    f.try_merge(from_field)?;
719                    (*field) = Arc::new(f);
720                },
721                _ => {
722                    return Err(ArrowError::SchemaError(
723                        format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::LargeList",
724                            self.name, from.data_type)
725                ))}
726            },
727            DataType::Null => {
728                self.nullable = true;
729                self.data_type = from.data_type.clone();
730            }
731            | DataType::Boolean
732            | DataType::Int8
733            | DataType::Int16
734            | DataType::Int32
735            | DataType::Int64
736            | DataType::UInt8
737            | DataType::UInt16
738            | DataType::UInt32
739            | DataType::UInt64
740            | DataType::Float16
741            | DataType::Float32
742            | DataType::Float64
743            | DataType::Timestamp(_, _)
744            | DataType::Date32
745            | DataType::Date64
746            | DataType::Time32(_)
747            | DataType::Time64(_)
748            | DataType::Duration(_)
749            | DataType::Binary
750            | DataType::LargeBinary
751            | DataType::BinaryView
752            | DataType::Interval(_)
753            | DataType::LargeListView(_)
754            | DataType::ListView(_)
755            | DataType::Map(_, _)
756            | DataType::Dictionary(_, _)
757            | DataType::RunEndEncoded(_, _)
758            | DataType::FixedSizeList(_, _)
759            | DataType::FixedSizeBinary(_)
760            | DataType::Utf8
761            | DataType::LargeUtf8
762            | DataType::Utf8View
763            | DataType::Decimal128(_, _)
764            | DataType::Decimal256(_, _) => {
765                if from.data_type == DataType::Null {
766                    self.nullable = true;
767                } else if self.data_type != from.data_type {
768                    return Err(ArrowError::SchemaError(
769                        format!("Fail to merge schema field '{}' because the from data_type = {} does not equal {}",
770                            self.name, from.data_type, self.data_type)
771                    ));
772                }
773            }
774        }
775        self.nullable |= from.nullable;
776
777        Ok(())
778    }
779
780    /// Check to see if `self` is a superset of `other` field. Superset is defined as:
781    ///
782    /// * if nullability doesn't match, self needs to be nullable
783    /// * self.metadata is a superset of other.metadata
784    /// * all other fields are equal
785    pub fn contains(&self, other: &Field) -> bool {
786        #[allow(deprecated)]
787        let matching_dict_id = self.dict_id == other.dict_id;
788        self.name == other.name
789        && self.data_type.contains(&other.data_type)
790        && matching_dict_id
791        && self.dict_is_ordered == other.dict_is_ordered
792        // self need to be nullable or both of them are not nullable
793        && (self.nullable || !other.nullable)
794        // make sure self.metadata is a superset of other.metadata
795        && other.metadata.iter().all(|(k, v1)| {
796            self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default()
797        })
798    }
799
800    /// Return size of this instance in bytes.
801    ///
802    /// Includes the size of `Self`.
803    pub fn size(&self) -> usize {
804        std::mem::size_of_val(self) - std::mem::size_of_val(&self.data_type)
805            + self.data_type.size()
806            + self.name.capacity()
807            + (std::mem::size_of::<(String, String)>() * self.metadata.capacity())
808            + self
809                .metadata
810                .iter()
811                .map(|(k, v)| k.capacity() + v.capacity())
812                .sum::<usize>()
813    }
814}
815
816// TODO: improve display with crate https://crates.io/crates/derive_more ?
817impl std::fmt::Display for Field {
818    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
819        write!(f, "{self:?}")
820    }
821}
822
#[cfg(test)]
mod test {
    use super::*;
    use std::collections::hash_map::DefaultHasher;

    /// `Field::new` accepts both borrowed and owned names.
    #[test]
    fn test_new_with_string() {
        let borrowed = Field::new("c1", DataType::Int64, false);
        assert_eq!("c1", borrowed.name);

        // Fields should allow owned Strings to support reuse
        let s = String::from("c1");
        let owned = Field::new(s, DataType::Int64, false);
        assert_eq!("c1", owned.name);
    }

    /// `Field::new_dict` accepts both borrowed and owned names.
    #[test]
    fn test_new_dict_with_string() {
        #[allow(deprecated)]
        let borrowed = Field::new_dict("c1", DataType::Int64, false, 4, false);
        assert_eq!("c1", borrowed.name);

        // Fields should allow owned Strings to support reuse
        let s = String::from("c1");
        #[allow(deprecated)]
        let owned = Field::new_dict(s, DataType::Int64, false, 4, false);
        assert_eq!("c1", owned.name);
    }

    /// Merging fields with different, non-null data types must fail with a
    /// descriptive schema error.
    #[test]
    fn test_merge_incompatible_types() {
        let mut field = Field::new("c1", DataType::Int64, false);
        let result = field
            .try_merge(&Field::new("c1", DataType::Float32, true))
            .expect_err("should fail")
            .to_string();
        assert_eq!("Schema error: Fail to merge schema field 'c1' because the from data_type = Float32 does not equal Int64", result);
    }

    /// Merging with a `Null`-typed field keeps the concrete type and makes the
    /// result nullable, regardless of which side is `Null`.
    #[test]
    fn test_merge_with_null() {
        let mut field1 = Field::new("c1", DataType::Null, true);
        field1
            .try_merge(&Field::new("c1", DataType::Float32, false))
            .expect("should widen type to nullable float");
        assert_eq!(Field::new("c1", DataType::Float32, true), field1);

        let mut field2 = Field::new("c2", DataType::Utf8, false);
        field2
            .try_merge(&Field::new("c2", DataType::Null, true))
            .expect("should widen type to nullable utf8");
        assert_eq!(Field::new("c2", DataType::Utf8, true), field2);
    }

    /// `Null`-typed children nested in struct, list and large-list fields are
    /// merged into the concrete child type, which becomes nullable.
    #[test]
    fn test_merge_with_nested_null() {
        let mut struct1 = Field::new(
            "s1",
            DataType::Struct(Fields::from(vec![Field::new(
                "inner",
                DataType::Float32,
                false,
            )])),
            false,
        );

        let struct2 = Field::new(
            "s2",
            DataType::Struct(Fields::from(vec![Field::new(
                "inner",
                DataType::Null,
                false,
            )])),
            true,
        );

        struct1
            .try_merge(&struct2)
            .expect("should widen inner field's type to nullable float");
        assert_eq!(
            Field::new(
                "s1",
                DataType::Struct(Fields::from(vec![Field::new(
                    "inner",
                    DataType::Float32,
                    true,
                )])),
                true,
            ),
            struct1
        );

        let mut list1 = Field::new(
            "l1",
            DataType::List(Field::new("inner", DataType::Float32, false).into()),
            false,
        );

        let list2 = Field::new(
            "l2",
            DataType::List(Field::new("inner", DataType::Null, false).into()),
            true,
        );

        list1
            .try_merge(&list2)
            .expect("should widen inner field's type to nullable float");
        assert_eq!(
            Field::new(
                "l1",
                DataType::List(Field::new("inner", DataType::Float32, true).into()),
                true,
            ),
            list1
        );

        let mut large_list1 = Field::new(
            "ll1",
            DataType::LargeList(Field::new("inner", DataType::Float32, false).into()),
            false,
        );

        let large_list2 = Field::new(
            "ll2",
            DataType::LargeList(Field::new("inner", DataType::Null, false).into()),
            true,
        );

        large_list1
            .try_merge(&large_list2)
            .expect("should widen inner field's type to nullable float");
        assert_eq!(
            Field::new(
                "ll1",
                DataType::LargeList(Field::new("inner", DataType::Float32, true).into()),
                true,
            ),
            large_list1
        );
    }

    /// Looking up fields by dictionary id finds the matching dictionary fields
    /// inside a nested struct/list hierarchy.
    #[test]
    fn test_fields_with_dict_id() {
        #[allow(deprecated)]
        let dict1 = Field::new_dict(
            "dict1",
            DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
            false,
            10,
            false,
        );
        #[allow(deprecated)]
        let dict2 = Field::new_dict(
            "dict2",
            DataType::Dictionary(DataType::Int32.into(), DataType::Int8.into()),
            false,
            20,
            false,
        );

        let field = Field::new(
            "struct<dict1, list[struct<dict2, list[struct<dict1]>]>",
            DataType::Struct(Fields::from(vec![
                dict1.clone(),
                Field::new(
                    "list[struct<dict1, list[struct<dict2>]>]",
                    DataType::List(Arc::new(Field::new(
                        "struct<dict1, list[struct<dict2>]>",
                        DataType::Struct(Fields::from(vec![
                            dict1.clone(),
                            Field::new(
                                "list[struct<dict2>]",
                                DataType::List(Arc::new(Field::new(
                                    "struct<dict2>",
                                    DataType::Struct(vec![dict2.clone()].into()),
                                    false,
                                ))),
                                false,
                            ),
                        ])),
                        false,
                    ))),
                    false,
                ),
            ])),
            false,
        );

        // Count the matches so that an empty result cannot make the test pass
        // vacuously.
        let mut matches_for_10 = 0;
        #[allow(deprecated)]
        for found in field.fields_with_dict_id(10) {
            assert_eq!(dict1, *found);
            matches_for_10 += 1;
        }
        assert_ne!(
            0, matches_for_10,
            "expected at least one field with dict id 10"
        );

        let mut matches_for_20 = 0;
        #[allow(deprecated)]
        for found in field.fields_with_dict_id(20) {
            assert_eq!(dict2, *found);
            matches_for_20 += 1;
        }
        assert_ne!(
            0, matches_for_20,
            "expected at least one field with dict id 20"
        );
    }

    /// Hashes `field` with a fresh `DefaultHasher` and returns the digest.
    fn get_field_hash(field: &Field) -> u64 {
        let mut s = DefaultHasher::new();
        field.hash(&mut s);
        s.finish()
    }

    /// Equality and hashing ignore the dictionary id but do take the field
    /// name into account.
    #[test]
    fn test_field_comparison_case() {
        // dictionary-encoding properties not used for field comparison
        #[allow(deprecated)]
        let dict1 = Field::new_dict(
            "dict1",
            DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
            false,
            10,
            false,
        );
        #[allow(deprecated)]
        let dict2 = Field::new_dict(
            "dict1",
            DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
            false,
            20,
            false,
        );

        assert_eq!(dict1, dict2);
        assert_eq!(get_field_hash(&dict1), get_field_hash(&dict2));

        // Same dictionary settings as the first field, but a different name.
        #[allow(deprecated)]
        let dict1 = Field::new_dict(
            "dict0",
            DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
            false,
            10,
            false,
        );

        assert_ne!(dict1, dict2);
        assert_ne!(get_field_hash(&dict1), get_field_hash(&dict2));
    }

    /// Fields that differ only in metadata are ordered consistently.
    #[test]
    fn test_field_comparison_metadata() {
        let f1 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
            (String::from("k1"), String::from("v1")),
            (String::from("k2"), String::from("v2")),
        ]));
        let f2 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
            (String::from("k1"), String::from("v1")),
            (String::from("k3"), String::from("v3")),
        ]));
        let f3 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
            (String::from("k1"), String::from("v1")),
            (String::from("k3"), String::from("v4")),
        ]));

        assert!(f1.cmp(&f2).is_lt());
        assert!(f2.cmp(&f3).is_lt());
        assert!(f1.cmp(&f3).is_lt());
    }

    /// A field always contains itself.
    #[test]
    fn test_contains_reflexivity() {
        let mut field = Field::new("field1", DataType::Float16, false);
        field.set_metadata(HashMap::from([
            (String::from("k0"), String::from("v0")),
            (String::from("k1"), String::from("v1")),
        ]));
        assert!(field.contains(&field))
    }

    /// `contains` is transitive across successively merged fields and is not
    /// symmetric.
    #[test]
    fn test_contains_transitivity() {
        let child_field = Field::new("child1", DataType::Float16, false);

        let mut field1 = Field::new(
            "field1",
            DataType::Struct(Fields::from(vec![child_field])),
            false,
        );
        field1.set_metadata(HashMap::from([(String::from("k1"), String::from("v1"))]));

        let mut field2 = Field::new("field1", DataType::Struct(Fields::default()), true);
        field2.set_metadata(HashMap::from([(String::from("k2"), String::from("v2"))]));
        field2.try_merge(&field1).unwrap();

        let mut field3 = Field::new("field1", DataType::Struct(Fields::default()), false);
        field3.set_metadata(HashMap::from([(String::from("k3"), String::from("v3"))]));
        field3.try_merge(&field2).unwrap();

        assert!(field2.contains(&field1));
        assert!(field3.contains(&field2));
        assert!(field3.contains(&field1));

        assert!(!field1.contains(&field2));
        assert!(!field1.contains(&field3));
        assert!(!field2.contains(&field3));
    }

    /// A nullable field contains its non-nullable counterpart, but not the
    /// other way around.
    #[test]
    fn test_contains_nullable() {
        let field1 = Field::new("field1", DataType::Boolean, true);
        let field2 = Field::new("field1", DataType::Boolean, false);
        assert!(field1.contains(&field2));
        assert!(!field2.contains(&field1));
    }

    /// Struct and union fields only contain each other when their child
    /// fields (and union type ids) line up.
    #[test]
    fn test_contains_must_have_same_fields() {
        let child_field1 = Field::new("child1", DataType::Float16, false);
        let child_field2 = Field::new("child2", DataType::Float16, false);

        let field1 = Field::new(
            "field1",
            DataType::Struct(vec![child_field1.clone()].into()),
            true,
        );
        let field2 = Field::new(
            "field1",
            DataType::Struct(vec![child_field1, child_field2].into()),
            true,
        );

        assert!(!field1.contains(&field2));
        assert!(!field2.contains(&field1));

        // UnionFields with different type ID
        let field1 = Field::new(
            "field1",
            DataType::Union(
                UnionFields::new(
                    vec![1, 2],
                    vec![
                        Field::new("field1", DataType::UInt8, true),
                        Field::new("field3", DataType::Utf8, false),
                    ],
                ),
                UnionMode::Dense,
            ),
            true,
        );
        let field2 = Field::new(
            "field1",
            DataType::Union(
                UnionFields::new(
                    vec![1, 3],
                    vec![
                        Field::new("field1", DataType::UInt8, false),
                        Field::new("field3", DataType::Utf8, false),
                    ],
                ),
                UnionMode::Dense,
            ),
            true,
        );
        assert!(!field1.contains(&field2));

        // UnionFields with same type ID
        let field1 = Field::new(
            "field1",
            DataType::Union(
                UnionFields::new(
                    vec![1, 2],
                    vec![
                        Field::new("field1", DataType::UInt8, true),
                        Field::new("field3", DataType::Utf8, false),
                    ],
                ),
                UnionMode::Dense,
            ),
            true,
        );
        let field2 = Field::new(
            "field1",
            DataType::Union(
                UnionFields::new(
                    vec![1, 2],
                    vec![
                        Field::new("field1", DataType::UInt8, false),
                        Field::new("field3", DataType::Utf8, false),
                    ],
                ),
                UnionMode::Dense,
            ),
            true,
        );
        assert!(field1.contains(&field2));
    }

    /// Serializes `field` with bincode, deserializes it again and asserts the
    /// result equals the input.
    #[cfg(feature = "serde")]
    fn assert_binary_serde_round_trip(field: Field) {
        let serialized = bincode::serialize(&field).unwrap();
        let deserialized: Field = bincode::deserialize(&serialized).unwrap();
        assert_eq!(field, deserialized)
    }

    /// Round trip of a field that never had metadata set.
    #[cfg(feature = "serde")]
    #[test]
    fn test_field_without_metadata_serde() {
        let field = Field::new("name", DataType::Boolean, true);
        assert_binary_serde_round_trip(field)
    }

    /// Round trip of a field with an explicitly empty metadata map.
    #[cfg(feature = "serde")]
    #[test]
    fn test_field_with_empty_metadata_serde() {
        let field = Field::new("name", DataType::Boolean, false).with_metadata(HashMap::new());

        assert_binary_serde_round_trip(field)
    }

    /// Round trip of a field whose metadata holds an entry with an empty value.
    #[cfg(feature = "serde")]
    #[test]
    fn test_field_with_nonempty_metadata_serde() {
        let mut metadata = HashMap::new();
        metadata.insert("hi".to_owned(), "".to_owned());
        let field = Field::new("name", DataType::Boolean, false).with_metadata(metadata);

        assert_binary_serde_round_trip(field)
    }
}