arrow_schema/
field.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::error::ArrowError;
19use std::cmp::Ordering;
20use std::collections::HashMap;
21use std::hash::{Hash, Hasher};
22use std::sync::Arc;
23
24use crate::datatype::DataType;
25#[cfg(feature = "canonical_extension_types")]
26use crate::extension::CanonicalExtensionType;
27use crate::schema::SchemaBuilder;
28use crate::{
29    extension::{ExtensionType, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
30    Fields, UnionFields, UnionMode,
31};
32
33/// A reference counted [`Field`]
34pub type FieldRef = Arc<Field>;
35
/// Describes a single column in a [`Schema`](super::Schema).
///
/// A [`Schema`](super::Schema) is an ordered collection of
/// [`Field`] objects. Fields contain:
/// * `name`: the name of the field
/// * `data_type`: the type of the field
/// * `nullable`: if the field is nullable
/// * `metadata`: a map of key-value pairs containing additional custom metadata
///
/// Arrow extension types are encoded in a `Field`'s metadata. See
/// [`Self::try_extension_type`] to retrieve the [`ExtensionType`], if any.
///
/// Note that the dictionary related properties (`dict_id` and
/// `dict_is_ordered`) do not take part in the `PartialEq`, `Ord` and `Hash`
/// implementations of this type.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Field {
    /// The name of the field.
    name: String,
    /// The type of the values described by this field.
    data_type: DataType,
    /// Whether the field may contain null values.
    nullable: bool,
    /// Dictionary ID. Only reported by [`Field::dict_id`] when `data_type` is
    /// [`DataType::Dictionary`].
    #[deprecated(
        since = "54.0.0",
        note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it."
    )]
    dict_id: i64,
    /// Whether the dictionary is ordered. Only reported by
    /// [`Field::dict_is_ordered`] when `data_type` is [`DataType::Dictionary`].
    dict_is_ordered: bool,
    /// A map of key-value pairs containing additional custom metadata.
    metadata: HashMap<String, String>,
}
62
63// Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered`
64// into comparison. However, these properties are only used in IPC context
65// for matching dictionary encoded data. They are not necessary to be same
66// to consider schema equality. For example, in C++ `Field` implementation,
67// it doesn't contain these dictionary properties too.
68impl PartialEq for Field {
69    fn eq(&self, other: &Self) -> bool {
70        self.name == other.name
71            && self.data_type == other.data_type
72            && self.nullable == other.nullable
73            && self.metadata == other.metadata
74    }
75}
76
// Marks the hand-written `PartialEq` implementation of `Field` as a full
// equivalence relation.
impl Eq for Field {}
78
impl PartialOrd for Field {
    /// Delegates to [`Ord::cmp`], so any two fields are always comparable and
    /// this never returns `None`.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
84
85impl Ord for Field {
86    fn cmp(&self, other: &Self) -> Ordering {
87        self.name
88            .cmp(other.name())
89            .then_with(|| self.data_type.cmp(other.data_type()))
90            .then_with(|| self.nullable.cmp(&other.nullable))
91            .then_with(|| {
92                // ensure deterministic key order
93                let mut keys: Vec<&String> =
94                    self.metadata.keys().chain(other.metadata.keys()).collect();
95                keys.sort();
96                for k in keys {
97                    match (self.metadata.get(k), other.metadata.get(k)) {
98                        (None, None) => {}
99                        (Some(_), None) => {
100                            return Ordering::Less;
101                        }
102                        (None, Some(_)) => {
103                            return Ordering::Greater;
104                        }
105                        (Some(v1), Some(v2)) => match v1.cmp(v2) {
106                            Ordering::Equal => {}
107                            other => {
108                                return other;
109                            }
110                        },
111                    }
112                }
113
114                Ordering::Equal
115            })
116    }
117}
118
119impl Hash for Field {
120    fn hash<H: Hasher>(&self, state: &mut H) {
121        self.name.hash(state);
122        self.data_type.hash(state);
123        self.nullable.hash(state);
124
125        // ensure deterministic key order
126        let mut keys: Vec<&String> = self.metadata.keys().collect();
127        keys.sort();
128        for k in keys {
129            k.hash(state);
130            self.metadata.get(k).expect("key valid").hash(state);
131        }
132    }
133}
134
135impl Field {
136    /// Default list member field name
137    pub const LIST_FIELD_DEFAULT_NAME: &'static str = "item";
138
139    /// Creates a new field with the given name, data type, and nullability
140    ///
141    /// # Example
142    /// ```
143    /// # use arrow_schema::{Field, DataType};
144    /// Field::new("field_name", DataType::Int32, true);
145    /// ```
146    pub fn new(name: impl Into<String>, data_type: DataType, nullable: bool) -> Self {
147        #[allow(deprecated)]
148        Field {
149            name: name.into(),
150            data_type,
151            nullable,
152            dict_id: 0,
153            dict_is_ordered: false,
154            metadata: HashMap::default(),
155        }
156    }
157
    /// Creates a new `Field` suitable for [`DataType::List`] and
    /// [`DataType::LargeList`]
    ///
    /// While not required, this method follows the convention of naming the
    /// `Field` `"item"` (see [`Self::LIST_FIELD_DEFAULT_NAME`]).
    ///
    /// - `data_type`: the type of the list elements
    /// - `nullable`: if the list elements are nullable
    ///
    /// # Example
    /// ```
    /// # use arrow_schema::{Field, DataType};
    /// assert_eq!(
    ///   Field::new("item", DataType::Int32, true),
    ///   Field::new_list_field(DataType::Int32, true)
    /// );
    /// ```
    pub fn new_list_field(data_type: DataType, nullable: bool) -> Self {
        Self::new(Self::LIST_FIELD_DEFAULT_NAME, data_type, nullable)
    }
175
176    /// Creates a new field that has additional dictionary information
177    #[deprecated(
178        since = "54.0.0",
179        note = "The ability to preserve dictionary IDs will be removed. With the dict_id field disappearing this function signature will change by removing the dict_id parameter."
180    )]
181    pub fn new_dict(
182        name: impl Into<String>,
183        data_type: DataType,
184        nullable: bool,
185        dict_id: i64,
186        dict_is_ordered: bool,
187    ) -> Self {
188        #[allow(deprecated)]
189        Field {
190            name: name.into(),
191            data_type,
192            nullable,
193            dict_id,
194            dict_is_ordered,
195            metadata: HashMap::default(),
196        }
197    }
198
199    /// Create a new [`Field`] with [`DataType::Dictionary`]
200    ///
201    /// Use [`Self::new_dict`] for more advanced dictionary options
202    ///
203    /// # Panics
204    ///
205    /// Panics if [`!key.is_dictionary_key_type`][DataType::is_dictionary_key_type]
206    pub fn new_dictionary(
207        name: impl Into<String>,
208        key: DataType,
209        value: DataType,
210        nullable: bool,
211    ) -> Self {
212        assert!(
213            key.is_dictionary_key_type(),
214            "{key} is not a valid dictionary key"
215        );
216        let data_type = DataType::Dictionary(Box::new(key), Box::new(value));
217        Self::new(name, data_type, nullable)
218    }
219
220    /// Create a new [`Field`] with [`DataType::Struct`]
221    ///
222    /// - `name`: the name of the [`DataType::Struct`] field
223    /// - `fields`: the description of each struct element
224    /// - `nullable`: if the [`DataType::Struct`] array is nullable
225    pub fn new_struct(name: impl Into<String>, fields: impl Into<Fields>, nullable: bool) -> Self {
226        Self::new(name, DataType::Struct(fields.into()), nullable)
227    }
228
229    /// Create a new [`Field`] with [`DataType::List`]
230    ///
231    /// - `name`: the name of the [`DataType::List`] field
232    /// - `value`: the description of each list element
233    /// - `nullable`: if the [`DataType::List`] array is nullable
234    pub fn new_list(name: impl Into<String>, value: impl Into<FieldRef>, nullable: bool) -> Self {
235        Self::new(name, DataType::List(value.into()), nullable)
236    }
237
238    /// Create a new [`Field`] with [`DataType::LargeList`]
239    ///
240    /// - `name`: the name of the [`DataType::LargeList`] field
241    /// - `value`: the description of each list element
242    /// - `nullable`: if the [`DataType::LargeList`] array is nullable
243    pub fn new_large_list(
244        name: impl Into<String>,
245        value: impl Into<FieldRef>,
246        nullable: bool,
247    ) -> Self {
248        Self::new(name, DataType::LargeList(value.into()), nullable)
249    }
250
251    /// Create a new [`Field`] with [`DataType::FixedSizeList`]
252    ///
253    /// - `name`: the name of the [`DataType::FixedSizeList`] field
254    /// - `value`: the description of each list element
255    /// - `size`: the size of the fixed size list
256    /// - `nullable`: if the [`DataType::FixedSizeList`] array is nullable
257    pub fn new_fixed_size_list(
258        name: impl Into<String>,
259        value: impl Into<FieldRef>,
260        size: i32,
261        nullable: bool,
262    ) -> Self {
263        Self::new(name, DataType::FixedSizeList(value.into(), size), nullable)
264    }
265
266    /// Create a new [`Field`] with [`DataType::Map`]
267    ///
268    /// - `name`: the name of the [`DataType::Map`] field
269    /// - `entries`: the name of the inner [`DataType::Struct`] field
270    /// - `keys`: the map keys
271    /// - `values`: the map values
272    /// - `sorted`: if the [`DataType::Map`] array is sorted
273    /// - `nullable`: if the [`DataType::Map`] array is nullable
274    pub fn new_map(
275        name: impl Into<String>,
276        entries: impl Into<String>,
277        keys: impl Into<FieldRef>,
278        values: impl Into<FieldRef>,
279        sorted: bool,
280        nullable: bool,
281    ) -> Self {
282        let data_type = DataType::Map(
283            Arc::new(Field::new(
284                entries.into(),
285                DataType::Struct(Fields::from([keys.into(), values.into()])),
286                false, // The inner map field is always non-nullable (#1697),
287            )),
288            sorted,
289        );
290        Self::new(name, data_type, nullable)
291    }
292
293    /// Create a new [`Field`] with [`DataType::Union`]
294    ///
295    /// - `name`: the name of the [`DataType::Union`] field
296    /// - `type_ids`: the union type ids
297    /// - `fields`: the union fields
298    /// - `mode`: the union mode
299    pub fn new_union<S, F, T>(name: S, type_ids: T, fields: F, mode: UnionMode) -> Self
300    where
301        S: Into<String>,
302        F: IntoIterator,
303        F::Item: Into<FieldRef>,
304        T: IntoIterator<Item = i8>,
305    {
306        Self::new(
307            name,
308            DataType::Union(UnionFields::new(type_ids, fields), mode),
309            false, // Unions cannot be nullable
310        )
311    }
312
    /// Sets the `Field`'s optional custom metadata.
    ///
    /// The given map replaces the current metadata as a whole; entries that
    /// were set before are not kept.
    #[inline]
    pub fn set_metadata(&mut self, metadata: HashMap<String, String>) {
        self.metadata = metadata;
    }
318
    /// Sets the metadata of this `Field` to be `metadata` and returns self
    ///
    /// This is the builder-style counterpart of [`Self::set_metadata`]: the
    /// current metadata is replaced as a whole.
    pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
        self.set_metadata(metadata);
        self
    }
324
    /// Returns an immutable reference to the `Field`'s optional custom metadata.
    ///
    /// The map is empty if no metadata has been set.
    #[inline]
    pub const fn metadata(&self) -> &HashMap<String, String> {
        &self.metadata
    }
330
    /// Returns a mutable reference to the `Field`'s optional custom metadata.
    ///
    /// This allows individual entries to be inserted, changed or removed
    /// without replacing the whole map.
    #[inline]
    pub fn metadata_mut(&mut self) -> &mut HashMap<String, String> {
        &mut self.metadata
    }
336
    /// Returns an immutable reference to the `Field`'s name.
    // NOTE(review): returns `&String` rather than `&str`; changing the return
    // type would alter the public signature, so it is left as is.
    #[inline]
    pub const fn name(&self) -> &String {
        &self.name
    }
342
    /// Set the name of this [`Field`]
    ///
    /// Accepts anything convertible into a `String`; the previous name is
    /// replaced.
    #[inline]
    pub fn set_name(&mut self, name: impl Into<String>) {
        self.name = name.into();
    }
348
    /// Set the name of the [`Field`] and returns self.
    ///
    /// This is the builder-style counterpart of [`Self::set_name`].
    ///
    /// ```
    /// # use arrow_schema::*;
    /// let field = Field::new("c1", DataType::Int64, false)
    ///    .with_name("c2");
    ///
    /// assert_eq!(field.name(), "c2");
    /// ```
    pub fn with_name(mut self, name: impl Into<String>) -> Self {
        self.set_name(name);
        self
    }
362
    /// Returns an immutable reference to the [`Field`]'s [`DataType`].
    #[inline]
    pub const fn data_type(&self) -> &DataType {
        &self.data_type
    }
368
    /// Set [`DataType`] of the [`Field`]
    ///
    /// Only the data type is replaced; name, nullability, metadata and the
    /// dictionary properties are left untouched.
    ///
    /// ```
    /// # use arrow_schema::*;
    /// let mut field = Field::new("c1", DataType::Int64, false);
    /// field.set_data_type(DataType::Utf8);
    ///
    /// assert_eq!(field.data_type(), &DataType::Utf8);
    /// ```
    #[inline]
    pub fn set_data_type(&mut self, data_type: DataType) {
        self.data_type = data_type;
    }
382
    /// Set [`DataType`] of the [`Field`] and returns self.
    ///
    /// This is the builder-style counterpart of [`Self::set_data_type`].
    ///
    /// ```
    /// # use arrow_schema::*;
    /// let field = Field::new("c1", DataType::Int64, false)
    ///    .with_data_type(DataType::Utf8);
    ///
    /// assert_eq!(field.data_type(), &DataType::Utf8);
    /// ```
    pub fn with_data_type(mut self, data_type: DataType) -> Self {
        self.set_data_type(data_type);
        self
    }
396
397    /// Returns the extension type name of this [`Field`], if set.
398    ///
399    /// This returns the value of [`EXTENSION_TYPE_NAME_KEY`], if set in
400    /// [`Field::metadata`]. If the key is missing, there is no extension type
401    /// name and this returns `None`.
402    ///
403    /// # Example
404    ///
405    /// ```
406    /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_NAME_KEY, Field};
407    ///
408    /// let field = Field::new("", DataType::Null, false);
409    /// assert_eq!(field.extension_type_name(), None);
410    ///
411    /// let field = Field::new("", DataType::Null, false).with_metadata(
412    ///    [(EXTENSION_TYPE_NAME_KEY.to_owned(), "example".to_owned())]
413    ///        .into_iter()
414    ///        .collect(),
415    /// );
416    /// assert_eq!(field.extension_type_name(), Some("example"));
417    /// ```
418    pub fn extension_type_name(&self) -> Option<&str> {
419        self.metadata()
420            .get(EXTENSION_TYPE_NAME_KEY)
421            .map(String::as_ref)
422    }
423
424    /// Returns the extension type metadata of this [`Field`], if set.
425    ///
426    /// This returns the value of [`EXTENSION_TYPE_METADATA_KEY`], if set in
427    /// [`Field::metadata`]. If the key is missing, there is no extension type
428    /// metadata and this returns `None`.
429    ///
430    /// # Example
431    ///
432    /// ```
433    /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_METADATA_KEY, Field};
434    ///
435    /// let field = Field::new("", DataType::Null, false);
436    /// assert_eq!(field.extension_type_metadata(), None);
437    ///
438    /// let field = Field::new("", DataType::Null, false).with_metadata(
439    ///    [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "example".to_owned())]
440    ///        .into_iter()
441    ///        .collect(),
442    /// );
443    /// assert_eq!(field.extension_type_metadata(), Some("example"));
444    /// ```
445    pub fn extension_type_metadata(&self) -> Option<&str> {
446        self.metadata()
447            .get(EXTENSION_TYPE_METADATA_KEY)
448            .map(String::as_ref)
449    }
450
451    /// Returns an instance of the given [`ExtensionType`] of this [`Field`],
452    /// if set in the [`Field::metadata`].
453    ///
454    /// # Error
455    ///
456    /// Returns an error if
457    /// - this field does not have the name of this extension type
458    ///   ([`ExtensionType::NAME`]) in the [`Field::metadata`] (mismatch or
459    ///   missing)
460    /// - the deserialization of the metadata
461    ///   ([`ExtensionType::deserialize_metadata`]) fails
462    /// - the construction of the extension type ([`ExtensionType::try_new`])
463    ///   fail (for example when the [`Field::data_type`] is not supported by
464    ///   the extension type ([`ExtensionType::supports_data_type`]))
465    pub fn try_extension_type<E: ExtensionType>(&self) -> Result<E, ArrowError> {
466        // Check the extension name in the metadata
467        match self.extension_type_name() {
468            // It should match the name of the given extension type
469            Some(name) if name == E::NAME => {
470                // Deserialize the metadata and try to construct the extension
471                // type
472                E::deserialize_metadata(self.extension_type_metadata())
473                    .and_then(|metadata| E::try_new(self.data_type(), metadata))
474            }
475            // Name mismatch
476            Some(name) => Err(ArrowError::InvalidArgumentError(format!(
477                "Field extension type name mismatch, expected {}, found {name}",
478                E::NAME
479            ))),
480            // Name missing
481            None => Err(ArrowError::InvalidArgumentError(
482                "Field extension type name missing".to_owned(),
483            )),
484        }
485    }
486
487    /// Returns an instance of the given [`ExtensionType`] of this [`Field`],
488    /// panics if this [`Field`] does not have this extension type.
489    ///
490    /// # Panic
491    ///
492    /// This calls [`Field::try_extension_type`] and panics when it returns an
493    /// error.
494    pub fn extension_type<E: ExtensionType>(&self) -> E {
495        self.try_extension_type::<E>()
496            .unwrap_or_else(|e| panic!("{e}"))
497    }
498
499    /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`]
500    /// and [`ExtensionType::metadata`] of the given [`ExtensionType`], if the
501    /// given extension type supports the [`Field::data_type`] of this field
502    /// ([`ExtensionType::supports_data_type`]).
503    ///
504    /// If the given extension type defines no metadata, a previously set
505    /// value of [`EXTENSION_TYPE_METADATA_KEY`] is cleared.
506    ///
507    /// # Error
508    ///
509    /// This functions returns an error if the data type of this field does not
510    /// match any of the supported storage types of the given extension type.
511    pub fn try_with_extension_type<E: ExtensionType>(
512        &mut self,
513        extension_type: E,
514    ) -> Result<(), ArrowError> {
515        // Make sure the data type of this field is supported
516        extension_type.supports_data_type(&self.data_type)?;
517
518        self.metadata
519            .insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned());
520        match extension_type.serialize_metadata() {
521            Some(metadata) => self
522                .metadata
523                .insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), metadata),
524            // If this extension type has no metadata, we make sure to
525            // clear previously set metadata.
526            None => self.metadata.remove(EXTENSION_TYPE_METADATA_KEY),
527        };
528
529        Ok(())
530    }
531
532    /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`]
533    /// and [`ExtensionType::metadata`] of the given [`ExtensionType`].
534    ///
535    /// # Panics
536    ///
537    /// This calls [`Field::try_with_extension_type`] and panics when it
538    /// returns an error.
539    pub fn with_extension_type<E: ExtensionType>(mut self, extension_type: E) -> Self {
540        self.try_with_extension_type(extension_type)
541            .unwrap_or_else(|e| panic!("{e}"));
542        self
543    }
544
    /// Returns the [`CanonicalExtensionType`] of this [`Field`], if set.
    ///
    /// The conversion is delegated to the `TryFrom<&Field>` implementation of
    /// [`CanonicalExtensionType`].
    ///
    /// # Errors
    ///
    /// Returns an error if
    /// - this field does not have a canonical extension type (mismatch or
    ///   missing)
    /// - the canonical extension is not supported
    /// - the construction of the extension type fails
    #[cfg(feature = "canonical_extension_types")]
    pub fn try_canonical_extension_type(&self) -> Result<CanonicalExtensionType, ArrowError> {
        CanonicalExtensionType::try_from(self)
    }
557
    /// Indicates whether this [`Field`] supports null values.
    ///
    /// If true, the field *may* contain null values.
    ///
    /// This only reports the flag stored on the field.
    #[inline]
    pub const fn is_nullable(&self) -> bool {
        self.nullable
    }
565
    /// Set the nullability (`nullable`) of this [`Field`].
    ///
    /// ```
    /// # use arrow_schema::*;
    /// let mut field = Field::new("c1", DataType::Int64, false);
    /// field.set_nullable(true);
    ///
    /// assert_eq!(field.is_nullable(), true);
    /// ```
    #[inline]
    pub fn set_nullable(&mut self, nullable: bool) {
        self.nullable = nullable;
    }
579
    /// Set the nullability (`nullable`) of the [`Field`] and returns self.
    ///
    /// This is the builder-style counterpart of [`Self::set_nullable`].
    ///
    /// ```
    /// # use arrow_schema::*;
    /// let field = Field::new("c1", DataType::Int64, false)
    ///    .with_nullable(true);
    ///
    /// assert_eq!(field.is_nullable(), true);
    /// ```
    pub fn with_nullable(mut self, nullable: bool) -> Self {
        self.set_nullable(nullable);
        self
    }
593
594    /// Returns a (flattened) [`Vec`] containing all child [`Field`]s
595    /// within `self` contained within this field (including `self`)
596    pub(crate) fn fields(&self) -> Vec<&Field> {
597        let mut collected_fields = vec![self];
598        collected_fields.append(&mut Field::_fields(&self.data_type));
599
600        collected_fields
601    }
602
603    fn _fields(dt: &DataType) -> Vec<&Field> {
604        match dt {
605            DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(),
606            DataType::Union(fields, _) => fields.iter().flat_map(|(_, f)| f.fields()).collect(),
607            DataType::List(field)
608            | DataType::LargeList(field)
609            | DataType::FixedSizeList(field, _)
610            | DataType::Map(field, _) => field.fields(),
611            DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref()),
612            DataType::RunEndEncoded(_, field) => field.fields(),
613            _ => vec![],
614        }
615    }
616
617    /// Returns a vector containing all (potentially nested) `Field` instances selected by the
618    /// dictionary ID they use
619    #[inline]
620    #[deprecated(
621        since = "54.0.0",
622        note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it."
623    )]
624    pub(crate) fn fields_with_dict_id(&self, id: i64) -> Vec<&Field> {
625        self.fields()
626            .into_iter()
627            .filter(|&field| {
628                #[allow(deprecated)]
629                let matching_dict_id = field.dict_id == id;
630                matches!(field.data_type(), DataType::Dictionary(_, _)) && matching_dict_id
631            })
632            .collect()
633    }
634
    /// Returns the dictionary ID, if this is a dictionary type.
    ///
    /// Returns `None` for every data type other than
    /// [`DataType::Dictionary`], regardless of the ID stored on the field.
    #[inline]
    #[deprecated(
        since = "54.0.0",
        note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it."
    )]
    pub const fn dict_id(&self) -> Option<i64> {
        match self.data_type {
            #[allow(deprecated)]
            DataType::Dictionary(_, _) => Some(self.dict_id),
            _ => None,
        }
    }
648
    /// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type.
    ///
    /// Returns `None` for every data type other than
    /// [`DataType::Dictionary`].
    ///
    /// # Example
    /// ```
    /// # use arrow_schema::{DataType, Field};
    /// // non dictionaries do not have a dict is ordered flag
    /// let field = Field::new("c1", DataType::Int64, false);
    /// assert_eq!(field.dict_is_ordered(), None);
    /// // by default dictionary is not ordered
    /// let field = Field::new("c1", DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), false);
    /// assert_eq!(field.dict_is_ordered(), Some(false));
    /// let field = field.with_dict_is_ordered(true);
    /// assert_eq!(field.dict_is_ordered(), Some(true));
    /// ```
    #[inline]
    pub const fn dict_is_ordered(&self) -> Option<bool> {
        match self.data_type {
            DataType::Dictionary(_, _) => Some(self.dict_is_ordered),
            _ => None,
        }
    }
670
671    /// Set the is ordered field for this `Field`, if it is a dictionary.
672    ///
673    /// Does nothing if this is not a dictionary type.
674    ///
675    /// See [`Field::dict_is_ordered`] for more information.
676    pub fn with_dict_is_ordered(mut self, dict_is_ordered: bool) -> Self {
677        if matches!(self.data_type, DataType::Dictionary(_, _)) {
678            self.dict_is_ordered = dict_is_ordered;
679        };
680        self
681    }
682
    /// Merge the field `from` into `self` if it is compatible.
    ///
    /// The merge proceeds in this order:
    /// 1. the dictionary properties (`dict_id`, `dict_is_ordered`) of both
    ///    fields must be identical
    /// 2. the metadata of both fields is combined; a key present on both
    ///    sides must map to the same value
    /// 3. the data types are merged: struct, union, list and large list
    ///    types are merged recursively, a `Null` type on either side yields
    ///    to the other side (making `self` nullable), and any other type
    ///    must be equal on both sides
    /// 4. `self` becomes nullable if `from` is nullable
    ///
    /// The names of the two fields are not compared by this method.
    ///
    /// NOTE: `self` may be updated to a partial / unexpected state in case of merge failure.
    ///
    /// # Errors
    ///
    /// Returns an [`ArrowError::SchemaError`] if any of the checks above
    /// fails.
    ///
    /// Example:
    ///
    /// ```
    /// # use arrow_schema::*;
    /// let mut field = Field::new("c1", DataType::Int64, false);
    /// assert!(field.try_merge(&Field::new("c1", DataType::Int64, true)).is_ok());
    /// assert!(field.is_nullable());
    /// ```
    pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> {
        // The dictionary properties are compared regardless of whether the
        // fields have a dictionary data type.
        #[allow(deprecated)]
        if from.dict_id != self.dict_id {
            return Err(ArrowError::SchemaError(format!(
                "Fail to merge schema field '{}' because from dict_id = {} does not match {}",
                self.name, from.dict_id, self.dict_id
            )));
        }
        if from.dict_is_ordered != self.dict_is_ordered {
            return Err(ArrowError::SchemaError(format!(
                "Fail to merge schema field '{}' because from dict_is_ordered = {} does not match {}",
                self.name, from.dict_is_ordered, self.dict_is_ordered
            )));
        }
        // merge metadata
        match (self.metadata().is_empty(), from.metadata().is_empty()) {
            // Both sides carry metadata: build the union in a scratch map so
            // that `self.metadata` stays untouched if a conflict is found.
            (false, false) => {
                let mut merged = self.metadata().clone();
                for (key, from_value) in from.metadata() {
                    if let Some(self_value) = self.metadata.get(key) {
                        if self_value != from_value {
                            // NOTE(review): this string literal spans two
                            // source lines, so the resulting message contains
                            // a line break followed by the indentation of the
                            // second line. Left as is to keep the message
                            // unchanged.
                            return Err(ArrowError::SchemaError(format!(
                                "Fail to merge field '{}' due to conflicting metadata data value for key {}.
                                    From value = {} does not match {}", self.name, key, from_value, self_value),
                            ));
                        }
                    } else {
                        merged.insert(key.clone(), from_value.clone());
                    }
                }
                self.set_metadata(merged);
            }
            // Only `from` carries metadata: adopt it as is.
            (true, false) => {
                self.set_metadata(from.metadata().clone());
            }
            // `from` has no metadata: nothing to merge.
            _ => {}
        }
        // Merge the data types. Note that the metadata of `self` may already
        // have been updated when one of the arms below fails.
        match &mut self.data_type {
            // Nested types only merge with the same kind of nested type; a
            // `from` of any other type (including `Null`) is rejected.
            DataType::Struct(nested_fields) => match &from.data_type {
                DataType::Struct(from_nested_fields) => {
                    // Feed the children of both structs through a schema
                    // builder and replace the children of `self` with the
                    // fields it produces.
                    let mut builder = SchemaBuilder::new();
                    nested_fields.iter().chain(from_nested_fields).try_for_each(|f| builder.try_merge(f))?;
                    *nested_fields = builder.finish().fields;
                }
                _ => {
                    return Err(ArrowError::SchemaError(
                        format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::Struct",
                            self.name, from.data_type)
                ))}
            },
            DataType::Union(nested_fields, _) => match &from.data_type {
                // The union modes are not compared here.
                DataType::Union(from_nested_fields, _) => {
                    nested_fields.try_merge(from_nested_fields)?
                }
                _ => {
                    return Err(ArrowError::SchemaError(
                        format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::Union",
                            self.name, from.data_type)
                    ));
                }
            },
            DataType::List(field) => match &from.data_type {
                DataType::List(from_field) => {
                    // The child is shared behind an `Arc`, so merge into a
                    // copy and swap the merged copy in afterwards.
                    let mut f = (**field).clone();
                    f.try_merge(from_field)?;
                    (*field) = Arc::new(f);
                },
                _ => {
                    return Err(ArrowError::SchemaError(
                        format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::List",
                            self.name, from.data_type)
                ))}
            },
            DataType::LargeList(field) => match &from.data_type {
                DataType::LargeList(from_field) => {
                    // Same approach as for `DataType::List` above.
                    let mut f = (**field).clone();
                    f.try_merge(from_field)?;
                    (*field) = Arc::new(f);
                },
                _ => {
                    return Err(ArrowError::SchemaError(
                        format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::LargeList",
                            self.name, from.data_type)
                ))}
            },
            // A `Null` typed `self` adopts whatever type `from` has and
            // becomes nullable.
            DataType::Null => {
                self.nullable = true;
                self.data_type = from.data_type.clone();
            }
            // All remaining types are not merged recursively: `from` must
            // either be `Null` or have exactly the same data type.
            | DataType::Boolean
            | DataType::Int8
            | DataType::Int16
            | DataType::Int32
            | DataType::Int64
            | DataType::UInt8
            | DataType::UInt16
            | DataType::UInt32
            | DataType::UInt64
            | DataType::Float16
            | DataType::Float32
            | DataType::Float64
            | DataType::Timestamp(_, _)
            | DataType::Date32
            | DataType::Date64
            | DataType::Time32(_)
            | DataType::Time64(_)
            | DataType::Duration(_)
            | DataType::Binary
            | DataType::LargeBinary
            | DataType::BinaryView
            | DataType::Interval(_)
            | DataType::LargeListView(_)
            | DataType::ListView(_)
            | DataType::Map(_, _)
            | DataType::Dictionary(_, _)
            | DataType::RunEndEncoded(_, _)
            | DataType::FixedSizeList(_, _)
            | DataType::FixedSizeBinary(_)
            | DataType::Utf8
            | DataType::LargeUtf8
            | DataType::Utf8View
            | DataType::Decimal32(_, _)
            | DataType::Decimal64(_, _)
            | DataType::Decimal128(_, _)
            | DataType::Decimal256(_, _) => {
                if from.data_type == DataType::Null {
                    // Merging in a `Null` typed field keeps the type of
                    // `self` but makes it nullable.
                    self.nullable = true;
                } else if self.data_type != from.data_type {
                    return Err(ArrowError::SchemaError(
                        format!("Fail to merge schema field '{}' because the from data_type = {} does not equal {}",
                            self.name, from.data_type, self.data_type)
                    ));
                }
            }
        }
        // The merged field is nullable if either input is nullable.
        self.nullable |= from.nullable;

        Ok(())
    }
836
837    /// Check to see if `self` is a superset of `other` field. Superset is defined as:
838    ///
839    /// * if nullability doesn't match, self needs to be nullable
840    /// * self.metadata is a superset of other.metadata
841    /// * all other fields are equal
842    pub fn contains(&self, other: &Field) -> bool {
843        #[allow(deprecated)]
844        let matching_dict_id = self.dict_id == other.dict_id;
845        self.name == other.name
846        && self.data_type.contains(&other.data_type)
847        && matching_dict_id
848        && self.dict_is_ordered == other.dict_is_ordered
849        // self need to be nullable or both of them are not nullable
850        && (self.nullable || !other.nullable)
851        // make sure self.metadata is a superset of other.metadata
852        && other.metadata.iter().all(|(k, v1)| {
853            self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default()
854        })
855    }
856
857    /// Return size of this instance in bytes.
858    ///
859    /// Includes the size of `Self`.
860    pub fn size(&self) -> usize {
861        std::mem::size_of_val(self) - std::mem::size_of_val(&self.data_type)
862            + self.data_type.size()
863            + self.name.capacity()
864            + (std::mem::size_of::<(String, String)>() * self.metadata.capacity())
865            + self
866                .metadata
867                .iter()
868                .map(|(k, v)| k.capacity() + v.capacity())
869                .sum::<usize>()
870    }
871}
872
873// TODO: improve display with crate https://crates.io/crates/derive_more ?
874impl std::fmt::Display for Field {
875    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
876        write!(f, "{self:?}")
877    }
878}
879
880#[cfg(test)]
881mod test {
882    use super::*;
883    use std::collections::hash_map::DefaultHasher;
884
/// `Field::new` must accept an owned `String` as well as a `&str` name,
/// and both must produce equal fields.
#[test]
fn test_new_with_string() {
    // Fields should allow owned Strings to support reuse
    let s = String::from("c1");
    let from_owned = Field::new(s.clone(), DataType::Int64, false);
    let from_borrowed = Field::new(s.as_str(), DataType::Int64, false);
    assert_eq!(from_owned, from_borrowed);
}
891
/// `Field::new_dict` must accept an owned `String` as well as a `&str`
/// name, and both must produce equal fields.
#[test]
fn test_new_dict_with_string() {
    // Fields should allow owned Strings to support reuse
    let s = String::from("c1");
    #[allow(deprecated)]
    let from_owned = Field::new_dict(s.clone(), DataType::Int64, false, 4, false);
    #[allow(deprecated)]
    let from_borrowed = Field::new_dict(s.as_str(), DataType::Int64, false, 4, false);
    assert_eq!(from_owned, from_borrowed);
}
899
/// Merging two primitive fields with different data types must fail with a
/// schema error naming both types.
#[test]
fn test_merge_incompatible_types() {
    let mut target = Field::new("c1", DataType::Int64, false);
    let incoming = Field::new("c1", DataType::Float32, true);

    let error = target.try_merge(&incoming).expect_err("should fail");

    let expected = "Schema error: Fail to merge schema field 'c1' because the from data_type = Float32 does not equal Int64";
    assert_eq!(expected, error.to_string());
}
909
/// Merging with a `Null`-typed field, on either side, yields the non-null
/// type and a nullable field.
#[test]
fn test_merge_with_null() {
    // `Null` on the receiving side: the incoming type is adopted.
    let float_source = Field::new("c1", DataType::Float32, false);
    let mut null_target = Field::new("c1", DataType::Null, true);
    null_target
        .try_merge(&float_source)
        .expect("should widen type to nullable float");
    assert_eq!(Field::new("c1", DataType::Float32, true), null_target);

    // `Null` on the incoming side: the receiving type is kept.
    let null_source = Field::new("c2", DataType::Null, true);
    let mut utf8_target = Field::new("c2", DataType::Utf8, false);
    utf8_target
        .try_merge(&null_source)
        .expect("should widen type to nullable utf8");
    assert_eq!(Field::new("c2", DataType::Utf8, true), utf8_target);
}
924
/// A `Null` child nested inside a struct, list or large list must widen the
/// matching `Float32` child to nullable when the parents are merged.
#[test]
fn test_merge_with_nested_null() {
    let float_inner = |nullable: bool| Field::new("inner", DataType::Float32, nullable);
    let null_inner = || Field::new("inner", DataType::Null, false);

    // (target name, source name, wrapper placing the child in a nested type)
    let cases: [(&str, &str, fn(Field) -> DataType); 3] = [
        ("s1", "s2", |child| DataType::Struct(Fields::from(vec![child]))),
        ("l1", "l2", |child| DataType::List(child.into())),
        ("ll1", "ll2", |child| DataType::LargeList(child.into())),
    ];

    for (target_name, source_name, wrap) in cases {
        let mut target = Field::new(target_name, wrap(float_inner(false)), false);
        let source = Field::new(source_name, wrap(null_inner()), true);

        target
            .try_merge(&source)
            .expect("should widen inner field's type to nullable float");

        assert_eq!(
            Field::new(target_name, wrap(float_inner(true)), true),
            target
        );
    }
}
1011
/// Every field returned by `fields_with_dict_id` for a given id must equal
/// the dictionary field created with that id.
#[test]
fn test_fields_with_dict_id() {
    #[allow(deprecated)]
    let dict1 = Field::new_dict(
        "dict1",
        DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
        false,
        10,
        false,
    );
    #[allow(deprecated)]
    let dict2 = Field::new_dict(
        "dict2",
        DataType::Dictionary(DataType::Int32.into(), DataType::Int8.into()),
        false,
        20,
        false,
    );

    // Assemble the nested field from the innermost level outwards.
    let innermost_struct = Field::new(
        "struct<dict2>",
        DataType::Struct(vec![dict2.clone()].into()),
        false,
    );
    let inner_list = Field::new(
        "list[struct<dict2>]",
        DataType::List(Arc::new(innermost_struct)),
        false,
    );
    let middle_struct = Field::new(
        "struct<dict1, list[struct<dict2>]>",
        DataType::Struct(Fields::from(vec![dict1.clone(), inner_list])),
        false,
    );
    let outer_list = Field::new(
        "list[struct<dict1, list[struct<dict2>]>]",
        DataType::List(Arc::new(middle_struct)),
        false,
    );
    let root = Field::new(
        "struct<dict1, list[struct<dict2, list[struct<dict1]>]>",
        DataType::Struct(Fields::from(vec![dict1.clone(), outer_list])),
        false,
    );

    #[allow(deprecated)]
    for found in root.fields_with_dict_id(10) {
        assert_eq!(dict1, *found);
    }
    #[allow(deprecated)]
    for found in root.fields_with_dict_id(20) {
        assert_eq!(dict2, *found);
    }
}
1068
1069    fn get_field_hash(field: &Field) -> u64 {
1070        let mut s = DefaultHasher::new();
1071        field.hash(&mut s);
1072        s.finish()
1073    }
1074
/// Fields differing only in `dict_id` compare and hash as equal; fields
/// differing in name do not.
#[test]
fn test_field_comparison_case() {
    #[allow(deprecated)]
    let utf8_dict = |name: &str, dict_id| {
        Field::new_dict(
            name,
            DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
            false,
            dict_id,
            false,
        )
    };

    // dictionary-encoding properties not used for field comparison
    let id_10 = utf8_dict("dict1", 10);
    let id_20 = utf8_dict("dict1", 20);
    assert_eq!(id_10, id_20);
    assert_eq!(get_field_hash(&id_10), get_field_hash(&id_20));

    // A different name makes the fields, and their hashes, differ.
    let renamed = utf8_dict("dict0", 10);
    assert_ne!(renamed, id_20);
    assert_ne!(get_field_hash(&renamed), get_field_hash(&id_20));
}
1110
/// Fields that differ only in metadata must have a consistent ordering under
/// `cmp`.
#[test]
fn test_field_comparison_metadata() {
    // All three fields share the `k1 -> v1` entry and differ in the second one.
    let binary_with = |second_key: &str, second_value: &str| {
        let metadata = HashMap::from([
            (String::from("k1"), String::from("v1")),
            (String::from(second_key), String::from(second_value)),
        ]);
        Field::new("x", DataType::Binary, false).with_metadata(metadata)
    };

    let f1 = binary_with("k2", "v2");
    let f2 = binary_with("k3", "v3");
    let f3 = binary_with("k3", "v4");

    assert!(matches!(f1.cmp(&f2), Ordering::Less));
    assert!(matches!(f2.cmp(&f3), Ordering::Less));
    assert!(matches!(f1.cmp(&f3), Ordering::Less));
}
1130
/// A field with metadata must contain itself.
#[test]
fn test_contains_reflexivity() {
    let metadata: HashMap<String, String> = [("k0", "v0"), ("k1", "v1")]
        .iter()
        .map(|(key, value)| ((*key).to_owned(), (*value).to_owned()))
        .collect();

    let mut field = Field::new("field1", DataType::Float16, false);
    field.set_metadata(metadata);

    assert!(field.contains(&field))
}
1140
/// `contains` must hold along a chain of merges and must not hold in the
/// reverse direction.
#[test]
fn test_contains_transitivity() {
    let single_entry =
        |key: &str, value: &str| HashMap::from([(key.to_owned(), value.to_owned())]);

    let mut base = Field::new(
        "field1",
        DataType::Struct(Fields::from(vec![Field::new(
            "child1",
            DataType::Float16,
            false,
        )])),
        false,
    );
    base.set_metadata(single_entry("k1", "v1"));

    // Each later field starts as an empty struct and absorbs the previous one.
    let mut merged_once = Field::new("field1", DataType::Struct(Fields::default()), true);
    merged_once.set_metadata(single_entry("k2", "v2"));
    merged_once.try_merge(&base).unwrap();

    let mut merged_twice = Field::new("field1", DataType::Struct(Fields::default()), false);
    merged_twice.set_metadata(single_entry("k3", "v3"));
    merged_twice.try_merge(&merged_once).unwrap();

    let supersets = [
        (&merged_once, &base),
        (&merged_twice, &merged_once),
        (&merged_twice, &base),
    ];
    for (larger, smaller) in supersets {
        assert!(larger.contains(smaller));
    }

    let non_supersets = [
        (&base, &merged_once),
        (&base, &merged_twice),
        (&merged_once, &merged_twice),
    ];
    for (smaller, larger) in non_supersets {
        assert!(!smaller.contains(larger));
    }
}
1168
/// A nullable field contains its non-nullable twin, but not the reverse.
#[test]
fn test_contains_nullable() {
    let nullable = Field::new("field1", DataType::Boolean, true);
    let required = Field::new("field1", DataType::Boolean, false);

    assert!(nullable.contains(&required));
    assert!(!required.contains(&nullable));
}
1176
/// `contains` requires the same child fields for structs and the same type
/// ids for unions; only child nullability may be wider on the containing side.
#[test]
fn test_contains_must_have_same_fields() {
    let float_child = |name: &str| Field::new(name, DataType::Float16, false);

    let one_child = Field::new(
        "field1",
        DataType::Struct(vec![float_child("child1")].into()),
        true,
    );
    let two_children = Field::new(
        "field1",
        DataType::Struct(vec![float_child("child1"), float_child("child2")].into()),
        true,
    );

    assert!(!one_child.contains(&two_children));
    assert!(!two_children.contains(&one_child));

    // Dense union of a `UInt8` child and a `Utf8` child; only the type ids and
    // the first child's nullability vary between cases.
    let dense_union = |type_ids: Vec<i8>, first_child_nullable: bool| {
        Field::new(
            "field1",
            DataType::Union(
                UnionFields::new(
                    type_ids,
                    vec![
                        Field::new("field1", DataType::UInt8, first_child_nullable),
                        Field::new("field3", DataType::Utf8, false),
                    ],
                ),
                UnionMode::Dense,
            ),
            true,
        )
    };

    // UnionFields with different type ID
    let ids_1_2 = dense_union(vec![1, 2], true);
    let ids_1_3 = dense_union(vec![1, 3], false);
    assert!(!ids_1_2.contains(&ids_1_3));

    // UnionFields with same type ID
    let nullable_first = dense_union(vec![1, 2], true);
    let required_first = dense_union(vec![1, 2], false);
    assert!(nullable_first.contains(&required_first));
}
1258
/// Serializes `field` with bincode, deserializes the bytes and asserts that
/// the result equals the input.
#[cfg(feature = "serde")]
fn assert_binary_serde_round_trip(field: Field) {
    let bytes = bincode::serialize(&field).unwrap();
    let restored: Field = bincode::deserialize(&bytes).unwrap();
    assert_eq!(field, restored)
}
1265
/// A field built without metadata must survive a binary serde round trip.
#[cfg(feature = "serde")]
#[test]
fn test_field_without_metadata_serde() {
    assert_binary_serde_round_trip(Field::new("name", DataType::Boolean, true))
}
1272
/// A field with an explicitly empty metadata map must survive a binary
/// serde round trip.
#[cfg(feature = "serde")]
#[test]
fn test_field_with_empty_metadata_serde() {
    let no_metadata = HashMap::new();
    let field = Field::new("name", DataType::Boolean, false).with_metadata(no_metadata);

    assert_binary_serde_round_trip(field)
}
1280
/// A field with one metadata entry (with an empty value) must survive a
/// binary serde round trip.
#[cfg(feature = "serde")]
#[test]
fn test_field_with_nonempty_metadata_serde() {
    let metadata = HashMap::from([(String::from("hi"), String::new())]);
    let field = Field::new("name", DataType::Boolean, false).with_metadata(metadata);

    assert_binary_serde_round_trip(field)
}
1290}