Skip to main content

arrow_schema/
datatype.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::str::FromStr;
19use std::sync::Arc;
20
21use crate::{ArrowError, Field, FieldRef, Fields, UnionFields};
22
23/// Datatypes supported by this implementation of Apache Arrow.
24///
25/// The variants of this enum include primitive fixed size types as well as
26/// parametric or nested types. See [`Schema.fbs`] for Arrow's specification.
27///
28/// # Examples
29///
30/// Primitive types
31/// ```
32/// # use arrow_schema::DataType;
33/// // create a new 32-bit signed integer
34/// let data_type = DataType::Int32;
35/// ```
36///
37/// Nested Types
38/// ```
39/// # use arrow_schema::{DataType, Field};
40/// # use std::sync::Arc;
41/// // create a new list of 32-bit signed integers directly
42/// let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true)));
43/// // Create the same list type with constructor
44/// let list_data_type2 = DataType::new_list(DataType::Int32, true);
45/// assert_eq!(list_data_type, list_data_type2);
46/// ```
47///
48/// Dictionary Types
49/// ```
50/// # use arrow_schema::{DataType};
51/// // String Dictionary (key type Int32 and value type Utf8)
52/// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
53/// ```
54///
55/// Timestamp Types
56/// ```
57/// # use arrow_schema::{DataType, TimeUnit};
58/// // timestamp with millisecond precision without timezone specified
59/// let data_type = DataType::Timestamp(TimeUnit::Millisecond, None);
60/// // timestamp with nanosecond precision in UTC timezone
61/// let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into()));
62///```
63///
64/// # Display and FromStr
65///
66/// The `Display` and `FromStr` implementations for `DataType` are
67/// human-readable, parseable, and reversible.
68///
69/// ```
70/// # use arrow_schema::DataType;
71/// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
72/// let data_type_string = data_type.to_string();
73/// assert_eq!(data_type_string, "Dictionary(Int32, Utf8)");
74/// // display can be parsed back into the original type
75/// let parsed_data_type: DataType = data_type.to_string().parse().unwrap();
76/// assert_eq!(data_type, parsed_data_type);
77/// ```
78///
79/// # Nested Support
80/// Currently, the Rust implementation supports the following nested types:
81///  - `List<T>`
82///  - `LargeList<T>`
83///  - `FixedSizeList<T>`
84///  - `Struct<T, U, V, ...>`
85///  - `Union<T, U, V, ...>`
86///  - `Map<K, V>`
87///
88/// Nested types can themselves be nested within other arrays.
89/// For more information on these types please see
90/// [the physical memory layout of Apache Arrow]
91///
92/// [`Schema.fbs`]: https://github.com/apache/arrow/blob/main/format/Schema.fbs
93/// [the physical memory layout of Apache Arrow]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout
94#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
95#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
96pub enum DataType {
97    /// Null type
98    Null,
99    /// A boolean datatype representing the values `true` and `false`.
100    Boolean,
101    /// A signed 8-bit integer.
102    Int8,
103    /// A signed 16-bit integer.
104    Int16,
105    /// A signed 32-bit integer.
106    Int32,
107    /// A signed 64-bit integer.
108    Int64,
109    /// An unsigned 8-bit integer.
110    UInt8,
111    /// An unsigned 16-bit integer.
112    UInt16,
113    /// An unsigned 32-bit integer.
114    UInt32,
115    /// An unsigned 64-bit integer.
116    UInt64,
117    /// A 16-bit floating point number.
118    Float16,
119    /// A 32-bit floating point number.
120    Float32,
121    /// A 64-bit floating point number.
122    Float64,
123    /// A timestamp with an optional timezone.
124    ///
125    /// Time is measured as a Unix epoch, counting the seconds from
126    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
127    /// as a signed 64-bit integer.
128    ///
129    /// The time zone is a string indicating the name of a time zone, one of:
130    ///
131    /// * As used in the Olson time zone database (the "tz database" or
132    ///   "tzdata"), such as "America/New_York"
133    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
134    ///
135    /// Timestamps with a non-empty timezone
136    /// ------------------------------------
137    ///
138    /// If a Timestamp column has a non-empty timezone value, its epoch is
139    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
140    /// (the Unix epoch), regardless of the Timestamp's own timezone.
141    ///
142    /// Therefore, timestamp values with a non-empty timezone correspond to
143    /// physical points in time together with some additional information about
144    /// how the data was obtained and/or how to display it (the timezone).
145    ///
146    ///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
147    ///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
148    ///   application may prefer to display it as "January 1st 1970, 01h00" in
149    ///   the Europe/Paris timezone (which is the same physical point in time).
150    ///
151    /// One consequence is that timestamp values with a non-empty timezone
152    /// can be compared and ordered directly, since they all share the same
153    /// well-known point of reference (the Unix epoch).
154    ///
155    /// Timestamps with an unset / empty timezone
156    /// -----------------------------------------
157    ///
158    /// If a Timestamp column has no timezone value, its epoch is
159    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
160    ///
161    /// Therefore, timestamp values without a timezone cannot be meaningfully
162    /// interpreted as physical points in time, but only as calendar / clock
163    /// indications ("wall clock time") in an unspecified timezone.
164    ///
165    ///   For example, the timestamp value 0 with an empty timezone string
166    ///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
167    ///   is not enough information to interpret it as a well-defined physical
168    ///   point in time.
169    ///
170    /// One consequence is that timestamp values without a timezone cannot
171    /// be reliably compared or ordered, since they may have different points of
172    /// reference.  In particular, it is *not* possible to interpret an unset
173    /// or empty timezone as the same as "UTC".
174    ///
175    /// Conversion between timezones
176    /// ----------------------------
177    ///
178    /// If a Timestamp column has a non-empty timezone, changing the timezone
179    /// to a different non-empty value is a metadata-only operation:
180    /// the timestamp values need not change as their point of reference remains
181    /// the same (the Unix epoch).
182    ///
183    /// However, if a Timestamp column has no timezone value, changing it to a
184    /// non-empty value requires to think about the desired semantics.
185    /// One possibility is to assume that the original timestamp values are
186    /// relative to the epoch of the timezone being set; timestamp values should
187    /// then adjusted to the Unix epoch (for example, changing the timezone from
188    /// empty to "Europe/Paris" would require converting the timestamp values
189    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
190    /// nevertheless correct).
191    ///
192    /// ```
193    /// # use arrow_schema::{DataType, TimeUnit};
194    /// DataType::Timestamp(TimeUnit::Second, None);
195    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
196    /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
197    /// ```
198    ///
199    /// # Timezone representation
200    /// ----------------------------
201    /// It is possible to use either the timezone string representation, such as "UTC", or the absolute time zone offset "+00:00".
202    /// For timezones with fixed offsets, such as "UTC" or "JST", the offset representation is recommended, as it is more explicit and less ambiguous.
203    ///
204    /// Most arrow-rs functionalities use the absolute offset representation,
205    /// such as [`PrimitiveArray::with_timezone_utc`] that applies a
206    /// UTC timezone to timestamp arrays.
207    ///
208    /// [`PrimitiveArray::with_timezone_utc`]: https://docs.rs/arrow/latest/arrow/array/struct.PrimitiveArray.html#method.with_timezone_utc
209    ///
210    /// Timezone string parsing
211    /// -----------------------
212    /// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930".
213    ///
214    /// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/)
215    /// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
216    /// timezones.
217    Timestamp(TimeUnit, Option<Arc<str>>),
218    /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
219    /// in days.
220    Date32,
221    /// A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01)
222    /// in milliseconds.
223    ///
224    /// # Valid Ranges
225    ///
226    /// According to the Arrow specification ([Schema.fbs]), values of Date64
227    /// are treated as the number of *days*, in milliseconds, since the UNIX
228    /// epoch. Therefore, values of this type  must be evenly divisible by
229    /// `86_400_000`, the number of milliseconds in a standard day.
230    ///
231    /// It is not valid to store milliseconds that do not represent an exact
232    /// day. The reason for this restriction is compatibility with other
233    /// language's native libraries (specifically Java), which historically
234    /// lacked a dedicated date type and only supported timestamps.
235    ///
236    /// # Validation
237    ///
238    /// This library does not validate or enforce that Date64 values are evenly
239    /// divisible by `86_400_000`  for performance and usability reasons. Date64
240    /// values are treated similarly to `Timestamp(TimeUnit::Millisecond,
241    /// None)`: values will be displayed with a time of day if the value does
242    /// not represent an exact day, and arithmetic will be done at the
243    /// millisecond granularity.
244    ///
245    /// # Recommendation
246    ///
247    /// Users should prefer [`Date32`] to cleanly represent the number
248    /// of days, or one of the Timestamp variants to include time as part of the
249    /// representation, depending on their use case.
250    ///
251    /// # Further Reading
252    ///
253    /// For more details, see [#5288](https://github.com/apache/arrow-rs/issues/5288).
254    ///
255    /// [`Date32`]: Self::Date32
256    /// [Schema.fbs]: https://github.com/apache/arrow/blob/main/format/Schema.fbs
257    Date64,
258    /// A signed 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
259    /// Must be either seconds or milliseconds.
260    Time32(TimeUnit),
261    /// A signed 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
262    /// Must be either microseconds or nanoseconds.
263    Time64(TimeUnit),
264    /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
265    Duration(TimeUnit),
266    /// A "calendar" interval which models types that don't necessarily
267    /// have a precise duration without the context of a base timestamp (e.g.
268    /// days can differ in length during day light savings time transitions).
269    Interval(IntervalUnit),
270    /// Opaque binary data of variable length.
271    ///
272    /// A single Binary array can store up to [`i32::MAX`] bytes
273    /// of binary data in total.
274    Binary,
275    /// Opaque binary data of fixed size.
276    ///
277    /// Enum parameter specifies the number of bytes per value, defined by the
278    /// [`byteWidth` field] in the Arrow Spec
279    ///
280    /// [`byteWidth` field]: https://github.com/apache/arrow/blob/2a89d03bbefd620b42126b8e00f8ae57e99cd638/format/Schema.fbs#L211
281    FixedSizeBinary(i32),
282    /// Opaque binary data of variable length and 64-bit offsets.
283    ///
284    /// A single LargeBinary array can store up to [`i64::MAX`] bytes
285    /// of binary data in total.
286    LargeBinary,
287    /// Opaque binary data of variable length.
288    ///
289    /// Logically the same as [`Binary`], but the internal representation uses a view
290    /// struct that contains the string length and either the string's entire data
291    /// inline (for small strings) or an inlined prefix, an index of another buffer,
292    /// and an offset pointing to a slice in that buffer (for non-small strings).
293    ///
294    /// [`Binary`]: Self::Binary
295    BinaryView,
296    /// A variable-length string in Unicode with UTF-8 encoding.
297    ///
298    /// A single Utf8 array can store up to [`i32::MAX`] bytes
299    /// of string data in total.
300    Utf8,
301    /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets.
302    ///
303    /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes
304    /// of string data in total.
305    LargeUtf8,
306    /// A variable-length string in Unicode with UTF-8 encoding
307    ///
308    /// Logically the same as [`Utf8`], but the internal representation uses a view
309    /// struct that contains the string length and either the string's entire data
310    /// inline (for small strings) or an inlined prefix, an index of another buffer,
311    /// and an offset pointing to a slice in that buffer (for non-small strings).
312    ///
313    /// [`Utf8`]: Self::Utf8
314    Utf8View,
315    /// A list of some logical data type with variable length.
316    ///
317    /// A single List array can store up to [`i32::MAX`] elements in total.
318    List(FieldRef),
319    /// A list of some logical data type with variable length.
320    ///
321    /// Logically the same as [`List`], but the internal representation differs in how child
322    /// data is referenced, allowing flexibility in how data is layed out.
323    ///
324    /// [`List`]: Self::List
325    ListView(FieldRef),
326    /// A list of some logical data type with fixed length.
327    FixedSizeList(FieldRef, i32),
328    /// A list of some logical data type with variable length and 64-bit offsets.
329    ///
330    /// A single LargeList array can store up to [`i64::MAX`] elements in total.
331    LargeList(FieldRef),
332    /// A list of some logical data type with variable length and 64-bit offsets.
333    ///
334    /// Logically the same as [`LargeList`], but the internal representation differs in how child
335    /// data is referenced, allowing flexibility in how data is layed out.
336    ///
337    /// [`LargeList`]: Self::LargeList
338    LargeListView(FieldRef),
339    /// A nested datatype that contains a number of sub-fields.
340    Struct(Fields),
341    /// A nested datatype that can represent slots of differing types. Components:
342    ///
343    /// 1. [`UnionFields`]
344    /// 2. The type of union (Sparse or Dense)
345    Union(UnionFields, UnionMode),
346    /// A dictionary encoded array (`key_type`, `value_type`), where
347    /// each array element is an index of `key_type` into an
348    /// associated dictionary of `value_type`.
349    ///
350    /// Dictionary arrays are used to store columns of `value_type`
351    /// that contain many repeated values using less memory, but with
352    /// a higher CPU overhead for some operations.
353    ///
354    /// This type mostly used to represent low cardinality string
355    /// arrays or a limited set of primitive types as integers.
356    Dictionary(Box<DataType>, Box<DataType>),
357    /// Exact 32-bit width decimal value with precision and scale
358    ///
359    /// * precision is the total number of digits
360    /// * scale is the number of digits past the decimal
361    ///
362    /// For example the number 123.45 has precision 5 and scale 2.
363    ///
364    /// In certain situations, scale could be negative number. For
365    /// negative scale, it is the number of padding 0 to the right
366    /// of the digits.
367    ///
368    /// For example the number 12300 could be treated as a decimal
369    /// has precision 3 and scale -2.
370    Decimal32(u8, i8),
371    /// Exact 64-bit width decimal value with precision and scale
372    ///
373    /// * precision is the total number of digits
374    /// * scale is the number of digits past the decimal
375    ///
376    /// For example the number 123.45 has precision 5 and scale 2.
377    ///
378    /// In certain situations, scale could be negative number. For
379    /// negative scale, it is the number of padding 0 to the right
380    /// of the digits.
381    ///
382    /// For example the number 12300 could be treated as a decimal
383    /// has precision 3 and scale -2.
384    Decimal64(u8, i8),
385    /// Exact 128-bit width decimal value with precision and scale
386    ///
387    /// * precision is the total number of digits
388    /// * scale is the number of digits past the decimal
389    ///
390    /// For example the number 123.45 has precision 5 and scale 2.
391    ///
392    /// In certain situations, scale could be negative number. For
393    /// negative scale, it is the number of padding 0 to the right
394    /// of the digits.
395    ///
396    /// For example the number 12300 could be treated as a decimal
397    /// has precision 3 and scale -2.
398    Decimal128(u8, i8),
399    /// Exact 256-bit width decimal value with precision and scale
400    ///
401    /// * precision is the total number of digits
402    /// * scale is the number of digits past the decimal
403    ///
404    /// For example the number 123.45 has precision 5 and scale 2.
405    ///
406    /// In certain situations, scale could be negative number. For
407    /// negative scale, it is the number of padding 0 to the right
408    /// of the digits.
409    ///
410    /// For example the number 12300 could be treated as a decimal
411    /// has precision 3 and scale -2.
412    Decimal256(u8, i8),
413    /// A Map is a logical nested type that is represented as
414    ///
415    /// `List<entries: Struct<key: K, value: V>>`
416    ///
417    /// The keys and values are each respectively contiguous.
418    /// The key and value types are not constrained, but keys should be
419    /// hashable and unique.
420    /// Whether the keys are sorted can be set in the `bool` after the `Field`.
421    ///
422    /// In a field with Map type, the field has a child Struct field, which then
423    /// has two children: key type and the second the value type. The names of the
424    /// child fields may be respectively "entries", "key", and "value", but this is
425    /// not enforced.
426    Map(FieldRef, bool),
427    /// A run-end encoding (REE) is a variation of run-length encoding (RLE). These
428    /// encodings are well-suited for representing data containing sequences of the
429    /// same value, called runs. Each run is represented as a value and an integer giving
430    /// the index in the array where the run ends.
431    ///
432    /// A run-end encoded array has no buffers by itself, but has two child arrays. The
433    /// first child array, called the run ends array, holds either 16, 32, or 64-bit
434    /// signed integers. The actual values of each run are held in the second child array.
435    ///
436    /// These child arrays are prescribed the standard names of "run_ends" and "values"
437    /// respectively.
438    RunEndEncoded(FieldRef, FieldRef),
439}
440
/// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds.
///
/// The derived ordering follows declaration order, i.e. from the coarsest
/// unit ([`TimeUnit::Second`]) to the finest ([`TimeUnit::Nanosecond`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum TimeUnit {
    /// Time in seconds.
    Second,
    /// Time in milliseconds.
    Millisecond,
    /// Time in microseconds.
    Microsecond,
    /// Time in nanoseconds.
    Nanosecond,
}

/// Formats the unit as its conventional abbreviation:
/// `s`, `ms`, `µs` or `ns`.
impl std::fmt::Display for TimeUnit {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the abbreviation first so there is a single write call; the
        // text is emitted verbatim, without applying any padding/width flags.
        let abbreviation = match self {
            TimeUnit::Second => "s",
            TimeUnit::Millisecond => "ms",
            TimeUnit::Microsecond => "µs",
            TimeUnit::Nanosecond => "ns",
        };
        f.write_str(abbreviation)
    }
}
465
/// YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO interval in SQL style.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum IntervalUnit {
    /// Indicates the number of elapsed whole months, stored as 4-byte integers.
    YearMonth,
    /// Indicates the number of elapsed days and milliseconds,
    /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8 bytes in total).
    DayTime,
    /// A triple of the number of elapsed months, days, and nanoseconds.
    /// The values are stored contiguously in 16-byte blocks. Months and
    /// days are encoded as 32-bit integers and nanoseconds is encoded as a
    /// 64-bit integer. All integers are signed. Each field is independent
    /// (e.g. there is no constraint that nanoseconds have the same sign
    /// as days or that the quantity of nanoseconds represents less
    /// than a day's worth of time).
    MonthDayNano,
}
484
/// Sparse or Dense union layouts
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum UnionMode {
    /// Sparse union layout
    Sparse,
    /// Dense union layout
    Dense,
}
494
495/// Parses `str` into a `DataType`.
496///
497/// This is the reverse of [`DataType`]'s `Display`
498/// impl, and maintains the invariant that
499/// `DataType::try_from(&data_type.to_string()).unwrap() == data_type`
500///
501/// # Example
502/// ```
503/// use arrow_schema::DataType;
504///
505/// let data_type: DataType = "Int32".parse().unwrap();
506/// assert_eq!(data_type, DataType::Int32);
507/// ```
508impl FromStr for DataType {
509    type Err = ArrowError;
510
511    fn from_str(s: &str) -> Result<Self, Self::Err> {
512        crate::datatype_parse::parse_data_type(s)
513    }
514}
515
516impl TryFrom<&str> for DataType {
517    type Error = ArrowError;
518
519    fn try_from(value: &str) -> Result<Self, Self::Error> {
520        value.parse()
521    }
522}
523
524impl DataType {
525    /// Returns true if the type is primitive: (numeric, temporal).
526    #[inline]
527    pub fn is_primitive(&self) -> bool {
528        self.is_numeric() || self.is_temporal()
529    }
530
531    /// Returns true if this type is numeric: (UInt*, Int*, Float*, Decimal*).
532    #[inline]
533    pub fn is_numeric(&self) -> bool {
534        use DataType::*;
535        matches!(
536            self,
537            UInt8
538                | UInt16
539                | UInt32
540                | UInt64
541                | Int8
542                | Int16
543                | Int32
544                | Int64
545                | Float16
546                | Float32
547                | Float64
548                | Decimal32(_, _)
549                | Decimal64(_, _)
550                | Decimal128(_, _)
551                | Decimal256(_, _)
552        )
553    }
554
555    /// Returns true if this type is temporal: (Date*, Time*, Duration, or Interval).
556    #[inline]
557    pub fn is_temporal(&self) -> bool {
558        use DataType::*;
559        matches!(
560            self,
561            Date32 | Date64 | Timestamp(_, _) | Time32(_) | Time64(_) | Duration(_) | Interval(_)
562        )
563    }
564
565    /// Returns true if this type is floating: (Float*).
566    #[inline]
567    pub fn is_floating(&self) -> bool {
568        use DataType::*;
569        matches!(self, Float16 | Float32 | Float64)
570    }
571
572    /// Returns true if this type is integer: (Int*, UInt*).
573    #[inline]
574    pub fn is_integer(&self) -> bool {
575        self.is_signed_integer() || self.is_unsigned_integer()
576    }
577
578    /// Returns true if this type is signed integer: (Int*).
579    #[inline]
580    pub fn is_signed_integer(&self) -> bool {
581        use DataType::*;
582        matches!(self, Int8 | Int16 | Int32 | Int64)
583    }
584
585    /// Returns true if this type is unsigned integer: (UInt*).
586    #[inline]
587    pub fn is_unsigned_integer(&self) -> bool {
588        use DataType::*;
589        matches!(self, UInt8 | UInt16 | UInt32 | UInt64)
590    }
591
592    /// Returns true if this type is decimal: (Decimal*).
593    #[inline]
594    pub fn is_decimal(&self) -> bool {
595        use DataType::*;
596        matches!(
597            self,
598            Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..)
599        )
600    }
601
602    /// Returns true if this type is valid as a dictionary key
603    #[inline]
604    pub fn is_dictionary_key_type(&self) -> bool {
605        self.is_integer()
606    }
607
608    /// Returns true if this type is valid for run-ends array in RunArray
609    #[inline]
610    pub fn is_run_ends_type(&self) -> bool {
611        use DataType::*;
612        matches!(self, Int16 | Int32 | Int64)
613    }
614
615    /// Returns true if this type is nested (List, FixedSizeList, LargeList, ListView. LargeListView, Struct, Union,
616    /// or Map), or a dictionary of a nested type
617    #[inline]
618    pub fn is_nested(&self) -> bool {
619        use DataType::*;
620        match self {
621            Dictionary(_, v) => DataType::is_nested(v.as_ref()),
622            RunEndEncoded(_, v) => DataType::is_nested(v.data_type()),
623            List(_)
624            | FixedSizeList(_, _)
625            | LargeList(_)
626            | ListView(_)
627            | LargeListView(_)
628            | Struct(_)
629            | Union(_, _)
630            | Map(_, _) => true,
631            _ => false,
632        }
633    }
634
635    /// Returns true if this type is DataType::Null.
636    #[inline]
637    pub fn is_null(&self) -> bool {
638        use DataType::*;
639        matches!(self, Null)
640    }
641
642    /// Returns true if this type is a String type
643    #[inline]
644    pub fn is_string(&self) -> bool {
645        use DataType::*;
646        matches!(self, Utf8 | LargeUtf8 | Utf8View)
647    }
648
649    /// Returns true if this type is a List type.
650    ///
651    /// List types include List, LargeList, FixedSizeList, ListView, and LargeListView.
652    #[inline]
653    pub fn is_list(&self) -> bool {
654        use DataType::*;
655        matches!(
656            self,
657            List(_) | LargeList(_) | FixedSizeList(_, _) | ListView(_) | LargeListView(_)
658        )
659    }
660
661    /// Returns true if this type is a Binary type.
662    ///
663    /// Binary types include Binary, LargeBinary, FixedSizeBinary and BinaryView.
664    #[inline]
665    pub fn is_binary(&self) -> bool {
666        use DataType::*;
667        matches!(self, Binary | LargeBinary | FixedSizeBinary(_) | BinaryView)
668    }
669
670    /// Compares the datatype with another, ignoring nested field names
671    /// and metadata.
672    pub fn equals_datatype(&self, other: &DataType) -> bool {
673        match (&self, other) {
674            (DataType::List(a), DataType::List(b))
675            | (DataType::LargeList(a), DataType::LargeList(b))
676            | (DataType::ListView(a), DataType::ListView(b))
677            | (DataType::LargeListView(a), DataType::LargeListView(b)) => {
678                a.is_nullable() == b.is_nullable() && a.data_type().equals_datatype(b.data_type())
679            }
680            (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => {
681                a_size == b_size
682                    && a.is_nullable() == b.is_nullable()
683                    && a.data_type().equals_datatype(b.data_type())
684            }
685            (DataType::Struct(a), DataType::Struct(b)) => {
686                a.len() == b.len()
687                    && a.iter().zip(b).all(|(a, b)| {
688                        a.is_nullable() == b.is_nullable()
689                            && a.data_type().equals_datatype(b.data_type())
690                    })
691            }
692            (DataType::Map(a_field, a_is_sorted), DataType::Map(b_field, b_is_sorted)) => {
693                a_field.is_nullable() == b_field.is_nullable()
694                    && a_field.data_type().equals_datatype(b_field.data_type())
695                    && a_is_sorted == b_is_sorted
696            }
697            (DataType::Dictionary(a_key, a_value), DataType::Dictionary(b_key, b_value)) => {
698                a_key.equals_datatype(b_key) && a_value.equals_datatype(b_value)
699            }
700            (
701                DataType::RunEndEncoded(a_run_ends, a_values),
702                DataType::RunEndEncoded(b_run_ends, b_values),
703            ) => {
704                a_run_ends.is_nullable() == b_run_ends.is_nullable()
705                    && a_run_ends
706                        .data_type()
707                        .equals_datatype(b_run_ends.data_type())
708                    && a_values.is_nullable() == b_values.is_nullable()
709                    && a_values.data_type().equals_datatype(b_values.data_type())
710            }
711            (
712                DataType::Union(a_union_fields, a_union_mode),
713                DataType::Union(b_union_fields, b_union_mode),
714            ) => {
715                a_union_mode == b_union_mode
716                    && a_union_fields.len() == b_union_fields.len()
717                    && a_union_fields.iter().all(|a| {
718                        b_union_fields.iter().any(|b| {
719                            a.0 == b.0
720                                && a.1.is_nullable() == b.1.is_nullable()
721                                && a.1.data_type().equals_datatype(b.1.data_type())
722                        })
723                    })
724            }
725            _ => self == other,
726        }
727    }
728
729    /// Returns the byte width of this type if it is a primitive type
730    ///
731    /// Returns `None` if not a primitive type
732    #[inline]
733    pub fn primitive_width(&self) -> Option<usize> {
734        match self {
735            DataType::Null => None,
736            DataType::Boolean => None,
737            DataType::Int8 | DataType::UInt8 => Some(1),
738            DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2),
739            DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4),
740            DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8),
741            DataType::Timestamp(_, _) => Some(8),
742            DataType::Date32 | DataType::Time32(_) => Some(4),
743            DataType::Date64 | DataType::Time64(_) => Some(8),
744            DataType::Duration(_) => Some(8),
745            DataType::Interval(IntervalUnit::YearMonth) => Some(4),
746            DataType::Interval(IntervalUnit::DayTime) => Some(8),
747            DataType::Interval(IntervalUnit::MonthDayNano) => Some(16),
748            DataType::Decimal32(_, _) => Some(4),
749            DataType::Decimal64(_, _) => Some(8),
750            DataType::Decimal128(_, _) => Some(16),
751            DataType::Decimal256(_, _) => Some(32),
752            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => None,
753            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None,
754            DataType::FixedSizeBinary(_) => None,
755            DataType::List(_)
756            | DataType::ListView(_)
757            | DataType::LargeList(_)
758            | DataType::LargeListView(_)
759            | DataType::Map(_, _) => None,
760            DataType::FixedSizeList(_, _) => None,
761            DataType::Struct(_) => None,
762            DataType::Union(_, _) => None,
763            DataType::Dictionary(_, _) => None,
764            DataType::RunEndEncoded(_, _) => None,
765        }
766    }
767
768    /// Return size of this instance in bytes.
769    ///
770    /// Includes the size of `Self`.
771    pub fn size(&self) -> usize {
772        std::mem::size_of_val(self)
773            + match self {
774                DataType::Null
775                | DataType::Boolean
776                | DataType::Int8
777                | DataType::Int16
778                | DataType::Int32
779                | DataType::Int64
780                | DataType::UInt8
781                | DataType::UInt16
782                | DataType::UInt32
783                | DataType::UInt64
784                | DataType::Float16
785                | DataType::Float32
786                | DataType::Float64
787                | DataType::Date32
788                | DataType::Date64
789                | DataType::Time32(_)
790                | DataType::Time64(_)
791                | DataType::Duration(_)
792                | DataType::Interval(_)
793                | DataType::Binary
794                | DataType::FixedSizeBinary(_)
795                | DataType::LargeBinary
796                | DataType::BinaryView
797                | DataType::Utf8
798                | DataType::LargeUtf8
799                | DataType::Utf8View
800                | DataType::Decimal32(_, _)
801                | DataType::Decimal64(_, _)
802                | DataType::Decimal128(_, _)
803                | DataType::Decimal256(_, _) => 0,
804                DataType::Timestamp(_, s) => s.as_ref().map(|s| s.len()).unwrap_or_default(),
805                DataType::List(field)
806                | DataType::ListView(field)
807                | DataType::FixedSizeList(field, _)
808                | DataType::LargeList(field)
809                | DataType::LargeListView(field)
810                | DataType::Map(field, _) => field.size(),
811                DataType::Struct(fields) => fields.size(),
812                DataType::Union(fields, _) => fields.size(),
813                DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(),
814                DataType::RunEndEncoded(run_ends, values) => {
815                    run_ends.size() - std::mem::size_of_val(run_ends) + values.size()
816                        - std::mem::size_of_val(values)
817                }
818            }
819    }
820
821    /// Check to see if `self` is a superset of `other`
822    ///
823    /// If DataType is a nested type, then it will check to see if the nested type is a superset of the other nested type
824    /// else it will check to see if the DataType is equal to the other DataType
825    pub fn contains(&self, other: &DataType) -> bool {
826        match (self, other) {
827            (DataType::List(f1), DataType::List(f2))
828            | (DataType::LargeList(f1), DataType::LargeList(f2))
829            | (DataType::ListView(f1), DataType::ListView(f2))
830            | (DataType::LargeListView(f1), DataType::LargeListView(f2)) => f1.contains(f2),
831            (DataType::FixedSizeList(f1, s1), DataType::FixedSizeList(f2, s2)) => {
832                s1 == s2 && f1.contains(f2)
833            }
834            (DataType::Map(f1, s1), DataType::Map(f2, s2)) => s1 == s2 && f1.contains(f2),
835            (DataType::Struct(f1), DataType::Struct(f2)) => f1.contains(f2),
836            (DataType::Union(f1, s1), DataType::Union(f2, s2)) => {
837                s1 == s2
838                    && f1
839                        .iter()
840                        .all(|f1| f2.iter().any(|f2| f1.0 == f2.0 && f1.1.contains(f2.1)))
841            }
842            (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => {
843                k1.contains(k2) && v1.contains(v2)
844            }
845            _ => self == other,
846        }
847    }
848
849    /// Create a [`DataType::List`] with elements of the specified type
850    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
851    ///
852    /// To specify field level metadata, construct the inner [`Field`]
853    /// directly via [`Field::new`] or [`Field::new_list_field`].
854    pub fn new_list(data_type: DataType, nullable: bool) -> Self {
855        DataType::List(Arc::new(Field::new_list_field(data_type, nullable)))
856    }
857
858    /// Create a [`DataType::LargeList`] with elements of the specified type
859    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
860    ///
861    /// To specify field level metadata, construct the inner [`Field`]
862    /// directly via [`Field::new`] or [`Field::new_list_field`].
863    pub fn new_large_list(data_type: DataType, nullable: bool) -> Self {
864        DataType::LargeList(Arc::new(Field::new_list_field(data_type, nullable)))
865    }
866
867    /// Create a [`DataType::FixedSizeList`] with elements of the specified type, size
868    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
869    ///
870    /// To specify field level metadata, construct the inner [`Field`]
871    /// directly via [`Field::new`] or [`Field::new_list_field`].
872    pub fn new_fixed_size_list(data_type: DataType, size: i32, nullable: bool) -> Self {
873        DataType::FixedSizeList(Arc::new(Field::new_list_field(data_type, nullable)), size)
874    }
875}
876
/// The maximum precision for [`DataType::Decimal32`] values
pub const DECIMAL32_MAX_PRECISION: u8 = 9;

/// The maximum scale for [`DataType::Decimal32`] values
pub const DECIMAL32_MAX_SCALE: i8 = 9;

/// The maximum precision for [`DataType::Decimal64`] values
pub const DECIMAL64_MAX_PRECISION: u8 = 18;

/// The maximum scale for [`DataType::Decimal64`] values
pub const DECIMAL64_MAX_SCALE: i8 = 18;

/// The maximum precision for [`DataType::Decimal128`] values
pub const DECIMAL128_MAX_PRECISION: u8 = 38;

/// The maximum scale for [`DataType::Decimal128`] values
pub const DECIMAL128_MAX_SCALE: i8 = 38;

/// The maximum precision for [`DataType::Decimal256`] values
pub const DECIMAL256_MAX_PRECISION: u8 = 76;

/// The maximum scale for [`DataType::Decimal256`] values
pub const DECIMAL256_MAX_SCALE: i8 = 76;

/// The default scale for [`DataType::Decimal32`] values
pub const DECIMAL32_DEFAULT_SCALE: i8 = 2;

/// The default scale for [`DataType::Decimal64`] values
pub const DECIMAL64_DEFAULT_SCALE: i8 = 6;

/// The default scale for [`DataType::Decimal128`] and [`DataType::Decimal256`]
/// values
pub const DECIMAL_DEFAULT_SCALE: i8 = 10;
910
911#[cfg(test)]
912mod tests {
913    use super::*;
914
915    #[test]
916    #[cfg(feature = "serde")]
917    fn serde_struct_type() {
918        use std::collections::HashMap;
919
920        let kv_array = [("k".to_string(), "v".to_string())];
921        let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
922
923        // Non-empty map: should be converted as JSON obj { ... }
924        let first_name =
925            Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);
926
927        // Empty map: should be omitted.
928        let last_name =
929            Field::new("last_name", DataType::Utf8, false).with_metadata(HashMap::default());
930
931        let person = DataType::Struct(Fields::from(vec![
932            first_name,
933            last_name,
934            Field::new(
935                "address",
936                DataType::Struct(Fields::from(vec![
937                    Field::new("street", DataType::Utf8, false),
938                    Field::new("zip", DataType::UInt16, false),
939                ])),
940                false,
941            ),
942        ]));
943
944        let serialized = serde_json::to_string(&person).unwrap();
945
946        // NOTE that this is testing the default (derived) serialization format, not the
947        // JSON format specified in metadata.md
948
949        assert_eq!(
950            "{\"Struct\":[\
951             {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\
952             {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
953             {\"name\":\"address\",\"data_type\":{\"Struct\":\
954             [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
955             {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}\
956             ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}]}",
957            serialized
958        );
959
960        let deserialized = serde_json::from_str(&serialized).unwrap();
961
962        assert_eq!(person, deserialized);
963    }
964
965    #[test]
966    fn test_list_datatype_equality() {
967        // tests that list type equality is checked while ignoring list names
968        let list_a = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true)));
969        let list_b = DataType::List(Arc::new(Field::new("array", DataType::Int32, true)));
970        let list_c = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false)));
971        let list_d = DataType::List(Arc::new(Field::new_list_field(DataType::UInt32, true)));
972        assert!(list_a.equals_datatype(&list_b));
973        assert!(!list_a.equals_datatype(&list_c));
974        assert!(!list_b.equals_datatype(&list_c));
975        assert!(!list_a.equals_datatype(&list_d));
976
977        let list_e =
978            DataType::FixedSizeList(Arc::new(Field::new_list_field(list_a.clone(), false)), 3);
979        let list_f =
980            DataType::FixedSizeList(Arc::new(Field::new("array", list_b.clone(), false)), 3);
981        let list_g = DataType::FixedSizeList(
982            Arc::new(Field::new_list_field(DataType::FixedSizeBinary(3), true)),
983            3,
984        );
985        assert!(list_e.equals_datatype(&list_f));
986        assert!(!list_e.equals_datatype(&list_g));
987        assert!(!list_f.equals_datatype(&list_g));
988
989        let list_h = DataType::Struct(Fields::from(vec![Field::new("f1", list_e, true)]));
990        let list_i = DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), true)]));
991        let list_j = DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), false)]));
992        let list_k = DataType::Struct(Fields::from(vec![
993            Field::new("f1", list_f.clone(), false),
994            Field::new("f2", list_g.clone(), false),
995            Field::new("f3", DataType::Utf8, true),
996        ]));
997        let list_l = DataType::Struct(Fields::from(vec![
998            Field::new("ff1", list_f.clone(), false),
999            Field::new("ff2", list_g.clone(), false),
1000            Field::new("ff3", DataType::LargeUtf8, true),
1001        ]));
1002        let list_m = DataType::Struct(Fields::from(vec![
1003            Field::new("ff1", list_f, false),
1004            Field::new("ff2", list_g, false),
1005            Field::new("ff3", DataType::Utf8, true),
1006        ]));
1007        assert!(list_h.equals_datatype(&list_i));
1008        assert!(!list_h.equals_datatype(&list_j));
1009        assert!(!list_k.equals_datatype(&list_l));
1010        assert!(list_k.equals_datatype(&list_m));
1011
1012        let list_n = DataType::Map(Arc::new(Field::new("f1", list_a.clone(), true)), true);
1013        let list_o = DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), true);
1014        let list_p = DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), false);
1015        let list_q = DataType::Map(Arc::new(Field::new("f2", list_c.clone(), true)), true);
1016        let list_r = DataType::Map(Arc::new(Field::new("f1", list_a.clone(), false)), true);
1017
1018        assert!(list_n.equals_datatype(&list_o));
1019        assert!(!list_n.equals_datatype(&list_p));
1020        assert!(!list_n.equals_datatype(&list_q));
1021        assert!(!list_n.equals_datatype(&list_r));
1022
1023        let list_s = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_a));
1024        let list_t = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_b.clone()));
1025        let list_u = DataType::Dictionary(Box::new(DataType::Int8), Box::new(list_b));
1026        let list_v = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_c));
1027
1028        assert!(list_s.equals_datatype(&list_t));
1029        assert!(!list_s.equals_datatype(&list_u));
1030        assert!(!list_s.equals_datatype(&list_v));
1031
1032        let union_a = DataType::Union(
1033            UnionFields::try_new(
1034                vec![1, 2],
1035                vec![
1036                    Field::new("f1", DataType::Utf8, false),
1037                    Field::new("f2", DataType::UInt8, false),
1038                ],
1039            )
1040            .unwrap(),
1041            UnionMode::Sparse,
1042        );
1043        let union_b = DataType::Union(
1044            UnionFields::try_new(
1045                vec![1, 2],
1046                vec![
1047                    Field::new("ff1", DataType::Utf8, false),
1048                    Field::new("ff2", DataType::UInt8, false),
1049                ],
1050            )
1051            .unwrap(),
1052            UnionMode::Sparse,
1053        );
1054        let union_c = DataType::Union(
1055            UnionFields::try_new(
1056                vec![2, 1],
1057                vec![
1058                    Field::new("fff2", DataType::UInt8, false),
1059                    Field::new("fff1", DataType::Utf8, false),
1060                ],
1061            )
1062            .unwrap(),
1063            UnionMode::Sparse,
1064        );
1065        let union_d = DataType::Union(
1066            UnionFields::try_new(
1067                vec![2, 1],
1068                vec![
1069                    Field::new("fff1", DataType::Int8, false),
1070                    Field::new("fff2", DataType::UInt8, false),
1071                ],
1072            )
1073            .unwrap(),
1074            UnionMode::Sparse,
1075        );
1076        let union_e = DataType::Union(
1077            UnionFields::try_new(
1078                vec![1, 2],
1079                vec![
1080                    Field::new("f1", DataType::Utf8, true),
1081                    Field::new("f2", DataType::UInt8, false),
1082                ],
1083            )
1084            .unwrap(),
1085            UnionMode::Sparse,
1086        );
1087
1088        assert!(union_a.equals_datatype(&union_b));
1089        assert!(union_a.equals_datatype(&union_c));
1090        assert!(!union_a.equals_datatype(&union_d));
1091        assert!(!union_a.equals_datatype(&union_e));
1092
1093        let list_w = DataType::RunEndEncoded(
1094            Arc::new(Field::new("f1", DataType::Int64, true)),
1095            Arc::new(Field::new("f2", DataType::Utf8, true)),
1096        );
1097        let list_x = DataType::RunEndEncoded(
1098            Arc::new(Field::new("ff1", DataType::Int64, true)),
1099            Arc::new(Field::new("ff2", DataType::Utf8, true)),
1100        );
1101        let list_y = DataType::RunEndEncoded(
1102            Arc::new(Field::new("ff1", DataType::UInt16, true)),
1103            Arc::new(Field::new("ff2", DataType::Utf8, true)),
1104        );
1105        let list_z = DataType::RunEndEncoded(
1106            Arc::new(Field::new("f1", DataType::Int64, false)),
1107            Arc::new(Field::new("f2", DataType::Utf8, true)),
1108        );
1109
1110        assert!(list_w.equals_datatype(&list_x));
1111        assert!(!list_w.equals_datatype(&list_y));
1112        assert!(!list_w.equals_datatype(&list_z));
1113    }
1114
1115    #[test]
1116    fn create_struct_type() {
1117        let _person = DataType::Struct(Fields::from(vec![
1118            Field::new("first_name", DataType::Utf8, false),
1119            Field::new("last_name", DataType::Utf8, false),
1120            Field::new(
1121                "address",
1122                DataType::Struct(Fields::from(vec![
1123                    Field::new("street", DataType::Utf8, false),
1124                    Field::new("zip", DataType::UInt16, false),
1125                ])),
1126                false,
1127            ),
1128        ]));
1129    }
1130
1131    #[test]
1132    fn test_nested() {
1133        let list = DataType::List(Arc::new(Field::new("foo", DataType::Utf8, true)));
1134        let list_view = DataType::ListView(Arc::new(Field::new("foo", DataType::Utf8, true)));
1135        let large_list_view =
1136            DataType::LargeListView(Arc::new(Field::new("foo", DataType::Utf8, true)));
1137
1138        assert!(!DataType::is_nested(&DataType::Boolean));
1139        assert!(!DataType::is_nested(&DataType::Int32));
1140        assert!(!DataType::is_nested(&DataType::Utf8));
1141        assert!(DataType::is_nested(&list));
1142        assert!(DataType::is_nested(&list_view));
1143        assert!(DataType::is_nested(&large_list_view));
1144
1145        assert!(!DataType::is_nested(&DataType::Dictionary(
1146            Box::new(DataType::Int32),
1147            Box::new(DataType::Boolean)
1148        )));
1149        assert!(!DataType::is_nested(&DataType::Dictionary(
1150            Box::new(DataType::Int32),
1151            Box::new(DataType::Int64)
1152        )));
1153        assert!(!DataType::is_nested(&DataType::Dictionary(
1154            Box::new(DataType::Int32),
1155            Box::new(DataType::LargeUtf8)
1156        )));
1157        assert!(DataType::is_nested(&DataType::Dictionary(
1158            Box::new(DataType::Int32),
1159            Box::new(list)
1160        )));
1161    }
1162
1163    #[test]
1164    fn test_integer() {
1165        // is_integer
1166        assert!(DataType::is_integer(&DataType::Int32));
1167        assert!(DataType::is_integer(&DataType::UInt64));
1168        assert!(!DataType::is_integer(&DataType::Float16));
1169
1170        // is_signed_integer
1171        assert!(DataType::is_signed_integer(&DataType::Int32));
1172        assert!(!DataType::is_signed_integer(&DataType::UInt64));
1173        assert!(!DataType::is_signed_integer(&DataType::Float16));
1174
1175        // is_unsigned_integer
1176        assert!(!DataType::is_unsigned_integer(&DataType::Int32));
1177        assert!(DataType::is_unsigned_integer(&DataType::UInt64));
1178        assert!(!DataType::is_unsigned_integer(&DataType::Float16));
1179
1180        // is_dictionary_key_type
1181        assert!(DataType::is_dictionary_key_type(&DataType::Int32));
1182        assert!(DataType::is_dictionary_key_type(&DataType::UInt64));
1183        assert!(!DataType::is_dictionary_key_type(&DataType::Float16));
1184    }
1185
1186    #[test]
1187    fn test_string() {
1188        assert!(DataType::is_string(&DataType::Utf8));
1189        assert!(DataType::is_string(&DataType::LargeUtf8));
1190        assert!(DataType::is_string(&DataType::Utf8View));
1191        assert!(!DataType::is_string(&DataType::Int32));
1192    }
1193
1194    #[test]
1195    fn test_floating() {
1196        assert!(DataType::is_floating(&DataType::Float16));
1197        assert!(!DataType::is_floating(&DataType::Int32));
1198    }
1199
1200    #[test]
1201    fn test_decimal() {
1202        assert!(DataType::is_decimal(&DataType::Decimal32(4, 2)));
1203        assert!(DataType::is_decimal(&DataType::Decimal64(4, 2)));
1204        assert!(DataType::is_decimal(&DataType::Decimal128(4, 2)));
1205        assert!(DataType::is_decimal(&DataType::Decimal256(4, 2)));
1206        assert!(!DataType::is_decimal(&DataType::Float16));
1207    }
1208
1209    #[test]
1210    fn test_datatype_is_null() {
1211        assert!(DataType::is_null(&DataType::Null));
1212        assert!(!DataType::is_null(&DataType::Int32));
1213    }
1214
1215    #[test]
1216    fn test_is_list() {
1217        assert!(DataType::is_list(&DataType::new_list(
1218            DataType::Int16,
1219            true
1220        )));
1221        assert!(DataType::is_list(&DataType::new_large_list(
1222            DataType::Int16,
1223            true
1224        )));
1225        assert!(DataType::is_list(&DataType::new_fixed_size_list(
1226            DataType::Int16,
1227            5,
1228            true
1229        )));
1230        assert!(DataType::is_list(&DataType::ListView(Arc::new(
1231            Field::new("f", DataType::Int16, true)
1232        ))));
1233        assert!(DataType::is_list(&DataType::LargeListView(Arc::new(
1234            Field::new("f", DataType::Int16, true)
1235        ))));
1236        assert!(!DataType::is_list(&DataType::Binary));
1237    }
1238
1239    #[test]
1240    fn test_is_binary() {
1241        assert!(DataType::is_binary(&DataType::Binary));
1242        assert!(DataType::is_binary(&DataType::LargeBinary));
1243        assert!(DataType::is_binary(&DataType::BinaryView));
1244        assert!(!DataType::is_list(&DataType::Utf8View));
1245    }
1246
1247    #[test]
1248    fn size_should_not_regress() {
1249        assert_eq!(std::mem::size_of::<DataType>(), 24);
1250    }
1251
1252    #[test]
1253    #[should_panic(expected = "duplicate type id: 1")]
1254    fn test_union_with_duplicated_type_id() {
1255        let type_ids = vec![1, 1];
1256        let _union = DataType::Union(
1257            UnionFields::try_new(
1258                type_ids,
1259                vec![
1260                    Field::new("f1", DataType::Int32, false),
1261                    Field::new("f2", DataType::Utf8, false),
1262                ],
1263            )
1264            .unwrap(),
1265            UnionMode::Dense,
1266        );
1267    }
1268
1269    #[test]
1270    fn test_try_from_str() {
1271        let data_type: DataType = "Int32".try_into().unwrap();
1272        assert_eq!(data_type, DataType::Int32);
1273    }
1274
1275    #[test]
1276    fn test_from_str() {
1277        let data_type: DataType = "UInt64".parse().unwrap();
1278        assert_eq!(data_type, DataType::UInt64);
1279    }
1280
    #[test]
    #[cfg_attr(miri, ignore)] // Can't handle the inlined strings of the assert_debug_snapshot macro
    fn test_debug_format_field() {
        // Make sure the `Debug` formatting of `DataType` is readable and not too long.
        // The inline snapshot pins the exact `Debug` output for a list of `Int8`
        // created with `nullable = false`: the inner field is expected to print
        // only its `data_type`.
        insta::assert_debug_snapshot!(DataType::new_list(DataType::Int8, false), @r"
        List(
            Field {
                data_type: Int8,
            },
        )
        ");
    }
1293}