arrow_schema/datatype.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::str::FromStr;
19use std::sync::Arc;
20
21use crate::{ArrowError, Field, FieldRef, Fields, UnionFields};
22
/// Datatypes supported by this implementation of Apache Arrow.
///
/// The variants of this enum include primitive fixed size types as well as
/// parametric or nested types. See [`Schema.fbs`] for Arrow's specification.
///
/// # Examples
///
/// Primitive types
/// ```
/// # use arrow_schema::DataType;
/// // create a new 32-bit signed integer
/// let data_type = DataType::Int32;
/// ```
///
/// Nested Types
/// ```
/// # use arrow_schema::{DataType, Field};
/// # use std::sync::Arc;
/// // create a new list of 32-bit signed integers directly
/// let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true)));
/// // Create the same list type with constructor
/// let list_data_type2 = DataType::new_list(DataType::Int32, true);
/// assert_eq!(list_data_type, list_data_type2);
/// ```
///
/// Dictionary Types
/// ```
/// # use arrow_schema::{DataType};
/// // String Dictionary (key type Int32 and value type Utf8)
/// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
/// ```
///
/// Timestamp Types
/// ```
/// # use arrow_schema::{DataType, TimeUnit};
/// // timestamp with millisecond precision without timezone specified
/// let data_type = DataType::Timestamp(TimeUnit::Millisecond, None);
/// // timestamp with nanosecond precision in UTC timezone
/// let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into()));
/// ```
///
/// # Display and FromStr
///
/// The `Display` and `FromStr` implementations for `DataType` are
/// human-readable, parseable, and reversible.
///
/// ```
/// # use arrow_schema::DataType;
/// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
/// let data_type_string = data_type.to_string();
/// assert_eq!(data_type_string, "Dictionary(Int32, Utf8)");
/// // display can be parsed back into the original type
/// let parsed_data_type: DataType = data_type.to_string().parse().unwrap();
/// assert_eq!(data_type, parsed_data_type);
/// ```
///
/// # Nested Support
/// Currently, the Rust implementation supports the following nested types:
///  - `List<T>`
///  - `LargeList<T>`
///  - `FixedSizeList<T>`
///  - `Struct<T, U, V, ...>`
///  - `Union<T, U, V, ...>`
///  - `Map<K, V>`
///
/// Nested types can themselves be nested within other arrays.
/// For more information on these types please see
/// [the physical memory layout of Apache Arrow]
///
/// [`Schema.fbs`]: https://github.com/apache/arrow/blob/main/format/Schema.fbs
/// [the physical memory layout of Apache Arrow]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum DataType {
    /// Null type
    Null,
    /// A boolean datatype representing the values `true` and `false`.
    Boolean,
    /// A signed 8-bit integer.
    Int8,
    /// A signed 16-bit integer.
    Int16,
    /// A signed 32-bit integer.
    Int32,
    /// A signed 64-bit integer.
    Int64,
    /// An unsigned 8-bit integer.
    UInt8,
    /// An unsigned 16-bit integer.
    UInt16,
    /// An unsigned 32-bit integer.
    UInt32,
    /// An unsigned 64-bit integer.
    UInt64,
    /// A 16-bit floating point number.
    Float16,
    /// A 32-bit floating point number.
    Float32,
    /// A 64-bit floating point number.
    Float64,
    /// A timestamp with an optional timezone.
    ///
    /// Time is measured as a Unix epoch, counting the seconds from
    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
    /// as a signed 64-bit integer.
    ///
    /// The time zone is a string indicating the name of a time zone, one of:
    ///
    /// * As used in the Olson time zone database (the "tz database" or
    ///   "tzdata"), such as "America/New_York"
    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
    ///
    /// Timestamps with a non-empty timezone
    /// ------------------------------------
    ///
    /// If a Timestamp column has a non-empty timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
    /// (the Unix epoch), regardless of the Timestamp's own timezone.
    ///
    /// Therefore, timestamp values with a non-empty timezone correspond to
    /// physical points in time together with some additional information about
    /// how the data was obtained and/or how to display it (the timezone).
    ///
    ///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
    ///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
    ///   application may prefer to display it as "January 1st 1970, 01h00" in
    ///   the Europe/Paris timezone (which is the same physical point in time).
    ///
    /// One consequence is that timestamp values with a non-empty timezone
    /// can be compared and ordered directly, since they all share the same
    /// well-known point of reference (the Unix epoch).
    ///
    /// Timestamps with an unset / empty timezone
    /// -----------------------------------------
    ///
    /// If a Timestamp column has no timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
    ///
    /// Therefore, timestamp values without a timezone cannot be meaningfully
    /// interpreted as physical points in time, but only as calendar / clock
    /// indications ("wall clock time") in an unspecified timezone.
    ///
    ///   For example, the timestamp value 0 with an empty timezone string
    ///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
    ///   is not enough information to interpret it as a well-defined physical
    ///   point in time.
    ///
    /// One consequence is that timestamp values without a timezone cannot
    /// be reliably compared or ordered, since they may have different points of
    /// reference.  In particular, it is *not* possible to interpret an unset
    /// or empty timezone as the same as "UTC".
    ///
    /// Conversion between timezones
    /// ----------------------------
    ///
    /// If a Timestamp column has a non-empty timezone, changing the timezone
    /// to a different non-empty value is a metadata-only operation:
    /// the timestamp values need not change as their point of reference remains
    /// the same (the Unix epoch).
    ///
    /// However, if a Timestamp column has no timezone value, changing it to a
    /// non-empty value requires thinking about the desired semantics.
    /// One possibility is to assume that the original timestamp values are
    /// relative to the epoch of the timezone being set; timestamp values should
    /// then be adjusted to the Unix epoch (for example, changing the timezone from
    /// empty to "Europe/Paris" would require converting the timestamp values
    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
    /// nevertheless correct).
    ///
    /// ```
    /// # use arrow_schema::{DataType, TimeUnit};
    /// DataType::Timestamp(TimeUnit::Second, None);
    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
    /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
    /// ```
    ///
    /// Timezone representation
    /// -----------------------
    /// It is possible to use either the timezone string representation, such as "UTC", or the absolute time zone offset "+00:00".
    /// For timezones with fixed offsets, such as "UTC" or "JST", the offset representation is recommended, as it is more explicit and less ambiguous.
    ///
    /// Most arrow-rs functionalities use the absolute offset representation,
    /// such as [`PrimitiveArray::with_timezone_utc`] that applies a
    /// UTC timezone to timestamp arrays.
    ///
    /// [`PrimitiveArray::with_timezone_utc`]: https://docs.rs/arrow/latest/arrow/array/struct.PrimitiveArray.html#method.with_timezone_utc
    ///
    /// Timezone string parsing
    /// -----------------------
    /// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930".
    ///
    /// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/)
    /// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
    /// timezones.
    Timestamp(TimeUnit, Option<Arc<str>>),
    /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in days.
    Date32,
    /// A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in milliseconds.
    ///
    /// # Valid Ranges
    ///
    /// According to the Arrow specification ([Schema.fbs]), values of Date64
    /// are treated as the number of *days*, in milliseconds, since the UNIX
    /// epoch. Therefore, values of this type must be evenly divisible by
    /// `86_400_000`, the number of milliseconds in a standard day.
    ///
    /// It is not valid to store milliseconds that do not represent an exact
    /// day. The reason for this restriction is compatibility with other
    /// languages' native libraries (specifically Java), which historically
    /// lacked a dedicated date type and only supported timestamps.
    ///
    /// # Validation
    ///
    /// This library does not validate or enforce that Date64 values are evenly
    /// divisible by `86_400_000` for performance and usability reasons. Date64
    /// values are treated similarly to `Timestamp(TimeUnit::Millisecond,
    /// None)`: values will be displayed with a time of day if the value does
    /// not represent an exact day, and arithmetic will be done at the
    /// millisecond granularity.
    ///
    /// # Recommendation
    ///
    /// Users should prefer [`Date32`] to cleanly represent the number
    /// of days, or one of the Timestamp variants to include time as part of the
    /// representation, depending on their use case.
    ///
    /// # Further Reading
    ///
    /// For more details, see [#5288](https://github.com/apache/arrow-rs/issues/5288).
    ///
    /// [`Date32`]: Self::Date32
    /// [Schema.fbs]: https://github.com/apache/arrow/blob/main/format/Schema.fbs
    Date64,
    /// A signed 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
    /// Must be either seconds or milliseconds.
    Time32(TimeUnit),
    /// A signed 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
    /// Must be either microseconds or nanoseconds.
    Time64(TimeUnit),
    /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
    Duration(TimeUnit),
    /// A "calendar" interval which models types that don't necessarily
    /// have a precise duration without the context of a base timestamp (e.g.
    /// days can differ in length during daylight saving time transitions).
    Interval(IntervalUnit),
    /// Opaque binary data of variable length.
    ///
    /// A single Binary array can store up to [`i32::MAX`] bytes
    /// of binary data in total.
    Binary,
    /// Opaque binary data of fixed size.
    /// Enum parameter specifies the number of bytes per value.
    FixedSizeBinary(i32),
    /// Opaque binary data of variable length and 64-bit offsets.
    ///
    /// A single LargeBinary array can store up to [`i64::MAX`] bytes
    /// of binary data in total.
    LargeBinary,
    /// Opaque binary data of variable length.
    ///
    /// Logically the same as [`Binary`], but the internal representation uses a view
    /// struct that contains the value's length and either the value's entire data
    /// inline (for small values) or an inlined prefix, an index of another buffer,
    /// and an offset pointing to a slice in that buffer (for non-small values).
    ///
    /// [`Binary`]: Self::Binary
    BinaryView,
    /// A variable-length string in Unicode with UTF-8 encoding.
    ///
    /// A single Utf8 array can store up to [`i32::MAX`] bytes
    /// of string data in total.
    Utf8,
    /// A variable-length string in Unicode with UTF-8 encoding and 64-bit offsets.
    ///
    /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes
    /// of string data in total.
    LargeUtf8,
    /// A variable-length string in Unicode with UTF-8 encoding.
    ///
    /// Logically the same as [`Utf8`], but the internal representation uses a view
    /// struct that contains the string length and either the string's entire data
    /// inline (for small strings) or an inlined prefix, an index of another buffer,
    /// and an offset pointing to a slice in that buffer (for non-small strings).
    ///
    /// [`Utf8`]: Self::Utf8
    Utf8View,
    /// A list of some logical data type with variable length.
    ///
    /// A single List array can store up to [`i32::MAX`] elements in total.
    List(FieldRef),

    /// A list of some logical data type with variable length.
    ///
    /// Logically the same as [`List`], but the internal representation differs in how child
    /// data is referenced, allowing flexibility in how data is laid out.
    ///
    /// [`List`]: Self::List
    ListView(FieldRef),
    /// A list of some logical data type with fixed length.
    FixedSizeList(FieldRef, i32),
    /// A list of some logical data type with variable length and 64-bit offsets.
    ///
    /// A single LargeList array can store up to [`i64::MAX`] elements in total.
    LargeList(FieldRef),

    /// A list of some logical data type with variable length and 64-bit offsets.
    ///
    /// Logically the same as [`LargeList`], but the internal representation differs in how child
    /// data is referenced, allowing flexibility in how data is laid out.
    ///
    /// [`LargeList`]: Self::LargeList
    LargeListView(FieldRef),
    /// A nested datatype that contains a number of sub-fields.
    Struct(Fields),
    /// A nested datatype that can represent slots of differing types. Components:
    ///
    /// 1. [`UnionFields`]
    /// 2. The type of union (Sparse or Dense)
    Union(UnionFields, UnionMode),
    /// A dictionary encoded array (`key_type`, `value_type`), where
    /// each array element is an index of `key_type` into an
    /// associated dictionary of `value_type`.
    ///
    /// Dictionary arrays are used to store columns of `value_type`
    /// that contain many repeated values using less memory, but with
    /// a higher CPU overhead for some operations.
    ///
    /// This type is mostly used to represent low cardinality string
    /// arrays or a limited set of primitive types as integers.
    Dictionary(Box<DataType>, Box<DataType>),
    /// Exact 32-bit width decimal value with precision and scale
    ///
    /// * precision is the total number of digits
    /// * scale is the number of digits past the decimal
    ///
    /// For example the number 123.45 has precision 5 and scale 2.
    ///
    /// In certain situations, scale can be a negative number. For
    /// a negative scale, it is the number of padding zeros to the right
    /// of the digits.
    ///
    /// For example the number 12300 could be treated as a decimal
    /// that has precision 3 and scale -2.
    Decimal32(u8, i8),
    /// Exact 64-bit width decimal value with precision and scale
    ///
    /// * precision is the total number of digits
    /// * scale is the number of digits past the decimal
    ///
    /// For example the number 123.45 has precision 5 and scale 2.
    ///
    /// In certain situations, scale can be a negative number. For
    /// a negative scale, it is the number of padding zeros to the right
    /// of the digits.
    ///
    /// For example the number 12300 could be treated as a decimal
    /// that has precision 3 and scale -2.
    Decimal64(u8, i8),
    /// Exact 128-bit width decimal value with precision and scale
    ///
    /// * precision is the total number of digits
    /// * scale is the number of digits past the decimal
    ///
    /// For example the number 123.45 has precision 5 and scale 2.
    ///
    /// In certain situations, scale can be a negative number. For
    /// a negative scale, it is the number of padding zeros to the right
    /// of the digits.
    ///
    /// For example the number 12300 could be treated as a decimal
    /// that has precision 3 and scale -2.
    Decimal128(u8, i8),
    /// Exact 256-bit width decimal value with precision and scale
    ///
    /// * precision is the total number of digits
    /// * scale is the number of digits past the decimal
    ///
    /// For example the number 123.45 has precision 5 and scale 2.
    ///
    /// In certain situations, scale can be a negative number. For
    /// a negative scale, it is the number of padding zeros to the right
    /// of the digits.
    ///
    /// For example the number 12300 could be treated as a decimal
    /// that has precision 3 and scale -2.
    Decimal256(u8, i8),
    /// A Map is a logical nested type that is represented as
    ///
    /// `List<entries: Struct<key: K, value: V>>`
    ///
    /// The keys and values are each respectively contiguous.
    /// The key and value types are not constrained, but keys should be
    /// hashable and unique.
    /// Whether the keys are sorted can be set in the `bool` after the `Field`.
    ///
    /// In a field with Map type, the field has a child Struct field, which then
    /// has two children: the first the key type and the second the value type.
    /// The names of the child fields may be respectively "entries", "key", and
    /// "value", but this is not enforced.
    Map(FieldRef, bool),
    /// A run-end encoding (REE) is a variation of run-length encoding (RLE). These
    /// encodings are well-suited for representing data containing sequences of the
    /// same value, called runs. Each run is represented as a value and an integer giving
    /// the index in the array where the run ends.
    ///
    /// A run-end encoded array has no buffers by itself, but has two child arrays. The
    /// first child array, called the run ends array, holds either 16, 32, or 64-bit
    /// signed integers. The actual values of each run are held in the second child array.
    ///
    /// These child arrays are prescribed the standard names of "run_ends" and "values"
    /// respectively.
    RunEndEncoded(FieldRef, FieldRef),
}
438
/// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds.
///
/// The [`Display`](std::fmt::Display) form of a unit is its abbreviation:
/// `s`, `ms`, `µs` or `ns`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum TimeUnit {
    /// Time in seconds.
    Second,
    /// Time in milliseconds.
    Millisecond,
    /// Time in microseconds.
    Microsecond,
    /// Time in nanoseconds.
    Nanosecond,
}

impl std::fmt::Display for TimeUnit {
    /// Writes the abbreviation of the unit to the formatter.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let abbreviation = match self {
            Self::Second => "s",
            Self::Millisecond => "ms",
            Self::Microsecond => "µs",
            Self::Nanosecond => "ns",
        };
        f.write_str(abbreviation)
    }
}
463
/// YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO interval in SQL style.
///
/// This is the parameter of [`DataType::Interval`] and selects which of the
/// three interval representations a column uses.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum IntervalUnit {
    /// Indicates the number of elapsed whole months, stored as 4-byte integers.
    YearMonth,
    /// Indicates the number of elapsed days and milliseconds,
    /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total).
    DayTime,
    /// A triple of the number of elapsed months, days, and nanoseconds.
    /// The values are stored contiguously in 16 byte blocks. Months and
    /// days are encoded as 32 bit integers and nanoseconds is encoded as a
    /// 64 bit integer. All integers are signed. Each field is independent
    /// (e.g. there is no constraint that nanoseconds have the same sign
    /// as days or that the quantity of nanoseconds represents less
    /// than a day's worth of time).
    MonthDayNano,
}
482
/// Sparse or Dense union layouts
///
/// This is the second component of [`DataType::Union`].
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum UnionMode {
    /// Sparse union layout
    Sparse,
    /// Dense union layout
    Dense,
}
492
/// Parses `str` into a `DataType`.
///
/// This is the reverse of [`DataType`]'s `Display`
/// impl, and maintains the invariant that
/// `data_type.to_string().parse::<DataType>().unwrap() == data_type`
///
/// Parsing is delegated to `crate::datatype_parse::parse_data_type`, and
/// whatever [`ArrowError`] it reports is returned unchanged.
///
/// # Example
/// ```
/// use arrow_schema::DataType;
///
/// let data_type: DataType = "Int32".parse().unwrap();
/// assert_eq!(data_type, DataType::Int32);
/// ```
impl FromStr for DataType {
    type Err = ArrowError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        crate::datatype_parse::parse_data_type(s)
    }
}
513
514impl TryFrom<&str> for DataType {
515 type Error = ArrowError;
516
517 fn try_from(value: &str) -> Result<Self, Self::Error> {
518 value.parse()
519 }
520}
521
522impl DataType {
523 /// Returns true if the type is primitive: (numeric, temporal).
524 #[inline]
525 pub fn is_primitive(&self) -> bool {
526 self.is_numeric() || self.is_temporal()
527 }
528
529 /// Returns true if this type is numeric: (UInt*, Int*, Float*, Decimal*).
530 #[inline]
531 pub fn is_numeric(&self) -> bool {
532 use DataType::*;
533 matches!(
534 self,
535 UInt8
536 | UInt16
537 | UInt32
538 | UInt64
539 | Int8
540 | Int16
541 | Int32
542 | Int64
543 | Float16
544 | Float32
545 | Float64
546 | Decimal32(_, _)
547 | Decimal64(_, _)
548 | Decimal128(_, _)
549 | Decimal256(_, _)
550 )
551 }
552
553 /// Returns true if this type is temporal: (Date*, Time*, Duration, or Interval).
554 #[inline]
555 pub fn is_temporal(&self) -> bool {
556 use DataType::*;
557 matches!(
558 self,
559 Date32 | Date64 | Timestamp(_, _) | Time32(_) | Time64(_) | Duration(_) | Interval(_)
560 )
561 }
562
563 /// Returns true if this type is floating: (Float*).
564 #[inline]
565 pub fn is_floating(&self) -> bool {
566 use DataType::*;
567 matches!(self, Float16 | Float32 | Float64)
568 }
569
570 /// Returns true if this type is integer: (Int*, UInt*).
571 #[inline]
572 pub fn is_integer(&self) -> bool {
573 self.is_signed_integer() || self.is_unsigned_integer()
574 }
575
576 /// Returns true if this type is signed integer: (Int*).
577 #[inline]
578 pub fn is_signed_integer(&self) -> bool {
579 use DataType::*;
580 matches!(self, Int8 | Int16 | Int32 | Int64)
581 }
582
583 /// Returns true if this type is unsigned integer: (UInt*).
584 #[inline]
585 pub fn is_unsigned_integer(&self) -> bool {
586 use DataType::*;
587 matches!(self, UInt8 | UInt16 | UInt32 | UInt64)
588 }
589
590 /// Returns true if this type is decimal: (Decimal*).
591 #[inline]
592 pub fn is_decimal(&self) -> bool {
593 use DataType::*;
594 matches!(
595 self,
596 Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..)
597 )
598 }
599
600 /// Returns true if this type is valid as a dictionary key
601 #[inline]
602 pub fn is_dictionary_key_type(&self) -> bool {
603 self.is_integer()
604 }
605
606 /// Returns true if this type is valid for run-ends array in RunArray
607 #[inline]
608 pub fn is_run_ends_type(&self) -> bool {
609 use DataType::*;
610 matches!(self, Int16 | Int32 | Int64)
611 }
612
613 /// Returns true if this type is nested (List, FixedSizeList, LargeList, ListView. LargeListView, Struct, Union,
614 /// or Map), or a dictionary of a nested type
615 #[inline]
616 pub fn is_nested(&self) -> bool {
617 use DataType::*;
618 match self {
619 Dictionary(_, v) => DataType::is_nested(v.as_ref()),
620 RunEndEncoded(_, v) => DataType::is_nested(v.data_type()),
621 List(_)
622 | FixedSizeList(_, _)
623 | LargeList(_)
624 | ListView(_)
625 | LargeListView(_)
626 | Struct(_)
627 | Union(_, _)
628 | Map(_, _) => true,
629 _ => false,
630 }
631 }
632
633 /// Returns true if this type is DataType::Null.
634 #[inline]
635 pub fn is_null(&self) -> bool {
636 use DataType::*;
637 matches!(self, Null)
638 }
639
640 /// Returns true if this type is a String type
641 #[inline]
642 pub fn is_string(&self) -> bool {
643 use DataType::*;
644 matches!(self, Utf8 | LargeUtf8 | Utf8View)
645 }
646
647 /// Returns true if this type is a List type.
648 ///
649 /// List types include List, LargeList, FixedSizeList, ListView, and LargeListView.
650 #[inline]
651 pub fn is_list(&self) -> bool {
652 use DataType::*;
653 matches!(
654 self,
655 List(_) | LargeList(_) | FixedSizeList(_, _) | ListView(_) | LargeListView(_)
656 )
657 }
658
659 /// Returns true if this type is a Binary type.
660 ///
661 /// Binary types include Binary, LargeBinary, FixedSizeBinary and BinaryView.
662 #[inline]
663 pub fn is_binary(&self) -> bool {
664 use DataType::*;
665 matches!(self, Binary | LargeBinary | FixedSizeBinary(_) | BinaryView)
666 }
667
668 /// Compares the datatype with another, ignoring nested field names
669 /// and metadata.
670 pub fn equals_datatype(&self, other: &DataType) -> bool {
671 match (&self, other) {
672 (DataType::List(a), DataType::List(b))
673 | (DataType::LargeList(a), DataType::LargeList(b))
674 | (DataType::ListView(a), DataType::ListView(b))
675 | (DataType::LargeListView(a), DataType::LargeListView(b)) => {
676 a.is_nullable() == b.is_nullable() && a.data_type().equals_datatype(b.data_type())
677 }
678 (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => {
679 a_size == b_size
680 && a.is_nullable() == b.is_nullable()
681 && a.data_type().equals_datatype(b.data_type())
682 }
683 (DataType::Struct(a), DataType::Struct(b)) => {
684 a.len() == b.len()
685 && a.iter().zip(b).all(|(a, b)| {
686 a.is_nullable() == b.is_nullable()
687 && a.data_type().equals_datatype(b.data_type())
688 })
689 }
690 (DataType::Map(a_field, a_is_sorted), DataType::Map(b_field, b_is_sorted)) => {
691 a_field.is_nullable() == b_field.is_nullable()
692 && a_field.data_type().equals_datatype(b_field.data_type())
693 && a_is_sorted == b_is_sorted
694 }
695 (DataType::Dictionary(a_key, a_value), DataType::Dictionary(b_key, b_value)) => {
696 a_key.equals_datatype(b_key) && a_value.equals_datatype(b_value)
697 }
698 (
699 DataType::RunEndEncoded(a_run_ends, a_values),
700 DataType::RunEndEncoded(b_run_ends, b_values),
701 ) => {
702 a_run_ends.is_nullable() == b_run_ends.is_nullable()
703 && a_run_ends
704 .data_type()
705 .equals_datatype(b_run_ends.data_type())
706 && a_values.is_nullable() == b_values.is_nullable()
707 && a_values.data_type().equals_datatype(b_values.data_type())
708 }
709 (
710 DataType::Union(a_union_fields, a_union_mode),
711 DataType::Union(b_union_fields, b_union_mode),
712 ) => {
713 a_union_mode == b_union_mode
714 && a_union_fields.len() == b_union_fields.len()
715 && a_union_fields.iter().all(|a| {
716 b_union_fields.iter().any(|b| {
717 a.0 == b.0
718 && a.1.is_nullable() == b.1.is_nullable()
719 && a.1.data_type().equals_datatype(b.1.data_type())
720 })
721 })
722 }
723 _ => self == other,
724 }
725 }
726
727 /// Returns the byte width of this type if it is a primitive type
728 ///
729 /// Returns `None` if not a primitive type
730 #[inline]
731 pub fn primitive_width(&self) -> Option<usize> {
732 match self {
733 DataType::Null => None,
734 DataType::Boolean => None,
735 DataType::Int8 | DataType::UInt8 => Some(1),
736 DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2),
737 DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4),
738 DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8),
739 DataType::Timestamp(_, _) => Some(8),
740 DataType::Date32 | DataType::Time32(_) => Some(4),
741 DataType::Date64 | DataType::Time64(_) => Some(8),
742 DataType::Duration(_) => Some(8),
743 DataType::Interval(IntervalUnit::YearMonth) => Some(4),
744 DataType::Interval(IntervalUnit::DayTime) => Some(8),
745 DataType::Interval(IntervalUnit::MonthDayNano) => Some(16),
746 DataType::Decimal32(_, _) => Some(4),
747 DataType::Decimal64(_, _) => Some(8),
748 DataType::Decimal128(_, _) => Some(16),
749 DataType::Decimal256(_, _) => Some(32),
750 DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => None,
751 DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None,
752 DataType::FixedSizeBinary(_) => None,
753 DataType::List(_)
754 | DataType::ListView(_)
755 | DataType::LargeList(_)
756 | DataType::LargeListView(_)
757 | DataType::Map(_, _) => None,
758 DataType::FixedSizeList(_, _) => None,
759 DataType::Struct(_) => None,
760 DataType::Union(_, _) => None,
761 DataType::Dictionary(_, _) => None,
762 DataType::RunEndEncoded(_, _) => None,
763 }
764 }
765
766 /// Return size of this instance in bytes.
767 ///
768 /// Includes the size of `Self`.
769 pub fn size(&self) -> usize {
770 std::mem::size_of_val(self)
771 + match self {
772 DataType::Null
773 | DataType::Boolean
774 | DataType::Int8
775 | DataType::Int16
776 | DataType::Int32
777 | DataType::Int64
778 | DataType::UInt8
779 | DataType::UInt16
780 | DataType::UInt32
781 | DataType::UInt64
782 | DataType::Float16
783 | DataType::Float32
784 | DataType::Float64
785 | DataType::Date32
786 | DataType::Date64
787 | DataType::Time32(_)
788 | DataType::Time64(_)
789 | DataType::Duration(_)
790 | DataType::Interval(_)
791 | DataType::Binary
792 | DataType::FixedSizeBinary(_)
793 | DataType::LargeBinary
794 | DataType::BinaryView
795 | DataType::Utf8
796 | DataType::LargeUtf8
797 | DataType::Utf8View
798 | DataType::Decimal32(_, _)
799 | DataType::Decimal64(_, _)
800 | DataType::Decimal128(_, _)
801 | DataType::Decimal256(_, _) => 0,
802 DataType::Timestamp(_, s) => s.as_ref().map(|s| s.len()).unwrap_or_default(),
803 DataType::List(field)
804 | DataType::ListView(field)
805 | DataType::FixedSizeList(field, _)
806 | DataType::LargeList(field)
807 | DataType::LargeListView(field)
808 | DataType::Map(field, _) => field.size(),
809 DataType::Struct(fields) => fields.size(),
810 DataType::Union(fields, _) => fields.size(),
811 DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(),
812 DataType::RunEndEncoded(run_ends, values) => {
813 run_ends.size() - std::mem::size_of_val(run_ends) + values.size()
814 - std::mem::size_of_val(values)
815 }
816 }
817 }
818
819 /// Check to see if `self` is a superset of `other`
820 ///
821 /// If DataType is a nested type, then it will check to see if the nested type is a superset of the other nested type
822 /// else it will check to see if the DataType is equal to the other DataType
823 pub fn contains(&self, other: &DataType) -> bool {
824 match (self, other) {
825 (DataType::List(f1), DataType::List(f2))
826 | (DataType::LargeList(f1), DataType::LargeList(f2))
827 | (DataType::ListView(f1), DataType::ListView(f2))
828 | (DataType::LargeListView(f1), DataType::LargeListView(f2)) => f1.contains(f2),
829 (DataType::FixedSizeList(f1, s1), DataType::FixedSizeList(f2, s2)) => {
830 s1 == s2 && f1.contains(f2)
831 }
832 (DataType::Map(f1, s1), DataType::Map(f2, s2)) => s1 == s2 && f1.contains(f2),
833 (DataType::Struct(f1), DataType::Struct(f2)) => f1.contains(f2),
834 (DataType::Union(f1, s1), DataType::Union(f2, s2)) => {
835 s1 == s2
836 && f1
837 .iter()
838 .all(|f1| f2.iter().any(|f2| f1.0 == f2.0 && f1.1.contains(f2.1)))
839 }
840 (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => {
841 k1.contains(k2) && v1.contains(v2)
842 }
843 _ => self == other,
844 }
845 }
846
847 /// Create a [`DataType::List`] with elements of the specified type
848 /// and nullability, and conventionally named inner [`Field`] (`"item"`).
849 ///
850 /// To specify field level metadata, construct the inner [`Field`]
851 /// directly via [`Field::new`] or [`Field::new_list_field`].
852 pub fn new_list(data_type: DataType, nullable: bool) -> Self {
853 DataType::List(Arc::new(Field::new_list_field(data_type, nullable)))
854 }
855
856 /// Create a [`DataType::LargeList`] with elements of the specified type
857 /// and nullability, and conventionally named inner [`Field`] (`"item"`).
858 ///
859 /// To specify field level metadata, construct the inner [`Field`]
860 /// directly via [`Field::new`] or [`Field::new_list_field`].
861 pub fn new_large_list(data_type: DataType, nullable: bool) -> Self {
862 DataType::LargeList(Arc::new(Field::new_list_field(data_type, nullable)))
863 }
864
865 /// Create a [`DataType::FixedSizeList`] with elements of the specified type, size
866 /// and nullability, and conventionally named inner [`Field`] (`"item"`).
867 ///
868 /// To specify field level metadata, construct the inner [`Field`]
869 /// directly via [`Field::new`] or [`Field::new_list_field`].
870 pub fn new_fixed_size_list(data_type: DataType, size: i32, nullable: bool) -> Self {
871 DataType::FixedSizeList(Arc::new(Field::new_list_field(data_type, nullable)), size)
872 }
873}
874
// Each `*_MAX_SCALE` below is derived from the matching `*_MAX_PRECISION`.
// All of these values fit in an `i8`, so the `as` casts are lossless.

/// Largest precision allowed for [DataType::Decimal32] values
pub const DECIMAL32_MAX_PRECISION: u8 = 9;

/// Largest scale allowed for [DataType::Decimal32] values
pub const DECIMAL32_MAX_SCALE: i8 = DECIMAL32_MAX_PRECISION as i8;

/// Largest precision allowed for [DataType::Decimal64] values
pub const DECIMAL64_MAX_PRECISION: u8 = 18;

/// Largest scale allowed for [DataType::Decimal64] values
pub const DECIMAL64_MAX_SCALE: i8 = DECIMAL64_MAX_PRECISION as i8;

/// Largest precision allowed for [DataType::Decimal128] values
pub const DECIMAL128_MAX_PRECISION: u8 = 38;

/// Largest scale allowed for [DataType::Decimal128] values
pub const DECIMAL128_MAX_SCALE: i8 = DECIMAL128_MAX_PRECISION as i8;

/// Largest precision allowed for [DataType::Decimal256] values
pub const DECIMAL256_MAX_PRECISION: u8 = 76;

/// Largest scale allowed for [DataType::Decimal256] values
pub const DECIMAL256_MAX_SCALE: i8 = DECIMAL256_MAX_PRECISION as i8;

/// Scale used by default for [DataType::Decimal32] values
pub const DECIMAL32_DEFAULT_SCALE: i8 = 2;

/// Scale used by default for [DataType::Decimal64] values
pub const DECIMAL64_DEFAULT_SCALE: i8 = 6;

/// Scale used by default for [DataType::Decimal128] and [DataType::Decimal256]
/// values
pub const DECIMAL_DEFAULT_SCALE: i8 = 10;
908
909#[cfg(test)]
910mod tests {
911 use super::*;
912
#[test]
#[cfg(feature = "serde")]
fn serde_struct_type() {
    use std::collections::HashMap;

    // Non-empty metadata: expected to be serialized as a JSON object { ... }
    let populated_metadata = HashMap::from([("k".to_string(), "v".to_string())]);
    let first_name =
        Field::new("first_name", DataType::Utf8, false).with_metadata(populated_metadata);

    // Empty metadata: still emitted, as an empty JSON object {}
    let last_name =
        Field::new("last_name", DataType::Utf8, false).with_metadata(HashMap::default());

    // Nested struct, to cover recursion into child fields
    let address_type = DataType::Struct(Fields::from(vec![
        Field::new("street", DataType::Utf8, false),
        Field::new("zip", DataType::UInt16, false),
    ]));
    let address = Field::new("address", address_type, false);

    let person = DataType::Struct(Fields::from(vec![first_name, last_name, address]));

    let json = serde_json::to_string(&person).unwrap();

    // NOTE: the expectation pins the default (derived) serialization format, not the
    // JSON format specified in metadata.md
    let expected = "{\"Struct\":[\
        {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\
        {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
        {\"name\":\"address\",\"data_type\":{\"Struct\":\
        [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
        {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}\
        ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}]}";
    assert_eq!(expected, json);

    // Deserializing the output must give back the original type
    let round_tripped: DataType = serde_json::from_str(&json).unwrap();
    assert_eq!(person, round_tripped);
}
962
#[test]
fn test_list_datatype_equality() {
    // `equals_datatype` is expected to ignore the names of nested fields while
    // still comparing their data types and nullability

    // --- List ---
    let list_of = |item: Field| DataType::List(Arc::new(item));
    let int32_list = list_of(Field::new_list_field(DataType::Int32, true));
    let int32_list_renamed = list_of(Field::new("array", DataType::Int32, true));
    let int32_list_non_null = list_of(Field::new_list_field(DataType::Int32, false));
    let uint32_list = list_of(Field::new_list_field(DataType::UInt32, true));
    assert!(int32_list.equals_datatype(&int32_list_renamed));
    assert!(!int32_list.equals_datatype(&int32_list_non_null));
    assert!(!int32_list_renamed.equals_datatype(&int32_list_non_null));
    assert!(!int32_list.equals_datatype(&uint32_list));

    // --- FixedSizeList (always of size 3) ---
    let fixed_of = |item: Field| DataType::FixedSizeList(Arc::new(item), 3);
    let fixed_lists = fixed_of(Field::new_list_field(int32_list.clone(), false));
    let fixed_lists_renamed = fixed_of(Field::new("array", int32_list_renamed.clone(), false));
    let fixed_binaries = fixed_of(Field::new_list_field(DataType::FixedSizeBinary(3), true));
    assert!(fixed_lists.equals_datatype(&fixed_lists_renamed));
    assert!(!fixed_lists.equals_datatype(&fixed_binaries));
    assert!(!fixed_lists_renamed.equals_datatype(&fixed_binaries));

    // --- Struct ---
    let single_field = |data_type: DataType, nullable: bool| {
        DataType::Struct(Fields::from(vec![Field::new("f1", data_type, nullable)]))
    };
    let three_fields = |names: [&str; 3], last: DataType| {
        DataType::Struct(Fields::from(vec![
            Field::new(names[0], fixed_lists_renamed.clone(), false),
            Field::new(names[1], fixed_binaries.clone(), false),
            Field::new(names[2], last, true),
        ]))
    };
    let struct_one = single_field(fixed_lists, true);
    let struct_one_renamed_child = single_field(fixed_lists_renamed.clone(), true);
    let struct_one_non_null = single_field(fixed_lists_renamed.clone(), false);
    let struct_three = three_fields(["f1", "f2", "f3"], DataType::Utf8);
    let struct_three_large_utf8 = three_fields(["ff1", "ff2", "ff3"], DataType::LargeUtf8);
    let struct_three_renamed = three_fields(["ff1", "ff2", "ff3"], DataType::Utf8);
    assert!(struct_one.equals_datatype(&struct_one_renamed_child));
    assert!(!struct_one.equals_datatype(&struct_one_non_null));
    assert!(!struct_three.equals_datatype(&struct_three_large_utf8));
    assert!(struct_three.equals_datatype(&struct_three_renamed));

    // --- Map ---
    let map_of = |name: &str, value: &DataType, nullable: bool, flag: bool| {
        DataType::Map(Arc::new(Field::new(name, value.clone(), nullable)), flag)
    };
    let map_base = map_of("f1", &int32_list, true, true);
    let map_renamed = map_of("f2", &int32_list_renamed, true, true);
    let map_other_flag = map_of("f2", &int32_list_renamed, true, false);
    let map_other_value = map_of("f2", &int32_list_non_null, true, true);
    let map_non_null = map_of("f1", &int32_list, false, true);

    assert!(map_base.equals_datatype(&map_renamed));
    assert!(!map_base.equals_datatype(&map_other_flag));
    assert!(!map_base.equals_datatype(&map_other_value));
    assert!(!map_base.equals_datatype(&map_non_null));

    // --- Dictionary ---
    let dictionary_of =
        |key: DataType, value: DataType| DataType::Dictionary(Box::new(key), Box::new(value));
    let dict_base = dictionary_of(DataType::UInt8, int32_list);
    let dict_renamed_value = dictionary_of(DataType::UInt8, int32_list_renamed.clone());
    let dict_other_key = dictionary_of(DataType::Int8, int32_list_renamed);
    let dict_other_value = dictionary_of(DataType::UInt8, int32_list_non_null);

    assert!(dict_base.equals_datatype(&dict_renamed_value));
    assert!(!dict_base.equals_datatype(&dict_other_key));
    assert!(!dict_base.equals_datatype(&dict_other_value));

    // --- Union (all sparse) ---
    let sparse_union = |type_ids: Vec<i8>, fields: Vec<Field>| {
        DataType::Union(
            UnionFields::try_new(type_ids, fields).unwrap(),
            UnionMode::Sparse,
        )
    };
    let union_base = sparse_union(
        vec![1, 2],
        vec![
            Field::new("f1", DataType::Utf8, false),
            Field::new("f2", DataType::UInt8, false),
        ],
    );
    let union_renamed = sparse_union(
        vec![1, 2],
        vec![
            Field::new("ff1", DataType::Utf8, false),
            Field::new("ff2", DataType::UInt8, false),
        ],
    );
    // Same (type id, data type) pairs as the base union, listed in reverse order
    let union_reordered = sparse_union(
        vec![2, 1],
        vec![
            Field::new("fff2", DataType::UInt8, false),
            Field::new("fff1", DataType::Utf8, false),
        ],
    );
    let union_other_types = sparse_union(
        vec![2, 1],
        vec![
            Field::new("fff1", DataType::Int8, false),
            Field::new("fff2", DataType::UInt8, false),
        ],
    );
    let union_nullable_child = sparse_union(
        vec![1, 2],
        vec![
            Field::new("f1", DataType::Utf8, true),
            Field::new("f2", DataType::UInt8, false),
        ],
    );

    assert!(union_base.equals_datatype(&union_renamed));
    assert!(union_base.equals_datatype(&union_reordered));
    assert!(!union_base.equals_datatype(&union_other_types));
    assert!(!union_base.equals_datatype(&union_nullable_child));

    // --- RunEndEncoded ---
    let run_end_encoded = |run_ends: Field, values: Field| {
        DataType::RunEndEncoded(Arc::new(run_ends), Arc::new(values))
    };
    let ree_base = run_end_encoded(
        Field::new("f1", DataType::Int64, true),
        Field::new("f2", DataType::Utf8, true),
    );
    let ree_renamed = run_end_encoded(
        Field::new("ff1", DataType::Int64, true),
        Field::new("ff2", DataType::Utf8, true),
    );
    let ree_other_run_ends = run_end_encoded(
        Field::new("ff1", DataType::UInt16, true),
        Field::new("ff2", DataType::Utf8, true),
    );
    let ree_non_null_run_ends = run_end_encoded(
        Field::new("f1", DataType::Int64, false),
        Field::new("f2", DataType::Utf8, true),
    );

    assert!(ree_base.equals_datatype(&ree_renamed));
    assert!(!ree_base.equals_datatype(&ree_other_run_ends));
    assert!(!ree_base.equals_datatype(&ree_non_null_run_ends));
}
1112
#[test]
fn create_struct_type() {
    // Smoke test: a struct type that nests another struct can be constructed
    let address_fields = Fields::from(vec![
        Field::new("street", DataType::Utf8, false),
        Field::new("zip", DataType::UInt16, false),
    ]);
    let address = Field::new("address", DataType::Struct(address_fields), false);

    let person_fields = Fields::from(vec![
        Field::new("first_name", DataType::Utf8, false),
        Field::new("last_name", DataType::Utf8, false),
        address,
    ]);
    let _person = DataType::Struct(person_fields);
}
1128
#[test]
fn test_nested() {
    let utf8_child = || Arc::new(Field::new("foo", DataType::Utf8, true));
    let list = DataType::List(utf8_child());
    let list_view = DataType::ListView(utf8_child());
    let large_list_view = DataType::LargeListView(utf8_child());

    // Scalar types are not nested
    assert!(!DataType::Boolean.is_nested());
    assert!(!DataType::Int32.is_nested());
    assert!(!DataType::Utf8.is_nested());
    // List-like types are
    assert!(list.is_nested());
    assert!(list_view.is_nested());
    assert!(large_list_view.is_nested());

    // A dictionary is nested only when its value type is nested
    let int32_keyed =
        |value: DataType| DataType::Dictionary(Box::new(DataType::Int32), Box::new(value));
    assert!(!int32_keyed(DataType::Boolean).is_nested());
    assert!(!int32_keyed(DataType::Int64).is_nested());
    assert!(!int32_keyed(DataType::LargeUtf8).is_nested());
    assert!(int32_keyed(list).is_nested());
}
1160
#[test]
fn test_integer() {
    let signed = DataType::Int32;
    let unsigned = DataType::UInt64;
    let float = DataType::Float16;

    // is_integer: true for both signed and unsigned integers
    assert!(signed.is_integer());
    assert!(unsigned.is_integer());
    assert!(!float.is_integer());

    // is_signed_integer: true for the signed integer only
    assert!(signed.is_signed_integer());
    assert!(!unsigned.is_signed_integer());
    assert!(!float.is_signed_integer());

    // is_unsigned_integer: true for the unsigned integer only
    assert!(!signed.is_unsigned_integer());
    assert!(unsigned.is_unsigned_integer());
    assert!(!float.is_unsigned_integer());

    // is_dictionary_key_type: true for both integers, false for the float
    assert!(signed.is_dictionary_key_type());
    assert!(unsigned.is_dictionary_key_type());
    assert!(!float.is_dictionary_key_type());
}
1183
#[test]
fn test_string() {
    // All three string types count as strings
    assert!(DataType::Utf8.is_string());
    assert!(DataType::LargeUtf8.is_string());
    assert!(DataType::Utf8View.is_string());
    // An integer does not
    assert!(!DataType::Int32.is_string());
}
1191
#[test]
fn test_floating() {
    // One positive and one negative case for `is_floating`
    assert!(DataType::Float16.is_floating());
    assert!(!DataType::Int32.is_floating());
}
1197
#[test]
fn test_decimal() {
    // Every decimal width is reported as a decimal
    assert!(DataType::Decimal32(4, 2).is_decimal());
    assert!(DataType::Decimal64(4, 2).is_decimal());
    assert!(DataType::Decimal128(4, 2).is_decimal());
    assert!(DataType::Decimal256(4, 2).is_decimal());
    // A float is not
    assert!(!DataType::Float16.is_decimal());
}
1206
#[test]
fn test_datatype_is_null() {
    // Only the `Null` type itself is reported as null
    assert!(DataType::Null.is_null());
    assert!(!DataType::Int32.is_null());
}
1212
#[test]
fn test_is_list() {
    // Types built through the list constructors
    let list = DataType::new_list(DataType::Int16, true);
    let large_list = DataType::new_large_list(DataType::Int16, true);
    let fixed_size_list = DataType::new_fixed_size_list(DataType::Int16, 5, true);
    assert!(list.is_list());
    assert!(large_list.is_list());
    assert!(fixed_size_list.is_list());

    // The list-view variants, built directly from a field
    let int16_child = || Arc::new(Field::new("f", DataType::Int16, true));
    assert!(DataType::ListView(int16_child()).is_list());
    assert!(DataType::LargeListView(int16_child()).is_list());

    // A non-list type
    assert!(!DataType::Binary.is_list());
}
1236
#[test]
fn test_is_binary() {
    // All three binary types are reported as binary
    assert!(DataType::is_binary(&DataType::Binary));
    assert!(DataType::is_binary(&DataType::LargeBinary));
    assert!(DataType::is_binary(&DataType::BinaryView));
    // Negative case: a string type must not be reported as binary. This has to
    // call `is_binary` (not `is_list`) so that the predicate under test is the
    // one being exercised.
    assert!(!DataType::is_binary(&DataType::Utf8View));
}
1244
#[test]
fn size_should_not_regress() {
    // Pins the in-memory size of the `DataType` enum so that growth is noticed
    let actual = std::mem::size_of::<DataType>();
    assert_eq!(actual, 24);
}
1249
#[test]
#[should_panic(expected = "duplicate type id: 1")]
fn test_union_with_duplicated_type_id() {
    // Both fields are given type id 1, so unwrapping the result of `try_new`
    // is expected to panic with the message above
    let duplicated_ids = vec![1, 1];
    let children = vec![
        Field::new("f1", DataType::Int32, false),
        Field::new("f2", DataType::Utf8, false),
    ];
    let union_fields = UnionFields::try_new(duplicated_ids, children).unwrap();
    let _union = DataType::Union(union_fields, UnionMode::Dense);
}
1266
#[test]
fn test_try_from_str() {
    // Conversion from a string slice through `TryInto`
    let converted: Result<DataType, _> = "Int32".try_into();
    assert_eq!(converted.unwrap(), DataType::Int32);
}
1272
#[test]
fn test_from_str() {
    // Conversion from a string slice through `str::parse`
    let parsed = "UInt64".parse::<DataType>().unwrap();
    assert_eq!(parsed, DataType::UInt64);
}
1278
#[test]
#[cfg_attr(miri, ignore)] // Can't handle the inlined strings of the assert_debug_snapshot macro
fn test_debug_format_field() {
    // The `Debug` output of a `DataType` should stay readable and not too long
    let int8_list = DataType::new_list(DataType::Int8, false);
    insta::assert_debug_snapshot!(int8_list, @r"
    List(
        Field {
            data_type: Int8,
        },
    )
    ");
}
1291}