arrow_schema/
datatype_parse.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
20use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit};
21
22pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
23    Parser::new(val).parse()
24}
25
/// Result alias used by the parser in this file; the error type is always [`ArrowError`].
type ArrowResult<T> = Result<T, ArrowError>;
27
28fn make_error(val: &str, msg: &str) -> ArrowError {
29    let msg = format!("Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error {msg}" );
30    ArrowError::ParseError(msg)
31}
32
33fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError {
34    make_error(val, &format!("Expected '{expected}', got '{actual}'"))
35}
36
#[derive(Debug)]
/// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs>
///
/// The parser pulls tokens from a [`Tokenizer`] one at a time and builds a
/// [`DataType`] by recursive descent.
struct Parser<'a> {
    /// the complete input string, kept so error messages can quote it
    val: &'a str,
    /// produces the tokens of `val`
    tokenizer: Tokenizer<'a>,
}
43
44impl<'a> Parser<'a> {
45    fn new(val: &'a str) -> Self {
46        Self {
47            val,
48            tokenizer: Tokenizer::new(val),
49        }
50    }
51
52    fn parse(mut self) -> ArrowResult<DataType> {
53        let data_type = self.parse_next_type()?;
54        // ensure that there is no trailing content
55        if self.tokenizer.next().is_some() {
56            Err(make_error(
57                self.val,
58                &format!("checking trailing content after parsing '{data_type}'"),
59            ))
60        } else {
61            Ok(data_type)
62        }
63    }
64
65    /// parses the next full DataType
66    fn parse_next_type(&mut self) -> ArrowResult<DataType> {
67        match self.next_token()? {
68            Token::SimpleType(data_type) => Ok(data_type),
69            Token::Timestamp => self.parse_timestamp(),
70            Token::Time32 => self.parse_time32(),
71            Token::Time64 => self.parse_time64(),
72            Token::Duration => self.parse_duration(),
73            Token::Interval => self.parse_interval(),
74            Token::FixedSizeBinary => self.parse_fixed_size_binary(),
75            Token::Decimal32 => self.parse_decimal_32(),
76            Token::Decimal64 => self.parse_decimal_64(),
77            Token::Decimal128 => self.parse_decimal_128(),
78            Token::Decimal256 => self.parse_decimal_256(),
79            Token::Dictionary => self.parse_dictionary(),
80            Token::List => self.parse_list(),
81            Token::LargeList => self.parse_large_list(),
82            Token::FixedSizeList => self.parse_fixed_size_list(),
83            Token::Struct => self.parse_struct(),
84            Token::FieldName(word) => {
85                Err(make_error(self.val, &format!("unrecognized word: {word}")))
86            }
87            tok => Err(make_error(
88                self.val,
89                &format!("finding next type, got unexpected '{tok}'"),
90            )),
91        }
92    }
93
94    /// Parses the List type
95    fn parse_list(&mut self) -> ArrowResult<DataType> {
96        self.expect_token(Token::LParen)?;
97        let data_type = self.parse_next_type()?;
98        self.expect_token(Token::RParen)?;
99        Ok(DataType::List(Arc::new(Field::new_list_field(
100            data_type, true,
101        ))))
102    }
103
104    /// Parses the LargeList type
105    fn parse_large_list(&mut self) -> ArrowResult<DataType> {
106        self.expect_token(Token::LParen)?;
107        let data_type = self.parse_next_type()?;
108        self.expect_token(Token::RParen)?;
109        Ok(DataType::LargeList(Arc::new(Field::new_list_field(
110            data_type, true,
111        ))))
112    }
113
114    /// Parses the FixedSizeList type
115    fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
116        self.expect_token(Token::LParen)?;
117        let length = self.parse_i32("FixedSizeList")?;
118        self.expect_token(Token::Comma)?;
119        let data_type = self.parse_next_type()?;
120        self.expect_token(Token::RParen)?;
121        Ok(DataType::FixedSizeList(
122            Arc::new(Field::new_list_field(data_type, true)),
123            length,
124        ))
125    }
126
127    /// Parses the next timeunit
128    fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> {
129        match self.next_token()? {
130            Token::TimeUnit(time_unit) => Ok(time_unit),
131            tok => Err(make_error(
132                self.val,
133                &format!("finding TimeUnit for {context}, got {tok}"),
134            )),
135        }
136    }
137
138    /// Parses the next timezone
139    fn parse_timezone(&mut self, context: &str) -> ArrowResult<Option<String>> {
140        match self.next_token()? {
141            Token::None => Ok(None),
142            Token::Some => {
143                self.expect_token(Token::LParen)?;
144                let timezone = self.parse_double_quoted_string("Timezone")?;
145                self.expect_token(Token::RParen)?;
146                Ok(Some(timezone))
147            }
148            tok => Err(make_error(
149                self.val,
150                &format!("finding Timezone for {context}, got {tok}"),
151            )),
152        }
153    }
154
155    /// Parses the next double quoted string
156    fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
157        match self.next_token()? {
158            Token::DoubleQuotedString(s) => Ok(s),
159            Token::FieldName(word) => {
160                Err(make_error(self.val, &format!("unrecognized word: {word}")))
161            }
162            tok => Err(make_error(
163                self.val,
164                &format!("finding double quoted string for {context}, got '{tok}'"),
165            )),
166        }
167    }
168
169    /// Parses the next integer value
170    fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
171        match self.next_token()? {
172            Token::Integer(v) => Ok(v),
173            tok => Err(make_error(
174                self.val,
175                &format!("finding i64 for {context}, got '{tok}'"),
176            )),
177        }
178    }
179
180    /// Parses the next i32 integer value
181    fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> {
182        let length = self.parse_i64(context)?;
183        length.try_into().map_err(|e| {
184            make_error(
185                self.val,
186                &format!("converting {length} into i32 for {context}: {e}"),
187            )
188        })
189    }
190
191    /// Parses the next i8 integer value
192    fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> {
193        let length = self.parse_i64(context)?;
194        length.try_into().map_err(|e| {
195            make_error(
196                self.val,
197                &format!("converting {length} into i8 for {context}: {e}"),
198            )
199        })
200    }
201
202    /// Parses the next u8 integer value
203    fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> {
204        let length = self.parse_i64(context)?;
205        length.try_into().map_err(|e| {
206            make_error(
207                self.val,
208                &format!("converting {length} into u8 for {context}: {e}"),
209            )
210        })
211    }
212
213    /// Parses the next timestamp (called after `Timestamp` has been consumed)
214    fn parse_timestamp(&mut self) -> ArrowResult<DataType> {
215        self.expect_token(Token::LParen)?;
216        let time_unit = self.parse_time_unit("Timestamp")?;
217        self.expect_token(Token::Comma)?;
218        let timezone = self.parse_timezone("Timestamp")?;
219        self.expect_token(Token::RParen)?;
220        Ok(DataType::Timestamp(time_unit, timezone.map(Into::into)))
221    }
222
223    /// Parses the next Time32 (called after `Time32` has been consumed)
224    fn parse_time32(&mut self) -> ArrowResult<DataType> {
225        self.expect_token(Token::LParen)?;
226        let time_unit = self.parse_time_unit("Time32")?;
227        self.expect_token(Token::RParen)?;
228        Ok(DataType::Time32(time_unit))
229    }
230
231    /// Parses the next Time64 (called after `Time64` has been consumed)
232    fn parse_time64(&mut self) -> ArrowResult<DataType> {
233        self.expect_token(Token::LParen)?;
234        let time_unit = self.parse_time_unit("Time64")?;
235        self.expect_token(Token::RParen)?;
236        Ok(DataType::Time64(time_unit))
237    }
238
239    /// Parses the next Duration (called after `Duration` has been consumed)
240    fn parse_duration(&mut self) -> ArrowResult<DataType> {
241        self.expect_token(Token::LParen)?;
242        let time_unit = self.parse_time_unit("Duration")?;
243        self.expect_token(Token::RParen)?;
244        Ok(DataType::Duration(time_unit))
245    }
246
247    /// Parses the next Interval (called after `Interval` has been consumed)
248    fn parse_interval(&mut self) -> ArrowResult<DataType> {
249        self.expect_token(Token::LParen)?;
250        let interval_unit = match self.next_token()? {
251            Token::IntervalUnit(interval_unit) => interval_unit,
252            tok => {
253                return Err(make_error(
254                    self.val,
255                    &format!("finding IntervalUnit for Interval, got {tok}"),
256                ))
257            }
258        };
259        self.expect_token(Token::RParen)?;
260        Ok(DataType::Interval(interval_unit))
261    }
262
263    /// Parses the next FixedSizeBinary (called after `FixedSizeBinary` has been consumed)
264    fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> {
265        self.expect_token(Token::LParen)?;
266        let length = self.parse_i32("FixedSizeBinary")?;
267        self.expect_token(Token::RParen)?;
268        Ok(DataType::FixedSizeBinary(length))
269    }
270
271    /// Parses the next Decimal32 (called after `Decimal32` has been consumed)
272    fn parse_decimal_32(&mut self) -> ArrowResult<DataType> {
273        self.expect_token(Token::LParen)?;
274        let precision = self.parse_u8("Decimal32")?;
275        self.expect_token(Token::Comma)?;
276        let scale = self.parse_i8("Decimal32")?;
277        self.expect_token(Token::RParen)?;
278        Ok(DataType::Decimal32(precision, scale))
279    }
280
281    /// Parses the next Decimal64 (called after `Decimal64` has been consumed)
282    fn parse_decimal_64(&mut self) -> ArrowResult<DataType> {
283        self.expect_token(Token::LParen)?;
284        let precision = self.parse_u8("Decimal64")?;
285        self.expect_token(Token::Comma)?;
286        let scale = self.parse_i8("Decimal64")?;
287        self.expect_token(Token::RParen)?;
288        Ok(DataType::Decimal64(precision, scale))
289    }
290
291    /// Parses the next Decimal128 (called after `Decimal128` has been consumed)
292    fn parse_decimal_128(&mut self) -> ArrowResult<DataType> {
293        self.expect_token(Token::LParen)?;
294        let precision = self.parse_u8("Decimal128")?;
295        self.expect_token(Token::Comma)?;
296        let scale = self.parse_i8("Decimal128")?;
297        self.expect_token(Token::RParen)?;
298        Ok(DataType::Decimal128(precision, scale))
299    }
300
301    /// Parses the next Decimal256 (called after `Decimal256` has been consumed)
302    fn parse_decimal_256(&mut self) -> ArrowResult<DataType> {
303        self.expect_token(Token::LParen)?;
304        let precision = self.parse_u8("Decimal256")?;
305        self.expect_token(Token::Comma)?;
306        let scale = self.parse_i8("Decimal256")?;
307        self.expect_token(Token::RParen)?;
308        Ok(DataType::Decimal256(precision, scale))
309    }
310
311    /// Parses the next Dictionary (called after `Dictionary` has been consumed)
312    fn parse_dictionary(&mut self) -> ArrowResult<DataType> {
313        self.expect_token(Token::LParen)?;
314        let key_type = self.parse_next_type()?;
315        self.expect_token(Token::Comma)?;
316        let value_type = self.parse_next_type()?;
317        self.expect_token(Token::RParen)?;
318        Ok(DataType::Dictionary(
319            Box::new(key_type),
320            Box::new(value_type),
321        ))
322    }
323    fn parse_struct(&mut self) -> ArrowResult<DataType> {
324        self.expect_token(Token::LParen)?;
325        let mut fields = Vec::new();
326        loop {
327            let field_name = match self.next_token()? {
328                // It's valid to have a name that is a type name
329                Token::SimpleType(data_type) => data_type.to_string(),
330                Token::FieldName(name) => name,
331                Token::RParen => {
332                    if fields.is_empty() {
333                        break;
334                    } else {
335                        return Err(make_error(
336                            self.val,
337                            "Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma",
338                        ));
339                    }
340                }
341                tok => {
342                    return Err(make_error(
343                        self.val,
344                        &format!("Expected a word for the name of Struct, but got {tok}"),
345                    ))
346                }
347            };
348            let field_type = self.parse_next_type()?;
349            fields.push(Arc::new(Field::new(field_name, field_type, true)));
350            match self.next_token()? {
351                Token::Comma => continue,
352                Token::RParen => break,
353                tok => {
354                    return Err(make_error(
355                        self.val,
356                        &format!("Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"),
357                    ))
358                }
359            }
360        }
361        Ok(DataType::Struct(Fields::from(fields)))
362    }
363
364    /// return the next token, or an error if there are none left
365    fn next_token(&mut self) -> ArrowResult<Token> {
366        match self.tokenizer.next() {
367            None => Err(make_error(self.val, "finding next token")),
368            Some(token) => token,
369        }
370    }
371
372    /// consume the next token, returning OK(()) if it matches tok, and Err if not
373    fn expect_token(&mut self, tok: Token) -> ArrowResult<()> {
374        let next_token = self.next_token()?;
375        if next_token == tok {
376            Ok(())
377        } else {
378            Err(make_error_expected(self.val, &tok, &next_token))
379        }
380    }
381}
382
/// returns true if this character is a separator
///
/// Separators are `(`, `)`, `,` and the space character; no other
/// whitespace is treated as a separator.
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ' ')
}
387
#[derive(Debug)]
/// Splits a string like `Dictionary(Int32, Int64)` into tokens suitable for parsing
///
/// For example the string "Timestamp(Nanosecond, None)" would be parsed into:
///
/// * Token::Timestamp
/// * Token::LParen
/// * Token::TimeUnit(TimeUnit::Nanosecond)
/// * Token::Comma,
/// * Token::None,
/// * Token::RParen,
struct Tokenizer<'a> {
    // the complete input string, kept so error messages can quote it
    val: &'a str,
    // remaining, not yet consumed characters of `val`
    chars: Peekable<Chars<'a>>,
    // temporary buffer for parsing words
    word: String,
}
405
406impl<'a> Tokenizer<'a> {
407    fn new(val: &'a str) -> Self {
408        Self {
409            val,
410            chars: val.chars().peekable(),
411            word: String::new(),
412        }
413    }
414
415    /// returns the next char, without consuming it
416    fn peek_next_char(&mut self) -> Option<char> {
417        self.chars.peek().copied()
418    }
419
420    /// returns the next char, and consuming it
421    fn next_char(&mut self) -> Option<char> {
422        self.chars.next()
423    }
424
425    /// parse the characters in val starting at pos, until the next
426    /// `,`, `(`, or `)` or end of line
427    fn parse_word(&mut self) -> ArrowResult<Token> {
428        // reset temp space
429        self.word.clear();
430        loop {
431            match self.peek_next_char() {
432                None => break,
433                Some(c) if is_separator(c) => break,
434                Some(c) => {
435                    self.next_char();
436                    self.word.push(c);
437                }
438            }
439        }
440
441        if let Some(c) = self.word.chars().next() {
442            // if it started with a number, try parsing it as an integer
443            if c == '-' || c.is_numeric() {
444                let val: i64 = self.word.parse().map_err(|e| {
445                    make_error(self.val, &format!("parsing {} as integer: {e}", self.word))
446                })?;
447                return Ok(Token::Integer(val));
448            }
449            // if it started with a double quote `"`, try parsing it as a double quoted string
450            else if c == '"' {
451                let len = self.word.chars().count();
452
453                // to verify it's double quoted
454                if let Some(last_c) = self.word.chars().last() {
455                    if last_c != '"' || len < 2 {
456                        return Err(make_error(
457                            self.val,
458                            &format!(
459                                "parsing {} as double quoted string: last char must be \"",
460                                self.word
461                            ),
462                        ));
463                    }
464                }
465
466                if len == 2 {
467                    return Err(make_error(
468                        self.val,
469                        &format!(
470                            "parsing {} as double quoted string: empty string isn't supported",
471                            self.word
472                        ),
473                    ));
474                }
475
476                let val: String = self.word.parse().map_err(|e| {
477                    make_error(
478                        self.val,
479                        &format!("parsing {} as double quoted string: {e}", self.word),
480                    )
481                })?;
482
483                let s = val[1..len - 1].to_string();
484                if s.contains('"') {
485                    return Err(make_error(
486                        self.val,
487                        &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word),
488                    ));
489                }
490
491                return Ok(Token::DoubleQuotedString(s));
492            }
493        }
494
495        // figure out what the word was
496        let token = match self.word.as_str() {
497            "Null" => Token::SimpleType(DataType::Null),
498            "Boolean" => Token::SimpleType(DataType::Boolean),
499
500            "Int8" => Token::SimpleType(DataType::Int8),
501            "Int16" => Token::SimpleType(DataType::Int16),
502            "Int32" => Token::SimpleType(DataType::Int32),
503            "Int64" => Token::SimpleType(DataType::Int64),
504
505            "UInt8" => Token::SimpleType(DataType::UInt8),
506            "UInt16" => Token::SimpleType(DataType::UInt16),
507            "UInt32" => Token::SimpleType(DataType::UInt32),
508            "UInt64" => Token::SimpleType(DataType::UInt64),
509
510            "Utf8" => Token::SimpleType(DataType::Utf8),
511            "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
512            "Utf8View" => Token::SimpleType(DataType::Utf8View),
513            "Binary" => Token::SimpleType(DataType::Binary),
514            "BinaryView" => Token::SimpleType(DataType::BinaryView),
515            "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
516
517            "Float16" => Token::SimpleType(DataType::Float16),
518            "Float32" => Token::SimpleType(DataType::Float32),
519            "Float64" => Token::SimpleType(DataType::Float64),
520
521            "Date32" => Token::SimpleType(DataType::Date32),
522            "Date64" => Token::SimpleType(DataType::Date64),
523
524            "List" => Token::List,
525            "LargeList" => Token::LargeList,
526            "FixedSizeList" => Token::FixedSizeList,
527
528            "Second" => Token::TimeUnit(TimeUnit::Second),
529            "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
530            "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
531            "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
532
533            "Timestamp" => Token::Timestamp,
534            "Time32" => Token::Time32,
535            "Time64" => Token::Time64,
536            "Duration" => Token::Duration,
537            "Interval" => Token::Interval,
538            "Dictionary" => Token::Dictionary,
539
540            "FixedSizeBinary" => Token::FixedSizeBinary,
541
542            "Decimal32" => Token::Decimal32,
543            "Decimal64" => Token::Decimal64,
544            "Decimal128" => Token::Decimal128,
545            "Decimal256" => Token::Decimal256,
546
547            "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
548            "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
549            "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
550
551            "Some" => Token::Some,
552            "None" => Token::None,
553
554            "Struct" => Token::Struct,
555            // If we don't recognize the word, treat it as a field name
556            word => Token::FieldName(word.to_string()),
557        };
558        Ok(token)
559    }
560}
561
562impl Iterator for Tokenizer<'_> {
563    type Item = ArrowResult<Token>;
564
565    fn next(&mut self) -> Option<Self::Item> {
566        loop {
567            match self.peek_next_char()? {
568                ' ' => {
569                    // skip whitespace
570                    self.next_char();
571                    continue;
572                }
573                '(' => {
574                    self.next_char();
575                    return Some(Ok(Token::LParen));
576                }
577                ')' => {
578                    self.next_char();
579                    return Some(Ok(Token::RParen));
580                }
581                ',' => {
582                    self.next_char();
583                    return Some(Ok(Token::Comma));
584                }
585                _ => return Some(self.parse_word()),
586            }
587        }
588    }
589}
590
/// A single lexical token of a data type string, produced by [`Tokenizer`]
/// and consumed by [`Parser`].
///
/// Keywords such as `Timestamp` or `List` have a variant of their own, while
/// tokens that carry a value hold that value as a payload.
#[derive(Debug, PartialEq)]
enum Token {
    // Null, or Int32
    SimpleType(DataType),
    // keywords that introduce a parameterized type
    Timestamp,
    Time32,
    Time64,
    Duration,
    Interval,
    FixedSizeBinary,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    Dictionary,
    // Second, Millisecond, Microsecond or Nanosecond
    TimeUnit(TimeUnit),
    // YearMonth, DayTime or MonthDayNano
    IntervalUnit(IntervalUnit),
    // `(`
    LParen,
    // `)`
    RParen,
    // `,`
    Comma,
    // the words `Some` and `None`
    Some,
    None,
    // a word that parsed as an i64
    Integer(i64),
    // contents of a double quoted string, without the quotes
    DoubleQuotedString(String),
    // keywords that introduce a nested type
    List,
    LargeList,
    FixedSizeList,
    Struct,
    // any word that is not otherwise recognized
    FieldName(String),
}
623
624impl Display for Token {
625    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
626        match self {
627            Token::SimpleType(t) => write!(f, "{t}"),
628            Token::List => write!(f, "List"),
629            Token::LargeList => write!(f, "LargeList"),
630            Token::FixedSizeList => write!(f, "FixedSizeList"),
631            Token::Timestamp => write!(f, "Timestamp"),
632            Token::Time32 => write!(f, "Time32"),
633            Token::Time64 => write!(f, "Time64"),
634            Token::Duration => write!(f, "Duration"),
635            Token::Interval => write!(f, "Interval"),
636            Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
637            Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
638            Token::LParen => write!(f, "("),
639            Token::RParen => write!(f, ")"),
640            Token::Comma => write!(f, ","),
641            Token::Some => write!(f, "Some"),
642            Token::None => write!(f, "None"),
643            Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
644            Token::Decimal32 => write!(f, "Decimal32"),
645            Token::Decimal64 => write!(f, "Decimal64"),
646            Token::Decimal128 => write!(f, "Decimal128"),
647            Token::Decimal256 => write!(f, "Decimal256"),
648            Token::Dictionary => write!(f, "Dictionary"),
649            Token::Integer(v) => write!(f, "Integer({v})"),
650            Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
651            Token::Struct => write!(f, "Struct"),
652            Token::FieldName(s) => write!(f, "FieldName({s})"),
653        }
654    }
655}
656
657#[cfg(test)]
658mod test {
659    use super::*;
660
661    #[test]
662    fn test_parse_data_type() {
663        // this ensures types can be parsed correctly from their string representations
664        for dt in list_datatypes() {
665            round_trip(dt)
666        }
667    }
668
669    /// convert data_type to a string, and then parse it as a type
670    /// verifying it is the same
671    fn round_trip(data_type: DataType) {
672        let data_type_string = data_type.to_string();
673        println!("Input '{data_type_string}' ({data_type:?})");
674        let parsed_type = parse_data_type(&data_type_string).unwrap();
675        assert_eq!(
676            data_type, parsed_type,
677            "Mismatch parsing {data_type_string}"
678        );
679    }
680
    /// returns the data types that `test_parse_data_type` round trips through
    /// their string representation
    fn list_datatypes() -> Vec<DataType> {
        vec![
            // ---------
            // Non Nested types
            // ---------
            DataType::Null,
            DataType::Boolean,
            DataType::Int8,
            DataType::Int16,
            DataType::Int32,
            DataType::Int64,
            DataType::UInt8,
            DataType::UInt16,
            DataType::UInt32,
            DataType::UInt64,
            DataType::Float16,
            DataType::Float32,
            DataType::Float64,
            DataType::Timestamp(TimeUnit::Second, None),
            DataType::Timestamp(TimeUnit::Millisecond, None),
            DataType::Timestamp(TimeUnit::Microsecond, None),
            DataType::Timestamp(TimeUnit::Nanosecond, None),
            // we can't cover all possible timezones, here we only test utc and +08:00
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
            DataType::Date32,
            DataType::Date64,
            DataType::Time32(TimeUnit::Second),
            DataType::Time32(TimeUnit::Millisecond),
            DataType::Time32(TimeUnit::Microsecond),
            DataType::Time32(TimeUnit::Nanosecond),
            DataType::Time64(TimeUnit::Second),
            DataType::Time64(TimeUnit::Millisecond),
            DataType::Time64(TimeUnit::Microsecond),
            DataType::Time64(TimeUnit::Nanosecond),
            DataType::Duration(TimeUnit::Second),
            DataType::Duration(TimeUnit::Millisecond),
            DataType::Duration(TimeUnit::Microsecond),
            DataType::Duration(TimeUnit::Nanosecond),
            DataType::Interval(IntervalUnit::YearMonth),
            DataType::Interval(IntervalUnit::DayTime),
            DataType::Interval(IntervalUnit::MonthDayNano),
            DataType::Binary,
            DataType::BinaryView,
            DataType::FixedSizeBinary(0),
            DataType::FixedSizeBinary(1234),
            // negative lengths are accepted by the parser as well
            DataType::FixedSizeBinary(-432),
            DataType::LargeBinary,
            DataType::Utf8,
            DataType::Utf8View,
            DataType::LargeUtf8,
            DataType::Decimal32(7, 8),
            DataType::Decimal64(6, 9),
            DataType::Decimal128(7, 12),
            DataType::Decimal256(6, 13),
            // ---------
            // Nested types
            // ---------
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::FixedSizeBinary(23)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(
                    // nested dictionaries are probably a bad idea but they are possible
                    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
                ),
            ),
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new("f2", DataType::Float64, true),
                Field::new(
                    "f3",
                    DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
                    true,
                ),
                Field::new(
                    "f4",
                    DataType::Dictionary(
                        Box::new(DataType::Int8),
                        Box::new(DataType::FixedSizeBinary(23)),
                    ),
                    true,
                ),
            ])),
            // field names that are themselves simple type names
            DataType::Struct(Fields::from(vec![
                Field::new("Int64", DataType::Int64, true),
                Field::new("Float64", DataType::Float64, true),
            ])),
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new(
                    "nested_struct",
                    DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
                    true,
                ),
            ])),
            DataType::Struct(Fields::empty()),
            // TODO support more structured types (List, LargeList, Union, Map, RunEndEncoded, etc)
        ]
    }
795
796    #[test]
797    fn test_parse_data_type_whitespace_tolerance() {
798        // (string to parse, expected DataType)
799        let cases = [
800            ("Int8", DataType::Int8),
801            (
802                "Timestamp        (Nanosecond,      None)",
803                DataType::Timestamp(TimeUnit::Nanosecond, None),
804            ),
805            (
806                "Timestamp        (Nanosecond,      None)  ",
807                DataType::Timestamp(TimeUnit::Nanosecond, None),
808            ),
809            (
810                "          Timestamp        (Nanosecond,      None               )",
811                DataType::Timestamp(TimeUnit::Nanosecond, None),
812            ),
813            (
814                "Timestamp        (Nanosecond,      None               )  ",
815                DataType::Timestamp(TimeUnit::Nanosecond, None),
816            ),
817        ];
818
819        for (data_type_string, expected_data_type) in cases {
820            println!("Parsing '{data_type_string}', expecting '{expected_data_type:?}'");
821            let parsed_data_type = parse_data_type(data_type_string).unwrap();
822            assert_eq!(parsed_data_type, expected_data_type);
823        }
824    }
825
826    #[test]
827    fn parse_data_type_errors() {
828        // (string to parse, expected error message)
829        let cases = [
830            ("", "Unsupported type ''"),
831            ("", "Error finding next token"),
832            ("null", "Unsupported type 'null'"),
833            ("Nu", "Unsupported type 'Nu'"),
834            (
835                r#"Timestamp(Nanosecond, Some(+00:00))"#,
836                "Error unrecognized word: +00:00",
837            ),
838            (
839                r#"Timestamp(Nanosecond, Some("+00:00))"#,
840                r#"parsing "+00:00 as double quoted string: last char must be ""#,
841            ),
842            (
843                r#"Timestamp(Nanosecond, Some(""))"#,
844                r#"parsing "" as double quoted string: empty string isn't supported"#,
845            ),
846            (
847                r#"Timestamp(Nanosecond, Some("+00:00""))"#,
848                r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#,
849            ),
850            ("Timestamp(Nanosecond, ", "Error finding next token"),
851            (
852                "Float32 Float32",
853                "trailing content after parsing 'Float32'",
854            ),
855            ("Int32, ", "trailing content after parsing 'Int32'"),
856            ("Int32(3), ", "trailing content after parsing 'Int32'"),
857            ("FixedSizeBinary(Int32), ", "Error finding i64 for FixedSizeBinary, got 'Int32'"),
858            ("FixedSizeBinary(3.0), ", "Error parsing 3.0 as integer: invalid digit found in string"),
859            // too large for i32
860            ("FixedSizeBinary(4000000000), ", "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted"),
861            // can't have negative precision
862            ("Decimal32(-3, 5)", "Error converting -3 into u8 for Decimal32: out of range integral type conversion attempted"),
863            ("Decimal64(-3, 5)", "Error converting -3 into u8 for Decimal64: out of range integral type conversion attempted"),
864            ("Decimal128(-3, 5)", "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted"),
865            ("Decimal256(-3, 5)", "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted"),
866            ("Decimal32(3, 500)", "Error converting 500 into i8 for Decimal32: out of range integral type conversion attempted"),
867            ("Decimal64(3, 500)", "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted"),
868            ("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"),
869            ("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"),
870            ("Struct(f1, Int64)", "Error finding next type, got unexpected ','"),
871            ("Struct(f1 Int64,)", "Expected a word for the name of Struct, but got trailing comma"),
872            ("Struct(f1)", "Error finding next type, got unexpected ')'"),
873        ];
874
875        for (data_type_string, expected_message) in cases {
876            println!("Parsing '{data_type_string}', expecting '{expected_message}'");
877            match parse_data_type(data_type_string) {
878                Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"),
879                Err(e) => {
880                    let message = e.to_string();
881                    assert!(
882                        message.contains(expected_message),
883                        "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual:{message}\n"
884                    );
885                    // errors should also contain  a help message
886                    assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'"));
887                }
888            }
889        }
890    }
891
892    #[test]
893    fn parse_error_type() {
894        let err = parse_data_type("foobar").unwrap_err();
895        assert!(matches!(err, ArrowError::ParseError(_)));
896        assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error unrecognized word: foobar");
897    }
898}