arrow_schema/
datatype_parse.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
20use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit};
21
22pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
23    Parser::new(val).parse()
24}
25
/// Shorthand for a `Result` whose error type is [`ArrowError`]
type ArrowResult<T> = Result<T, ArrowError>;
27
28fn make_error(val: &str, msg: &str) -> ArrowError {
29    let msg = format!("Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error {msg}" );
30    ArrowError::ParseError(msg)
31}
32
33fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError {
34    make_error(val, &format!("Expected '{expected}', got '{actual}'"))
35}
36
#[derive(Debug)]
/// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs>
struct Parser<'a> {
    /// The complete input string, quoted in every error message
    val: &'a str,
    /// Source of the tokens that make up `val`
    tokenizer: Tokenizer<'a>,
}
43
44impl<'a> Parser<'a> {
45    fn new(val: &'a str) -> Self {
46        Self {
47            val,
48            tokenizer: Tokenizer::new(val),
49        }
50    }
51
52    fn parse(mut self) -> ArrowResult<DataType> {
53        let data_type = self.parse_next_type()?;
54        // ensure that there is no trailing content
55        if self.tokenizer.next().is_some() {
56            Err(make_error(
57                self.val,
58                &format!("checking trailing content after parsing '{data_type}'"),
59            ))
60        } else {
61            Ok(data_type)
62        }
63    }
64
65    /// parses the next full DataType
66    fn parse_next_type(&mut self) -> ArrowResult<DataType> {
67        match self.next_token()? {
68            Token::SimpleType(data_type) => Ok(data_type),
69            Token::Timestamp => self.parse_timestamp(),
70            Token::Time32 => self.parse_time32(),
71            Token::Time64 => self.parse_time64(),
72            Token::Duration => self.parse_duration(),
73            Token::Interval => self.parse_interval(),
74            Token::FixedSizeBinary => self.parse_fixed_size_binary(),
75            Token::Decimal128 => self.parse_decimal_128(),
76            Token::Decimal256 => self.parse_decimal_256(),
77            Token::Dictionary => self.parse_dictionary(),
78            Token::List => self.parse_list(),
79            Token::LargeList => self.parse_large_list(),
80            Token::FixedSizeList => self.parse_fixed_size_list(),
81            Token::Struct => self.parse_struct(),
82            Token::FieldName(word) => Err(make_error(
83                self.val,
84                &format!("unrecognized word: {}", word),
85            )),
86            tok => Err(make_error(
87                self.val,
88                &format!("finding next type, got unexpected '{tok}'"),
89            )),
90        }
91    }
92
93    /// Parses the List type
94    fn parse_list(&mut self) -> ArrowResult<DataType> {
95        self.expect_token(Token::LParen)?;
96        let data_type = self.parse_next_type()?;
97        self.expect_token(Token::RParen)?;
98        Ok(DataType::List(Arc::new(Field::new_list_field(
99            data_type, true,
100        ))))
101    }
102
103    /// Parses the LargeList type
104    fn parse_large_list(&mut self) -> ArrowResult<DataType> {
105        self.expect_token(Token::LParen)?;
106        let data_type = self.parse_next_type()?;
107        self.expect_token(Token::RParen)?;
108        Ok(DataType::LargeList(Arc::new(Field::new_list_field(
109            data_type, true,
110        ))))
111    }
112
113    /// Parses the FixedSizeList type
114    fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
115        self.expect_token(Token::LParen)?;
116        let length = self.parse_i32("FixedSizeList")?;
117        self.expect_token(Token::Comma)?;
118        let data_type = self.parse_next_type()?;
119        self.expect_token(Token::RParen)?;
120        Ok(DataType::FixedSizeList(
121            Arc::new(Field::new_list_field(data_type, true)),
122            length,
123        ))
124    }
125
126    /// Parses the next timeunit
127    fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> {
128        match self.next_token()? {
129            Token::TimeUnit(time_unit) => Ok(time_unit),
130            tok => Err(make_error(
131                self.val,
132                &format!("finding TimeUnit for {context}, got {tok}"),
133            )),
134        }
135    }
136
137    /// Parses the next timezone
138    fn parse_timezone(&mut self, context: &str) -> ArrowResult<Option<String>> {
139        match self.next_token()? {
140            Token::None => Ok(None),
141            Token::Some => {
142                self.expect_token(Token::LParen)?;
143                let timezone = self.parse_double_quoted_string("Timezone")?;
144                self.expect_token(Token::RParen)?;
145                Ok(Some(timezone))
146            }
147            tok => Err(make_error(
148                self.val,
149                &format!("finding Timezone for {context}, got {tok}"),
150            )),
151        }
152    }
153
154    /// Parses the next double quoted string
155    fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
156        match self.next_token()? {
157            Token::DoubleQuotedString(s) => Ok(s),
158            Token::FieldName(word) => Err(make_error(
159                self.val,
160                &format!("unrecognized word: {}", word),
161            )),
162            tok => Err(make_error(
163                self.val,
164                &format!("finding double quoted string for {context}, got '{tok}'"),
165            )),
166        }
167    }
168
169    /// Parses the next integer value
170    fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
171        match self.next_token()? {
172            Token::Integer(v) => Ok(v),
173            tok => Err(make_error(
174                self.val,
175                &format!("finding i64 for {context}, got '{tok}'"),
176            )),
177        }
178    }
179
180    /// Parses the next i32 integer value
181    fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> {
182        let length = self.parse_i64(context)?;
183        length.try_into().map_err(|e| {
184            make_error(
185                self.val,
186                &format!("converting {length} into i32 for {context}: {e}"),
187            )
188        })
189    }
190
191    /// Parses the next i8 integer value
192    fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> {
193        let length = self.parse_i64(context)?;
194        length.try_into().map_err(|e| {
195            make_error(
196                self.val,
197                &format!("converting {length} into i8 for {context}: {e}"),
198            )
199        })
200    }
201
202    /// Parses the next u8 integer value
203    fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> {
204        let length = self.parse_i64(context)?;
205        length.try_into().map_err(|e| {
206            make_error(
207                self.val,
208                &format!("converting {length} into u8 for {context}: {e}"),
209            )
210        })
211    }
212
213    /// Parses the next timestamp (called after `Timestamp` has been consumed)
214    fn parse_timestamp(&mut self) -> ArrowResult<DataType> {
215        self.expect_token(Token::LParen)?;
216        let time_unit = self.parse_time_unit("Timestamp")?;
217        self.expect_token(Token::Comma)?;
218        let timezone = self.parse_timezone("Timestamp")?;
219        self.expect_token(Token::RParen)?;
220        Ok(DataType::Timestamp(time_unit, timezone.map(Into::into)))
221    }
222
223    /// Parses the next Time32 (called after `Time32` has been consumed)
224    fn parse_time32(&mut self) -> ArrowResult<DataType> {
225        self.expect_token(Token::LParen)?;
226        let time_unit = self.parse_time_unit("Time32")?;
227        self.expect_token(Token::RParen)?;
228        Ok(DataType::Time32(time_unit))
229    }
230
231    /// Parses the next Time64 (called after `Time64` has been consumed)
232    fn parse_time64(&mut self) -> ArrowResult<DataType> {
233        self.expect_token(Token::LParen)?;
234        let time_unit = self.parse_time_unit("Time64")?;
235        self.expect_token(Token::RParen)?;
236        Ok(DataType::Time64(time_unit))
237    }
238
239    /// Parses the next Duration (called after `Duration` has been consumed)
240    fn parse_duration(&mut self) -> ArrowResult<DataType> {
241        self.expect_token(Token::LParen)?;
242        let time_unit = self.parse_time_unit("Duration")?;
243        self.expect_token(Token::RParen)?;
244        Ok(DataType::Duration(time_unit))
245    }
246
247    /// Parses the next Interval (called after `Interval` has been consumed)
248    fn parse_interval(&mut self) -> ArrowResult<DataType> {
249        self.expect_token(Token::LParen)?;
250        let interval_unit = match self.next_token()? {
251            Token::IntervalUnit(interval_unit) => interval_unit,
252            tok => {
253                return Err(make_error(
254                    self.val,
255                    &format!("finding IntervalUnit for Interval, got {tok}"),
256                ))
257            }
258        };
259        self.expect_token(Token::RParen)?;
260        Ok(DataType::Interval(interval_unit))
261    }
262
263    /// Parses the next FixedSizeBinary (called after `FixedSizeBinary` has been consumed)
264    fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> {
265        self.expect_token(Token::LParen)?;
266        let length = self.parse_i32("FixedSizeBinary")?;
267        self.expect_token(Token::RParen)?;
268        Ok(DataType::FixedSizeBinary(length))
269    }
270
271    /// Parses the next Decimal128 (called after `Decimal128` has been consumed)
272    fn parse_decimal_128(&mut self) -> ArrowResult<DataType> {
273        self.expect_token(Token::LParen)?;
274        let precision = self.parse_u8("Decimal128")?;
275        self.expect_token(Token::Comma)?;
276        let scale = self.parse_i8("Decimal128")?;
277        self.expect_token(Token::RParen)?;
278        Ok(DataType::Decimal128(precision, scale))
279    }
280
281    /// Parses the next Decimal256 (called after `Decimal256` has been consumed)
282    fn parse_decimal_256(&mut self) -> ArrowResult<DataType> {
283        self.expect_token(Token::LParen)?;
284        let precision = self.parse_u8("Decimal256")?;
285        self.expect_token(Token::Comma)?;
286        let scale = self.parse_i8("Decimal256")?;
287        self.expect_token(Token::RParen)?;
288        Ok(DataType::Decimal256(precision, scale))
289    }
290
291    /// Parses the next Dictionary (called after `Dictionary` has been consumed)
292    fn parse_dictionary(&mut self) -> ArrowResult<DataType> {
293        self.expect_token(Token::LParen)?;
294        let key_type = self.parse_next_type()?;
295        self.expect_token(Token::Comma)?;
296        let value_type = self.parse_next_type()?;
297        self.expect_token(Token::RParen)?;
298        Ok(DataType::Dictionary(
299            Box::new(key_type),
300            Box::new(value_type),
301        ))
302    }
303    fn parse_struct(&mut self) -> ArrowResult<DataType> {
304        self.expect_token(Token::LParen)?;
305        let mut fields = Vec::new();
306        loop {
307            let field_name = match self.next_token()? {
308                // It's valid to have a name that is a type name
309                Token::SimpleType(data_type) => data_type.to_string(),
310                Token::FieldName(name) => name,
311                Token::RParen => {
312                    if fields.is_empty() {
313                        break;
314                    } else {
315                        return Err(make_error(
316                            self.val,
317                            "Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma",
318                        ));
319                    }
320                }
321                tok => {
322                    return Err(make_error(
323                        self.val,
324                        &format!("Expected a word for the name of Struct, but got {tok}"),
325                    ))
326                }
327            };
328            let field_type = self.parse_next_type()?;
329            fields.push(Arc::new(Field::new(field_name, field_type, true)));
330            match self.next_token()? {
331                Token::Comma => continue,
332                Token::RParen => break,
333                tok => {
334                    return Err(make_error(
335                        self.val,
336                        &format!("Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"),
337                    ))
338                }
339            }
340        }
341        Ok(DataType::Struct(Fields::from(fields)))
342    }
343
344    /// return the next token, or an error if there are none left
345    fn next_token(&mut self) -> ArrowResult<Token> {
346        match self.tokenizer.next() {
347            None => Err(make_error(self.val, "finding next token")),
348            Some(token) => token,
349        }
350    }
351
352    /// consume the next token, returning OK(()) if it matches tok, and Err if not
353    fn expect_token(&mut self, tok: Token) -> ArrowResult<()> {
354        let next_token = self.next_token()?;
355        if next_token == tok {
356            Ok(())
357        } else {
358            Err(make_error_expected(self.val, &tok, &next_token))
359        }
360    }
361}
362
/// Returns true if `c` ends a word: a parenthesis, a comma or a space
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ' ')
}
367
#[derive(Debug)]
/// Splits a string like `Dictionary(Int32, Int64)` into tokens suitable for parsing
///
/// For example the string "Timestamp(Nanosecond, None)" would be parsed into:
///
/// * Token::Timestamp
/// * Token::LParen
/// * Token::TimeUnit(TimeUnit::Nanosecond)
/// * Token::Comma,
/// * Token::None,
/// * Token::RParen,
struct Tokenizer<'a> {
    /// The complete input string, quoted in error messages
    val: &'a str,
    /// The characters of `val` that have not been consumed yet
    chars: Peekable<Chars<'a>>,
    // temporary buffer for parsing words
    word: String,
}
385
386impl<'a> Tokenizer<'a> {
387    fn new(val: &'a str) -> Self {
388        Self {
389            val,
390            chars: val.chars().peekable(),
391            word: String::new(),
392        }
393    }
394
395    /// returns the next char, without consuming it
396    fn peek_next_char(&mut self) -> Option<char> {
397        self.chars.peek().copied()
398    }
399
400    /// returns the next char, and consuming it
401    fn next_char(&mut self) -> Option<char> {
402        self.chars.next()
403    }
404
405    /// parse the characters in val starting at pos, until the next
406    /// `,`, `(`, or `)` or end of line
407    fn parse_word(&mut self) -> ArrowResult<Token> {
408        // reset temp space
409        self.word.clear();
410        loop {
411            match self.peek_next_char() {
412                None => break,
413                Some(c) if is_separator(c) => break,
414                Some(c) => {
415                    self.next_char();
416                    self.word.push(c);
417                }
418            }
419        }
420
421        if let Some(c) = self.word.chars().next() {
422            // if it started with a number, try parsing it as an integer
423            if c == '-' || c.is_numeric() {
424                let val: i64 = self.word.parse().map_err(|e| {
425                    make_error(self.val, &format!("parsing {} as integer: {e}", self.word))
426                })?;
427                return Ok(Token::Integer(val));
428            }
429            // if it started with a double quote `"`, try parsing it as a double quoted string
430            else if c == '"' {
431                let len = self.word.chars().count();
432
433                // to verify it's double quoted
434                if let Some(last_c) = self.word.chars().last() {
435                    if last_c != '"' || len < 2 {
436                        return Err(make_error(
437                            self.val,
438                            &format!(
439                                "parsing {} as double quoted string: last char must be \"",
440                                self.word
441                            ),
442                        ));
443                    }
444                }
445
446                if len == 2 {
447                    return Err(make_error(
448                        self.val,
449                        &format!(
450                            "parsing {} as double quoted string: empty string isn't supported",
451                            self.word
452                        ),
453                    ));
454                }
455
456                let val: String = self.word.parse().map_err(|e| {
457                    make_error(
458                        self.val,
459                        &format!("parsing {} as double quoted string: {e}", self.word),
460                    )
461                })?;
462
463                let s = val[1..len - 1].to_string();
464                if s.contains('"') {
465                    return Err(make_error(
466                        self.val,
467                        &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word),
468                    ));
469                }
470
471                return Ok(Token::DoubleQuotedString(s));
472            }
473        }
474
475        // figure out what the word was
476        let token = match self.word.as_str() {
477            "Null" => Token::SimpleType(DataType::Null),
478            "Boolean" => Token::SimpleType(DataType::Boolean),
479
480            "Int8" => Token::SimpleType(DataType::Int8),
481            "Int16" => Token::SimpleType(DataType::Int16),
482            "Int32" => Token::SimpleType(DataType::Int32),
483            "Int64" => Token::SimpleType(DataType::Int64),
484
485            "UInt8" => Token::SimpleType(DataType::UInt8),
486            "UInt16" => Token::SimpleType(DataType::UInt16),
487            "UInt32" => Token::SimpleType(DataType::UInt32),
488            "UInt64" => Token::SimpleType(DataType::UInt64),
489
490            "Utf8" => Token::SimpleType(DataType::Utf8),
491            "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
492            "Utf8View" => Token::SimpleType(DataType::Utf8View),
493            "Binary" => Token::SimpleType(DataType::Binary),
494            "BinaryView" => Token::SimpleType(DataType::BinaryView),
495            "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
496
497            "Float16" => Token::SimpleType(DataType::Float16),
498            "Float32" => Token::SimpleType(DataType::Float32),
499            "Float64" => Token::SimpleType(DataType::Float64),
500
501            "Date32" => Token::SimpleType(DataType::Date32),
502            "Date64" => Token::SimpleType(DataType::Date64),
503
504            "List" => Token::List,
505            "LargeList" => Token::LargeList,
506            "FixedSizeList" => Token::FixedSizeList,
507
508            "Second" => Token::TimeUnit(TimeUnit::Second),
509            "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
510            "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
511            "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
512
513            "Timestamp" => Token::Timestamp,
514            "Time32" => Token::Time32,
515            "Time64" => Token::Time64,
516            "Duration" => Token::Duration,
517            "Interval" => Token::Interval,
518            "Dictionary" => Token::Dictionary,
519
520            "FixedSizeBinary" => Token::FixedSizeBinary,
521            "Decimal128" => Token::Decimal128,
522            "Decimal256" => Token::Decimal256,
523
524            "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
525            "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
526            "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
527
528            "Some" => Token::Some,
529            "None" => Token::None,
530
531            "Struct" => Token::Struct,
532            // If we don't recognize the word, treat it as a field name
533            word => Token::FieldName(word.to_string()),
534        };
535        Ok(token)
536    }
537}
538
539impl Iterator for Tokenizer<'_> {
540    type Item = ArrowResult<Token>;
541
542    fn next(&mut self) -> Option<Self::Item> {
543        loop {
544            match self.peek_next_char()? {
545                ' ' => {
546                    // skip whitespace
547                    self.next_char();
548                    continue;
549                }
550                '(' => {
551                    self.next_char();
552                    return Some(Ok(Token::LParen));
553                }
554                ')' => {
555                    self.next_char();
556                    return Some(Ok(Token::RParen));
557                }
558                ',' => {
559                    self.next_char();
560                    return Some(Ok(Token::Comma));
561                }
562                _ => return Some(self.parse_word()),
563            }
564        }
565    }
566}
567
/// A lexical element of a data type string, as produced by `Tokenizer`
///
/// Keywords that introduce a parameterized type (for example `Timestamp` or
/// `Struct`) have their own variant; their arguments follow as separate tokens.
#[derive(Debug, PartialEq)]
enum Token {
    // Null, or Int32
    /// A type that is fully described by its name
    SimpleType(DataType),
    /// The `Timestamp` keyword
    Timestamp,
    /// The `Time32` keyword
    Time32,
    /// The `Time64` keyword
    Time64,
    /// The `Duration` keyword
    Duration,
    /// The `Interval` keyword
    Interval,
    /// The `FixedSizeBinary` keyword
    FixedSizeBinary,
    /// The `Decimal128` keyword
    Decimal128,
    /// The `Decimal256` keyword
    Decimal256,
    /// The `Dictionary` keyword
    Dictionary,
    /// A time unit name such as `Second` or `Nanosecond`
    TimeUnit(TimeUnit),
    /// An interval unit name such as `YearMonth` or `DayTime`
    IntervalUnit(IntervalUnit),
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `,`
    Comma,
    /// The `Some` keyword
    Some,
    /// The `None` keyword
    None,
    /// A word starting with `-` or a numeric character, parsed as an `i64`
    Integer(i64),
    /// The text between the quotes of a double quoted string
    DoubleQuotedString(String),
    /// The `List` keyword
    List,
    /// The `LargeList` keyword
    LargeList,
    /// The `FixedSizeList` keyword
    FixedSizeList,
    /// The `Struct` keyword
    Struct,
    /// Any word that is not otherwise recognized
    FieldName(String),
}
598
599impl Display for Token {
600    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
601        match self {
602            Token::SimpleType(t) => write!(f, "{t}"),
603            Token::List => write!(f, "List"),
604            Token::LargeList => write!(f, "LargeList"),
605            Token::FixedSizeList => write!(f, "FixedSizeList"),
606            Token::Timestamp => write!(f, "Timestamp"),
607            Token::Time32 => write!(f, "Time32"),
608            Token::Time64 => write!(f, "Time64"),
609            Token::Duration => write!(f, "Duration"),
610            Token::Interval => write!(f, "Interval"),
611            Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
612            Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
613            Token::LParen => write!(f, "("),
614            Token::RParen => write!(f, ")"),
615            Token::Comma => write!(f, ","),
616            Token::Some => write!(f, "Some"),
617            Token::None => write!(f, "None"),
618            Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
619            Token::Decimal128 => write!(f, "Decimal128"),
620            Token::Decimal256 => write!(f, "Decimal256"),
621            Token::Dictionary => write!(f, "Dictionary"),
622            Token::Integer(v) => write!(f, "Integer({v})"),
623            Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
624            Token::Struct => write!(f, "Struct"),
625            Token::FieldName(s) => write!(f, "FieldName({s})"),
626        }
627    }
628}
629
630#[cfg(test)]
631mod test {
632    use super::*;
633
634    #[test]
635    fn test_parse_data_type() {
636        // this ensures types can be parsed correctly from their string representations
637        for dt in list_datatypes() {
638            round_trip(dt)
639        }
640    }
641
642    /// convert data_type to a string, and then parse it as a type
643    /// verifying it is the same
644    fn round_trip(data_type: DataType) {
645        let data_type_string = data_type.to_string();
646        println!("Input '{data_type_string}' ({data_type:?})");
647        let parsed_type = parse_data_type(&data_type_string).unwrap();
648        assert_eq!(
649            data_type, parsed_type,
650            "Mismatch parsing {data_type_string}"
651        );
652    }
653
654    fn list_datatypes() -> Vec<DataType> {
655        vec![
656            // ---------
657            // Non Nested types
658            // ---------
659            DataType::Null,
660            DataType::Boolean,
661            DataType::Int8,
662            DataType::Int16,
663            DataType::Int32,
664            DataType::Int64,
665            DataType::UInt8,
666            DataType::UInt16,
667            DataType::UInt32,
668            DataType::UInt64,
669            DataType::Float16,
670            DataType::Float32,
671            DataType::Float64,
672            DataType::Timestamp(TimeUnit::Second, None),
673            DataType::Timestamp(TimeUnit::Millisecond, None),
674            DataType::Timestamp(TimeUnit::Microsecond, None),
675            DataType::Timestamp(TimeUnit::Nanosecond, None),
676            // we can't cover all possible timezones, here we only test utc and +08:00
677            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
678            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
679            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
680            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
681            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
682            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
683            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
684            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
685            DataType::Date32,
686            DataType::Date64,
687            DataType::Time32(TimeUnit::Second),
688            DataType::Time32(TimeUnit::Millisecond),
689            DataType::Time32(TimeUnit::Microsecond),
690            DataType::Time32(TimeUnit::Nanosecond),
691            DataType::Time64(TimeUnit::Second),
692            DataType::Time64(TimeUnit::Millisecond),
693            DataType::Time64(TimeUnit::Microsecond),
694            DataType::Time64(TimeUnit::Nanosecond),
695            DataType::Duration(TimeUnit::Second),
696            DataType::Duration(TimeUnit::Millisecond),
697            DataType::Duration(TimeUnit::Microsecond),
698            DataType::Duration(TimeUnit::Nanosecond),
699            DataType::Interval(IntervalUnit::YearMonth),
700            DataType::Interval(IntervalUnit::DayTime),
701            DataType::Interval(IntervalUnit::MonthDayNano),
702            DataType::Binary,
703            DataType::BinaryView,
704            DataType::FixedSizeBinary(0),
705            DataType::FixedSizeBinary(1234),
706            DataType::FixedSizeBinary(-432),
707            DataType::LargeBinary,
708            DataType::Utf8,
709            DataType::Utf8View,
710            DataType::LargeUtf8,
711            DataType::Decimal128(7, 12),
712            DataType::Decimal256(6, 13),
713            // ---------
714            // Nested types
715            // ---------
716            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
717            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
718            DataType::Dictionary(
719                Box::new(DataType::Int8),
720                Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
721            ),
722            DataType::Dictionary(
723                Box::new(DataType::Int8),
724                Box::new(DataType::FixedSizeBinary(23)),
725            ),
726            DataType::Dictionary(
727                Box::new(DataType::Int8),
728                Box::new(
729                    // nested dictionaries are probably a bad idea but they are possible
730                    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
731                ),
732            ),
733            DataType::Struct(Fields::from(vec![
734                Field::new("f1", DataType::Int64, true),
735                Field::new("f2", DataType::Float64, true),
736                Field::new(
737                    "f3",
738                    DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
739                    true,
740                ),
741                Field::new(
742                    "f4",
743                    DataType::Dictionary(
744                        Box::new(DataType::Int8),
745                        Box::new(DataType::FixedSizeBinary(23)),
746                    ),
747                    true,
748                ),
749            ])),
750            DataType::Struct(Fields::from(vec![
751                Field::new("Int64", DataType::Int64, true),
752                Field::new("Float64", DataType::Float64, true),
753            ])),
754            DataType::Struct(Fields::from(vec![
755                Field::new("f1", DataType::Int64, true),
756                Field::new(
757                    "nested_struct",
758                    DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
759                    true,
760                ),
761            ])),
762            DataType::Struct(Fields::empty()),
763            // TODO support more structured types (List, LargeList, Union, Map, RunEndEncoded, etc)
764        ]
765    }
766
767    #[test]
768    fn test_parse_data_type_whitespace_tolerance() {
769        // (string to parse, expected DataType)
770        let cases = [
771            ("Int8", DataType::Int8),
772            (
773                "Timestamp        (Nanosecond,      None)",
774                DataType::Timestamp(TimeUnit::Nanosecond, None),
775            ),
776            (
777                "Timestamp        (Nanosecond,      None)  ",
778                DataType::Timestamp(TimeUnit::Nanosecond, None),
779            ),
780            (
781                "          Timestamp        (Nanosecond,      None               )",
782                DataType::Timestamp(TimeUnit::Nanosecond, None),
783            ),
784            (
785                "Timestamp        (Nanosecond,      None               )  ",
786                DataType::Timestamp(TimeUnit::Nanosecond, None),
787            ),
788        ];
789
790        for (data_type_string, expected_data_type) in cases {
791            println!("Parsing '{data_type_string}', expecting '{expected_data_type:?}'");
792            let parsed_data_type = parse_data_type(data_type_string).unwrap();
793            assert_eq!(parsed_data_type, expected_data_type);
794        }
795    }
796
797    #[test]
798    fn parse_data_type_errors() {
799        // (string to parse, expected error message)
800        let cases = [
801            ("", "Unsupported type ''"),
802            ("", "Error finding next token"),
803            ("null", "Unsupported type 'null'"),
804            ("Nu", "Unsupported type 'Nu'"),
805            (
806                r#"Timestamp(Nanosecond, Some(+00:00))"#,
807                "Error unrecognized word: +00:00",
808            ),
809            (
810                r#"Timestamp(Nanosecond, Some("+00:00))"#,
811                r#"parsing "+00:00 as double quoted string: last char must be ""#,
812            ),
813            (
814                r#"Timestamp(Nanosecond, Some(""))"#,
815                r#"parsing "" as double quoted string: empty string isn't supported"#,
816            ),
817            (
818                r#"Timestamp(Nanosecond, Some("+00:00""))"#,
819                r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#,
820            ),
821            ("Timestamp(Nanosecond, ", "Error finding next token"),
822            (
823                "Float32 Float32",
824                "trailing content after parsing 'Float32'",
825            ),
826            ("Int32, ", "trailing content after parsing 'Int32'"),
827            ("Int32(3), ", "trailing content after parsing 'Int32'"),
828            ("FixedSizeBinary(Int32), ", "Error finding i64 for FixedSizeBinary, got 'Int32'"),
829            ("FixedSizeBinary(3.0), ", "Error parsing 3.0 as integer: invalid digit found in string"),
830            // too large for i32
831            ("FixedSizeBinary(4000000000), ", "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted"),
832            // can't have negative precision
833            ("Decimal128(-3, 5)", "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted"),
834            ("Decimal256(-3, 5)", "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted"),
835            ("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"),
836            ("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"),
837            ("Struct(f1, Int64)", "Error finding next type, got unexpected ','"),
838            ("Struct(f1 Int64,)", "Expected a word for the name of Struct, but got trailing comma"),
839            ("Struct(f1)", "Error finding next type, got unexpected ')'"),
840        ];
841
842        for (data_type_string, expected_message) in cases {
843            println!("Parsing '{data_type_string}', expecting '{expected_message}'");
844            match parse_data_type(data_type_string) {
845                Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"),
846                Err(e) => {
847                    let message = e.to_string();
848                    assert!(
849                        message.contains(expected_message),
850                        "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual:{message}\n"
851                    );
852                    // errors should also contain  a help message
853                    assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'"));
854                }
855            }
856        }
857    }
858
859    #[test]
860    fn parse_error_type() {
861        let err = parse_data_type("foobar").unwrap_err();
862        assert!(matches!(err, ArrowError::ParseError(_)));
863        assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error unrecognized word: foobar");
864    }
865}