arrow_schema/
datatype_parse.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
18use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
20use crate::{ArrowError, DataType, Field, IntervalUnit, TimeUnit};
21
22pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
23    Parser::new(val).parse()
24}
25
26type ArrowResult<T> = Result<T, ArrowError>;
27
28fn make_error(val: &str, msg: &str) -> ArrowError {
29    let msg = format!("Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error {msg}" );
30    ArrowError::ParseError(msg)
31}
32
33fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError {
34    make_error(val, &format!("Expected '{expected}', got '{actual}'"))
35}
36
37#[derive(Debug)]
38/// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs>
39struct Parser<'a> {
40    val: &'a str,
41    tokenizer: Tokenizer<'a>,
42}
43
44impl<'a> Parser<'a> {
45    fn new(val: &'a str) -> Self {
46        Self {
47            val,
48            tokenizer: Tokenizer::new(val),
49        }
50    }
51
52    fn parse(mut self) -> ArrowResult<DataType> {
53        let data_type = self.parse_next_type()?;
54        // ensure that there is no trailing content
55        if self.tokenizer.next().is_some() {
56            Err(make_error(
57                self.val,
58                &format!("checking trailing content after parsing '{data_type}'"),
59            ))
60        } else {
61            Ok(data_type)
62        }
63    }
64
65    /// parses the next full DataType
66    fn parse_next_type(&mut self) -> ArrowResult<DataType> {
67        match self.next_token()? {
68            Token::SimpleType(data_type) => Ok(data_type),
69            Token::Timestamp => self.parse_timestamp(),
70            Token::Time32 => self.parse_time32(),
71            Token::Time64 => self.parse_time64(),
72            Token::Duration => self.parse_duration(),
73            Token::Interval => self.parse_interval(),
74            Token::FixedSizeBinary => self.parse_fixed_size_binary(),
75            Token::Decimal128 => self.parse_decimal_128(),
76            Token::Decimal256 => self.parse_decimal_256(),
77            Token::Dictionary => self.parse_dictionary(),
78            Token::List => self.parse_list(),
79            Token::LargeList => self.parse_large_list(),
80            Token::FixedSizeList => self.parse_fixed_size_list(),
81            tok => Err(make_error(
82                self.val,
83                &format!("finding next type, got unexpected '{tok}'"),
84            )),
85        }
86    }
87
88    /// Parses the List type
89    fn parse_list(&mut self) -> ArrowResult<DataType> {
90        self.expect_token(Token::LParen)?;
91        let data_type = self.parse_next_type()?;
92        self.expect_token(Token::RParen)?;
93        Ok(DataType::List(Arc::new(Field::new_list_field(
94            data_type, true,
95        ))))
96    }
97
98    /// Parses the LargeList type
99    fn parse_large_list(&mut self) -> ArrowResult<DataType> {
100        self.expect_token(Token::LParen)?;
101        let data_type = self.parse_next_type()?;
102        self.expect_token(Token::RParen)?;
103        Ok(DataType::LargeList(Arc::new(Field::new_list_field(
104            data_type, true,
105        ))))
106    }
107
108    /// Parses the FixedSizeList type
109    fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
110        self.expect_token(Token::LParen)?;
111        let length = self.parse_i32("FixedSizeList")?;
112        self.expect_token(Token::Comma)?;
113        let data_type = self.parse_next_type()?;
114        self.expect_token(Token::RParen)?;
115        Ok(DataType::FixedSizeList(
116            Arc::new(Field::new_list_field(data_type, true)),
117            length,
118        ))
119    }
120
121    /// Parses the next timeunit
122    fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> {
123        match self.next_token()? {
124            Token::TimeUnit(time_unit) => Ok(time_unit),
125            tok => Err(make_error(
126                self.val,
127                &format!("finding TimeUnit for {context}, got {tok}"),
128            )),
129        }
130    }
131
132    /// Parses the next timezone
133    fn parse_timezone(&mut self, context: &str) -> ArrowResult<Option<String>> {
134        match self.next_token()? {
135            Token::None => Ok(None),
136            Token::Some => {
137                self.expect_token(Token::LParen)?;
138                let timezone = self.parse_double_quoted_string("Timezone")?;
139                self.expect_token(Token::RParen)?;
140                Ok(Some(timezone))
141            }
142            tok => Err(make_error(
143                self.val,
144                &format!("finding Timezone for {context}, got {tok}"),
145            )),
146        }
147    }
148
149    /// Parses the next double quoted string
150    fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
151        match self.next_token()? {
152            Token::DoubleQuotedString(s) => Ok(s),
153            tok => Err(make_error(
154                self.val,
155                &format!("finding double quoted string for {context}, got '{tok}'"),
156            )),
157        }
158    }
159
160    /// Parses the next integer value
161    fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
162        match self.next_token()? {
163            Token::Integer(v) => Ok(v),
164            tok => Err(make_error(
165                self.val,
166                &format!("finding i64 for {context}, got '{tok}'"),
167            )),
168        }
169    }
170
171    /// Parses the next i32 integer value
172    fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> {
173        let length = self.parse_i64(context)?;
174        length.try_into().map_err(|e| {
175            make_error(
176                self.val,
177                &format!("converting {length} into i32 for {context}: {e}"),
178            )
179        })
180    }
181
182    /// Parses the next i8 integer value
183    fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> {
184        let length = self.parse_i64(context)?;
185        length.try_into().map_err(|e| {
186            make_error(
187                self.val,
188                &format!("converting {length} into i8 for {context}: {e}"),
189            )
190        })
191    }
192
193    /// Parses the next u8 integer value
194    fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> {
195        let length = self.parse_i64(context)?;
196        length.try_into().map_err(|e| {
197            make_error(
198                self.val,
199                &format!("converting {length} into u8 for {context}: {e}"),
200            )
201        })
202    }
203
204    /// Parses the next timestamp (called after `Timestamp` has been consumed)
205    fn parse_timestamp(&mut self) -> ArrowResult<DataType> {
206        self.expect_token(Token::LParen)?;
207        let time_unit = self.parse_time_unit("Timestamp")?;
208        self.expect_token(Token::Comma)?;
209        let timezone = self.parse_timezone("Timestamp")?;
210        self.expect_token(Token::RParen)?;
211        Ok(DataType::Timestamp(time_unit, timezone.map(Into::into)))
212    }
213
214    /// Parses the next Time32 (called after `Time32` has been consumed)
215    fn parse_time32(&mut self) -> ArrowResult<DataType> {
216        self.expect_token(Token::LParen)?;
217        let time_unit = self.parse_time_unit("Time32")?;
218        self.expect_token(Token::RParen)?;
219        Ok(DataType::Time32(time_unit))
220    }
221
222    /// Parses the next Time64 (called after `Time64` has been consumed)
223    fn parse_time64(&mut self) -> ArrowResult<DataType> {
224        self.expect_token(Token::LParen)?;
225        let time_unit = self.parse_time_unit("Time64")?;
226        self.expect_token(Token::RParen)?;
227        Ok(DataType::Time64(time_unit))
228    }
229
230    /// Parses the next Duration (called after `Duration` has been consumed)
231    fn parse_duration(&mut self) -> ArrowResult<DataType> {
232        self.expect_token(Token::LParen)?;
233        let time_unit = self.parse_time_unit("Duration")?;
234        self.expect_token(Token::RParen)?;
235        Ok(DataType::Duration(time_unit))
236    }
237
238    /// Parses the next Interval (called after `Interval` has been consumed)
239    fn parse_interval(&mut self) -> ArrowResult<DataType> {
240        self.expect_token(Token::LParen)?;
241        let interval_unit = match self.next_token()? {
242            Token::IntervalUnit(interval_unit) => interval_unit,
243            tok => {
244                return Err(make_error(
245                    self.val,
246                    &format!("finding IntervalUnit for Interval, got {tok}"),
247                ))
248            }
249        };
250        self.expect_token(Token::RParen)?;
251        Ok(DataType::Interval(interval_unit))
252    }
253
254    /// Parses the next FixedSizeBinary (called after `FixedSizeBinary` has been consumed)
255    fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> {
256        self.expect_token(Token::LParen)?;
257        let length = self.parse_i32("FixedSizeBinary")?;
258        self.expect_token(Token::RParen)?;
259        Ok(DataType::FixedSizeBinary(length))
260    }
261
262    /// Parses the next Decimal128 (called after `Decimal128` has been consumed)
263    fn parse_decimal_128(&mut self) -> ArrowResult<DataType> {
264        self.expect_token(Token::LParen)?;
265        let precision = self.parse_u8("Decimal128")?;
266        self.expect_token(Token::Comma)?;
267        let scale = self.parse_i8("Decimal128")?;
268        self.expect_token(Token::RParen)?;
269        Ok(DataType::Decimal128(precision, scale))
270    }
271
272    /// Parses the next Decimal256 (called after `Decimal256` has been consumed)
273    fn parse_decimal_256(&mut self) -> ArrowResult<DataType> {
274        self.expect_token(Token::LParen)?;
275        let precision = self.parse_u8("Decimal256")?;
276        self.expect_token(Token::Comma)?;
277        let scale = self.parse_i8("Decimal256")?;
278        self.expect_token(Token::RParen)?;
279        Ok(DataType::Decimal256(precision, scale))
280    }
281
282    /// Parses the next Dictionary (called after `Dictionary` has been consumed)
283    fn parse_dictionary(&mut self) -> ArrowResult<DataType> {
284        self.expect_token(Token::LParen)?;
285        let key_type = self.parse_next_type()?;
286        self.expect_token(Token::Comma)?;
287        let value_type = self.parse_next_type()?;
288        self.expect_token(Token::RParen)?;
289        Ok(DataType::Dictionary(
290            Box::new(key_type),
291            Box::new(value_type),
292        ))
293    }
294
295    /// return the next token, or an error if there are none left
296    fn next_token(&mut self) -> ArrowResult<Token> {
297        match self.tokenizer.next() {
298            None => Err(make_error(self.val, "finding next token")),
299            Some(token) => token,
300        }
301    }
302
303    /// consume the next token, returning OK(()) if it matches tok, and Err if not
304    fn expect_token(&mut self, tok: Token) -> ArrowResult<()> {
305        let next_token = self.next_token()?;
306        if next_token == tok {
307            Ok(())
308        } else {
309            Err(make_error_expected(self.val, &tok, &next_token))
310        }
311    }
312}
313
/// Returns true if `c` ends a word: an opening or closing parenthesis, a
/// comma, or a space.
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ' ')
}
318
/// Splits a string like `Dictionary(Int32, Int64)` into tokens suitable for parsing
///
/// For example the string "Timestamp(Nanosecond, None)" would be split into:
///
/// * Token::Timestamp
/// * Token::LParen
/// * Token::TimeUnit(TimeUnit::Nanosecond)
/// * Token::Comma
/// * Token::None
/// * Token::RParen
#[derive(Debug)]
struct Tokenizer<'a> {
    /// The complete input string, kept so error messages can quote it
    val: &'a str,
    /// The characters of `val` that have not been consumed yet
    chars: Peekable<Chars<'a>>,
    /// Temporary buffer holding the word currently being parsed
    word: String,
}
336
337impl<'a> Tokenizer<'a> {
338    fn new(val: &'a str) -> Self {
339        Self {
340            val,
341            chars: val.chars().peekable(),
342            word: String::new(),
343        }
344    }
345
346    /// returns the next char, without consuming it
347    fn peek_next_char(&mut self) -> Option<char> {
348        self.chars.peek().copied()
349    }
350
351    /// returns the next char, and consuming it
352    fn next_char(&mut self) -> Option<char> {
353        self.chars.next()
354    }
355
356    /// parse the characters in val starting at pos, until the next
357    /// `,`, `(`, or `)` or end of line
358    fn parse_word(&mut self) -> ArrowResult<Token> {
359        // reset temp space
360        self.word.clear();
361        loop {
362            match self.peek_next_char() {
363                None => break,
364                Some(c) if is_separator(c) => break,
365                Some(c) => {
366                    self.next_char();
367                    self.word.push(c);
368                }
369            }
370        }
371
372        if let Some(c) = self.word.chars().next() {
373            // if it started with a number, try parsing it as an integer
374            if c == '-' || c.is_numeric() {
375                let val: i64 = self.word.parse().map_err(|e| {
376                    make_error(self.val, &format!("parsing {} as integer: {e}", self.word))
377                })?;
378                return Ok(Token::Integer(val));
379            }
380            // if it started with a double quote `"`, try parsing it as a double quoted string
381            else if c == '"' {
382                let len = self.word.chars().count();
383
384                // to verify it's double quoted
385                if let Some(last_c) = self.word.chars().last() {
386                    if last_c != '"' || len < 2 {
387                        return Err(make_error(
388                            self.val,
389                            &format!(
390                                "parsing {} as double quoted string: last char must be \"",
391                                self.word
392                            ),
393                        ));
394                    }
395                }
396
397                if len == 2 {
398                    return Err(make_error(
399                        self.val,
400                        &format!(
401                            "parsing {} as double quoted string: empty string isn't supported",
402                            self.word
403                        ),
404                    ));
405                }
406
407                let val: String = self.word.parse().map_err(|e| {
408                    make_error(
409                        self.val,
410                        &format!("parsing {} as double quoted string: {e}", self.word),
411                    )
412                })?;
413
414                let s = val[1..len - 1].to_string();
415                if s.contains('"') {
416                    return Err(make_error(
417                        self.val,
418                        &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word),
419                    ));
420                }
421
422                return Ok(Token::DoubleQuotedString(s));
423            }
424        }
425
426        // figure out what the word was
427        let token = match self.word.as_str() {
428            "Null" => Token::SimpleType(DataType::Null),
429            "Boolean" => Token::SimpleType(DataType::Boolean),
430
431            "Int8" => Token::SimpleType(DataType::Int8),
432            "Int16" => Token::SimpleType(DataType::Int16),
433            "Int32" => Token::SimpleType(DataType::Int32),
434            "Int64" => Token::SimpleType(DataType::Int64),
435
436            "UInt8" => Token::SimpleType(DataType::UInt8),
437            "UInt16" => Token::SimpleType(DataType::UInt16),
438            "UInt32" => Token::SimpleType(DataType::UInt32),
439            "UInt64" => Token::SimpleType(DataType::UInt64),
440
441            "Utf8" => Token::SimpleType(DataType::Utf8),
442            "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
443            "Utf8View" => Token::SimpleType(DataType::Utf8View),
444            "Binary" => Token::SimpleType(DataType::Binary),
445            "BinaryView" => Token::SimpleType(DataType::BinaryView),
446            "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
447
448            "Float16" => Token::SimpleType(DataType::Float16),
449            "Float32" => Token::SimpleType(DataType::Float32),
450            "Float64" => Token::SimpleType(DataType::Float64),
451
452            "Date32" => Token::SimpleType(DataType::Date32),
453            "Date64" => Token::SimpleType(DataType::Date64),
454
455            "List" => Token::List,
456            "LargeList" => Token::LargeList,
457            "FixedSizeList" => Token::FixedSizeList,
458
459            "Second" => Token::TimeUnit(TimeUnit::Second),
460            "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
461            "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
462            "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
463
464            "Timestamp" => Token::Timestamp,
465            "Time32" => Token::Time32,
466            "Time64" => Token::Time64,
467            "Duration" => Token::Duration,
468            "Interval" => Token::Interval,
469            "Dictionary" => Token::Dictionary,
470
471            "FixedSizeBinary" => Token::FixedSizeBinary,
472            "Decimal128" => Token::Decimal128,
473            "Decimal256" => Token::Decimal256,
474
475            "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
476            "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
477            "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
478
479            "Some" => Token::Some,
480            "None" => Token::None,
481
482            _ => {
483                return Err(make_error(
484                    self.val,
485                    &format!("unrecognized word: {}", self.word),
486                ))
487            }
488        };
489        Ok(token)
490    }
491}
492
493impl Iterator for Tokenizer<'_> {
494    type Item = ArrowResult<Token>;
495
496    fn next(&mut self) -> Option<Self::Item> {
497        loop {
498            match self.peek_next_char()? {
499                ' ' => {
500                    // skip whitespace
501                    self.next_char();
502                    continue;
503                }
504                '(' => {
505                    self.next_char();
506                    return Some(Ok(Token::LParen));
507                }
508                ')' => {
509                    self.next_char();
510                    return Some(Ok(Token::RParen));
511                }
512                ',' => {
513                    self.next_char();
514                    return Some(Ok(Token::Comma));
515                }
516                _ => return Some(self.parse_word()),
517            }
518        }
519    }
520}
521
522/// Grammar is
523///
524#[derive(Debug, PartialEq)]
525enum Token {
526    // Null, or Int32
527    SimpleType(DataType),
528    Timestamp,
529    Time32,
530    Time64,
531    Duration,
532    Interval,
533    FixedSizeBinary,
534    Decimal128,
535    Decimal256,
536    Dictionary,
537    TimeUnit(TimeUnit),
538    IntervalUnit(IntervalUnit),
539    LParen,
540    RParen,
541    Comma,
542    Some,
543    None,
544    Integer(i64),
545    DoubleQuotedString(String),
546    List,
547    LargeList,
548    FixedSizeList,
549}
550
551impl Display for Token {
552    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
553        match self {
554            Token::SimpleType(t) => write!(f, "{t}"),
555            Token::List => write!(f, "List"),
556            Token::LargeList => write!(f, "LargeList"),
557            Token::FixedSizeList => write!(f, "FixedSizeList"),
558            Token::Timestamp => write!(f, "Timestamp"),
559            Token::Time32 => write!(f, "Time32"),
560            Token::Time64 => write!(f, "Time64"),
561            Token::Duration => write!(f, "Duration"),
562            Token::Interval => write!(f, "Interval"),
563            Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
564            Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
565            Token::LParen => write!(f, "("),
566            Token::RParen => write!(f, ")"),
567            Token::Comma => write!(f, ","),
568            Token::Some => write!(f, "Some"),
569            Token::None => write!(f, "None"),
570            Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
571            Token::Decimal128 => write!(f, "Decimal128"),
572            Token::Decimal256 => write!(f, "Decimal256"),
573            Token::Dictionary => write!(f, "Dictionary"),
574            Token::Integer(v) => write!(f, "Integer({v})"),
575            Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
576        }
577    }
578}
579
#[cfg(test)]
mod test {
    use super::*;

    /// Every type in `list_datatypes` must survive a to-string / parse round trip.
    #[test]
    fn test_parse_data_type() {
        // this ensures types can be parsed correctly from their string representations
        list_datatypes().into_iter().for_each(round_trip);
    }

    /// convert data_type to a string, and then parse it as a type
    /// verifying it is the same
    fn round_trip(data_type: DataType) {
        let data_type_string = data_type.to_string();
        println!("Input '{data_type_string}' ({data_type:?})");
        let parsed_type = parse_data_type(&data_type_string).unwrap();
        assert_eq!(
            data_type, parsed_type,
            "Mismatch parsing {data_type_string}"
        );
    }

    /// The data types exercised by the round trip test.
    fn list_datatypes() -> Vec<DataType> {
        vec![
            // ---------
            // Non Nested types
            // ---------
            DataType::Null,
            DataType::Boolean,
            DataType::Int8,
            DataType::Int16,
            DataType::Int32,
            DataType::Int64,
            DataType::UInt8,
            DataType::UInt16,
            DataType::UInt32,
            DataType::UInt64,
            DataType::Float16,
            DataType::Float32,
            DataType::Float64,
            DataType::Timestamp(TimeUnit::Second, None),
            DataType::Timestamp(TimeUnit::Millisecond, None),
            DataType::Timestamp(TimeUnit::Microsecond, None),
            DataType::Timestamp(TimeUnit::Nanosecond, None),
            // we can't cover all possible timezones, here we only test utc and +08:00
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
            DataType::Date32,
            DataType::Date64,
            DataType::Time32(TimeUnit::Second),
            DataType::Time32(TimeUnit::Millisecond),
            DataType::Time32(TimeUnit::Microsecond),
            DataType::Time32(TimeUnit::Nanosecond),
            DataType::Time64(TimeUnit::Second),
            DataType::Time64(TimeUnit::Millisecond),
            DataType::Time64(TimeUnit::Microsecond),
            DataType::Time64(TimeUnit::Nanosecond),
            DataType::Duration(TimeUnit::Second),
            DataType::Duration(TimeUnit::Millisecond),
            DataType::Duration(TimeUnit::Microsecond),
            DataType::Duration(TimeUnit::Nanosecond),
            DataType::Interval(IntervalUnit::YearMonth),
            DataType::Interval(IntervalUnit::DayTime),
            DataType::Interval(IntervalUnit::MonthDayNano),
            DataType::Binary,
            DataType::BinaryView,
            DataType::FixedSizeBinary(0),
            DataType::FixedSizeBinary(1234),
            DataType::FixedSizeBinary(-432),
            DataType::LargeBinary,
            DataType::Utf8,
            DataType::Utf8View,
            DataType::LargeUtf8,
            DataType::Decimal128(7, 12),
            DataType::Decimal256(6, 13),
            // ---------
            // Nested types
            // ---------
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::FixedSizeBinary(23)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(
                    // nested dictionaries are probably a bad idea but they are possible
                    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
                ),
            ),
            // TODO support more structured types (List, LargeList, Struct, Union, Map, RunEndEncoded, etc)
        ]
    }

    /// Extra spaces around tokens must not change the parsed type.
    #[test]
    fn test_parse_data_type_whitespace_tolerance() {
        // (string to parse, expected DataType)
        let cases = [
            ("Int8", DataType::Int8),
            (
                "Timestamp        (Nanosecond,      None)",
                DataType::Timestamp(TimeUnit::Nanosecond, None),
            ),
            (
                "Timestamp        (Nanosecond,      None)  ",
                DataType::Timestamp(TimeUnit::Nanosecond, None),
            ),
            (
                "          Timestamp        (Nanosecond,      None               )",
                DataType::Timestamp(TimeUnit::Nanosecond, None),
            ),
            (
                "Timestamp        (Nanosecond,      None               )  ",
                DataType::Timestamp(TimeUnit::Nanosecond, None),
            ),
        ];

        for (data_type_string, expected_data_type) in cases {
            println!("Parsing '{data_type_string}', expecting '{expected_data_type:?}'");
            let parsed_data_type = parse_data_type(data_type_string).unwrap();
            assert_eq!(parsed_data_type, expected_data_type);
        }
    }

    /// Invalid inputs must fail with a message containing the expected text
    /// as well as the general help message.
    #[test]
    fn parse_data_type_errors() {
        // (string to parse, expected error message)
        let cases = [
            ("", "Unsupported type ''"),
            ("", "Error finding next token"),
            ("null", "Unsupported type 'null'"),
            ("Nu", "Unsupported type 'Nu'"),
            (
                r#"Timestamp(Nanosecond, Some(+00:00))"#,
                "Error unrecognized word: +00:00",
            ),
            (
                r#"Timestamp(Nanosecond, Some("+00:00))"#,
                r#"parsing "+00:00 as double quoted string: last char must be ""#,
            ),
            (
                r#"Timestamp(Nanosecond, Some(""))"#,
                r#"parsing "" as double quoted string: empty string isn't supported"#,
            ),
            (
                r#"Timestamp(Nanosecond, Some("+00:00""))"#,
                r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#,
            ),
            ("Timestamp(Nanosecond, ", "Error finding next token"),
            (
                "Float32 Float32",
                "trailing content after parsing 'Float32'",
            ),
            ("Int32, ", "trailing content after parsing 'Int32'"),
            ("Int32(3), ", "trailing content after parsing 'Int32'"),
            ("FixedSizeBinary(Int32), ", "Error finding i64 for FixedSizeBinary, got 'Int32'"),
            ("FixedSizeBinary(3.0), ", "Error parsing 3.0 as integer: invalid digit found in string"),
            // too large for i32
            ("FixedSizeBinary(4000000000), ", "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted"),
            // can't have negative precision
            ("Decimal128(-3, 5)", "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted"),
            ("Decimal256(-3, 5)", "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted"),
            ("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"),
            ("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"),
        ];

        for (data_type_string, expected_message) in cases {
            // one line per case, so a failure is easy to attribute in the captured output
            println!("Parsing '{data_type_string}', expecting '{expected_message}'");
            let message = match parse_data_type(data_type_string) {
                Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"),
                Err(e) => e.to_string(),
            };
            assert!(
                message.contains(expected_message),
                "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual:{message}\n"
            );
            // errors should also contain a help message
            assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'"));
        }
    }

    /// An unknown word is reported as a `ParseError` with the full help message.
    #[test]
    fn parse_error_type() {
        let err = parse_data_type("foobar").unwrap_err();
        assert!(matches!(err, ArrowError::ParseError(_)));
        assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error unrecognized word: foobar");
    }
}
783}