Skip to main content

arrow_schema/
datatype_parse.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
20use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields, UnionMode};
21
22/// Parses a DataType from a string representation
23///
24/// For example, the string "Int32" would be parsed into [`DataType::Int32`]
25pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
26    Parser::new(val).parse()
27}
28
/// Shorthand for a `Result` whose error type is [`ArrowError`].
type ArrowResult<T> = Result<T, ArrowError>;
30
31fn make_error(val: &str, msg: &str) -> ArrowError {
32    let msg = format!(
33        "Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error {msg}"
34    );
35    ArrowError::ParseError(msg)
36}
37
38fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError {
39    make_error(val, &format!("Expected '{expected}', got '{actual}'"))
40}
41
/// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs>
///
/// Pulls tokens from a peekable `Tokenizer` and assembles them into a `DataType`.
#[derive(Debug)]
struct Parser<'a> {
    /// The complete input string; passed to `make_error` so messages can quote it
    val: &'a str,
    /// Token stream over `val`; peekable so optional tokens can be consumed conditionally
    tokenizer: Peekable<Tokenizer<'a>>,
}
48
49impl<'a> Parser<'a> {
50    fn new(val: &'a str) -> Self {
51        Self {
52            val,
53            tokenizer: Tokenizer::new(val).peekable(),
54        }
55    }
56
57    fn parse(mut self) -> ArrowResult<DataType> {
58        let data_type = self.parse_next_type()?;
59        // ensure that there is no trailing content
60        if self.tokenizer.next().is_some() {
61            Err(make_error(
62                self.val,
63                &format!("checking trailing content after parsing '{data_type}'"),
64            ))
65        } else {
66            Ok(data_type)
67        }
68    }
69
70    /// parses the next full DataType
71    fn parse_next_type(&mut self) -> ArrowResult<DataType> {
72        match self.next_token()? {
73            Token::SimpleType(data_type) => Ok(data_type),
74            Token::Timestamp => self.parse_timestamp(),
75            Token::Time32 => self.parse_time32(),
76            Token::Time64 => self.parse_time64(),
77            Token::Duration => self.parse_duration(),
78            Token::Interval => self.parse_interval(),
79            Token::FixedSizeBinary => self.parse_fixed_size_binary(),
80            Token::Decimal32 => self.parse_decimal_32(),
81            Token::Decimal64 => self.parse_decimal_64(),
82            Token::Decimal128 => self.parse_decimal_128(),
83            Token::Decimal256 => self.parse_decimal_256(),
84            Token::Dictionary => self.parse_dictionary(),
85            Token::List => self.parse_list(),
86            Token::ListView => self.parse_list_view(),
87            Token::LargeList => self.parse_large_list(),
88            Token::LargeListView => self.parse_large_list_view(),
89            Token::FixedSizeList => self.parse_fixed_size_list(),
90            Token::Struct => self.parse_struct(),
91            Token::Union => self.parse_union(),
92            Token::Map => self.parse_map(),
93            Token::RunEndEncoded => self.parse_run_end_encoded(),
94            tok => Err(make_error(
95                self.val,
96                &format!("finding next type, got unexpected '{tok}'"),
97            )),
98        }
99    }
100
101    /// parses Field, this is the inversion of `format_field` in `datatype_display.rs`.
102    /// E.g: "a": non-null Int64
103    ///
104    /// TODO: support metadata: `"a": non-null Int64 metadata: {"foo": "value"}`
105    fn parse_field(&mut self) -> ArrowResult<Field> {
106        let name = self.parse_double_quoted_string("Field")?;
107        self.expect_token(Token::Colon)?;
108        let nullable = self.parse_opt_nullable();
109        let data_type = self.parse_next_type()?;
110        Ok(Field::new(name, data_type, nullable))
111    }
112
113    /// Parses field inside a list. Use `Field::LIST_FIELD_DEFAULT_NAME`
114    /// if no field name is specified.
115    /// E.g: `non-null Int64, field: 'foo'` or `non-null Int64`
116    ///
117    /// TODO: support metadata: `non-ull Int64, metadata: {"foo2": "value"}`
118    fn parse_list_field(&mut self, context: &str) -> ArrowResult<Field> {
119        let nullable = self.parse_opt_nullable();
120        let data_type = self.parse_next_type()?;
121
122        // the field name (if exists) must be after a comma
123        let field_name = if self
124            .tokenizer
125            .next_if(|next| matches!(next, Ok(Token::Comma)))
126            .is_none()
127        {
128            Field::LIST_FIELD_DEFAULT_NAME.into()
129        } else {
130            // expects: `field: 'field_name'`.
131            self.expect_token(Token::Field)?;
132            self.expect_token(Token::Colon)?;
133            self.parse_single_quoted_string(context)?
134        };
135
136        Ok(Field::new(field_name, data_type, nullable))
137    }
138
139    /// Parses the List type (called after `List` has been consumed)
140    /// E.g: List(non-null Int64, field: 'foo')
141    fn parse_list(&mut self) -> ArrowResult<DataType> {
142        self.expect_token(Token::LParen)?;
143        let field = self.parse_list_field("List")?;
144        self.expect_token(Token::RParen)?;
145        Ok(DataType::List(Arc::new(field)))
146    }
147
148    /// Parses the ListView type (called after `ListView` has been consumed)
149    /// E.g: ListView(non-null Int64, field: 'foo')
150    fn parse_list_view(&mut self) -> ArrowResult<DataType> {
151        self.expect_token(Token::LParen)?;
152        let field = self.parse_list_field("ListView")?;
153        self.expect_token(Token::RParen)?;
154        Ok(DataType::ListView(Arc::new(field)))
155    }
156
157    /// Parses the LargeList type (called after `LargeList` has been consumed)
158    /// E.g: LargeList(non-null Int64, field: 'foo')
159    fn parse_large_list(&mut self) -> ArrowResult<DataType> {
160        self.expect_token(Token::LParen)?;
161        let field = self.parse_list_field("LargeList")?;
162        self.expect_token(Token::RParen)?;
163        Ok(DataType::LargeList(Arc::new(field)))
164    }
165
166    /// Parses the LargeListView type (called after `LargeListView` has been consumed)
167    /// E.g: LargeListView(non-null Int64, field: 'foo')
168    fn parse_large_list_view(&mut self) -> ArrowResult<DataType> {
169        self.expect_token(Token::LParen)?;
170        let field = self.parse_list_field("LargeListView")?;
171        self.expect_token(Token::RParen)?;
172        Ok(DataType::LargeListView(Arc::new(field)))
173    }
174
175    /// Parses the FixedSizeList type (called after `FixedSizeList` has been consumed)
176    ///
177    /// Examples:
178    /// * `FixedSizeList(5 x non-null Int64, field: 'foo')`
179    /// * `FixedSizeList(4, Int64)`
180    ///
181    fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
182        self.expect_token(Token::LParen)?;
183        let length = self.parse_i32("FixedSizeList")?;
184        match self.next_token()? {
185            // `FixedSizeList(5 x non-null Int64, field: 'foo')` format
186            Token::X => {
187                let field = self.parse_list_field("FixedSizeList")?;
188                self.expect_token(Token::RParen)?;
189                Ok(DataType::FixedSizeList(Arc::new(field), length))
190            }
191            // `FixedSizeList(4, Int64)` format
192            Token::Comma => {
193                let data_type = self.parse_next_type()?;
194                self.expect_token(Token::RParen)?;
195                Ok(DataType::FixedSizeList(
196                    Arc::new(Field::new_list_field(data_type, true)),
197                    length,
198                ))
199            }
200            tok => Err(make_error(
201                self.val,
202                &format!("Expected 'x' or ',' after length for FixedSizeList, got '{tok}'"),
203            )),
204        }
205    }
206
207    /// Parses the next timeunit
208    fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> {
209        match self.next_token()? {
210            Token::TimeUnit(time_unit) => Ok(time_unit),
211            tok => Err(make_error(
212                self.val,
213                &format!("finding TimeUnit for {context}, got {tok}"),
214            )),
215        }
216    }
217
218    /// Parses the next double quoted string
219    fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
220        let token = self.next_token()?;
221        if let Token::DoubleQuotedString(string) = token {
222            Ok(string)
223        } else {
224            Err(make_error(
225                self.val,
226                &format!("expected double quoted string for {context}, got '{token}'"),
227            ))
228        }
229    }
230
231    /// Parses the next single quoted string
232    fn parse_single_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
233        let token = self.next_token()?;
234        if let Token::SingleQuotedString(string) = token {
235            Ok(string)
236        } else {
237            Err(make_error(
238                self.val,
239                &format!("expected single quoted string for {context}, got '{token}'"),
240            ))
241        }
242    }
243
244    /// Parses the next integer value
245    fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
246        match self.next_token()? {
247            Token::Integer(v) => Ok(v),
248            tok => Err(make_error(
249                self.val,
250                &format!("finding i64 for {context}, got '{tok}'"),
251            )),
252        }
253    }
254
255    /// Parses the next i32 integer value
256    fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> {
257        let length = self.parse_i64(context)?;
258        length.try_into().map_err(|e| {
259            make_error(
260                self.val,
261                &format!("converting {length} into i32 for {context}: {e}"),
262            )
263        })
264    }
265
266    /// Parses the next i8 integer value
267    fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> {
268        let length = self.parse_i64(context)?;
269        length.try_into().map_err(|e| {
270            make_error(
271                self.val,
272                &format!("converting {length} into i8 for {context}: {e}"),
273            )
274        })
275    }
276
277    /// Parses the next u8 integer value
278    fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> {
279        let length = self.parse_i64(context)?;
280        length.try_into().map_err(|e| {
281            make_error(
282                self.val,
283                &format!("converting {length} into u8 for {context}: {e}"),
284            )
285        })
286    }
287
288    /// Parses the next timestamp (called after `Timestamp` has been consumed)
289    fn parse_timestamp(&mut self) -> ArrowResult<DataType> {
290        self.expect_token(Token::LParen)?;
291        let time_unit = self.parse_time_unit("Timestamp")?;
292
293        let timezone;
294        match self.next_token()? {
295            Token::Comma => {
296                match self.next_token()? {
297                    // Support old style `Timestamp(Nanosecond, None)`
298                    Token::None => {
299                        timezone = None;
300                    }
301                    // Support old style `Timestamp(Nanosecond, Some("Timezone"))`
302                    Token::Some => {
303                        self.expect_token(Token::LParen)?;
304                        timezone = Some(self.parse_double_quoted_string("Timezone")?);
305                        self.expect_token(Token::RParen)?;
306                    }
307                    Token::DoubleQuotedString(tz) => {
308                        // Support new style `Timestamp(Nanosecond, "Timezone")`
309                        timezone = Some(tz);
310                    }
311                    tok => {
312                        return Err(make_error(
313                            self.val,
314                            &format!("Expected None, Some, or a timezone string, got {tok:?}"),
315                        ));
316                    }
317                };
318                self.expect_token(Token::RParen)?;
319            }
320            // No timezone (e.g `Timestamp(ns)`)
321            Token::RParen => {
322                timezone = None;
323            }
324            next_token => {
325                return Err(make_error(
326                    self.val,
327                    &format!("Expected comma followed by a timezone, or an ), got {next_token:?}"),
328                ));
329            }
330        }
331        Ok(DataType::Timestamp(time_unit, timezone.map(Into::into)))
332    }
333
334    /// Parses the next Time32 (called after `Time32` has been consumed)
335    fn parse_time32(&mut self) -> ArrowResult<DataType> {
336        self.expect_token(Token::LParen)?;
337        let time_unit = self.parse_time_unit("Time32")?;
338        self.expect_token(Token::RParen)?;
339        Ok(DataType::Time32(time_unit))
340    }
341
342    /// Parses the next Time64 (called after `Time64` has been consumed)
343    fn parse_time64(&mut self) -> ArrowResult<DataType> {
344        self.expect_token(Token::LParen)?;
345        let time_unit = self.parse_time_unit("Time64")?;
346        self.expect_token(Token::RParen)?;
347        Ok(DataType::Time64(time_unit))
348    }
349
350    /// Parses the next Duration (called after `Duration` has been consumed)
351    fn parse_duration(&mut self) -> ArrowResult<DataType> {
352        self.expect_token(Token::LParen)?;
353        let time_unit = self.parse_time_unit("Duration")?;
354        self.expect_token(Token::RParen)?;
355        Ok(DataType::Duration(time_unit))
356    }
357
358    /// Parses the next Interval (called after `Interval` has been consumed)
359    fn parse_interval(&mut self) -> ArrowResult<DataType> {
360        self.expect_token(Token::LParen)?;
361        let interval_unit = match self.next_token()? {
362            Token::IntervalUnit(interval_unit) => interval_unit,
363            tok => {
364                return Err(make_error(
365                    self.val,
366                    &format!("finding IntervalUnit for Interval, got {tok}"),
367                ));
368            }
369        };
370        self.expect_token(Token::RParen)?;
371        Ok(DataType::Interval(interval_unit))
372    }
373
374    /// Parses the next FixedSizeBinary (called after `FixedSizeBinary` has been consumed)
375    fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> {
376        self.expect_token(Token::LParen)?;
377        let length = self.parse_i32("FixedSizeBinary")?;
378        if length < 0 {
379            return Err(make_error(
380                self.val,
381                &format!("FixedSizeBinary length must be non-negative, got {length}"),
382            ));
383        }
384        self.expect_token(Token::RParen)?;
385        Ok(DataType::FixedSizeBinary(length))
386    }
387
388    /// Parses the next Decimal32 (called after `Decimal32` has been consumed)
389    fn parse_decimal_32(&mut self) -> ArrowResult<DataType> {
390        self.expect_token(Token::LParen)?;
391        let precision = self.parse_u8("Decimal32")?;
392        self.expect_token(Token::Comma)?;
393        let scale = self.parse_i8("Decimal32")?;
394        self.expect_token(Token::RParen)?;
395        Ok(DataType::Decimal32(precision, scale))
396    }
397
398    /// Parses the next Decimal64 (called after `Decimal64` has been consumed)
399    fn parse_decimal_64(&mut self) -> ArrowResult<DataType> {
400        self.expect_token(Token::LParen)?;
401        let precision = self.parse_u8("Decimal64")?;
402        self.expect_token(Token::Comma)?;
403        let scale = self.parse_i8("Decimal64")?;
404        self.expect_token(Token::RParen)?;
405        Ok(DataType::Decimal64(precision, scale))
406    }
407
408    /// Parses the next Decimal128 (called after `Decimal128` has been consumed)
409    fn parse_decimal_128(&mut self) -> ArrowResult<DataType> {
410        self.expect_token(Token::LParen)?;
411        let precision = self.parse_u8("Decimal128")?;
412        self.expect_token(Token::Comma)?;
413        let scale = self.parse_i8("Decimal128")?;
414        self.expect_token(Token::RParen)?;
415        Ok(DataType::Decimal128(precision, scale))
416    }
417
418    /// Parses the next Decimal256 (called after `Decimal256` has been consumed)
419    fn parse_decimal_256(&mut self) -> ArrowResult<DataType> {
420        self.expect_token(Token::LParen)?;
421        let precision = self.parse_u8("Decimal256")?;
422        self.expect_token(Token::Comma)?;
423        let scale = self.parse_i8("Decimal256")?;
424        self.expect_token(Token::RParen)?;
425        Ok(DataType::Decimal256(precision, scale))
426    }
427
428    /// Parses the next Dictionary (called after `Dictionary` has been consumed)
429    fn parse_dictionary(&mut self) -> ArrowResult<DataType> {
430        self.expect_token(Token::LParen)?;
431        let key_type = self.parse_next_type()?;
432        self.expect_token(Token::Comma)?;
433        let value_type = self.parse_next_type()?;
434        self.expect_token(Token::RParen)?;
435        Ok(DataType::Dictionary(
436            Box::new(key_type),
437            Box::new(value_type),
438        ))
439    }
440
441    /// Parses the next Struct (called after `Struct` has been consumed)
442    fn parse_struct(&mut self) -> ArrowResult<DataType> {
443        self.expect_token(Token::LParen)?;
444        let mut fields = Vec::new();
445        loop {
446            if self
447                .tokenizer
448                .next_if(|next| matches!(next, Ok(Token::RParen)))
449                .is_some()
450            {
451                break;
452            }
453
454            let field = self.parse_field()?;
455            fields.push(Arc::new(field));
456            match self.next_token()? {
457                Token::Comma => continue,
458                Token::RParen => break,
459                tok => {
460                    return Err(make_error(
461                        self.val,
462                        &format!(
463                            "Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"
464                        ),
465                    ));
466                }
467            }
468        }
469        Ok(DataType::Struct(Fields::from(fields)))
470    }
471
472    /// Parses the next Union (called after `Union` has been consumed)
473    /// E.g: Union(Sparse, 0: ("a": Int32), 1: ("b": non-null Utf8))
474    fn parse_union(&mut self) -> ArrowResult<DataType> {
475        self.expect_token(Token::LParen)?;
476        let union_mode = self.parse_union_mode()?;
477        let mut type_ids = vec![];
478        let mut fields = vec![];
479        loop {
480            if self
481                .tokenizer
482                .next_if(|next| matches!(next, Ok(Token::RParen)))
483                .is_some()
484            {
485                break;
486            }
487            self.expect_token(Token::Comma)?;
488            let (type_id, field) = self.parse_union_field()?;
489            type_ids.push(type_id);
490            fields.push(field);
491        }
492        Ok(DataType::Union(
493            UnionFields::try_new(type_ids, fields)?,
494            union_mode,
495        ))
496    }
497
498    /// Parses the next UnionMode
499    fn parse_union_mode(&mut self) -> ArrowResult<UnionMode> {
500        match self.next_token()? {
501            Token::UnionMode(union_mode) => Ok(union_mode),
502            tok => Err(make_error(
503                self.val,
504                &format!("finding UnionMode for Union, got {tok}"),
505            )),
506        }
507    }
508
509    /// Parses the next UnionField
510    /// 0: ("a": non-null Int32)
511    fn parse_union_field(&mut self) -> ArrowResult<(i8, Field)> {
512        let type_id = self.parse_i8("UnionField")?;
513        self.expect_token(Token::Colon)?;
514        self.expect_token(Token::LParen)?;
515        let field = self.parse_field()?;
516        self.expect_token(Token::RParen)?;
517        Ok((type_id, field))
518    }
519
520    /// Parses the next Map (called after `Map` has been consumed)
521    /// E.g: Map("entries": Struct("key": Utf8, "value": non-null Int32), sorted)
522    fn parse_map(&mut self) -> ArrowResult<DataType> {
523        self.expect_token(Token::LParen)?;
524        let field = self.parse_field()?;
525        self.expect_token(Token::Comma)?;
526        let sorted = self.parse_map_sorted()?;
527        self.expect_token(Token::RParen)?;
528        Ok(DataType::Map(Arc::new(field), sorted))
529    }
530
531    /// Parses map's sorted
532    fn parse_map_sorted(&mut self) -> ArrowResult<bool> {
533        match self.next_token()? {
534            Token::MapSorted(sorted) => Ok(sorted),
535            tok => Err(make_error(
536                self.val,
537                &format!("Expected sorted or unsorted for a map; got {tok:?}"),
538            )),
539        }
540    }
541
542    /// Parses the next RunEndEncoded (called after `RunEndEncoded` has been consumed)
543    /// E.g: RunEndEncoded("run_ends": UInt32, "values": nonnull Int32)
544    fn parse_run_end_encoded(&mut self) -> ArrowResult<DataType> {
545        self.expect_token(Token::LParen)?;
546        let run_ends = self.parse_field()?;
547        self.expect_token(Token::Comma)?;
548        let values = self.parse_field()?;
549        self.expect_token(Token::RParen)?;
550        Ok(DataType::RunEndEncoded(
551            Arc::new(run_ends),
552            Arc::new(values),
553        ))
554    }
555
556    /// consume the next token and return `false` if the field is `nonnull`.
557    fn parse_opt_nullable(&mut self) -> bool {
558        let tok = self
559            .tokenizer
560            .next_if(|next| matches!(next, Ok(Token::NonNull | Token::Nullable)));
561        !matches!(tok, Some(Ok(Token::NonNull)))
562    }
563
564    /// return the next token, or an error if there are none left
565    fn next_token(&mut self) -> ArrowResult<Token> {
566        match self.tokenizer.next() {
567            None => Err(make_error(self.val, "finding next token")),
568            Some(token) => token,
569        }
570    }
571
572    /// consume the next token, returning OK(()) if it matches tok, and Err if not
573    fn expect_token(&mut self, tok: Token) -> ArrowResult<()> {
574        let next_token = self.next_token()?;
575        if next_token == tok {
576            Ok(())
577        } else {
578            Err(make_error_expected(self.val, &tok, &next_token))
579        }
580    }
581}
582
/// Reports whether `c` terminates a word: one of `(`, `)`, `,`, `:` or a
/// space.
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ':' | ' ')
}
587
/// Selects which quote character delimits a string handled by
/// `Tokenizer::parse_quoted_string`.
enum QuoteType {
    /// Delimited by `"`
    Double,
    /// Delimited by `'`
    Single,
}
592
#[derive(Debug)]
/// Splits a string like `Dictionary(Int32, Int64)` into tokens suitable for parsing
///
/// For example the string "Timestamp(ns)" would be parsed into:
///
/// * Token::Timestamp
/// * Token::LParen
/// * Token::TimeUnit(TimeUnit::Nanosecond)
/// * Token::RParen
struct Tokenizer<'a> {
    /// The complete input string, used when building error messages
    val: &'a str,
    /// Characters of `val` not yet consumed; peekable so a character can be
    /// inspected before deciding how to tokenize it
    chars: Peekable<Chars<'a>>,
    // temporary buffer for parsing words
    word: String,
}
608
609impl<'a> Tokenizer<'a> {
610    fn new(val: &'a str) -> Self {
611        Self {
612            val,
613            chars: val.chars().peekable(),
614            word: String::new(),
615        }
616    }
617
618    /// returns the next char, without consuming it
619    fn peek_next_char(&mut self) -> Option<char> {
620        self.chars.peek().copied()
621    }
622
623    /// returns the next char, and consuming it
624    fn next_char(&mut self) -> Option<char> {
625        self.chars.next()
626    }
627
628    /// parse the characters in val starting at pos, until the next
629    /// `,`, `(`, or `)` or end of line
630    fn parse_word(&mut self) -> ArrowResult<Token> {
631        // reset temp space
632        self.word.clear();
633        loop {
634            match self.peek_next_char() {
635                None => break,
636                Some(c) if is_separator(c) => break,
637                Some(c) => {
638                    self.next_char();
639                    self.word.push(c);
640                }
641            }
642        }
643
644        if let Some(c) = self.word.chars().next() {
645            // if it started with a number, try parsing it as an integer
646            if c == '-' || c.is_numeric() {
647                let val: i64 = self.word.parse().map_err(|e| {
648                    make_error(self.val, &format!("parsing {} as integer: {e}", self.word))
649                })?;
650                return Ok(Token::Integer(val));
651            }
652        }
653
654        // figure out what the word was
655        let token = match self.word.as_str() {
656            "Null" => Token::SimpleType(DataType::Null),
657            "Boolean" => Token::SimpleType(DataType::Boolean),
658
659            "Int8" => Token::SimpleType(DataType::Int8),
660            "Int16" => Token::SimpleType(DataType::Int16),
661            "Int32" => Token::SimpleType(DataType::Int32),
662            "Int64" => Token::SimpleType(DataType::Int64),
663
664            "UInt8" => Token::SimpleType(DataType::UInt8),
665            "UInt16" => Token::SimpleType(DataType::UInt16),
666            "UInt32" => Token::SimpleType(DataType::UInt32),
667            "UInt64" => Token::SimpleType(DataType::UInt64),
668
669            "Utf8" => Token::SimpleType(DataType::Utf8),
670            "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
671            "Utf8View" => Token::SimpleType(DataType::Utf8View),
672            "Binary" => Token::SimpleType(DataType::Binary),
673            "BinaryView" => Token::SimpleType(DataType::BinaryView),
674            "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
675
676            "Float16" => Token::SimpleType(DataType::Float16),
677            "Float32" => Token::SimpleType(DataType::Float32),
678            "Float64" => Token::SimpleType(DataType::Float64),
679
680            "Date32" => Token::SimpleType(DataType::Date32),
681            "Date64" => Token::SimpleType(DataType::Date64),
682
683            "List" => Token::List,
684            "ListView" => Token::ListView,
685            "LargeList" => Token::LargeList,
686            "LargeListView" => Token::LargeListView,
687            "FixedSizeList" => Token::FixedSizeList,
688
689            "s" | "Second" => Token::TimeUnit(TimeUnit::Second),
690            "ms" | "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
691            "µs" | "us" | "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
692            "ns" | "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
693
694            "Timestamp" => Token::Timestamp,
695            "Time32" => Token::Time32,
696            "Time64" => Token::Time64,
697            "Duration" => Token::Duration,
698            "Interval" => Token::Interval,
699            "Dictionary" => Token::Dictionary,
700
701            "FixedSizeBinary" => Token::FixedSizeBinary,
702
703            "Decimal32" => Token::Decimal32,
704            "Decimal64" => Token::Decimal64,
705            "Decimal128" => Token::Decimal128,
706            "Decimal256" => Token::Decimal256,
707
708            "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
709            "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
710            "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
711
712            "Some" => Token::Some,
713            "None" => Token::None,
714
715            "non-null" => Token::NonNull,
716            "nullable" => Token::Nullable,
717            "field" => Token::Field,
718            "x" => Token::X,
719
720            "Struct" => Token::Struct,
721
722            "Union" => Token::Union,
723            "Sparse" => Token::UnionMode(UnionMode::Sparse),
724            "Dense" => Token::UnionMode(UnionMode::Dense),
725
726            "Map" => Token::Map,
727            "sorted" => Token::MapSorted(true),
728            "unsorted" => Token::MapSorted(false),
729
730            "RunEndEncoded" => Token::RunEndEncoded,
731
732            token => {
733                return Err(make_error(self.val, &format!("unknown token: {token}")));
734            }
735        };
736        Ok(token)
737    }
738
739    /// Parses e.g. `"foo bar"`, `'foo bar'`
740    fn parse_quoted_string(&mut self, quote_type: QuoteType) -> ArrowResult<Token> {
741        let quote = match quote_type {
742            QuoteType::Double => '\"',
743            QuoteType::Single => '\'',
744        };
745
746        if self.next_char() != Some(quote) {
747            return Err(make_error(self.val, "Expected \""));
748        }
749
750        // reset temp space
751        self.word.clear();
752
753        let mut is_escaped = false;
754
755        loop {
756            match self.next_char() {
757                None => {
758                    return Err(ArrowError::ParseError(format!(
759                        "Unterminated string at: \"{}",
760                        self.word
761                    )));
762                }
763                Some(c) => match c {
764                    '\\' => {
765                        is_escaped = true;
766                        self.word.push(c);
767                    }
768                    c if c == quote => {
769                        if is_escaped {
770                            self.word.push(c);
771                            is_escaped = false;
772                        } else {
773                            break;
774                        }
775                    }
776                    c => {
777                        self.word.push(c);
778                    }
779                },
780            }
781        }
782
783        let val: String = self.word.parse().map_err(|err| {
784            ArrowError::ParseError(format!("Failed to parse string: \"{}\": {err}", self.word))
785        })?;
786
787        if val.is_empty() {
788            // Using empty strings as field names is just asking for trouble
789            return Err(make_error(self.val, "empty strings aren't allowed"));
790        }
791
792        match quote_type {
793            QuoteType::Double => Ok(Token::DoubleQuotedString(val)),
794            QuoteType::Single => Ok(Token::SingleQuotedString(val)),
795        }
796    }
797}
798
799impl Iterator for Tokenizer<'_> {
800    type Item = ArrowResult<Token>;
801
802    fn next(&mut self) -> Option<Self::Item> {
803        loop {
804            match self.peek_next_char()? {
805                ' ' => {
806                    // skip whitespace
807                    self.next_char();
808                    continue;
809                }
810                '"' => {
811                    return Some(self.parse_quoted_string(QuoteType::Double));
812                }
813                '\'' => {
814                    return Some(self.parse_quoted_string(QuoteType::Single));
815                }
816                '(' => {
817                    self.next_char();
818                    return Some(Ok(Token::LParen));
819                }
820                ')' => {
821                    self.next_char();
822                    return Some(Ok(Token::RParen));
823                }
824                ',' => {
825                    self.next_char();
826                    return Some(Ok(Token::Comma));
827                }
828                ':' => {
829                    self.next_char();
830                    return Some(Ok(Token::Colon));
831                }
832                _ => return Some(self.parse_word()),
833            }
834        }
835    }
836}
837
838/// Grammar is
839///
840#[derive(Debug, PartialEq)]
841enum Token {
842    // Null, or Int32
843    SimpleType(DataType),
844    Timestamp,
845    Time32,
846    Time64,
847    Duration,
848    Interval,
849    FixedSizeBinary,
850    Decimal32,
851    Decimal64,
852    Decimal128,
853    Decimal256,
854    Dictionary,
855    TimeUnit(TimeUnit),
856    IntervalUnit(IntervalUnit),
857    LParen,
858    RParen,
859    Comma,
860    Colon,
861    Some,
862    None,
863    Integer(i64),
864    DoubleQuotedString(String),
865    SingleQuotedString(String),
866    List,
867    ListView,
868    LargeList,
869    LargeListView,
870    FixedSizeList,
871    Struct,
872    Union,
873    UnionMode(UnionMode),
874    Map,
875    MapSorted(bool),
876    RunEndEncoded,
877    NonNull,
878    Nullable,
879    Field,
880    X,
881}
882
883impl Display for Token {
884    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
885        match self {
886            Token::SimpleType(t) => write!(f, "{t}"),
887            Token::List => write!(f, "List"),
888            Token::ListView => write!(f, "ListView"),
889            Token::LargeList => write!(f, "LargeList"),
890            Token::LargeListView => write!(f, "LargeListView"),
891            Token::FixedSizeList => write!(f, "FixedSizeList"),
892            Token::Timestamp => write!(f, "Timestamp"),
893            Token::Time32 => write!(f, "Time32"),
894            Token::Time64 => write!(f, "Time64"),
895            Token::Duration => write!(f, "Duration"),
896            Token::Interval => write!(f, "Interval"),
897            Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
898            Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
899            Token::LParen => write!(f, "("),
900            Token::RParen => write!(f, ")"),
901            Token::Comma => write!(f, ","),
902            Token::Colon => write!(f, ":"),
903            Token::Some => write!(f, "Some"),
904            Token::None => write!(f, "None"),
905            Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
906            Token::Decimal32 => write!(f, "Decimal32"),
907            Token::Decimal64 => write!(f, "Decimal64"),
908            Token::Decimal128 => write!(f, "Decimal128"),
909            Token::Decimal256 => write!(f, "Decimal256"),
910            Token::Dictionary => write!(f, "Dictionary"),
911            Token::Integer(v) => write!(f, "Integer({v})"),
912            Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
913            Token::SingleQuotedString(s) => write!(f, "SingleQuotedString({s})"),
914            Token::Struct => write!(f, "Struct"),
915            Token::Union => write!(f, "Union"),
916            Token::UnionMode(m) => write!(f, "{m:?}"),
917            Token::Map => write!(f, "Map"),
918            Token::MapSorted(sorted) => {
919                write!(f, "{}", if *sorted { "sorted" } else { "unsorted" })
920            }
921            Token::RunEndEncoded => write!(f, "RunEndEncoded"),
922            Token::NonNull => write!(f, "non-null"),
923            Token::Nullable => write!(f, "nullable"),
924            Token::Field => write!(f, "field"),
925            Token::X => write!(f, "x"),
926        }
927    }
928}
929
930#[cfg(test)]
931mod test {
932    use super::*;
933
934    #[test]
935    fn test_parse_data_type() {
936        // this ensures types can be parsed correctly from their string representations
937        for dt in list_datatypes() {
938            round_trip(dt)
939        }
940    }
941
942    /// Ensure we converting data_type to a string, and then parse it as a type
943    /// verifying it is the same
944    fn round_trip(data_type: DataType) {
945        let data_type_string = data_type.to_string();
946        println!("Input '{data_type_string}' ({data_type:?})");
947        let parsed_type = parse_data_type(&data_type_string).unwrap();
948        assert_eq!(
949            data_type, parsed_type,
950            "Mismatch parsing {data_type_string}"
951        );
952    }
953
954    fn list_datatypes() -> Vec<DataType> {
955        vec![
956            // ---------
957            // Non Nested types
958            // ---------
959            DataType::Null,
960            DataType::Boolean,
961            DataType::Int8,
962            DataType::Int16,
963            DataType::Int32,
964            DataType::Int64,
965            DataType::UInt8,
966            DataType::UInt16,
967            DataType::UInt32,
968            DataType::UInt64,
969            DataType::Float16,
970            DataType::Float32,
971            DataType::Float64,
972            DataType::Timestamp(TimeUnit::Second, None),
973            DataType::Timestamp(TimeUnit::Millisecond, None),
974            DataType::Timestamp(TimeUnit::Microsecond, None),
975            DataType::Timestamp(TimeUnit::Nanosecond, None),
976            // we can't cover all possible timezones, here we only test utc and +08:00
977            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
978            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
979            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
980            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
981            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
982            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
983            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
984            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
985            DataType::Date32,
986            DataType::Date64,
987            DataType::Time32(TimeUnit::Second),
988            DataType::Time32(TimeUnit::Millisecond),
989            DataType::Time32(TimeUnit::Microsecond),
990            DataType::Time32(TimeUnit::Nanosecond),
991            DataType::Time64(TimeUnit::Second),
992            DataType::Time64(TimeUnit::Millisecond),
993            DataType::Time64(TimeUnit::Microsecond),
994            DataType::Time64(TimeUnit::Nanosecond),
995            DataType::Duration(TimeUnit::Second),
996            DataType::Duration(TimeUnit::Millisecond),
997            DataType::Duration(TimeUnit::Microsecond),
998            DataType::Duration(TimeUnit::Nanosecond),
999            DataType::Interval(IntervalUnit::YearMonth),
1000            DataType::Interval(IntervalUnit::DayTime),
1001            DataType::Interval(IntervalUnit::MonthDayNano),
1002            DataType::Binary,
1003            DataType::BinaryView,
1004            DataType::FixedSizeBinary(0),
1005            DataType::FixedSizeBinary(1234),
1006            DataType::LargeBinary,
1007            DataType::Utf8,
1008            DataType::Utf8View,
1009            DataType::LargeUtf8,
1010            DataType::Decimal32(7, 8),
1011            DataType::Decimal64(6, 9),
1012            DataType::Decimal128(7, 12),
1013            DataType::Decimal256(6, 13),
1014            // ---------
1015            // Nested types
1016            // ---------
1017            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
1018            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
1019            DataType::Dictionary(
1020                Box::new(DataType::Int8),
1021                Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
1022            ),
1023            DataType::Dictionary(
1024                Box::new(DataType::Int8),
1025                Box::new(DataType::FixedSizeBinary(23)),
1026            ),
1027            DataType::Dictionary(
1028                Box::new(DataType::Int8),
1029                Box::new(
1030                    // nested dictionaries are probably a bad idea but they are possible
1031                    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
1032                ),
1033            ),
1034            DataType::Struct(Fields::from(vec![
1035                Field::new("f1", DataType::Int64, true),
1036                Field::new("f2", DataType::Float64, true),
1037                Field::new(
1038                    "f3",
1039                    DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
1040                    true,
1041                ),
1042                Field::new(
1043                    "f4",
1044                    DataType::Dictionary(
1045                        Box::new(DataType::Int8),
1046                        Box::new(DataType::FixedSizeBinary(23)),
1047                    ),
1048                    true,
1049                ),
1050            ])),
1051            DataType::Struct(Fields::from(vec![
1052                Field::new("Int64", DataType::Int64, true),
1053                Field::new("Float64", DataType::Float64, true),
1054            ])),
1055            DataType::Struct(Fields::from(vec![
1056                Field::new("f1", DataType::Int64, true),
1057                Field::new(
1058                    "nested_struct",
1059                    DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
1060                    true,
1061                ),
1062            ])),
1063            DataType::Struct(Fields::from(vec![Field::new("f1", DataType::Int64, true)])),
1064            DataType::Struct(Fields::empty()),
1065            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))),
1066            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))),
1067            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
1068            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, false))),
1069            DataType::List(Arc::new(Field::new(
1070                "nested_list",
1071                DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
1072                true,
1073            ))),
1074            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
1075            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
1076            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1077            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
1078            DataType::ListView(Arc::new(Field::new(
1079                "nested_list_view",
1080                DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1081                true,
1082            ))),
1083            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, true))),
1084            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, false))),
1085            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
1086            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, false))),
1087            DataType::LargeList(Arc::new(Field::new(
1088                "nested_large_list",
1089                DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
1090                true,
1091            ))),
1092            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
1093            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
1094            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1095            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
1096            DataType::LargeListView(Arc::new(Field::new(
1097                "nested_large_list_view",
1098                DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1099                true,
1100            ))),
1101            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)), 2),
1102            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, false)), 2),
1103            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, true)), 2),
1104            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, false)), 2),
1105            DataType::FixedSizeList(
1106                Arc::new(Field::new(
1107                    "nested_fixed_size_list",
1108                    DataType::FixedSizeList(
1109                        Arc::new(Field::new("Int64", DataType::Int64, true)),
1110                        2,
1111                    ),
1112                    true,
1113                )),
1114                2,
1115            ),
1116            DataType::Union(
1117                UnionFields::from_fields(vec![
1118                    Field::new("Int32", DataType::Int32, false),
1119                    Field::new("Utf8", DataType::Utf8, true),
1120                ]),
1121                UnionMode::Sparse,
1122            ),
1123            DataType::Union(
1124                UnionFields::from_fields(vec![
1125                    Field::new("Int32", DataType::Int32, false),
1126                    Field::new("Utf8", DataType::Utf8, true),
1127                ]),
1128                UnionMode::Dense,
1129            ),
1130            DataType::Union(
1131                UnionFields::from_fields(vec![
1132                    Field::new_union(
1133                        "nested_union",
1134                        vec![0, 1],
1135                        vec![
1136                            Field::new("Int32", DataType::Int32, false),
1137                            Field::new("Utf8", DataType::Utf8, true),
1138                        ],
1139                        UnionMode::Dense,
1140                    ),
1141                    Field::new("Utf8", DataType::Utf8, true),
1142                ]),
1143                UnionMode::Sparse,
1144            ),
1145            DataType::Union(
1146                UnionFields::from_fields(vec![Field::new("Int32", DataType::Int32, false)]),
1147                UnionMode::Dense,
1148            ),
1149            DataType::Union(
1150                UnionFields::try_new(Vec::<i8>::new(), Vec::<Field>::new()).unwrap(),
1151                UnionMode::Sparse,
1152            ),
1153            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), true),
1154            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), false),
1155            DataType::Map(
1156                Arc::new(Field::new_map(
1157                    "nested_map",
1158                    "entries",
1159                    Field::new("key", DataType::Utf8, false),
1160                    Field::new("value", DataType::Int32, true),
1161                    false,
1162                    true,
1163                )),
1164                true,
1165            ),
1166            DataType::RunEndEncoded(
1167                Arc::new(Field::new("run_ends", DataType::UInt32, false)),
1168                Arc::new(Field::new("values", DataType::Int32, true)),
1169            ),
1170            DataType::RunEndEncoded(
1171                Arc::new(Field::new(
1172                    "nested_run_end_encoded",
1173                    DataType::RunEndEncoded(
1174                        Arc::new(Field::new("run_ends", DataType::UInt32, false)),
1175                        Arc::new(Field::new("values", DataType::Int32, true)),
1176                    ),
1177                    true,
1178                )),
1179                Arc::new(Field::new("values", DataType::Int32, true)),
1180            ),
1181        ]
1182    }
1183
1184    #[test]
1185    fn test_parse_data_type_whitespace_tolerance() {
1186        // (string to parse, expected DataType)
1187        let cases = [
1188            ("Int8", DataType::Int8),
1189            (
1190                "Timestamp        (ns)",
1191                DataType::Timestamp(TimeUnit::Nanosecond, None),
1192            ),
1193            (
1194                "Timestamp        (ns)  ",
1195                DataType::Timestamp(TimeUnit::Nanosecond, None),
1196            ),
1197            (
1198                "          Timestamp        (ns               )",
1199                DataType::Timestamp(TimeUnit::Nanosecond, None),
1200            ),
1201            (
1202                "Timestamp        (ns               )  ",
1203                DataType::Timestamp(TimeUnit::Nanosecond, None),
1204            ),
1205        ];
1206
1207        for (data_type_string, expected_data_type) in cases {
1208            let parsed_data_type = parse_data_type(data_type_string).unwrap();
1209            assert_eq!(
1210                parsed_data_type, expected_data_type,
1211                "Parsing '{data_type_string}', expecting '{expected_data_type}'"
1212            );
1213        }
1214    }
1215
1216    /// Ensure that old style types can still be parsed
1217    #[test]
1218    fn test_parse_data_type_backwards_compatibility() {
1219        use DataType::*;
1220        use IntervalUnit::*;
1221        use TimeUnit::*;
1222        // List below created with:
1223        for t in list_datatypes() {
1224            println!(r#"("{t}", {t:?}),"#);
1225        }
1226        // (string to parse, expected DataType)
1227        let cases = [
1228            ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
1229            ("Timestamp(Microsecond, None)", Timestamp(Microsecond, None)),
1230            ("Timestamp(Millisecond, None)", Timestamp(Millisecond, None)),
1231            ("Timestamp(Second, None)", Timestamp(Second, None)),
1232            ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
1233            // Timezones
1234            (
1235                r#"Timestamp(Nanosecond, Some("+00:00"))"#,
1236                Timestamp(Nanosecond, Some("+00:00".into())),
1237            ),
1238            (
1239                r#"Timestamp(Microsecond, Some("+00:00"))"#,
1240                Timestamp(Microsecond, Some("+00:00".into())),
1241            ),
1242            (
1243                r#"Timestamp(Millisecond, Some("+00:00"))"#,
1244                Timestamp(Millisecond, Some("+00:00".into())),
1245            ),
1246            (
1247                r#"Timestamp(Second, Some("+00:00"))"#,
1248                Timestamp(Second, Some("+00:00".into())),
1249            ),
1250            ("Null", Null),
1251            ("Boolean", Boolean),
1252            ("Int8", Int8),
1253            ("Int16", Int16),
1254            ("Int32", Int32),
1255            ("Int64", Int64),
1256            ("UInt8", UInt8),
1257            ("UInt16", UInt16),
1258            ("UInt32", UInt32),
1259            ("UInt64", UInt64),
1260            ("Float16", Float16),
1261            ("Float32", Float32),
1262            ("Float64", Float64),
1263            ("Timestamp(s)", Timestamp(Second, None)),
1264            ("Timestamp(ms)", Timestamp(Millisecond, None)),
1265            ("Timestamp(µs)", Timestamp(Microsecond, None)),
1266            ("Timestamp(ns)", Timestamp(Nanosecond, None)),
1267            (
1268                r#"Timestamp(ns, "+00:00")"#,
1269                Timestamp(Nanosecond, Some("+00:00".into())),
1270            ),
1271            (
1272                r#"Timestamp(µs, "+00:00")"#,
1273                Timestamp(Microsecond, Some("+00:00".into())),
1274            ),
1275            (
1276                r#"Timestamp(ms, "+00:00")"#,
1277                Timestamp(Millisecond, Some("+00:00".into())),
1278            ),
1279            (
1280                r#"Timestamp(s, "+00:00")"#,
1281                Timestamp(Second, Some("+00:00".into())),
1282            ),
1283            (
1284                r#"Timestamp(ns, "+08:00")"#,
1285                Timestamp(Nanosecond, Some("+08:00".into())),
1286            ),
1287            (
1288                r#"Timestamp(µs, "+08:00")"#,
1289                Timestamp(Microsecond, Some("+08:00".into())),
1290            ),
1291            (
1292                r#"Timestamp(ms, "+08:00")"#,
1293                Timestamp(Millisecond, Some("+08:00".into())),
1294            ),
1295            (
1296                r#"Timestamp(s, "+08:00")"#,
1297                Timestamp(Second, Some("+08:00".into())),
1298            ),
1299            ("Date32", Date32),
1300            ("Date64", Date64),
1301            ("Time32(s)", Time32(Second)),
1302            ("Time32(ms)", Time32(Millisecond)),
1303            ("Time32(µs)", Time32(Microsecond)),
1304            ("Time32(ns)", Time32(Nanosecond)),
1305            ("Time64(s)", Time64(Second)),
1306            ("Time64(ms)", Time64(Millisecond)),
1307            ("Time64(µs)", Time64(Microsecond)),
1308            ("Time64(ns)", Time64(Nanosecond)),
1309            ("Duration(s)", Duration(Second)),
1310            ("Duration(ms)", Duration(Millisecond)),
1311            ("Duration(µs)", Duration(Microsecond)),
1312            ("Duration(ns)", Duration(Nanosecond)),
1313            ("Interval(YearMonth)", Interval(YearMonth)),
1314            ("Interval(DayTime)", Interval(DayTime)),
1315            ("Interval(MonthDayNano)", Interval(MonthDayNano)),
1316            ("Binary", Binary),
1317            ("BinaryView", BinaryView),
1318            ("FixedSizeBinary(0)", FixedSizeBinary(0)),
1319            ("FixedSizeBinary(1234)", FixedSizeBinary(1234)),
1320            ("LargeBinary", LargeBinary),
1321            ("Utf8", Utf8),
1322            ("Utf8View", Utf8View),
1323            ("LargeUtf8", LargeUtf8),
1324            ("Decimal32(7, 8)", Decimal32(7, 8)),
1325            ("Decimal64(6, 9)", Decimal64(6, 9)),
1326            ("Decimal128(7, 12)", Decimal128(7, 12)),
1327            ("Decimal256(6, 13)", Decimal256(6, 13)),
1328            (
1329                "Dictionary(Int32, Utf8)",
1330                Dictionary(Box::new(Int32), Box::new(Utf8)),
1331            ),
1332            (
1333                "Dictionary(Int8, Utf8)",
1334                Dictionary(Box::new(Int8), Box::new(Utf8)),
1335            ),
1336            (
1337                "Dictionary(Int8, Timestamp(ns))",
1338                Dictionary(Box::new(Int8), Box::new(Timestamp(Nanosecond, None))),
1339            ),
1340            (
1341                "Dictionary(Int8, FixedSizeBinary(23))",
1342                Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
1343            ),
1344            (
1345                "Dictionary(Int8, Dictionary(Int8, Utf8))",
1346                Dictionary(
1347                    Box::new(Int8),
1348                    Box::new(Dictionary(Box::new(Int8), Box::new(Utf8))),
1349                ),
1350            ),
1351            (
1352                r#"Struct("f1": nullable Int64, "f2": nullable Float64, "f3": nullable Timestamp(s, "+08:00"), "f4": nullable Dictionary(Int8, FixedSizeBinary(23)))"#,
1353                Struct(Fields::from(vec![
1354                    Field::new("f1", Int64, true),
1355                    Field::new("f2", Float64, true),
1356                    Field::new("f3", Timestamp(Second, Some("+08:00".into())), true),
1357                    Field::new(
1358                        "f4",
1359                        Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
1360                        true,
1361                    ),
1362                ])),
1363            ),
1364            (
1365                r#"Struct("Int64": nullable Int64, "Float64": nullable Float64)"#,
1366                Struct(Fields::from(vec![
1367                    Field::new("Int64", Int64, true),
1368                    Field::new("Float64", Float64, true),
1369                ])),
1370            ),
1371            (
1372                r#"Struct("f1": nullable Int64, "nested_struct": nullable Struct("n1": nullable Int64))"#,
1373                Struct(Fields::from(vec![
1374                    Field::new("f1", Int64, true),
1375                    Field::new(
1376                        "nested_struct",
1377                        Struct(Fields::from(vec![Field::new("n1", Int64, true)])),
1378                        true,
1379                    ),
1380                ])),
1381            ),
1382            (r#"Struct()"#, Struct(Fields::empty())),
1383            (
1384                "FixedSizeList(4, Int64)",
1385                FixedSizeList(Arc::new(Field::new_list_field(Int64, true)), 4),
1386            ),
1387            (
1388                "List(Int64)",
1389                List(Arc::new(Field::new_list_field(Int64, true))),
1390            ),
1391            (
1392                "LargeList(Int64)",
1393                LargeList(Arc::new(Field::new_list_field(Int64, true))),
1394            ),
1395        ];
1396
1397        for (data_type_string, expected_data_type) in cases {
1398            let parsed_data_type = parse_data_type(data_type_string).unwrap();
1399            assert_eq!(
1400                parsed_data_type, expected_data_type,
1401                "Parsing '{data_type_string}', expecting '{expected_data_type}'"
1402            );
1403        }
1404    }
1405
1406    #[test]
1407    fn parse_data_type_errors() {
1408        // (string to parse, expected error message)
1409        let cases = [
1410            ("", "Unsupported type ''"),
1411            ("", "Error finding next token"),
1412            ("null", "Unsupported type 'null'"),
1413            ("Nu", "Unsupported type 'Nu'"),
1414            (r#"Timestamp(ns, +00:00)"#, "Error unknown token: +00"),
1415            (
1416                r#"Timestamp(ns, "+00:00)"#,
1417                r#"Unterminated string at: "+00:00)"#,
1418            ),
1419            (r#"Timestamp(ns, "")"#, r#"empty strings aren't allowed"#),
1420            (
1421                r#"Timestamp(ns, "+00:00"")"#,
1422                r#"Parser error: Unterminated string at: ")"#,
1423            ),
1424            ("Timestamp(ns, ", "Error finding next token"),
1425            (
1426                "Float32 Float32",
1427                "trailing content after parsing 'Float32'",
1428            ),
1429            ("Int32, ", "trailing content after parsing 'Int32'"),
1430            ("Int32(3), ", "trailing content after parsing 'Int32'"),
1431            (
1432                "FixedSizeBinary(Int32), ",
1433                "Error finding i64 for FixedSizeBinary, got 'Int32'",
1434            ),
1435            (
1436                "FixedSizeBinary(3.0), ",
1437                "Error parsing 3.0 as integer: invalid digit found in string",
1438            ),
1439            // too large for i32
1440            (
1441                "FixedSizeBinary(4000000000), ",
1442                "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted",
1443            ),
1444            // can't have negative width
1445            (
1446                "FixedSizeBinary(-1), ",
1447                "FixedSizeBinary length must be non-negative, got -1",
1448            ),
1449            // can't have negative precision
1450            (
1451                "Decimal32(-3, 5)",
1452                "Error converting -3 into u8 for Decimal32: out of range integral type conversion attempted",
1453            ),
1454            (
1455                "Decimal64(-3, 5)",
1456                "Error converting -3 into u8 for Decimal64: out of range integral type conversion attempted",
1457            ),
1458            (
1459                "Decimal128(-3, 5)",
1460                "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted",
1461            ),
1462            (
1463                "Decimal256(-3, 5)",
1464                "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted",
1465            ),
1466            (
1467                "Decimal32(3, 500)",
1468                "Error converting 500 into i8 for Decimal32: out of range integral type conversion attempted",
1469            ),
1470            (
1471                "Decimal64(3, 500)",
1472                "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted",
1473            ),
1474            (
1475                "Decimal128(3, 500)",
1476                "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted",
1477            ),
1478            (
1479                "Decimal256(3, 500)",
1480                "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted",
1481            ),
1482            ("Struct(f1 Int64)", "Error unknown token: f1"),
1483            ("Struct(\"f1\" Int64)", "Expected ':'"),
1484            (
1485                "Struct(\"f1\": )",
1486                "Error finding next type, got unexpected ')'",
1487            ),
1488        ];
1489
1490        for (data_type_string, expected_message) in cases {
1491            println!("Parsing '{data_type_string}', expecting '{expected_message}'");
1492            match parse_data_type(data_type_string) {
1493                Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"),
1494                Err(e) => {
1495                    let message = e.to_string();
1496                    assert!(
1497                        message.contains(expected_message),
1498                        "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual: {message}\n"
1499                    );
1500
1501                    if !message.contains("Unterminated string") {
1502                        // errors should also contain a help message
1503                        assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'"), "message: {message}");
1504                    }
1505                }
1506            }
1507        }
1508    }
1509
1510    #[test]
1511    fn parse_error_type() {
1512        let err = parse_data_type("foobar").unwrap_err();
1513        assert!(matches!(err, ArrowError::ParseError(_)));
1514        assert_eq!(
1515            err.to_string(),
1516            "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error unknown token: foobar"
1517        );
1518    }
1519}