arrow_schema/
datatype_parse.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
20use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields, UnionMode};
21
22/// Parses a DataType from a string representation
23///
24/// For example, the string "Int32" would be parsed into [`DataType::Int32`]
25pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
26    Parser::new(val).parse()
27}
28
/// Shorthand for a `Result` whose error type is [`ArrowError`].
type ArrowResult<T> = Result<T, ArrowError>;
30
31fn make_error(val: &str, msg: &str) -> ArrowError {
32    let msg = format!(
33        "Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error {msg}"
34    );
35    ArrowError::ParseError(msg)
36}
37
38fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError {
39    make_error(val, &format!("Expected '{expected}', got '{actual}'"))
40}
41
/// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs>
///
/// Pulls [`Token`]s from a peekable [`Tokenizer`] and assembles them into a
/// [`DataType`]; each `parse_*` method handles one construct.
#[derive(Debug)]
struct Parser<'a> {
    /// The complete input string, kept so error messages can quote it
    val: &'a str,
    /// Token stream over `val`, with one token of lookahead
    tokenizer: Peekable<Tokenizer<'a>>,
}
48
49impl<'a> Parser<'a> {
50    fn new(val: &'a str) -> Self {
51        Self {
52            val,
53            tokenizer: Tokenizer::new(val).peekable(),
54        }
55    }
56
57    fn parse(mut self) -> ArrowResult<DataType> {
58        let data_type = self.parse_next_type()?;
59        // ensure that there is no trailing content
60        if self.tokenizer.next().is_some() {
61            Err(make_error(
62                self.val,
63                &format!("checking trailing content after parsing '{data_type}'"),
64            ))
65        } else {
66            Ok(data_type)
67        }
68    }
69
70    /// parses the next full DataType
71    fn parse_next_type(&mut self) -> ArrowResult<DataType> {
72        match self.next_token()? {
73            Token::SimpleType(data_type) => Ok(data_type),
74            Token::Timestamp => self.parse_timestamp(),
75            Token::Time32 => self.parse_time32(),
76            Token::Time64 => self.parse_time64(),
77            Token::Duration => self.parse_duration(),
78            Token::Interval => self.parse_interval(),
79            Token::FixedSizeBinary => self.parse_fixed_size_binary(),
80            Token::Decimal32 => self.parse_decimal_32(),
81            Token::Decimal64 => self.parse_decimal_64(),
82            Token::Decimal128 => self.parse_decimal_128(),
83            Token::Decimal256 => self.parse_decimal_256(),
84            Token::Dictionary => self.parse_dictionary(),
85            Token::List => self.parse_list(),
86            Token::ListView => self.parse_list_view(),
87            Token::LargeList => self.parse_large_list(),
88            Token::LargeListView => self.parse_large_list_view(),
89            Token::FixedSizeList => self.parse_fixed_size_list(),
90            Token::Struct => self.parse_struct(),
91            Token::Union => self.parse_union(),
92            Token::Map => self.parse_map(),
93            Token::RunEndEncoded => self.parse_run_end_encoded(),
94            tok => Err(make_error(
95                self.val,
96                &format!("finding next type, got unexpected '{tok}'"),
97            )),
98        }
99    }
100
101    /// parses Field, this is the inversion of `format_field` in `datatype_display.rs`.
102    /// E.g: "a": non-null Int64
103    ///
104    /// TODO: support metadata: `"a": non-null Int64 metadata: {"foo": "value"}`
105    fn parse_field(&mut self) -> ArrowResult<Field> {
106        let name = self.parse_double_quoted_string("Field")?;
107        self.expect_token(Token::Colon)?;
108        let nullable = self.parse_opt_nullable();
109        let data_type = self.parse_next_type()?;
110        Ok(Field::new(name, data_type, nullable))
111    }
112
113    /// Parses field inside a list. Use `Field::LIST_FIELD_DEFAULT_NAME`
114    /// if no field name is specified.
115    /// E.g: `non-null Int64, field: 'foo'` or `non-null Int64`
116    ///
117    /// TODO: support metadata: `non-ull Int64, metadata: {"foo2": "value"}`
118    fn parse_list_field(&mut self, context: &str) -> ArrowResult<Field> {
119        let nullable = self.parse_opt_nullable();
120        let data_type = self.parse_next_type()?;
121
122        // the field name (if exists) must be after a comma
123        let field_name = if self
124            .tokenizer
125            .next_if(|next| matches!(next, Ok(Token::Comma)))
126            .is_none()
127        {
128            Field::LIST_FIELD_DEFAULT_NAME.into()
129        } else {
130            // expects: `field: 'field_name'`.
131            self.expect_token(Token::Field)?;
132            self.expect_token(Token::Colon)?;
133            self.parse_single_quoted_string(context)?
134        };
135
136        Ok(Field::new(field_name, data_type, nullable))
137    }
138
139    /// Parses the List type (called after `List` has been consumed)
140    /// E.g: List(non-null Int64, field: 'foo')
141    fn parse_list(&mut self) -> ArrowResult<DataType> {
142        self.expect_token(Token::LParen)?;
143        let field = self.parse_list_field("List")?;
144        self.expect_token(Token::RParen)?;
145        Ok(DataType::List(Arc::new(field)))
146    }
147
148    /// Parses the ListView type (called after `ListView` has been consumed)
149    /// E.g: ListView(non-null Int64, field: 'foo')
150    fn parse_list_view(&mut self) -> ArrowResult<DataType> {
151        self.expect_token(Token::LParen)?;
152        let field = self.parse_list_field("ListView")?;
153        self.expect_token(Token::RParen)?;
154        Ok(DataType::ListView(Arc::new(field)))
155    }
156
157    /// Parses the LargeList type (called after `LargeList` has been consumed)
158    /// E.g: LargeList(non-null Int64, field: 'foo')
159    fn parse_large_list(&mut self) -> ArrowResult<DataType> {
160        self.expect_token(Token::LParen)?;
161        let field = self.parse_list_field("LargeList")?;
162        self.expect_token(Token::RParen)?;
163        Ok(DataType::LargeList(Arc::new(field)))
164    }
165
166    /// Parses the LargeListView type (called after `LargeListView` has been consumed)
167    /// E.g: LargeListView(non-null Int64, field: 'foo')
168    fn parse_large_list_view(&mut self) -> ArrowResult<DataType> {
169        self.expect_token(Token::LParen)?;
170        let field = self.parse_list_field("LargeListView")?;
171        self.expect_token(Token::RParen)?;
172        Ok(DataType::LargeListView(Arc::new(field)))
173    }
174
175    /// Parses the FixedSizeList type (called after `FixedSizeList` has been consumed)
176    ///
177    /// Examples:
178    /// * `FixedSizeList(5 x non-null Int64, field: 'foo')`
179    /// * `FixedSizeList(4, Int64)`
180    ///
181    fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
182        self.expect_token(Token::LParen)?;
183        let length = self.parse_i32("FixedSizeList")?;
184        match self.next_token()? {
185            // `FixedSizeList(5 x non-null Int64, field: 'foo')` format
186            Token::X => {
187                let field = self.parse_list_field("FixedSizeList")?;
188                self.expect_token(Token::RParen)?;
189                Ok(DataType::FixedSizeList(Arc::new(field), length))
190            }
191            // `FixedSizeList(4, Int64)` format
192            Token::Comma => {
193                let data_type = self.parse_next_type()?;
194                self.expect_token(Token::RParen)?;
195                Ok(DataType::FixedSizeList(
196                    Arc::new(Field::new_list_field(data_type, true)),
197                    length,
198                ))
199            }
200            tok => Err(make_error(
201                self.val,
202                &format!("Expected 'x' or ',' after length for FixedSizeList, got '{tok}'"),
203            )),
204        }
205    }
206
207    /// Parses the next timeunit
208    fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> {
209        match self.next_token()? {
210            Token::TimeUnit(time_unit) => Ok(time_unit),
211            tok => Err(make_error(
212                self.val,
213                &format!("finding TimeUnit for {context}, got {tok}"),
214            )),
215        }
216    }
217
218    /// Parses the next double quoted string
219    fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
220        let token = self.next_token()?;
221        if let Token::DoubleQuotedString(string) = token {
222            Ok(string)
223        } else {
224            Err(make_error(
225                self.val,
226                &format!("expected double quoted string for {context}, got '{token}'"),
227            ))
228        }
229    }
230
231    /// Parses the next single quoted string
232    fn parse_single_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
233        let token = self.next_token()?;
234        if let Token::SingleQuotedString(string) = token {
235            Ok(string)
236        } else {
237            Err(make_error(
238                self.val,
239                &format!("expected single quoted string for {context}, got '{token}'"),
240            ))
241        }
242    }
243
244    /// Parses the next integer value
245    fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
246        match self.next_token()? {
247            Token::Integer(v) => Ok(v),
248            tok => Err(make_error(
249                self.val,
250                &format!("finding i64 for {context}, got '{tok}'"),
251            )),
252        }
253    }
254
255    /// Parses the next i32 integer value
256    fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> {
257        let length = self.parse_i64(context)?;
258        length.try_into().map_err(|e| {
259            make_error(
260                self.val,
261                &format!("converting {length} into i32 for {context}: {e}"),
262            )
263        })
264    }
265
266    /// Parses the next i8 integer value
267    fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> {
268        let length = self.parse_i64(context)?;
269        length.try_into().map_err(|e| {
270            make_error(
271                self.val,
272                &format!("converting {length} into i8 for {context}: {e}"),
273            )
274        })
275    }
276
277    /// Parses the next u8 integer value
278    fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> {
279        let length = self.parse_i64(context)?;
280        length.try_into().map_err(|e| {
281            make_error(
282                self.val,
283                &format!("converting {length} into u8 for {context}: {e}"),
284            )
285        })
286    }
287
288    /// Parses the next timestamp (called after `Timestamp` has been consumed)
289    fn parse_timestamp(&mut self) -> ArrowResult<DataType> {
290        self.expect_token(Token::LParen)?;
291        let time_unit = self.parse_time_unit("Timestamp")?;
292
293        let timezone;
294        match self.next_token()? {
295            Token::Comma => {
296                match self.next_token()? {
297                    // Support old style `Timestamp(Nanosecond, None)`
298                    Token::None => {
299                        timezone = None;
300                    }
301                    // Support old style `Timestamp(Nanosecond, Some("Timezone"))`
302                    Token::Some => {
303                        self.expect_token(Token::LParen)?;
304                        timezone = Some(self.parse_double_quoted_string("Timezone")?);
305                        self.expect_token(Token::RParen)?;
306                    }
307                    Token::DoubleQuotedString(tz) => {
308                        // Support new style `Timestamp(Nanosecond, "Timezone")`
309                        timezone = Some(tz);
310                    }
311                    tok => {
312                        return Err(make_error(
313                            self.val,
314                            &format!("Expected None, Some, or a timezone string, got {tok:?}"),
315                        ));
316                    }
317                };
318                self.expect_token(Token::RParen)?;
319            }
320            // No timezone (e.g `Timestamp(ns)`)
321            Token::RParen => {
322                timezone = None;
323            }
324            next_token => {
325                return Err(make_error(
326                    self.val,
327                    &format!("Expected comma followed by a timezone, or an ), got {next_token:?}"),
328                ));
329            }
330        }
331        Ok(DataType::Timestamp(time_unit, timezone.map(Into::into)))
332    }
333
334    /// Parses the next Time32 (called after `Time32` has been consumed)
335    fn parse_time32(&mut self) -> ArrowResult<DataType> {
336        self.expect_token(Token::LParen)?;
337        let time_unit = self.parse_time_unit("Time32")?;
338        self.expect_token(Token::RParen)?;
339        Ok(DataType::Time32(time_unit))
340    }
341
342    /// Parses the next Time64 (called after `Time64` has been consumed)
343    fn parse_time64(&mut self) -> ArrowResult<DataType> {
344        self.expect_token(Token::LParen)?;
345        let time_unit = self.parse_time_unit("Time64")?;
346        self.expect_token(Token::RParen)?;
347        Ok(DataType::Time64(time_unit))
348    }
349
350    /// Parses the next Duration (called after `Duration` has been consumed)
351    fn parse_duration(&mut self) -> ArrowResult<DataType> {
352        self.expect_token(Token::LParen)?;
353        let time_unit = self.parse_time_unit("Duration")?;
354        self.expect_token(Token::RParen)?;
355        Ok(DataType::Duration(time_unit))
356    }
357
358    /// Parses the next Interval (called after `Interval` has been consumed)
359    fn parse_interval(&mut self) -> ArrowResult<DataType> {
360        self.expect_token(Token::LParen)?;
361        let interval_unit = match self.next_token()? {
362            Token::IntervalUnit(interval_unit) => interval_unit,
363            tok => {
364                return Err(make_error(
365                    self.val,
366                    &format!("finding IntervalUnit for Interval, got {tok}"),
367                ));
368            }
369        };
370        self.expect_token(Token::RParen)?;
371        Ok(DataType::Interval(interval_unit))
372    }
373
374    /// Parses the next FixedSizeBinary (called after `FixedSizeBinary` has been consumed)
375    fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> {
376        self.expect_token(Token::LParen)?;
377        let length = self.parse_i32("FixedSizeBinary")?;
378        self.expect_token(Token::RParen)?;
379        Ok(DataType::FixedSizeBinary(length))
380    }
381
382    /// Parses the next Decimal32 (called after `Decimal32` has been consumed)
383    fn parse_decimal_32(&mut self) -> ArrowResult<DataType> {
384        self.expect_token(Token::LParen)?;
385        let precision = self.parse_u8("Decimal32")?;
386        self.expect_token(Token::Comma)?;
387        let scale = self.parse_i8("Decimal32")?;
388        self.expect_token(Token::RParen)?;
389        Ok(DataType::Decimal32(precision, scale))
390    }
391
392    /// Parses the next Decimal64 (called after `Decimal64` has been consumed)
393    fn parse_decimal_64(&mut self) -> ArrowResult<DataType> {
394        self.expect_token(Token::LParen)?;
395        let precision = self.parse_u8("Decimal64")?;
396        self.expect_token(Token::Comma)?;
397        let scale = self.parse_i8("Decimal64")?;
398        self.expect_token(Token::RParen)?;
399        Ok(DataType::Decimal64(precision, scale))
400    }
401
402    /// Parses the next Decimal128 (called after `Decimal128` has been consumed)
403    fn parse_decimal_128(&mut self) -> ArrowResult<DataType> {
404        self.expect_token(Token::LParen)?;
405        let precision = self.parse_u8("Decimal128")?;
406        self.expect_token(Token::Comma)?;
407        let scale = self.parse_i8("Decimal128")?;
408        self.expect_token(Token::RParen)?;
409        Ok(DataType::Decimal128(precision, scale))
410    }
411
412    /// Parses the next Decimal256 (called after `Decimal256` has been consumed)
413    fn parse_decimal_256(&mut self) -> ArrowResult<DataType> {
414        self.expect_token(Token::LParen)?;
415        let precision = self.parse_u8("Decimal256")?;
416        self.expect_token(Token::Comma)?;
417        let scale = self.parse_i8("Decimal256")?;
418        self.expect_token(Token::RParen)?;
419        Ok(DataType::Decimal256(precision, scale))
420    }
421
422    /// Parses the next Dictionary (called after `Dictionary` has been consumed)
423    fn parse_dictionary(&mut self) -> ArrowResult<DataType> {
424        self.expect_token(Token::LParen)?;
425        let key_type = self.parse_next_type()?;
426        self.expect_token(Token::Comma)?;
427        let value_type = self.parse_next_type()?;
428        self.expect_token(Token::RParen)?;
429        Ok(DataType::Dictionary(
430            Box::new(key_type),
431            Box::new(value_type),
432        ))
433    }
434
435    /// Parses the next Struct (called after `Struct` has been consumed)
436    fn parse_struct(&mut self) -> ArrowResult<DataType> {
437        self.expect_token(Token::LParen)?;
438        let mut fields = Vec::new();
439        loop {
440            if self
441                .tokenizer
442                .next_if(|next| matches!(next, Ok(Token::RParen)))
443                .is_some()
444            {
445                break;
446            }
447
448            let field = self.parse_field()?;
449            fields.push(Arc::new(field));
450            match self.next_token()? {
451                Token::Comma => continue,
452                Token::RParen => break,
453                tok => {
454                    return Err(make_error(
455                        self.val,
456                        &format!(
457                            "Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"
458                        ),
459                    ));
460                }
461            }
462        }
463        Ok(DataType::Struct(Fields::from(fields)))
464    }
465
466    /// Parses the next Union (called after `Union` has been consumed)
467    /// E.g: Union(Sparse, 0: ("a": Int32), 1: ("b": non-null Utf8))
468    fn parse_union(&mut self) -> ArrowResult<DataType> {
469        self.expect_token(Token::LParen)?;
470        let union_mode = self.parse_union_mode()?;
471        let mut type_ids = vec![];
472        let mut fields = vec![];
473        loop {
474            if self
475                .tokenizer
476                .next_if(|next| matches!(next, Ok(Token::RParen)))
477                .is_some()
478            {
479                break;
480            }
481            self.expect_token(Token::Comma)?;
482            let (type_id, field) = self.parse_union_field()?;
483            type_ids.push(type_id);
484            fields.push(field);
485        }
486        Ok(DataType::Union(
487            UnionFields::new(type_ids, fields),
488            union_mode,
489        ))
490    }
491
492    /// Parses the next UnionMode
493    fn parse_union_mode(&mut self) -> ArrowResult<UnionMode> {
494        match self.next_token()? {
495            Token::UnionMode(union_mode) => Ok(union_mode),
496            tok => Err(make_error(
497                self.val,
498                &format!("finding UnionMode for Union, got {tok}"),
499            )),
500        }
501    }
502
503    /// Parses the next UnionField
504    /// 0: ("a": non-null Int32)
505    fn parse_union_field(&mut self) -> ArrowResult<(i8, Field)> {
506        let type_id = self.parse_i8("UnionField")?;
507        self.expect_token(Token::Colon)?;
508        self.expect_token(Token::LParen)?;
509        let field = self.parse_field()?;
510        self.expect_token(Token::RParen)?;
511        Ok((type_id, field))
512    }
513
514    /// Parses the next Map (called after `Map` has been consumed)
515    /// E.g: Map("entries": Struct("key": Utf8, "value": non-null Int32), sorted)
516    fn parse_map(&mut self) -> ArrowResult<DataType> {
517        self.expect_token(Token::LParen)?;
518        let field = self.parse_field()?;
519        self.expect_token(Token::Comma)?;
520        let sorted = self.parse_map_sorted()?;
521        self.expect_token(Token::RParen)?;
522        Ok(DataType::Map(Arc::new(field), sorted))
523    }
524
525    /// Parses map's sorted
526    fn parse_map_sorted(&mut self) -> ArrowResult<bool> {
527        match self.next_token()? {
528            Token::MapSorted(sorted) => Ok(sorted),
529            tok => Err(make_error(
530                self.val,
531                &format!("Expected sorted or unsorted for a map; got {tok:?}"),
532            )),
533        }
534    }
535
536    /// Parses the next RunEndEncoded (called after `RunEndEncoded` has been consumed)
537    /// E.g: RunEndEncoded("run_ends": UInt32, "values": nonnull Int32)
538    fn parse_run_end_encoded(&mut self) -> ArrowResult<DataType> {
539        self.expect_token(Token::LParen)?;
540        let run_ends = self.parse_field()?;
541        self.expect_token(Token::Comma)?;
542        let values = self.parse_field()?;
543        self.expect_token(Token::RParen)?;
544        Ok(DataType::RunEndEncoded(
545            Arc::new(run_ends),
546            Arc::new(values),
547        ))
548    }
549
550    /// consume the next token and return `false` if the field is `nonnull`.
551    fn parse_opt_nullable(&mut self) -> bool {
552        let tok = self
553            .tokenizer
554            .next_if(|next| matches!(next, Ok(Token::NonNull | Token::Nullable)));
555        !matches!(tok, Some(Ok(Token::NonNull)))
556    }
557
558    /// return the next token, or an error if there are none left
559    fn next_token(&mut self) -> ArrowResult<Token> {
560        match self.tokenizer.next() {
561            None => Err(make_error(self.val, "finding next token")),
562            Some(token) => token,
563        }
564    }
565
566    /// consume the next token, returning OK(()) if it matches tok, and Err if not
567    fn expect_token(&mut self, tok: Token) -> ArrowResult<()> {
568        let next_token = self.next_token()?;
569        if next_token == tok {
570            Ok(())
571        } else {
572            Err(make_error_expected(self.val, &tok, &next_token))
573        }
574    }
575}
576
/// Returns true if `c` ends a word: a parenthesis, comma, colon or space.
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ':' | ' ')
}
581
/// The delimiter of a quoted string; selects which quote character
/// `Tokenizer::parse_quoted_string` looks for and which token it returns.
enum QuoteType {
    /// Delimited by `"`
    Double,
    /// Delimited by `'`
    Single,
}
586
#[derive(Debug)]
/// Splits a string like `Dictionary(Int32, Int64)` into tokens suitable for parsing
///
/// For example the string "Timestamp(ns)" would be parsed into:
///
/// * Token::Timestamp
/// * Token::LParen
/// * Token::TimeUnit(TimeUnit::Nanosecond)
/// * Token::RParen
struct Tokenizer<'a> {
    // the complete input, kept so error messages can quote it
    val: &'a str,
    // remaining characters of `val`, with one character of lookahead
    chars: Peekable<Chars<'a>>,
    // temporary buffer for parsing words and quoted strings
    word: String,
}
602
603impl<'a> Tokenizer<'a> {
604    fn new(val: &'a str) -> Self {
605        Self {
606            val,
607            chars: val.chars().peekable(),
608            word: String::new(),
609        }
610    }
611
    /// Returns the next char without consuming it, or `None` at end of input
    fn peek_next_char(&mut self) -> Option<char> {
        self.chars.peek().copied()
    }
616
    /// Consumes and returns the next char, or `None` at end of input
    fn next_char(&mut self) -> Option<char> {
        self.chars.next()
    }
621
622    /// parse the characters in val starting at pos, until the next
623    /// `,`, `(`, or `)` or end of line
624    fn parse_word(&mut self) -> ArrowResult<Token> {
625        // reset temp space
626        self.word.clear();
627        loop {
628            match self.peek_next_char() {
629                None => break,
630                Some(c) if is_separator(c) => break,
631                Some(c) => {
632                    self.next_char();
633                    self.word.push(c);
634                }
635            }
636        }
637
638        if let Some(c) = self.word.chars().next() {
639            // if it started with a number, try parsing it as an integer
640            if c == '-' || c.is_numeric() {
641                let val: i64 = self.word.parse().map_err(|e| {
642                    make_error(self.val, &format!("parsing {} as integer: {e}", self.word))
643                })?;
644                return Ok(Token::Integer(val));
645            }
646        }
647
648        // figure out what the word was
649        let token = match self.word.as_str() {
650            "Null" => Token::SimpleType(DataType::Null),
651            "Boolean" => Token::SimpleType(DataType::Boolean),
652
653            "Int8" => Token::SimpleType(DataType::Int8),
654            "Int16" => Token::SimpleType(DataType::Int16),
655            "Int32" => Token::SimpleType(DataType::Int32),
656            "Int64" => Token::SimpleType(DataType::Int64),
657
658            "UInt8" => Token::SimpleType(DataType::UInt8),
659            "UInt16" => Token::SimpleType(DataType::UInt16),
660            "UInt32" => Token::SimpleType(DataType::UInt32),
661            "UInt64" => Token::SimpleType(DataType::UInt64),
662
663            "Utf8" => Token::SimpleType(DataType::Utf8),
664            "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
665            "Utf8View" => Token::SimpleType(DataType::Utf8View),
666            "Binary" => Token::SimpleType(DataType::Binary),
667            "BinaryView" => Token::SimpleType(DataType::BinaryView),
668            "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
669
670            "Float16" => Token::SimpleType(DataType::Float16),
671            "Float32" => Token::SimpleType(DataType::Float32),
672            "Float64" => Token::SimpleType(DataType::Float64),
673
674            "Date32" => Token::SimpleType(DataType::Date32),
675            "Date64" => Token::SimpleType(DataType::Date64),
676
677            "List" => Token::List,
678            "ListView" => Token::ListView,
679            "LargeList" => Token::LargeList,
680            "LargeListView" => Token::LargeListView,
681            "FixedSizeList" => Token::FixedSizeList,
682
683            "s" | "Second" => Token::TimeUnit(TimeUnit::Second),
684            "ms" | "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
685            "µs" | "us" | "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
686            "ns" | "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
687
688            "Timestamp" => Token::Timestamp,
689            "Time32" => Token::Time32,
690            "Time64" => Token::Time64,
691            "Duration" => Token::Duration,
692            "Interval" => Token::Interval,
693            "Dictionary" => Token::Dictionary,
694
695            "FixedSizeBinary" => Token::FixedSizeBinary,
696
697            "Decimal32" => Token::Decimal32,
698            "Decimal64" => Token::Decimal64,
699            "Decimal128" => Token::Decimal128,
700            "Decimal256" => Token::Decimal256,
701
702            "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
703            "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
704            "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
705
706            "Some" => Token::Some,
707            "None" => Token::None,
708
709            "non-null" => Token::NonNull,
710            "nullable" => Token::Nullable,
711            "field" => Token::Field,
712            "x" => Token::X,
713
714            "Struct" => Token::Struct,
715
716            "Union" => Token::Union,
717            "Sparse" => Token::UnionMode(UnionMode::Sparse),
718            "Dense" => Token::UnionMode(UnionMode::Dense),
719
720            "Map" => Token::Map,
721            "sorted" => Token::MapSorted(true),
722            "unsorted" => Token::MapSorted(false),
723
724            "RunEndEncoded" => Token::RunEndEncoded,
725
726            token => {
727                return Err(make_error(self.val, &format!("unknown token: {token}")));
728            }
729        };
730        Ok(token)
731    }
732
733    /// Parses e.g. `"foo bar"`, `'foo bar'`
734    fn parse_quoted_string(&mut self, quote_type: QuoteType) -> ArrowResult<Token> {
735        let quote = match quote_type {
736            QuoteType::Double => '\"',
737            QuoteType::Single => '\'',
738        };
739
740        if self.next_char() != Some(quote) {
741            return Err(make_error(self.val, "Expected \""));
742        }
743
744        // reset temp space
745        self.word.clear();
746
747        let mut is_escaped = false;
748
749        loop {
750            match self.next_char() {
751                None => {
752                    return Err(ArrowError::ParseError(format!(
753                        "Unterminated string at: \"{}",
754                        self.word
755                    )));
756                }
757                Some(c) => match c {
758                    '\\' => {
759                        is_escaped = true;
760                        self.word.push(c);
761                    }
762                    c if c == quote => {
763                        if is_escaped {
764                            self.word.push(c);
765                            is_escaped = false;
766                        } else {
767                            break;
768                        }
769                    }
770                    c => {
771                        self.word.push(c);
772                    }
773                },
774            }
775        }
776
777        let val: String = self.word.parse().map_err(|err| {
778            ArrowError::ParseError(format!("Failed to parse string: \"{}\": {err}", self.word))
779        })?;
780
781        if val.is_empty() {
782            // Using empty strings as field names is just asking for trouble
783            return Err(make_error(self.val, "empty strings aren't allowed"));
784        }
785
786        match quote_type {
787            QuoteType::Double => Ok(Token::DoubleQuotedString(val)),
788            QuoteType::Single => Ok(Token::SingleQuotedString(val)),
789        }
790    }
791}
792
793impl Iterator for Tokenizer<'_> {
794    type Item = ArrowResult<Token>;
795
796    fn next(&mut self) -> Option<Self::Item> {
797        loop {
798            match self.peek_next_char()? {
799                ' ' => {
800                    // skip whitespace
801                    self.next_char();
802                    continue;
803                }
804                '"' => {
805                    return Some(self.parse_quoted_string(QuoteType::Double));
806                }
807                '\'' => {
808                    return Some(self.parse_quoted_string(QuoteType::Single));
809                }
810                '(' => {
811                    self.next_char();
812                    return Some(Ok(Token::LParen));
813                }
814                ')' => {
815                    self.next_char();
816                    return Some(Ok(Token::RParen));
817                }
818                ',' => {
819                    self.next_char();
820                    return Some(Ok(Token::Comma));
821                }
822                ':' => {
823                    self.next_char();
824                    return Some(Ok(Token::Colon));
825                }
826                _ => return Some(self.parse_word()),
827            }
828        }
829    }
830}
831
832/// Grammar is
833///
834#[derive(Debug, PartialEq)]
835enum Token {
836    // Null, or Int32
837    SimpleType(DataType),
838    Timestamp,
839    Time32,
840    Time64,
841    Duration,
842    Interval,
843    FixedSizeBinary,
844    Decimal32,
845    Decimal64,
846    Decimal128,
847    Decimal256,
848    Dictionary,
849    TimeUnit(TimeUnit),
850    IntervalUnit(IntervalUnit),
851    LParen,
852    RParen,
853    Comma,
854    Colon,
855    Some,
856    None,
857    Integer(i64),
858    DoubleQuotedString(String),
859    SingleQuotedString(String),
860    List,
861    ListView,
862    LargeList,
863    LargeListView,
864    FixedSizeList,
865    Struct,
866    Union,
867    UnionMode(UnionMode),
868    Map,
869    MapSorted(bool),
870    RunEndEncoded,
871    NonNull,
872    Nullable,
873    Field,
874    X,
875}
876
877impl Display for Token {
878    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
879        match self {
880            Token::SimpleType(t) => write!(f, "{t}"),
881            Token::List => write!(f, "List"),
882            Token::ListView => write!(f, "ListView"),
883            Token::LargeList => write!(f, "LargeList"),
884            Token::LargeListView => write!(f, "LargeListView"),
885            Token::FixedSizeList => write!(f, "FixedSizeList"),
886            Token::Timestamp => write!(f, "Timestamp"),
887            Token::Time32 => write!(f, "Time32"),
888            Token::Time64 => write!(f, "Time64"),
889            Token::Duration => write!(f, "Duration"),
890            Token::Interval => write!(f, "Interval"),
891            Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
892            Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
893            Token::LParen => write!(f, "("),
894            Token::RParen => write!(f, ")"),
895            Token::Comma => write!(f, ","),
896            Token::Colon => write!(f, ":"),
897            Token::Some => write!(f, "Some"),
898            Token::None => write!(f, "None"),
899            Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
900            Token::Decimal32 => write!(f, "Decimal32"),
901            Token::Decimal64 => write!(f, "Decimal64"),
902            Token::Decimal128 => write!(f, "Decimal128"),
903            Token::Decimal256 => write!(f, "Decimal256"),
904            Token::Dictionary => write!(f, "Dictionary"),
905            Token::Integer(v) => write!(f, "Integer({v})"),
906            Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
907            Token::SingleQuotedString(s) => write!(f, "SingleQuotedString({s})"),
908            Token::Struct => write!(f, "Struct"),
909            Token::Union => write!(f, "Union"),
910            Token::UnionMode(m) => write!(f, "{m:?}"),
911            Token::Map => write!(f, "Map"),
912            Token::MapSorted(sorted) => {
913                write!(f, "{}", if *sorted { "sorted" } else { "unsorted" })
914            }
915            Token::RunEndEncoded => write!(f, "RunEndEncoded"),
916            Token::NonNull => write!(f, "non-null"),
917            Token::Nullable => write!(f, "nullable"),
918            Token::Field => write!(f, "field"),
919            Token::X => write!(f, "x"),
920        }
921    }
922}
923
924#[cfg(test)]
925mod test {
926    use super::*;
927
928    #[test]
929    fn test_parse_data_type() {
930        // this ensures types can be parsed correctly from their string representations
931        for dt in list_datatypes() {
932            round_trip(dt)
933        }
934    }
935
936    /// Ensure we converting data_type to a string, and then parse it as a type
937    /// verifying it is the same
938    fn round_trip(data_type: DataType) {
939        let data_type_string = data_type.to_string();
940        println!("Input '{data_type_string}' ({data_type:?})");
941        let parsed_type = parse_data_type(&data_type_string).unwrap();
942        assert_eq!(
943            data_type, parsed_type,
944            "Mismatch parsing {data_type_string}"
945        );
946    }
947
948    fn list_datatypes() -> Vec<DataType> {
949        vec![
950            // ---------
951            // Non Nested types
952            // ---------
953            DataType::Null,
954            DataType::Boolean,
955            DataType::Int8,
956            DataType::Int16,
957            DataType::Int32,
958            DataType::Int64,
959            DataType::UInt8,
960            DataType::UInt16,
961            DataType::UInt32,
962            DataType::UInt64,
963            DataType::Float16,
964            DataType::Float32,
965            DataType::Float64,
966            DataType::Timestamp(TimeUnit::Second, None),
967            DataType::Timestamp(TimeUnit::Millisecond, None),
968            DataType::Timestamp(TimeUnit::Microsecond, None),
969            DataType::Timestamp(TimeUnit::Nanosecond, None),
970            // we can't cover all possible timezones, here we only test utc and +08:00
971            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
972            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
973            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
974            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
975            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
976            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
977            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
978            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
979            DataType::Date32,
980            DataType::Date64,
981            DataType::Time32(TimeUnit::Second),
982            DataType::Time32(TimeUnit::Millisecond),
983            DataType::Time32(TimeUnit::Microsecond),
984            DataType::Time32(TimeUnit::Nanosecond),
985            DataType::Time64(TimeUnit::Second),
986            DataType::Time64(TimeUnit::Millisecond),
987            DataType::Time64(TimeUnit::Microsecond),
988            DataType::Time64(TimeUnit::Nanosecond),
989            DataType::Duration(TimeUnit::Second),
990            DataType::Duration(TimeUnit::Millisecond),
991            DataType::Duration(TimeUnit::Microsecond),
992            DataType::Duration(TimeUnit::Nanosecond),
993            DataType::Interval(IntervalUnit::YearMonth),
994            DataType::Interval(IntervalUnit::DayTime),
995            DataType::Interval(IntervalUnit::MonthDayNano),
996            DataType::Binary,
997            DataType::BinaryView,
998            DataType::FixedSizeBinary(0),
999            DataType::FixedSizeBinary(1234),
1000            DataType::FixedSizeBinary(-432),
1001            DataType::LargeBinary,
1002            DataType::Utf8,
1003            DataType::Utf8View,
1004            DataType::LargeUtf8,
1005            DataType::Decimal32(7, 8),
1006            DataType::Decimal64(6, 9),
1007            DataType::Decimal128(7, 12),
1008            DataType::Decimal256(6, 13),
1009            // ---------
1010            // Nested types
1011            // ---------
1012            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
1013            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
1014            DataType::Dictionary(
1015                Box::new(DataType::Int8),
1016                Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
1017            ),
1018            DataType::Dictionary(
1019                Box::new(DataType::Int8),
1020                Box::new(DataType::FixedSizeBinary(23)),
1021            ),
1022            DataType::Dictionary(
1023                Box::new(DataType::Int8),
1024                Box::new(
1025                    // nested dictionaries are probably a bad idea but they are possible
1026                    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
1027                ),
1028            ),
1029            DataType::Struct(Fields::from(vec![
1030                Field::new("f1", DataType::Int64, true),
1031                Field::new("f2", DataType::Float64, true),
1032                Field::new(
1033                    "f3",
1034                    DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
1035                    true,
1036                ),
1037                Field::new(
1038                    "f4",
1039                    DataType::Dictionary(
1040                        Box::new(DataType::Int8),
1041                        Box::new(DataType::FixedSizeBinary(23)),
1042                    ),
1043                    true,
1044                ),
1045            ])),
1046            DataType::Struct(Fields::from(vec![
1047                Field::new("Int64", DataType::Int64, true),
1048                Field::new("Float64", DataType::Float64, true),
1049            ])),
1050            DataType::Struct(Fields::from(vec![
1051                Field::new("f1", DataType::Int64, true),
1052                Field::new(
1053                    "nested_struct",
1054                    DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
1055                    true,
1056                ),
1057            ])),
1058            DataType::Struct(Fields::from(vec![Field::new("f1", DataType::Int64, true)])),
1059            DataType::Struct(Fields::empty()),
1060            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))),
1061            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))),
1062            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
1063            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, false))),
1064            DataType::List(Arc::new(Field::new(
1065                "nested_list",
1066                DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
1067                true,
1068            ))),
1069            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
1070            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
1071            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1072            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
1073            DataType::ListView(Arc::new(Field::new(
1074                "nested_list_view",
1075                DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1076                true,
1077            ))),
1078            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, true))),
1079            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, false))),
1080            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
1081            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, false))),
1082            DataType::LargeList(Arc::new(Field::new(
1083                "nested_large_list",
1084                DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
1085                true,
1086            ))),
1087            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
1088            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
1089            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1090            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
1091            DataType::LargeListView(Arc::new(Field::new(
1092                "nested_large_list_view",
1093                DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1094                true,
1095            ))),
1096            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)), 2),
1097            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, false)), 2),
1098            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, true)), 2),
1099            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, false)), 2),
1100            DataType::FixedSizeList(
1101                Arc::new(Field::new(
1102                    "nested_fixed_size_list",
1103                    DataType::FixedSizeList(
1104                        Arc::new(Field::new("Int64", DataType::Int64, true)),
1105                        2,
1106                    ),
1107                    true,
1108                )),
1109                2,
1110            ),
1111            DataType::Union(
1112                UnionFields::new(
1113                    vec![0, 1],
1114                    vec![
1115                        Field::new("Int32", DataType::Int32, false),
1116                        Field::new("Utf8", DataType::Utf8, true),
1117                    ],
1118                ),
1119                UnionMode::Sparse,
1120            ),
1121            DataType::Union(
1122                UnionFields::new(
1123                    vec![0, 1],
1124                    vec![
1125                        Field::new("Int32", DataType::Int32, false),
1126                        Field::new("Utf8", DataType::Utf8, true),
1127                    ],
1128                ),
1129                UnionMode::Dense,
1130            ),
1131            DataType::Union(
1132                UnionFields::new(
1133                    vec![0, 1],
1134                    vec![
1135                        Field::new_union(
1136                            "nested_union",
1137                            vec![0, 1],
1138                            vec![
1139                                Field::new("Int32", DataType::Int32, false),
1140                                Field::new("Utf8", DataType::Utf8, true),
1141                            ],
1142                            UnionMode::Dense,
1143                        ),
1144                        Field::new("Utf8", DataType::Utf8, true),
1145                    ],
1146                ),
1147                UnionMode::Sparse,
1148            ),
1149            DataType::Union(
1150                UnionFields::new(vec![0], vec![Field::new("Int32", DataType::Int32, false)]),
1151                UnionMode::Dense,
1152            ),
1153            DataType::Union(
1154                UnionFields::new(Vec::<i8>::new(), Vec::<Field>::new()),
1155                UnionMode::Sparse,
1156            ),
1157            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), true),
1158            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), false),
1159            DataType::Map(
1160                Arc::new(Field::new_map(
1161                    "nested_map",
1162                    "entries",
1163                    Field::new("key", DataType::Utf8, false),
1164                    Field::new("value", DataType::Int32, true),
1165                    false,
1166                    true,
1167                )),
1168                true,
1169            ),
1170            DataType::RunEndEncoded(
1171                Arc::new(Field::new("run_ends", DataType::UInt32, false)),
1172                Arc::new(Field::new("values", DataType::Int32, true)),
1173            ),
1174            DataType::RunEndEncoded(
1175                Arc::new(Field::new(
1176                    "nested_run_end_encoded",
1177                    DataType::RunEndEncoded(
1178                        Arc::new(Field::new("run_ends", DataType::UInt32, false)),
1179                        Arc::new(Field::new("values", DataType::Int32, true)),
1180                    ),
1181                    true,
1182                )),
1183                Arc::new(Field::new("values", DataType::Int32, true)),
1184            ),
1185        ]
1186    }
1187
1188    #[test]
1189    fn test_parse_data_type_whitespace_tolerance() {
1190        // (string to parse, expected DataType)
1191        let cases = [
1192            ("Int8", DataType::Int8),
1193            (
1194                "Timestamp        (ns)",
1195                DataType::Timestamp(TimeUnit::Nanosecond, None),
1196            ),
1197            (
1198                "Timestamp        (ns)  ",
1199                DataType::Timestamp(TimeUnit::Nanosecond, None),
1200            ),
1201            (
1202                "          Timestamp        (ns               )",
1203                DataType::Timestamp(TimeUnit::Nanosecond, None),
1204            ),
1205            (
1206                "Timestamp        (ns               )  ",
1207                DataType::Timestamp(TimeUnit::Nanosecond, None),
1208            ),
1209        ];
1210
1211        for (data_type_string, expected_data_type) in cases {
1212            let parsed_data_type = parse_data_type(data_type_string).unwrap();
1213            assert_eq!(
1214                parsed_data_type, expected_data_type,
1215                "Parsing '{data_type_string}', expecting '{expected_data_type}'"
1216            );
1217        }
1218    }
1219
1220    /// Ensure that old style types can still be parsed
1221    #[test]
1222    fn test_parse_data_type_backwards_compatibility() {
1223        use DataType::*;
1224        use IntervalUnit::*;
1225        use TimeUnit::*;
1226        // List below created with:
1227        for t in list_datatypes() {
1228            println!(r#"("{t}", {t:?}),"#);
1229        }
1230        // (string to parse, expected DataType)
1231        let cases = [
1232            ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
1233            ("Timestamp(Microsecond, None)", Timestamp(Microsecond, None)),
1234            ("Timestamp(Millisecond, None)", Timestamp(Millisecond, None)),
1235            ("Timestamp(Second, None)", Timestamp(Second, None)),
1236            ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
1237            // Timezones
1238            (
1239                r#"Timestamp(Nanosecond, Some("+00:00"))"#,
1240                Timestamp(Nanosecond, Some("+00:00".into())),
1241            ),
1242            (
1243                r#"Timestamp(Microsecond, Some("+00:00"))"#,
1244                Timestamp(Microsecond, Some("+00:00".into())),
1245            ),
1246            (
1247                r#"Timestamp(Millisecond, Some("+00:00"))"#,
1248                Timestamp(Millisecond, Some("+00:00".into())),
1249            ),
1250            (
1251                r#"Timestamp(Second, Some("+00:00"))"#,
1252                Timestamp(Second, Some("+00:00".into())),
1253            ),
1254            ("Null", Null),
1255            ("Boolean", Boolean),
1256            ("Int8", Int8),
1257            ("Int16", Int16),
1258            ("Int32", Int32),
1259            ("Int64", Int64),
1260            ("UInt8", UInt8),
1261            ("UInt16", UInt16),
1262            ("UInt32", UInt32),
1263            ("UInt64", UInt64),
1264            ("Float16", Float16),
1265            ("Float32", Float32),
1266            ("Float64", Float64),
1267            ("Timestamp(s)", Timestamp(Second, None)),
1268            ("Timestamp(ms)", Timestamp(Millisecond, None)),
1269            ("Timestamp(µs)", Timestamp(Microsecond, None)),
1270            ("Timestamp(ns)", Timestamp(Nanosecond, None)),
1271            (
1272                r#"Timestamp(ns, "+00:00")"#,
1273                Timestamp(Nanosecond, Some("+00:00".into())),
1274            ),
1275            (
1276                r#"Timestamp(µs, "+00:00")"#,
1277                Timestamp(Microsecond, Some("+00:00".into())),
1278            ),
1279            (
1280                r#"Timestamp(ms, "+00:00")"#,
1281                Timestamp(Millisecond, Some("+00:00".into())),
1282            ),
1283            (
1284                r#"Timestamp(s, "+00:00")"#,
1285                Timestamp(Second, Some("+00:00".into())),
1286            ),
1287            (
1288                r#"Timestamp(ns, "+08:00")"#,
1289                Timestamp(Nanosecond, Some("+08:00".into())),
1290            ),
1291            (
1292                r#"Timestamp(µs, "+08:00")"#,
1293                Timestamp(Microsecond, Some("+08:00".into())),
1294            ),
1295            (
1296                r#"Timestamp(ms, "+08:00")"#,
1297                Timestamp(Millisecond, Some("+08:00".into())),
1298            ),
1299            (
1300                r#"Timestamp(s, "+08:00")"#,
1301                Timestamp(Second, Some("+08:00".into())),
1302            ),
1303            ("Date32", Date32),
1304            ("Date64", Date64),
1305            ("Time32(s)", Time32(Second)),
1306            ("Time32(ms)", Time32(Millisecond)),
1307            ("Time32(µs)", Time32(Microsecond)),
1308            ("Time32(ns)", Time32(Nanosecond)),
1309            ("Time64(s)", Time64(Second)),
1310            ("Time64(ms)", Time64(Millisecond)),
1311            ("Time64(µs)", Time64(Microsecond)),
1312            ("Time64(ns)", Time64(Nanosecond)),
1313            ("Duration(s)", Duration(Second)),
1314            ("Duration(ms)", Duration(Millisecond)),
1315            ("Duration(µs)", Duration(Microsecond)),
1316            ("Duration(ns)", Duration(Nanosecond)),
1317            ("Interval(YearMonth)", Interval(YearMonth)),
1318            ("Interval(DayTime)", Interval(DayTime)),
1319            ("Interval(MonthDayNano)", Interval(MonthDayNano)),
1320            ("Binary", Binary),
1321            ("BinaryView", BinaryView),
1322            ("FixedSizeBinary(0)", FixedSizeBinary(0)),
1323            ("FixedSizeBinary(1234)", FixedSizeBinary(1234)),
1324            ("FixedSizeBinary(-432)", FixedSizeBinary(-432)),
1325            ("LargeBinary", LargeBinary),
1326            ("Utf8", Utf8),
1327            ("Utf8View", Utf8View),
1328            ("LargeUtf8", LargeUtf8),
1329            ("Decimal32(7, 8)", Decimal32(7, 8)),
1330            ("Decimal64(6, 9)", Decimal64(6, 9)),
1331            ("Decimal128(7, 12)", Decimal128(7, 12)),
1332            ("Decimal256(6, 13)", Decimal256(6, 13)),
1333            (
1334                "Dictionary(Int32, Utf8)",
1335                Dictionary(Box::new(Int32), Box::new(Utf8)),
1336            ),
1337            (
1338                "Dictionary(Int8, Utf8)",
1339                Dictionary(Box::new(Int8), Box::new(Utf8)),
1340            ),
1341            (
1342                "Dictionary(Int8, Timestamp(ns))",
1343                Dictionary(Box::new(Int8), Box::new(Timestamp(Nanosecond, None))),
1344            ),
1345            (
1346                "Dictionary(Int8, FixedSizeBinary(23))",
1347                Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
1348            ),
1349            (
1350                "Dictionary(Int8, Dictionary(Int8, Utf8))",
1351                Dictionary(
1352                    Box::new(Int8),
1353                    Box::new(Dictionary(Box::new(Int8), Box::new(Utf8))),
1354                ),
1355            ),
1356            (
1357                r#"Struct("f1": nullable Int64, "f2": nullable Float64, "f3": nullable Timestamp(s, "+08:00"), "f4": nullable Dictionary(Int8, FixedSizeBinary(23)))"#,
1358                Struct(Fields::from(vec![
1359                    Field::new("f1", Int64, true),
1360                    Field::new("f2", Float64, true),
1361                    Field::new("f3", Timestamp(Second, Some("+08:00".into())), true),
1362                    Field::new(
1363                        "f4",
1364                        Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
1365                        true,
1366                    ),
1367                ])),
1368            ),
1369            (
1370                r#"Struct("Int64": nullable Int64, "Float64": nullable Float64)"#,
1371                Struct(Fields::from(vec![
1372                    Field::new("Int64", Int64, true),
1373                    Field::new("Float64", Float64, true),
1374                ])),
1375            ),
1376            (
1377                r#"Struct("f1": nullable Int64, "nested_struct": nullable Struct("n1": nullable Int64))"#,
1378                Struct(Fields::from(vec![
1379                    Field::new("f1", Int64, true),
1380                    Field::new(
1381                        "nested_struct",
1382                        Struct(Fields::from(vec![Field::new("n1", Int64, true)])),
1383                        true,
1384                    ),
1385                ])),
1386            ),
1387            (r#"Struct()"#, Struct(Fields::empty())),
1388            (
1389                "FixedSizeList(4, Int64)",
1390                FixedSizeList(Arc::new(Field::new_list_field(Int64, true)), 4),
1391            ),
1392            (
1393                "List(Int64)",
1394                List(Arc::new(Field::new_list_field(Int64, true))),
1395            ),
1396            (
1397                "LargeList(Int64)",
1398                LargeList(Arc::new(Field::new_list_field(Int64, true))),
1399            ),
1400        ];
1401
1402        for (data_type_string, expected_data_type) in cases {
1403            let parsed_data_type = parse_data_type(data_type_string).unwrap();
1404            assert_eq!(
1405                parsed_data_type, expected_data_type,
1406                "Parsing '{data_type_string}', expecting '{expected_data_type}'"
1407            );
1408        }
1409    }
1410
1411    #[test]
1412    fn parse_data_type_errors() {
1413        // (string to parse, expected error message)
1414        let cases = [
1415            ("", "Unsupported type ''"),
1416            ("", "Error finding next token"),
1417            ("null", "Unsupported type 'null'"),
1418            ("Nu", "Unsupported type 'Nu'"),
1419            (r#"Timestamp(ns, +00:00)"#, "Error unknown token: +00"),
1420            (
1421                r#"Timestamp(ns, "+00:00)"#,
1422                r#"Unterminated string at: "+00:00)"#,
1423            ),
1424            (r#"Timestamp(ns, "")"#, r#"empty strings aren't allowed"#),
1425            (
1426                r#"Timestamp(ns, "+00:00"")"#,
1427                r#"Parser error: Unterminated string at: ")"#,
1428            ),
1429            ("Timestamp(ns, ", "Error finding next token"),
1430            (
1431                "Float32 Float32",
1432                "trailing content after parsing 'Float32'",
1433            ),
1434            ("Int32, ", "trailing content after parsing 'Int32'"),
1435            ("Int32(3), ", "trailing content after parsing 'Int32'"),
1436            (
1437                "FixedSizeBinary(Int32), ",
1438                "Error finding i64 for FixedSizeBinary, got 'Int32'",
1439            ),
1440            (
1441                "FixedSizeBinary(3.0), ",
1442                "Error parsing 3.0 as integer: invalid digit found in string",
1443            ),
1444            // too large for i32
1445            (
1446                "FixedSizeBinary(4000000000), ",
1447                "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted",
1448            ),
1449            // can't have negative precision
1450            (
1451                "Decimal32(-3, 5)",
1452                "Error converting -3 into u8 for Decimal32: out of range integral type conversion attempted",
1453            ),
1454            (
1455                "Decimal64(-3, 5)",
1456                "Error converting -3 into u8 for Decimal64: out of range integral type conversion attempted",
1457            ),
1458            (
1459                "Decimal128(-3, 5)",
1460                "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted",
1461            ),
1462            (
1463                "Decimal256(-3, 5)",
1464                "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted",
1465            ),
1466            (
1467                "Decimal32(3, 500)",
1468                "Error converting 500 into i8 for Decimal32: out of range integral type conversion attempted",
1469            ),
1470            (
1471                "Decimal64(3, 500)",
1472                "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted",
1473            ),
1474            (
1475                "Decimal128(3, 500)",
1476                "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted",
1477            ),
1478            (
1479                "Decimal256(3, 500)",
1480                "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted",
1481            ),
1482            ("Struct(f1 Int64)", "Error unknown token: f1"),
1483            ("Struct(\"f1\" Int64)", "Expected ':'"),
1484            (
1485                "Struct(\"f1\": )",
1486                "Error finding next type, got unexpected ')'",
1487            ),
1488        ];
1489
1490        for (data_type_string, expected_message) in cases {
1491            println!("Parsing '{data_type_string}', expecting '{expected_message}'");
1492            match parse_data_type(data_type_string) {
1493                Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"),
1494                Err(e) => {
1495                    let message = e.to_string();
1496                    assert!(
1497                        message.contains(expected_message),
1498                        "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual: {message}\n"
1499                    );
1500
1501                    if !message.contains("Unterminated string") {
1502                        // errors should also contain a help message
1503                        assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'"), "message: {message}");
1504                    }
1505                }
1506            }
1507        }
1508    }
1509
1510    #[test]
1511    fn parse_error_type() {
1512        let err = parse_data_type("foobar").unwrap_err();
1513        assert!(matches!(err, ArrowError::ParseError(_)));
1514        assert_eq!(
1515            err.to_string(),
1516            "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error unknown token: foobar"
1517        );
1518    }
1519}