Skip to main content

arrow_schema/
datatype_parse.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
20use crate::{
21    ArrowError, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, DECIMAL128_MAX_PRECISION,
22    DECIMAL256_MAX_PRECISION, DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields,
23    UnionMode,
24};
25
26/// Parses a DataType from a string representation
27///
28/// For example, the string "Int32" would be parsed into [`DataType::Int32`]
29pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
30    Parser::new(val).parse()
31}
32
33type ArrowResult<T> = Result<T, ArrowError>;
34
35fn make_error(val: &str, msg: &str) -> ArrowError {
36    let msg = format!(
37        "Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error {msg}"
38    );
39    ArrowError::ParseError(msg)
40}
41
42fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError {
43    make_error(val, &format!("Expected '{expected}', got '{actual}'"))
44}
45
46/// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs>
47#[derive(Debug)]
48struct Parser<'a> {
49    val: &'a str,
50    tokenizer: Peekable<Tokenizer<'a>>,
51}
52
53impl<'a> Parser<'a> {
54    fn new(val: &'a str) -> Self {
55        Self {
56            val,
57            tokenizer: Tokenizer::new(val).peekable(),
58        }
59    }
60
61    fn parse(mut self) -> ArrowResult<DataType> {
62        let data_type = self.parse_next_type()?;
63        // ensure that there is no trailing content
64        if self.tokenizer.next().is_some() {
65            Err(make_error(
66                self.val,
67                &format!("checking trailing content after parsing '{data_type}'"),
68            ))
69        } else {
70            Ok(data_type)
71        }
72    }
73
74    /// parses the next full DataType
75    fn parse_next_type(&mut self) -> ArrowResult<DataType> {
76        match self.next_token()? {
77            Token::SimpleType(data_type) => Ok(data_type),
78            Token::Timestamp => self.parse_timestamp(),
79            Token::Time32 => self.parse_time32(),
80            Token::Time64 => self.parse_time64(),
81            Token::Duration => self.parse_duration(),
82            Token::Interval => self.parse_interval(),
83            Token::FixedSizeBinary => self.parse_fixed_size_binary(),
84            Token::Decimal32 => self.parse_decimal_32(),
85            Token::Decimal64 => self.parse_decimal_64(),
86            Token::Decimal128 => self.parse_decimal_128(),
87            Token::Decimal256 => self.parse_decimal_256(),
88            Token::Dictionary => self.parse_dictionary(),
89            Token::List => self.parse_list(),
90            Token::ListView => self.parse_list_view(),
91            Token::LargeList => self.parse_large_list(),
92            Token::LargeListView => self.parse_large_list_view(),
93            Token::FixedSizeList => self.parse_fixed_size_list(),
94            Token::Struct => self.parse_struct(),
95            Token::Union => self.parse_union(),
96            Token::Map => self.parse_map(),
97            Token::RunEndEncoded => self.parse_run_end_encoded(),
98            tok => Err(make_error(
99                self.val,
100                &format!("finding next type, got unexpected '{tok}'"),
101            )),
102        }
103    }
104
105    /// parses Field, this is the inversion of `format_field` in `datatype_display.rs`.
106    /// E.g: "a": non-null Int64
107    ///
108    /// TODO: support metadata: `"a": non-null Int64 metadata: {"foo": "value"}`
109    fn parse_field(&mut self) -> ArrowResult<Field> {
110        let name = self.parse_double_quoted_string("Field")?;
111        self.expect_token(Token::Colon)?;
112        let nullable = self.parse_opt_nullable();
113        let data_type = self.parse_next_type()?;
114        Ok(Field::new(name, data_type, nullable))
115    }
116
117    /// Parses field inside a list. Use `Field::LIST_FIELD_DEFAULT_NAME`
118    /// if no field name is specified.
119    /// E.g: `non-null Int64, field: 'foo'` or `non-null Int64`
120    ///
121    /// TODO: support metadata: `non-ull Int64, metadata: {"foo2": "value"}`
122    fn parse_list_field(&mut self, context: &str) -> ArrowResult<Field> {
123        let nullable = self.parse_opt_nullable();
124        let data_type = self.parse_next_type()?;
125
126        // the field name (if exists) must be after a comma
127        let field_name = if self
128            .tokenizer
129            .next_if(|next| matches!(next, Ok(Token::Comma)))
130            .is_none()
131        {
132            Field::LIST_FIELD_DEFAULT_NAME.into()
133        } else {
134            // expects: `field: 'field_name'`.
135            self.expect_token(Token::Field)?;
136            self.expect_token(Token::Colon)?;
137            self.parse_single_quoted_string(context)?
138        };
139
140        Ok(Field::new(field_name, data_type, nullable))
141    }
142
143    /// Parses the List type (called after `List` has been consumed)
144    /// E.g: List(non-null Int64, field: 'foo')
145    fn parse_list(&mut self) -> ArrowResult<DataType> {
146        self.expect_token(Token::LParen)?;
147        let field = self.parse_list_field("List")?;
148        self.expect_token(Token::RParen)?;
149        Ok(DataType::List(Arc::new(field)))
150    }
151
152    /// Parses the ListView type (called after `ListView` has been consumed)
153    /// E.g: ListView(non-null Int64, field: 'foo')
154    fn parse_list_view(&mut self) -> ArrowResult<DataType> {
155        self.expect_token(Token::LParen)?;
156        let field = self.parse_list_field("ListView")?;
157        self.expect_token(Token::RParen)?;
158        Ok(DataType::ListView(Arc::new(field)))
159    }
160
161    /// Parses the LargeList type (called after `LargeList` has been consumed)
162    /// E.g: LargeList(non-null Int64, field: 'foo')
163    fn parse_large_list(&mut self) -> ArrowResult<DataType> {
164        self.expect_token(Token::LParen)?;
165        let field = self.parse_list_field("LargeList")?;
166        self.expect_token(Token::RParen)?;
167        Ok(DataType::LargeList(Arc::new(field)))
168    }
169
170    /// Parses the LargeListView type (called after `LargeListView` has been consumed)
171    /// E.g: LargeListView(non-null Int64, field: 'foo')
172    fn parse_large_list_view(&mut self) -> ArrowResult<DataType> {
173        self.expect_token(Token::LParen)?;
174        let field = self.parse_list_field("LargeListView")?;
175        self.expect_token(Token::RParen)?;
176        Ok(DataType::LargeListView(Arc::new(field)))
177    }
178
179    /// Parses the FixedSizeList type (called after `FixedSizeList` has been consumed)
180    ///
181    /// Examples:
182    /// * `FixedSizeList(5 x non-null Int64, field: 'foo')`
183    /// * `FixedSizeList(4, Int64)`
184    ///
185    fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
186        self.expect_token(Token::LParen)?;
187        let length = self.parse_i32("FixedSizeList")?;
188        if length < 0 {
189            return Err(make_error(
190                self.val,
191                &format!("FixedSizeList length must be non-negative, got {length}"),
192            ));
193        }
194        match self.next_token()? {
195            // `FixedSizeList(5 x non-null Int64, field: 'foo')` format
196            Token::X => {
197                let field = self.parse_list_field("FixedSizeList")?;
198                self.expect_token(Token::RParen)?;
199                Ok(DataType::FixedSizeList(Arc::new(field), length))
200            }
201            // `FixedSizeList(4, Int64)` format
202            Token::Comma => {
203                let data_type = self.parse_next_type()?;
204                self.expect_token(Token::RParen)?;
205                Ok(DataType::FixedSizeList(
206                    Arc::new(Field::new_list_field(data_type, true)),
207                    length,
208                ))
209            }
210            tok => Err(make_error(
211                self.val,
212                &format!("Expected 'x' or ',' after length for FixedSizeList, got '{tok}'"),
213            )),
214        }
215    }
216
217    /// Parses the next timeunit
218    fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> {
219        match self.next_token()? {
220            Token::TimeUnit(time_unit) => Ok(time_unit),
221            tok => Err(make_error(
222                self.val,
223                &format!("finding TimeUnit for {context}, got {tok}"),
224            )),
225        }
226    }
227
228    /// Parses the next double quoted string
229    fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
230        let token = self.next_token()?;
231        if let Token::DoubleQuotedString(string) = token {
232            Ok(string)
233        } else {
234            Err(make_error(
235                self.val,
236                &format!("expected double quoted string for {context}, got '{token}'"),
237            ))
238        }
239    }
240
241    /// Parses the next single quoted string
242    fn parse_single_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
243        let token = self.next_token()?;
244        if let Token::SingleQuotedString(string) = token {
245            Ok(string)
246        } else {
247            Err(make_error(
248                self.val,
249                &format!("expected single quoted string for {context}, got '{token}'"),
250            ))
251        }
252    }
253
254    /// Parses the next integer value
255    fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
256        match self.next_token()? {
257            Token::Integer(v) => Ok(v),
258            tok => Err(make_error(
259                self.val,
260                &format!("finding i64 for {context}, got '{tok}'"),
261            )),
262        }
263    }
264
265    /// Parses the next i32 integer value
266    fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> {
267        let length = self.parse_i64(context)?;
268        length.try_into().map_err(|e| {
269            make_error(
270                self.val,
271                &format!("converting {length} into i32 for {context}: {e}"),
272            )
273        })
274    }
275
276    /// Parses the next i8 integer value
277    fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> {
278        let length = self.parse_i64(context)?;
279        length.try_into().map_err(|e| {
280            make_error(
281                self.val,
282                &format!("converting {length} into i8 for {context}: {e}"),
283            )
284        })
285    }
286
287    /// Parses the next u8 integer value
288    fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> {
289        let length = self.parse_i64(context)?;
290        length.try_into().map_err(|e| {
291            make_error(
292                self.val,
293                &format!("converting {length} into u8 for {context}: {e}"),
294            )
295        })
296    }
297
298    /// Parses the next timestamp (called after `Timestamp` has been consumed)
299    fn parse_timestamp(&mut self) -> ArrowResult<DataType> {
300        self.expect_token(Token::LParen)?;
301        let time_unit = self.parse_time_unit("Timestamp")?;
302
303        let timezone;
304        match self.next_token()? {
305            Token::Comma => {
306                match self.next_token()? {
307                    // Support old style `Timestamp(Nanosecond, None)`
308                    Token::None => {
309                        timezone = None;
310                    }
311                    // Support old style `Timestamp(Nanosecond, Some("Timezone"))`
312                    Token::Some => {
313                        self.expect_token(Token::LParen)?;
314                        timezone = Some(self.parse_double_quoted_string("Timezone")?);
315                        self.expect_token(Token::RParen)?;
316                    }
317                    Token::DoubleQuotedString(tz) => {
318                        // Support new style `Timestamp(Nanosecond, "Timezone")`
319                        timezone = Some(tz);
320                    }
321                    tok => {
322                        return Err(make_error(
323                            self.val,
324                            &format!("Expected None, Some, or a timezone string, got {tok:?}"),
325                        ));
326                    }
327                };
328                self.expect_token(Token::RParen)?;
329            }
330            // No timezone (e.g `Timestamp(ns)`)
331            Token::RParen => {
332                timezone = None;
333            }
334            next_token => {
335                return Err(make_error(
336                    self.val,
337                    &format!("Expected comma followed by a timezone, or an ), got {next_token:?}"),
338                ));
339            }
340        }
341        Ok(DataType::Timestamp(time_unit, timezone.map(Into::into)))
342    }
343
344    /// Parses the next Time32 (called after `Time32` has been consumed)
345    fn parse_time32(&mut self) -> ArrowResult<DataType> {
346        self.expect_token(Token::LParen)?;
347        let time_unit = self.parse_time_unit("Time32")?;
348        match time_unit {
349            TimeUnit::Second | TimeUnit::Millisecond => (),
350            TimeUnit::Microsecond | TimeUnit::Nanosecond => {
351                return Err(make_error(
352                    self.val,
353                    &format!("Time32 time unit must be 's' or 'ms', got '{time_unit}'"),
354                ));
355            }
356        };
357        self.expect_token(Token::RParen)?;
358        Ok(DataType::Time32(time_unit))
359    }
360
361    /// Parses the next Time64 (called after `Time64` has been consumed)
362    fn parse_time64(&mut self) -> ArrowResult<DataType> {
363        self.expect_token(Token::LParen)?;
364        let time_unit = self.parse_time_unit("Time64")?;
365        match time_unit {
366            TimeUnit::Microsecond | TimeUnit::Nanosecond => (),
367            TimeUnit::Second | TimeUnit::Millisecond => {
368                return Err(make_error(
369                    self.val,
370                    &format!("Time64 time unit must be 'µs' or 'ns', got '{time_unit}'"),
371                ));
372            }
373        };
374        self.expect_token(Token::RParen)?;
375        Ok(DataType::Time64(time_unit))
376    }
377
378    /// Parses the next Duration (called after `Duration` has been consumed)
379    fn parse_duration(&mut self) -> ArrowResult<DataType> {
380        self.expect_token(Token::LParen)?;
381        let time_unit = self.parse_time_unit("Duration")?;
382        self.expect_token(Token::RParen)?;
383        Ok(DataType::Duration(time_unit))
384    }
385
386    /// Parses the next Interval (called after `Interval` has been consumed)
387    fn parse_interval(&mut self) -> ArrowResult<DataType> {
388        self.expect_token(Token::LParen)?;
389        let interval_unit = match self.next_token()? {
390            Token::IntervalUnit(interval_unit) => interval_unit,
391            tok => {
392                return Err(make_error(
393                    self.val,
394                    &format!("finding IntervalUnit for Interval, got {tok}"),
395                ));
396            }
397        };
398        self.expect_token(Token::RParen)?;
399        Ok(DataType::Interval(interval_unit))
400    }
401
402    /// Parses the next FixedSizeBinary (called after `FixedSizeBinary` has been consumed)
403    fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> {
404        self.expect_token(Token::LParen)?;
405        let length = self.parse_i32("FixedSizeBinary")?;
406        if length < 0 {
407            return Err(make_error(
408                self.val,
409                &format!("FixedSizeBinary length must be non-negative, got {length}"),
410            ));
411        }
412        self.expect_token(Token::RParen)?;
413        Ok(DataType::FixedSizeBinary(length))
414    }
415
416    fn validate_decimal(
417        &self,
418        precision: u8,
419        scale: i8,
420        type_name: &str,
421        max_precision: u8,
422    ) -> ArrowResult<()> {
423        if precision == 0 || precision > max_precision {
424            return Err(make_error(
425                self.val,
426                &format!(
427                    "{type_name} precision must be in range [1, {max_precision}], got '{precision}'"
428                ),
429            ));
430        }
431        if scale > 0 && scale as u8 > precision {
432            return Err(make_error(
433                self.val,
434                &format!(
435                    "{type_name} scale '{scale}' cannot be greater than precision '{precision}'"
436                ),
437            ));
438        }
439        Ok(())
440    }
441
442    /// Parses the next Decimal32 (called after `Decimal32` has been consumed)
443    fn parse_decimal_32(&mut self) -> ArrowResult<DataType> {
444        self.expect_token(Token::LParen)?;
445        let precision = self.parse_u8("Decimal32")?;
446        self.expect_token(Token::Comma)?;
447        let scale = self.parse_i8("Decimal32")?;
448        self.expect_token(Token::RParen)?;
449        self.validate_decimal(precision, scale, "Decimal32", DECIMAL32_MAX_PRECISION)?;
450        Ok(DataType::Decimal32(precision, scale))
451    }
452
453    /// Parses the next Decimal64 (called after `Decimal64` has been consumed)
454    fn parse_decimal_64(&mut self) -> ArrowResult<DataType> {
455        self.expect_token(Token::LParen)?;
456        let precision = self.parse_u8("Decimal64")?;
457        self.expect_token(Token::Comma)?;
458        let scale = self.parse_i8("Decimal64")?;
459        self.expect_token(Token::RParen)?;
460        self.validate_decimal(precision, scale, "Decimal64", DECIMAL64_MAX_PRECISION)?;
461        Ok(DataType::Decimal64(precision, scale))
462    }
463
464    /// Parses the next Decimal128 (called after `Decimal128` has been consumed)
465    fn parse_decimal_128(&mut self) -> ArrowResult<DataType> {
466        self.expect_token(Token::LParen)?;
467        let precision = self.parse_u8("Decimal128")?;
468        self.expect_token(Token::Comma)?;
469        let scale = self.parse_i8("Decimal128")?;
470        self.expect_token(Token::RParen)?;
471        self.validate_decimal(precision, scale, "Decimal128", DECIMAL128_MAX_PRECISION)?;
472        Ok(DataType::Decimal128(precision, scale))
473    }
474
475    /// Parses the next Decimal256 (called after `Decimal256` has been consumed)
476    fn parse_decimal_256(&mut self) -> ArrowResult<DataType> {
477        self.expect_token(Token::LParen)?;
478        let precision = self.parse_u8("Decimal256")?;
479        self.expect_token(Token::Comma)?;
480        let scale = self.parse_i8("Decimal256")?;
481        self.expect_token(Token::RParen)?;
482        self.validate_decimal(precision, scale, "Decimal256", DECIMAL256_MAX_PRECISION)?;
483        Ok(DataType::Decimal256(precision, scale))
484    }
485
486    /// Parses the next Dictionary (called after `Dictionary` has been consumed)
487    fn parse_dictionary(&mut self) -> ArrowResult<DataType> {
488        self.expect_token(Token::LParen)?;
489        let key_type = self.parse_next_type()?;
490        self.expect_token(Token::Comma)?;
491        let value_type = self.parse_next_type()?;
492        self.expect_token(Token::RParen)?;
493        Ok(DataType::Dictionary(
494            Box::new(key_type),
495            Box::new(value_type),
496        ))
497    }
498
499    /// Parses the next Struct (called after `Struct` has been consumed)
500    fn parse_struct(&mut self) -> ArrowResult<DataType> {
501        self.expect_token(Token::LParen)?;
502        let mut fields = Vec::new();
503        loop {
504            if self
505                .tokenizer
506                .next_if(|next| matches!(next, Ok(Token::RParen)))
507                .is_some()
508            {
509                break;
510            }
511
512            let field = self.parse_field()?;
513            fields.push(Arc::new(field));
514            match self.next_token()? {
515                Token::Comma => continue,
516                Token::RParen => break,
517                tok => {
518                    return Err(make_error(
519                        self.val,
520                        &format!(
521                            "Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"
522                        ),
523                    ));
524                }
525            }
526        }
527        Ok(DataType::Struct(Fields::from(fields)))
528    }
529
530    /// Parses the next Union (called after `Union` has been consumed)
531    /// E.g: Union(Sparse, 0: ("a": Int32), 1: ("b": non-null Utf8))
532    fn parse_union(&mut self) -> ArrowResult<DataType> {
533        self.expect_token(Token::LParen)?;
534        let union_mode = self.parse_union_mode()?;
535        let mut type_ids = vec![];
536        let mut fields = vec![];
537        loop {
538            if self
539                .tokenizer
540                .next_if(|next| matches!(next, Ok(Token::RParen)))
541                .is_some()
542            {
543                break;
544            }
545            self.expect_token(Token::Comma)?;
546            let (type_id, field) = self.parse_union_field()?;
547            type_ids.push(type_id);
548            fields.push(field);
549        }
550        Ok(DataType::Union(
551            UnionFields::try_new(type_ids, fields)?,
552            union_mode,
553        ))
554    }
555
556    /// Parses the next UnionMode
557    fn parse_union_mode(&mut self) -> ArrowResult<UnionMode> {
558        match self.next_token()? {
559            Token::UnionMode(union_mode) => Ok(union_mode),
560            tok => Err(make_error(
561                self.val,
562                &format!("finding UnionMode for Union, got {tok}"),
563            )),
564        }
565    }
566
567    /// Parses the next UnionField
568    /// 0: ("a": non-null Int32)
569    fn parse_union_field(&mut self) -> ArrowResult<(i8, Field)> {
570        let type_id = self.parse_i8("UnionField")?;
571        self.expect_token(Token::Colon)?;
572        self.expect_token(Token::LParen)?;
573        let field = self.parse_field()?;
574        self.expect_token(Token::RParen)?;
575        Ok((type_id, field))
576    }
577
578    /// Parses the next Map (called after `Map` has been consumed)
579    /// E.g: Map("entries": Struct("key": Utf8, "value": non-null Int32), sorted)
580    fn parse_map(&mut self) -> ArrowResult<DataType> {
581        self.expect_token(Token::LParen)?;
582        let field = self.parse_field()?;
583        self.expect_token(Token::Comma)?;
584        let sorted = self.parse_map_sorted()?;
585        self.expect_token(Token::RParen)?;
586        Ok(DataType::Map(Arc::new(field), sorted))
587    }
588
589    /// Parses map's sorted
590    fn parse_map_sorted(&mut self) -> ArrowResult<bool> {
591        match self.next_token()? {
592            Token::MapSorted(sorted) => Ok(sorted),
593            tok => Err(make_error(
594                self.val,
595                &format!("Expected sorted or unsorted for a map; got {tok:?}"),
596            )),
597        }
598    }
599
600    /// Parses the next RunEndEncoded (called after `RunEndEncoded` has been consumed)
601    /// E.g: RunEndEncoded("run_ends": UInt32, "values": nonnull Int32)
602    fn parse_run_end_encoded(&mut self) -> ArrowResult<DataType> {
603        self.expect_token(Token::LParen)?;
604        let run_ends = self.parse_field()?;
605        self.expect_token(Token::Comma)?;
606        let values = self.parse_field()?;
607        self.expect_token(Token::RParen)?;
608        Ok(DataType::RunEndEncoded(
609            Arc::new(run_ends),
610            Arc::new(values),
611        ))
612    }
613
614    /// consume the next token and return `false` if the field is `nonnull`.
615    fn parse_opt_nullable(&mut self) -> bool {
616        let tok = self
617            .tokenizer
618            .next_if(|next| matches!(next, Ok(Token::NonNull | Token::Nullable)));
619        !matches!(tok, Some(Ok(Token::NonNull)))
620    }
621
622    /// return the next token, or an error if there are none left
623    fn next_token(&mut self) -> ArrowResult<Token> {
624        match self.tokenizer.next() {
625            None => Err(make_error(self.val, "finding next token")),
626            Some(token) => token,
627        }
628    }
629
630    /// consume the next token, returning OK(()) if it matches tok, and Err if not
631    fn expect_token(&mut self, tok: Token) -> ArrowResult<()> {
632        let next_token = self.next_token()?;
633        if next_token == tok {
634            Ok(())
635        } else {
636            Err(make_error_expected(self.val, &tok, &next_token))
637        }
638    }
639}
640
/// Returns true if `c` ends a word: one of `(`, `)`, `,`, `:` or a space.
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ':' | ' ')
}
645
/// The quote character that delimits a quoted string token.
enum QuoteType {
    /// Delimited by `"`
    Double,
    /// Delimited by `'`
    Single,
}
650
/// Splits a string like `Dictionary(Int32, Int64)` into tokens suitable for parsing
///
/// For example the string "Timestamp(ns)" would be parsed into:
///
/// * Token::Timestamp
/// * Token::LParen
/// * Token::TimeUnit(TimeUnit::Nanosecond)
/// * Token::RParen
#[derive(Debug)]
struct Tokenizer<'a> {
    /// The complete input string; included in error messages
    val: &'a str,
    /// Remaining characters of `val`, peekable for one-character lookahead
    chars: Peekable<Chars<'a>>,
    /// Temporary buffer, reused while collecting the characters of a word or string
    word: String,
}
666
667impl<'a> Tokenizer<'a> {
668    fn new(val: &'a str) -> Self {
669        Self {
670            val,
671            chars: val.chars().peekable(),
672            word: String::new(),
673        }
674    }
675
676    /// returns the next char, without consuming it
677    fn peek_next_char(&mut self) -> Option<char> {
678        self.chars.peek().copied()
679    }
680
681    /// returns the next char, and consuming it
682    fn next_char(&mut self) -> Option<char> {
683        self.chars.next()
684    }
685
686    /// parse the characters in val starting at pos, until the next
687    /// `,`, `(`, or `)` or end of line
688    fn parse_word(&mut self) -> ArrowResult<Token> {
689        // reset temp space
690        self.word.clear();
691        loop {
692            match self.peek_next_char() {
693                None => break,
694                Some(c) if is_separator(c) => break,
695                Some(c) => {
696                    self.next_char();
697                    self.word.push(c);
698                }
699            }
700        }
701
702        if let Some(c) = self.word.chars().next() {
703            // if it started with a number, try parsing it as an integer
704            if c == '-' || c.is_numeric() {
705                let val: i64 = self.word.parse().map_err(|e| {
706                    make_error(self.val, &format!("parsing {} as integer: {e}", self.word))
707                })?;
708                return Ok(Token::Integer(val));
709            }
710        }
711
712        // figure out what the word was
713        let token = match self.word.as_str() {
714            "Null" => Token::SimpleType(DataType::Null),
715            "Boolean" => Token::SimpleType(DataType::Boolean),
716
717            "Int8" => Token::SimpleType(DataType::Int8),
718            "Int16" => Token::SimpleType(DataType::Int16),
719            "Int32" => Token::SimpleType(DataType::Int32),
720            "Int64" => Token::SimpleType(DataType::Int64),
721
722            "UInt8" => Token::SimpleType(DataType::UInt8),
723            "UInt16" => Token::SimpleType(DataType::UInt16),
724            "UInt32" => Token::SimpleType(DataType::UInt32),
725            "UInt64" => Token::SimpleType(DataType::UInt64),
726
727            "Utf8" => Token::SimpleType(DataType::Utf8),
728            "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
729            "Utf8View" => Token::SimpleType(DataType::Utf8View),
730            "Binary" => Token::SimpleType(DataType::Binary),
731            "BinaryView" => Token::SimpleType(DataType::BinaryView),
732            "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
733
734            "Float16" => Token::SimpleType(DataType::Float16),
735            "Float32" => Token::SimpleType(DataType::Float32),
736            "Float64" => Token::SimpleType(DataType::Float64),
737
738            "Date32" => Token::SimpleType(DataType::Date32),
739            "Date64" => Token::SimpleType(DataType::Date64),
740
741            "List" => Token::List,
742            "ListView" => Token::ListView,
743            "LargeList" => Token::LargeList,
744            "LargeListView" => Token::LargeListView,
745            "FixedSizeList" => Token::FixedSizeList,
746
747            "s" | "Second" => Token::TimeUnit(TimeUnit::Second),
748            "ms" | "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
749            "µs" | "us" | "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
750            "ns" | "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
751
752            "Timestamp" => Token::Timestamp,
753            "Time32" => Token::Time32,
754            "Time64" => Token::Time64,
755            "Duration" => Token::Duration,
756            "Interval" => Token::Interval,
757            "Dictionary" => Token::Dictionary,
758
759            "FixedSizeBinary" => Token::FixedSizeBinary,
760
761            "Decimal32" => Token::Decimal32,
762            "Decimal64" => Token::Decimal64,
763            "Decimal128" => Token::Decimal128,
764            "Decimal256" => Token::Decimal256,
765
766            "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
767            "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
768            "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
769
770            "Some" => Token::Some,
771            "None" => Token::None,
772
773            "non-null" => Token::NonNull,
774            "nullable" => Token::Nullable,
775            "field" => Token::Field,
776            "x" => Token::X,
777
778            "Struct" => Token::Struct,
779
780            "Union" => Token::Union,
781            "Sparse" => Token::UnionMode(UnionMode::Sparse),
782            "Dense" => Token::UnionMode(UnionMode::Dense),
783
784            "Map" => Token::Map,
785            "sorted" => Token::MapSorted(true),
786            "unsorted" => Token::MapSorted(false),
787
788            "RunEndEncoded" => Token::RunEndEncoded,
789
790            token => {
791                return Err(make_error(self.val, &format!("unknown token: {token}")));
792            }
793        };
794        Ok(token)
795    }
796
797    /// Parses e.g. `"foo bar"`, `'foo bar'`
798    fn parse_quoted_string(&mut self, quote_type: QuoteType) -> ArrowResult<Token> {
799        let quote = match quote_type {
800            QuoteType::Double => '\"',
801            QuoteType::Single => '\'',
802        };
803
804        if self.next_char() != Some(quote) {
805            return Err(make_error(self.val, "Expected \""));
806        }
807
808        // reset temp space
809        self.word.clear();
810
811        let mut is_escaped = false;
812
813        loop {
814            match self.next_char() {
815                None => {
816                    return Err(ArrowError::ParseError(format!(
817                        "Unterminated string at: \"{}",
818                        self.word
819                    )));
820                }
821                Some(c) => match c {
822                    '\\' => {
823                        is_escaped = true;
824                        self.word.push(c);
825                    }
826                    c if c == quote => {
827                        if is_escaped {
828                            self.word.push(c);
829                            is_escaped = false;
830                        } else {
831                            break;
832                        }
833                    }
834                    c => {
835                        self.word.push(c);
836                    }
837                },
838            }
839        }
840
841        let val: String = self.word.parse().map_err(|err| {
842            ArrowError::ParseError(format!("Failed to parse string: \"{}\": {err}", self.word))
843        })?;
844
845        if val.is_empty() {
846            // Using empty strings as field names is just asking for trouble
847            return Err(make_error(self.val, "empty strings aren't allowed"));
848        }
849
850        match quote_type {
851            QuoteType::Double => Ok(Token::DoubleQuotedString(val)),
852            QuoteType::Single => Ok(Token::SingleQuotedString(val)),
853        }
854    }
855}
856
857impl Iterator for Tokenizer<'_> {
858    type Item = ArrowResult<Token>;
859
860    fn next(&mut self) -> Option<Self::Item> {
861        loop {
862            match self.peek_next_char()? {
863                ' ' => {
864                    // skip whitespace
865                    self.next_char();
866                    continue;
867                }
868                '"' => {
869                    return Some(self.parse_quoted_string(QuoteType::Double));
870                }
871                '\'' => {
872                    return Some(self.parse_quoted_string(QuoteType::Single));
873                }
874                '(' => {
875                    self.next_char();
876                    return Some(Ok(Token::LParen));
877                }
878                ')' => {
879                    self.next_char();
880                    return Some(Ok(Token::RParen));
881                }
882                ',' => {
883                    self.next_char();
884                    return Some(Ok(Token::Comma));
885                }
886                ':' => {
887                    self.next_char();
888                    return Some(Ok(Token::Colon));
889                }
890                _ => return Some(self.parse_word()),
891            }
892        }
893    }
894}
895
/// A lexical token produced by [`Tokenizer`] while scanning the string
/// representation of a data type.
///
/// Most variants are plain keywords; the text each one stands for is given
/// by the [`Display`] implementation of this type.
#[derive(Debug, PartialEq)]
enum Token {
    /// A type that stands on its own, e.g. `Null` or `Int32`
    SimpleType(DataType),
    /// The `Timestamp` keyword
    Timestamp,
    /// The `Time32` keyword
    Time32,
    /// The `Time64` keyword
    Time64,
    /// The `Duration` keyword
    Duration,
    /// The `Interval` keyword
    Interval,
    /// The `FixedSizeBinary` keyword
    FixedSizeBinary,
    /// The `Decimal32` keyword
    Decimal32,
    /// The `Decimal64` keyword
    Decimal64,
    /// The `Decimal128` keyword
    Decimal128,
    /// The `Decimal256` keyword
    Decimal256,
    /// The `Dictionary` keyword
    Dictionary,
    /// A time unit, e.g. `ms` / `Millisecond` or `ns` / `Nanosecond`
    TimeUnit(TimeUnit),
    /// An interval unit: `YearMonth`, `DayTime` or `MonthDayNano`
    IntervalUnit(IntervalUnit),
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// The `Some` keyword
    Some,
    /// The `None` keyword
    None,
    /// An integer value
    Integer(i64),
    /// Contents of a `"..."` string, without the surrounding quotes
    DoubleQuotedString(String),
    /// Contents of a `'...'` string, without the surrounding quotes
    SingleQuotedString(String),
    /// The `List` keyword
    List,
    /// The `ListView` keyword
    ListView,
    /// The `LargeList` keyword
    LargeList,
    /// The `LargeListView` keyword
    LargeListView,
    /// The `FixedSizeList` keyword
    FixedSizeList,
    /// The `Struct` keyword
    Struct,
    /// The `Union` keyword
    Union,
    /// A union mode: `Sparse` or `Dense`
    UnionMode(UnionMode),
    /// The `Map` keyword
    Map,
    /// `sorted` (`true`) or `unsorted` (`false`)
    MapSorted(bool),
    /// The `RunEndEncoded` keyword
    RunEndEncoded,
    /// The `non-null` keyword
    NonNull,
    /// The `nullable` keyword
    Nullable,
    /// The `field` keyword
    Field,
    /// The `x` keyword
    X,
}
940
941impl Display for Token {
942    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
943        match self {
944            Token::SimpleType(t) => write!(f, "{t}"),
945            Token::List => write!(f, "List"),
946            Token::ListView => write!(f, "ListView"),
947            Token::LargeList => write!(f, "LargeList"),
948            Token::LargeListView => write!(f, "LargeListView"),
949            Token::FixedSizeList => write!(f, "FixedSizeList"),
950            Token::Timestamp => write!(f, "Timestamp"),
951            Token::Time32 => write!(f, "Time32"),
952            Token::Time64 => write!(f, "Time64"),
953            Token::Duration => write!(f, "Duration"),
954            Token::Interval => write!(f, "Interval"),
955            Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
956            Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
957            Token::LParen => write!(f, "("),
958            Token::RParen => write!(f, ")"),
959            Token::Comma => write!(f, ","),
960            Token::Colon => write!(f, ":"),
961            Token::Some => write!(f, "Some"),
962            Token::None => write!(f, "None"),
963            Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
964            Token::Decimal32 => write!(f, "Decimal32"),
965            Token::Decimal64 => write!(f, "Decimal64"),
966            Token::Decimal128 => write!(f, "Decimal128"),
967            Token::Decimal256 => write!(f, "Decimal256"),
968            Token::Dictionary => write!(f, "Dictionary"),
969            Token::Integer(v) => write!(f, "Integer({v})"),
970            Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
971            Token::SingleQuotedString(s) => write!(f, "SingleQuotedString({s})"),
972            Token::Struct => write!(f, "Struct"),
973            Token::Union => write!(f, "Union"),
974            Token::UnionMode(m) => write!(f, "{m:?}"),
975            Token::Map => write!(f, "Map"),
976            Token::MapSorted(sorted) => {
977                write!(f, "{}", if *sorted { "sorted" } else { "unsorted" })
978            }
979            Token::RunEndEncoded => write!(f, "RunEndEncoded"),
980            Token::NonNull => write!(f, "non-null"),
981            Token::Nullable => write!(f, "nullable"),
982            Token::Field => write!(f, "field"),
983            Token::X => write!(f, "x"),
984        }
985    }
986}
987
988#[cfg(test)]
989mod test {
990    use super::*;
991
992    #[test]
993    fn test_parse_data_type() {
994        // this ensures types can be parsed correctly from their string representations
995        for dt in list_datatypes() {
996            round_trip(dt)
997        }
998    }
999
1000    /// Ensure we converting data_type to a string, and then parse it as a type
1001    /// verifying it is the same
1002    fn round_trip(data_type: DataType) {
1003        let data_type_string = data_type.to_string();
1004        println!("Input '{data_type_string}' ({data_type:?})");
1005        let parsed_type = parse_data_type(&data_type_string).unwrap();
1006        assert_eq!(
1007            data_type, parsed_type,
1008            "Mismatch parsing {data_type_string}"
1009        );
1010    }
1011
    /// Returns the data types exercised by the tests in this module: the non
    /// nested types first, followed by nested types built from them.
    fn list_datatypes() -> Vec<DataType> {
        vec![
            // ---------
            // Non Nested types
            // ---------
            DataType::Null,
            DataType::Boolean,
            DataType::Int8,
            DataType::Int16,
            DataType::Int32,
            DataType::Int64,
            DataType::UInt8,
            DataType::UInt16,
            DataType::UInt32,
            DataType::UInt64,
            DataType::Float16,
            DataType::Float32,
            DataType::Float64,
            // timestamps without a timezone, one per time unit
            DataType::Timestamp(TimeUnit::Second, None),
            DataType::Timestamp(TimeUnit::Millisecond, None),
            DataType::Timestamp(TimeUnit::Microsecond, None),
            DataType::Timestamp(TimeUnit::Nanosecond, None),
            // we can't cover all possible timezones, here we only test utc and +08:00
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
            DataType::Date32,
            DataType::Date64,
            DataType::Time32(TimeUnit::Second),
            DataType::Time32(TimeUnit::Millisecond),
            DataType::Time64(TimeUnit::Microsecond),
            DataType::Time64(TimeUnit::Nanosecond),
            DataType::Duration(TimeUnit::Second),
            DataType::Duration(TimeUnit::Millisecond),
            DataType::Duration(TimeUnit::Microsecond),
            DataType::Duration(TimeUnit::Nanosecond),
            DataType::Interval(IntervalUnit::YearMonth),
            DataType::Interval(IntervalUnit::DayTime),
            DataType::Interval(IntervalUnit::MonthDayNano),
            DataType::Binary,
            DataType::BinaryView,
            DataType::FixedSizeBinary(0),
            DataType::FixedSizeBinary(1234),
            DataType::LargeBinary,
            DataType::Utf8,
            DataType::Utf8View,
            DataType::LargeUtf8,
            DataType::Decimal32(7, 6),
            DataType::Decimal64(6, 5),
            DataType::Decimal128(7, 6),
            DataType::Decimal256(6, 5),
            // ---------
            // Nested types
            // ---------
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::FixedSizeBinary(23)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(
                    // nested dictionaries are probably a bad idea but they are possible
                    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
                ),
            ),
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new("f2", DataType::Float64, true),
                Field::new(
                    "f3",
                    DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
                    true,
                ),
                Field::new(
                    "f4",
                    DataType::Dictionary(
                        Box::new(DataType::Int8),
                        Box::new(DataType::FixedSizeBinary(23)),
                    ),
                    true,
                ),
            ])),
            // field names that are spelled like type names
            DataType::Struct(Fields::from(vec![
                Field::new("Int64", DataType::Int64, true),
                Field::new("Float64", DataType::Float64, true),
            ])),
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new(
                    "nested_struct",
                    DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
                    true,
                ),
            ])),
            DataType::Struct(Fields::from(vec![Field::new("f1", DataType::Int64, true)])),
            DataType::Struct(Fields::empty()),
            // each list flavour below: nullable / non-null element field,
            // a custom field name, and one level of nesting
            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::List(Arc::new(Field::new(
                "nested_list",
                DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::ListView(Arc::new(Field::new(
                "nested_list_view",
                DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::LargeList(Arc::new(Field::new(
                "nested_large_list",
                DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::LargeListView(Arc::new(Field::new(
                "nested_large_list_view",
                DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)), 2),
            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, false)), 2),
            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, true)), 2),
            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, false)), 2),
            DataType::FixedSizeList(
                Arc::new(Field::new(
                    "nested_fixed_size_list",
                    DataType::FixedSizeList(
                        Arc::new(Field::new("Int64", DataType::Int64, true)),
                        2,
                    ),
                    true,
                )),
                2,
            ),
            // unions: both modes, a nested union, a single field and no fields
            DataType::Union(
                UnionFields::from_fields(vec![
                    Field::new("Int32", DataType::Int32, false),
                    Field::new("Utf8", DataType::Utf8, true),
                ]),
                UnionMode::Sparse,
            ),
            DataType::Union(
                UnionFields::from_fields(vec![
                    Field::new("Int32", DataType::Int32, false),
                    Field::new("Utf8", DataType::Utf8, true),
                ]),
                UnionMode::Dense,
            ),
            DataType::Union(
                UnionFields::from_fields(vec![
                    Field::new_union(
                        "nested_union",
                        vec![0, 1],
                        vec![
                            Field::new("Int32", DataType::Int32, false),
                            Field::new("Utf8", DataType::Utf8, true),
                        ],
                        UnionMode::Dense,
                    ),
                    Field::new("Utf8", DataType::Utf8, true),
                ]),
                UnionMode::Sparse,
            ),
            DataType::Union(
                UnionFields::from_fields(vec![Field::new("Int32", DataType::Int32, false)]),
                UnionMode::Dense,
            ),
            DataType::Union(
                UnionFields::try_new(Vec::<i8>::new(), Vec::<Field>::new()).unwrap(),
                UnionMode::Sparse,
            ),
            // maps: the trailing bool is the `sorted` flag
            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), true),
            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), false),
            DataType::Map(
                Arc::new(Field::new_map(
                    "nested_map",
                    "entries",
                    Field::new("key", DataType::Utf8, false),
                    Field::new("value", DataType::Int32, true),
                    false,
                    true,
                )),
                true,
            ),
            // run end encoded: first field is the run ends, second the values
            DataType::RunEndEncoded(
                Arc::new(Field::new("run_ends", DataType::UInt32, false)),
                Arc::new(Field::new("values", DataType::Int32, true)),
            ),
            DataType::RunEndEncoded(
                Arc::new(Field::new(
                    "nested_run_end_encoded",
                    DataType::RunEndEncoded(
                        Arc::new(Field::new("run_ends", DataType::UInt32, false)),
                        Arc::new(Field::new("values", DataType::Int32, true)),
                    ),
                    true,
                )),
                Arc::new(Field::new("values", DataType::Int32, true)),
            ),
        ]
    }
1237
1238    #[test]
1239    fn test_parse_data_type_whitespace_tolerance() {
1240        // (string to parse, expected DataType)
1241        let cases = [
1242            ("Int8", DataType::Int8),
1243            (
1244                "Timestamp        (ns)",
1245                DataType::Timestamp(TimeUnit::Nanosecond, None),
1246            ),
1247            (
1248                "Timestamp        (ns)  ",
1249                DataType::Timestamp(TimeUnit::Nanosecond, None),
1250            ),
1251            (
1252                "          Timestamp        (ns               )",
1253                DataType::Timestamp(TimeUnit::Nanosecond, None),
1254            ),
1255            (
1256                "Timestamp        (ns               )  ",
1257                DataType::Timestamp(TimeUnit::Nanosecond, None),
1258            ),
1259        ];
1260
1261        for (data_type_string, expected_data_type) in cases {
1262            let parsed_data_type = parse_data_type(data_type_string).unwrap();
1263            assert_eq!(
1264                parsed_data_type, expected_data_type,
1265                "Parsing '{data_type_string}', expecting '{expected_data_type}'"
1266            );
1267        }
1268    }
1269
    /// Ensure that old style types can still be parsed
    ///
    /// Each case pairs a string with the `DataType` that `parse_data_type`
    /// must return for it.
    #[test]
    fn test_parse_data_type_backwards_compatibility() {
        use DataType::*;
        use IntervalUnit::*;
        use TimeUnit::*;
        // The case list below was created from the output of this loop, which
        // prints every entry of `list_datatypes()` as a `("<Display>", <Debug>),`
        // tuple. The loop still runs (and prints) on every execution of the test.
        for t in list_datatypes() {
            println!(r#"("{t}", {t:?}),"#);
        }
        // (string to parse, expected DataType)
        let cases = [
            // spelled-out time units with an explicit `None` timezone
            ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
            ("Timestamp(Microsecond, None)", Timestamp(Microsecond, None)),
            ("Timestamp(Millisecond, None)", Timestamp(Millisecond, None)),
            ("Timestamp(Second, None)", Timestamp(Second, None)),
            // NOTE(review): same case as the first entry of this list
            ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
            // Timezones
            (
                r#"Timestamp(Nanosecond, Some("+00:00"))"#,
                Timestamp(Nanosecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(Microsecond, Some("+00:00"))"#,
                Timestamp(Microsecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(Millisecond, Some("+00:00"))"#,
                Timestamp(Millisecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(Second, Some("+00:00"))"#,
                Timestamp(Second, Some("+00:00".into())),
            ),
            ("Null", Null),
            ("Boolean", Boolean),
            ("Int8", Int8),
            ("Int16", Int16),
            ("Int32", Int32),
            ("Int64", Int64),
            ("UInt8", UInt8),
            ("UInt16", UInt16),
            ("UInt32", UInt32),
            ("UInt64", UInt64),
            ("Float16", Float16),
            ("Float32", Float32),
            ("Float64", Float64),
            // abbreviated time units, timezone given as a bare quoted string
            ("Timestamp(s)", Timestamp(Second, None)),
            ("Timestamp(ms)", Timestamp(Millisecond, None)),
            ("Timestamp(µs)", Timestamp(Microsecond, None)),
            ("Timestamp(ns)", Timestamp(Nanosecond, None)),
            (
                r#"Timestamp(ns, "+00:00")"#,
                Timestamp(Nanosecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(µs, "+00:00")"#,
                Timestamp(Microsecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(ms, "+00:00")"#,
                Timestamp(Millisecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(s, "+00:00")"#,
                Timestamp(Second, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(ns, "+08:00")"#,
                Timestamp(Nanosecond, Some("+08:00".into())),
            ),
            (
                r#"Timestamp(µs, "+08:00")"#,
                Timestamp(Microsecond, Some("+08:00".into())),
            ),
            (
                r#"Timestamp(ms, "+08:00")"#,
                Timestamp(Millisecond, Some("+08:00".into())),
            ),
            (
                r#"Timestamp(s, "+08:00")"#,
                Timestamp(Second, Some("+08:00".into())),
            ),
            ("Date32", Date32),
            ("Date64", Date64),
            ("Time32(s)", Time32(Second)),
            ("Time32(ms)", Time32(Millisecond)),
            ("Time64(µs)", Time64(Microsecond)),
            ("Time64(ns)", Time64(Nanosecond)),
            ("Duration(s)", Duration(Second)),
            ("Duration(ms)", Duration(Millisecond)),
            ("Duration(µs)", Duration(Microsecond)),
            ("Duration(ns)", Duration(Nanosecond)),
            ("Interval(YearMonth)", Interval(YearMonth)),
            ("Interval(DayTime)", Interval(DayTime)),
            ("Interval(MonthDayNano)", Interval(MonthDayNano)),
            ("Binary", Binary),
            ("BinaryView", BinaryView),
            ("FixedSizeBinary(0)", FixedSizeBinary(0)),
            ("FixedSizeBinary(1234)", FixedSizeBinary(1234)),
            ("LargeBinary", LargeBinary),
            ("Utf8", Utf8),
            ("Utf8View", Utf8View),
            ("LargeUtf8", LargeUtf8),
            ("Decimal32(7, 6)", Decimal32(7, 6)),
            ("Decimal64(6, 5)", Decimal64(6, 5)),
            ("Decimal128(7, 6)", Decimal128(7, 6)),
            ("Decimal256(6, 5)", Decimal256(6, 5)),
            (
                "Dictionary(Int32, Utf8)",
                Dictionary(Box::new(Int32), Box::new(Utf8)),
            ),
            (
                "Dictionary(Int8, Utf8)",
                Dictionary(Box::new(Int8), Box::new(Utf8)),
            ),
            (
                "Dictionary(Int8, Timestamp(ns))",
                Dictionary(Box::new(Int8), Box::new(Timestamp(Nanosecond, None))),
            ),
            (
                "Dictionary(Int8, FixedSizeBinary(23))",
                Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
            ),
            (
                "Dictionary(Int8, Dictionary(Int8, Utf8))",
                Dictionary(
                    Box::new(Int8),
                    Box::new(Dictionary(Box::new(Int8), Box::new(Utf8))),
                ),
            ),
            (
                r#"Struct("f1": nullable Int64, "f2": nullable Float64, "f3": nullable Timestamp(s, "+08:00"), "f4": nullable Dictionary(Int8, FixedSizeBinary(23)))"#,
                Struct(Fields::from(vec![
                    Field::new("f1", Int64, true),
                    Field::new("f2", Float64, true),
                    Field::new("f3", Timestamp(Second, Some("+08:00".into())), true),
                    Field::new(
                        "f4",
                        Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
                        true,
                    ),
                ])),
            ),
            (
                r#"Struct("Int64": nullable Int64, "Float64": nullable Float64)"#,
                Struct(Fields::from(vec![
                    Field::new("Int64", Int64, true),
                    Field::new("Float64", Float64, true),
                ])),
            ),
            (
                r#"Struct("f1": nullable Int64, "nested_struct": nullable Struct("n1": nullable Int64))"#,
                Struct(Fields::from(vec![
                    Field::new("f1", Int64, true),
                    Field::new(
                        "nested_struct",
                        Struct(Fields::from(vec![Field::new("n1", Int64, true)])),
                        true,
                    ),
                ])),
            ),
            (r#"Struct()"#, Struct(Fields::empty())),
            // list types given only an element type are expected to produce
            // a nullable list field
            (
                "FixedSizeList(4, Int64)",
                FixedSizeList(Arc::new(Field::new_list_field(Int64, true)), 4),
            ),
            (
                "List(Int64)",
                List(Arc::new(Field::new_list_field(Int64, true))),
            ),
            (
                "LargeList(Int64)",
                LargeList(Arc::new(Field::new_list_field(Int64, true))),
            ),
        ];

        for (data_type_string, expected_data_type) in cases {
            let parsed_data_type = parse_data_type(data_type_string).unwrap();
            assert_eq!(
                parsed_data_type, expected_data_type,
                "Parsing '{data_type_string}', expecting '{expected_data_type}'"
            );
        }
    }
1455
1456    #[test]
1457    fn parse_data_type_errors() {
1458        // (string to parse, expected error message)
1459        let cases = [
1460            ("", "Unsupported type ''"),
1461            ("", "Error finding next token"),
1462            ("null", "Unsupported type 'null'"),
1463            ("Nu", "Unsupported type 'Nu'"),
1464            (r#"Timestamp(ns, +00:00)"#, "Error unknown token: +00"),
1465            (
1466                r#"Timestamp(ns, "+00:00)"#,
1467                r#"Unterminated string at: "+00:00)"#,
1468            ),
1469            (r#"Timestamp(ns, "")"#, r#"empty strings aren't allowed"#),
1470            (
1471                r#"Timestamp(ns, "+00:00"")"#,
1472                r#"Parser error: Unterminated string at: ")"#,
1473            ),
1474            ("Timestamp(ns, ", "Error finding next token"),
1475            (
1476                "Float32 Float32",
1477                "trailing content after parsing 'Float32'",
1478            ),
1479            ("Int32, ", "trailing content after parsing 'Int32'"),
1480            ("Int32(3), ", "trailing content after parsing 'Int32'"),
1481            (
1482                "FixedSizeBinary(Int32), ",
1483                "Error finding i64 for FixedSizeBinary, got 'Int32'",
1484            ),
1485            (
1486                "FixedSizeBinary(3.0), ",
1487                "Error parsing 3.0 as integer: invalid digit found in string",
1488            ),
1489            // too large for i32
1490            (
1491                "FixedSizeBinary(4000000000), ",
1492                "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted",
1493            ),
1494            // can't have negative width
1495            (
1496                "FixedSizeBinary(-1), ",
1497                "FixedSizeBinary length must be non-negative, got -1",
1498            ),
1499            (
1500                "FixedSizeList(-1, Int64), ",
1501                "FixedSizeList length must be non-negative, got -1",
1502            ),
1503            // can't have negative precision
1504            (
1505                "Decimal32(-3, 5)",
1506                "Error converting -3 into u8 for Decimal32: out of range integral type conversion attempted",
1507            ),
1508            (
1509                "Decimal64(-3, 5)",
1510                "Error converting -3 into u8 for Decimal64: out of range integral type conversion attempted",
1511            ),
1512            (
1513                "Decimal128(-3, 5)",
1514                "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted",
1515            ),
1516            (
1517                "Decimal256(-3, 5)",
1518                "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted",
1519            ),
1520            (
1521                "Decimal32(3, 500)",
1522                "Error converting 500 into i8 for Decimal32: out of range integral type conversion attempted",
1523            ),
1524            (
1525                "Decimal64(3, 500)",
1526                "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted",
1527            ),
1528            (
1529                "Decimal128(3, 500)",
1530                "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted",
1531            ),
1532            (
1533                "Decimal256(3, 500)",
1534                "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted",
1535            ),
1536            ("Struct(f1 Int64)", "Error unknown token: f1"),
1537            ("Struct(\"f1\" Int64)", "Expected ':'"),
1538            (
1539                "Struct(\"f1\": )",
1540                "Error finding next type, got unexpected ')'",
1541            ),
1542            // Invalid time combinations
1543            (
1544                "Time32(µs)",
1545                "Error Time32 time unit must be 's' or 'ms', got 'µs'",
1546            ),
1547            (
1548                "Time32(ns)",
1549                "Error Time32 time unit must be 's' or 'ms', got 'ns'",
1550            ),
1551            (
1552                "Time64(s)",
1553                "Error Time64 time unit must be 'µs' or 'ns', got 's'",
1554            ),
1555            (
1556                "Time64(ms)",
1557                "Error Time64 time unit must be 'µs' or 'ns', got 'ms'",
1558            ),
1559            // Decimals can't have scale exceeding precision
1560            (
1561                "Decimal32(5, 6)",
1562                "Error Decimal32 scale '6' cannot be greater than precision '5'",
1563            ),
1564            (
1565                "Decimal64(5, 6)",
1566                "Error Decimal64 scale '6' cannot be greater than precision '5'",
1567            ),
1568            (
1569                "Decimal128(5, 6)",
1570                "Error Decimal128 scale '6' cannot be greater than precision '5'",
1571            ),
1572            (
1573                "Decimal256(5, 6)",
1574                "Error Decimal256 scale '6' cannot be greater than precision '5'",
1575            ),
1576            // Decimals have a max supported precision
1577            (
1578                "Decimal32(10, 0)",
1579                "Error Decimal32 precision must be in range [1, 9], got '10'",
1580            ),
1581            (
1582                "Decimal64(19, 0)",
1583                "Error Decimal64 precision must be in range [1, 18], got '19'",
1584            ),
1585            (
1586                "Decimal128(39, 0)",
1587                "Error Decimal128 precision must be in range [1, 38], got '39'",
1588            ),
1589            (
1590                "Decimal256(77, 0)",
1591                "Error Decimal256 precision must be in range [1, 76], got '77'",
1592            ),
1593            // Decimals precision can't be 0
1594            (
1595                "Decimal32(0, 0)",
1596                "Error Decimal32 precision must be in range [1, 9], got '0'",
1597            ),
1598            (
1599                "Decimal64(0, 0)",
1600                "Error Decimal64 precision must be in range [1, 18], got '0'",
1601            ),
1602            (
1603                "Decimal128(0, 0)",
1604                "Error Decimal128 precision must be in range [1, 38], got '0'",
1605            ),
1606            (
1607                "Decimal256(0, 0)",
1608                "Error Decimal256 precision must be in range [1, 76], got '0'",
1609            ),
1610        ];
1611
1612        for (data_type_string, expected_message) in cases {
1613            println!("Parsing '{data_type_string}', expecting '{expected_message}'");
1614            match parse_data_type(data_type_string) {
1615                Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"),
1616                Err(e) => {
1617                    let message = e.to_string();
1618                    assert!(
1619                        message.contains(expected_message),
1620                        "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual: {message}\n"
1621                    );
1622
1623                    if !message.contains("Unterminated string") {
1624                        // errors should also contain a help message
1625                        assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'"), "message: {message}");
1626                    }
1627                }
1628            }
1629        }
1630    }
1631
/// An unrecognized type name must be reported as `ArrowError::ParseError`, and the
/// rendered message must include the offending input together with the help text.
#[test]
fn parse_error_type() {
    let expected_message = "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error unknown token: foobar";

    let parse_failure = parse_data_type("foobar").unwrap_err();

    // Check the variant before the text so a wrong error kind is reported first.
    assert!(matches!(parse_failure, ArrowError::ParseError(_)));
    assert_eq!(parse_failure.to_string(), expected_message);
}
1641}