1use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
20use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit};
21
22pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
23 Parser::new(val).parse()
24}
25
26type ArrowResult<T> = Result<T, ArrowError>;
27
28fn make_error(val: &str, msg: &str) -> ArrowError {
29 let msg = format!("Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error {msg}" );
30 ArrowError::ParseError(msg)
31}
32
33fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError {
34 make_error(val, &format!("Expected '{expected}', got '{actual}'"))
35}
36
/// Parser that consumes the tokens of a type string and builds the
/// corresponding `DataType`.
#[derive(Debug)]
struct Parser<'a> {
    /// The complete input string; quoted in every error message.
    val: &'a str,
    /// Token stream over `val`.
    tokenizer: Tokenizer<'a>,
}
43
44impl<'a> Parser<'a> {
45 fn new(val: &'a str) -> Self {
46 Self {
47 val,
48 tokenizer: Tokenizer::new(val),
49 }
50 }
51
52 fn parse(mut self) -> ArrowResult<DataType> {
53 let data_type = self.parse_next_type()?;
54 if self.tokenizer.next().is_some() {
56 Err(make_error(
57 self.val,
58 &format!("checking trailing content after parsing '{data_type}'"),
59 ))
60 } else {
61 Ok(data_type)
62 }
63 }
64
65 fn parse_next_type(&mut self) -> ArrowResult<DataType> {
67 match self.next_token()? {
68 Token::SimpleType(data_type) => Ok(data_type),
69 Token::Timestamp => self.parse_timestamp(),
70 Token::Time32 => self.parse_time32(),
71 Token::Time64 => self.parse_time64(),
72 Token::Duration => self.parse_duration(),
73 Token::Interval => self.parse_interval(),
74 Token::FixedSizeBinary => self.parse_fixed_size_binary(),
75 Token::Decimal128 => self.parse_decimal_128(),
76 Token::Decimal256 => self.parse_decimal_256(),
77 Token::Dictionary => self.parse_dictionary(),
78 Token::List => self.parse_list(),
79 Token::LargeList => self.parse_large_list(),
80 Token::FixedSizeList => self.parse_fixed_size_list(),
81 Token::Struct => self.parse_struct(),
82 Token::FieldName(word) => Err(make_error(
83 self.val,
84 &format!("unrecognized word: {}", word),
85 )),
86 tok => Err(make_error(
87 self.val,
88 &format!("finding next type, got unexpected '{tok}'"),
89 )),
90 }
91 }
92
93 fn parse_list(&mut self) -> ArrowResult<DataType> {
95 self.expect_token(Token::LParen)?;
96 let data_type = self.parse_next_type()?;
97 self.expect_token(Token::RParen)?;
98 Ok(DataType::List(Arc::new(Field::new_list_field(
99 data_type, true,
100 ))))
101 }
102
103 fn parse_large_list(&mut self) -> ArrowResult<DataType> {
105 self.expect_token(Token::LParen)?;
106 let data_type = self.parse_next_type()?;
107 self.expect_token(Token::RParen)?;
108 Ok(DataType::LargeList(Arc::new(Field::new_list_field(
109 data_type, true,
110 ))))
111 }
112
113 fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
115 self.expect_token(Token::LParen)?;
116 let length = self.parse_i32("FixedSizeList")?;
117 self.expect_token(Token::Comma)?;
118 let data_type = self.parse_next_type()?;
119 self.expect_token(Token::RParen)?;
120 Ok(DataType::FixedSizeList(
121 Arc::new(Field::new_list_field(data_type, true)),
122 length,
123 ))
124 }
125
126 fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> {
128 match self.next_token()? {
129 Token::TimeUnit(time_unit) => Ok(time_unit),
130 tok => Err(make_error(
131 self.val,
132 &format!("finding TimeUnit for {context}, got {tok}"),
133 )),
134 }
135 }
136
137 fn parse_timezone(&mut self, context: &str) -> ArrowResult<Option<String>> {
139 match self.next_token()? {
140 Token::None => Ok(None),
141 Token::Some => {
142 self.expect_token(Token::LParen)?;
143 let timezone = self.parse_double_quoted_string("Timezone")?;
144 self.expect_token(Token::RParen)?;
145 Ok(Some(timezone))
146 }
147 tok => Err(make_error(
148 self.val,
149 &format!("finding Timezone for {context}, got {tok}"),
150 )),
151 }
152 }
153
154 fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
156 match self.next_token()? {
157 Token::DoubleQuotedString(s) => Ok(s),
158 Token::FieldName(word) => Err(make_error(
159 self.val,
160 &format!("unrecognized word: {}", word),
161 )),
162 tok => Err(make_error(
163 self.val,
164 &format!("finding double quoted string for {context}, got '{tok}'"),
165 )),
166 }
167 }
168
169 fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
171 match self.next_token()? {
172 Token::Integer(v) => Ok(v),
173 tok => Err(make_error(
174 self.val,
175 &format!("finding i64 for {context}, got '{tok}'"),
176 )),
177 }
178 }
179
180 fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> {
182 let length = self.parse_i64(context)?;
183 length.try_into().map_err(|e| {
184 make_error(
185 self.val,
186 &format!("converting {length} into i32 for {context}: {e}"),
187 )
188 })
189 }
190
191 fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> {
193 let length = self.parse_i64(context)?;
194 length.try_into().map_err(|e| {
195 make_error(
196 self.val,
197 &format!("converting {length} into i8 for {context}: {e}"),
198 )
199 })
200 }
201
202 fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> {
204 let length = self.parse_i64(context)?;
205 length.try_into().map_err(|e| {
206 make_error(
207 self.val,
208 &format!("converting {length} into u8 for {context}: {e}"),
209 )
210 })
211 }
212
213 fn parse_timestamp(&mut self) -> ArrowResult<DataType> {
215 self.expect_token(Token::LParen)?;
216 let time_unit = self.parse_time_unit("Timestamp")?;
217 self.expect_token(Token::Comma)?;
218 let timezone = self.parse_timezone("Timestamp")?;
219 self.expect_token(Token::RParen)?;
220 Ok(DataType::Timestamp(time_unit, timezone.map(Into::into)))
221 }
222
223 fn parse_time32(&mut self) -> ArrowResult<DataType> {
225 self.expect_token(Token::LParen)?;
226 let time_unit = self.parse_time_unit("Time32")?;
227 self.expect_token(Token::RParen)?;
228 Ok(DataType::Time32(time_unit))
229 }
230
231 fn parse_time64(&mut self) -> ArrowResult<DataType> {
233 self.expect_token(Token::LParen)?;
234 let time_unit = self.parse_time_unit("Time64")?;
235 self.expect_token(Token::RParen)?;
236 Ok(DataType::Time64(time_unit))
237 }
238
239 fn parse_duration(&mut self) -> ArrowResult<DataType> {
241 self.expect_token(Token::LParen)?;
242 let time_unit = self.parse_time_unit("Duration")?;
243 self.expect_token(Token::RParen)?;
244 Ok(DataType::Duration(time_unit))
245 }
246
247 fn parse_interval(&mut self) -> ArrowResult<DataType> {
249 self.expect_token(Token::LParen)?;
250 let interval_unit = match self.next_token()? {
251 Token::IntervalUnit(interval_unit) => interval_unit,
252 tok => {
253 return Err(make_error(
254 self.val,
255 &format!("finding IntervalUnit for Interval, got {tok}"),
256 ))
257 }
258 };
259 self.expect_token(Token::RParen)?;
260 Ok(DataType::Interval(interval_unit))
261 }
262
263 fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> {
265 self.expect_token(Token::LParen)?;
266 let length = self.parse_i32("FixedSizeBinary")?;
267 self.expect_token(Token::RParen)?;
268 Ok(DataType::FixedSizeBinary(length))
269 }
270
271 fn parse_decimal_128(&mut self) -> ArrowResult<DataType> {
273 self.expect_token(Token::LParen)?;
274 let precision = self.parse_u8("Decimal128")?;
275 self.expect_token(Token::Comma)?;
276 let scale = self.parse_i8("Decimal128")?;
277 self.expect_token(Token::RParen)?;
278 Ok(DataType::Decimal128(precision, scale))
279 }
280
281 fn parse_decimal_256(&mut self) -> ArrowResult<DataType> {
283 self.expect_token(Token::LParen)?;
284 let precision = self.parse_u8("Decimal256")?;
285 self.expect_token(Token::Comma)?;
286 let scale = self.parse_i8("Decimal256")?;
287 self.expect_token(Token::RParen)?;
288 Ok(DataType::Decimal256(precision, scale))
289 }
290
291 fn parse_dictionary(&mut self) -> ArrowResult<DataType> {
293 self.expect_token(Token::LParen)?;
294 let key_type = self.parse_next_type()?;
295 self.expect_token(Token::Comma)?;
296 let value_type = self.parse_next_type()?;
297 self.expect_token(Token::RParen)?;
298 Ok(DataType::Dictionary(
299 Box::new(key_type),
300 Box::new(value_type),
301 ))
302 }
303 fn parse_struct(&mut self) -> ArrowResult<DataType> {
304 self.expect_token(Token::LParen)?;
305 let mut fields = Vec::new();
306 loop {
307 let field_name = match self.next_token()? {
308 Token::SimpleType(data_type) => data_type.to_string(),
310 Token::FieldName(name) => name,
311 Token::RParen => {
312 if fields.is_empty() {
313 break;
314 } else {
315 return Err(make_error(
316 self.val,
317 "Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma",
318 ));
319 }
320 }
321 tok => {
322 return Err(make_error(
323 self.val,
324 &format!("Expected a word for the name of Struct, but got {tok}"),
325 ))
326 }
327 };
328 let field_type = self.parse_next_type()?;
329 fields.push(Arc::new(Field::new(field_name, field_type, true)));
330 match self.next_token()? {
331 Token::Comma => continue,
332 Token::RParen => break,
333 tok => {
334 return Err(make_error(
335 self.val,
336 &format!("Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"),
337 ))
338 }
339 }
340 }
341 Ok(DataType::Struct(Fields::from(fields)))
342 }
343
344 fn next_token(&mut self) -> ArrowResult<Token> {
346 match self.tokenizer.next() {
347 None => Err(make_error(self.val, "finding next token")),
348 Some(token) => token,
349 }
350 }
351
352 fn expect_token(&mut self, tok: Token) -> ArrowResult<()> {
354 let next_token = self.next_token()?;
355 if next_token == tok {
356 Ok(())
357 } else {
358 Err(make_error_expected(self.val, &tok, &next_token))
359 }
360 }
361}
362
/// Returns true if `c` ends a word: a parenthesis, a comma or a space.
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ' ')
}
367
/// Splits a type string into `Token`s.
///
/// Used as an iterator: each call to `next` yields the next token (or a
/// parse error) until the input is exhausted.
#[derive(Debug)]
struct Tokenizer<'a> {
    /// The complete input string; quoted in error messages.
    val: &'a str,
    /// Remaining, not yet consumed characters of `val`.
    chars: Peekable<Chars<'a>>,
    /// Scratch buffer holding the word currently being read; cleared at the
    /// start of every word so its allocation is reused.
    word: String,
}
385
386impl<'a> Tokenizer<'a> {
387 fn new(val: &'a str) -> Self {
388 Self {
389 val,
390 chars: val.chars().peekable(),
391 word: String::new(),
392 }
393 }
394
395 fn peek_next_char(&mut self) -> Option<char> {
397 self.chars.peek().copied()
398 }
399
400 fn next_char(&mut self) -> Option<char> {
402 self.chars.next()
403 }
404
405 fn parse_word(&mut self) -> ArrowResult<Token> {
408 self.word.clear();
410 loop {
411 match self.peek_next_char() {
412 None => break,
413 Some(c) if is_separator(c) => break,
414 Some(c) => {
415 self.next_char();
416 self.word.push(c);
417 }
418 }
419 }
420
421 if let Some(c) = self.word.chars().next() {
422 if c == '-' || c.is_numeric() {
424 let val: i64 = self.word.parse().map_err(|e| {
425 make_error(self.val, &format!("parsing {} as integer: {e}", self.word))
426 })?;
427 return Ok(Token::Integer(val));
428 }
429 else if c == '"' {
431 let len = self.word.chars().count();
432
433 if let Some(last_c) = self.word.chars().last() {
435 if last_c != '"' || len < 2 {
436 return Err(make_error(
437 self.val,
438 &format!(
439 "parsing {} as double quoted string: last char must be \"",
440 self.word
441 ),
442 ));
443 }
444 }
445
446 if len == 2 {
447 return Err(make_error(
448 self.val,
449 &format!(
450 "parsing {} as double quoted string: empty string isn't supported",
451 self.word
452 ),
453 ));
454 }
455
456 let val: String = self.word.parse().map_err(|e| {
457 make_error(
458 self.val,
459 &format!("parsing {} as double quoted string: {e}", self.word),
460 )
461 })?;
462
463 let s = val[1..len - 1].to_string();
464 if s.contains('"') {
465 return Err(make_error(
466 self.val,
467 &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word),
468 ));
469 }
470
471 return Ok(Token::DoubleQuotedString(s));
472 }
473 }
474
475 let token = match self.word.as_str() {
477 "Null" => Token::SimpleType(DataType::Null),
478 "Boolean" => Token::SimpleType(DataType::Boolean),
479
480 "Int8" => Token::SimpleType(DataType::Int8),
481 "Int16" => Token::SimpleType(DataType::Int16),
482 "Int32" => Token::SimpleType(DataType::Int32),
483 "Int64" => Token::SimpleType(DataType::Int64),
484
485 "UInt8" => Token::SimpleType(DataType::UInt8),
486 "UInt16" => Token::SimpleType(DataType::UInt16),
487 "UInt32" => Token::SimpleType(DataType::UInt32),
488 "UInt64" => Token::SimpleType(DataType::UInt64),
489
490 "Utf8" => Token::SimpleType(DataType::Utf8),
491 "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
492 "Utf8View" => Token::SimpleType(DataType::Utf8View),
493 "Binary" => Token::SimpleType(DataType::Binary),
494 "BinaryView" => Token::SimpleType(DataType::BinaryView),
495 "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
496
497 "Float16" => Token::SimpleType(DataType::Float16),
498 "Float32" => Token::SimpleType(DataType::Float32),
499 "Float64" => Token::SimpleType(DataType::Float64),
500
501 "Date32" => Token::SimpleType(DataType::Date32),
502 "Date64" => Token::SimpleType(DataType::Date64),
503
504 "List" => Token::List,
505 "LargeList" => Token::LargeList,
506 "FixedSizeList" => Token::FixedSizeList,
507
508 "Second" => Token::TimeUnit(TimeUnit::Second),
509 "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
510 "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
511 "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
512
513 "Timestamp" => Token::Timestamp,
514 "Time32" => Token::Time32,
515 "Time64" => Token::Time64,
516 "Duration" => Token::Duration,
517 "Interval" => Token::Interval,
518 "Dictionary" => Token::Dictionary,
519
520 "FixedSizeBinary" => Token::FixedSizeBinary,
521 "Decimal128" => Token::Decimal128,
522 "Decimal256" => Token::Decimal256,
523
524 "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
525 "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
526 "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
527
528 "Some" => Token::Some,
529 "None" => Token::None,
530
531 "Struct" => Token::Struct,
532 word => Token::FieldName(word.to_string()),
534 };
535 Ok(token)
536 }
537}
538
539impl Iterator for Tokenizer<'_> {
540 type Item = ArrowResult<Token>;
541
542 fn next(&mut self) -> Option<Self::Item> {
543 loop {
544 match self.peek_next_char()? {
545 ' ' => {
546 self.next_char();
548 continue;
549 }
550 '(' => {
551 self.next_char();
552 return Some(Ok(Token::LParen));
553 }
554 ')' => {
555 self.next_char();
556 return Some(Ok(Token::RParen));
557 }
558 ',' => {
559 self.next_char();
560 return Some(Ok(Token::Comma));
561 }
562 _ => return Some(self.parse_word()),
563 }
564 }
565 }
566}
567
/// The lexical elements of a type string, as produced by the tokenizer.
#[derive(Debug, PartialEq)]
enum Token {
    /// A type that is fully described by its name, e.g. `Int32`.
    SimpleType(DataType),
    /// The keyword `Timestamp`.
    Timestamp,
    /// The keyword `Time32`.
    Time32,
    /// The keyword `Time64`.
    Time64,
    /// The keyword `Duration`.
    Duration,
    /// The keyword `Interval`.
    Interval,
    /// The keyword `FixedSizeBinary`.
    FixedSizeBinary,
    /// The keyword `Decimal128`.
    Decimal128,
    /// The keyword `Decimal256`.
    Decimal256,
    /// The keyword `Dictionary`.
    Dictionary,
    /// A time unit name such as `Nanosecond`.
    TimeUnit(TimeUnit),
    /// An interval unit name such as `YearMonth`.
    IntervalUnit(IntervalUnit),
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `,`
    Comma,
    /// The keyword `Some`.
    Some,
    /// The keyword `None`.
    None,
    /// An integer literal.
    Integer(i64),
    /// The contents of a double quoted string, without the quotes.
    DoubleQuotedString(String),
    /// The keyword `List`.
    List,
    /// The keyword `LargeList`.
    LargeList,
    /// The keyword `FixedSizeList`.
    FixedSizeList,
    /// The keyword `Struct`.
    Struct,
    /// Any word that is not a keyword; only valid as a struct field name.
    FieldName(String),
}
598
599impl Display for Token {
600 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
601 match self {
602 Token::SimpleType(t) => write!(f, "{t}"),
603 Token::List => write!(f, "List"),
604 Token::LargeList => write!(f, "LargeList"),
605 Token::FixedSizeList => write!(f, "FixedSizeList"),
606 Token::Timestamp => write!(f, "Timestamp"),
607 Token::Time32 => write!(f, "Time32"),
608 Token::Time64 => write!(f, "Time64"),
609 Token::Duration => write!(f, "Duration"),
610 Token::Interval => write!(f, "Interval"),
611 Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
612 Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
613 Token::LParen => write!(f, "("),
614 Token::RParen => write!(f, ")"),
615 Token::Comma => write!(f, ","),
616 Token::Some => write!(f, "Some"),
617 Token::None => write!(f, "None"),
618 Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
619 Token::Decimal128 => write!(f, "Decimal128"),
620 Token::Decimal256 => write!(f, "Decimal256"),
621 Token::Dictionary => write!(f, "Dictionary"),
622 Token::Integer(v) => write!(f, "Integer({v})"),
623 Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
624 Token::Struct => write!(f, "Struct"),
625 Token::FieldName(s) => write!(f, "FieldName({s})"),
626 }
627 }
628}
629
#[cfg(test)]
mod test {
    use super::*;

    /// Every type returned by `list_datatypes` must survive being formatted
    /// and parsed back.
    #[test]
    fn test_parse_data_type() {
        list_datatypes().into_iter().for_each(round_trip);
    }

    /// Formats `data_type` with `Display`, parses the text back and asserts
    /// that the result equals the original.
    fn round_trip(data_type: DataType) {
        let text = data_type.to_string();
        println!("Input '{text}' ({data_type:?})");
        let reparsed = parse_data_type(&text).unwrap();
        assert_eq!(data_type, reparsed, "Mismatch parsing {text}");
    }

    /// The data types exercised by the round trip test.
    fn list_datatypes() -> Vec<DataType> {
        // Shorthand for building dictionary types.
        let dictionary = |key: DataType, value: DataType| {
            DataType::Dictionary(Box::new(key), Box::new(value))
        };

        vec![
            // Types without arguments
            DataType::Null,
            DataType::Boolean,
            DataType::Int8,
            DataType::Int16,
            DataType::Int32,
            DataType::Int64,
            DataType::UInt8,
            DataType::UInt16,
            DataType::UInt32,
            DataType::UInt64,
            DataType::Float16,
            DataType::Float32,
            DataType::Float64,
            // Timestamps without a timezone
            DataType::Timestamp(TimeUnit::Second, None),
            DataType::Timestamp(TimeUnit::Millisecond, None),
            DataType::Timestamp(TimeUnit::Microsecond, None),
            DataType::Timestamp(TimeUnit::Nanosecond, None),
            // Timestamps with a timezone
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
            DataType::Date32,
            DataType::Date64,
            DataType::Time32(TimeUnit::Second),
            DataType::Time32(TimeUnit::Millisecond),
            DataType::Time32(TimeUnit::Microsecond),
            DataType::Time32(TimeUnit::Nanosecond),
            DataType::Time64(TimeUnit::Second),
            DataType::Time64(TimeUnit::Millisecond),
            DataType::Time64(TimeUnit::Microsecond),
            DataType::Time64(TimeUnit::Nanosecond),
            DataType::Duration(TimeUnit::Second),
            DataType::Duration(TimeUnit::Millisecond),
            DataType::Duration(TimeUnit::Microsecond),
            DataType::Duration(TimeUnit::Nanosecond),
            DataType::Interval(IntervalUnit::YearMonth),
            DataType::Interval(IntervalUnit::DayTime),
            DataType::Interval(IntervalUnit::MonthDayNano),
            DataType::Binary,
            DataType::BinaryView,
            DataType::FixedSizeBinary(0),
            DataType::FixedSizeBinary(1234),
            DataType::FixedSizeBinary(-432),
            DataType::LargeBinary,
            DataType::Utf8,
            DataType::Utf8View,
            DataType::LargeUtf8,
            DataType::Decimal128(7, 12),
            DataType::Decimal256(6, 13),
            // Dictionaries, including nested arguments
            dictionary(DataType::Int32, DataType::Utf8),
            dictionary(DataType::Int8, DataType::Utf8),
            dictionary(
                DataType::Int8,
                DataType::Timestamp(TimeUnit::Nanosecond, None),
            ),
            dictionary(DataType::Int8, DataType::FixedSizeBinary(23)),
            dictionary(
                DataType::Int8,
                dictionary(DataType::Int8, DataType::Utf8),
            ),
            // Structs
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new("f2", DataType::Float64, true),
                Field::new(
                    "f3",
                    DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
                    true,
                ),
                Field::new(
                    "f4",
                    dictionary(DataType::Int8, DataType::FixedSizeBinary(23)),
                    true,
                ),
            ])),
            // Field names that are also type names
            DataType::Struct(Fields::from(vec![
                Field::new("Int64", DataType::Int64, true),
                Field::new("Float64", DataType::Float64, true),
            ])),
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new(
                    "nested_struct",
                    DataType::Struct(Fields::from(vec![Field::new(
                        "n1",
                        DataType::Int64,
                        true,
                    )])),
                    true,
                ),
            ])),
            DataType::Struct(Fields::empty()),
        ]
    }

    /// Spaces around tokens must not change the parse result.
    #[test]
    fn test_parse_data_type_whitespace_tolerance() {
        let nanosecond_timestamp = || DataType::Timestamp(TimeUnit::Nanosecond, None);
        let cases = [
            ("Int8", DataType::Int8),
            ("Timestamp (Nanosecond, None)", nanosecond_timestamp()),
            ("Timestamp (Nanosecond, None) ", nanosecond_timestamp()),
            (" Timestamp (Nanosecond, None )", nanosecond_timestamp()),
            ("Timestamp (Nanosecond, None ) ", nanosecond_timestamp()),
        ];

        for (input, expected) in cases {
            println!("Parsing '{input}', expecting '{expected:?}'");
            let parsed = parse_data_type(input).unwrap();
            assert_eq!(parsed, expected);
        }
    }

    /// Malformed inputs must fail with a message containing the given text.
    #[test]
    fn parse_data_type_errors() {
        // (input, text expected somewhere in the error message)
        let cases = [
            ("", "Unsupported type ''"),
            ("", "Error finding next token"),
            ("null", "Unsupported type 'null'"),
            ("Nu", "Unsupported type 'Nu'"),
            (
                r#"Timestamp(Nanosecond, Some(+00:00))"#,
                "Error unrecognized word: +00:00",
            ),
            (
                r#"Timestamp(Nanosecond, Some("+00:00))"#,
                r#"parsing "+00:00 as double quoted string: last char must be ""#,
            ),
            (
                r#"Timestamp(Nanosecond, Some(""))"#,
                r#"parsing "" as double quoted string: empty string isn't supported"#,
            ),
            (
                r#"Timestamp(Nanosecond, Some("+00:00""))"#,
                r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#,
            ),
            ("Timestamp(Nanosecond, ", "Error finding next token"),
            (
                "Float32 Float32",
                "trailing content after parsing 'Float32'",
            ),
            ("Int32, ", "trailing content after parsing 'Int32'"),
            ("Int32(3), ", "trailing content after parsing 'Int32'"),
            (
                "FixedSizeBinary(Int32), ",
                "Error finding i64 for FixedSizeBinary, got 'Int32'",
            ),
            (
                "FixedSizeBinary(3.0), ",
                "Error parsing 3.0 as integer: invalid digit found in string",
            ),
            // Values that do not fit the target integer type
            (
                "FixedSizeBinary(4000000000), ",
                "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted",
            ),
            (
                "Decimal128(-3, 5)",
                "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted",
            ),
            (
                "Decimal256(-3, 5)",
                "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted",
            ),
            (
                "Decimal128(3, 500)",
                "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted",
            ),
            (
                "Decimal256(3, 500)",
                "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted",
            ),
            (
                "Struct(f1, Int64)",
                "Error finding next type, got unexpected ','",
            ),
            (
                "Struct(f1 Int64,)",
                "Expected a word for the name of Struct, but got trailing comma",
            ),
            (
                "Struct(f1)",
                "Error finding next type, got unexpected ')'",
            ),
        ];

        for (input, expected_message) in cases {
            println!("Parsing '{input}', expecting '{expected_message}'");
            let message = match parse_data_type(input) {
                Ok(d) => panic!("Expected error while parsing '{input}', but got '{d}'"),
                Err(e) => e.to_string(),
            };
            assert!(
                message.contains(expected_message),
                "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual:{message}\n"
            );
            // Every error also carries the generic hint.
            assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'"));
        }
    }

    /// An unknown word produces a `ParseError` with the full, exact message.
    #[test]
    fn parse_error_type() {
        let err = parse_data_type("foobar").unwrap_err();
        assert!(matches!(err, ArrowError::ParseError(_)));
        let rendered = err.to_string();
        assert_eq!(rendered, "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error unrecognized word: foobar");
    }
}