1use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
20use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit};
21
22pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
23 Parser::new(val).parse()
24}
25
/// Shorthand for a `Result` whose error type is [`ArrowError`].
type ArrowResult<T> = Result<T, ArrowError>;
27
28fn make_error(val: &str, msg: &str) -> ArrowError {
29 let msg = format!("Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error {msg}" );
30 ArrowError::ParseError(msg)
31}
32
33fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError {
34 make_error(val, &format!("Expected '{expected}', got '{actual}'"))
35}
36
/// Parser that turns the token stream of a type string into a [`DataType`].
#[derive(Debug)]
struct Parser<'a> {
    /// The complete input string; quoted back to the user in error messages.
    val: &'a str,
    /// Token source reading from `val`.
    tokenizer: Tokenizer<'a>,
}
43
44impl<'a> Parser<'a> {
45 fn new(val: &'a str) -> Self {
46 Self {
47 val,
48 tokenizer: Tokenizer::new(val),
49 }
50 }
51
52 fn parse(mut self) -> ArrowResult<DataType> {
53 let data_type = self.parse_next_type()?;
54 if self.tokenizer.next().is_some() {
56 Err(make_error(
57 self.val,
58 &format!("checking trailing content after parsing '{data_type}'"),
59 ))
60 } else {
61 Ok(data_type)
62 }
63 }
64
65 fn parse_next_type(&mut self) -> ArrowResult<DataType> {
67 match self.next_token()? {
68 Token::SimpleType(data_type) => Ok(data_type),
69 Token::Timestamp => self.parse_timestamp(),
70 Token::Time32 => self.parse_time32(),
71 Token::Time64 => self.parse_time64(),
72 Token::Duration => self.parse_duration(),
73 Token::Interval => self.parse_interval(),
74 Token::FixedSizeBinary => self.parse_fixed_size_binary(),
75 Token::Decimal32 => self.parse_decimal_32(),
76 Token::Decimal64 => self.parse_decimal_64(),
77 Token::Decimal128 => self.parse_decimal_128(),
78 Token::Decimal256 => self.parse_decimal_256(),
79 Token::Dictionary => self.parse_dictionary(),
80 Token::List => self.parse_list(),
81 Token::LargeList => self.parse_large_list(),
82 Token::FixedSizeList => self.parse_fixed_size_list(),
83 Token::Struct => self.parse_struct(),
84 Token::FieldName(word) => {
85 Err(make_error(self.val, &format!("unrecognized word: {word}")))
86 }
87 tok => Err(make_error(
88 self.val,
89 &format!("finding next type, got unexpected '{tok}'"),
90 )),
91 }
92 }
93
94 fn parse_list(&mut self) -> ArrowResult<DataType> {
96 self.expect_token(Token::LParen)?;
97 let data_type = self.parse_next_type()?;
98 self.expect_token(Token::RParen)?;
99 Ok(DataType::List(Arc::new(Field::new_list_field(
100 data_type, true,
101 ))))
102 }
103
104 fn parse_large_list(&mut self) -> ArrowResult<DataType> {
106 self.expect_token(Token::LParen)?;
107 let data_type = self.parse_next_type()?;
108 self.expect_token(Token::RParen)?;
109 Ok(DataType::LargeList(Arc::new(Field::new_list_field(
110 data_type, true,
111 ))))
112 }
113
114 fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
116 self.expect_token(Token::LParen)?;
117 let length = self.parse_i32("FixedSizeList")?;
118 self.expect_token(Token::Comma)?;
119 let data_type = self.parse_next_type()?;
120 self.expect_token(Token::RParen)?;
121 Ok(DataType::FixedSizeList(
122 Arc::new(Field::new_list_field(data_type, true)),
123 length,
124 ))
125 }
126
127 fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> {
129 match self.next_token()? {
130 Token::TimeUnit(time_unit) => Ok(time_unit),
131 tok => Err(make_error(
132 self.val,
133 &format!("finding TimeUnit for {context}, got {tok}"),
134 )),
135 }
136 }
137
138 fn parse_timezone(&mut self, context: &str) -> ArrowResult<Option<String>> {
140 match self.next_token()? {
141 Token::None => Ok(None),
142 Token::Some => {
143 self.expect_token(Token::LParen)?;
144 let timezone = self.parse_double_quoted_string("Timezone")?;
145 self.expect_token(Token::RParen)?;
146 Ok(Some(timezone))
147 }
148 tok => Err(make_error(
149 self.val,
150 &format!("finding Timezone for {context}, got {tok}"),
151 )),
152 }
153 }
154
155 fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
157 match self.next_token()? {
158 Token::DoubleQuotedString(s) => Ok(s),
159 Token::FieldName(word) => {
160 Err(make_error(self.val, &format!("unrecognized word: {word}")))
161 }
162 tok => Err(make_error(
163 self.val,
164 &format!("finding double quoted string for {context}, got '{tok}'"),
165 )),
166 }
167 }
168
169 fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
171 match self.next_token()? {
172 Token::Integer(v) => Ok(v),
173 tok => Err(make_error(
174 self.val,
175 &format!("finding i64 for {context}, got '{tok}'"),
176 )),
177 }
178 }
179
180 fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> {
182 let length = self.parse_i64(context)?;
183 length.try_into().map_err(|e| {
184 make_error(
185 self.val,
186 &format!("converting {length} into i32 for {context}: {e}"),
187 )
188 })
189 }
190
191 fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> {
193 let length = self.parse_i64(context)?;
194 length.try_into().map_err(|e| {
195 make_error(
196 self.val,
197 &format!("converting {length} into i8 for {context}: {e}"),
198 )
199 })
200 }
201
202 fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> {
204 let length = self.parse_i64(context)?;
205 length.try_into().map_err(|e| {
206 make_error(
207 self.val,
208 &format!("converting {length} into u8 for {context}: {e}"),
209 )
210 })
211 }
212
213 fn parse_timestamp(&mut self) -> ArrowResult<DataType> {
215 self.expect_token(Token::LParen)?;
216 let time_unit = self.parse_time_unit("Timestamp")?;
217 self.expect_token(Token::Comma)?;
218 let timezone = self.parse_timezone("Timestamp")?;
219 self.expect_token(Token::RParen)?;
220 Ok(DataType::Timestamp(time_unit, timezone.map(Into::into)))
221 }
222
223 fn parse_time32(&mut self) -> ArrowResult<DataType> {
225 self.expect_token(Token::LParen)?;
226 let time_unit = self.parse_time_unit("Time32")?;
227 self.expect_token(Token::RParen)?;
228 Ok(DataType::Time32(time_unit))
229 }
230
231 fn parse_time64(&mut self) -> ArrowResult<DataType> {
233 self.expect_token(Token::LParen)?;
234 let time_unit = self.parse_time_unit("Time64")?;
235 self.expect_token(Token::RParen)?;
236 Ok(DataType::Time64(time_unit))
237 }
238
239 fn parse_duration(&mut self) -> ArrowResult<DataType> {
241 self.expect_token(Token::LParen)?;
242 let time_unit = self.parse_time_unit("Duration")?;
243 self.expect_token(Token::RParen)?;
244 Ok(DataType::Duration(time_unit))
245 }
246
247 fn parse_interval(&mut self) -> ArrowResult<DataType> {
249 self.expect_token(Token::LParen)?;
250 let interval_unit = match self.next_token()? {
251 Token::IntervalUnit(interval_unit) => interval_unit,
252 tok => {
253 return Err(make_error(
254 self.val,
255 &format!("finding IntervalUnit for Interval, got {tok}"),
256 ))
257 }
258 };
259 self.expect_token(Token::RParen)?;
260 Ok(DataType::Interval(interval_unit))
261 }
262
263 fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> {
265 self.expect_token(Token::LParen)?;
266 let length = self.parse_i32("FixedSizeBinary")?;
267 self.expect_token(Token::RParen)?;
268 Ok(DataType::FixedSizeBinary(length))
269 }
270
271 fn parse_decimal_32(&mut self) -> ArrowResult<DataType> {
273 self.expect_token(Token::LParen)?;
274 let precision = self.parse_u8("Decimal32")?;
275 self.expect_token(Token::Comma)?;
276 let scale = self.parse_i8("Decimal32")?;
277 self.expect_token(Token::RParen)?;
278 Ok(DataType::Decimal32(precision, scale))
279 }
280
281 fn parse_decimal_64(&mut self) -> ArrowResult<DataType> {
283 self.expect_token(Token::LParen)?;
284 let precision = self.parse_u8("Decimal64")?;
285 self.expect_token(Token::Comma)?;
286 let scale = self.parse_i8("Decimal64")?;
287 self.expect_token(Token::RParen)?;
288 Ok(DataType::Decimal64(precision, scale))
289 }
290
291 fn parse_decimal_128(&mut self) -> ArrowResult<DataType> {
293 self.expect_token(Token::LParen)?;
294 let precision = self.parse_u8("Decimal128")?;
295 self.expect_token(Token::Comma)?;
296 let scale = self.parse_i8("Decimal128")?;
297 self.expect_token(Token::RParen)?;
298 Ok(DataType::Decimal128(precision, scale))
299 }
300
301 fn parse_decimal_256(&mut self) -> ArrowResult<DataType> {
303 self.expect_token(Token::LParen)?;
304 let precision = self.parse_u8("Decimal256")?;
305 self.expect_token(Token::Comma)?;
306 let scale = self.parse_i8("Decimal256")?;
307 self.expect_token(Token::RParen)?;
308 Ok(DataType::Decimal256(precision, scale))
309 }
310
311 fn parse_dictionary(&mut self) -> ArrowResult<DataType> {
313 self.expect_token(Token::LParen)?;
314 let key_type = self.parse_next_type()?;
315 self.expect_token(Token::Comma)?;
316 let value_type = self.parse_next_type()?;
317 self.expect_token(Token::RParen)?;
318 Ok(DataType::Dictionary(
319 Box::new(key_type),
320 Box::new(value_type),
321 ))
322 }
323 fn parse_struct(&mut self) -> ArrowResult<DataType> {
324 self.expect_token(Token::LParen)?;
325 let mut fields = Vec::new();
326 loop {
327 let field_name = match self.next_token()? {
328 Token::SimpleType(data_type) => data_type.to_string(),
330 Token::FieldName(name) => name,
331 Token::RParen => {
332 if fields.is_empty() {
333 break;
334 } else {
335 return Err(make_error(
336 self.val,
337 "Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma",
338 ));
339 }
340 }
341 tok => {
342 return Err(make_error(
343 self.val,
344 &format!("Expected a word for the name of Struct, but got {tok}"),
345 ))
346 }
347 };
348 let field_type = self.parse_next_type()?;
349 fields.push(Arc::new(Field::new(field_name, field_type, true)));
350 match self.next_token()? {
351 Token::Comma => continue,
352 Token::RParen => break,
353 tok => {
354 return Err(make_error(
355 self.val,
356 &format!("Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"),
357 ))
358 }
359 }
360 }
361 Ok(DataType::Struct(Fields::from(fields)))
362 }
363
364 fn next_token(&mut self) -> ArrowResult<Token> {
366 match self.tokenizer.next() {
367 None => Err(make_error(self.val, "finding next token")),
368 Some(token) => token,
369 }
370 }
371
372 fn expect_token(&mut self, tok: Token) -> ArrowResult<()> {
374 let next_token = self.next_token()?;
375 if next_token == tok {
376 Ok(())
377 } else {
378 Err(make_error_expected(self.val, &tok, &next_token))
379 }
380 }
381}
382
/// Returns true if `c` ends a word: a parenthesis, a comma or a space.
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ' ')
}
387
/// Splits a type string into [`Token`]s.
#[derive(Debug)]
struct Tokenizer<'a> {
    /// The complete input string; quoted back to the user in error messages.
    val: &'a str,
    /// Remaining characters of `val`, with one character of lookahead.
    chars: Peekable<Chars<'a>>,
    /// Scratch buffer holding the word currently being read; cleared at the
    /// start of every `parse_word` call.
    word: String,
}
405
406impl<'a> Tokenizer<'a> {
407 fn new(val: &'a str) -> Self {
408 Self {
409 val,
410 chars: val.chars().peekable(),
411 word: String::new(),
412 }
413 }
414
415 fn peek_next_char(&mut self) -> Option<char> {
417 self.chars.peek().copied()
418 }
419
    /// Consumes and returns the next character, or `None` at end of input.
    fn next_char(&mut self) -> Option<char> {
        self.chars.next()
    }
424
425 fn parse_word(&mut self) -> ArrowResult<Token> {
428 self.word.clear();
430 loop {
431 match self.peek_next_char() {
432 None => break,
433 Some(c) if is_separator(c) => break,
434 Some(c) => {
435 self.next_char();
436 self.word.push(c);
437 }
438 }
439 }
440
441 if let Some(c) = self.word.chars().next() {
442 if c == '-' || c.is_numeric() {
444 let val: i64 = self.word.parse().map_err(|e| {
445 make_error(self.val, &format!("parsing {} as integer: {e}", self.word))
446 })?;
447 return Ok(Token::Integer(val));
448 }
449 else if c == '"' {
451 let len = self.word.chars().count();
452
453 if let Some(last_c) = self.word.chars().last() {
455 if last_c != '"' || len < 2 {
456 return Err(make_error(
457 self.val,
458 &format!(
459 "parsing {} as double quoted string: last char must be \"",
460 self.word
461 ),
462 ));
463 }
464 }
465
466 if len == 2 {
467 return Err(make_error(
468 self.val,
469 &format!(
470 "parsing {} as double quoted string: empty string isn't supported",
471 self.word
472 ),
473 ));
474 }
475
476 let val: String = self.word.parse().map_err(|e| {
477 make_error(
478 self.val,
479 &format!("parsing {} as double quoted string: {e}", self.word),
480 )
481 })?;
482
483 let s = val[1..len - 1].to_string();
484 if s.contains('"') {
485 return Err(make_error(
486 self.val,
487 &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word),
488 ));
489 }
490
491 return Ok(Token::DoubleQuotedString(s));
492 }
493 }
494
495 let token = match self.word.as_str() {
497 "Null" => Token::SimpleType(DataType::Null),
498 "Boolean" => Token::SimpleType(DataType::Boolean),
499
500 "Int8" => Token::SimpleType(DataType::Int8),
501 "Int16" => Token::SimpleType(DataType::Int16),
502 "Int32" => Token::SimpleType(DataType::Int32),
503 "Int64" => Token::SimpleType(DataType::Int64),
504
505 "UInt8" => Token::SimpleType(DataType::UInt8),
506 "UInt16" => Token::SimpleType(DataType::UInt16),
507 "UInt32" => Token::SimpleType(DataType::UInt32),
508 "UInt64" => Token::SimpleType(DataType::UInt64),
509
510 "Utf8" => Token::SimpleType(DataType::Utf8),
511 "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
512 "Utf8View" => Token::SimpleType(DataType::Utf8View),
513 "Binary" => Token::SimpleType(DataType::Binary),
514 "BinaryView" => Token::SimpleType(DataType::BinaryView),
515 "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
516
517 "Float16" => Token::SimpleType(DataType::Float16),
518 "Float32" => Token::SimpleType(DataType::Float32),
519 "Float64" => Token::SimpleType(DataType::Float64),
520
521 "Date32" => Token::SimpleType(DataType::Date32),
522 "Date64" => Token::SimpleType(DataType::Date64),
523
524 "List" => Token::List,
525 "LargeList" => Token::LargeList,
526 "FixedSizeList" => Token::FixedSizeList,
527
528 "Second" => Token::TimeUnit(TimeUnit::Second),
529 "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
530 "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
531 "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
532
533 "Timestamp" => Token::Timestamp,
534 "Time32" => Token::Time32,
535 "Time64" => Token::Time64,
536 "Duration" => Token::Duration,
537 "Interval" => Token::Interval,
538 "Dictionary" => Token::Dictionary,
539
540 "FixedSizeBinary" => Token::FixedSizeBinary,
541
542 "Decimal32" => Token::Decimal32,
543 "Decimal64" => Token::Decimal64,
544 "Decimal128" => Token::Decimal128,
545 "Decimal256" => Token::Decimal256,
546
547 "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
548 "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
549 "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
550
551 "Some" => Token::Some,
552 "None" => Token::None,
553
554 "Struct" => Token::Struct,
555 word => Token::FieldName(word.to_string()),
557 };
558 Ok(token)
559 }
560}
561
562impl Iterator for Tokenizer<'_> {
563 type Item = ArrowResult<Token>;
564
565 fn next(&mut self) -> Option<Self::Item> {
566 loop {
567 match self.peek_next_char()? {
568 ' ' => {
569 self.next_char();
571 continue;
572 }
573 '(' => {
574 self.next_char();
575 return Some(Ok(Token::LParen));
576 }
577 ')' => {
578 self.next_char();
579 return Some(Ok(Token::RParen));
580 }
581 ',' => {
582 self.next_char();
583 return Some(Ok(Token::Comma));
584 }
585 _ => return Some(self.parse_word()),
586 }
587 }
588 }
589}
590
591#[derive(Debug, PartialEq)]
594enum Token {
595 SimpleType(DataType),
597 Timestamp,
598 Time32,
599 Time64,
600 Duration,
601 Interval,
602 FixedSizeBinary,
603 Decimal32,
604 Decimal64,
605 Decimal128,
606 Decimal256,
607 Dictionary,
608 TimeUnit(TimeUnit),
609 IntervalUnit(IntervalUnit),
610 LParen,
611 RParen,
612 Comma,
613 Some,
614 None,
615 Integer(i64),
616 DoubleQuotedString(String),
617 List,
618 LargeList,
619 FixedSizeList,
620 Struct,
621 FieldName(String),
622}
623
624impl Display for Token {
625 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
626 match self {
627 Token::SimpleType(t) => write!(f, "{t}"),
628 Token::List => write!(f, "List"),
629 Token::LargeList => write!(f, "LargeList"),
630 Token::FixedSizeList => write!(f, "FixedSizeList"),
631 Token::Timestamp => write!(f, "Timestamp"),
632 Token::Time32 => write!(f, "Time32"),
633 Token::Time64 => write!(f, "Time64"),
634 Token::Duration => write!(f, "Duration"),
635 Token::Interval => write!(f, "Interval"),
636 Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
637 Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
638 Token::LParen => write!(f, "("),
639 Token::RParen => write!(f, ")"),
640 Token::Comma => write!(f, ","),
641 Token::Some => write!(f, "Some"),
642 Token::None => write!(f, "None"),
643 Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
644 Token::Decimal32 => write!(f, "Decimal32"),
645 Token::Decimal64 => write!(f, "Decimal64"),
646 Token::Decimal128 => write!(f, "Decimal128"),
647 Token::Decimal256 => write!(f, "Decimal256"),
648 Token::Dictionary => write!(f, "Dictionary"),
649 Token::Integer(v) => write!(f, "Integer({v})"),
650 Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
651 Token::Struct => write!(f, "Struct"),
652 Token::FieldName(s) => write!(f, "FieldName({s})"),
653 }
654 }
655}
656
657#[cfg(test)]
658mod test {
659 use super::*;
660
661 #[test]
662 fn test_parse_data_type() {
663 for dt in list_datatypes() {
665 round_trip(dt)
666 }
667 }
668
669 fn round_trip(data_type: DataType) {
672 let data_type_string = data_type.to_string();
673 println!("Input '{data_type_string}' ({data_type:?})");
674 let parsed_type = parse_data_type(&data_type_string).unwrap();
675 assert_eq!(
676 data_type, parsed_type,
677 "Mismatch parsing {data_type_string}"
678 );
679 }
680
681 fn list_datatypes() -> Vec<DataType> {
682 vec![
683 DataType::Null,
687 DataType::Boolean,
688 DataType::Int8,
689 DataType::Int16,
690 DataType::Int32,
691 DataType::Int64,
692 DataType::UInt8,
693 DataType::UInt16,
694 DataType::UInt32,
695 DataType::UInt64,
696 DataType::Float16,
697 DataType::Float32,
698 DataType::Float64,
699 DataType::Timestamp(TimeUnit::Second, None),
700 DataType::Timestamp(TimeUnit::Millisecond, None),
701 DataType::Timestamp(TimeUnit::Microsecond, None),
702 DataType::Timestamp(TimeUnit::Nanosecond, None),
703 DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
705 DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
706 DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
707 DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
708 DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
709 DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
710 DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
711 DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
712 DataType::Date32,
713 DataType::Date64,
714 DataType::Time32(TimeUnit::Second),
715 DataType::Time32(TimeUnit::Millisecond),
716 DataType::Time32(TimeUnit::Microsecond),
717 DataType::Time32(TimeUnit::Nanosecond),
718 DataType::Time64(TimeUnit::Second),
719 DataType::Time64(TimeUnit::Millisecond),
720 DataType::Time64(TimeUnit::Microsecond),
721 DataType::Time64(TimeUnit::Nanosecond),
722 DataType::Duration(TimeUnit::Second),
723 DataType::Duration(TimeUnit::Millisecond),
724 DataType::Duration(TimeUnit::Microsecond),
725 DataType::Duration(TimeUnit::Nanosecond),
726 DataType::Interval(IntervalUnit::YearMonth),
727 DataType::Interval(IntervalUnit::DayTime),
728 DataType::Interval(IntervalUnit::MonthDayNano),
729 DataType::Binary,
730 DataType::BinaryView,
731 DataType::FixedSizeBinary(0),
732 DataType::FixedSizeBinary(1234),
733 DataType::FixedSizeBinary(-432),
734 DataType::LargeBinary,
735 DataType::Utf8,
736 DataType::Utf8View,
737 DataType::LargeUtf8,
738 DataType::Decimal32(7, 8),
739 DataType::Decimal64(6, 9),
740 DataType::Decimal128(7, 12),
741 DataType::Decimal256(6, 13),
742 DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
746 DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
747 DataType::Dictionary(
748 Box::new(DataType::Int8),
749 Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
750 ),
751 DataType::Dictionary(
752 Box::new(DataType::Int8),
753 Box::new(DataType::FixedSizeBinary(23)),
754 ),
755 DataType::Dictionary(
756 Box::new(DataType::Int8),
757 Box::new(
758 DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
760 ),
761 ),
762 DataType::Struct(Fields::from(vec![
763 Field::new("f1", DataType::Int64, true),
764 Field::new("f2", DataType::Float64, true),
765 Field::new(
766 "f3",
767 DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
768 true,
769 ),
770 Field::new(
771 "f4",
772 DataType::Dictionary(
773 Box::new(DataType::Int8),
774 Box::new(DataType::FixedSizeBinary(23)),
775 ),
776 true,
777 ),
778 ])),
779 DataType::Struct(Fields::from(vec![
780 Field::new("Int64", DataType::Int64, true),
781 Field::new("Float64", DataType::Float64, true),
782 ])),
783 DataType::Struct(Fields::from(vec![
784 Field::new("f1", DataType::Int64, true),
785 Field::new(
786 "nested_struct",
787 DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
788 true,
789 ),
790 ])),
791 DataType::Struct(Fields::empty()),
792 ]
794 }
795
796 #[test]
797 fn test_parse_data_type_whitespace_tolerance() {
798 let cases = [
800 ("Int8", DataType::Int8),
801 (
802 "Timestamp (Nanosecond, None)",
803 DataType::Timestamp(TimeUnit::Nanosecond, None),
804 ),
805 (
806 "Timestamp (Nanosecond, None) ",
807 DataType::Timestamp(TimeUnit::Nanosecond, None),
808 ),
809 (
810 " Timestamp (Nanosecond, None )",
811 DataType::Timestamp(TimeUnit::Nanosecond, None),
812 ),
813 (
814 "Timestamp (Nanosecond, None ) ",
815 DataType::Timestamp(TimeUnit::Nanosecond, None),
816 ),
817 ];
818
819 for (data_type_string, expected_data_type) in cases {
820 println!("Parsing '{data_type_string}', expecting '{expected_data_type:?}'");
821 let parsed_data_type = parse_data_type(data_type_string).unwrap();
822 assert_eq!(parsed_data_type, expected_data_type);
823 }
824 }
825
826 #[test]
827 fn parse_data_type_errors() {
828 let cases = [
830 ("", "Unsupported type ''"),
831 ("", "Error finding next token"),
832 ("null", "Unsupported type 'null'"),
833 ("Nu", "Unsupported type 'Nu'"),
834 (
835 r#"Timestamp(Nanosecond, Some(+00:00))"#,
836 "Error unrecognized word: +00:00",
837 ),
838 (
839 r#"Timestamp(Nanosecond, Some("+00:00))"#,
840 r#"parsing "+00:00 as double quoted string: last char must be ""#,
841 ),
842 (
843 r#"Timestamp(Nanosecond, Some(""))"#,
844 r#"parsing "" as double quoted string: empty string isn't supported"#,
845 ),
846 (
847 r#"Timestamp(Nanosecond, Some("+00:00""))"#,
848 r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#,
849 ),
850 ("Timestamp(Nanosecond, ", "Error finding next token"),
851 (
852 "Float32 Float32",
853 "trailing content after parsing 'Float32'",
854 ),
855 ("Int32, ", "trailing content after parsing 'Int32'"),
856 ("Int32(3), ", "trailing content after parsing 'Int32'"),
857 ("FixedSizeBinary(Int32), ", "Error finding i64 for FixedSizeBinary, got 'Int32'"),
858 ("FixedSizeBinary(3.0), ", "Error parsing 3.0 as integer: invalid digit found in string"),
859 ("FixedSizeBinary(4000000000), ", "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted"),
861 ("Decimal32(-3, 5)", "Error converting -3 into u8 for Decimal32: out of range integral type conversion attempted"),
863 ("Decimal64(-3, 5)", "Error converting -3 into u8 for Decimal64: out of range integral type conversion attempted"),
864 ("Decimal128(-3, 5)", "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted"),
865 ("Decimal256(-3, 5)", "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted"),
866 ("Decimal32(3, 500)", "Error converting 500 into i8 for Decimal32: out of range integral type conversion attempted"),
867 ("Decimal64(3, 500)", "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted"),
868 ("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"),
869 ("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"),
870 ("Struct(f1, Int64)", "Error finding next type, got unexpected ','"),
871 ("Struct(f1 Int64,)", "Expected a word for the name of Struct, but got trailing comma"),
872 ("Struct(f1)", "Error finding next type, got unexpected ')'"),
873 ];
874
875 for (data_type_string, expected_message) in cases {
876 println!("Parsing '{data_type_string}', expecting '{expected_message}'");
877 match parse_data_type(data_type_string) {
878 Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"),
879 Err(e) => {
880 let message = e.to_string();
881 assert!(
882 message.contains(expected_message),
883 "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual:{message}\n"
884 );
885 assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'"));
887 }
888 }
889 }
890 }
891
892 #[test]
893 fn parse_error_type() {
894 let err = parse_data_type("foobar").unwrap_err();
895 assert!(matches!(err, ArrowError::ParseError(_)));
896 assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error unrecognized word: foobar");
897 }
898}