1use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
20use crate::{
21 ArrowError, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, DECIMAL128_MAX_PRECISION,
22 DECIMAL256_MAX_PRECISION, DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields,
23 UnionMode,
24};
25
26pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
30 Parser::new(val).parse()
31}
32
/// Shorthand for a [`Result`] whose error type is [`ArrowError`].
type ArrowResult<T> = Result<T, ArrowError>;
34
35fn make_error(val: &str, msg: &str) -> ArrowError {
36 let msg = format!(
37 "Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error {msg}"
38 );
39 ArrowError::ParseError(msg)
40}
41
42fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError {
43 make_error(val, &format!("Expected '{expected}', got '{actual}'"))
44}
45
/// Parser that turns the token stream of a type string into a [`DataType`].
#[derive(Debug)]
struct Parser<'a> {
    /// The complete input string; kept so error messages can quote it.
    val: &'a str,
    /// Token stream over `val`, peekable so optional tokens can be tested
    /// without being consumed.
    tokenizer: Peekable<Tokenizer<'a>>,
}
52
53impl<'a> Parser<'a> {
54 fn new(val: &'a str) -> Self {
55 Self {
56 val,
57 tokenizer: Tokenizer::new(val).peekable(),
58 }
59 }
60
61 fn parse(mut self) -> ArrowResult<DataType> {
62 let data_type = self.parse_next_type()?;
63 if self.tokenizer.next().is_some() {
65 Err(make_error(
66 self.val,
67 &format!("checking trailing content after parsing '{data_type}'"),
68 ))
69 } else {
70 Ok(data_type)
71 }
72 }
73
74 fn parse_next_type(&mut self) -> ArrowResult<DataType> {
76 match self.next_token()? {
77 Token::SimpleType(data_type) => Ok(data_type),
78 Token::Timestamp => self.parse_timestamp(),
79 Token::Time32 => self.parse_time32(),
80 Token::Time64 => self.parse_time64(),
81 Token::Duration => self.parse_duration(),
82 Token::Interval => self.parse_interval(),
83 Token::FixedSizeBinary => self.parse_fixed_size_binary(),
84 Token::Decimal32 => self.parse_decimal_32(),
85 Token::Decimal64 => self.parse_decimal_64(),
86 Token::Decimal128 => self.parse_decimal_128(),
87 Token::Decimal256 => self.parse_decimal_256(),
88 Token::Dictionary => self.parse_dictionary(),
89 Token::List => self.parse_list(),
90 Token::ListView => self.parse_list_view(),
91 Token::LargeList => self.parse_large_list(),
92 Token::LargeListView => self.parse_large_list_view(),
93 Token::FixedSizeList => self.parse_fixed_size_list(),
94 Token::Struct => self.parse_struct(),
95 Token::Union => self.parse_union(),
96 Token::Map => self.parse_map(),
97 Token::RunEndEncoded => self.parse_run_end_encoded(),
98 tok => Err(make_error(
99 self.val,
100 &format!("finding next type, got unexpected '{tok}'"),
101 )),
102 }
103 }
104
105 fn parse_field(&mut self) -> ArrowResult<Field> {
110 let name = self.parse_double_quoted_string("Field")?;
111 self.expect_token(Token::Colon)?;
112 let nullable = self.parse_opt_nullable();
113 let data_type = self.parse_next_type()?;
114 Ok(Field::new(name, data_type, nullable))
115 }
116
117 fn parse_list_field(&mut self, context: &str) -> ArrowResult<Field> {
123 let nullable = self.parse_opt_nullable();
124 let data_type = self.parse_next_type()?;
125
126 let field_name = if self
128 .tokenizer
129 .next_if(|next| matches!(next, Ok(Token::Comma)))
130 .is_none()
131 {
132 Field::LIST_FIELD_DEFAULT_NAME.into()
133 } else {
134 self.expect_token(Token::Field)?;
136 self.expect_token(Token::Colon)?;
137 self.parse_single_quoted_string(context)?
138 };
139
140 Ok(Field::new(field_name, data_type, nullable))
141 }
142
143 fn parse_list(&mut self) -> ArrowResult<DataType> {
146 self.expect_token(Token::LParen)?;
147 let field = self.parse_list_field("List")?;
148 self.expect_token(Token::RParen)?;
149 Ok(DataType::List(Arc::new(field)))
150 }
151
152 fn parse_list_view(&mut self) -> ArrowResult<DataType> {
155 self.expect_token(Token::LParen)?;
156 let field = self.parse_list_field("ListView")?;
157 self.expect_token(Token::RParen)?;
158 Ok(DataType::ListView(Arc::new(field)))
159 }
160
161 fn parse_large_list(&mut self) -> ArrowResult<DataType> {
164 self.expect_token(Token::LParen)?;
165 let field = self.parse_list_field("LargeList")?;
166 self.expect_token(Token::RParen)?;
167 Ok(DataType::LargeList(Arc::new(field)))
168 }
169
170 fn parse_large_list_view(&mut self) -> ArrowResult<DataType> {
173 self.expect_token(Token::LParen)?;
174 let field = self.parse_list_field("LargeListView")?;
175 self.expect_token(Token::RParen)?;
176 Ok(DataType::LargeListView(Arc::new(field)))
177 }
178
179 fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
186 self.expect_token(Token::LParen)?;
187 let length = self.parse_i32("FixedSizeList")?;
188 if length < 0 {
189 return Err(make_error(
190 self.val,
191 &format!("FixedSizeList length must be non-negative, got {length}"),
192 ));
193 }
194 match self.next_token()? {
195 Token::X => {
197 let field = self.parse_list_field("FixedSizeList")?;
198 self.expect_token(Token::RParen)?;
199 Ok(DataType::FixedSizeList(Arc::new(field), length))
200 }
201 Token::Comma => {
203 let data_type = self.parse_next_type()?;
204 self.expect_token(Token::RParen)?;
205 Ok(DataType::FixedSizeList(
206 Arc::new(Field::new_list_field(data_type, true)),
207 length,
208 ))
209 }
210 tok => Err(make_error(
211 self.val,
212 &format!("Expected 'x' or ',' after length for FixedSizeList, got '{tok}'"),
213 )),
214 }
215 }
216
217 fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> {
219 match self.next_token()? {
220 Token::TimeUnit(time_unit) => Ok(time_unit),
221 tok => Err(make_error(
222 self.val,
223 &format!("finding TimeUnit for {context}, got {tok}"),
224 )),
225 }
226 }
227
228 fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
230 let token = self.next_token()?;
231 if let Token::DoubleQuotedString(string) = token {
232 Ok(string)
233 } else {
234 Err(make_error(
235 self.val,
236 &format!("expected double quoted string for {context}, got '{token}'"),
237 ))
238 }
239 }
240
241 fn parse_single_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
243 let token = self.next_token()?;
244 if let Token::SingleQuotedString(string) = token {
245 Ok(string)
246 } else {
247 Err(make_error(
248 self.val,
249 &format!("expected single quoted string for {context}, got '{token}'"),
250 ))
251 }
252 }
253
254 fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
256 match self.next_token()? {
257 Token::Integer(v) => Ok(v),
258 tok => Err(make_error(
259 self.val,
260 &format!("finding i64 for {context}, got '{tok}'"),
261 )),
262 }
263 }
264
265 fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> {
267 let length = self.parse_i64(context)?;
268 length.try_into().map_err(|e| {
269 make_error(
270 self.val,
271 &format!("converting {length} into i32 for {context}: {e}"),
272 )
273 })
274 }
275
276 fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> {
278 let length = self.parse_i64(context)?;
279 length.try_into().map_err(|e| {
280 make_error(
281 self.val,
282 &format!("converting {length} into i8 for {context}: {e}"),
283 )
284 })
285 }
286
287 fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> {
289 let length = self.parse_i64(context)?;
290 length.try_into().map_err(|e| {
291 make_error(
292 self.val,
293 &format!("converting {length} into u8 for {context}: {e}"),
294 )
295 })
296 }
297
298 fn parse_timestamp(&mut self) -> ArrowResult<DataType> {
300 self.expect_token(Token::LParen)?;
301 let time_unit = self.parse_time_unit("Timestamp")?;
302
303 let timezone;
304 match self.next_token()? {
305 Token::Comma => {
306 match self.next_token()? {
307 Token::None => {
309 timezone = None;
310 }
311 Token::Some => {
313 self.expect_token(Token::LParen)?;
314 timezone = Some(self.parse_double_quoted_string("Timezone")?);
315 self.expect_token(Token::RParen)?;
316 }
317 Token::DoubleQuotedString(tz) => {
318 timezone = Some(tz);
320 }
321 tok => {
322 return Err(make_error(
323 self.val,
324 &format!("Expected None, Some, or a timezone string, got {tok:?}"),
325 ));
326 }
327 };
328 self.expect_token(Token::RParen)?;
329 }
330 Token::RParen => {
332 timezone = None;
333 }
334 next_token => {
335 return Err(make_error(
336 self.val,
337 &format!("Expected comma followed by a timezone, or an ), got {next_token:?}"),
338 ));
339 }
340 }
341 Ok(DataType::Timestamp(time_unit, timezone.map(Into::into)))
342 }
343
344 fn parse_time32(&mut self) -> ArrowResult<DataType> {
346 self.expect_token(Token::LParen)?;
347 let time_unit = self.parse_time_unit("Time32")?;
348 match time_unit {
349 TimeUnit::Second | TimeUnit::Millisecond => (),
350 TimeUnit::Microsecond | TimeUnit::Nanosecond => {
351 return Err(make_error(
352 self.val,
353 &format!("Time32 time unit must be 's' or 'ms', got '{time_unit}'"),
354 ));
355 }
356 };
357 self.expect_token(Token::RParen)?;
358 Ok(DataType::Time32(time_unit))
359 }
360
361 fn parse_time64(&mut self) -> ArrowResult<DataType> {
363 self.expect_token(Token::LParen)?;
364 let time_unit = self.parse_time_unit("Time64")?;
365 match time_unit {
366 TimeUnit::Microsecond | TimeUnit::Nanosecond => (),
367 TimeUnit::Second | TimeUnit::Millisecond => {
368 return Err(make_error(
369 self.val,
370 &format!("Time64 time unit must be 'µs' or 'ns', got '{time_unit}'"),
371 ));
372 }
373 };
374 self.expect_token(Token::RParen)?;
375 Ok(DataType::Time64(time_unit))
376 }
377
378 fn parse_duration(&mut self) -> ArrowResult<DataType> {
380 self.expect_token(Token::LParen)?;
381 let time_unit = self.parse_time_unit("Duration")?;
382 self.expect_token(Token::RParen)?;
383 Ok(DataType::Duration(time_unit))
384 }
385
386 fn parse_interval(&mut self) -> ArrowResult<DataType> {
388 self.expect_token(Token::LParen)?;
389 let interval_unit = match self.next_token()? {
390 Token::IntervalUnit(interval_unit) => interval_unit,
391 tok => {
392 return Err(make_error(
393 self.val,
394 &format!("finding IntervalUnit for Interval, got {tok}"),
395 ));
396 }
397 };
398 self.expect_token(Token::RParen)?;
399 Ok(DataType::Interval(interval_unit))
400 }
401
402 fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> {
404 self.expect_token(Token::LParen)?;
405 let length = self.parse_i32("FixedSizeBinary")?;
406 if length < 0 {
407 return Err(make_error(
408 self.val,
409 &format!("FixedSizeBinary length must be non-negative, got {length}"),
410 ));
411 }
412 self.expect_token(Token::RParen)?;
413 Ok(DataType::FixedSizeBinary(length))
414 }
415
416 fn validate_decimal(
417 &self,
418 precision: u8,
419 scale: i8,
420 type_name: &str,
421 max_precision: u8,
422 ) -> ArrowResult<()> {
423 if precision == 0 || precision > max_precision {
424 return Err(make_error(
425 self.val,
426 &format!(
427 "{type_name} precision must be in range [1, {max_precision}], got '{precision}'"
428 ),
429 ));
430 }
431 if scale > 0 && scale as u8 > precision {
432 return Err(make_error(
433 self.val,
434 &format!(
435 "{type_name} scale '{scale}' cannot be greater than precision '{precision}'"
436 ),
437 ));
438 }
439 Ok(())
440 }
441
442 fn parse_decimal_32(&mut self) -> ArrowResult<DataType> {
444 self.expect_token(Token::LParen)?;
445 let precision = self.parse_u8("Decimal32")?;
446 self.expect_token(Token::Comma)?;
447 let scale = self.parse_i8("Decimal32")?;
448 self.expect_token(Token::RParen)?;
449 self.validate_decimal(precision, scale, "Decimal32", DECIMAL32_MAX_PRECISION)?;
450 Ok(DataType::Decimal32(precision, scale))
451 }
452
453 fn parse_decimal_64(&mut self) -> ArrowResult<DataType> {
455 self.expect_token(Token::LParen)?;
456 let precision = self.parse_u8("Decimal64")?;
457 self.expect_token(Token::Comma)?;
458 let scale = self.parse_i8("Decimal64")?;
459 self.expect_token(Token::RParen)?;
460 self.validate_decimal(precision, scale, "Decimal64", DECIMAL64_MAX_PRECISION)?;
461 Ok(DataType::Decimal64(precision, scale))
462 }
463
464 fn parse_decimal_128(&mut self) -> ArrowResult<DataType> {
466 self.expect_token(Token::LParen)?;
467 let precision = self.parse_u8("Decimal128")?;
468 self.expect_token(Token::Comma)?;
469 let scale = self.parse_i8("Decimal128")?;
470 self.expect_token(Token::RParen)?;
471 self.validate_decimal(precision, scale, "Decimal128", DECIMAL128_MAX_PRECISION)?;
472 Ok(DataType::Decimal128(precision, scale))
473 }
474
475 fn parse_decimal_256(&mut self) -> ArrowResult<DataType> {
477 self.expect_token(Token::LParen)?;
478 let precision = self.parse_u8("Decimal256")?;
479 self.expect_token(Token::Comma)?;
480 let scale = self.parse_i8("Decimal256")?;
481 self.expect_token(Token::RParen)?;
482 self.validate_decimal(precision, scale, "Decimal256", DECIMAL256_MAX_PRECISION)?;
483 Ok(DataType::Decimal256(precision, scale))
484 }
485
486 fn parse_dictionary(&mut self) -> ArrowResult<DataType> {
488 self.expect_token(Token::LParen)?;
489 let key_type = self.parse_next_type()?;
490 self.expect_token(Token::Comma)?;
491 let value_type = self.parse_next_type()?;
492 self.expect_token(Token::RParen)?;
493 Ok(DataType::Dictionary(
494 Box::new(key_type),
495 Box::new(value_type),
496 ))
497 }
498
499 fn parse_struct(&mut self) -> ArrowResult<DataType> {
501 self.expect_token(Token::LParen)?;
502 let mut fields = Vec::new();
503 loop {
504 if self
505 .tokenizer
506 .next_if(|next| matches!(next, Ok(Token::RParen)))
507 .is_some()
508 {
509 break;
510 }
511
512 let field = self.parse_field()?;
513 fields.push(Arc::new(field));
514 match self.next_token()? {
515 Token::Comma => continue,
516 Token::RParen => break,
517 tok => {
518 return Err(make_error(
519 self.val,
520 &format!(
521 "Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"
522 ),
523 ));
524 }
525 }
526 }
527 Ok(DataType::Struct(Fields::from(fields)))
528 }
529
530 fn parse_union(&mut self) -> ArrowResult<DataType> {
533 self.expect_token(Token::LParen)?;
534 let union_mode = self.parse_union_mode()?;
535 let mut type_ids = vec![];
536 let mut fields = vec![];
537 loop {
538 if self
539 .tokenizer
540 .next_if(|next| matches!(next, Ok(Token::RParen)))
541 .is_some()
542 {
543 break;
544 }
545 self.expect_token(Token::Comma)?;
546 let (type_id, field) = self.parse_union_field()?;
547 type_ids.push(type_id);
548 fields.push(field);
549 }
550 Ok(DataType::Union(
551 UnionFields::try_new(type_ids, fields)?,
552 union_mode,
553 ))
554 }
555
556 fn parse_union_mode(&mut self) -> ArrowResult<UnionMode> {
558 match self.next_token()? {
559 Token::UnionMode(union_mode) => Ok(union_mode),
560 tok => Err(make_error(
561 self.val,
562 &format!("finding UnionMode for Union, got {tok}"),
563 )),
564 }
565 }
566
567 fn parse_union_field(&mut self) -> ArrowResult<(i8, Field)> {
570 let type_id = self.parse_i8("UnionField")?;
571 self.expect_token(Token::Colon)?;
572 self.expect_token(Token::LParen)?;
573 let field = self.parse_field()?;
574 self.expect_token(Token::RParen)?;
575 Ok((type_id, field))
576 }
577
578 fn parse_map(&mut self) -> ArrowResult<DataType> {
581 self.expect_token(Token::LParen)?;
582 let field = self.parse_field()?;
583 self.expect_token(Token::Comma)?;
584 let sorted = self.parse_map_sorted()?;
585 self.expect_token(Token::RParen)?;
586 Ok(DataType::Map(Arc::new(field), sorted))
587 }
588
589 fn parse_map_sorted(&mut self) -> ArrowResult<bool> {
591 match self.next_token()? {
592 Token::MapSorted(sorted) => Ok(sorted),
593 tok => Err(make_error(
594 self.val,
595 &format!("Expected sorted or unsorted for a map; got {tok:?}"),
596 )),
597 }
598 }
599
600 fn parse_run_end_encoded(&mut self) -> ArrowResult<DataType> {
603 self.expect_token(Token::LParen)?;
604 let run_ends = self.parse_field()?;
605 self.expect_token(Token::Comma)?;
606 let values = self.parse_field()?;
607 self.expect_token(Token::RParen)?;
608 Ok(DataType::RunEndEncoded(
609 Arc::new(run_ends),
610 Arc::new(values),
611 ))
612 }
613
614 fn parse_opt_nullable(&mut self) -> bool {
616 let tok = self
617 .tokenizer
618 .next_if(|next| matches!(next, Ok(Token::NonNull | Token::Nullable)));
619 !matches!(tok, Some(Ok(Token::NonNull)))
620 }
621
622 fn next_token(&mut self) -> ArrowResult<Token> {
624 match self.tokenizer.next() {
625 None => Err(make_error(self.val, "finding next token")),
626 Some(token) => token,
627 }
628 }
629
630 fn expect_token(&mut self, tok: Token) -> ArrowResult<()> {
632 let next_token = self.next_token()?;
633 if next_token == tok {
634 Ok(())
635 } else {
636 Err(make_error_expected(self.val, &tok, &next_token))
637 }
638 }
639}
640
/// Returns `true` if `c` ends a word: a parenthesis, comma, colon or space.
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ':' | ' ')
}
645
/// Which quote character delimits a quoted string token.
enum QuoteType {
    /// Delimited by `"`.
    Double,
    /// Delimited by `'`.
    Single,
}
650
/// Splits a type string into [`Token`]s; used through its [`Iterator`] impl.
#[derive(Debug)]
struct Tokenizer<'a> {
    /// The complete input string; kept so error messages can quote it.
    val: &'a str,
    /// Remaining characters of `val`, peekable for one-character lookahead.
    chars: Peekable<Chars<'a>>,
    /// Scratch buffer holding the word or quoted string currently being read;
    /// cleared at the start of each one.
    word: String,
}
666
667impl<'a> Tokenizer<'a> {
668 fn new(val: &'a str) -> Self {
669 Self {
670 val,
671 chars: val.chars().peekable(),
672 word: String::new(),
673 }
674 }
675
676 fn peek_next_char(&mut self) -> Option<char> {
678 self.chars.peek().copied()
679 }
680
681 fn next_char(&mut self) -> Option<char> {
683 self.chars.next()
684 }
685
686 fn parse_word(&mut self) -> ArrowResult<Token> {
689 self.word.clear();
691 loop {
692 match self.peek_next_char() {
693 None => break,
694 Some(c) if is_separator(c) => break,
695 Some(c) => {
696 self.next_char();
697 self.word.push(c);
698 }
699 }
700 }
701
702 if let Some(c) = self.word.chars().next() {
703 if c == '-' || c.is_numeric() {
705 let val: i64 = self.word.parse().map_err(|e| {
706 make_error(self.val, &format!("parsing {} as integer: {e}", self.word))
707 })?;
708 return Ok(Token::Integer(val));
709 }
710 }
711
712 let token = match self.word.as_str() {
714 "Null" => Token::SimpleType(DataType::Null),
715 "Boolean" => Token::SimpleType(DataType::Boolean),
716
717 "Int8" => Token::SimpleType(DataType::Int8),
718 "Int16" => Token::SimpleType(DataType::Int16),
719 "Int32" => Token::SimpleType(DataType::Int32),
720 "Int64" => Token::SimpleType(DataType::Int64),
721
722 "UInt8" => Token::SimpleType(DataType::UInt8),
723 "UInt16" => Token::SimpleType(DataType::UInt16),
724 "UInt32" => Token::SimpleType(DataType::UInt32),
725 "UInt64" => Token::SimpleType(DataType::UInt64),
726
727 "Utf8" => Token::SimpleType(DataType::Utf8),
728 "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
729 "Utf8View" => Token::SimpleType(DataType::Utf8View),
730 "Binary" => Token::SimpleType(DataType::Binary),
731 "BinaryView" => Token::SimpleType(DataType::BinaryView),
732 "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
733
734 "Float16" => Token::SimpleType(DataType::Float16),
735 "Float32" => Token::SimpleType(DataType::Float32),
736 "Float64" => Token::SimpleType(DataType::Float64),
737
738 "Date32" => Token::SimpleType(DataType::Date32),
739 "Date64" => Token::SimpleType(DataType::Date64),
740
741 "List" => Token::List,
742 "ListView" => Token::ListView,
743 "LargeList" => Token::LargeList,
744 "LargeListView" => Token::LargeListView,
745 "FixedSizeList" => Token::FixedSizeList,
746
747 "s" | "Second" => Token::TimeUnit(TimeUnit::Second),
748 "ms" | "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
749 "µs" | "us" | "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
750 "ns" | "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
751
752 "Timestamp" => Token::Timestamp,
753 "Time32" => Token::Time32,
754 "Time64" => Token::Time64,
755 "Duration" => Token::Duration,
756 "Interval" => Token::Interval,
757 "Dictionary" => Token::Dictionary,
758
759 "FixedSizeBinary" => Token::FixedSizeBinary,
760
761 "Decimal32" => Token::Decimal32,
762 "Decimal64" => Token::Decimal64,
763 "Decimal128" => Token::Decimal128,
764 "Decimal256" => Token::Decimal256,
765
766 "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
767 "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
768 "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
769
770 "Some" => Token::Some,
771 "None" => Token::None,
772
773 "non-null" => Token::NonNull,
774 "nullable" => Token::Nullable,
775 "field" => Token::Field,
776 "x" => Token::X,
777
778 "Struct" => Token::Struct,
779
780 "Union" => Token::Union,
781 "Sparse" => Token::UnionMode(UnionMode::Sparse),
782 "Dense" => Token::UnionMode(UnionMode::Dense),
783
784 "Map" => Token::Map,
785 "sorted" => Token::MapSorted(true),
786 "unsorted" => Token::MapSorted(false),
787
788 "RunEndEncoded" => Token::RunEndEncoded,
789
790 token => {
791 return Err(make_error(self.val, &format!("unknown token: {token}")));
792 }
793 };
794 Ok(token)
795 }
796
797 fn parse_quoted_string(&mut self, quote_type: QuoteType) -> ArrowResult<Token> {
799 let quote = match quote_type {
800 QuoteType::Double => '\"',
801 QuoteType::Single => '\'',
802 };
803
804 if self.next_char() != Some(quote) {
805 return Err(make_error(self.val, "Expected \""));
806 }
807
808 self.word.clear();
810
811 let mut is_escaped = false;
812
813 loop {
814 match self.next_char() {
815 None => {
816 return Err(ArrowError::ParseError(format!(
817 "Unterminated string at: \"{}",
818 self.word
819 )));
820 }
821 Some(c) => match c {
822 '\\' => {
823 is_escaped = true;
824 self.word.push(c);
825 }
826 c if c == quote => {
827 if is_escaped {
828 self.word.push(c);
829 is_escaped = false;
830 } else {
831 break;
832 }
833 }
834 c => {
835 self.word.push(c);
836 }
837 },
838 }
839 }
840
841 let val: String = self.word.parse().map_err(|err| {
842 ArrowError::ParseError(format!("Failed to parse string: \"{}\": {err}", self.word))
843 })?;
844
845 if val.is_empty() {
846 return Err(make_error(self.val, "empty strings aren't allowed"));
848 }
849
850 match quote_type {
851 QuoteType::Double => Ok(Token::DoubleQuotedString(val)),
852 QuoteType::Single => Ok(Token::SingleQuotedString(val)),
853 }
854 }
855}
856
857impl Iterator for Tokenizer<'_> {
858 type Item = ArrowResult<Token>;
859
860 fn next(&mut self) -> Option<Self::Item> {
861 loop {
862 match self.peek_next_char()? {
863 ' ' => {
864 self.next_char();
866 continue;
867 }
868 '"' => {
869 return Some(self.parse_quoted_string(QuoteType::Double));
870 }
871 '\'' => {
872 return Some(self.parse_quoted_string(QuoteType::Single));
873 }
874 '(' => {
875 self.next_char();
876 return Some(Ok(Token::LParen));
877 }
878 ')' => {
879 self.next_char();
880 return Some(Ok(Token::RParen));
881 }
882 ',' => {
883 self.next_char();
884 return Some(Ok(Token::Comma));
885 }
886 ':' => {
887 self.next_char();
888 return Some(Ok(Token::Colon));
889 }
890 _ => return Some(self.parse_word()),
891 }
892 }
893 }
894}
895
/// A lexical token produced by [`Tokenizer`].
#[derive(Debug, PartialEq)]
enum Token {
    /// A data type that takes no arguments, e.g. `Int32` or `Utf8`.
    SimpleType(DataType),
    /// The `Timestamp` keyword.
    Timestamp,
    /// The `Time32` keyword.
    Time32,
    /// The `Time64` keyword.
    Time64,
    /// The `Duration` keyword.
    Duration,
    /// The `Interval` keyword.
    Interval,
    /// The `FixedSizeBinary` keyword.
    FixedSizeBinary,
    /// The `Decimal32` keyword.
    Decimal32,
    /// The `Decimal64` keyword.
    Decimal64,
    /// The `Decimal128` keyword.
    Decimal128,
    /// The `Decimal256` keyword.
    Decimal256,
    /// The `Dictionary` keyword.
    Dictionary,
    /// A time unit such as `s`, `ms`, `µs`/`us`, `ns` or its long name.
    TimeUnit(TimeUnit),
    /// An interval unit: `YearMonth`, `DayTime` or `MonthDayNano`.
    IntervalUnit(IntervalUnit),
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// The word `Some`.
    Some,
    /// The word `None`.
    None,
    /// An integer literal.
    Integer(i64),
    /// The contents of a `"double quoted"` string.
    DoubleQuotedString(String),
    /// The contents of a `'single quoted'` string.
    SingleQuotedString(String),
    /// The `List` keyword.
    List,
    /// The `ListView` keyword.
    ListView,
    /// The `LargeList` keyword.
    LargeList,
    /// The `LargeListView` keyword.
    LargeListView,
    /// The `FixedSizeList` keyword.
    FixedSizeList,
    /// The `Struct` keyword.
    Struct,
    /// The `Union` keyword.
    Union,
    /// A union mode: `Sparse` or `Dense`.
    UnionMode(UnionMode),
    /// The `Map` keyword.
    Map,
    /// `sorted` (`true`) or `unsorted` (`false`).
    MapSorted(bool),
    /// The `RunEndEncoded` keyword.
    RunEndEncoded,
    /// The `non-null` field modifier.
    NonNull,
    /// The `nullable` field modifier.
    Nullable,
    /// The word `field`.
    Field,
    /// The word `x`.
    X,
}
940
941impl Display for Token {
942 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
943 match self {
944 Token::SimpleType(t) => write!(f, "{t}"),
945 Token::List => write!(f, "List"),
946 Token::ListView => write!(f, "ListView"),
947 Token::LargeList => write!(f, "LargeList"),
948 Token::LargeListView => write!(f, "LargeListView"),
949 Token::FixedSizeList => write!(f, "FixedSizeList"),
950 Token::Timestamp => write!(f, "Timestamp"),
951 Token::Time32 => write!(f, "Time32"),
952 Token::Time64 => write!(f, "Time64"),
953 Token::Duration => write!(f, "Duration"),
954 Token::Interval => write!(f, "Interval"),
955 Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
956 Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
957 Token::LParen => write!(f, "("),
958 Token::RParen => write!(f, ")"),
959 Token::Comma => write!(f, ","),
960 Token::Colon => write!(f, ":"),
961 Token::Some => write!(f, "Some"),
962 Token::None => write!(f, "None"),
963 Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
964 Token::Decimal32 => write!(f, "Decimal32"),
965 Token::Decimal64 => write!(f, "Decimal64"),
966 Token::Decimal128 => write!(f, "Decimal128"),
967 Token::Decimal256 => write!(f, "Decimal256"),
968 Token::Dictionary => write!(f, "Dictionary"),
969 Token::Integer(v) => write!(f, "Integer({v})"),
970 Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
971 Token::SingleQuotedString(s) => write!(f, "SingleQuotedString({s})"),
972 Token::Struct => write!(f, "Struct"),
973 Token::Union => write!(f, "Union"),
974 Token::UnionMode(m) => write!(f, "{m:?}"),
975 Token::Map => write!(f, "Map"),
976 Token::MapSorted(sorted) => {
977 write!(f, "{}", if *sorted { "sorted" } else { "unsorted" })
978 }
979 Token::RunEndEncoded => write!(f, "RunEndEncoded"),
980 Token::NonNull => write!(f, "non-null"),
981 Token::Nullable => write!(f, "nullable"),
982 Token::Field => write!(f, "field"),
983 Token::X => write!(f, "x"),
984 }
985 }
986}
987
988#[cfg(test)]
989mod test {
990 use super::*;
991
992 #[test]
993 fn test_parse_data_type() {
994 for dt in list_datatypes() {
996 round_trip(dt)
997 }
998 }
999
1000 fn round_trip(data_type: DataType) {
1003 let data_type_string = data_type.to_string();
1004 println!("Input '{data_type_string}' ({data_type:?})");
1005 let parsed_type = parse_data_type(&data_type_string).unwrap();
1006 assert_eq!(
1007 data_type, parsed_type,
1008 "Mismatch parsing {data_type_string}"
1009 );
1010 }
1011
    /// Returns the data types exercised by the round-trip test.
    fn list_datatypes() -> Vec<DataType> {
        vec![
            // ---- Types without arguments: null, boolean, integers, floats ----
            DataType::Null,
            DataType::Boolean,
            DataType::Int8,
            DataType::Int16,
            DataType::Int32,
            DataType::Int64,
            DataType::UInt8,
            DataType::UInt16,
            DataType::UInt32,
            DataType::UInt64,
            DataType::Float16,
            DataType::Float32,
            DataType::Float64,
            // ---- Timestamps: every unit, without and with a timezone ----
            DataType::Timestamp(TimeUnit::Second, None),
            DataType::Timestamp(TimeUnit::Millisecond, None),
            DataType::Timestamp(TimeUnit::Microsecond, None),
            DataType::Timestamp(TimeUnit::Nanosecond, None),
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
            // ---- Dates, times, durations and intervals ----
            DataType::Date32,
            DataType::Date64,
            DataType::Time32(TimeUnit::Second),
            DataType::Time32(TimeUnit::Millisecond),
            DataType::Time64(TimeUnit::Microsecond),
            DataType::Time64(TimeUnit::Nanosecond),
            DataType::Duration(TimeUnit::Second),
            DataType::Duration(TimeUnit::Millisecond),
            DataType::Duration(TimeUnit::Microsecond),
            DataType::Duration(TimeUnit::Nanosecond),
            DataType::Interval(IntervalUnit::YearMonth),
            DataType::Interval(IntervalUnit::DayTime),
            DataType::Interval(IntervalUnit::MonthDayNano),
            // ---- Binary and string types (including a zero-length fixed size) ----
            DataType::Binary,
            DataType::BinaryView,
            DataType::FixedSizeBinary(0),
            DataType::FixedSizeBinary(1234),
            DataType::LargeBinary,
            DataType::Utf8,
            DataType::Utf8View,
            DataType::LargeUtf8,
            // ---- Decimals ----
            DataType::Decimal32(7, 6),
            DataType::Decimal64(6, 5),
            DataType::Decimal128(7, 6),
            DataType::Decimal256(6, 5),
            // ---- Dictionaries, including parameterized and nested value types ----
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::FixedSizeBinary(23)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(
                    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
                ),
            ),
            // ---- Structs: several fields, keyword-like names, nested, single, empty ----
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new("f2", DataType::Float64, true),
                Field::new(
                    "f3",
                    DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
                    true,
                ),
                Field::new(
                    "f4",
                    DataType::Dictionary(
                        Box::new(DataType::Int8),
                        Box::new(DataType::FixedSizeBinary(23)),
                    ),
                    true,
                ),
            ])),
            DataType::Struct(Fields::from(vec![
                Field::new("Int64", DataType::Int64, true),
                Field::new("Float64", DataType::Float64, true),
            ])),
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new(
                    "nested_struct",
                    DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
                    true,
                ),
            ])),
            DataType::Struct(Fields::from(vec![Field::new("f1", DataType::Int64, true)])),
            DataType::Struct(Fields::empty()),
            // ---- List: default/custom field name, nullable/non-null, nested ----
            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::List(Arc::new(Field::new(
                "nested_list",
                DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            // ---- ListView: same variations ----
            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::ListView(Arc::new(Field::new(
                "nested_list_view",
                DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            // ---- LargeList: same variations ----
            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::LargeList(Arc::new(Field::new(
                "nested_large_list",
                DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            // ---- LargeListView: same variations ----
            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::LargeListView(Arc::new(Field::new(
                "nested_large_list_view",
                DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            // ---- FixedSizeList: same variations ----
            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)), 2),
            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, false)), 2),
            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, true)), 2),
            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, false)), 2),
            DataType::FixedSizeList(
                Arc::new(Field::new(
                    "nested_fixed_size_list",
                    DataType::FixedSizeList(
                        Arc::new(Field::new("Int64", DataType::Int64, true)),
                        2,
                    ),
                    true,
                )),
                2,
            ),
            // ---- Unions: both modes, nested, single field, no fields ----
            DataType::Union(
                UnionFields::from_fields(vec![
                    Field::new("Int32", DataType::Int32, false),
                    Field::new("Utf8", DataType::Utf8, true),
                ]),
                UnionMode::Sparse,
            ),
            DataType::Union(
                UnionFields::from_fields(vec![
                    Field::new("Int32", DataType::Int32, false),
                    Field::new("Utf8", DataType::Utf8, true),
                ]),
                UnionMode::Dense,
            ),
            DataType::Union(
                UnionFields::from_fields(vec![
                    Field::new_union(
                        "nested_union",
                        vec![0, 1],
                        vec![
                            Field::new("Int32", DataType::Int32, false),
                            Field::new("Utf8", DataType::Utf8, true),
                        ],
                        UnionMode::Dense,
                    ),
                    Field::new("Utf8", DataType::Utf8, true),
                ]),
                UnionMode::Sparse,
            ),
            DataType::Union(
                UnionFields::from_fields(vec![Field::new("Int32", DataType::Int32, false)]),
                UnionMode::Dense,
            ),
            DataType::Union(
                UnionFields::try_new(Vec::<i8>::new(), Vec::<Field>::new()).unwrap(),
                UnionMode::Sparse,
            ),
            // ---- Maps: sorted, unsorted, nested ----
            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), true),
            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), false),
            DataType::Map(
                Arc::new(Field::new_map(
                    "nested_map",
                    "entries",
                    Field::new("key", DataType::Utf8, false),
                    Field::new("value", DataType::Int32, true),
                    false,
                    true,
                )),
                true,
            ),
            // ---- Run-end encoded: flat and nested ----
            DataType::RunEndEncoded(
                Arc::new(Field::new("run_ends", DataType::UInt32, false)),
                Arc::new(Field::new("values", DataType::Int32, true)),
            ),
            DataType::RunEndEncoded(
                Arc::new(Field::new(
                    "nested_run_end_encoded",
                    DataType::RunEndEncoded(
                        Arc::new(Field::new("run_ends", DataType::UInt32, false)),
                        Arc::new(Field::new("values", DataType::Int32, true)),
                    ),
                    true,
                )),
                Arc::new(Field::new("values", DataType::Int32, true)),
            ),
        ]
    }
1237
/// Checks that whitespace placed before, between and after tokens does not
/// change the `DataType` produced by `parse_data_type`.
#[test]
fn test_parse_data_type_whitespace_tolerance() {
    // All timestamp spellings below are expected to yield this same type.
    let timestamp_ns = || DataType::Timestamp(TimeUnit::Nanosecond, None);

    // (string to parse, expected DataType)
    let inputs = [
        ("Int8", DataType::Int8),
        ("Timestamp (ns)", timestamp_ns()),
        ("Timestamp (ns) ", timestamp_ns()),
        (" Timestamp (ns )", timestamp_ns()),
        ("Timestamp (ns ) ", timestamp_ns()),
    ];

    for (text, wanted) in inputs {
        let actual = parse_data_type(text).unwrap();
        assert_eq!(actual, wanted, "Parsing '{text}', expecting '{wanted}'");
    }
}
1269
/// Ensures that a fixed catalogue of type strings keeps parsing to the same
/// `DataType` values.
///
/// The catalogue includes the verbose `Timestamp(Nanosecond, None)` /
/// `Timestamp(Nanosecond, Some("+00:00"))` spellings as well as the compact
/// `Timestamp(ns)` / `Timestamp(ns, "+00:00")` ones, plus primitive, temporal,
/// binary, string, decimal, dictionary, struct and list types.
#[test]
fn test_parse_data_type_backwards_compatibility() {
    use DataType::*;
    use IntervalUnit::*;
    use TimeUnit::*;
    // Prints every type from `list_datatypes()` as a `("<Display>", <Debug>),`
    // line, i.e. the same shape as the entries of `cases` below. The output is
    // only visible when the test harness does not capture stdout.
    // NOTE(review): presumably this is how the table below was generated — confirm.
    for t in list_datatypes() {
        println!(r#"("{t}", {t:?}),"#);
    }
    // (string to parse, expected DataType)
    let cases = [
        // Verbose timestamp spelling without a timezone.
        ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
        ("Timestamp(Microsecond, None)", Timestamp(Microsecond, None)),
        ("Timestamp(Millisecond, None)", Timestamp(Millisecond, None)),
        ("Timestamp(Second, None)", Timestamp(Second, None)),
        // Verbose timestamp spelling with a timezone wrapped in `Some(..)`.
        (
            r#"Timestamp(Nanosecond, Some("+00:00"))"#,
            Timestamp(Nanosecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(Microsecond, Some("+00:00"))"#,
            Timestamp(Microsecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(Millisecond, Some("+00:00"))"#,
            Timestamp(Millisecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(Second, Some("+00:00"))"#,
            Timestamp(Second, Some("+00:00".into())),
        ),
        // Simple (parameterless) types.
        ("Null", Null),
        ("Boolean", Boolean),
        ("Int8", Int8),
        ("Int16", Int16),
        ("Int32", Int32),
        ("Int64", Int64),
        ("UInt8", UInt8),
        ("UInt16", UInt16),
        ("UInt32", UInt32),
        ("UInt64", UInt64),
        ("Float16", Float16),
        ("Float32", Float32),
        ("Float64", Float64),
        // Compact timestamp spelling, with and without a timezone.
        ("Timestamp(s)", Timestamp(Second, None)),
        ("Timestamp(ms)", Timestamp(Millisecond, None)),
        ("Timestamp(µs)", Timestamp(Microsecond, None)),
        ("Timestamp(ns)", Timestamp(Nanosecond, None)),
        (
            r#"Timestamp(ns, "+00:00")"#,
            Timestamp(Nanosecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(µs, "+00:00")"#,
            Timestamp(Microsecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(ms, "+00:00")"#,
            Timestamp(Millisecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(s, "+00:00")"#,
            Timestamp(Second, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(ns, "+08:00")"#,
            Timestamp(Nanosecond, Some("+08:00".into())),
        ),
        (
            r#"Timestamp(µs, "+08:00")"#,
            Timestamp(Microsecond, Some("+08:00".into())),
        ),
        (
            r#"Timestamp(ms, "+08:00")"#,
            Timestamp(Millisecond, Some("+08:00".into())),
        ),
        (
            r#"Timestamp(s, "+08:00")"#,
            Timestamp(Second, Some("+08:00".into())),
        ),
        // Remaining temporal types.
        ("Date32", Date32),
        ("Date64", Date64),
        ("Time32(s)", Time32(Second)),
        ("Time32(ms)", Time32(Millisecond)),
        ("Time64(µs)", Time64(Microsecond)),
        ("Time64(ns)", Time64(Nanosecond)),
        ("Duration(s)", Duration(Second)),
        ("Duration(ms)", Duration(Millisecond)),
        ("Duration(µs)", Duration(Microsecond)),
        ("Duration(ns)", Duration(Nanosecond)),
        ("Interval(YearMonth)", Interval(YearMonth)),
        ("Interval(DayTime)", Interval(DayTime)),
        ("Interval(MonthDayNano)", Interval(MonthDayNano)),
        // Binary and string types.
        ("Binary", Binary),
        ("BinaryView", BinaryView),
        ("FixedSizeBinary(0)", FixedSizeBinary(0)),
        ("FixedSizeBinary(1234)", FixedSizeBinary(1234)),
        ("LargeBinary", LargeBinary),
        ("Utf8", Utf8),
        ("Utf8View", Utf8View),
        ("LargeUtf8", LargeUtf8),
        // Decimals: (precision, scale).
        ("Decimal32(7, 6)", Decimal32(7, 6)),
        ("Decimal64(6, 5)", Decimal64(6, 5)),
        ("Decimal128(7, 6)", Decimal128(7, 6)),
        ("Decimal256(6, 5)", Decimal256(6, 5)),
        // Dictionaries, including nested value types.
        (
            "Dictionary(Int32, Utf8)",
            Dictionary(Box::new(Int32), Box::new(Utf8)),
        ),
        (
            "Dictionary(Int8, Utf8)",
            Dictionary(Box::new(Int8), Box::new(Utf8)),
        ),
        (
            "Dictionary(Int8, Timestamp(ns))",
            Dictionary(Box::new(Int8), Box::new(Timestamp(Nanosecond, None))),
        ),
        (
            "Dictionary(Int8, FixedSizeBinary(23))",
            Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
        ),
        (
            "Dictionary(Int8, Dictionary(Int8, Utf8))",
            Dictionary(
                Box::new(Int8),
                Box::new(Dictionary(Box::new(Int8), Box::new(Utf8))),
            ),
        ),
        // Structs: flat, with type-like field names, nested, and empty.
        (
            r#"Struct("f1": nullable Int64, "f2": nullable Float64, "f3": nullable Timestamp(s, "+08:00"), "f4": nullable Dictionary(Int8, FixedSizeBinary(23)))"#,
            Struct(Fields::from(vec![
                Field::new("f1", Int64, true),
                Field::new("f2", Float64, true),
                Field::new("f3", Timestamp(Second, Some("+08:00".into())), true),
                Field::new(
                    "f4",
                    Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
                    true,
                ),
            ])),
        ),
        (
            r#"Struct("Int64": nullable Int64, "Float64": nullable Float64)"#,
            Struct(Fields::from(vec![
                Field::new("Int64", Int64, true),
                Field::new("Float64", Float64, true),
            ])),
        ),
        (
            r#"Struct("f1": nullable Int64, "nested_struct": nullable Struct("n1": nullable Int64))"#,
            Struct(Fields::from(vec![
                Field::new("f1", Int64, true),
                Field::new(
                    "nested_struct",
                    Struct(Fields::from(vec![Field::new("n1", Int64, true)])),
                    true,
                ),
            ])),
        ),
        (r#"Struct()"#, Struct(Fields::empty())),
        // Lists without an explicit field name or nullability marker.
        (
            "FixedSizeList(4, Int64)",
            FixedSizeList(Arc::new(Field::new_list_field(Int64, true)), 4),
        ),
        (
            "List(Int64)",
            List(Arc::new(Field::new_list_field(Int64, true))),
        ),
        (
            "LargeList(Int64)",
            LargeList(Arc::new(Field::new_list_field(Int64, true))),
        ),
    ];

    for (data_type_string, expected_data_type) in cases {
        let parsed_data_type = parse_data_type(data_type_string).unwrap();
        assert_eq!(
            parsed_data_type, expected_data_type,
            "Parsing '{data_type_string}', expecting '{expected_data_type}'"
        );
    }
}
1455
/// Feeds malformed type strings to `parse_data_type` and checks that each one
/// fails with an error whose text contains the expected fragment.
///
/// Every error except the "Unterminated string" ones must additionally carry
/// the generic "Must be a supported arrow type name ..." hint.
#[test]
fn parse_data_type_errors() {
    // (string to parse, fragment expected in the error message)
    let cases = [
        // Empty, unknown or truncated type names.
        ("", "Unsupported type ''"),
        ("", "Error finding next token"),
        ("null", "Unsupported type 'null'"),
        ("Nu", "Unsupported type 'Nu'"),
        // Malformed timezone strings.
        (r#"Timestamp(ns, +00:00)"#, "Error unknown token: +00"),
        (
            r#"Timestamp(ns, "+00:00)"#,
            r#"Unterminated string at: "+00:00)"#,
        ),
        (r#"Timestamp(ns, "")"#, r#"empty strings aren't allowed"#),
        (
            r#"Timestamp(ns, "+00:00"")"#,
            r#"Parser error: Unterminated string at: ")"#,
        ),
        ("Timestamp(ns, ", "Error finding next token"),
        // Trailing content after a complete type.
        (
            "Float32 Float32",
            "trailing content after parsing 'Float32'",
        ),
        ("Int32, ", "trailing content after parsing 'Int32'"),
        ("Int32(3), ", "trailing content after parsing 'Int32'"),
        // Non-integer lengths.
        (
            "FixedSizeBinary(Int32), ",
            "Error finding i64 for FixedSizeBinary, got 'Int32'",
        ),
        (
            "FixedSizeBinary(3.0), ",
            "Error parsing 3.0 as integer: invalid digit found in string",
        ),
        // Length does not fit into an i32.
        (
            "FixedSizeBinary(4000000000), ",
            "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted",
        ),
        // Negative lengths.
        (
            "FixedSizeBinary(-1), ",
            "FixedSizeBinary length must be non-negative, got -1",
        ),
        (
            "FixedSizeList(-1, Int64), ",
            "FixedSizeList length must be non-negative, got -1",
        ),
        // Negative precision does not fit into a u8.
        (
            "Decimal32(-3, 5)",
            "Error converting -3 into u8 for Decimal32: out of range integral type conversion attempted",
        ),
        (
            "Decimal64(-3, 5)",
            "Error converting -3 into u8 for Decimal64: out of range integral type conversion attempted",
        ),
        (
            "Decimal128(-3, 5)",
            "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted",
        ),
        (
            "Decimal256(-3, 5)",
            "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted",
        ),
        // Scale does not fit into an i8.
        (
            "Decimal32(3, 500)",
            "Error converting 500 into i8 for Decimal32: out of range integral type conversion attempted",
        ),
        (
            "Decimal64(3, 500)",
            "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted",
        ),
        (
            "Decimal128(3, 500)",
            "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted",
        ),
        (
            "Decimal256(3, 500)",
            "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted",
        ),
        // Malformed struct fields.
        ("Struct(f1 Int64)", "Error unknown token: f1"),
        ("Struct(\"f1\" Int64)", "Expected ':'"),
        (
            "Struct(\"f1\": )",
            "Error finding next type, got unexpected ')'",
        ),
        // Time units that are not valid for the given time type.
        (
            "Time32(µs)",
            "Error Time32 time unit must be 's' or 'ms', got 'µs'",
        ),
        (
            "Time32(ns)",
            "Error Time32 time unit must be 's' or 'ms', got 'ns'",
        ),
        (
            "Time64(s)",
            "Error Time64 time unit must be 'µs' or 'ns', got 's'",
        ),
        (
            "Time64(ms)",
            "Error Time64 time unit must be 'µs' or 'ns', got 'ms'",
        ),
        // Scale greater than precision.
        (
            "Decimal32(5, 6)",
            "Error Decimal32 scale '6' cannot be greater than precision '5'",
        ),
        (
            "Decimal64(5, 6)",
            "Error Decimal64 scale '6' cannot be greater than precision '5'",
        ),
        (
            "Decimal128(5, 6)",
            "Error Decimal128 scale '6' cannot be greater than precision '5'",
        ),
        (
            "Decimal256(5, 6)",
            "Error Decimal256 scale '6' cannot be greater than precision '5'",
        ),
        // Precision one above the allowed maximum.
        (
            "Decimal32(10, 0)",
            "Error Decimal32 precision must be in range [1, 9], got '10'",
        ),
        (
            "Decimal64(19, 0)",
            "Error Decimal64 precision must be in range [1, 18], got '19'",
        ),
        (
            "Decimal128(39, 0)",
            "Error Decimal128 precision must be in range [1, 38], got '39'",
        ),
        (
            "Decimal256(77, 0)",
            "Error Decimal256 precision must be in range [1, 76], got '77'",
        ),
        // Zero precision.
        (
            "Decimal32(0, 0)",
            "Error Decimal32 precision must be in range [1, 9], got '0'",
        ),
        (
            "Decimal64(0, 0)",
            "Error Decimal64 precision must be in range [1, 18], got '0'",
        ),
        (
            "Decimal128(0, 0)",
            "Error Decimal128 precision must be in range [1, 38], got '0'",
        ),
        (
            "Decimal256(0, 0)",
            "Error Decimal256 precision must be in range [1, 76], got '0'",
        ),
    ];

    for (input, fragment) in cases {
        println!("Parsing '{input}', expecting '{fragment}'");

        // A successful parse is a test failure; otherwise keep the rendered error.
        let message = match parse_data_type(input) {
            Err(error) => error.to_string(),
            Ok(d) => panic!("Expected error while parsing '{input}', but got '{d}'"),
        };

        assert!(
            message.contains(fragment),
            "\n\ndid not find expected in actual.\n\nexpected: {fragment}\nactual: {message}\n"
        );

        // Unterminated-string errors are exempt from the generic hint check.
        if message.contains("Unterminated string") {
            continue;
        }
        assert!(
            message.contains(
                "Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'"
            ),
            "message: {message}"
        );
    }
}
1631
/// Checks that an unknown type name is reported as `ArrowError::ParseError`
/// and pins the complete rendered error message.
#[test]
fn parse_error_type() {
    let failure = parse_data_type("foobar").unwrap_err();

    // The error variant must be a parse error ...
    assert!(matches!(failure, ArrowError::ParseError(_)));

    // ... and its rendered text must match exactly.
    let expected_text = "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error unknown token: foobar";
    assert_eq!(failure.to_string(), expected_text);
}
1641}