parquet/schema/
parser.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet schema parser.
19//! Provides methods to parse and validate string message type into Parquet
20//! [`Type`].
21//!
22//! # Example
23//!
24//! ```rust
25//! use parquet::schema::parser::parse_message_type;
26//!
27//! let message_type = "
28//!   message spark_schema {
29//!     OPTIONAL BYTE_ARRAY a (UTF8);
30//!     REQUIRED INT32 b;
31//!     REQUIRED DOUBLE c;
32//!     REQUIRED BOOLEAN d;
33//!     OPTIONAL group e (LIST) {
34//!       REPEATED group list {
35//!         REQUIRED INT32 element;
36//!       }
37//!     }
38//!   }
39//! ";
40//!
41//! let schema = parse_message_type(message_type).expect("Expected valid schema");
42//! println!("{:?}", schema);
43//! ```
44
45use std::sync::Arc;
46
47use crate::basic::{ConvertedType, LogicalType, Repetition, TimeUnit, Type as PhysicalType};
48use crate::errors::{ParquetError, Result};
49use crate::schema::types::{Type, TypePtr};
50
51/// Parses message type as string into a Parquet [`Type`]
52/// which, for example, could be used to extract individual columns. Returns Parquet
53/// general error when parsing or validation fails.
54pub fn parse_message_type(message_type: &str) -> Result<Type> {
55    let mut parser = Parser {
56        tokenizer: &mut Tokenizer::from_str(message_type),
57    };
58    parser.parse_message_type()
59}
60
/// Tokenizer to split a message type string into tokens.
///
/// The input is first split on whitespace; every piece is then split further around
/// the characters accepted by `is_schema_delim`. Delimiters are preserved as
/// single-character tokens. The tokenizer provides the [`Iterator`] interface to
/// process tokens; it also allows stepping back to reprocess previous tokens.
struct Tokenizer<'a> {
    // List of all tokens for a string; every token borrows from the input
    tokens: Vec<&'a str>,
    // Index of the token that the next call to `next` returns
    index: usize,
}

impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer from a message type string.
    ///
    /// All tokens are computed eagerly; the cursor starts at the first token.
    pub fn from_str(string: &'a str) -> Self {
        let tokens = string
            .split_whitespace()
            .flat_map(Self::split_token)
            .collect();
        Tokenizer { tokens, index: 0 }
    }

    /// Returns `true` for the special characters of the schema syntax.
    fn is_schema_delim(c: char) -> bool {
        matches!(c, ';' | '{' | '}' | '(' | ')' | '=' | ',')
    }

    /// Splits string into tokens; input string can already be a token or can contain
    /// delimiters, e.g. "required" -> Vec("required") and
    /// "(UTF8);" -> Vec("(", "UTF8", ")", ";")
    fn split_token(string: &str) -> Vec<&str> {
        let mut buffer: Vec<&str> = Vec::new();
        let mut tail = string;
        while let Some(index) = tail.find(Self::is_schema_delim) {
            let (h, t) = tail.split_at(index);
            // Text in front of the delimiter is a token of its own, unless empty.
            if !h.is_empty() {
                buffer.push(h);
            }
            // Every delimiter is a single ASCII character, i.e. exactly one byte,
            // so slicing off one byte cannot split a UTF-8 sequence.
            buffer.push(&t[0..1]);
            tail = &t[1..];
        }
        if !tail.is_empty() {
            buffer.push(tail);
        }
        buffer
    }

    /// Moves the cursor back by one token, so that the token most recently returned
    /// by `next` is returned again.
    ///
    /// Calling this before any token has been consumed is a no-op: the cursor never
    /// moves in front of the first token.
    ///
    /// Note that `next` does not advance the cursor once the tokens are exhausted,
    /// so backtracking after `next` returned `None` re-exposes the last token.
    fn backtrack(&mut self) {
        // A plain `-= 1` underflows at index 0: it panics in debug builds and wraps
        // to `usize::MAX` in release builds, after which `next` only returns `None`.
        self.index = self.index.saturating_sub(1);
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    type Item = &'a str;

    /// Returns the token under the cursor and advances the cursor, or returns `None`
    /// (leaving the cursor untouched) when all tokens have been consumed.
    fn next(&mut self) -> Option<&'a str> {
        let token = self.tokens.get(self.index).copied()?;
        self.index += 1;
        Some(token)
    }
}
128
/// Internal Schema parser.
/// Traverses message type using tokenizer and parses each group/primitive type
/// recursively.
struct Parser<'a> {
    // Token stream the parser consumes. It is borrowed mutably because parsing
    // both advances the tokenizer and steps it back (`backtrack`).
    tokenizer: &'a mut Tokenizer<'a>,
}
135
136// Utility function to assert token on validity.
137fn assert_token(token: Option<&str>, expected: &str) -> Result<()> {
138    match token {
139        Some(value) if value == expected => Ok(()),
140        Some(other) => Err(general_err!(
141            "Expected '{}', found token '{}'",
142            expected,
143            other
144        )),
145        None => Err(general_err!(
146            "Expected '{}', but no token found (None)",
147            expected
148        )),
149    }
150}
151
152// Utility function to parse i32 or return general error.
153#[inline]
154fn parse_i32(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result<i32> {
155    value
156        .ok_or_else(|| general_err!(not_found_msg))
157        .and_then(|v| v.parse::<i32>().map_err(|_| general_err!(parse_fail_msg)))
158}
159
160// Utility function to parse boolean or return general error.
161#[inline]
162fn parse_bool(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result<bool> {
163    value
164        .ok_or_else(|| general_err!(not_found_msg))
165        .and_then(|v| {
166            v.to_lowercase()
167                .parse::<bool>()
168                .map_err(|_| general_err!(parse_fail_msg))
169        })
170}
171
172// Utility function to parse TimeUnit or return general error.
173fn parse_timeunit(
174    value: Option<&str>,
175    not_found_msg: &str,
176    parse_fail_msg: &str,
177) -> Result<TimeUnit> {
178    value
179        .ok_or_else(|| general_err!(not_found_msg))
180        .and_then(|v| match v.to_uppercase().as_str() {
181            "MILLIS" => Ok(TimeUnit::MILLIS),
182            "MICROS" => Ok(TimeUnit::MICROS),
183            "NANOS" => Ok(TimeUnit::NANOS),
184            _ => Err(general_err!(parse_fail_msg)),
185        })
186}
187
188impl Parser<'_> {
189    // Entry function to parse message type, uses internal tokenizer.
190    fn parse_message_type(&mut self) -> Result<Type> {
191        // Check that message type starts with "message".
192        match self.tokenizer.next() {
193            Some("message") => {
194                let name = self
195                    .tokenizer
196                    .next()
197                    .ok_or_else(|| general_err!("Expected name, found None"))?;
198                Type::group_type_builder(name)
199                    .with_fields(self.parse_child_types()?)
200                    .build()
201            }
202            _ => Err(general_err!("Message type does not start with 'message'")),
203        }
204    }
205
206    // Parses child types for a current group type.
207    // This is only invoked on root and group types.
208    fn parse_child_types(&mut self) -> Result<Vec<TypePtr>> {
209        assert_token(self.tokenizer.next(), "{")?;
210        let mut vec = Vec::new();
211        while let Some(value) = self.tokenizer.next() {
212            if value == "}" {
213                break;
214            } else {
215                self.tokenizer.backtrack();
216                vec.push(Arc::new(self.add_type()?));
217            }
218        }
219        Ok(vec)
220    }
221
222    fn add_type(&mut self) -> Result<Type> {
223        // Parse repetition
224        let repetition = self
225            .tokenizer
226            .next()
227            .ok_or_else(|| general_err!("Expected repetition, found None"))
228            .and_then(|v| v.to_uppercase().parse::<Repetition>())?;
229
230        match self.tokenizer.next() {
231            Some(group) if group.to_uppercase() == "GROUP" => self.add_group_type(Some(repetition)),
232            Some(type_string) => {
233                let physical_type = type_string.to_uppercase().parse::<PhysicalType>()?;
234                self.add_primitive_type(repetition, physical_type)
235            }
236            None => Err(general_err!("Invalid type, could not extract next token")),
237        }
238    }
239
240    fn add_group_type(&mut self, repetition: Option<Repetition>) -> Result<Type> {
241        // Parse name of the group type
242        let name = self
243            .tokenizer
244            .next()
245            .ok_or_else(|| general_err!("Expected name, found None"))?;
246
247        // Parse logical or converted type if exists
248        let (logical_type, converted_type) = if let Some("(") = self.tokenizer.next() {
249            let tpe = self
250                .tokenizer
251                .next()
252                .ok_or_else(|| general_err!("Expected converted type, found None"))
253                .and_then(|v| {
254                    // Try logical type first
255                    let upper = v.to_uppercase();
256                    let logical = upper.parse::<LogicalType>();
257                    match logical {
258                        Ok(logical) => {
259                            Ok((Some(logical.clone()), ConvertedType::from(Some(logical))))
260                        }
261                        Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
262                    }
263                })?;
264            assert_token(self.tokenizer.next(), ")")?;
265            tpe
266        } else {
267            self.tokenizer.backtrack();
268            (None, ConvertedType::NONE)
269        };
270
271        // Parse optional id
272        let id = if let Some("=") = self.tokenizer.next() {
273            self.tokenizer.next().and_then(|v| v.parse::<i32>().ok())
274        } else {
275            self.tokenizer.backtrack();
276            None
277        };
278
279        let mut builder = Type::group_type_builder(name)
280            .with_logical_type(logical_type)
281            .with_converted_type(converted_type)
282            .with_fields(self.parse_child_types()?)
283            .with_id(id);
284        if let Some(rep) = repetition {
285            builder = builder.with_repetition(rep);
286        }
287        builder.build()
288    }
289
290    fn add_primitive_type(
291        &mut self,
292        repetition: Repetition,
293        physical_type: PhysicalType,
294    ) -> Result<Type> {
295        // Read type length if the type is FIXED_LEN_BYTE_ARRAY.
296        let mut length: i32 = -1;
297        if physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY {
298            assert_token(self.tokenizer.next(), "(")?;
299            length = parse_i32(
300                self.tokenizer.next(),
301                "Expected length for FIXED_LEN_BYTE_ARRAY, found None",
302                "Failed to parse length for FIXED_LEN_BYTE_ARRAY",
303            )?;
304            assert_token(self.tokenizer.next(), ")")?;
305        }
306
307        // Parse name of the primitive type
308        let name = self
309            .tokenizer
310            .next()
311            .ok_or_else(|| general_err!("Expected name, found None"))?;
312
313        // Parse converted type
314        let (logical_type, converted_type, precision, scale) = if let Some("(") =
315            self.tokenizer.next()
316        {
317            let (mut logical, mut converted) = self
318                .tokenizer
319                .next()
320                .ok_or_else(|| general_err!("Expected logical or converted type, found None"))
321                .and_then(|v| {
322                    let upper = v.to_uppercase();
323                    let logical = upper.parse::<LogicalType>();
324                    match logical {
325                        Ok(logical) => {
326                            Ok((Some(logical.clone()), ConvertedType::from(Some(logical))))
327                        }
328                        Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
329                    }
330                })?;
331
332            // Parse precision and scale for decimals
333            let mut precision: i32 = -1;
334            let mut scale: i32 = -1;
335
336            // Parse the concrete logical type
337            if let Some(tpe) = &logical {
338                match tpe {
339                    LogicalType::Decimal { .. } => {
340                        if let Some("(") = self.tokenizer.next() {
341                            precision = parse_i32(
342                                self.tokenizer.next(),
343                                "Expected precision, found None",
344                                "Failed to parse precision for DECIMAL type",
345                            )?;
346                            if let Some(",") = self.tokenizer.next() {
347                                scale = parse_i32(
348                                    self.tokenizer.next(),
349                                    "Expected scale, found None",
350                                    "Failed to parse scale for DECIMAL type",
351                                )?;
352                                assert_token(self.tokenizer.next(), ")")?;
353                            } else {
354                                scale = 0
355                            }
356                            logical = Some(LogicalType::Decimal { scale, precision });
357                            converted = ConvertedType::from(logical.clone());
358                        }
359                    }
360                    LogicalType::Time { .. } => {
361                        if let Some("(") = self.tokenizer.next() {
362                            let unit = parse_timeunit(
363                                self.tokenizer.next(),
364                                "Invalid timeunit found",
365                                "Failed to parse timeunit for TIME type",
366                            )?;
367                            if let Some(",") = self.tokenizer.next() {
368                                let is_adjusted_to_u_t_c = parse_bool(
369                                    self.tokenizer.next(),
370                                    "Invalid boolean found",
371                                    "Failed to parse timezone info for TIME type",
372                                )?;
373                                assert_token(self.tokenizer.next(), ")")?;
374                                logical = Some(LogicalType::Time {
375                                    is_adjusted_to_u_t_c,
376                                    unit,
377                                });
378                                converted = ConvertedType::from(logical.clone());
379                            } else {
380                                // Invalid token for unit
381                                self.tokenizer.backtrack();
382                            }
383                        }
384                    }
385                    LogicalType::Timestamp { .. } => {
386                        if let Some("(") = self.tokenizer.next() {
387                            let unit = parse_timeunit(
388                                self.tokenizer.next(),
389                                "Invalid timeunit found",
390                                "Failed to parse timeunit for TIMESTAMP type",
391                            )?;
392                            if let Some(",") = self.tokenizer.next() {
393                                let is_adjusted_to_u_t_c = parse_bool(
394                                    self.tokenizer.next(),
395                                    "Invalid boolean found",
396                                    "Failed to parse timezone info for TIMESTAMP type",
397                                )?;
398                                assert_token(self.tokenizer.next(), ")")?;
399                                logical = Some(LogicalType::Timestamp {
400                                    is_adjusted_to_u_t_c,
401                                    unit,
402                                });
403                                converted = ConvertedType::from(logical.clone());
404                            } else {
405                                // Invalid token for unit
406                                self.tokenizer.backtrack();
407                            }
408                        }
409                    }
410                    LogicalType::Integer { .. } => {
411                        if let Some("(") = self.tokenizer.next() {
412                            let bit_width = parse_i32(
413                                self.tokenizer.next(),
414                                "Invalid bit_width found",
415                                "Failed to parse bit_width for INTEGER type",
416                            )? as i8;
417                            match physical_type {
418                                PhysicalType::INT32 => match bit_width {
419                                    8 | 16 | 32 => {}
420                                    _ => {
421                                        return Err(general_err!(
422                                            "Incorrect bit width {} for INT32",
423                                            bit_width
424                                        ));
425                                    }
426                                },
427                                PhysicalType::INT64 => {
428                                    if bit_width != 64 {
429                                        return Err(general_err!(
430                                            "Incorrect bit width {} for INT64",
431                                            bit_width
432                                        ));
433                                    }
434                                }
435                                _ => {
436                                    return Err(general_err!(
437                                        "Logical type Integer cannot be used with physical type {}",
438                                        physical_type
439                                    ));
440                                }
441                            }
442                            if let Some(",") = self.tokenizer.next() {
443                                let is_signed = parse_bool(
444                                    self.tokenizer.next(),
445                                    "Invalid boolean found",
446                                    "Failed to parse is_signed for INTEGER type",
447                                )?;
448                                assert_token(self.tokenizer.next(), ")")?;
449                                logical = Some(LogicalType::Integer {
450                                    bit_width,
451                                    is_signed,
452                                });
453                                converted = ConvertedType::from(logical.clone());
454                            } else {
455                                // Invalid token for unit
456                                self.tokenizer.backtrack();
457                            }
458                        }
459                    }
460                    _ => {}
461                }
462            } else if converted == ConvertedType::DECIMAL {
463                if let Some("(") = self.tokenizer.next() {
464                    // Parse precision
465                    precision = parse_i32(
466                        self.tokenizer.next(),
467                        "Expected precision, found None",
468                        "Failed to parse precision for DECIMAL type",
469                    )?;
470
471                    // Parse scale
472                    scale = if let Some(",") = self.tokenizer.next() {
473                        parse_i32(
474                            self.tokenizer.next(),
475                            "Expected scale, found None",
476                            "Failed to parse scale for DECIMAL type",
477                        )?
478                    } else {
479                        // Scale is not provided, set it to 0.
480                        self.tokenizer.backtrack();
481                        0
482                    };
483
484                    assert_token(self.tokenizer.next(), ")")?;
485                } else {
486                    self.tokenizer.backtrack();
487                }
488            }
489
490            assert_token(self.tokenizer.next(), ")")?;
491            (logical, converted, precision, scale)
492        } else {
493            self.tokenizer.backtrack();
494            (None, ConvertedType::NONE, -1, -1)
495        };
496
497        // Parse optional id
498        let id = if let Some("=") = self.tokenizer.next() {
499            self.tokenizer.next().and_then(|v| v.parse::<i32>().ok())
500        } else {
501            self.tokenizer.backtrack();
502            None
503        };
504        assert_token(self.tokenizer.next(), ";")?;
505
506        Type::primitive_type_builder(name, physical_type)
507            .with_repetition(repetition)
508            .with_logical_type(logical_type)
509            .with_converted_type(converted_type)
510            .with_length(length)
511            .with_precision(precision)
512            .with_scale(scale)
513            .with_id(id)
514            .build()
515    }
516}
517
518#[cfg(test)]
519mod tests {
520    use super::*;
521
522    #[test]
523    fn test_tokenize_empty_string() {
524        assert_eq!(Tokenizer::from_str("").next(), None);
525    }
526
527    #[test]
528    fn test_tokenize_delimiters() {
529        let mut iter = Tokenizer::from_str(",;{}()=");
530        assert_eq!(iter.next(), Some(","));
531        assert_eq!(iter.next(), Some(";"));
532        assert_eq!(iter.next(), Some("{"));
533        assert_eq!(iter.next(), Some("}"));
534        assert_eq!(iter.next(), Some("("));
535        assert_eq!(iter.next(), Some(")"));
536        assert_eq!(iter.next(), Some("="));
537        assert_eq!(iter.next(), None);
538    }
539
540    #[test]
541    fn test_tokenize_delimiters_with_whitespaces() {
542        let mut iter = Tokenizer::from_str(" , ; { } ( ) = ");
543        assert_eq!(iter.next(), Some(","));
544        assert_eq!(iter.next(), Some(";"));
545        assert_eq!(iter.next(), Some("{"));
546        assert_eq!(iter.next(), Some("}"));
547        assert_eq!(iter.next(), Some("("));
548        assert_eq!(iter.next(), Some(")"));
549        assert_eq!(iter.next(), Some("="));
550        assert_eq!(iter.next(), None);
551    }
552
553    #[test]
554    fn test_tokenize_words() {
555        let mut iter = Tokenizer::from_str("abc def ghi jkl mno");
556        assert_eq!(iter.next(), Some("abc"));
557        assert_eq!(iter.next(), Some("def"));
558        assert_eq!(iter.next(), Some("ghi"));
559        assert_eq!(iter.next(), Some("jkl"));
560        assert_eq!(iter.next(), Some("mno"));
561        assert_eq!(iter.next(), None);
562    }
563
564    #[test]
565    fn test_tokenize_backtrack() {
566        let mut iter = Tokenizer::from_str("abc;");
567        assert_eq!(iter.next(), Some("abc"));
568        assert_eq!(iter.next(), Some(";"));
569        iter.backtrack();
570        assert_eq!(iter.next(), Some(";"));
571        assert_eq!(iter.next(), None);
572    }
573
574    #[test]
575    fn test_tokenize_message_type() {
576        let schema = "
577    message schema {
578      required int32 a;
579      optional binary c (UTF8);
580      required group d {
581        required int32 a;
582        optional binary c (UTF8);
583      }
584      required group e (LIST) {
585        repeated group list {
586          required int32 element;
587        }
588      }
589    }
590    ";
591        let iter = Tokenizer::from_str(schema);
592        let mut res = Vec::new();
593        for token in iter {
594            res.push(token);
595        }
596        assert_eq!(
597            res,
598            vec![
599                "message", "schema", "{", "required", "int32", "a", ";", "optional", "binary", "c",
600                "(", "UTF8", ")", ";", "required", "group", "d", "{", "required", "int32", "a",
601                ";", "optional", "binary", "c", "(", "UTF8", ")", ";", "}", "required", "group",
602                "e", "(", "LIST", ")", "{", "repeated", "group", "list", "{", "required", "int32",
603                "element", ";", "}", "}", "}"
604            ]
605        );
606    }
607
608    #[test]
609    fn test_assert_token() {
610        assert!(assert_token(Some("a"), "a").is_ok());
611        assert!(assert_token(Some("a"), "b").is_err());
612        assert!(assert_token(None, "b").is_err());
613    }
614
615    fn parse(schema: &str) -> Result<Type, ParquetError> {
616        let mut iter = Tokenizer::from_str(schema);
617        Parser {
618            tokenizer: &mut iter,
619        }
620        .parse_message_type()
621    }
622
623    #[test]
624    fn test_parse_message_type_invalid() {
625        assert_eq!(
626            parse("test").unwrap_err().to_string(),
627            "Parquet error: Message type does not start with 'message'"
628        );
629    }
630
631    #[test]
632    fn test_parse_message_type_no_name() {
633        assert_eq!(
634            parse("message").unwrap_err().to_string(),
635            "Parquet error: Expected name, found None"
636        );
637    }
638
639    #[test]
640    fn test_parse_message_type_fixed_byte_array() {
641        let schema = "
642            message schema {
643              REQUIRED FIXED_LEN_BYTE_ARRAY col;
644            }
645        ";
646        assert_eq!(
647            parse(schema).unwrap_err().to_string(),
648            "Parquet error: Expected '(', found token 'col'"
649        );
650
651        let schema = "
652            message schema {
653              REQUIRED FIXED_LEN_BYTE_ARRAY(16) col;
654            }
655        ";
656        parse(schema).unwrap();
657    }
658
659    #[test]
660    fn test_parse_message_type_integer() {
661        // Invalid integer syntax
662        let schema = "
663            message root {
664              optional int64 f1 (INTEGER());
665            }
666        ";
667        assert_eq!(
668            parse(schema).unwrap_err().to_string(),
669            "Parquet error: Failed to parse bit_width for INTEGER type"
670        );
671
672        // Invalid integer syntax, needs both bit-width and UTC sign
673        let schema = "
674    message root {
675      optional int64 f1 (INTEGER(32,));
676    }
677    ";
678        assert_eq!(
679            parse(schema).unwrap_err().to_string(),
680            "Parquet error: Incorrect bit width 32 for INT64"
681        );
682
683        // Invalid integer because of non-numeric bit width
684        let schema = "
685            message root {
686              optional int32 f1 (INTEGER(eight,true));
687            }
688        ";
689        assert_eq!(
690            parse(schema).unwrap_err().to_string(),
691            "Parquet error: Failed to parse bit_width for INTEGER type"
692        );
693
694        // Valid types
695        let schema = "
696            message root {
697              optional int32 f1 (INTEGER(8,false));
698              optional int32 f2 (INTEGER(8,true));
699              optional int32 f3 (INTEGER(16,false));
700              optional int32 f4 (INTEGER(16,true));
701              optional int32 f5 (INTEGER(32,false));
702              optional int32 f6 (INTEGER(32,true));
703              optional int64 f7 (INTEGER(64,false));
704              optional int64 f7 (INTEGER(64,true));
705            }
706        ";
707        parse(schema).unwrap();
708    }
709
710    #[test]
711    fn test_parse_message_type_temporal() {
712        // Invalid timestamp syntax
713        let schema = "
714            message root {
715              optional int64 f1 (TIMESTAMP();
716            }
717        ";
718        assert_eq!(
719            parse(schema).unwrap_err().to_string(),
720            "Parquet error: Failed to parse timeunit for TIMESTAMP type"
721        );
722
723        // Invalid timestamp syntax, needs both unit and UTC adjustment
724        let schema = "
725            message root {
726              optional int64 f1 (TIMESTAMP(MILLIS,));
727            }
728        ";
729        assert_eq!(
730            parse(schema).unwrap_err().to_string(),
731            "Parquet error: Failed to parse timezone info for TIMESTAMP type"
732        );
733
734        // Invalid timestamp because of unknown unit
735        let schema = "
736            message root {
737              optional int64 f1 (TIMESTAMP(YOCTOS,));
738            }
739        ";
740
741        assert_eq!(
742            parse(schema).unwrap_err().to_string(),
743            "Parquet error: Failed to parse timeunit for TIMESTAMP type"
744        );
745
746        // Valid types
747        let schema = "
748            message root {
749              optional int32 f1 (DATE);
750              optional int32 f2 (TIME(MILLIS,true));
751              optional int64 f3 (TIME(MICROS,false));
752              optional int64 f4 (TIME(NANOS,true));
753              optional int64 f5 (TIMESTAMP(MILLIS,true));
754              optional int64 f6 (TIMESTAMP(MICROS,true));
755              optional int64 f7 (TIMESTAMP(NANOS,false));
756            }
757        ";
758        parse(schema).unwrap();
759    }
760
761    #[test]
762    fn test_parse_message_type_decimal() {
763        // It is okay for decimal to omit precision and scale with right syntax.
764        // Here we test wrong syntax of decimal type
765
766        // Invalid decimal syntax
767        let schema = "
768            message root {
769              optional int32 f1 (DECIMAL();
770            }
771        ";
772        assert_eq!(
773            parse(schema).unwrap_err().to_string(),
774            "Parquet error: Failed to parse precision for DECIMAL type"
775        );
776
777        // Invalid decimal, need precision and scale
778        let schema = "
779            message root {
780              optional int32 f1 (DECIMAL());
781            }
782        ";
783        assert_eq!(
784            parse(schema).unwrap_err().to_string(),
785            "Parquet error: Failed to parse precision for DECIMAL type"
786        );
787
788        // Invalid decimal because of `,` - has precision, needs scale
789        let schema = "
790            message root {
791              optional int32 f1 (DECIMAL(8,));
792            }
793        ";
794        assert_eq!(
795            parse(schema).unwrap_err().to_string(),
796            "Parquet error: Failed to parse scale for DECIMAL type"
797        );
798
799        // Invalid decimal because, we always require either precision or scale to be
800        // specified as part of converted type
801        let schema = "
802            message root {
803              optional int32 f3 (DECIMAL);
804            }
805        ";
806        assert_eq!(
807            parse(schema).unwrap_err().to_string(),
808            "Parquet error: Expected ')', found token ';'"
809        );
810
811        // Valid decimal (precision, scale)
812        let schema = "
813            message root {
814              optional int32 f1 (DECIMAL(8, 3));
815              optional int32 f2 (DECIMAL(8));
816            }
817        ";
818        parse(schema).unwrap();
819    }
820
821    #[test]
822    fn test_parse_message_type_compare_1() {
823        let schema = "
824            message root {
825              optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3));
826              optional fixed_len_byte_array (16) f2 (DECIMAL (38, 18));
827              optional fixed_len_byte_array (2) f3 (FLOAT16);
828            }
829        ";
830        let message = parse(schema).unwrap();
831
832        let expected = Type::group_type_builder("root")
833            .with_fields(vec![
834                Arc::new(
835                    Type::primitive_type_builder("f1", PhysicalType::FIXED_LEN_BYTE_ARRAY)
836                        .with_logical_type(Some(LogicalType::Decimal {
837                            precision: 9,
838                            scale: 3,
839                        }))
840                        .with_converted_type(ConvertedType::DECIMAL)
841                        .with_length(5)
842                        .with_precision(9)
843                        .with_scale(3)
844                        .build()
845                        .unwrap(),
846                ),
847                Arc::new(
848                    Type::primitive_type_builder("f2", PhysicalType::FIXED_LEN_BYTE_ARRAY)
849                        .with_logical_type(Some(LogicalType::Decimal {
850                            precision: 38,
851                            scale: 18,
852                        }))
853                        .with_converted_type(ConvertedType::DECIMAL)
854                        .with_length(16)
855                        .with_precision(38)
856                        .with_scale(18)
857                        .build()
858                        .unwrap(),
859                ),
860                Arc::new(
861                    Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY)
862                        .with_logical_type(Some(LogicalType::Float16))
863                        .with_length(2)
864                        .build()
865                        .unwrap(),
866                ),
867            ])
868            .build()
869            .unwrap();
870
871        assert_eq!(message, expected);
872    }
873
874    #[test]
875    fn test_parse_message_type_compare_2() {
876        let schema = "
877            message root {
878              required group a0 {
879                optional group a1 (LIST) {
880                  repeated binary a2 (UTF8);
881                }
882
883                optional group b1 (LIST) {
884                  repeated group b2 {
885                    optional int32 b3;
886                    optional double b4;
887                  }
888                }
889              }
890            }
891        ";
892        let message = parse(schema).unwrap();
893
894        let expected = Type::group_type_builder("root")
895            .with_fields(vec![Arc::new(
896                Type::group_type_builder("a0")
897                    .with_repetition(Repetition::REQUIRED)
898                    .with_fields(vec![
899                        Arc::new(
900                            Type::group_type_builder("a1")
901                                .with_repetition(Repetition::OPTIONAL)
902                                .with_logical_type(Some(LogicalType::List))
903                                .with_converted_type(ConvertedType::LIST)
904                                .with_fields(vec![Arc::new(
905                                    Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY)
906                                        .with_repetition(Repetition::REPEATED)
907                                        .with_converted_type(ConvertedType::UTF8)
908                                        .build()
909                                        .unwrap(),
910                                )])
911                                .build()
912                                .unwrap(),
913                        ),
914                        Arc::new(
915                            Type::group_type_builder("b1")
916                                .with_repetition(Repetition::OPTIONAL)
917                                .with_logical_type(Some(LogicalType::List))
918                                .with_converted_type(ConvertedType::LIST)
919                                .with_fields(vec![Arc::new(
920                                    Type::group_type_builder("b2")
921                                        .with_repetition(Repetition::REPEATED)
922                                        .with_fields(vec![
923                                            Arc::new(
924                                                Type::primitive_type_builder(
925                                                    "b3",
926                                                    PhysicalType::INT32,
927                                                )
928                                                .build()
929                                                .unwrap(),
930                                            ),
931                                            Arc::new(
932                                                Type::primitive_type_builder(
933                                                    "b4",
934                                                    PhysicalType::DOUBLE,
935                                                )
936                                                .build()
937                                                .unwrap(),
938                                            ),
939                                        ])
940                                        .build()
941                                        .unwrap(),
942                                )])
943                                .build()
944                                .unwrap(),
945                        ),
946                    ])
947                    .build()
948                    .unwrap(),
949            )])
950            .build()
951            .unwrap();
952
953        assert_eq!(message, expected);
954    }
955
956    #[test]
957    fn test_parse_message_type_compare_3() {
958        let schema = "
959            message root {
960              required int32 _1 (INT_8);
961              required int32 _2 (INT_16);
962              required float _3;
963              required double _4;
964              optional int32 _5 (DATE);
965              optional binary _6 (UTF8);
966            }
967        ";
968        let message = parse(schema).unwrap();
969
970        let fields = vec![
971            Arc::new(
972                Type::primitive_type_builder("_1", PhysicalType::INT32)
973                    .with_repetition(Repetition::REQUIRED)
974                    .with_converted_type(ConvertedType::INT_8)
975                    .build()
976                    .unwrap(),
977            ),
978            Arc::new(
979                Type::primitive_type_builder("_2", PhysicalType::INT32)
980                    .with_repetition(Repetition::REQUIRED)
981                    .with_converted_type(ConvertedType::INT_16)
982                    .build()
983                    .unwrap(),
984            ),
985            Arc::new(
986                Type::primitive_type_builder("_3", PhysicalType::FLOAT)
987                    .with_repetition(Repetition::REQUIRED)
988                    .build()
989                    .unwrap(),
990            ),
991            Arc::new(
992                Type::primitive_type_builder("_4", PhysicalType::DOUBLE)
993                    .with_repetition(Repetition::REQUIRED)
994                    .build()
995                    .unwrap(),
996            ),
997            Arc::new(
998                Type::primitive_type_builder("_5", PhysicalType::INT32)
999                    .with_logical_type(Some(LogicalType::Date))
1000                    .with_converted_type(ConvertedType::DATE)
1001                    .build()
1002                    .unwrap(),
1003            ),
1004            Arc::new(
1005                Type::primitive_type_builder("_6", PhysicalType::BYTE_ARRAY)
1006                    .with_converted_type(ConvertedType::UTF8)
1007                    .build()
1008                    .unwrap(),
1009            ),
1010        ];
1011
1012        let expected = Type::group_type_builder("root")
1013            .with_fields(fields)
1014            .build()
1015            .unwrap();
1016        assert_eq!(message, expected);
1017    }
1018
1019    #[test]
1020    fn test_parse_message_type_compare_4() {
1021        let schema = "
1022            message root {
1023              required int32 _1 (INTEGER(8,true));
1024              required int32 _2 (INTEGER(16,false));
1025              required float _3;
1026              required double _4;
1027              optional int32 _5 (DATE);
1028              optional int32 _6 (TIME(MILLIS,false));
1029              optional int64 _7 (TIME(MICROS,true));
1030              optional int64 _8 (TIMESTAMP(MILLIS,true));
1031              optional int64 _9 (TIMESTAMP(NANOS,false));
1032              optional binary _10 (STRING);
1033            }
1034        ";
1035        let message = parse(schema).unwrap();
1036
1037        let fields = vec![
1038            Arc::new(
1039                Type::primitive_type_builder("_1", PhysicalType::INT32)
1040                    .with_repetition(Repetition::REQUIRED)
1041                    .with_logical_type(Some(LogicalType::Integer {
1042                        bit_width: 8,
1043                        is_signed: true,
1044                    }))
1045                    .build()
1046                    .unwrap(),
1047            ),
1048            Arc::new(
1049                Type::primitive_type_builder("_2", PhysicalType::INT32)
1050                    .with_repetition(Repetition::REQUIRED)
1051                    .with_logical_type(Some(LogicalType::Integer {
1052                        bit_width: 16,
1053                        is_signed: false,
1054                    }))
1055                    .build()
1056                    .unwrap(),
1057            ),
1058            Arc::new(
1059                Type::primitive_type_builder("_3", PhysicalType::FLOAT)
1060                    .with_repetition(Repetition::REQUIRED)
1061                    .build()
1062                    .unwrap(),
1063            ),
1064            Arc::new(
1065                Type::primitive_type_builder("_4", PhysicalType::DOUBLE)
1066                    .with_repetition(Repetition::REQUIRED)
1067                    .build()
1068                    .unwrap(),
1069            ),
1070            Arc::new(
1071                Type::primitive_type_builder("_5", PhysicalType::INT32)
1072                    .with_logical_type(Some(LogicalType::Date))
1073                    .build()
1074                    .unwrap(),
1075            ),
1076            Arc::new(
1077                Type::primitive_type_builder("_6", PhysicalType::INT32)
1078                    .with_logical_type(Some(LogicalType::Time {
1079                        unit: TimeUnit::MILLIS,
1080                        is_adjusted_to_u_t_c: false,
1081                    }))
1082                    .build()
1083                    .unwrap(),
1084            ),
1085            Arc::new(
1086                Type::primitive_type_builder("_7", PhysicalType::INT64)
1087                    .with_logical_type(Some(LogicalType::Time {
1088                        unit: TimeUnit::MICROS,
1089                        is_adjusted_to_u_t_c: true,
1090                    }))
1091                    .build()
1092                    .unwrap(),
1093            ),
1094            Arc::new(
1095                Type::primitive_type_builder("_8", PhysicalType::INT64)
1096                    .with_logical_type(Some(LogicalType::Timestamp {
1097                        unit: TimeUnit::MILLIS,
1098                        is_adjusted_to_u_t_c: true,
1099                    }))
1100                    .build()
1101                    .unwrap(),
1102            ),
1103            Arc::new(
1104                Type::primitive_type_builder("_9", PhysicalType::INT64)
1105                    .with_logical_type(Some(LogicalType::Timestamp {
1106                        unit: TimeUnit::NANOS,
1107                        is_adjusted_to_u_t_c: false,
1108                    }))
1109                    .build()
1110                    .unwrap(),
1111            ),
1112            Arc::new(
1113                Type::primitive_type_builder("_10", PhysicalType::BYTE_ARRAY)
1114                    .with_logical_type(Some(LogicalType::String))
1115                    .build()
1116                    .unwrap(),
1117            ),
1118        ];
1119
1120        let expected = Type::group_type_builder("root")
1121            .with_fields(fields)
1122            .build()
1123            .unwrap();
1124        assert_eq!(message, expected);
1125    }
1126}