Skip to main content

parquet/schema/
parser.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet schema parser.
19//! Provides methods to parse and validate string message type into Parquet
20//! [`Type`].
21//!
22//! # Example
23//!
24//! ```rust
25//! use parquet::schema::parser::parse_message_type;
26//!
27//! let message_type = "
28//!   message spark_schema {
29//!     OPTIONAL BYTE_ARRAY a (UTF8);
30//!     REQUIRED INT32 b;
31//!     REQUIRED DOUBLE c;
32//!     REQUIRED BOOLEAN d;
33//!     OPTIONAL group e (LIST) {
34//!       REPEATED group list {
35//!         REQUIRED INT32 element;
36//!       }
37//!     }
38//!   }
39//! ";
40//!
41//! let schema = parse_message_type(message_type).expect("Expected valid schema");
42//! println!("{:?}", schema);
43//! ```
44
45use std::sync::Arc;
46
47use crate::basic::{ConvertedType, LogicalType, Repetition, TimeUnit, Type as PhysicalType};
48use crate::errors::{ParquetError, Result};
49use crate::schema::types::{Type, TypePtr};
50
51/// Parses message type as string into a Parquet [`Type`]
52/// which, for example, could be used to extract individual columns. Returns Parquet
53/// general error when parsing or validation fails.
54pub fn parse_message_type(message_type: &str) -> Result<Type> {
55    let mut parser = Parser {
56        tokenizer: &mut Tokenizer::from_str(message_type),
57    };
58    parser.parse_message_type()
59}
60
/// Splits a message type string into tokens.
///
/// Tokens are separated by whitespace and by the characters recognised by
/// `is_schema_delim`; the delimiter characters themselves are kept as tokens.
/// Tokens are consumed through the `Iterator` interface, and `backtrack` allows
/// stepping back so that a token can be processed again.
struct Tokenizer<'a> {
    // Every token of the input string, in order of appearance
    tokens: Vec<&'a str>,
    // Position in `tokens` of the token that `next` will return
    index: usize,
}
71
72impl<'a> Tokenizer<'a> {
73    // Create tokenizer from message type string
74    pub fn from_str(string: &'a str) -> Self {
75        let vec = string
76            .split_whitespace()
77            .flat_map(Self::split_token)
78            .collect();
79        Tokenizer {
80            tokens: vec,
81            index: 0,
82        }
83    }
84
85    // List of all special characters in schema
86    fn is_schema_delim(c: char) -> bool {
87        c == ';' || c == '{' || c == '}' || c == '(' || c == ')' || c == '=' || c == ','
88    }
89
90    /// Splits string into tokens; input string can already be token or can contain
91    /// delimiters, e.g. required" -> Vec("required") and
92    /// "(UTF8);" -> Vec("(", "UTF8", ")", ";")
93    fn split_token(string: &str) -> Vec<&str> {
94        let mut buffer: Vec<&str> = Vec::new();
95        let mut tail = string;
96        while let Some(index) = tail.find(Self::is_schema_delim) {
97            let (h, t) = tail.split_at(index);
98            if !h.is_empty() {
99                buffer.push(h);
100            }
101            buffer.push(&t[0..1]);
102            tail = &t[1..];
103        }
104        if !tail.is_empty() {
105            buffer.push(tail);
106        }
107        buffer
108    }
109
110    // Move pointer to a previous element
111    fn backtrack(&mut self) {
112        self.index -= 1;
113    }
114}
115
116impl<'a> Iterator for Tokenizer<'a> {
117    type Item = &'a str;
118
119    fn next(&mut self) -> Option<&'a str> {
120        if self.index < self.tokens.len() {
121            self.index += 1;
122            Some(self.tokens[self.index - 1])
123        } else {
124            None
125        }
126    }
127}
128
129/// Internal Schema parser.
130/// Traverses message type using tokenizer and parses each group/primitive type
131/// recursively.
132struct Parser<'a> {
133    tokenizer: &'a mut Tokenizer<'a>,
134}
135
136// Utility function to assert token on validity.
137fn assert_token(token: Option<&str>, expected: &str) -> Result<()> {
138    match token {
139        Some(value) if value == expected => Ok(()),
140        Some(other) => Err(general_err!(
141            "Expected '{}', found token '{}'",
142            expected,
143            other
144        )),
145        None => Err(general_err!(
146            "Expected '{}', but no token found (None)",
147            expected
148        )),
149    }
150}
151
152// Utility function to parse i32 or return general error.
153#[inline]
154fn parse_i32(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result<i32> {
155    value
156        .ok_or_else(|| general_err!(not_found_msg))
157        .and_then(|v| v.parse::<i32>().map_err(|_| general_err!(parse_fail_msg)))
158}
159
160// Utility function to parse boolean or return general error.
161#[inline]
162fn parse_bool(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result<bool> {
163    value
164        .ok_or_else(|| general_err!(not_found_msg))
165        .and_then(|v| {
166            v.to_lowercase()
167                .parse::<bool>()
168                .map_err(|_| general_err!(parse_fail_msg))
169        })
170}
171
172// Utility function to parse TimeUnit or return general error.
173fn parse_timeunit(
174    value: Option<&str>,
175    not_found_msg: &str,
176    parse_fail_msg: &str,
177) -> Result<TimeUnit> {
178    value
179        .ok_or_else(|| general_err!(not_found_msg))
180        .and_then(|v| match v.to_uppercase().as_str() {
181            "MILLIS" => Ok(TimeUnit::MILLIS),
182            "MICROS" => Ok(TimeUnit::MICROS),
183            "NANOS" => Ok(TimeUnit::NANOS),
184            _ => Err(general_err!(parse_fail_msg)),
185        })
186}
187
188impl Parser<'_> {
189    // Entry function to parse message type, uses internal tokenizer.
190    fn parse_message_type(&mut self) -> Result<Type> {
191        // Check that message type starts with "message".
192        match self.tokenizer.next() {
193            Some("message") => {
194                let name = self
195                    .tokenizer
196                    .next()
197                    .ok_or_else(|| general_err!("Expected name, found None"))?;
198                Type::group_type_builder(name)
199                    .with_fields(self.parse_child_types()?)
200                    .build()
201            }
202            _ => Err(general_err!("Message type does not start with 'message'")),
203        }
204    }
205
206    // Parses child types for a current group type.
207    // This is only invoked on root and group types.
208    fn parse_child_types(&mut self) -> Result<Vec<TypePtr>> {
209        assert_token(self.tokenizer.next(), "{")?;
210        let mut vec = Vec::new();
211        while let Some(value) = self.tokenizer.next() {
212            if value == "}" {
213                break;
214            } else {
215                self.tokenizer.backtrack();
216                vec.push(Arc::new(self.add_type()?));
217            }
218        }
219        Ok(vec)
220    }
221
222    fn add_type(&mut self) -> Result<Type> {
223        // Parse repetition
224        let repetition = self
225            .tokenizer
226            .next()
227            .ok_or_else(|| general_err!("Expected repetition, found None"))
228            .and_then(|v| v.to_uppercase().parse::<Repetition>())?;
229
230        match self.tokenizer.next() {
231            Some(group) if group.to_uppercase() == "GROUP" => self.add_group_type(Some(repetition)),
232            Some(type_string) => {
233                let physical_type = type_string.to_uppercase().parse::<PhysicalType>()?;
234                self.add_primitive_type(repetition, physical_type)
235            }
236            None => Err(general_err!("Invalid type, could not extract next token")),
237        }
238    }
239
240    fn add_group_type(&mut self, repetition: Option<Repetition>) -> Result<Type> {
241        // Parse name of the group type
242        let name = self
243            .tokenizer
244            .next()
245            .ok_or_else(|| general_err!("Expected name, found None"))?;
246
247        // Parse logical or converted type if exists
248        let (logical_type, converted_type) = if let Some("(") = self.tokenizer.next() {
249            let tpe = self
250                .tokenizer
251                .next()
252                .ok_or_else(|| general_err!("Expected converted type, found None"))
253                .and_then(|v| {
254                    // Try logical type first
255                    let upper = v.to_uppercase();
256                    let logical = upper.parse::<LogicalType>();
257                    match logical {
258                        Ok(logical) => {
259                            Ok((Some(logical.clone()), ConvertedType::from(Some(logical))))
260                        }
261                        Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
262                    }
263                })?;
264            assert_token(self.tokenizer.next(), ")")?;
265            tpe
266        } else {
267            self.tokenizer.backtrack();
268            (None, ConvertedType::NONE)
269        };
270
271        // Parse optional id
272        let id = if let Some("=") = self.tokenizer.next() {
273            self.tokenizer.next().and_then(|v| v.parse::<i32>().ok())
274        } else {
275            self.tokenizer.backtrack();
276            None
277        };
278
279        let mut builder = Type::group_type_builder(name)
280            .with_logical_type(logical_type)
281            .with_converted_type(converted_type)
282            .with_fields(self.parse_child_types()?)
283            .with_id(id);
284        if let Some(rep) = repetition {
285            builder = builder.with_repetition(rep);
286        }
287        builder.build()
288    }
289
290    fn add_primitive_type(
291        &mut self,
292        repetition: Repetition,
293        physical_type: PhysicalType,
294    ) -> Result<Type> {
295        // Read type length if the type is FIXED_LEN_BYTE_ARRAY.
296        let mut length: i32 = -1;
297        if physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY {
298            assert_token(self.tokenizer.next(), "(")?;
299            length = parse_i32(
300                self.tokenizer.next(),
301                "Expected length for FIXED_LEN_BYTE_ARRAY, found None",
302                "Failed to parse length for FIXED_LEN_BYTE_ARRAY",
303            )?;
304            assert_token(self.tokenizer.next(), ")")?;
305        }
306
307        // Parse name of the primitive type
308        let name = self
309            .tokenizer
310            .next()
311            .ok_or_else(|| general_err!("Expected name, found None"))?;
312
313        // Parse converted type
314        let (logical_type, converted_type, precision, scale) = if let Some("(") =
315            self.tokenizer.next()
316        {
317            let (mut logical, mut converted) = self
318                .tokenizer
319                .next()
320                .ok_or_else(|| general_err!("Expected logical or converted type, found None"))
321                .and_then(|v| {
322                    let upper = v.to_uppercase();
323                    let logical = upper.parse::<LogicalType>();
324                    match logical {
325                        Ok(logical) => {
326                            Ok((Some(logical.clone()), ConvertedType::from(Some(logical))))
327                        }
328                        Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
329                    }
330                })?;
331
332            // Parse precision and scale for decimals
333            let mut precision: i32 = -1;
334            let mut scale: i32 = -1;
335
336            // Parse the concrete logical type
337            if let Some(tpe) = &logical {
338                match tpe {
339                    LogicalType::Decimal { .. } => {
340                        if let Some("(") = self.tokenizer.next() {
341                            precision = parse_i32(
342                                self.tokenizer.next(),
343                                "Expected precision, found None",
344                                "Failed to parse precision for DECIMAL type",
345                            )?;
346                            if let Some(",") = self.tokenizer.next() {
347                                scale = parse_i32(
348                                    self.tokenizer.next(),
349                                    "Expected scale, found None",
350                                    "Failed to parse scale for DECIMAL type",
351                                )?;
352                                assert_token(self.tokenizer.next(), ")")?;
353                            } else {
354                                scale = 0
355                            }
356                            logical = Some(LogicalType::decimal(scale, precision));
357                            converted = ConvertedType::from(logical.clone());
358                        }
359                    }
360                    LogicalType::Time { .. } => {
361                        if let Some("(") = self.tokenizer.next() {
362                            let unit = parse_timeunit(
363                                self.tokenizer.next(),
364                                "Invalid timeunit found",
365                                "Failed to parse timeunit for TIME type",
366                            )?;
367                            if let Some(",") = self.tokenizer.next() {
368                                let is_adjusted_to_u_t_c = parse_bool(
369                                    self.tokenizer.next(),
370                                    "Invalid boolean found",
371                                    "Failed to parse timezone info for TIME type",
372                                )?;
373                                assert_token(self.tokenizer.next(), ")")?;
374                                logical = Some(LogicalType::time(is_adjusted_to_u_t_c, unit));
375                                converted = ConvertedType::from(logical.clone());
376                            } else {
377                                // Invalid token for unit
378                                self.tokenizer.backtrack();
379                            }
380                        }
381                    }
382                    LogicalType::Timestamp { .. } => {
383                        if let Some("(") = self.tokenizer.next() {
384                            let unit = parse_timeunit(
385                                self.tokenizer.next(),
386                                "Invalid timeunit found",
387                                "Failed to parse timeunit for TIMESTAMP type",
388                            )?;
389                            if let Some(",") = self.tokenizer.next() {
390                                let is_adjusted_to_u_t_c = parse_bool(
391                                    self.tokenizer.next(),
392                                    "Invalid boolean found",
393                                    "Failed to parse timezone info for TIMESTAMP type",
394                                )?;
395                                assert_token(self.tokenizer.next(), ")")?;
396                                logical = Some(LogicalType::timestamp(is_adjusted_to_u_t_c, unit));
397                                converted = ConvertedType::from(logical.clone());
398                            } else {
399                                // Invalid token for unit
400                                self.tokenizer.backtrack();
401                            }
402                        }
403                    }
404                    LogicalType::Integer { .. } => {
405                        if let Some("(") = self.tokenizer.next() {
406                            let bit_width = parse_i32(
407                                self.tokenizer.next(),
408                                "Invalid bit_width found",
409                                "Failed to parse bit_width for INTEGER type",
410                            )? as i8;
411                            match physical_type {
412                                PhysicalType::INT32 => match bit_width {
413                                    8 | 16 | 32 => {}
414                                    _ => {
415                                        return Err(general_err!(
416                                            "Incorrect bit width {} for INT32",
417                                            bit_width
418                                        ));
419                                    }
420                                },
421                                PhysicalType::INT64 => {
422                                    if bit_width != 64 {
423                                        return Err(general_err!(
424                                            "Incorrect bit width {} for INT64",
425                                            bit_width
426                                        ));
427                                    }
428                                }
429                                _ => {
430                                    return Err(general_err!(
431                                        "Logical type Integer cannot be used with physical type {}",
432                                        physical_type
433                                    ));
434                                }
435                            }
436                            if let Some(",") = self.tokenizer.next() {
437                                let is_signed = parse_bool(
438                                    self.tokenizer.next(),
439                                    "Invalid boolean found",
440                                    "Failed to parse is_signed for INTEGER type",
441                                )?;
442                                assert_token(self.tokenizer.next(), ")")?;
443                                logical = Some(LogicalType::integer(bit_width, is_signed));
444                                converted = ConvertedType::from(logical.clone());
445                            } else {
446                                // Invalid token for unit
447                                self.tokenizer.backtrack();
448                            }
449                        }
450                    }
451                    _ => {}
452                }
453            } else if converted == ConvertedType::DECIMAL {
454                if let Some("(") = self.tokenizer.next() {
455                    // Parse precision
456                    precision = parse_i32(
457                        self.tokenizer.next(),
458                        "Expected precision, found None",
459                        "Failed to parse precision for DECIMAL type",
460                    )?;
461
462                    // Parse scale
463                    scale = if let Some(",") = self.tokenizer.next() {
464                        parse_i32(
465                            self.tokenizer.next(),
466                            "Expected scale, found None",
467                            "Failed to parse scale for DECIMAL type",
468                        )?
469                    } else {
470                        // Scale is not provided, set it to 0.
471                        self.tokenizer.backtrack();
472                        0
473                    };
474
475                    assert_token(self.tokenizer.next(), ")")?;
476                } else {
477                    self.tokenizer.backtrack();
478                }
479            }
480
481            assert_token(self.tokenizer.next(), ")")?;
482            (logical, converted, precision, scale)
483        } else {
484            self.tokenizer.backtrack();
485            (None, ConvertedType::NONE, -1, -1)
486        };
487
488        // Parse optional id
489        let id = if let Some("=") = self.tokenizer.next() {
490            self.tokenizer.next().and_then(|v| v.parse::<i32>().ok())
491        } else {
492            self.tokenizer.backtrack();
493            None
494        };
495        assert_token(self.tokenizer.next(), ";")?;
496
497        Type::primitive_type_builder(name, physical_type)
498            .with_repetition(repetition)
499            .with_logical_type(logical_type)
500            .with_converted_type(converted_type)
501            .with_length(length)
502            .with_precision(precision)
503            .with_scale(scale)
504            .with_id(id)
505            .build()
506    }
507}
508
509#[cfg(test)]
510mod tests {
511    use super::*;
512
513    #[test]
514    fn test_tokenize_empty_string() {
515        assert_eq!(Tokenizer::from_str("").next(), None);
516    }
517
518    #[test]
519    fn test_tokenize_delimiters() {
520        let mut iter = Tokenizer::from_str(",;{}()=");
521        assert_eq!(iter.next(), Some(","));
522        assert_eq!(iter.next(), Some(";"));
523        assert_eq!(iter.next(), Some("{"));
524        assert_eq!(iter.next(), Some("}"));
525        assert_eq!(iter.next(), Some("("));
526        assert_eq!(iter.next(), Some(")"));
527        assert_eq!(iter.next(), Some("="));
528        assert_eq!(iter.next(), None);
529    }
530
531    #[test]
532    fn test_tokenize_delimiters_with_whitespaces() {
533        let mut iter = Tokenizer::from_str(" , ; { } ( ) = ");
534        assert_eq!(iter.next(), Some(","));
535        assert_eq!(iter.next(), Some(";"));
536        assert_eq!(iter.next(), Some("{"));
537        assert_eq!(iter.next(), Some("}"));
538        assert_eq!(iter.next(), Some("("));
539        assert_eq!(iter.next(), Some(")"));
540        assert_eq!(iter.next(), Some("="));
541        assert_eq!(iter.next(), None);
542    }
543
544    #[test]
545    fn test_tokenize_words() {
546        let mut iter = Tokenizer::from_str("abc def ghi jkl mno");
547        assert_eq!(iter.next(), Some("abc"));
548        assert_eq!(iter.next(), Some("def"));
549        assert_eq!(iter.next(), Some("ghi"));
550        assert_eq!(iter.next(), Some("jkl"));
551        assert_eq!(iter.next(), Some("mno"));
552        assert_eq!(iter.next(), None);
553    }
554
555    #[test]
556    fn test_tokenize_backtrack() {
557        let mut iter = Tokenizer::from_str("abc;");
558        assert_eq!(iter.next(), Some("abc"));
559        assert_eq!(iter.next(), Some(";"));
560        iter.backtrack();
561        assert_eq!(iter.next(), Some(";"));
562        assert_eq!(iter.next(), None);
563    }
564
565    #[test]
566    fn test_tokenize_message_type() {
567        let schema = "
568    message schema {
569      required int32 a;
570      optional binary c (UTF8);
571      required group d {
572        required int32 a;
573        optional binary c (UTF8);
574      }
575      required group e (LIST) {
576        repeated group list {
577          required int32 element;
578        }
579      }
580    }
581    ";
582        let iter = Tokenizer::from_str(schema);
583        let mut res = Vec::new();
584        for token in iter {
585            res.push(token);
586        }
587        assert_eq!(
588            res,
589            vec![
590                "message", "schema", "{", "required", "int32", "a", ";", "optional", "binary", "c",
591                "(", "UTF8", ")", ";", "required", "group", "d", "{", "required", "int32", "a",
592                ";", "optional", "binary", "c", "(", "UTF8", ")", ";", "}", "required", "group",
593                "e", "(", "LIST", ")", "{", "repeated", "group", "list", "{", "required", "int32",
594                "element", ";", "}", "}", "}"
595            ]
596        );
597    }
598
599    #[test]
600    fn test_assert_token() {
601        assert!(assert_token(Some("a"), "a").is_ok());
602        assert!(assert_token(Some("a"), "b").is_err());
603        assert!(assert_token(None, "b").is_err());
604    }
605
606    fn parse(schema: &str) -> Result<Type, ParquetError> {
607        let mut iter = Tokenizer::from_str(schema);
608        Parser {
609            tokenizer: &mut iter,
610        }
611        .parse_message_type()
612    }
613
614    #[test]
615    fn test_parse_message_type_invalid() {
616        assert_eq!(
617            parse("test").unwrap_err().to_string(),
618            "Parquet error: Message type does not start with 'message'"
619        );
620    }
621
622    #[test]
623    fn test_parse_message_type_no_name() {
624        assert_eq!(
625            parse("message").unwrap_err().to_string(),
626            "Parquet error: Expected name, found None"
627        );
628    }
629
630    #[test]
631    fn test_parse_message_type_fixed_byte_array() {
632        let schema = "
633            message schema {
634              REQUIRED FIXED_LEN_BYTE_ARRAY col;
635            }
636        ";
637        assert_eq!(
638            parse(schema).unwrap_err().to_string(),
639            "Parquet error: Expected '(', found token 'col'"
640        );
641
642        let schema = "
643            message schema {
644              REQUIRED FIXED_LEN_BYTE_ARRAY(16) col;
645            }
646        ";
647        parse(schema).unwrap();
648    }
649
650    #[test]
651    fn test_parse_message_type_integer() {
652        // Invalid integer syntax
653        let schema = "
654            message root {
655              optional int64 f1 (INTEGER());
656            }
657        ";
658        assert_eq!(
659            parse(schema).unwrap_err().to_string(),
660            "Parquet error: Failed to parse bit_width for INTEGER type"
661        );
662
663        // Invalid integer syntax, needs both bit-width and UTC sign
664        let schema = "
665    message root {
666      optional int64 f1 (INTEGER(32,));
667    }
668    ";
669        assert_eq!(
670            parse(schema).unwrap_err().to_string(),
671            "Parquet error: Incorrect bit width 32 for INT64"
672        );
673
674        // Invalid integer because of non-numeric bit width
675        let schema = "
676            message root {
677              optional int32 f1 (INTEGER(eight,true));
678            }
679        ";
680        assert_eq!(
681            parse(schema).unwrap_err().to_string(),
682            "Parquet error: Failed to parse bit_width for INTEGER type"
683        );
684
685        // Valid types
686        let schema = "
687            message root {
688              optional int32 f1 (INTEGER(8,false));
689              optional int32 f2 (INTEGER(8,true));
690              optional int32 f3 (INTEGER(16,false));
691              optional int32 f4 (INTEGER(16,true));
692              optional int32 f5 (INTEGER(32,false));
693              optional int32 f6 (INTEGER(32,true));
694              optional int64 f7 (INTEGER(64,false));
695              optional int64 f7 (INTEGER(64,true));
696            }
697        ";
698        parse(schema).unwrap();
699    }
700
701    #[test]
702    fn test_parse_message_type_temporal() {
703        // Invalid timestamp syntax
704        let schema = "
705            message root {
706              optional int64 f1 (TIMESTAMP();
707            }
708        ";
709        assert_eq!(
710            parse(schema).unwrap_err().to_string(),
711            "Parquet error: Failed to parse timeunit for TIMESTAMP type"
712        );
713
714        // Invalid timestamp syntax, needs both unit and UTC adjustment
715        let schema = "
716            message root {
717              optional int64 f1 (TIMESTAMP(MILLIS,));
718            }
719        ";
720        assert_eq!(
721            parse(schema).unwrap_err().to_string(),
722            "Parquet error: Failed to parse timezone info for TIMESTAMP type"
723        );
724
725        // Invalid timestamp because of unknown unit
726        let schema = "
727            message root {
728              optional int64 f1 (TIMESTAMP(YOCTOS,));
729            }
730        ";
731
732        assert_eq!(
733            parse(schema).unwrap_err().to_string(),
734            "Parquet error: Failed to parse timeunit for TIMESTAMP type"
735        );
736
737        // Valid types
738        let schema = "
739            message root {
740              optional int32 f1 (DATE);
741              optional int32 f2 (TIME(MILLIS,true));
742              optional int64 f3 (TIME(MICROS,false));
743              optional int64 f4 (TIME(NANOS,true));
744              optional int64 f5 (TIMESTAMP(MILLIS,true));
745              optional int64 f6 (TIMESTAMP(MICROS,true));
746              optional int64 f7 (TIMESTAMP(NANOS,false));
747            }
748        ";
749        parse(schema).unwrap();
750    }
751
752    #[test]
753    fn test_parse_message_type_decimal() {
754        // It is okay for decimal to omit precision and scale with right syntax.
755        // Here we test wrong syntax of decimal type
756
757        // Invalid decimal syntax
758        let schema = "
759            message root {
760              optional int32 f1 (DECIMAL();
761            }
762        ";
763        assert_eq!(
764            parse(schema).unwrap_err().to_string(),
765            "Parquet error: Failed to parse precision for DECIMAL type"
766        );
767
768        // Invalid decimal, need precision and scale
769        let schema = "
770            message root {
771              optional int32 f1 (DECIMAL());
772            }
773        ";
774        assert_eq!(
775            parse(schema).unwrap_err().to_string(),
776            "Parquet error: Failed to parse precision for DECIMAL type"
777        );
778
779        // Invalid decimal because of `,` - has precision, needs scale
780        let schema = "
781            message root {
782              optional int32 f1 (DECIMAL(8,));
783            }
784        ";
785        assert_eq!(
786            parse(schema).unwrap_err().to_string(),
787            "Parquet error: Failed to parse scale for DECIMAL type"
788        );
789
790        // Invalid decimal because, we always require either precision or scale to be
791        // specified as part of converted type
792        let schema = "
793            message root {
794              optional int32 f3 (DECIMAL);
795            }
796        ";
797        assert_eq!(
798            parse(schema).unwrap_err().to_string(),
799            "Parquet error: Expected ')', found token ';'"
800        );
801
802        // Valid decimal (precision, scale)
803        let schema = "
804            message root {
805              optional int32 f1 (DECIMAL(8, 3));
806              optional int32 f2 (DECIMAL(8));
807            }
808        ";
809        parse(schema).unwrap();
810    }
811
812    #[test]
813    fn test_parse_message_type_compare_1() {
814        let schema = "
815            message root {
816              optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3));
817              optional fixed_len_byte_array (16) f2 (DECIMAL (38, 18));
818              optional fixed_len_byte_array (2) f3 (FLOAT16);
819            }
820        ";
821        let message = parse(schema).unwrap();
822
823        let expected = Type::group_type_builder("root")
824            .with_fields(vec![
825                Arc::new(
826                    Type::primitive_type_builder("f1", PhysicalType::FIXED_LEN_BYTE_ARRAY)
827                        .with_logical_type(Some(LogicalType::decimal(3, 9)))
828                        .with_converted_type(ConvertedType::DECIMAL)
829                        .with_length(5)
830                        .with_precision(9)
831                        .with_scale(3)
832                        .build()
833                        .unwrap(),
834                ),
835                Arc::new(
836                    Type::primitive_type_builder("f2", PhysicalType::FIXED_LEN_BYTE_ARRAY)
837                        .with_logical_type(Some(LogicalType::decimal(18, 38)))
838                        .with_converted_type(ConvertedType::DECIMAL)
839                        .with_length(16)
840                        .with_precision(38)
841                        .with_scale(18)
842                        .build()
843                        .unwrap(),
844                ),
845                Arc::new(
846                    Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY)
847                        .with_logical_type(Some(LogicalType::Float16))
848                        .with_length(2)
849                        .build()
850                        .unwrap(),
851                ),
852            ])
853            .build()
854            .unwrap();
855
856        assert_eq!(message, expected);
857    }
858
859    #[test]
860    fn test_parse_message_type_compare_2() {
861        let schema = "
862            message root {
863              required group a0 {
864                optional group a1 (LIST) {
865                  repeated binary a2 (UTF8);
866                }
867
868                optional group b1 (LIST) {
869                  repeated group b2 {
870                    optional int32 b3;
871                    optional double b4;
872                  }
873                }
874              }
875            }
876        ";
877        let message = parse(schema).unwrap();
878
879        let expected = Type::group_type_builder("root")
880            .with_fields(vec![Arc::new(
881                Type::group_type_builder("a0")
882                    .with_repetition(Repetition::REQUIRED)
883                    .with_fields(vec![
884                        Arc::new(
885                            Type::group_type_builder("a1")
886                                .with_repetition(Repetition::OPTIONAL)
887                                .with_logical_type(Some(LogicalType::List))
888                                .with_converted_type(ConvertedType::LIST)
889                                .with_fields(vec![Arc::new(
890                                    Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY)
891                                        .with_repetition(Repetition::REPEATED)
892                                        .with_converted_type(ConvertedType::UTF8)
893                                        .build()
894                                        .unwrap(),
895                                )])
896                                .build()
897                                .unwrap(),
898                        ),
899                        Arc::new(
900                            Type::group_type_builder("b1")
901                                .with_repetition(Repetition::OPTIONAL)
902                                .with_logical_type(Some(LogicalType::List))
903                                .with_converted_type(ConvertedType::LIST)
904                                .with_fields(vec![Arc::new(
905                                    Type::group_type_builder("b2")
906                                        .with_repetition(Repetition::REPEATED)
907                                        .with_fields(vec![
908                                            Arc::new(
909                                                Type::primitive_type_builder(
910                                                    "b3",
911                                                    PhysicalType::INT32,
912                                                )
913                                                .build()
914                                                .unwrap(),
915                                            ),
916                                            Arc::new(
917                                                Type::primitive_type_builder(
918                                                    "b4",
919                                                    PhysicalType::DOUBLE,
920                                                )
921                                                .build()
922                                                .unwrap(),
923                                            ),
924                                        ])
925                                        .build()
926                                        .unwrap(),
927                                )])
928                                .build()
929                                .unwrap(),
930                        ),
931                    ])
932                    .build()
933                    .unwrap(),
934            )])
935            .build()
936            .unwrap();
937
938        assert_eq!(message, expected);
939    }
940
941    #[test]
942    fn test_parse_message_type_compare_3() {
943        let schema = "
944            message root {
945              required int32 _1 (INT_8);
946              required int32 _2 (INT_16);
947              required float _3;
948              required double _4;
949              optional int32 _5 (DATE);
950              optional binary _6 (UTF8);
951            }
952        ";
953        let message = parse(schema).unwrap();
954
955        let fields = vec![
956            Arc::new(
957                Type::primitive_type_builder("_1", PhysicalType::INT32)
958                    .with_repetition(Repetition::REQUIRED)
959                    .with_converted_type(ConvertedType::INT_8)
960                    .build()
961                    .unwrap(),
962            ),
963            Arc::new(
964                Type::primitive_type_builder("_2", PhysicalType::INT32)
965                    .with_repetition(Repetition::REQUIRED)
966                    .with_converted_type(ConvertedType::INT_16)
967                    .build()
968                    .unwrap(),
969            ),
970            Arc::new(
971                Type::primitive_type_builder("_3", PhysicalType::FLOAT)
972                    .with_repetition(Repetition::REQUIRED)
973                    .build()
974                    .unwrap(),
975            ),
976            Arc::new(
977                Type::primitive_type_builder("_4", PhysicalType::DOUBLE)
978                    .with_repetition(Repetition::REQUIRED)
979                    .build()
980                    .unwrap(),
981            ),
982            Arc::new(
983                Type::primitive_type_builder("_5", PhysicalType::INT32)
984                    .with_logical_type(Some(LogicalType::Date))
985                    .with_converted_type(ConvertedType::DATE)
986                    .build()
987                    .unwrap(),
988            ),
989            Arc::new(
990                Type::primitive_type_builder("_6", PhysicalType::BYTE_ARRAY)
991                    .with_converted_type(ConvertedType::UTF8)
992                    .build()
993                    .unwrap(),
994            ),
995        ];
996
997        let expected = Type::group_type_builder("root")
998            .with_fields(fields)
999            .build()
1000            .unwrap();
1001        assert_eq!(message, expected);
1002    }
1003
1004    #[test]
1005    fn test_parse_message_type_compare_4() {
1006        let schema = "
1007            message root {
1008              required int32 _1 (INTEGER(8,true));
1009              required int32 _2 (INTEGER(16,false));
1010              required float _3;
1011              required double _4;
1012              optional int32 _5 (DATE);
1013              optional int32 _6 (TIME(MILLIS,false));
1014              optional int64 _7 (TIME(MICROS,true));
1015              optional int64 _8 (TIMESTAMP(MILLIS,true));
1016              optional int64 _9 (TIMESTAMP(NANOS,false));
1017              optional binary _10 (STRING);
1018            }
1019        ";
1020        let message = parse(schema).unwrap();
1021
1022        let fields = vec![
1023            Arc::new(
1024                Type::primitive_type_builder("_1", PhysicalType::INT32)
1025                    .with_repetition(Repetition::REQUIRED)
1026                    .with_logical_type(Some(LogicalType::integer(8, true)))
1027                    .build()
1028                    .unwrap(),
1029            ),
1030            Arc::new(
1031                Type::primitive_type_builder("_2", PhysicalType::INT32)
1032                    .with_repetition(Repetition::REQUIRED)
1033                    .with_logical_type(Some(LogicalType::integer(16, false)))
1034                    .build()
1035                    .unwrap(),
1036            ),
1037            Arc::new(
1038                Type::primitive_type_builder("_3", PhysicalType::FLOAT)
1039                    .with_repetition(Repetition::REQUIRED)
1040                    .build()
1041                    .unwrap(),
1042            ),
1043            Arc::new(
1044                Type::primitive_type_builder("_4", PhysicalType::DOUBLE)
1045                    .with_repetition(Repetition::REQUIRED)
1046                    .build()
1047                    .unwrap(),
1048            ),
1049            Arc::new(
1050                Type::primitive_type_builder("_5", PhysicalType::INT32)
1051                    .with_logical_type(Some(LogicalType::Date))
1052                    .build()
1053                    .unwrap(),
1054            ),
1055            Arc::new(
1056                Type::primitive_type_builder("_6", PhysicalType::INT32)
1057                    .with_logical_type(Some(LogicalType::time(false, TimeUnit::MILLIS)))
1058                    .build()
1059                    .unwrap(),
1060            ),
1061            Arc::new(
1062                Type::primitive_type_builder("_7", PhysicalType::INT64)
1063                    .with_logical_type(Some(LogicalType::time(true, TimeUnit::MICROS)))
1064                    .build()
1065                    .unwrap(),
1066            ),
1067            Arc::new(
1068                Type::primitive_type_builder("_8", PhysicalType::INT64)
1069                    .with_logical_type(Some(LogicalType::timestamp(true, TimeUnit::MILLIS)))
1070                    .build()
1071                    .unwrap(),
1072            ),
1073            Arc::new(
1074                Type::primitive_type_builder("_9", PhysicalType::INT64)
1075                    .with_logical_type(Some(LogicalType::timestamp(false, TimeUnit::NANOS)))
1076                    .build()
1077                    .unwrap(),
1078            ),
1079            Arc::new(
1080                Type::primitive_type_builder("_10", PhysicalType::BYTE_ARRAY)
1081                    .with_logical_type(Some(LogicalType::String))
1082                    .build()
1083                    .unwrap(),
1084            ),
1085        ];
1086
1087        let expected = Type::group_type_builder("root")
1088            .with_fields(fields)
1089            .build()
1090            .unwrap();
1091        assert_eq!(message, expected);
1092    }
1093}