arrow_integration_test/
schema.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::{field_from_json, field_to_json};
19use arrow::datatypes::{Fields, Schema};
20use arrow::error::{ArrowError, Result};
21use std::collections::HashMap;
22
23/// Generate a JSON representation of the `Schema`.
24pub fn schema_to_json(schema: &Schema) -> serde_json::Value {
25    serde_json::json!({
26        "fields": schema.fields().iter().map(|f| field_to_json(f.as_ref())).collect::<Vec<_>>(),
27        "metadata": serde_json::to_value(schema.metadata()).unwrap()
28    })
29}
30
31/// Parse a `Schema` definition from a JSON representation.
32pub fn schema_from_json(json: &serde_json::Value) -> Result<Schema> {
33    use serde_json::Value;
34    match *json {
35        Value::Object(ref schema) => {
36            let fields: Fields = match schema.get("fields") {
37                Some(Value::Array(fields)) => {
38                    fields.iter().map(field_from_json).collect::<Result<_>>()?
39                }
40                _ => {
41                    return Err(ArrowError::ParseError(
42                        "Schema fields should be an array".to_string(),
43                    ))
44                }
45            };
46
47            let metadata = if let Some(value) = schema.get("metadata") {
48                from_metadata(value)?
49            } else {
50                HashMap::default()
51            };
52
53            Ok(Schema::new_with_metadata(fields, metadata))
54        }
55        _ => Err(ArrowError::ParseError(
56            "Invalid json value type for schema".to_string(),
57        )),
58    }
59}
60
61/// Parse a `metadata` definition from a JSON representation.
62/// The JSON can either be an Object or an Array of Objects.
63fn from_metadata(json: &serde_json::Value) -> Result<HashMap<String, String>> {
64    use serde_json::Value;
65    match json {
66        Value::Array(_) => {
67            let mut hashmap = HashMap::new();
68            let values: Vec<MetadataKeyValue> =
69                serde_json::from_value(json.clone()).map_err(|_| {
70                    ArrowError::JsonError("Unable to parse object into key-value pair".to_string())
71                })?;
72            for meta in values {
73                hashmap.insert(meta.key.clone(), meta.value);
74            }
75            Ok(hashmap)
76        }
77        Value::Object(md) => md
78            .iter()
79            .map(|(k, v)| {
80                if let Value::String(v) = v {
81                    Ok((k.to_string(), v.to_string()))
82                } else {
83                    Err(ArrowError::ParseError(
84                        "metadata `value` field must be a string".to_string(),
85                    ))
86                }
87            })
88            .collect::<Result<_>>(),
89        _ => Err(ArrowError::ParseError(
90            "`metadata` field must be an object".to_string(),
91        )),
92    }
93}
94
95#[derive(serde::Deserialize)]
96struct MetadataKeyValue {
97    key: String,
98    value: String,
99}
100
#[cfg(test)]
mod tests {
    use super::*;
    use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit};
    use serde_json::Value;
    use std::sync::Arc;

    /// Round-trips a schema covering many data types through `schema_to_json` and
    /// `schema_from_json`, then checks that empty and missing `metadata` entries both
    /// yield a schema without metadata.
    #[test]
    fn schema_json() {
        // Custom metadata that has to survive the round trip.
        let metadata = HashMap::from([("Key".to_string(), "Value".to_string())]);

        let schema = Schema::new_with_metadata(
            vec![
                Field::new("c1", DataType::Utf8, false),
                Field::new("c2", DataType::Binary, false),
                Field::new("c3", DataType::FixedSizeBinary(3), false),
                Field::new("c4", DataType::Boolean, false),
                Field::new("c5", DataType::Date32, false),
                Field::new("c6", DataType::Date64, false),
                Field::new("c7", DataType::Time32(TimeUnit::Second), false),
                Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false),
                Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false),
                Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false),
                Field::new("c11", DataType::Time64(TimeUnit::Second), false),
                Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false),
                Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false),
                Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false),
                Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false),
                Field::new(
                    "c16",
                    DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".into())),
                    false,
                ),
                Field::new(
                    "c17",
                    DataType::Timestamp(TimeUnit::Microsecond, Some("Africa/Johannesburg".into())),
                    false,
                ),
                Field::new(
                    "c18",
                    DataType::Timestamp(TimeUnit::Nanosecond, None),
                    false,
                ),
                Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false),
                Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false),
                Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false),
                Field::new(
                    "c22",
                    DataType::List(Arc::new(Field::new_list_field(DataType::Boolean, true))),
                    false,
                ),
                Field::new(
                    "c23",
                    DataType::FixedSizeList(
                        Arc::new(Field::new("bools", DataType::Boolean, false)),
                        5,
                    ),
                    false,
                ),
                Field::new(
                    "c24",
                    DataType::List(Arc::new(Field::new(
                        "inner_list",
                        DataType::List(Arc::new(Field::new(
                            "struct",
                            DataType::Struct(Fields::empty()),
                            true,
                        ))),
                        false,
                    ))),
                    true,
                ),
                Field::new(
                    "c25",
                    DataType::Struct(Fields::from(vec![
                        Field::new("a", DataType::Utf8, false),
                        Field::new("b", DataType::UInt16, false),
                    ])),
                    false,
                ),
                Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true),
                Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true),
                Field::new("c28", DataType::Interval(IntervalUnit::MonthDayNano), true),
                Field::new("c29", DataType::Duration(TimeUnit::Second), false),
                Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false),
                Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false),
                Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false),
                // The dictionary id (123) and ordering flag below must match the
                // "dictionary" entry of "c33" in the expected JSON.
                #[allow(deprecated)]
                Field::new_dict(
                    "c33",
                    DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
                    true,
                    123,
                    true,
                ),
                Field::new("c34", DataType::LargeBinary, true),
                Field::new("c35", DataType::LargeUtf8, true),
                Field::new(
                    "c36",
                    DataType::LargeList(Arc::new(Field::new(
                        "inner_large_list",
                        DataType::LargeList(Arc::new(Field::new(
                            "struct",
                            DataType::Struct(Fields::empty()),
                            false,
                        ))),
                        true,
                    ))),
                    true,
                ),
                Field::new(
                    "c37",
                    DataType::Map(
                        Arc::new(Field::new(
                            "my_entries",
                            DataType::Struct(Fields::from(vec![
                                Field::new("my_keys", DataType::Utf8, false),
                                Field::new("my_values", DataType::UInt16, true),
                            ])),
                            false,
                        )),
                        true,
                    ),
                    false,
                ),
            ],
            metadata,
        );

        // JSON produced from the in-memory schema.
        let generated = schema_to_json(&schema);
        // Hand-written JSON the schema is expected to serialize to.
        let json = r#"{
                "fields": [
                    {
                        "name": "c1",
                        "nullable": false,
                        "type": {
                            "name": "utf8"
                        },
                        "children": []
                    },
                    {
                        "name": "c2",
                        "nullable": false,
                        "type": {
                            "name": "binary"
                        },
                        "children": []
                    },
                    {
                        "name": "c3",
                        "nullable": false,
                        "type": {
                            "name": "fixedsizebinary",
                            "byteWidth": 3
                        },
                        "children": []
                    },
                    {
                        "name": "c4",
                        "nullable": false,
                        "type": {
                            "name": "bool"
                        },
                        "children": []
                    },
                    {
                        "name": "c5",
                        "nullable": false,
                        "type": {
                            "name": "date",
                            "unit": "DAY"
                        },
                        "children": []
                    },
                    {
                        "name": "c6",
                        "nullable": false,
                        "type": {
                            "name": "date",
                            "unit": "MILLISECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c7",
                        "nullable": false,
                        "type": {
                            "name": "time",
                            "bitWidth": 32,
                            "unit": "SECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c8",
                        "nullable": false,
                        "type": {
                            "name": "time",
                            "bitWidth": 32,
                            "unit": "MILLISECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c9",
                        "nullable": false,
                        "type": {
                            "name": "time",
                            "bitWidth": 32,
                            "unit": "MICROSECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c10",
                        "nullable": false,
                        "type": {
                            "name": "time",
                            "bitWidth": 32,
                            "unit": "NANOSECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c11",
                        "nullable": false,
                        "type": {
                            "name": "time",
                            "bitWidth": 64,
                            "unit": "SECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c12",
                        "nullable": false,
                        "type": {
                            "name": "time",
                            "bitWidth": 64,
                            "unit": "MILLISECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c13",
                        "nullable": false,
                        "type": {
                            "name": "time",
                            "bitWidth": 64,
                            "unit": "MICROSECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c14",
                        "nullable": false,
                        "type": {
                            "name": "time",
                            "bitWidth": 64,
                            "unit": "NANOSECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c15",
                        "nullable": false,
                        "type": {
                            "name": "timestamp",
                            "unit": "SECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c16",
                        "nullable": false,
                        "type": {
                            "name": "timestamp",
                            "unit": "MILLISECOND",
                            "timezone": "UTC"
                        },
                        "children": []
                    },
                    {
                        "name": "c17",
                        "nullable": false,
                        "type": {
                            "name": "timestamp",
                            "unit": "MICROSECOND",
                            "timezone": "Africa/Johannesburg"
                        },
                        "children": []
                    },
                    {
                        "name": "c18",
                        "nullable": false,
                        "type": {
                            "name": "timestamp",
                            "unit": "NANOSECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c19",
                        "nullable": false,
                        "type": {
                            "name": "interval",
                            "unit": "DAY_TIME"
                        },
                        "children": []
                    },
                    {
                        "name": "c20",
                        "nullable": false,
                        "type": {
                            "name": "interval",
                            "unit": "YEAR_MONTH"
                        },
                        "children": []
                    },
                    {
                        "name": "c21",
                        "nullable": false,
                        "type": {
                            "name": "interval",
                            "unit": "MONTH_DAY_NANO"
                        },
                        "children": []
                    },
                    {
                        "name": "c22",
                        "nullable": false,
                        "type": {
                            "name": "list"
                        },
                        "children": [
                            {
                                "name": "item",
                                "nullable": true,
                                "type": {
                                    "name": "bool"
                                },
                                "children": []
                            }
                        ]
                    },
                    {
                        "name": "c23",
                        "nullable": false,
                        "type": {
                            "name": "fixedsizelist",
                            "listSize": 5
                        },
                        "children": [
                            {
                                "name": "bools",
                                "nullable": false,
                                "type": {
                                    "name": "bool"
                                },
                                "children": []
                            }
                        ]
                    },
                    {
                        "name": "c24",
                        "nullable": true,
                        "type": {
                            "name": "list"
                        },
                        "children": [
                            {
                                "name": "inner_list",
                                "nullable": false,
                                "type": {
                                    "name": "list"
                                },
                                "children": [
                                    {
                                        "name": "struct",
                                        "nullable": true,
                                        "type": {
                                            "name": "struct"
                                        },
                                        "children": []
                                    }
                                ]
                            }
                        ]
                    },
                    {
                        "name": "c25",
                        "nullable": false,
                        "type": {
                            "name": "struct"
                        },
                        "children": [
                            {
                                "name": "a",
                                "nullable": false,
                                "type": {
                                    "name": "utf8"
                                },
                                "children": []
                            },
                            {
                                "name": "b",
                                "nullable": false,
                                "type": {
                                    "name": "int",
                                    "bitWidth": 16,
                                    "isSigned": false
                                },
                                "children": []
                            }
                        ]
                    },
                    {
                        "name": "c26",
                        "nullable": true,
                        "type": {
                            "name": "interval",
                            "unit": "YEAR_MONTH"
                        },
                        "children": []
                    },
                    {
                        "name": "c27",
                        "nullable": true,
                        "type": {
                            "name": "interval",
                            "unit": "DAY_TIME"
                        },
                        "children": []
                    },
                    {
                        "name": "c28",
                        "nullable": true,
                        "type": {
                            "name": "interval",
                            "unit": "MONTH_DAY_NANO"
                        },
                        "children": []
                    },
                    {
                        "name": "c29",
                        "nullable": false,
                        "type": {
                            "name": "duration",
                            "unit": "SECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c30",
                        "nullable": false,
                        "type": {
                            "name": "duration",
                            "unit": "MILLISECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c31",
                        "nullable": false,
                        "type": {
                            "name": "duration",
                            "unit": "MICROSECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c32",
                        "nullable": false,
                        "type": {
                            "name": "duration",
                            "unit": "NANOSECOND"
                        },
                        "children": []
                    },
                    {
                        "name": "c33",
                        "nullable": true,
                        "children": [],
                        "type": {
                          "name": "utf8"
                        },
                        "dictionary": {
                          "id": 123,
                          "indexType": {
                            "name": "int",
                            "bitWidth": 32,
                            "isSigned": true
                          },
                          "isOrdered": true
                        }
                    },
                    {
                        "name": "c34",
                        "nullable": true,
                        "type": {
                          "name": "largebinary"
                        },
                        "children": []
                    },
                    {
                        "name": "c35",
                        "nullable": true,
                        "type": {
                          "name": "largeutf8"
                        },
                        "children": []
                    },
                    {
                        "name": "c36",
                        "nullable": true,
                        "type": {
                          "name": "largelist"
                        },
                        "children": [
                            {
                                "name": "inner_large_list",
                                "nullable": true,
                                "type": {
                                    "name": "largelist"
                                },
                                "children": [
                                    {
                                        "name": "struct",
                                        "nullable": false,
                                        "type": {
                                            "name": "struct"
                                        },
                                        "children": []
                                    }
                                ]
                            }
                        ]
                    },
                    {
                        "name": "c37",
                        "nullable": false,
                        "type": {
                            "name": "map",
                            "keysSorted": true
                        },
                        "children": [
                            {
                                "name": "my_entries",
                                "nullable": false,
                                "type": {
                                    "name": "struct"
                                },
                                "children": [
                                    {
                                        "name": "my_keys",
                                        "nullable": false,
                                        "type": {
                                            "name": "utf8"
                                        },
                                        "children": []
                                    },
                                    {
                                        "name": "my_values",
                                        "nullable": true,
                                        "type": {
                                            "name": "int",
                                            "bitWidth": 16,
                                            "isSigned": false
                                        },
                                        "children": []
                                    }
                                ]
                            }
                        ]
                    }
                ],
                "metadata" : {
                    "Key": "Value"
                }
            }"#;
        let parsed: Value = serde_json::from_str(json).unwrap();
        assert_eq!(generated, parsed);

        // Convert the parsed JSON back into a schema; it must equal the one we started with.
        let round_tripped = schema_from_json(&parsed).unwrap();
        assert_eq!(schema, round_tripped);

        // Check that empty metadata produces empty value in JSON and can be parsed
        let json = r#"{
                "fields": [
                    {
                        "name": "c1",
                        "nullable": false,
                        "type": {
                            "name": "utf8"
                        },
                        "children": []
                    }
                ],
                "metadata": {}
            }"#;
        let parsed: Value = serde_json::from_str(json).unwrap();
        let empty_metadata_schema = schema_from_json(&parsed).unwrap();
        assert!(empty_metadata_schema.metadata.is_empty());

        // Check that metadata field is not required in the JSON.
        let json = r#"{
                "fields": [
                    {
                        "name": "c1",
                        "nullable": false,
                        "type": {
                            "name": "utf8"
                        },
                        "children": []
                    }
                ]
            }"#;
        let parsed: Value = serde_json::from_str(json).unwrap();
        let missing_metadata_schema = schema_from_json(&parsed).unwrap();
        assert!(missing_metadata_schema.metadata.is_empty());
    }
}