arrow_integration_test/schema.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::{field_from_json, field_to_json};
19use arrow::datatypes::{Fields, Schema};
20use arrow::error::{ArrowError, Result};
21use std::collections::HashMap;
22
23/// Generate a JSON representation of the `Schema`.
24pub fn schema_to_json(schema: &Schema) -> serde_json::Value {
25 serde_json::json!({
26 "fields": schema.fields().iter().map(|f| field_to_json(f.as_ref())).collect::<Vec<_>>(),
27 "metadata": serde_json::to_value(schema.metadata()).unwrap()
28 })
29}
30
31/// Parse a `Schema` definition from a JSON representation.
32pub fn schema_from_json(json: &serde_json::Value) -> Result<Schema> {
33 use serde_json::Value;
34 match *json {
35 Value::Object(ref schema) => {
36 let fields: Fields = match schema.get("fields") {
37 Some(Value::Array(fields)) => {
38 fields.iter().map(field_from_json).collect::<Result<_>>()?
39 }
40 _ => {
41 return Err(ArrowError::ParseError(
42 "Schema fields should be an array".to_string(),
43 ))
44 }
45 };
46
47 let metadata = if let Some(value) = schema.get("metadata") {
48 from_metadata(value)?
49 } else {
50 HashMap::default()
51 };
52
53 Ok(Schema::new_with_metadata(fields, metadata))
54 }
55 _ => Err(ArrowError::ParseError(
56 "Invalid json value type for schema".to_string(),
57 )),
58 }
59}
60
61/// Parse a `metadata` definition from a JSON representation.
62/// The JSON can either be an Object or an Array of Objects.
63fn from_metadata(json: &serde_json::Value) -> Result<HashMap<String, String>> {
64 use serde_json::Value;
65 match json {
66 Value::Array(_) => {
67 let mut hashmap = HashMap::new();
68 let values: Vec<MetadataKeyValue> =
69 serde_json::from_value(json.clone()).map_err(|_| {
70 ArrowError::JsonError("Unable to parse object into key-value pair".to_string())
71 })?;
72 for meta in values {
73 hashmap.insert(meta.key.clone(), meta.value);
74 }
75 Ok(hashmap)
76 }
77 Value::Object(md) => md
78 .iter()
79 .map(|(k, v)| {
80 if let Value::String(v) = v {
81 Ok((k.to_string(), v.to_string()))
82 } else {
83 Err(ArrowError::ParseError(
84 "metadata `value` field must be a string".to_string(),
85 ))
86 }
87 })
88 .collect::<Result<_>>(),
89 _ => Err(ArrowError::ParseError(
90 "`metadata` field must be an object".to_string(),
91 )),
92 }
93}
94
95#[derive(serde::Deserialize)]
96struct MetadataKeyValue {
97 key: String,
98 value: String,
99}
100
#[cfg(test)]
mod tests {
    use super::*;
    use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit};
    use serde_json::Value;
    use std::sync::Arc;

    /// Round-trips a schema covering many data types through JSON, then checks
    /// that empty and missing `metadata` entries both parse to empty metadata.
    #[test]
    fn schema_json() {
        // Add some custom metadata
        let metadata: HashMap<String, String> =
            HashMap::from([("Key".to_string(), "Value".to_string())]);

        let fields = vec![
            Field::new("c1", DataType::Utf8, false),
            Field::new("c2", DataType::Binary, false),
            Field::new("c3", DataType::FixedSizeBinary(3), false),
            Field::new("c4", DataType::Boolean, false),
            Field::new("c5", DataType::Date32, false),
            Field::new("c6", DataType::Date64, false),
            Field::new("c7", DataType::Time32(TimeUnit::Second), false),
            Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false),
            Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false),
            Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false),
            Field::new("c11", DataType::Time64(TimeUnit::Second), false),
            Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false),
            Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false),
            Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false),
            Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false),
            Field::new(
                "c16",
                DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".into())),
                false,
            ),
            Field::new(
                "c17",
                DataType::Timestamp(TimeUnit::Microsecond, Some("Africa/Johannesburg".into())),
                false,
            ),
            Field::new(
                "c18",
                DataType::Timestamp(TimeUnit::Nanosecond, None),
                false,
            ),
            Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false),
            Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false),
            Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false),
            Field::new(
                "c22",
                DataType::List(Arc::new(Field::new_list_field(DataType::Boolean, true))),
                false,
            ),
            Field::new(
                "c23",
                DataType::FixedSizeList(
                    Arc::new(Field::new("bools", DataType::Boolean, false)),
                    5,
                ),
                false,
            ),
            Field::new(
                "c24",
                DataType::List(Arc::new(Field::new(
                    "inner_list",
                    DataType::List(Arc::new(Field::new(
                        "struct",
                        DataType::Struct(Fields::empty()),
                        true,
                    ))),
                    false,
                ))),
                true,
            ),
            Field::new(
                "c25",
                DataType::Struct(Fields::from(vec![
                    Field::new("a", DataType::Utf8, false),
                    Field::new("b", DataType::UInt16, false),
                ])),
                false,
            ),
            Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true),
            Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true),
            Field::new("c28", DataType::Interval(IntervalUnit::MonthDayNano), true),
            Field::new("c29", DataType::Duration(TimeUnit::Second), false),
            Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false),
            Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false),
            Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false),
            // The deprecation warning is silenced so the test can keep using
            // `Field::new_dict` for the dictionary-encoded column.
            #[allow(deprecated)]
            Field::new_dict(
                "c33",
                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
                true,
                123,
                true,
            ),
            Field::new("c34", DataType::LargeBinary, true),
            Field::new("c35", DataType::LargeUtf8, true),
            Field::new(
                "c36",
                DataType::LargeList(Arc::new(Field::new(
                    "inner_large_list",
                    DataType::LargeList(Arc::new(Field::new(
                        "struct",
                        DataType::Struct(Fields::empty()),
                        false,
                    ))),
                    true,
                ))),
                true,
            ),
            Field::new(
                "c37",
                DataType::Map(
                    Arc::new(Field::new(
                        "my_entries",
                        DataType::Struct(Fields::from(vec![
                            Field::new("my_keys", DataType::Utf8, false),
                            Field::new("my_values", DataType::UInt16, true),
                        ])),
                        false,
                    )),
                    true,
                ),
                false,
            ),
        ];
        let schema = Schema::new_with_metadata(fields, metadata);

        let serialized = schema_to_json(&schema);
        let full_json = r#"{
            "fields": [
                {
                    "name": "c1",
                    "nullable": false,
                    "type": {
                        "name": "utf8"
                    },
                    "children": []
                },
                {
                    "name": "c2",
                    "nullable": false,
                    "type": {
                        "name": "binary"
                    },
                    "children": []
                },
                {
                    "name": "c3",
                    "nullable": false,
                    "type": {
                        "name": "fixedsizebinary",
                        "byteWidth": 3
                    },
                    "children": []
                },
                {
                    "name": "c4",
                    "nullable": false,
                    "type": {
                        "name": "bool"
                    },
                    "children": []
                },
                {
                    "name": "c5",
                    "nullable": false,
                    "type": {
                        "name": "date",
                        "unit": "DAY"
                    },
                    "children": []
                },
                {
                    "name": "c6",
                    "nullable": false,
                    "type": {
                        "name": "date",
                        "unit": "MILLISECOND"
                    },
                    "children": []
                },
                {
                    "name": "c7",
                    "nullable": false,
                    "type": {
                        "name": "time",
                        "bitWidth": 32,
                        "unit": "SECOND"
                    },
                    "children": []
                },
                {
                    "name": "c8",
                    "nullable": false,
                    "type": {
                        "name": "time",
                        "bitWidth": 32,
                        "unit": "MILLISECOND"
                    },
                    "children": []
                },
                {
                    "name": "c9",
                    "nullable": false,
                    "type": {
                        "name": "time",
                        "bitWidth": 32,
                        "unit": "MICROSECOND"
                    },
                    "children": []
                },
                {
                    "name": "c10",
                    "nullable": false,
                    "type": {
                        "name": "time",
                        "bitWidth": 32,
                        "unit": "NANOSECOND"
                    },
                    "children": []
                },
                {
                    "name": "c11",
                    "nullable": false,
                    "type": {
                        "name": "time",
                        "bitWidth": 64,
                        "unit": "SECOND"
                    },
                    "children": []
                },
                {
                    "name": "c12",
                    "nullable": false,
                    "type": {
                        "name": "time",
                        "bitWidth": 64,
                        "unit": "MILLISECOND"
                    },
                    "children": []
                },
                {
                    "name": "c13",
                    "nullable": false,
                    "type": {
                        "name": "time",
                        "bitWidth": 64,
                        "unit": "MICROSECOND"
                    },
                    "children": []
                },
                {
                    "name": "c14",
                    "nullable": false,
                    "type": {
                        "name": "time",
                        "bitWidth": 64,
                        "unit": "NANOSECOND"
                    },
                    "children": []
                },
                {
                    "name": "c15",
                    "nullable": false,
                    "type": {
                        "name": "timestamp",
                        "unit": "SECOND"
                    },
                    "children": []
                },
                {
                    "name": "c16",
                    "nullable": false,
                    "type": {
                        "name": "timestamp",
                        "unit": "MILLISECOND",
                        "timezone": "UTC"
                    },
                    "children": []
                },
                {
                    "name": "c17",
                    "nullable": false,
                    "type": {
                        "name": "timestamp",
                        "unit": "MICROSECOND",
                        "timezone": "Africa/Johannesburg"
                    },
                    "children": []
                },
                {
                    "name": "c18",
                    "nullable": false,
                    "type": {
                        "name": "timestamp",
                        "unit": "NANOSECOND"
                    },
                    "children": []
                },
                {
                    "name": "c19",
                    "nullable": false,
                    "type": {
                        "name": "interval",
                        "unit": "DAY_TIME"
                    },
                    "children": []
                },
                {
                    "name": "c20",
                    "nullable": false,
                    "type": {
                        "name": "interval",
                        "unit": "YEAR_MONTH"
                    },
                    "children": []
                },
                {
                    "name": "c21",
                    "nullable": false,
                    "type": {
                        "name": "interval",
                        "unit": "MONTH_DAY_NANO"
                    },
                    "children": []
                },
                {
                    "name": "c22",
                    "nullable": false,
                    "type": {
                        "name": "list"
                    },
                    "children": [
                        {
                            "name": "item",
                            "nullable": true,
                            "type": {
                                "name": "bool"
                            },
                            "children": []
                        }
                    ]
                },
                {
                    "name": "c23",
                    "nullable": false,
                    "type": {
                        "name": "fixedsizelist",
                        "listSize": 5
                    },
                    "children": [
                        {
                            "name": "bools",
                            "nullable": false,
                            "type": {
                                "name": "bool"
                            },
                            "children": []
                        }
                    ]
                },
                {
                    "name": "c24",
                    "nullable": true,
                    "type": {
                        "name": "list"
                    },
                    "children": [
                        {
                            "name": "inner_list",
                            "nullable": false,
                            "type": {
                                "name": "list"
                            },
                            "children": [
                                {
                                    "name": "struct",
                                    "nullable": true,
                                    "type": {
                                        "name": "struct"
                                    },
                                    "children": []
                                }
                            ]
                        }
                    ]
                },
                {
                    "name": "c25",
                    "nullable": false,
                    "type": {
                        "name": "struct"
                    },
                    "children": [
                        {
                            "name": "a",
                            "nullable": false,
                            "type": {
                                "name": "utf8"
                            },
                            "children": []
                        },
                        {
                            "name": "b",
                            "nullable": false,
                            "type": {
                                "name": "int",
                                "bitWidth": 16,
                                "isSigned": false
                            },
                            "children": []
                        }
                    ]
                },
                {
                    "name": "c26",
                    "nullable": true,
                    "type": {
                        "name": "interval",
                        "unit": "YEAR_MONTH"
                    },
                    "children": []
                },
                {
                    "name": "c27",
                    "nullable": true,
                    "type": {
                        "name": "interval",
                        "unit": "DAY_TIME"
                    },
                    "children": []
                },
                {
                    "name": "c28",
                    "nullable": true,
                    "type": {
                        "name": "interval",
                        "unit": "MONTH_DAY_NANO"
                    },
                    "children": []
                },
                {
                    "name": "c29",
                    "nullable": false,
                    "type": {
                        "name": "duration",
                        "unit": "SECOND"
                    },
                    "children": []
                },
                {
                    "name": "c30",
                    "nullable": false,
                    "type": {
                        "name": "duration",
                        "unit": "MILLISECOND"
                    },
                    "children": []
                },
                {
                    "name": "c31",
                    "nullable": false,
                    "type": {
                        "name": "duration",
                        "unit": "MICROSECOND"
                    },
                    "children": []
                },
                {
                    "name": "c32",
                    "nullable": false,
                    "type": {
                        "name": "duration",
                        "unit": "NANOSECOND"
                    },
                    "children": []
                },
                {
                    "name": "c33",
                    "nullable": true,
                    "children": [],
                    "type": {
                        "name": "utf8"
                    },
                    "dictionary": {
                        "id": 123,
                        "indexType": {
                            "name": "int",
                            "bitWidth": 32,
                            "isSigned": true
                        },
                        "isOrdered": true
                    }
                },
                {
                    "name": "c34",
                    "nullable": true,
                    "type": {
                        "name": "largebinary"
                    },
                    "children": []
                },
                {
                    "name": "c35",
                    "nullable": true,
                    "type": {
                        "name": "largeutf8"
                    },
                    "children": []
                },
                {
                    "name": "c36",
                    "nullable": true,
                    "type": {
                        "name": "largelist"
                    },
                    "children": [
                        {
                            "name": "inner_large_list",
                            "nullable": true,
                            "type": {
                                "name": "largelist"
                            },
                            "children": [
                                {
                                    "name": "struct",
                                    "nullable": false,
                                    "type": {
                                        "name": "struct"
                                    },
                                    "children": []
                                }
                            ]
                        }
                    ]
                },
                {
                    "name": "c37",
                    "nullable": false,
                    "type": {
                        "name": "map",
                        "keysSorted": true
                    },
                    "children": [
                        {
                            "name": "my_entries",
                            "nullable": false,
                            "type": {
                                "name": "struct"
                            },
                            "children": [
                                {
                                    "name": "my_keys",
                                    "nullable": false,
                                    "type": {
                                        "name": "utf8"
                                    },
                                    "children": []
                                },
                                {
                                    "name": "my_values",
                                    "nullable": true,
                                    "type": {
                                        "name": "int",
                                        "bitWidth": 16,
                                        "isSigned": false
                                    },
                                    "children": []
                                }
                            ]
                        }
                    ]
                }
            ],
            "metadata" : {
                "Key": "Value"
            }
        }"#;
        let parsed: Value = serde_json::from_str(full_json).unwrap();
        assert_eq!(serialized, parsed);

        // convert back to a schema
        let reparsed: Value = serde_json::from_str(full_json).unwrap();
        let round_tripped = schema_from_json(&reparsed).unwrap();

        assert_eq!(schema, round_tripped);

        // Check that empty metadata produces empty value in JSON and can be parsed
        let empty_metadata_json = r#"{
            "fields": [
                {
                    "name": "c1",
                    "nullable": false,
                    "type": {
                        "name": "utf8"
                    },
                    "children": []
                }
            ],
            "metadata": {}
        }"#;
        let parsed: Value = serde_json::from_str(empty_metadata_json).unwrap();
        let empty_metadata_schema = schema_from_json(&parsed).unwrap();
        assert!(empty_metadata_schema.metadata().is_empty());

        // Check that metadata field is not required in the JSON.
        let no_metadata_json = r#"{
            "fields": [
                {
                    "name": "c1",
                    "nullable": false,
                    "type": {
                        "name": "utf8"
                    },
                    "children": []
                }
            ]
        }"#;
        let parsed: Value = serde_json::from_str(no_metadata_json).unwrap();
        let no_metadata_schema = schema_from_json(&parsed).unwrap();
        assert!(no_metadata_schema.metadata().is_empty());
    }
}