Skip to main content

arrow_integration_test/
field.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::{data_type_from_json, data_type_to_json};
19use arrow::datatypes::{DataType, Field};
20use arrow::error::{ArrowError, Result};
21use std::collections::HashMap;
22use std::sync::Arc;
23
24/// Parse a `Field` definition from a JSON representation.
25pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
26    use serde_json::Value;
27    match *json {
28        Value::Object(ref map) => {
29            let name = match map.get("name") {
30                Some(Value::String(name)) => name.to_string(),
31                _ => {
32                    return Err(ArrowError::ParseError(
33                        "Field missing 'name' attribute".to_string(),
34                    ));
35                }
36            };
37            let nullable = match map.get("nullable") {
38                Some(&Value::Bool(b)) => b,
39                _ => {
40                    return Err(ArrowError::ParseError(
41                        "Field missing 'nullable' attribute".to_string(),
42                    ));
43                }
44            };
45            let data_type = match map.get("type") {
46                Some(t) => data_type_from_json(t)?,
47                _ => {
48                    return Err(ArrowError::ParseError(
49                        "Field missing 'type' attribute".to_string(),
50                    ));
51                }
52            };
53
54            // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz
55            let metadata = match map.get("metadata") {
56                Some(Value::Array(values)) => {
57                    let mut res: HashMap<String, String> = HashMap::default();
58                    for value in values {
59                        match value.as_object() {
60                            Some(map) => {
61                                if map.len() != 2 {
62                                    return Err(ArrowError::ParseError(
63                                        "Field 'metadata' must have exact two entries for each key-value map".to_string(),
64                                    ));
65                                }
66                                if let (Some(k), Some(v)) = (map.get("key"), map.get("value")) {
67                                    if let (Some(k_str), Some(v_str)) = (k.as_str(), v.as_str()) {
68                                        res.insert(
69                                            k_str.to_string().clone(),
70                                            v_str.to_string().clone(),
71                                        );
72                                    } else {
73                                        return Err(ArrowError::ParseError(
74                                            "Field 'metadata' must have map value of string type"
75                                                .to_string(),
76                                        ));
77                                    }
78                                } else {
79                                    return Err(ArrowError::ParseError("Field 'metadata' lacks map keys named \"key\" or \"value\"".to_string()));
80                                }
81                            }
82                            _ => {
83                                return Err(ArrowError::ParseError(
84                                    "Field 'metadata' contains non-object key-value pair"
85                                        .to_string(),
86                                ));
87                            }
88                        }
89                    }
90                    res
91                }
92                // We also support map format, because Schema's metadata supports this.
93                // See https://github.com/apache/arrow/pull/5907
94                Some(Value::Object(values)) => {
95                    let mut res: HashMap<String, String> = HashMap::default();
96                    for (k, v) in values {
97                        if let Some(str_value) = v.as_str() {
98                            res.insert(k.clone(), str_value.to_string().clone());
99                        } else {
100                            return Err(ArrowError::ParseError(format!(
101                                "Field 'metadata' contains non-string value for key {k}"
102                            )));
103                        }
104                    }
105                    res
106                }
107                Some(_) => {
108                    return Err(ArrowError::ParseError(
109                        "Field `metadata` is not json array".to_string(),
110                    ));
111                }
112                _ => HashMap::default(),
113            };
114
115            // if data_type is a struct or list, get its children
116            let data_type = match data_type {
117                DataType::List(_)
118                | DataType::LargeList(_)
119                | DataType::ListView(_)
120                | DataType::LargeListView(_)
121                | DataType::FixedSizeList(_, _) => match map.get("children") {
122                    Some(Value::Array(values)) => {
123                        if values.len() != 1 {
124                            return Err(ArrowError::ParseError(
125                                "Field 'children' must have one element for a list data type"
126                                    .to_string(),
127                            ));
128                        }
129                        match data_type {
130                            DataType::List(_) => {
131                                DataType::List(Arc::new(field_from_json(&values[0])?))
132                            }
133                            DataType::LargeList(_) => {
134                                DataType::LargeList(Arc::new(field_from_json(&values[0])?))
135                            }
136                            DataType::ListView(_) => {
137                                DataType::ListView(Arc::new(field_from_json(&values[0])?))
138                            }
139                            DataType::LargeListView(_) => {
140                                DataType::LargeListView(Arc::new(field_from_json(&values[0])?))
141                            }
142                            DataType::FixedSizeList(_, int) => {
143                                DataType::FixedSizeList(Arc::new(field_from_json(&values[0])?), int)
144                            }
145                            _ => unreachable!(
146                                "Data type should be a list, largelist, listview, largelistview or fixedsizelist"
147                            ),
148                        }
149                    }
150                    Some(_) => {
151                        return Err(ArrowError::ParseError(
152                            "Field 'children' must be an array".to_string(),
153                        ));
154                    }
155                    None => {
156                        return Err(ArrowError::ParseError(
157                            "Field missing 'children' attribute".to_string(),
158                        ));
159                    }
160                },
161                DataType::Struct(_) => match map.get("children") {
162                    Some(Value::Array(values)) => {
163                        DataType::Struct(values.iter().map(field_from_json).collect::<Result<_>>()?)
164                    }
165                    Some(_) => {
166                        return Err(ArrowError::ParseError(
167                            "Field 'children' must be an array".to_string(),
168                        ));
169                    }
170                    None => {
171                        return Err(ArrowError::ParseError(
172                            "Field missing 'children' attribute".to_string(),
173                        ));
174                    }
175                },
176                DataType::Map(_, keys_sorted) => {
177                    match map.get("children") {
178                        Some(Value::Array(values)) if values.len() == 1 => {
179                            let child = field_from_json(&values[0])?;
180                            // child must be a struct
181                            match child.data_type() {
182                                DataType::Struct(map_fields) if map_fields.len() == 2 => {
183                                    DataType::Map(Arc::new(child), keys_sorted)
184                                }
185                                t => {
186                                    return Err(ArrowError::ParseError(format!(
187                                        "Map children should be a struct with 2 fields, found {t:?}"
188                                    )));
189                                }
190                            }
191                        }
192                        Some(_) => {
193                            return Err(ArrowError::ParseError(
194                                "Field 'children' must be an array with 1 element".to_string(),
195                            ));
196                        }
197                        None => {
198                            return Err(ArrowError::ParseError(
199                                "Field missing 'children' attribute".to_string(),
200                            ));
201                        }
202                    }
203                }
204                DataType::Union(fields, mode) => match map.get("children") {
205                    Some(Value::Array(values)) => {
206                        let fields = fields
207                            .iter()
208                            .zip(values)
209                            .map(|((id, _), value)| Ok((id, Arc::new(field_from_json(value)?))))
210                            .collect::<Result<_>>()?;
211
212                        DataType::Union(fields, mode)
213                    }
214                    Some(_) => {
215                        return Err(ArrowError::ParseError(
216                            "Field 'children' must be an array".to_string(),
217                        ));
218                    }
219                    None => {
220                        return Err(ArrowError::ParseError(
221                            "Field missing 'children' attribute".to_string(),
222                        ));
223                    }
224                },
225                DataType::RunEndEncoded(_, _) => match map.get("children") {
226                    Some(Value::Array(values)) => {
227                        if values.len() != 2 {
228                            return Err(ArrowError::ParseError(
229                                "Field 'children' must have exactly 2 elements for RunEndEncoded"
230                                    .to_string(),
231                            ));
232                        }
233                        let run_ends = Arc::new(field_from_json(&values[0])?);
234                        let values_field = Arc::new(field_from_json(&values[1])?);
235                        DataType::RunEndEncoded(run_ends, values_field)
236                    }
237                    Some(_) => {
238                        return Err(ArrowError::ParseError(
239                            "Field 'children' must be an array".to_string(),
240                        ));
241                    }
242                    None => {
243                        return Err(ArrowError::ParseError(
244                            "Field missing 'children' attribute".to_string(),
245                        ));
246                    }
247                },
248                _ => data_type,
249            };
250
251            let mut dict_id = 0;
252            let mut dict_is_ordered = false;
253
254            let data_type = match map.get("dictionary") {
255                Some(dictionary) => {
256                    let index_type = match dictionary.get("indexType") {
257                        Some(t) => data_type_from_json(t)?,
258                        _ => {
259                            return Err(ArrowError::ParseError(
260                                "Field missing 'indexType' attribute".to_string(),
261                            ));
262                        }
263                    };
264                    dict_id = match dictionary.get("id") {
265                        Some(Value::Number(n)) => n.as_i64().unwrap(),
266                        _ => {
267                            return Err(ArrowError::ParseError(
268                                "Field missing 'id' attribute".to_string(),
269                            ));
270                        }
271                    };
272                    dict_is_ordered = match dictionary.get("isOrdered") {
273                        Some(&Value::Bool(n)) => n,
274                        _ => {
275                            return Err(ArrowError::ParseError(
276                                "Field missing 'isOrdered' attribute".to_string(),
277                            ));
278                        }
279                    };
280                    DataType::Dictionary(Box::new(index_type), Box::new(data_type))
281                }
282                _ => data_type,
283            };
284
285            #[allow(deprecated)]
286            let mut field = Field::new_dict(name, data_type, nullable, dict_id, dict_is_ordered);
287            field.set_metadata(metadata);
288            Ok(field)
289        }
290        _ => Err(ArrowError::ParseError(
291            "Invalid json value type for field".to_string(),
292        )),
293    }
294}
295
296/// Generate a JSON representation of the `Field`.
297pub fn field_to_json(field: &Field) -> serde_json::Value {
298    let children: Vec<serde_json::Value> = match field.data_type() {
299        DataType::Struct(fields) => fields.iter().map(|x| field_to_json(x.as_ref())).collect(),
300        DataType::List(field)
301        | DataType::LargeList(field)
302        | DataType::ListView(field)
303        | DataType::LargeListView(field)
304        | DataType::FixedSizeList(field, _)
305        | DataType::Map(field, _) => vec![field_to_json(field)],
306        DataType::RunEndEncoded(run_ends, values) => {
307            vec![field_to_json(run_ends), field_to_json(values)]
308        }
309        _ => vec![],
310    };
311
312    match field.data_type() {
313        DataType::Dictionary(index_type, value_type) => {
314            #[allow(deprecated)]
315            let dict_id = field.dict_id().unwrap();
316            serde_json::json!({
317                "name": field.name(),
318                "nullable": field.is_nullable(),
319                "type": data_type_to_json(value_type),
320                "children": children,
321                "dictionary": {
322                    "id": dict_id,
323                    "indexType": data_type_to_json(index_type),
324                    "isOrdered": field.dict_is_ordered().unwrap(),
325                }
326            })
327        }
328        _ => serde_json::json!({
329            "name": field.name(),
330            "nullable": field.is_nullable(),
331            "type": data_type_to_json(field.data_type()),
332            "children": children
333        }),
334    }
335}
336
#[cfg(test)]
mod tests {
    use super::*;
    use arrow::datatypes::UnionMode;
    use serde_json::Value;

    /// Decode a JSON document, panicking if the text is not valid JSON.
    fn parse_json(text: &str) -> Value {
        serde_json::from_str(text).unwrap()
    }

    /// Non-nullable `address` struct with `street` and `zip` members.
    fn address_field() -> Field {
        Field::new_struct(
            "address",
            vec![
                Field::new("street", DataType::Utf8, false),
                Field::new("zip", DataType::UInt16, false),
            ],
            false,
        )
    }

    /// Non-nullable `my_map` map with sorted keys, `Utf8` keys and nullable
    /// `UInt16` values.
    fn my_map_field() -> Field {
        Field::new_map(
            "my_map",
            "my_entries",
            Field::new("my_keys", DataType::Utf8, false),
            Field::new("my_values", DataType::UInt16, true),
            true,
            false,
        )
    }

    /// A struct field serializes with one child object per member.
    #[test]
    fn struct_field_to_json() {
        let expected = parse_json(
            r#"{
                "name": "address",
                "nullable": false,
                "type": {
                    "name": "struct"
                },
                "children": [
                    {
                        "name": "street",
                        "nullable": false,
                        "type": {
                            "name": "utf8"
                        },
                        "children": []
                    },
                    {
                        "name": "zip",
                        "nullable": false,
                        "type": {
                            "name": "int",
                            "bitWidth": 16,
                            "isSigned": false
                        },
                        "children": []
                    }
                ]
            }"#,
        );
        let actual = field_to_json(&address_field());
        assert_eq!(actual, expected);
    }

    /// A map field serializes with a single entries-struct child.
    #[test]
    fn map_field_to_json() {
        let expected = parse_json(
            r#"{
                "name": "my_map",
                "nullable": false,
                "type": {
                    "name": "map",
                    "keysSorted": true
                },
                "children": [
                    {
                        "name": "my_entries",
                        "nullable": false,
                        "type": {
                            "name": "struct"
                        },
                        "children": [
                            {
                                "name": "my_keys",
                                "nullable": false,
                                "type": {
                                    "name": "utf8"
                                },
                                "children": []
                            },
                            {
                                "name": "my_values",
                                "nullable": true,
                                "type": {
                                    "name": "int",
                                    "bitWidth": 16,
                                    "isSigned": false
                                },
                                "children": []
                            }
                        ]
                    }
                ]
            }"#,
        );
        let actual = field_to_json(&my_map_field());
        assert_eq!(actual, expected);
    }

    /// A primitive field serializes with an empty `children` array.
    #[test]
    fn primitive_field_to_json() {
        let first_name = Field::new("first_name", DataType::Utf8, false);
        let expected = parse_json(
            r#"{
                "name": "first_name",
                "nullable": false,
                "type": {
                    "name": "utf8"
                },
                "children": []
            }"#,
        );
        assert_eq!(field_to_json(&first_name), expected);
    }

    /// A struct definition is decoded together with its child fields.
    #[test]
    fn parse_struct_from_json() {
        let document = parse_json(
            r#"
        {
            "name": "address",
            "type": {
                "name": "struct"
            },
            "nullable": false,
            "children": [
                {
                    "name": "street",
                    "type": {
                    "name": "utf8"
                    },
                    "nullable": false,
                    "children": []
                },
                {
                    "name": "zip",
                    "type": {
                    "name": "int",
                    "isSigned": false,
                    "bitWidth": 16
                    },
                    "nullable": false,
                    "children": []
                }
            ]
        }
        "#,
        );
        let parsed = field_from_json(&document).unwrap();

        assert_eq!(parsed, address_field());
    }

    /// A map definition is decoded with its entries struct as the only child.
    #[test]
    fn parse_map_from_json() {
        let document = parse_json(
            r#"
        {
            "name": "my_map",
            "nullable": false,
            "type": {
                "name": "map",
                "keysSorted": true
            },
            "children": [
                {
                    "name": "my_entries",
                    "nullable": false,
                    "type": {
                        "name": "struct"
                    },
                    "children": [
                        {
                            "name": "my_keys",
                            "nullable": false,
                            "type": {
                                "name": "utf8"
                            },
                            "children": []
                        },
                        {
                            "name": "my_values",
                            "nullable": true,
                            "type": {
                                "name": "int",
                                "bitWidth": 16,
                                "isSigned": false
                            },
                            "children": []
                        }
                    ]
                }
            ]
        }
        "#,
        );
        let parsed = field_from_json(&document).unwrap();

        assert_eq!(parsed, my_map_field());
    }

    /// A sparse union definition pairs each type id with the child at the
    /// same position.
    #[test]
    fn parse_union_from_json() {
        let document = parse_json(
            r#"
        {
            "name": "my_union",
            "nullable": false,
            "type": {
                "name": "union",
                "mode": "SPARSE",
                "typeIds": [
                    5,
                    7
                ]
            },
            "children": [
                {
                    "name": "f1",
                    "type": {
                        "name": "int",
                        "isSigned": true,
                        "bitWidth": 32
                    },
                    "nullable": true,
                    "children": []
                },
                {
                    "name": "f2",
                    "type": {
                        "name": "utf8"
                    },
                    "nullable": true,
                    "children": []
                }
            ]
        }
        "#,
        );
        let parsed = field_from_json(&document).unwrap();

        let members = vec![
            Field::new("f1", DataType::Int32, true),
            Field::new("f2", DataType::Utf8, true),
        ];
        let my_union = Field::new_union("my_union", vec![5, 7], members, UnionMode::Sparse);

        assert_eq!(parsed, my_union);
    }
}