arrow_schema/
ffi.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html).
19//!
20//! ```
21//! # use arrow_schema::{DataType, Field, Schema};
22//! # use arrow_schema::ffi::FFI_ArrowSchema;
23//!
24//! // Create from data type
25//! let ffi_data_type = FFI_ArrowSchema::try_from(&DataType::LargeUtf8).unwrap();
26//! let back = DataType::try_from(&ffi_data_type).unwrap();
27//! assert_eq!(back, DataType::LargeUtf8);
28//!
29//! // Create from schema
30//! let schema = Schema::new(vec![Field::new("foo", DataType::Int64, false)]);
31//! let ffi_schema = FFI_ArrowSchema::try_from(&schema).unwrap();
32//! let back = Schema::try_from(&ffi_schema).unwrap();
33//!
34//! assert_eq!(schema, back);
35//! ```
36
37use crate::{
38    ArrowError, DataType, Field, FieldRef, IntervalUnit, Schema, TimeUnit, UnionFields, UnionMode,
39};
40use bitflags::bitflags;
41use std::borrow::Cow;
42use std::sync::Arc;
43use std::{
44    collections::HashMap,
45    ffi::{CStr, CString, c_char, c_void},
46};
47
48bitflags! {
49    /// Flags for [`FFI_ArrowSchema`]
50    ///
51    /// Old Workaround at <https://github.com/bitflags/bitflags/issues/356>
52    /// is no longer required as `bitflags` [fixed the issue](https://github.com/bitflags/bitflags/pull/355).
53    pub struct Flags: i64 {
54        /// Indicates that the dictionary is ordered
55        const DICTIONARY_ORDERED = 0b00000001;
56        /// Indicates that the field is nullable
57        const NULLABLE = 0b00000010;
58        /// Indicates that the map keys are sorted
59        const MAP_KEYS_SORTED = 0b00000100;
60    }
61}
62
/// ABI-compatible mirror of the `ArrowSchema` struct from the C Data Interface.
/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>
///
/// ```
/// # use arrow_schema::DataType;
/// # use arrow_schema::ffi::FFI_ArrowSchema;
/// fn array_schema(data_type: &DataType) -> FFI_ArrowSchema {
///     FFI_ArrowSchema::try_from(data_type).unwrap()
/// }
/// ```
#[repr(C)]
#[derive(Debug)]
#[allow(non_camel_case_types)]
pub struct FFI_ArrowSchema {
    /// NUL-terminated string describing the data type.
    format: *const c_char,
    /// NUL-terminated field name, or null when no name is set.
    name: *const c_char,
    /// Serialized key/value metadata, or null when there is none.
    metadata: *const c_char,
    /// Refer to [Arrow Flags](https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.flags)
    flags: i64,
    /// Number of entries in the `children` array.
    n_children: i64,
    /// Array of `n_children` pointers to the child schemas.
    children: *mut *mut Self,
    /// Schema of the dictionary values, or null.
    dictionary: *mut Self,
    /// Callback that frees everything owned by this schema; `None` once released.
    release: Option<unsafe extern "C" fn(arg1: *mut Self)>,
    /// Opaque pointer reserved for whoever produced the schema.
    private_data: *mut c_void,
}
89
90struct SchemaPrivateData {
91    children: Box<[*mut FFI_ArrowSchema]>,
92    dictionary: *mut FFI_ArrowSchema,
93    metadata: Option<Vec<u8>>,
94}
95
96// callback used to drop [FFI_ArrowSchema] when it is exported.
97unsafe extern "C" fn release_schema(schema: *mut FFI_ArrowSchema) {
98    if schema.is_null() {
99        return;
100    }
101    let schema = unsafe { &mut *schema };
102
103    // take ownership back to release it.
104    drop(unsafe { CString::from_raw(schema.format as *mut c_char) });
105    if !schema.name.is_null() {
106        drop(unsafe { CString::from_raw(schema.name as *mut c_char) });
107    }
108    if !schema.private_data.is_null() {
109        let private_data = unsafe { Box::from_raw(schema.private_data as *mut SchemaPrivateData) };
110        for child in private_data.children.iter() {
111            drop(unsafe { Box::from_raw(*child) })
112        }
113        if !private_data.dictionary.is_null() {
114            drop(unsafe { Box::from_raw(private_data.dictionary) });
115        }
116
117        drop(private_data);
118    }
119
120    schema.release = None;
121}
122
123impl FFI_ArrowSchema {
124    /// create a new [`FFI_ArrowSchema`]. This fails if the fields'
125    /// [`DataType`] is not supported.
126    pub fn try_new(
127        format: &str,
128        children: Vec<FFI_ArrowSchema>,
129        dictionary: Option<FFI_ArrowSchema>,
130    ) -> Result<Self, ArrowError> {
131        let mut this = Self::empty();
132
133        let children_ptr = children
134            .into_iter()
135            .map(Box::new)
136            .map(Box::into_raw)
137            .collect::<Box<_>>();
138
139        this.format = CString::new(format).unwrap().into_raw();
140        this.release = Some(release_schema);
141        this.n_children = children_ptr.len() as i64;
142
143        let dictionary_ptr = dictionary
144            .map(|d| Box::into_raw(Box::new(d)))
145            .unwrap_or(std::ptr::null_mut());
146
147        let mut private_data = Box::new(SchemaPrivateData {
148            children: children_ptr,
149            dictionary: dictionary_ptr,
150            metadata: None,
151        });
152
153        // intentionally set from private_data (see https://github.com/apache/arrow-rs/issues/580)
154        this.children = private_data.children.as_mut_ptr();
155
156        this.dictionary = dictionary_ptr;
157
158        this.private_data = Box::into_raw(private_data) as *mut c_void;
159
160        Ok(this)
161    }
162
163    /// Set the name of the schema
164    pub fn with_name(mut self, name: &str) -> Result<Self, ArrowError> {
165        self.name = CString::new(name).unwrap().into_raw();
166        Ok(self)
167    }
168
169    /// Set the flags of the schema
170    pub fn with_flags(mut self, flags: Flags) -> Result<Self, ArrowError> {
171        self.flags = flags.bits();
172        Ok(self)
173    }
174
175    /// Add metadata to the schema
176    pub fn with_metadata<I, S>(mut self, metadata: I) -> Result<Self, ArrowError>
177    where
178        I: IntoIterator<Item = (S, S)>,
179        S: AsRef<str>,
180    {
181        let metadata: Vec<(S, S)> = metadata.into_iter().collect();
182        // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata
183        let new_metadata = if !metadata.is_empty() {
184            let mut metadata_serialized: Vec<u8> = Vec::new();
185            let num_entries: i32 = metadata.len().try_into().map_err(|_| {
186                ArrowError::CDataInterface(format!(
187                    "metadata can only have {} entries, but {} were provided",
188                    i32::MAX,
189                    metadata.len()
190                ))
191            })?;
192            metadata_serialized.extend(num_entries.to_ne_bytes());
193
194            for (key, value) in metadata.into_iter() {
195                let key_len: i32 = key.as_ref().len().try_into().map_err(|_| {
196                    ArrowError::CDataInterface(format!(
197                        "metadata key can only have {} bytes, but {} were provided",
198                        i32::MAX,
199                        key.as_ref().len()
200                    ))
201                })?;
202                let value_len: i32 = value.as_ref().len().try_into().map_err(|_| {
203                    ArrowError::CDataInterface(format!(
204                        "metadata value can only have {} bytes, but {} were provided",
205                        i32::MAX,
206                        value.as_ref().len()
207                    ))
208                })?;
209
210                metadata_serialized.extend(key_len.to_ne_bytes());
211                metadata_serialized.extend_from_slice(key.as_ref().as_bytes());
212                metadata_serialized.extend(value_len.to_ne_bytes());
213                metadata_serialized.extend_from_slice(value.as_ref().as_bytes());
214            }
215
216            self.metadata = metadata_serialized.as_ptr() as *const c_char;
217            Some(metadata_serialized)
218        } else {
219            self.metadata = std::ptr::null_mut();
220            None
221        };
222
223        unsafe {
224            let mut private_data = Box::from_raw(self.private_data as *mut SchemaPrivateData);
225            private_data.metadata = new_metadata;
226            self.private_data = Box::into_raw(private_data) as *mut c_void;
227        }
228
229        Ok(self)
230    }
231
232    /// Takes ownership of the pointed to [`FFI_ArrowSchema`]
233    ///
234    /// This acts to [move] the data out of `schema`, setting the release callback to NULL
235    ///
236    /// # Safety
237    ///
238    /// * `schema` must be [valid] for reads and writes
239    /// * `schema` must be properly aligned
240    /// * `schema` must point to a properly initialized value of [`FFI_ArrowSchema`]
241    ///
242    /// [move]: https://arrow.apache.org/docs/format/CDataInterface.html#moving-an-array
243    /// [valid]: https://doc.rust-lang.org/std/ptr/index.html#safety
244    pub unsafe fn from_raw(schema: *mut FFI_ArrowSchema) -> Self {
245        unsafe { std::ptr::replace(schema, Self::empty()) }
246    }
247
248    /// Create an empty [`FFI_ArrowSchema`]
249    pub fn empty() -> Self {
250        Self {
251            format: std::ptr::null_mut(),
252            name: std::ptr::null_mut(),
253            metadata: std::ptr::null_mut(),
254            flags: 0,
255            n_children: 0,
256            children: std::ptr::null_mut(),
257            dictionary: std::ptr::null_mut(),
258            release: None,
259            private_data: std::ptr::null_mut(),
260        }
261    }
262
263    /// Returns the format of this schema.
264    pub fn format(&self) -> &str {
265        assert!(!self.format.is_null());
266        // safe because the lifetime of `self.format` equals `self`
267        unsafe { CStr::from_ptr(self.format) }
268            .to_str()
269            .expect("The external API has a non-utf8 as format")
270    }
271
272    /// Returns the name of this schema.
273    pub fn name(&self) -> Option<&str> {
274        if self.name.is_null() {
275            None
276        } else {
277            // safe because the lifetime of `self.name` equals `self`
278            Some(
279                unsafe { CStr::from_ptr(self.name) }
280                    .to_str()
281                    .expect("The external API has a non-utf8 as name"),
282            )
283        }
284    }
285
286    /// Returns the flags of this schema.
287    pub fn flags(&self) -> Option<Flags> {
288        Flags::from_bits(self.flags)
289    }
290
291    /// Returns the child of this schema at `index`.
292    ///
293    /// # Panics
294    ///
295    /// Panics if `index` is greater than or equal to the number of children.
296    ///
297    /// This is to make sure that the unsafe acces to raw pointer is sound.
298    pub fn child(&self, index: usize) -> &Self {
299        assert!(index < self.n_children as usize);
300        unsafe { self.children.add(index).as_ref().unwrap().as_ref().unwrap() }
301    }
302
303    /// Returns an iterator to the schema's children.
304    pub fn children(&self) -> impl Iterator<Item = &Self> {
305        (0..self.n_children as usize).map(move |i| self.child(i))
306    }
307
308    /// Returns if the field is semantically nullable,
309    /// regardless of whether it actually has null values.
310    pub fn nullable(&self) -> bool {
311        (self.flags / 2) & 1 == 1
312    }
313
314    /// Returns the reference to the underlying dictionary of the schema.
315    /// Check [ArrowSchema.dictionary](https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.dictionary).
316    ///
317    /// This must be `Some` if the schema represents a dictionary-encoded type, `None` otherwise.
318    pub fn dictionary(&self) -> Option<&Self> {
319        unsafe { self.dictionary.as_ref() }
320    }
321
322    /// For map types, returns whether the keys within each map value are sorted.
323    ///
324    /// Refer to [Arrow Flags](https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.flags)
325    pub fn map_keys_sorted(&self) -> bool {
326        self.flags & 0b00000100 != 0
327    }
328
329    /// For dictionary-encoded types, returns whether the ordering of dictionary indices is semantically meaningful.
330    pub fn dictionary_ordered(&self) -> bool {
331        self.flags & 0b00000001 != 0
332    }
333
334    /// Returns the metadata in the schema as `Key-Value` pairs
335    pub fn metadata(&self) -> Result<HashMap<String, String>, ArrowError> {
336        if self.metadata.is_null() {
337            Ok(HashMap::new())
338        } else {
339            let mut pos = 0;
340
341            // On some platforms, c_char = u8, and on some, c_char = i8. Where c_char = u8, clippy
342            // wants to complain that we're casting to the same type, but if we remove the cast,
343            // this will fail to compile on the other platforms. So we must allow it.
344            #[allow(clippy::unnecessary_cast)]
345            let buffer: *const u8 = self.metadata as *const u8;
346
347            fn next_four_bytes(buffer: *const u8, pos: &mut isize) -> [u8; 4] {
348                let out = unsafe {
349                    [
350                        *buffer.offset(*pos),
351                        *buffer.offset(*pos + 1),
352                        *buffer.offset(*pos + 2),
353                        *buffer.offset(*pos + 3),
354                    ]
355                };
356                *pos += 4;
357                out
358            }
359
360            fn next_n_bytes(buffer: *const u8, pos: &mut isize, n: i32) -> &[u8] {
361                let out = unsafe {
362                    std::slice::from_raw_parts(buffer.offset(*pos), n.try_into().unwrap())
363                };
364                *pos += isize::try_from(n).unwrap();
365                out
366            }
367
368            let num_entries = i32::from_ne_bytes(next_four_bytes(buffer, &mut pos));
369            if num_entries < 0 {
370                return Err(ArrowError::CDataInterface(
371                    "Negative number of metadata entries".to_string(),
372                ));
373            }
374
375            let mut metadata =
376                HashMap::with_capacity(num_entries.try_into().expect("Too many metadata entries"));
377
378            for _ in 0..num_entries {
379                let key_length = i32::from_ne_bytes(next_four_bytes(buffer, &mut pos));
380                if key_length < 0 {
381                    return Err(ArrowError::CDataInterface(
382                        "Negative key length in metadata".to_string(),
383                    ));
384                }
385                let key = String::from_utf8(next_n_bytes(buffer, &mut pos, key_length).to_vec())?;
386                let value_length = i32::from_ne_bytes(next_four_bytes(buffer, &mut pos));
387                if value_length < 0 {
388                    return Err(ArrowError::CDataInterface(
389                        "Negative value length in metadata".to_string(),
390                    ));
391                }
392                let value =
393                    String::from_utf8(next_n_bytes(buffer, &mut pos, value_length).to_vec())?;
394                metadata.insert(key, value);
395            }
396
397            Ok(metadata)
398        }
399    }
400}
401
402impl Drop for FFI_ArrowSchema {
403    fn drop(&mut self) {
404        match self.release {
405            None => (),
406            Some(release) => unsafe { release(self) },
407        };
408    }
409}
410
// NOTE(review): this asserts that a schema may be moved to another thread even
// though it holds raw pointers and a release callback. The soundness argument
// is not stated in this file; presumably release callbacks are expected to be
// callable from any thread. Confirm against the C Data Interface specification.
unsafe impl Send for FFI_ArrowSchema {}
412
413impl TryFrom<&FFI_ArrowSchema> for DataType {
414    type Error = ArrowError;
415
416    /// See [CDataInterface docs](https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings)
417    fn try_from(c_schema: &FFI_ArrowSchema) -> Result<Self, ArrowError> {
418        let mut dtype = match c_schema.format() {
419            "n" => DataType::Null,
420            "b" => DataType::Boolean,
421            "c" => DataType::Int8,
422            "C" => DataType::UInt8,
423            "s" => DataType::Int16,
424            "S" => DataType::UInt16,
425            "i" => DataType::Int32,
426            "I" => DataType::UInt32,
427            "l" => DataType::Int64,
428            "L" => DataType::UInt64,
429            "e" => DataType::Float16,
430            "f" => DataType::Float32,
431            "g" => DataType::Float64,
432            "vz" => DataType::BinaryView,
433            "z" => DataType::Binary,
434            "Z" => DataType::LargeBinary,
435            "vu" => DataType::Utf8View,
436            "u" => DataType::Utf8,
437            "U" => DataType::LargeUtf8,
438            "tdD" => DataType::Date32,
439            "tdm" => DataType::Date64,
440            "tts" => DataType::Time32(TimeUnit::Second),
441            "ttm" => DataType::Time32(TimeUnit::Millisecond),
442            "ttu" => DataType::Time64(TimeUnit::Microsecond),
443            "ttn" => DataType::Time64(TimeUnit::Nanosecond),
444            "tDs" => DataType::Duration(TimeUnit::Second),
445            "tDm" => DataType::Duration(TimeUnit::Millisecond),
446            "tDu" => DataType::Duration(TimeUnit::Microsecond),
447            "tDn" => DataType::Duration(TimeUnit::Nanosecond),
448            "tiM" => DataType::Interval(IntervalUnit::YearMonth),
449            "tiD" => DataType::Interval(IntervalUnit::DayTime),
450            "tin" => DataType::Interval(IntervalUnit::MonthDayNano),
451            "+l" => {
452                let c_child = c_schema.child(0);
453                DataType::List(Arc::new(Field::try_from(c_child)?))
454            }
455            "+L" => {
456                let c_child = c_schema.child(0);
457                DataType::LargeList(Arc::new(Field::try_from(c_child)?))
458            }
459            "+vl" => {
460                let c_child = c_schema.child(0);
461                DataType::ListView(Arc::new(Field::try_from(c_child)?))
462            }
463            "+vL" => {
464                let c_child = c_schema.child(0);
465                DataType::LargeListView(Arc::new(Field::try_from(c_child)?))
466            }
467            "+s" => {
468                let fields = c_schema.children().map(Field::try_from);
469                DataType::Struct(fields.collect::<Result<_, ArrowError>>()?)
470            }
471            "+m" => {
472                let c_child = c_schema.child(0);
473                let map_keys_sorted = c_schema.map_keys_sorted();
474                DataType::Map(Arc::new(Field::try_from(c_child)?), map_keys_sorted)
475            }
476            "+r" => {
477                let c_run_ends = c_schema.child(0);
478                let c_values = c_schema.child(1);
479                DataType::RunEndEncoded(
480                    Arc::new(Field::try_from(c_run_ends)?),
481                    Arc::new(Field::try_from(c_values)?),
482                )
483            }
484            // Parametrized types, requiring string parse
485            other => {
486                match other.splitn(2, ':').collect::<Vec<&str>>().as_slice() {
487                    // FixedSizeBinary type in format "w:num_bytes"
488                    ["w", num_bytes] => {
489                        let parsed_num_bytes = num_bytes.parse::<i32>().map_err(|_| {
490                            ArrowError::CDataInterface(
491                                "FixedSizeBinary requires an integer parameter representing number of bytes per element".to_string())
492                        })?;
493                        DataType::FixedSizeBinary(parsed_num_bytes)
494                    }
495                    // FixedSizeList type in format "+w:num_elems"
496                    ["+w", num_elems] => {
497                        let c_child = c_schema.child(0);
498                        let parsed_num_elems = num_elems.parse::<i32>().map_err(|_| {
499                            ArrowError::CDataInterface(
500                                "The FixedSizeList type requires an integer parameter representing number of elements per list".to_string())
501                        })?;
502                        DataType::FixedSizeList(
503                            Arc::new(Field::try_from(c_child)?),
504                            parsed_num_elems,
505                        )
506                    }
507                    // Decimal types in format "d:precision,scale" or "d:precision,scale,bitWidth"
508                    ["d", extra] => match extra.splitn(3, ',').collect::<Vec<&str>>().as_slice() {
509                        [precision, scale] => {
510                            let parsed_precision = precision.parse::<u8>().map_err(|_| {
511                                ArrowError::CDataInterface(
512                                    "The decimal type requires an integer precision".to_string(),
513                                )
514                            })?;
515                            let parsed_scale = scale.parse::<i8>().map_err(|_| {
516                                ArrowError::CDataInterface(
517                                    "The decimal type requires an integer scale".to_string(),
518                                )
519                            })?;
520                            DataType::Decimal128(parsed_precision, parsed_scale)
521                        }
522                        [precision, scale, bits] => {
523                            let parsed_precision = precision.parse::<u8>().map_err(|_| {
524                                ArrowError::CDataInterface(
525                                    "The decimal type requires an integer precision".to_string(),
526                                )
527                            })?;
528                            let parsed_scale = scale.parse::<i8>().map_err(|_| {
529                                ArrowError::CDataInterface(
530                                    "The decimal type requires an integer scale".to_string(),
531                                )
532                            })?;
533                            match *bits {
534                                    "32" => DataType::Decimal32(parsed_precision, parsed_scale),
535                                    "64" => DataType::Decimal64(parsed_precision, parsed_scale),
536                                    "128" => DataType::Decimal128(parsed_precision, parsed_scale),
537                                    "256" => DataType::Decimal256(parsed_precision, parsed_scale),
538                                    _ => return Err(ArrowError::CDataInterface("Only 32/64/128/256 bit wide decimals are supported in the Rust implementation".to_string())),
539                                }
540                        }
541                        _ => {
542                            return Err(ArrowError::CDataInterface(format!(
543                                "The decimal pattern \"d:{extra:?}\" is not supported in the Rust implementation"
544                            )));
545                        }
546                    },
547                    // DenseUnion
548                    ["+ud", extra] => {
549                        let type_ids = extra
550                            .split(',')
551                            .map(|t| {
552                                t.parse::<i8>().map_err(|_| {
553                                    ArrowError::CDataInterface(
554                                        "The Union type requires an integer type id".to_string(),
555                                    )
556                                })
557                            })
558                            .collect::<Result<Vec<_>, ArrowError>>()?;
559                        let mut fields = Vec::with_capacity(type_ids.len());
560                        for idx in 0..c_schema.n_children {
561                            let c_child = c_schema.child(idx as usize);
562                            let field = Field::try_from(c_child)?;
563                            fields.push(field);
564                        }
565
566                        if fields.len() != type_ids.len() {
567                            return Err(ArrowError::CDataInterface(
568                                "The Union type requires same number of fields and type ids"
569                                    .to_string(),
570                            ));
571                        }
572
573                        DataType::Union(UnionFields::new(type_ids, fields), UnionMode::Dense)
574                    }
575                    // SparseUnion
576                    ["+us", extra] => {
577                        let type_ids = extra
578                            .split(',')
579                            .map(|t| {
580                                t.parse::<i8>().map_err(|_| {
581                                    ArrowError::CDataInterface(
582                                        "The Union type requires an integer type id".to_string(),
583                                    )
584                                })
585                            })
586                            .collect::<Result<Vec<_>, ArrowError>>()?;
587                        let mut fields = Vec::with_capacity(type_ids.len());
588                        for idx in 0..c_schema.n_children {
589                            let c_child = c_schema.child(idx as usize);
590                            let field = Field::try_from(c_child)?;
591                            fields.push(field);
592                        }
593
594                        if fields.len() != type_ids.len() {
595                            return Err(ArrowError::CDataInterface(
596                                "The Union type requires same number of fields and type ids"
597                                    .to_string(),
598                            ));
599                        }
600
601                        DataType::Union(UnionFields::new(type_ids, fields), UnionMode::Sparse)
602                    }
603
604                    // Timestamps in format "tts:" and "tts:America/New_York" for no timezones and timezones resp.
605                    ["tss", ""] => DataType::Timestamp(TimeUnit::Second, None),
606                    ["tsm", ""] => DataType::Timestamp(TimeUnit::Millisecond, None),
607                    ["tsu", ""] => DataType::Timestamp(TimeUnit::Microsecond, None),
608                    ["tsn", ""] => DataType::Timestamp(TimeUnit::Nanosecond, None),
609                    ["tss", tz] => DataType::Timestamp(TimeUnit::Second, Some(Arc::from(*tz))),
610                    ["tsm", tz] => DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::from(*tz))),
611                    ["tsu", tz] => DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from(*tz))),
612                    ["tsn", tz] => DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from(*tz))),
613                    _ => {
614                        return Err(ArrowError::CDataInterface(format!(
615                            "The datatype \"{other:?}\" is still not supported in Rust implementation"
616                        )));
617                    }
618                }
619            }
620        };
621
622        if let Some(dict_schema) = c_schema.dictionary() {
623            let value_type = Self::try_from(dict_schema)?;
624            dtype = DataType::Dictionary(Box::new(dtype), Box::new(value_type));
625        }
626
627        Ok(dtype)
628    }
629}
630
631impl TryFrom<&FFI_ArrowSchema> for Field {
632    type Error = ArrowError;
633
634    fn try_from(c_schema: &FFI_ArrowSchema) -> Result<Self, ArrowError> {
635        let dtype = DataType::try_from(c_schema)?;
636        let mut field = Field::new(c_schema.name().unwrap_or(""), dtype, c_schema.nullable());
637        field.set_metadata(c_schema.metadata()?);
638        Ok(field)
639    }
640}
641
642impl TryFrom<&FFI_ArrowSchema> for Schema {
643    type Error = ArrowError;
644
645    fn try_from(c_schema: &FFI_ArrowSchema) -> Result<Self, ArrowError> {
646        // interpret it as a struct type then extract its fields
647        let dtype = DataType::try_from(c_schema)?;
648        if let DataType::Struct(fields) = dtype {
649            Ok(Schema::new(fields).with_metadata(c_schema.metadata()?))
650        } else {
651            Err(ArrowError::CDataInterface(
652                "Unable to interpret C data struct as a Schema".to_string(),
653            ))
654        }
655    }
656}
657
658impl TryFrom<&DataType> for FFI_ArrowSchema {
659    type Error = ArrowError;
660
661    /// See [CDataInterface docs](https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings)
662    fn try_from(dtype: &DataType) -> Result<Self, ArrowError> {
663        let format = get_format_string(dtype)?;
664        // allocate and hold the children
665        let children = match dtype {
666            DataType::List(child)
667            | DataType::LargeList(child)
668            | DataType::ListView(child)
669            | DataType::LargeListView(child)
670            | DataType::FixedSizeList(child, _)
671            | DataType::Map(child, _) => {
672                vec![FFI_ArrowSchema::try_from(child.as_ref())?]
673            }
674            DataType::Union(fields, _) => fields
675                .iter()
676                .map(|(_, f)| f.as_ref().try_into())
677                .collect::<Result<Vec<_>, ArrowError>>()?,
678            DataType::Struct(fields) => fields
679                .iter()
680                .map(FFI_ArrowSchema::try_from)
681                .collect::<Result<Vec<_>, ArrowError>>()?,
682            DataType::RunEndEncoded(run_ends, values) => vec![
683                FFI_ArrowSchema::try_from(run_ends.as_ref())?,
684                FFI_ArrowSchema::try_from(values.as_ref())?,
685            ],
686            _ => vec![],
687        };
688        let dictionary = if let DataType::Dictionary(_, value_data_type) = dtype {
689            Some(Self::try_from(value_data_type.as_ref())?)
690        } else {
691            None
692        };
693
694        let flags = match dtype {
695            DataType::Map(_, true) => Flags::MAP_KEYS_SORTED,
696            _ => Flags::empty(),
697        };
698
699        FFI_ArrowSchema::try_new(&format, children, dictionary)?.with_flags(flags)
700    }
701}
702
703fn get_format_string(dtype: &DataType) -> Result<Cow<'static, str>, ArrowError> {
704    match dtype {
705        DataType::Null => Ok("n".into()),
706        DataType::Boolean => Ok("b".into()),
707        DataType::Int8 => Ok("c".into()),
708        DataType::UInt8 => Ok("C".into()),
709        DataType::Int16 => Ok("s".into()),
710        DataType::UInt16 => Ok("S".into()),
711        DataType::Int32 => Ok("i".into()),
712        DataType::UInt32 => Ok("I".into()),
713        DataType::Int64 => Ok("l".into()),
714        DataType::UInt64 => Ok("L".into()),
715        DataType::Float16 => Ok("e".into()),
716        DataType::Float32 => Ok("f".into()),
717        DataType::Float64 => Ok("g".into()),
718        DataType::BinaryView => Ok("vz".into()),
719        DataType::Binary => Ok("z".into()),
720        DataType::LargeBinary => Ok("Z".into()),
721        DataType::Utf8View => Ok("vu".into()),
722        DataType::Utf8 => Ok("u".into()),
723        DataType::LargeUtf8 => Ok("U".into()),
724        DataType::FixedSizeBinary(num_bytes) => Ok(Cow::Owned(format!("w:{num_bytes}"))),
725        DataType::FixedSizeList(_, num_elems) => Ok(Cow::Owned(format!("+w:{num_elems}"))),
726        DataType::Decimal32(precision, scale) => {
727            Ok(Cow::Owned(format!("d:{precision},{scale},32")))
728        }
729        DataType::Decimal64(precision, scale) => {
730            Ok(Cow::Owned(format!("d:{precision},{scale},64")))
731        }
732        DataType::Decimal128(precision, scale) => Ok(Cow::Owned(format!("d:{precision},{scale}"))),
733        DataType::Decimal256(precision, scale) => {
734            Ok(Cow::Owned(format!("d:{precision},{scale},256")))
735        }
736        DataType::Date32 => Ok("tdD".into()),
737        DataType::Date64 => Ok("tdm".into()),
738        DataType::Time32(TimeUnit::Second) => Ok("tts".into()),
739        DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".into()),
740        DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".into()),
741        DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".into()),
742        DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".into()),
743        DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".into()),
744        DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".into()),
745        DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".into()),
746        DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(Cow::Owned(format!("tss:{tz}"))),
747        DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(Cow::Owned(format!("tsm:{tz}"))),
748        DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(Cow::Owned(format!("tsu:{tz}"))),
749        DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(Cow::Owned(format!("tsn:{tz}"))),
750        DataType::Duration(TimeUnit::Second) => Ok("tDs".into()),
751        DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".into()),
752        DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".into()),
753        DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".into()),
754        DataType::Interval(IntervalUnit::YearMonth) => Ok("tiM".into()),
755        DataType::Interval(IntervalUnit::DayTime) => Ok("tiD".into()),
756        DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".into()),
757        DataType::List(_) => Ok("+l".into()),
758        DataType::LargeList(_) => Ok("+L".into()),
759        DataType::ListView(_) => Ok("+vl".into()),
760        DataType::LargeListView(_) => Ok("+vL".into()),
761        DataType::Struct(_) => Ok("+s".into()),
762        DataType::Map(_, _) => Ok("+m".into()),
763        DataType::RunEndEncoded(_, _) => Ok("+r".into()),
764        DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type),
765        DataType::Union(fields, mode) => {
766            let formats = fields
767                .iter()
768                .map(|(t, _)| t.to_string())
769                .collect::<Vec<_>>();
770            match mode {
771                UnionMode::Dense => Ok(Cow::Owned(format!("{}:{}", "+ud", formats.join(",")))),
772                UnionMode::Sparse => Ok(Cow::Owned(format!("{}:{}", "+us", formats.join(",")))),
773            }
774        }
775        other => Err(ArrowError::CDataInterface(format!(
776            "The datatype \"{other:?}\" is still not supported in Rust implementation"
777        ))),
778    }
779}
780
781impl TryFrom<&FieldRef> for FFI_ArrowSchema {
782    type Error = ArrowError;
783
784    fn try_from(value: &FieldRef) -> Result<Self, Self::Error> {
785        value.as_ref().try_into()
786    }
787}
788
789impl TryFrom<&Field> for FFI_ArrowSchema {
790    type Error = ArrowError;
791
792    fn try_from(field: &Field) -> Result<Self, ArrowError> {
793        let mut flags = if field.is_nullable() {
794            Flags::NULLABLE
795        } else {
796            Flags::empty()
797        };
798
799        if let Some(true) = field.dict_is_ordered() {
800            flags |= Flags::DICTIONARY_ORDERED;
801        }
802
803        FFI_ArrowSchema::try_from(field.data_type())?
804            .with_name(field.name())?
805            .with_flags(flags)?
806            .with_metadata(field.metadata())
807    }
808}
809
810impl TryFrom<&Schema> for FFI_ArrowSchema {
811    type Error = ArrowError;
812
813    fn try_from(schema: &Schema) -> Result<Self, ArrowError> {
814        let dtype = DataType::Struct(schema.fields().clone());
815        let c_schema = FFI_ArrowSchema::try_from(&dtype)?.with_metadata(&schema.metadata)?;
816        Ok(c_schema)
817    }
818}
819
820impl TryFrom<DataType> for FFI_ArrowSchema {
821    type Error = ArrowError;
822
823    fn try_from(dtype: DataType) -> Result<Self, ArrowError> {
824        FFI_ArrowSchema::try_from(&dtype)
825    }
826}
827
828impl TryFrom<Field> for FFI_ArrowSchema {
829    type Error = ArrowError;
830
831    fn try_from(field: Field) -> Result<Self, ArrowError> {
832        FFI_ArrowSchema::try_from(&field)
833    }
834}
835
836impl TryFrom<Schema> for FFI_ArrowSchema {
837    type Error = ArrowError;
838
839    fn try_from(schema: Schema) -> Result<Self, ArrowError> {
840        FFI_ArrowSchema::try_from(&schema)
841    }
842}
843
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Fields;

    /// Exports `dtype` to a C schema and asserts the re-imported type is equal.
    fn round_trip_type(dtype: DataType) {
        let exported = FFI_ArrowSchema::try_from(&dtype).unwrap();
        let imported = DataType::try_from(&exported).unwrap();
        assert_eq!(imported, dtype);
    }

    /// Exports `field` to a C schema and asserts the re-imported field is equal.
    fn round_trip_field(field: Field) {
        let exported = FFI_ArrowSchema::try_from(&field).unwrap();
        let imported = Field::try_from(&exported).unwrap();
        assert_eq!(imported, field);
    }

    /// Exports `schema` to a C schema and asserts the re-imported schema is equal.
    fn round_trip_schema(schema: Schema) {
        let exported = FFI_ArrowSchema::try_from(&schema).unwrap();
        let imported = Schema::try_from(&exported).unwrap();
        assert_eq!(imported, schema);
    }

    #[test]
    fn test_type() {
        // Child field shared by the list-like cases below
        let int16_item = || Arc::new(Field::new("a", DataType::Int16, false));

        let cases = [
            DataType::Int64,
            DataType::UInt64,
            DataType::Float64,
            DataType::Date64,
            DataType::Time64(TimeUnit::Nanosecond),
            DataType::FixedSizeBinary(12),
            DataType::FixedSizeList(Arc::new(Field::new("a", DataType::Int64, false)), 5),
            DataType::Utf8,
            DataType::Utf8View,
            DataType::BinaryView,
            DataType::Binary,
            DataType::LargeBinary,
            DataType::List(int16_item()),
            DataType::ListView(int16_item()),
            DataType::LargeListView(int16_item()),
            DataType::Struct(Fields::from(vec![Field::new("a", DataType::Utf8, true)])),
            DataType::RunEndEncoded(
                Arc::new(Field::new("run_ends", DataType::Int32, false)),
                Arc::new(Field::new("values", DataType::Binary, true)),
            ),
        ];

        for dtype in cases {
            round_trip_type(dtype);
        }
    }

    #[test]
    fn test_field() {
        let children = Fields::from(vec![Field::new("a", DataType::Utf8, true)]);
        let field = Field::new("test", DataType::Struct(children), true);
        round_trip_field(field);
    }

    #[test]
    fn test_schema() {
        let metadata = HashMap::from([("hello".to_string(), "world".to_string())]);
        let fields = vec![
            Field::new("name", DataType::Utf8, false),
            Field::new("address", DataType::Utf8, false),
            Field::new("priority", DataType::UInt8, false),
        ];
        round_trip_schema(Schema::new(fields).with_metadata(metadata));

        // A struct data type can be imported as a schema
        let struct_type = DataType::Struct(Fields::from(vec![
            Field::new("a", DataType::Utf8, true),
            Field::new("b", DataType::Int16, false),
        ]));
        let exported = FFI_ArrowSchema::try_from(&struct_type).unwrap();
        let imported = Schema::try_from(&exported).unwrap();
        assert_eq!(imported.fields().len(), 2);

        // A non-struct data type must be rejected when imported as a schema
        let exported = FFI_ArrowSchema::try_from(&DataType::Float64).unwrap();
        assert!(Schema::try_from(&exported).is_err());
    }

    #[test]
    fn test_map_keys_sorted() {
        let entries = DataType::Struct(Fields::from(vec![
            Field::new("keys", DataType::Int32, false),
            Field::new("values", DataType::UInt32, false),
        ]));

        // Map data type over the entries struct, declared with sorted keys
        let sorted_map = DataType::Map(Arc::new(Field::new("entries", entries, false)), true);

        let exported = FFI_ArrowSchema::try_from(sorted_map).unwrap();
        assert!(exported.map_keys_sorted());
    }

    #[test]
    fn test_dictionary_ordered() {
        let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
        #[allow(deprecated)]
        let dict_field = Field::new_dict("dict", dict_type, false, 0, true);

        let exported = FFI_ArrowSchema::try_from(Schema::new(vec![dict_field])).unwrap();
        assert!(exported.child(0).dictionary_ordered());
    }

    #[test]
    fn test_set_field_metadata() {
        let metadata_cases: Vec<HashMap<String, String>> = vec![
            HashMap::new(),
            HashMap::from([("key".to_string(), "value".to_string())]),
            HashMap::from([
                ("key".to_string(), "".to_string()),
                ("ascii123".to_string(), "你好".to_string()),
                ("".to_string(), "value".to_string()),
            ]),
        ];

        let mut c_schema = FFI_ArrowSchema::try_new("b", vec![], None)
            .and_then(|s| s.with_name("test"))
            .unwrap();

        // Each case replaces the metadata set by the previous iteration
        for expected in metadata_cases {
            c_schema = c_schema.with_metadata(&expected).unwrap();
            let imported = Field::try_from(&c_schema).unwrap();
            assert_eq!(imported.metadata(), &expected);
        }
    }

    #[test]
    fn test_import_field_with_null_name() {
        // A schema built from a bare data type carries no name
        let exported = FFI_ArrowSchema::try_from(&DataType::Int16).unwrap();
        assert!(exported.name().is_none());

        let imported = Field::try_from(&exported).unwrap();
        assert_eq!(imported.name(), "");
    }
}