Skip to main content

arrow_schema/
ffi.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html).
19//!
20//! ```
21//! # use arrow_schema::{DataType, Field, Schema};
22//! # use arrow_schema::ffi::FFI_ArrowSchema;
23//!
24//! // Create from data type
25//! let ffi_data_type = FFI_ArrowSchema::try_from(&DataType::LargeUtf8).unwrap();
26//! let back = DataType::try_from(&ffi_data_type).unwrap();
27//! assert_eq!(back, DataType::LargeUtf8);
28//!
29//! // Create from schema
30//! let schema = Schema::new(vec![Field::new("foo", DataType::Int64, false)]);
31//! let ffi_schema = FFI_ArrowSchema::try_from(&schema).unwrap();
32//! let back = Schema::try_from(&ffi_schema).unwrap();
33//!
34//! assert_eq!(schema, back);
35//! ```
36
37use crate::{
38    ArrowError, DataType, Field, FieldRef, IntervalUnit, Schema, TimeUnit, UnionFields, UnionMode,
39};
40use bitflags::bitflags;
41use std::borrow::Cow;
42use std::sync::Arc;
43use std::{
44    collections::HashMap,
45    ffi::{CStr, CString, c_char, c_void},
46};
47
bitflags! {
    /// Flags for [`FFI_ArrowSchema`]
    ///
    /// Each constant occupies one bit of the `i64` stored in the `flags` field of
    /// [`FFI_ArrowSchema`] (see [`FFI_ArrowSchema::with_flags`]).
    ///
    /// Old Workaround at <https://github.com/bitflags/bitflags/issues/356>
    /// is no longer required as `bitflags` [fixed the issue](https://github.com/bitflags/bitflags/pull/355).
    pub struct Flags: i64 {
        /// Indicates that the dictionary is ordered (bit 0)
        const DICTIONARY_ORDERED = 0b00000001;
        /// Indicates that the field is nullable (bit 1)
        const NULLABLE = 0b00000010;
        /// Indicates that the map keys are sorted (bit 2)
        const MAP_KEYS_SORTED = 0b00000100;
    }
}
62
/// ABI-compatible struct for `ArrowSchema` from C Data Interface
/// See <https://arrow.apache.org/docs/format/CDataInterface.html#the-arrowschema-structure>
///
/// The struct is `#[repr(C)]`, so the order and types of the fields below are
/// part of the ABI and must not be changed.
///
/// ```
/// # use arrow_schema::DataType;
/// # use arrow_schema::ffi::FFI_ArrowSchema;
/// fn array_schema(data_type: &DataType) -> FFI_ArrowSchema {
///     FFI_ArrowSchema::try_from(data_type).unwrap()
/// }
/// ```
///
#[repr(C)]
#[derive(Debug)]
#[allow(non_camel_case_types)]
pub struct FFI_ArrowSchema {
    /// Null-terminated, UTF8-encoded string describing the data type
    pub format: *const c_char,
    /// Null-terminated, UTF8-encoded string of the field or array name
    /// (may be null, in which case [`FFI_ArrowSchema::name`] returns `None`)
    pub name: *const c_char,
    /// Binary string describing the type's metadata
    /// (may be null, in which case [`FFI_ArrowSchema::metadata`] returns an empty map)
    pub metadata: *const c_char,
    /// A bitfield of flags enriching the type description
    /// Refer to [Arrow Flags](https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.flags)
    pub flags: i64,
    /// The number of children this type has
    pub n_children: i64,
    /// C array of pointers to each child type of this type
    pub children: *mut *mut FFI_ArrowSchema,
    /// Pointer to the type of dictionary values (null when there is no dictionary)
    pub dictionary: *mut FFI_ArrowSchema,
    /// Pointer to a producer-provided release callback.
    /// `None` marks the struct as released (or never initialised); `Drop` only
    /// invokes the callback when it is `Some`.
    pub release: Option<unsafe extern "C" fn(arg1: *mut FFI_ArrowSchema)>,
    /// Opaque pointer to producer-provided private data
    pub private_data: *mut c_void,
}
98
// Producer-side bookkeeping stored behind `FFI_ArrowSchema::private_data` for
// schemas built by `FFI_ArrowSchema::try_new`. It is reclaimed by `release_schema`.
struct SchemaPrivateData {
    // Raw pointers obtained from `Box::into_raw` for each child schema;
    // `FFI_ArrowSchema::children` points into this allocation.
    children: Box<[*mut FFI_ArrowSchema]>,
    // Raw pointer obtained from `Box::into_raw` for the dictionary schema, or null.
    dictionary: *mut FFI_ArrowSchema,
    // Serialized metadata buffer that `FFI_ArrowSchema::metadata` points into, if any.
    metadata: Option<Vec<u8>>,
}
104
105// callback used to drop [FFI_ArrowSchema] when it is exported.
106unsafe extern "C" fn release_schema(schema: *mut FFI_ArrowSchema) {
107    if schema.is_null() {
108        return;
109    }
110    let schema = unsafe { &mut *schema };
111
112    // take ownership back to release it.
113    drop(unsafe { CString::from_raw(schema.format as *mut c_char) });
114    if !schema.name.is_null() {
115        drop(unsafe { CString::from_raw(schema.name as *mut c_char) });
116    }
117    if !schema.private_data.is_null() {
118        let private_data = unsafe { Box::from_raw(schema.private_data as *mut SchemaPrivateData) };
119        for child in private_data.children.iter() {
120            drop(unsafe { Box::from_raw(*child) })
121        }
122        if !private_data.dictionary.is_null() {
123            drop(unsafe { Box::from_raw(private_data.dictionary) });
124        }
125
126        drop(private_data);
127    }
128
129    schema.release = None;
130}
131
132impl FFI_ArrowSchema {
133    /// create a new [`FFI_ArrowSchema`]. This fails if the fields'
134    /// [`DataType`] is not supported.
135    pub fn try_new(
136        format: &str,
137        children: Vec<FFI_ArrowSchema>,
138        dictionary: Option<FFI_ArrowSchema>,
139    ) -> Result<Self, ArrowError> {
140        let mut this = Self::empty();
141
142        let children_ptr = children
143            .into_iter()
144            .map(Box::new)
145            .map(Box::into_raw)
146            .collect::<Box<_>>();
147
148        this.format = CString::new(format).unwrap().into_raw();
149        this.release = Some(release_schema);
150        this.n_children = children_ptr.len() as i64;
151
152        let dictionary_ptr = dictionary
153            .map(|d| Box::into_raw(Box::new(d)))
154            .unwrap_or(std::ptr::null_mut());
155
156        let mut private_data = Box::new(SchemaPrivateData {
157            children: children_ptr,
158            dictionary: dictionary_ptr,
159            metadata: None,
160        });
161
162        // intentionally set from private_data (see https://github.com/apache/arrow-rs/issues/580)
163        this.children = private_data.children.as_mut_ptr();
164
165        this.dictionary = dictionary_ptr;
166
167        this.private_data = Box::into_raw(private_data) as *mut c_void;
168
169        Ok(this)
170    }
171
172    /// Set the name of the schema
173    pub fn with_name(mut self, name: &str) -> Result<Self, ArrowError> {
174        self.name = CString::new(name).unwrap().into_raw();
175        Ok(self)
176    }
177
178    /// Set the flags of the schema
179    pub fn with_flags(mut self, flags: Flags) -> Result<Self, ArrowError> {
180        self.flags = flags.bits();
181        Ok(self)
182    }
183
184    /// Add metadata to the schema
185    pub fn with_metadata<I, S>(mut self, metadata: I) -> Result<Self, ArrowError>
186    where
187        I: IntoIterator<Item = (S, S)>,
188        S: AsRef<str>,
189    {
190        let metadata: Vec<(S, S)> = metadata.into_iter().collect();
191        // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata
192        let new_metadata = if !metadata.is_empty() {
193            let mut metadata_serialized: Vec<u8> = Vec::new();
194            let num_entries: i32 = metadata.len().try_into().map_err(|_| {
195                ArrowError::CDataInterface(format!(
196                    "metadata can only have {} entries, but {} were provided",
197                    i32::MAX,
198                    metadata.len()
199                ))
200            })?;
201            metadata_serialized.extend(num_entries.to_ne_bytes());
202
203            for (key, value) in metadata.into_iter() {
204                let key_len: i32 = key.as_ref().len().try_into().map_err(|_| {
205                    ArrowError::CDataInterface(format!(
206                        "metadata key can only have {} bytes, but {} were provided",
207                        i32::MAX,
208                        key.as_ref().len()
209                    ))
210                })?;
211                let value_len: i32 = value.as_ref().len().try_into().map_err(|_| {
212                    ArrowError::CDataInterface(format!(
213                        "metadata value can only have {} bytes, but {} were provided",
214                        i32::MAX,
215                        value.as_ref().len()
216                    ))
217                })?;
218
219                metadata_serialized.extend(key_len.to_ne_bytes());
220                metadata_serialized.extend_from_slice(key.as_ref().as_bytes());
221                metadata_serialized.extend(value_len.to_ne_bytes());
222                metadata_serialized.extend_from_slice(value.as_ref().as_bytes());
223            }
224
225            self.metadata = metadata_serialized.as_ptr() as *const c_char;
226            Some(metadata_serialized)
227        } else {
228            self.metadata = std::ptr::null_mut();
229            None
230        };
231
232        unsafe {
233            let mut private_data = Box::from_raw(self.private_data as *mut SchemaPrivateData);
234            private_data.metadata = new_metadata;
235            self.private_data = Box::into_raw(private_data) as *mut c_void;
236        }
237
238        Ok(self)
239    }
240
241    /// Takes ownership of the pointed to [`FFI_ArrowSchema`]
242    ///
243    /// This acts to [move] the data out of `schema`, setting the release callback to NULL
244    ///
245    /// # Safety
246    ///
247    /// * `schema` must be [valid] for reads and writes
248    /// * `schema` must be properly aligned
249    /// * `schema` must point to a properly initialized value of [`FFI_ArrowSchema`]
250    ///
251    /// [move]: https://arrow.apache.org/docs/format/CDataInterface.html#moving-an-array
252    /// [valid]: https://doc.rust-lang.org/std/ptr/index.html#safety
253    pub unsafe fn from_raw(schema: *mut FFI_ArrowSchema) -> Self {
254        unsafe { std::ptr::replace(schema, Self::empty()) }
255    }
256
257    /// Create an empty [`FFI_ArrowSchema`]
258    pub fn empty() -> Self {
259        Self {
260            format: std::ptr::null_mut(),
261            name: std::ptr::null_mut(),
262            metadata: std::ptr::null_mut(),
263            flags: 0,
264            n_children: 0,
265            children: std::ptr::null_mut(),
266            dictionary: std::ptr::null_mut(),
267            release: None,
268            private_data: std::ptr::null_mut(),
269        }
270    }
271
272    /// Returns the format of this schema.
273    pub fn format(&self) -> &str {
274        assert!(!self.format.is_null());
275        // safe because the lifetime of `self.format` equals `self`
276        unsafe { CStr::from_ptr(self.format) }
277            .to_str()
278            .expect("The external API has a non-utf8 as format")
279    }
280
281    /// Returns the name of this schema.
282    pub fn name(&self) -> Option<&str> {
283        if self.name.is_null() {
284            None
285        } else {
286            // safe because the lifetime of `self.name` equals `self`
287            Some(
288                unsafe { CStr::from_ptr(self.name) }
289                    .to_str()
290                    .expect("The external API has a non-utf8 as name"),
291            )
292        }
293    }
294
295    /// Returns the flags of this schema.
296    pub fn flags(&self) -> Option<Flags> {
297        Flags::from_bits(self.flags)
298    }
299
300    /// Returns the child of this schema at `index`.
301    ///
302    /// # Panics
303    ///
304    /// Panics if `index` is greater than or equal to the number of children.
305    ///
306    /// This is to make sure that the unsafe acces to raw pointer is sound.
307    pub fn child(&self, index: usize) -> &Self {
308        assert!(index < self.n_children as usize);
309        unsafe { self.children.add(index).as_ref().unwrap().as_ref().unwrap() }
310    }
311
312    /// Returns an iterator to the schema's children.
313    pub fn children(&self) -> impl Iterator<Item = &Self> {
314        (0..self.n_children as usize).map(move |i| self.child(i))
315    }
316
317    /// Returns if the field is semantically nullable,
318    /// regardless of whether it actually has null values.
319    pub fn nullable(&self) -> bool {
320        (self.flags / 2) & 1 == 1
321    }
322
323    /// Returns the reference to the underlying dictionary of the schema.
324    /// Check [ArrowSchema.dictionary](https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.dictionary).
325    ///
326    /// This must be `Some` if the schema represents a dictionary-encoded type, `None` otherwise.
327    pub fn dictionary(&self) -> Option<&Self> {
328        unsafe { self.dictionary.as_ref() }
329    }
330
331    /// For map types, returns whether the keys within each map value are sorted.
332    ///
333    /// Refer to [Arrow Flags](https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.flags)
334    pub fn map_keys_sorted(&self) -> bool {
335        self.flags & 0b00000100 != 0
336    }
337
338    /// For dictionary-encoded types, returns whether the ordering of dictionary indices is semantically meaningful.
339    pub fn dictionary_ordered(&self) -> bool {
340        self.flags & 0b00000001 != 0
341    }
342
343    /// Returns the metadata in the schema as `Key-Value` pairs
344    pub fn metadata(&self) -> Result<HashMap<String, String>, ArrowError> {
345        if self.metadata.is_null() {
346            Ok(HashMap::new())
347        } else {
348            let mut pos = 0;
349
350            // On some platforms, c_char = u8, and on some, c_char = i8. Where c_char = u8, clippy
351            // wants to complain that we're casting to the same type, but if we remove the cast,
352            // this will fail to compile on the other platforms. So we must allow it.
353            #[allow(clippy::unnecessary_cast)]
354            let buffer: *const u8 = self.metadata as *const u8;
355
356            fn next_four_bytes(buffer: *const u8, pos: &mut isize) -> [u8; 4] {
357                let out = unsafe {
358                    [
359                        *buffer.offset(*pos),
360                        *buffer.offset(*pos + 1),
361                        *buffer.offset(*pos + 2),
362                        *buffer.offset(*pos + 3),
363                    ]
364                };
365                *pos += 4;
366                out
367            }
368
369            fn next_n_bytes(buffer: *const u8, pos: &mut isize, n: i32) -> &[u8] {
370                let out = unsafe {
371                    std::slice::from_raw_parts(buffer.offset(*pos), n.try_into().unwrap())
372                };
373                *pos += isize::try_from(n).unwrap();
374                out
375            }
376
377            let num_entries = i32::from_ne_bytes(next_four_bytes(buffer, &mut pos));
378            if num_entries < 0 {
379                return Err(ArrowError::CDataInterface(
380                    "Negative number of metadata entries".to_string(),
381                ));
382            }
383
384            let mut metadata =
385                HashMap::with_capacity(num_entries.try_into().expect("Too many metadata entries"));
386
387            for _ in 0..num_entries {
388                let key_length = i32::from_ne_bytes(next_four_bytes(buffer, &mut pos));
389                if key_length < 0 {
390                    return Err(ArrowError::CDataInterface(
391                        "Negative key length in metadata".to_string(),
392                    ));
393                }
394                let key = String::from_utf8(next_n_bytes(buffer, &mut pos, key_length).to_vec())?;
395                let value_length = i32::from_ne_bytes(next_four_bytes(buffer, &mut pos));
396                if value_length < 0 {
397                    return Err(ArrowError::CDataInterface(
398                        "Negative value length in metadata".to_string(),
399                    ));
400                }
401                let value =
402                    String::from_utf8(next_n_bytes(buffer, &mut pos, value_length).to_vec())?;
403                metadata.insert(key, value);
404            }
405
406            Ok(metadata)
407        }
408    }
409}
410
411impl Drop for FFI_ArrowSchema {
412    fn drop(&mut self) {
413        match self.release {
414            None => (),
415            Some(release) => unsafe { release(self) },
416        };
417    }
418}
419
// Raw-pointer fields make the struct `!Send` by default, so it is opted in manually.
// NOTE(review): presumably sound because a schema exclusively owns whatever its
// pointers reference until it is released — confirm for schemas imported from
// foreign producers.
unsafe impl Send for FFI_ArrowSchema {}
421
422impl TryFrom<&FFI_ArrowSchema> for DataType {
423    type Error = ArrowError;
424
425    /// See [CDataInterface docs](https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings)
426    fn try_from(c_schema: &FFI_ArrowSchema) -> Result<Self, ArrowError> {
427        let mut dtype = match c_schema.format() {
428            "n" => DataType::Null,
429            "b" => DataType::Boolean,
430            "c" => DataType::Int8,
431            "C" => DataType::UInt8,
432            "s" => DataType::Int16,
433            "S" => DataType::UInt16,
434            "i" => DataType::Int32,
435            "I" => DataType::UInt32,
436            "l" => DataType::Int64,
437            "L" => DataType::UInt64,
438            "e" => DataType::Float16,
439            "f" => DataType::Float32,
440            "g" => DataType::Float64,
441            "vz" => DataType::BinaryView,
442            "z" => DataType::Binary,
443            "Z" => DataType::LargeBinary,
444            "vu" => DataType::Utf8View,
445            "u" => DataType::Utf8,
446            "U" => DataType::LargeUtf8,
447            "tdD" => DataType::Date32,
448            "tdm" => DataType::Date64,
449            "tts" => DataType::Time32(TimeUnit::Second),
450            "ttm" => DataType::Time32(TimeUnit::Millisecond),
451            "ttu" => DataType::Time64(TimeUnit::Microsecond),
452            "ttn" => DataType::Time64(TimeUnit::Nanosecond),
453            "tDs" => DataType::Duration(TimeUnit::Second),
454            "tDm" => DataType::Duration(TimeUnit::Millisecond),
455            "tDu" => DataType::Duration(TimeUnit::Microsecond),
456            "tDn" => DataType::Duration(TimeUnit::Nanosecond),
457            "tiM" => DataType::Interval(IntervalUnit::YearMonth),
458            "tiD" => DataType::Interval(IntervalUnit::DayTime),
459            "tin" => DataType::Interval(IntervalUnit::MonthDayNano),
460            "+l" => {
461                let c_child = c_schema.child(0);
462                DataType::List(Arc::new(Field::try_from(c_child)?))
463            }
464            "+L" => {
465                let c_child = c_schema.child(0);
466                DataType::LargeList(Arc::new(Field::try_from(c_child)?))
467            }
468            "+vl" => {
469                let c_child = c_schema.child(0);
470                DataType::ListView(Arc::new(Field::try_from(c_child)?))
471            }
472            "+vL" => {
473                let c_child = c_schema.child(0);
474                DataType::LargeListView(Arc::new(Field::try_from(c_child)?))
475            }
476            "+s" => {
477                let fields = c_schema.children().map(Field::try_from);
478                DataType::Struct(fields.collect::<Result<_, ArrowError>>()?)
479            }
480            "+m" => {
481                let c_child = c_schema.child(0);
482                let map_keys_sorted = c_schema.map_keys_sorted();
483                DataType::Map(Arc::new(Field::try_from(c_child)?), map_keys_sorted)
484            }
485            "+r" => {
486                let c_run_ends = c_schema.child(0);
487                let c_values = c_schema.child(1);
488                DataType::RunEndEncoded(
489                    Arc::new(Field::try_from(c_run_ends)?),
490                    Arc::new(Field::try_from(c_values)?),
491                )
492            }
493            // Parametrized types, requiring string parse
494            other => {
495                match other.splitn(2, ':').collect::<Vec<&str>>().as_slice() {
496                    // FixedSizeBinary type in format "w:num_bytes"
497                    ["w", num_bytes] => {
498                        let parsed_num_bytes = num_bytes.parse::<i32>().map_err(|_| {
499                            ArrowError::CDataInterface(
500                                "FixedSizeBinary requires an integer parameter representing number of bytes per element".to_string())
501                        })?;
502                        DataType::FixedSizeBinary(parsed_num_bytes)
503                    }
504                    // FixedSizeList type in format "+w:num_elems"
505                    ["+w", num_elems] => {
506                        let c_child = c_schema.child(0);
507                        let parsed_num_elems = num_elems.parse::<i32>().map_err(|_| {
508                            ArrowError::CDataInterface(
509                                "The FixedSizeList type requires an integer parameter representing number of elements per list".to_string())
510                        })?;
511                        DataType::FixedSizeList(
512                            Arc::new(Field::try_from(c_child)?),
513                            parsed_num_elems,
514                        )
515                    }
516                    // Decimal types in format "d:precision,scale" or "d:precision,scale,bitWidth"
517                    ["d", extra] => match extra.splitn(3, ',').collect::<Vec<&str>>().as_slice() {
518                        [precision, scale] => {
519                            let parsed_precision = precision.parse::<u8>().map_err(|_| {
520                                ArrowError::CDataInterface(
521                                    "The decimal type requires an integer precision".to_string(),
522                                )
523                            })?;
524                            let parsed_scale = scale.parse::<i8>().map_err(|_| {
525                                ArrowError::CDataInterface(
526                                    "The decimal type requires an integer scale".to_string(),
527                                )
528                            })?;
529                            DataType::Decimal128(parsed_precision, parsed_scale)
530                        }
531                        [precision, scale, bits] => {
532                            let parsed_precision = precision.parse::<u8>().map_err(|_| {
533                                ArrowError::CDataInterface(
534                                    "The decimal type requires an integer precision".to_string(),
535                                )
536                            })?;
537                            let parsed_scale = scale.parse::<i8>().map_err(|_| {
538                                ArrowError::CDataInterface(
539                                    "The decimal type requires an integer scale".to_string(),
540                                )
541                            })?;
542                            match *bits {
543                                    "32" => DataType::Decimal32(parsed_precision, parsed_scale),
544                                    "64" => DataType::Decimal64(parsed_precision, parsed_scale),
545                                    "128" => DataType::Decimal128(parsed_precision, parsed_scale),
546                                    "256" => DataType::Decimal256(parsed_precision, parsed_scale),
547                                    _ => return Err(ArrowError::CDataInterface("Only 32/64/128/256 bit wide decimals are supported in the Rust implementation".to_string())),
548                                }
549                        }
550                        _ => {
551                            return Err(ArrowError::CDataInterface(format!(
552                                "The decimal pattern \"d:{extra:?}\" is not supported in the Rust implementation"
553                            )));
554                        }
555                    },
556                    // DenseUnion
557                    ["+ud", extra] => {
558                        let type_ids = extra
559                            .split(',')
560                            .map(|t| {
561                                t.parse::<i8>().map_err(|_| {
562                                    ArrowError::CDataInterface(
563                                        "The Union type requires an integer type id".to_string(),
564                                    )
565                                })
566                            })
567                            .collect::<Result<Vec<_>, ArrowError>>()?;
568                        let mut fields = Vec::with_capacity(type_ids.len());
569                        for idx in 0..c_schema.n_children {
570                            let c_child = c_schema.child(idx as usize);
571                            let field = Field::try_from(c_child)?;
572                            fields.push(field);
573                        }
574
575                        if fields.len() != type_ids.len() {
576                            return Err(ArrowError::CDataInterface(
577                                "The Union type requires same number of fields and type ids"
578                                    .to_string(),
579                            ));
580                        }
581
582                        DataType::Union(UnionFields::try_new(type_ids, fields)?, UnionMode::Dense)
583                    }
584                    // SparseUnion
585                    ["+us", extra] => {
586                        let type_ids = extra
587                            .split(',')
588                            .map(|t| {
589                                t.parse::<i8>().map_err(|_| {
590                                    ArrowError::CDataInterface(
591                                        "The Union type requires an integer type id".to_string(),
592                                    )
593                                })
594                            })
595                            .collect::<Result<Vec<_>, ArrowError>>()?;
596                        let mut fields = Vec::with_capacity(type_ids.len());
597                        for idx in 0..c_schema.n_children {
598                            let c_child = c_schema.child(idx as usize);
599                            let field = Field::try_from(c_child)?;
600                            fields.push(field);
601                        }
602
603                        if fields.len() != type_ids.len() {
604                            return Err(ArrowError::CDataInterface(
605                                "The Union type requires same number of fields and type ids"
606                                    .to_string(),
607                            ));
608                        }
609
610                        DataType::Union(UnionFields::try_new(type_ids, fields)?, UnionMode::Sparse)
611                    }
612
613                    // Timestamps in format "tts:" and "tts:America/New_York" for no timezones and timezones resp.
614                    ["tss", ""] => DataType::Timestamp(TimeUnit::Second, None),
615                    ["tsm", ""] => DataType::Timestamp(TimeUnit::Millisecond, None),
616                    ["tsu", ""] => DataType::Timestamp(TimeUnit::Microsecond, None),
617                    ["tsn", ""] => DataType::Timestamp(TimeUnit::Nanosecond, None),
618                    ["tss", tz] => DataType::Timestamp(TimeUnit::Second, Some(Arc::from(*tz))),
619                    ["tsm", tz] => DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::from(*tz))),
620                    ["tsu", tz] => DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from(*tz))),
621                    ["tsn", tz] => DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from(*tz))),
622                    _ => {
623                        return Err(ArrowError::CDataInterface(format!(
624                            "The datatype \"{other:?}\" is still not supported in Rust implementation"
625                        )));
626                    }
627                }
628            }
629        };
630
631        if let Some(dict_schema) = c_schema.dictionary() {
632            let value_type = Self::try_from(dict_schema)?;
633            dtype = DataType::Dictionary(Box::new(dtype), Box::new(value_type));
634        }
635
636        Ok(dtype)
637    }
638}
639
640impl TryFrom<&FFI_ArrowSchema> for Field {
641    type Error = ArrowError;
642
643    fn try_from(c_schema: &FFI_ArrowSchema) -> Result<Self, ArrowError> {
644        let dtype = DataType::try_from(c_schema)?;
645        let mut field = Field::new(c_schema.name().unwrap_or(""), dtype, c_schema.nullable());
646        field.set_metadata(c_schema.metadata()?);
647        Ok(field)
648    }
649}
650
651impl TryFrom<&FFI_ArrowSchema> for Schema {
652    type Error = ArrowError;
653
654    fn try_from(c_schema: &FFI_ArrowSchema) -> Result<Self, ArrowError> {
655        // interpret it as a struct type then extract its fields
656        let dtype = DataType::try_from(c_schema)?;
657        if let DataType::Struct(fields) = dtype {
658            Ok(Schema::new(fields).with_metadata(c_schema.metadata()?))
659        } else {
660            Err(ArrowError::CDataInterface(
661                "Unable to interpret C data struct as a Schema".to_string(),
662            ))
663        }
664    }
665}
666
667impl TryFrom<&DataType> for FFI_ArrowSchema {
668    type Error = ArrowError;
669
670    /// See [CDataInterface docs](https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings)
671    fn try_from(dtype: &DataType) -> Result<Self, ArrowError> {
672        let format = get_format_string(dtype)?;
673        // allocate and hold the children
674        let children = match dtype {
675            DataType::List(child)
676            | DataType::LargeList(child)
677            | DataType::ListView(child)
678            | DataType::LargeListView(child)
679            | DataType::FixedSizeList(child, _)
680            | DataType::Map(child, _) => {
681                vec![FFI_ArrowSchema::try_from(child.as_ref())?]
682            }
683            DataType::Union(fields, _) => fields
684                .iter()
685                .map(|(_, f)| f.as_ref().try_into())
686                .collect::<Result<Vec<_>, ArrowError>>()?,
687            DataType::Struct(fields) => fields
688                .iter()
689                .map(FFI_ArrowSchema::try_from)
690                .collect::<Result<Vec<_>, ArrowError>>()?,
691            DataType::RunEndEncoded(run_ends, values) => vec![
692                FFI_ArrowSchema::try_from(run_ends.as_ref())?,
693                FFI_ArrowSchema::try_from(values.as_ref())?,
694            ],
695            _ => vec![],
696        };
697        let dictionary = if let DataType::Dictionary(_, value_data_type) = dtype {
698            Some(Self::try_from(value_data_type.as_ref())?)
699        } else {
700            None
701        };
702
703        let flags = match dtype {
704            DataType::Map(_, true) => Flags::MAP_KEYS_SORTED,
705            _ => Flags::empty(),
706        };
707
708        FFI_ArrowSchema::try_new(&format, children, dictionary)?.with_flags(flags)
709    }
710}
711
712fn get_format_string(dtype: &DataType) -> Result<Cow<'static, str>, ArrowError> {
713    match dtype {
714        DataType::Null => Ok("n".into()),
715        DataType::Boolean => Ok("b".into()),
716        DataType::Int8 => Ok("c".into()),
717        DataType::UInt8 => Ok("C".into()),
718        DataType::Int16 => Ok("s".into()),
719        DataType::UInt16 => Ok("S".into()),
720        DataType::Int32 => Ok("i".into()),
721        DataType::UInt32 => Ok("I".into()),
722        DataType::Int64 => Ok("l".into()),
723        DataType::UInt64 => Ok("L".into()),
724        DataType::Float16 => Ok("e".into()),
725        DataType::Float32 => Ok("f".into()),
726        DataType::Float64 => Ok("g".into()),
727        DataType::BinaryView => Ok("vz".into()),
728        DataType::Binary => Ok("z".into()),
729        DataType::LargeBinary => Ok("Z".into()),
730        DataType::Utf8View => Ok("vu".into()),
731        DataType::Utf8 => Ok("u".into()),
732        DataType::LargeUtf8 => Ok("U".into()),
733        DataType::FixedSizeBinary(num_bytes) => Ok(Cow::Owned(format!("w:{num_bytes}"))),
734        DataType::FixedSizeList(_, num_elems) => Ok(Cow::Owned(format!("+w:{num_elems}"))),
735        DataType::Decimal32(precision, scale) => {
736            Ok(Cow::Owned(format!("d:{precision},{scale},32")))
737        }
738        DataType::Decimal64(precision, scale) => {
739            Ok(Cow::Owned(format!("d:{precision},{scale},64")))
740        }
741        DataType::Decimal128(precision, scale) => Ok(Cow::Owned(format!("d:{precision},{scale}"))),
742        DataType::Decimal256(precision, scale) => {
743            Ok(Cow::Owned(format!("d:{precision},{scale},256")))
744        }
745        DataType::Date32 => Ok("tdD".into()),
746        DataType::Date64 => Ok("tdm".into()),
747        DataType::Time32(TimeUnit::Second) => Ok("tts".into()),
748        DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".into()),
749        DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".into()),
750        DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".into()),
751        DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".into()),
752        DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".into()),
753        DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".into()),
754        DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".into()),
755        DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(Cow::Owned(format!("tss:{tz}"))),
756        DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(Cow::Owned(format!("tsm:{tz}"))),
757        DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(Cow::Owned(format!("tsu:{tz}"))),
758        DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(Cow::Owned(format!("tsn:{tz}"))),
759        DataType::Duration(TimeUnit::Second) => Ok("tDs".into()),
760        DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".into()),
761        DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".into()),
762        DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".into()),
763        DataType::Interval(IntervalUnit::YearMonth) => Ok("tiM".into()),
764        DataType::Interval(IntervalUnit::DayTime) => Ok("tiD".into()),
765        DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".into()),
766        DataType::List(_) => Ok("+l".into()),
767        DataType::LargeList(_) => Ok("+L".into()),
768        DataType::ListView(_) => Ok("+vl".into()),
769        DataType::LargeListView(_) => Ok("+vL".into()),
770        DataType::Struct(_) => Ok("+s".into()),
771        DataType::Map(_, _) => Ok("+m".into()),
772        DataType::RunEndEncoded(_, _) => Ok("+r".into()),
773        DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type),
774        DataType::Union(fields, mode) => {
775            let formats = fields
776                .iter()
777                .map(|(t, _)| t.to_string())
778                .collect::<Vec<_>>();
779            match mode {
780                UnionMode::Dense => Ok(Cow::Owned(format!("{}:{}", "+ud", formats.join(",")))),
781                UnionMode::Sparse => Ok(Cow::Owned(format!("{}:{}", "+us", formats.join(",")))),
782            }
783        }
784        other => Err(ArrowError::CDataInterface(format!(
785            "The datatype \"{other:?}\" is still not supported in Rust implementation"
786        ))),
787    }
788}
789
790impl TryFrom<&FieldRef> for FFI_ArrowSchema {
791    type Error = ArrowError;
792
793    fn try_from(value: &FieldRef) -> Result<Self, Self::Error> {
794        value.as_ref().try_into()
795    }
796}
797
798impl TryFrom<&Field> for FFI_ArrowSchema {
799    type Error = ArrowError;
800
801    fn try_from(field: &Field) -> Result<Self, ArrowError> {
802        let mut flags = if field.is_nullable() {
803            Flags::NULLABLE
804        } else {
805            Flags::empty()
806        };
807
808        if let Some(true) = field.dict_is_ordered() {
809            flags |= Flags::DICTIONARY_ORDERED;
810        }
811
812        FFI_ArrowSchema::try_from(field.data_type())?
813            .with_name(field.name())?
814            .with_flags(flags)?
815            .with_metadata(field.metadata())
816    }
817}
818
819impl TryFrom<&Schema> for FFI_ArrowSchema {
820    type Error = ArrowError;
821
822    fn try_from(schema: &Schema) -> Result<Self, ArrowError> {
823        let dtype = DataType::Struct(schema.fields().clone());
824        let c_schema = FFI_ArrowSchema::try_from(&dtype)?.with_metadata(&schema.metadata)?;
825        Ok(c_schema)
826    }
827}
828
829impl TryFrom<DataType> for FFI_ArrowSchema {
830    type Error = ArrowError;
831
832    fn try_from(dtype: DataType) -> Result<Self, ArrowError> {
833        FFI_ArrowSchema::try_from(&dtype)
834    }
835}
836
837impl TryFrom<Field> for FFI_ArrowSchema {
838    type Error = ArrowError;
839
840    fn try_from(field: Field) -> Result<Self, ArrowError> {
841        FFI_ArrowSchema::try_from(&field)
842    }
843}
844
845impl TryFrom<Schema> for FFI_ArrowSchema {
846    type Error = ArrowError;
847
848    fn try_from(schema: Schema) -> Result<Self, ArrowError> {
849        FFI_ArrowSchema::try_from(&schema)
850    }
851}
852
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Fields;

    /// Exports `dtype` to a C schema and asserts that importing it gives the same type back.
    fn round_trip_type(dtype: DataType) {
        let exported = FFI_ArrowSchema::try_from(&dtype).unwrap();
        let imported = DataType::try_from(&exported).unwrap();
        assert_eq!(imported, dtype);
    }

    /// Exports `field` to a C schema and asserts that importing it gives the same field back.
    fn round_trip_field(field: Field) {
        let exported = FFI_ArrowSchema::try_from(&field).unwrap();
        let imported = Field::try_from(&exported).unwrap();
        assert_eq!(imported, field);
    }

    /// Exports `schema` to a C schema and asserts that importing it gives the same schema back.
    fn round_trip_schema(schema: Schema) {
        let exported = FFI_ArrowSchema::try_from(&schema).unwrap();
        let imported = Schema::try_from(&exported).unwrap();
        assert_eq!(imported, schema);
    }

    #[test]
    fn test_type() {
        // Element field shared by the list-like cases.
        let int16_item = || Arc::new(Field::new("a", DataType::Int16, false));

        let cases = [
            DataType::Int64,
            DataType::UInt64,
            DataType::Float64,
            DataType::Date64,
            DataType::Time64(TimeUnit::Nanosecond),
            DataType::FixedSizeBinary(12),
            DataType::FixedSizeList(Arc::new(Field::new("a", DataType::Int64, false)), 5),
            DataType::Utf8,
            DataType::Utf8View,
            DataType::BinaryView,
            DataType::Binary,
            DataType::LargeBinary,
            DataType::List(int16_item()),
            DataType::ListView(int16_item()),
            DataType::LargeListView(int16_item()),
            DataType::Struct(Fields::from(vec![Field::new("a", DataType::Utf8, true)])),
            DataType::RunEndEncoded(
                Arc::new(Field::new("run_ends", DataType::Int32, false)),
                Arc::new(Field::new("values", DataType::Binary, true)),
            ),
        ];

        for dtype in cases {
            round_trip_type(dtype);
        }
    }

    #[test]
    fn test_field() {
        let children = Fields::from(vec![Field::new("a", DataType::Utf8, true)]);
        round_trip_field(Field::new("test", DataType::Struct(children), true));
    }

    #[test]
    fn test_schema() {
        let fields = vec![
            Field::new("name", DataType::Utf8, false),
            Field::new("address", DataType::Utf8, false),
            Field::new("priority", DataType::UInt8, false),
        ];
        let metadata = HashMap::from([("hello".to_string(), "world".to_string())]);
        round_trip_schema(Schema::new(fields).with_metadata(metadata));

        // A struct data type can be imported as a schema.
        let struct_type = DataType::Struct(Fields::from(vec![
            Field::new("a", DataType::Utf8, true),
            Field::new("b", DataType::Int16, false),
        ]));
        let exported_struct = FFI_ArrowSchema::try_from(&struct_type).unwrap();
        let imported = Schema::try_from(&exported_struct).unwrap();
        assert_eq!(imported.fields().len(), 2);

        // A non-struct data type is rejected when imported as a schema.
        let exported_float = FFI_ArrowSchema::try_from(&DataType::Float64).unwrap();
        assert!(Schema::try_from(&exported_float).is_err());
    }

    #[test]
    fn test_map_keys_sorted() {
        let entries = DataType::Struct(
            vec![
                Field::new("keys", DataType::Int32, false),
                Field::new("values", DataType::UInt32, false),
            ]
            .into(),
        );

        // Map data type over the entries struct, with keys declared sorted.
        let sorted_map = DataType::Map(Arc::new(Field::new("entries", entries, false)), true);

        let exported = FFI_ArrowSchema::try_from(sorted_map).unwrap();
        assert!(exported.map_keys_sorted());
    }

    #[test]
    fn test_dictionary_ordered() {
        let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));

        #[allow(deprecated)]
        let dict_field = Field::new_dict("dict", dict_type, false, 0, true);

        let exported = FFI_ArrowSchema::try_from(Schema::new(vec![dict_field])).unwrap();
        assert!(exported.child(0).dictionary_ordered());
    }

    #[test]
    fn test_set_field_metadata() {
        let metadata_cases: Vec<HashMap<String, String>> = vec![
            HashMap::new(),
            HashMap::from([("key".to_string(), "value".to_string())]),
            HashMap::from([
                ("key".to_string(), "".to_string()),
                ("ascii123".to_string(), "你好".to_string()),
                ("".to_string(), "value".to_string()),
            ]),
        ];

        let mut c_schema = FFI_ArrowSchema::try_new("b", Vec::new(), None)
            .unwrap()
            .with_name("test")
            .unwrap();

        // The same C schema is reused; each case sets metadata on it again.
        for expected in &metadata_cases {
            c_schema = c_schema.with_metadata(expected).unwrap();
            let imported = Field::try_from(&c_schema).unwrap();
            assert_eq!(imported.metadata(), expected);
        }
    }

    #[test]
    fn test_import_field_with_null_name() {
        // A schema exported from a bare data type has no name set.
        let unnamed = FFI_ArrowSchema::try_from(&DataType::Int16).unwrap();
        assert!(unnamed.name().is_none());

        let imported = Field::try_from(&unnamed).unwrap();
        assert_eq!(imported.name(), "");
    }
}