arrow_schema/extension/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Extension types.
19//!
20//! <div class="warning">This module is experimental. There might be breaking changes between minor releases.</div>
21
22#[cfg(feature = "canonical_extension_types")]
23mod canonical;
24#[cfg(feature = "canonical_extension_types")]
25pub use canonical::*;
26
27use crate::{ArrowError, DataType};
28
29/// The metadata key for the string name ([`ExtensionType::NAME`]) identifying an [`ExtensionType`].
30pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name";
31
32/// The metadata key for a serialized representation of the [`ExtensionType`]
33/// necessary to reconstruct the custom type (see [`ExtensionType::serialize_metadata`]).
34pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata";
35
36/// Extension types.
37///
38/// User-defined “extension” types can be defined by setting certain key-value
39/// pairs in the [`Field`] metadata structure. These extension keys are:
40/// - [`EXTENSION_TYPE_NAME_KEY`]
41/// - [`EXTENSION_TYPE_METADATA_KEY`]
42///
43/// Support for canonical extension types in this crate requires the
44/// `canonical_extension_types` feature.
45///
46/// Extension types may or may not use the [`EXTENSION_TYPE_METADATA_KEY`]
47/// field.
48///
49/// # Example
50///
51/// The example below demonstrates how to implement this trait for a `Uuid`
52/// type. Note this is not the canonical extension type for `Uuid`, which does
53/// not include information about the `Uuid` version.
54///
55/// ```
56/// # use arrow_schema::ArrowError;
57/// # fn main() -> Result<(), ArrowError> {
58/// use arrow_schema::{DataType, extension::ExtensionType, Field};
59/// use std::{fmt, str::FromStr};
60///
61/// /// The different Uuid versions.
62/// #[derive(Clone, Copy, Debug, PartialEq)]
63/// enum UuidVersion {
64///     V1,
65///     V2,
66///     V3,
67///     V4,
68///     V5,
69///     V6,
70///     V7,
71///     V8,
72/// }
73///
74/// // We'll use `Display` to serialize.
75/// impl fmt::Display for UuidVersion {
76///     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
77///         write!(
78///             f,
79///             "{}",
80///             match self {
81///                 Self::V1 => "V1",
82///                 Self::V2 => "V2",
83///                 Self::V3 => "V3",
84///                 Self::V4 => "V4",
85///                 Self::V5 => "V5",
86///                 Self::V6 => "V6",
87///                 Self::V7 => "V7",
88///                 Self::V8 => "V8",
89///             }
90///         )
91///     }
92/// }
93///
94/// // And `FromStr` to deserialize.
95/// impl FromStr for UuidVersion {
96///     type Err = ArrowError;
97///
98///     fn from_str(s: &str) -> Result<Self, Self::Err> {
99///         match s {
100///             "V1" => Ok(Self::V1),
101///             "V2" => Ok(Self::V2),
102///             "V3" => Ok(Self::V3),
103///             "V4" => Ok(Self::V4),
104///             "V5" => Ok(Self::V5),
105///             "V6" => Ok(Self::V6),
106///             "V7" => Ok(Self::V7),
107///             "V8" => Ok(Self::V8),
108///             _ => Err(ArrowError::ParseError("Invalid UuidVersion".to_owned())),
109///         }
110///     }
111/// }
112///
113/// /// This is the extension type, not the container for Uuid values. It
114/// /// stores the Uuid version (this is the metadata of this extension type).
115/// #[derive(Clone, Copy, Debug, PartialEq)]
116/// struct Uuid(UuidVersion);
117///
118/// impl ExtensionType for Uuid {
119///     // We use a namespace as suggested by the specification.
120///     const NAME: &'static str = "myorg.example.uuid";
121///
122///     // The metadata type is the Uuid version.
123///     type Metadata = UuidVersion;
124///
125///     // We just return a reference to the Uuid version.
126///     fn metadata(&self) -> &Self::Metadata {
127///         &self.0
128///     }
129///
130///     // We use the `Display` implementation to serialize the Uuid
131///     // version.
132///     fn serialize_metadata(&self) -> Option<String> {
133///         Some(self.0.to_string())
134///     }
135///
136///     // We use the `FromStr` implementation to deserialize the Uuid
137///     // version.
138///     fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
139///         metadata.map_or_else(
140///             || {
141///                 Err(ArrowError::InvalidArgumentError(
142///                     "Uuid extension type metadata missing".to_owned(),
143///                 ))
144///             },
145///             str::parse,
146///         )
147///     }
148///
149///     // The only supported data type is `FixedSizeBinary(16)`.
150///     fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
151///         match data_type {
152///             DataType::FixedSizeBinary(16) => Ok(()),
153///             data_type => Err(ArrowError::InvalidArgumentError(format!(
154///                 "Uuid data type mismatch, expected FixedSizeBinary(16), found {data_type}"
155///             ))),
156///         }
157///     }
158///
159///     // We should always check if the data type is supported before
160///     // constructing the extension type.
161///     fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> {
162///         let uuid = Self(metadata);
163///         uuid.supports_data_type(data_type)?;
164///         Ok(uuid)
165///     }
166/// }
167///
168/// // We can now construct the extension type.
169/// let uuid_v1 = Uuid(UuidVersion::V1);
170///
171/// // And add it to a field.
172/// let mut field =
173///     Field::new("", DataType::FixedSizeBinary(16), false).with_extension_type(uuid_v1);
174///
175/// // And extract it from this field.
176/// assert_eq!(field.try_extension_type::<Uuid>()?, uuid_v1);
177///
178/// // When we try to add this to a field with an unsupported data type we
179/// // get an error.
180/// let result = Field::new("", DataType::Null, false).try_with_extension_type(uuid_v1);
181/// assert!(result.is_err());
182/// # Ok(()) }
183/// ```
184///
185/// <https://arrow.apache.org/docs/format/Columnar.html#extension-types>
186///
187/// [`Field`]: crate::Field
188pub trait ExtensionType: Sized {
189    /// The name identifying this extension type.
190    ///
191    /// This is the string value that is used for the
192    /// [`EXTENSION_TYPE_NAME_KEY`] in the [`Field::metadata`] of a [`Field`]
193    /// to identify this extension type.
194    ///
195    /// We recommend that you use a “namespace”-style prefix for extension
196    /// type names to minimize the possibility of conflicts with multiple Arrow
197    /// readers and writers in the same application. For example, use
198    /// `myorg.name_of_type` instead of simply `name_of_type`.
199    ///
200    /// Extension names beginning with `arrow.` are reserved for canonical
201    /// extension types; they should not be used for third-party extension
202    /// types.
203    ///
204    /// Extension names are case-sensitive.
205    ///
206    /// [`Field`]: crate::Field
207    /// [`Field::metadata`]: crate::Field::metadata
208    const NAME: &'static str;
209
210    /// The metadata type of this extension type.
211    ///
212    /// Implementations can use strongly or loosely typed data structures here
213    /// depending on the complexity of the metadata.
214    ///
215    /// Implementations can also use `Self` here if the extension type can be
216    /// constructed directly from its metadata.
217    ///
218    /// If an extension type defines no metadata it should use `()` to indicate
219    /// this.
220    type Metadata;
221
222    /// Returns a reference to the metadata of this extension type, or `&()` if
223    /// this extension type defines no metadata (`Self::Metadata=()`).
224    fn metadata(&self) -> &Self::Metadata;
225
226    /// Returns the serialized representation of the metadata of this extension
227    /// type, or `None` if this extension type defines no metadata
228    /// (`Self::Metadata=()`).
229    ///
230    /// This is the string value that is used for the
231    /// [`EXTENSION_TYPE_METADATA_KEY`] in the [`Field::metadata`] of a
232    /// [`Field`].
233    ///
234    /// [`Field`]: crate::Field
235    /// [`Field::metadata`]: crate::Field::metadata
236    fn serialize_metadata(&self) -> Option<String>;
237
238    /// Deserialize the metadata of this extension type from the serialized
239    /// representation of the metadata. An extension type that defines no
240    /// metadata should expect `None` for the serialized metadata and return
241    /// `Ok(())`.
242    ///
243    /// This function should return an error when
244    /// - expected metadata is missing (for extension types with non-optional
245    ///   metadata)
246    /// - unexpected metadata is set (for extension types without metadata)
247    /// - deserialization of metadata fails
248    fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError>;
249
250    /// Returns `Ok(())` iff the given data type is supported by this extension
251    /// type.
252    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError>;
253
254    /// Construct this extension type for a field with the given data type and
255    /// metadata.
256    ///
257    /// This should return an error if the given data type is not supported by
258    /// this extension type.
259    fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError>;
260}