Skip to main content

arrow_schema/extension/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Extension types.
19
20#[cfg(feature = "canonical_extension_types")]
21mod canonical;
22#[cfg(feature = "canonical_extension_types")]
23pub use canonical::*;
24
25use crate::{ArrowError, DataType};
26use std::collections::HashMap;
27
/// The metadata key for the string name identifying an [`ExtensionType`].
///
/// The value stored under this key in a field's metadata is compared against
/// [`ExtensionType::NAME`] when an extension type is reconstructed with
/// [`ExtensionType::try_new_from_field_metadata`].
pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name";
30
/// The metadata key for a serialized representation of the [`ExtensionType`]
/// necessary to reconstruct the custom type.
///
/// This key is optional: extension types that define no metadata do not set
/// it. When present, its value is passed to
/// [`ExtensionType::deserialize_metadata`].
pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata";
34
35/// Extension types.
36///
37/// User-defined “extension” types can be defined setting certain key value
38/// pairs in the [`Field`] metadata structure. These extension keys are:
39/// - [`EXTENSION_TYPE_NAME_KEY`]
40/// - [`EXTENSION_TYPE_METADATA_KEY`]
41///
42/// Canonical extension types support in this crate requires the
43/// `canonical_extension_types` feature.
44///
45/// Extension types may or may not use the [`EXTENSION_TYPE_METADATA_KEY`]
46/// field.
47///
48/// # Example
49///
50/// The example below demonstrates how to implement this trait for a `Uuid`
51/// type. Note this is not the canonical extension type for `Uuid`, which does
52/// not include information about the `Uuid` version.
53///
54/// ```
55/// # use arrow_schema::ArrowError;
56/// # fn main() -> Result<(), ArrowError> {
57/// use arrow_schema::{DataType, extension::ExtensionType, Field};
58/// use std::{fmt, str::FromStr};
59///
60/// /// The different Uuid versions.
61/// #[derive(Clone, Copy, Debug, PartialEq)]
62/// enum UuidVersion {
63///     V1,
64///     V2,
65///     V3,
66///     V4,
67///     V5,
68///     V6,
69///     V7,
70///     V8,
71/// }
72///
73/// // We'll use `Display` to serialize.
74/// impl fmt::Display for UuidVersion {
75///     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
76///         write!(
77///             f,
78///             "{}",
79///             match self {
80///                 Self::V1 => "V1",
81///                 Self::V2 => "V2",
82///                 Self::V3 => "V3",
83///                 Self::V4 => "V4",
84///                 Self::V5 => "V5",
85///                 Self::V6 => "V6",
86///                 Self::V7 => "V7",
87///                 Self::V8 => "V8",
88///             }
89///         )
90///     }
91/// }
92///
93/// // And `FromStr` to deserialize.
94/// impl FromStr for UuidVersion {
95///     type Err = ArrowError;
96///
97///     fn from_str(s: &str) -> Result<Self, Self::Err> {
98///         match s {
99///             "V1" => Ok(Self::V1),
100///             "V2" => Ok(Self::V2),
101///             "V3" => Ok(Self::V3),
102///             "V4" => Ok(Self::V4),
103///             "V5" => Ok(Self::V5),
104///             "V6" => Ok(Self::V6),
105///             "V7" => Ok(Self::V7),
106///             "V8" => Ok(Self::V8),
107///             _ => Err(ArrowError::ParseError("Invalid UuidVersion".to_owned())),
108///         }
109///     }
110/// }
111///
112/// /// This is the extension type, not the container for Uuid values. It
113/// /// stores the Uuid version (this is the metadata of this extension type).
114/// #[derive(Clone, Copy, Debug, PartialEq)]
115/// struct Uuid(UuidVersion);
116///
117/// impl ExtensionType for Uuid {
118///     // We use a namespace as suggested by the specification.
119///     const NAME: &'static str = "myorg.example.uuid";
120///
121///     // The metadata type is the Uuid version.
122///     type Metadata = UuidVersion;
123///
124///     // We just return a reference to the Uuid version.
125///     fn metadata(&self) -> &Self::Metadata {
126///         &self.0
127///     }
128///
129///     // We use the `Display` implementation to serialize the Uuid
130///     // version.
131///     fn serialize_metadata(&self) -> Option<String> {
132///         Some(self.0.to_string())
133///     }
134///
135///     // We use the `FromStr` implementation to deserialize the Uuid
136///     // version.
137///     fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
138///         metadata.map_or_else(
139///             || {
140///                 Err(ArrowError::InvalidArgumentError(
141///                     "Uuid extension type metadata missing".to_owned(),
142///                 ))
143///             },
144///             str::parse,
145///         )
146///     }
147///
148///     // The only supported data type is `FixedSizeBinary(16)`.
149///     fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
150///         match data_type {
151///             DataType::FixedSizeBinary(16) => Ok(()),
152///             data_type => Err(ArrowError::InvalidArgumentError(format!(
153///                 "Uuid data type mismatch, expected FixedSizeBinary(16), found {data_type}"
154///             ))),
155///         }
156///     }
157///
158///     // We should always check if the data type is supported before
159///     // constructing the extension type.
160///     fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> {
161///         let uuid = Self(metadata);
162///         uuid.supports_data_type(data_type)?;
163///         Ok(uuid)
164///     }
165/// }
166///
167/// // We can now construct the extension type.
168/// let uuid_v1 = Uuid(UuidVersion::V1);
169///
170/// // And add it to a field.
171/// let mut field =
172///     Field::new("", DataType::FixedSizeBinary(16), false).with_extension_type(uuid_v1);
173///
174/// // And extract it from this field.
175/// assert_eq!(field.try_extension_type::<Uuid>()?, uuid_v1);
176///
177/// // When we try to add this to a field with an unsupported data type we
178/// // get an error.
179/// let result = Field::new("", DataType::Null, false).try_with_extension_type(uuid_v1);
180/// assert!(result.is_err());
181/// # Ok(()) }
182/// ```
183///
184/// <https://arrow.apache.org/docs/format/Columnar.html#extension-types>
185///
186/// [`Field`]: crate::Field
187pub trait ExtensionType: Sized {
188    /// The name identifying this extension type.
189    ///
190    /// This is the string value that is used for the
191    /// [`EXTENSION_TYPE_NAME_KEY`] in the [`Field::metadata`] of a [`Field`]
192    /// to identify this extension type.
193    ///
194    /// We recommend that you use a “namespace”-style prefix for extension
195    /// type names to minimize the possibility of conflicts with multiple Arrow
196    /// readers and writers in the same application. For example, use
197    /// `myorg.name_of_type` instead of simply `name_of_type`.
198    ///
199    /// Extension names beginning with `arrow.` are reserved for canonical
200    /// extension types, they should not be used for third-party extension
201    /// types.
202    ///
203    /// Extension names are case-sensitive.
204    ///
205    /// [`Field`]: crate::Field
206    /// [`Field::metadata`]: crate::Field::metadata
207    const NAME: &'static str;
208
209    /// The metadata type of this extension type.
210    ///
211    /// Implementations can use strongly or loosly typed data structures here
212    /// depending on the complexity of the metadata.
213    ///
214    /// Implementations can also use `Self` here if the extension type can be
215    /// constructed directly from its metadata.
216    ///
217    /// If an extension type defines no metadata it should use `()` to indicate
218    /// this.
219    type Metadata;
220
221    /// Returns a reference to the metadata of this extension type, or `&()` if
222    /// if this extension type defines no metadata (`Self::Metadata=()`).
223    fn metadata(&self) -> &Self::Metadata;
224
225    /// Returns the serialized representation of the metadata of this extension
226    /// type, or `None` if this extension type defines no metadata
227    /// (`Self::Metadata=()`).
228    ///
229    /// This is string value that is used for the
230    /// [`EXTENSION_TYPE_METADATA_KEY`] in the [`Field::metadata`] of a
231    /// [`Field`].
232    ///
233    /// [`Field`]: crate::Field
234    /// [`Field::metadata`]: crate::Field::metadata
235    fn serialize_metadata(&self) -> Option<String>;
236
237    /// Deserialize the metadata of this extension type from the serialized
238    /// representation of the metadata. An extension type that defines no
239    /// metadata should expect `None` for the serialized metadata and return
240    /// `Ok(())`.
241    ///
242    /// This function should return an error when
243    /// - expected metadata is missing (for extensions types with non-optional
244    ///   metadata)
245    /// - unexpected metadata is set (for extension types without metadata)
246    /// - deserialization of metadata fails
247    fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError>;
248
249    /// Returns `Ok(())` iff the given data type is supported by this extension
250    /// type.
251    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError>;
252
253    /// Construct this extension type for a field with the given data type and
254    /// metadata.
255    ///
256    /// This should return an error if the given data type is not supported by
257    /// this extension type.
258    fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError>;
259
260    /// Construct this extension type from field metadata and data type.
261    ///
262    /// This is a provided method that extracts extension type information from
263    /// metadata (using [`EXTENSION_TYPE_NAME_KEY`] and
264    /// [`EXTENSION_TYPE_METADATA_KEY`]) and delegates to [`Self::try_new`].
265    ///
266    /// Returns an error if:
267    /// - The extension type name is missing or doesn't match [`Self::NAME`]
268    /// - Metadata deserialization fails
269    /// - The data type is not supported
270    ///
271    /// This method enables extension type checking without requiring a full
272    /// [`Field`] instance, useful when only metadata and data type are available.
273    ///
274    /// [`Field`]: crate::Field
275    fn try_new_from_field_metadata(
276        data_type: &DataType,
277        metadata: &HashMap<String, String>,
278    ) -> Result<Self, ArrowError> {
279        // Check the extension name in the metadata
280        match metadata.get(EXTENSION_TYPE_NAME_KEY).map(|s| s.as_str()) {
281            // It should match the name of the given extension type
282            Some(name) if name == Self::NAME => {
283                // Deserialize the metadata and try to construct the extension type
284                let ext_metadata = metadata
285                    .get(EXTENSION_TYPE_METADATA_KEY)
286                    .map(|s| s.as_str());
287                let parsed = Self::deserialize_metadata(ext_metadata)?;
288                Self::try_new(data_type, parsed)
289            }
290            // Name mismatch
291            Some(name) => Err(ArrowError::InvalidArgumentError(format!(
292                "Extension type name mismatch: expected {}, got {name}",
293                Self::NAME
294            ))),
295            // Name missing
296            None => Err(ArrowError::InvalidArgumentError(
297                "Extension type name missing".to_string(),
298            )),
299        }
300    }
301}