arrow_schema/extension/mod.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Extension types.
19
20#[cfg(feature = "canonical_extension_types")]
21mod canonical;
22#[cfg(feature = "canonical_extension_types")]
23pub use canonical::*;
24
25use crate::{ArrowError, DataType};
26use std::collections::HashMap;
27
/// The metadata key for the string name identifying an [`ExtensionType`].
///
/// [`ExtensionType::try_new_from_field_metadata`] looks this key up first and
/// compares the stored value against [`ExtensionType::NAME`]; a missing or
/// different value is reported as an error.
pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name";
30
/// The metadata key for a serialized representation of the [`ExtensionType`]
/// necessary to reconstruct the custom type.
///
/// The entry is optional: [`ExtensionType::try_new_from_field_metadata`]
/// passes the stored value, or `None` when the key is absent, to
/// [`ExtensionType::deserialize_metadata`].
pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata";
34
35/// Extension types.
36///
/// User-defined “extension” types can be defined by setting certain key-value
/// pairs in the [`Field`] metadata structure. These extension keys are:
39/// - [`EXTENSION_TYPE_NAME_KEY`]
40/// - [`EXTENSION_TYPE_METADATA_KEY`]
41///
/// Support for canonical extension types in this crate requires the
/// `canonical_extension_types` feature.
44///
45/// Extension types may or may not use the [`EXTENSION_TYPE_METADATA_KEY`]
46/// field.
47///
48/// # Example
49///
50/// The example below demonstrates how to implement this trait for a `Uuid`
51/// type. Note this is not the canonical extension type for `Uuid`, which does
52/// not include information about the `Uuid` version.
53///
54/// ```
55/// # use arrow_schema::ArrowError;
56/// # fn main() -> Result<(), ArrowError> {
57/// use arrow_schema::{DataType, extension::ExtensionType, Field};
58/// use std::{fmt, str::FromStr};
59///
60/// /// The different Uuid versions.
61/// #[derive(Clone, Copy, Debug, PartialEq)]
62/// enum UuidVersion {
63/// V1,
64/// V2,
65/// V3,
66/// V4,
67/// V5,
68/// V6,
69/// V7,
70/// V8,
71/// }
72///
73/// // We'll use `Display` to serialize.
74/// impl fmt::Display for UuidVersion {
75/// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
76/// write!(
77/// f,
78/// "{}",
79/// match self {
80/// Self::V1 => "V1",
81/// Self::V2 => "V2",
82/// Self::V3 => "V3",
83/// Self::V4 => "V4",
84/// Self::V5 => "V5",
85/// Self::V6 => "V6",
86/// Self::V7 => "V7",
87/// Self::V8 => "V8",
88/// }
89/// )
90/// }
91/// }
92///
93/// // And `FromStr` to deserialize.
94/// impl FromStr for UuidVersion {
95/// type Err = ArrowError;
96///
97/// fn from_str(s: &str) -> Result<Self, Self::Err> {
98/// match s {
99/// "V1" => Ok(Self::V1),
100/// "V2" => Ok(Self::V2),
101/// "V3" => Ok(Self::V3),
102/// "V4" => Ok(Self::V4),
103/// "V5" => Ok(Self::V5),
104/// "V6" => Ok(Self::V6),
105/// "V7" => Ok(Self::V7),
106/// "V8" => Ok(Self::V8),
107/// _ => Err(ArrowError::ParseError("Invalid UuidVersion".to_owned())),
108/// }
109/// }
110/// }
111///
112/// /// This is the extension type, not the container for Uuid values. It
113/// /// stores the Uuid version (this is the metadata of this extension type).
114/// #[derive(Clone, Copy, Debug, PartialEq)]
115/// struct Uuid(UuidVersion);
116///
117/// impl ExtensionType for Uuid {
118/// // We use a namespace as suggested by the specification.
119/// const NAME: &'static str = "myorg.example.uuid";
120///
121/// // The metadata type is the Uuid version.
122/// type Metadata = UuidVersion;
123///
124/// // We just return a reference to the Uuid version.
125/// fn metadata(&self) -> &Self::Metadata {
126/// &self.0
127/// }
128///
129/// // We use the `Display` implementation to serialize the Uuid
130/// // version.
131/// fn serialize_metadata(&self) -> Option<String> {
132/// Some(self.0.to_string())
133/// }
134///
135/// // We use the `FromStr` implementation to deserialize the Uuid
136/// // version.
137/// fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
138/// metadata.map_or_else(
139/// || {
140/// Err(ArrowError::InvalidArgumentError(
141/// "Uuid extension type metadata missing".to_owned(),
142/// ))
143/// },
144/// str::parse,
145/// )
146/// }
147///
148/// // The only supported data type is `FixedSizeBinary(16)`.
149/// fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
150/// match data_type {
151/// DataType::FixedSizeBinary(16) => Ok(()),
152/// data_type => Err(ArrowError::InvalidArgumentError(format!(
153/// "Uuid data type mismatch, expected FixedSizeBinary(16), found {data_type}"
154/// ))),
155/// }
156/// }
157///
158/// // We should always check if the data type is supported before
159/// // constructing the extension type.
160/// fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> {
161/// let uuid = Self(metadata);
162/// uuid.supports_data_type(data_type)?;
163/// Ok(uuid)
164/// }
165/// }
166///
167/// // We can now construct the extension type.
168/// let uuid_v1 = Uuid(UuidVersion::V1);
169///
170/// // And add it to a field.
171/// let mut field =
172/// Field::new("", DataType::FixedSizeBinary(16), false).with_extension_type(uuid_v1);
173///
174/// // And extract it from this field.
175/// assert_eq!(field.try_extension_type::<Uuid>()?, uuid_v1);
176///
177/// // When we try to add this to a field with an unsupported data type we
178/// // get an error.
179/// let result = Field::new("", DataType::Null, false).try_with_extension_type(uuid_v1);
180/// assert!(result.is_err());
181/// # Ok(()) }
182/// ```
183///
184/// <https://arrow.apache.org/docs/format/Columnar.html#extension-types>
185///
186/// [`Field`]: crate::Field
187pub trait ExtensionType: Sized {
188 /// The name identifying this extension type.
189 ///
190 /// This is the string value that is used for the
191 /// [`EXTENSION_TYPE_NAME_KEY`] in the [`Field::metadata`] of a [`Field`]
192 /// to identify this extension type.
193 ///
194 /// We recommend that you use a “namespace”-style prefix for extension
195 /// type names to minimize the possibility of conflicts with multiple Arrow
196 /// readers and writers in the same application. For example, use
197 /// `myorg.name_of_type` instead of simply `name_of_type`.
198 ///
199 /// Extension names beginning with `arrow.` are reserved for canonical
200 /// extension types, they should not be used for third-party extension
201 /// types.
202 ///
203 /// Extension names are case-sensitive.
204 ///
205 /// [`Field`]: crate::Field
206 /// [`Field::metadata`]: crate::Field::metadata
207 const NAME: &'static str;
208
209 /// The metadata type of this extension type.
210 ///
211 /// Implementations can use strongly or loosly typed data structures here
212 /// depending on the complexity of the metadata.
213 ///
214 /// Implementations can also use `Self` here if the extension type can be
215 /// constructed directly from its metadata.
216 ///
217 /// If an extension type defines no metadata it should use `()` to indicate
218 /// this.
219 type Metadata;
220
221 /// Returns a reference to the metadata of this extension type, or `&()` if
222 /// if this extension type defines no metadata (`Self::Metadata=()`).
223 fn metadata(&self) -> &Self::Metadata;
224
225 /// Returns the serialized representation of the metadata of this extension
226 /// type, or `None` if this extension type defines no metadata
227 /// (`Self::Metadata=()`).
228 ///
229 /// This is string value that is used for the
230 /// [`EXTENSION_TYPE_METADATA_KEY`] in the [`Field::metadata`] of a
231 /// [`Field`].
232 ///
233 /// [`Field`]: crate::Field
234 /// [`Field::metadata`]: crate::Field::metadata
235 fn serialize_metadata(&self) -> Option<String>;
236
237 /// Deserialize the metadata of this extension type from the serialized
238 /// representation of the metadata. An extension type that defines no
239 /// metadata should expect `None` for the serialized metadata and return
240 /// `Ok(())`.
241 ///
242 /// This function should return an error when
243 /// - expected metadata is missing (for extensions types with non-optional
244 /// metadata)
245 /// - unexpected metadata is set (for extension types without metadata)
246 /// - deserialization of metadata fails
247 fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError>;
248
249 /// Returns `Ok(())` iff the given data type is supported by this extension
250 /// type.
251 fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError>;
252
253 /// Construct this extension type for a field with the given data type and
254 /// metadata.
255 ///
256 /// This should return an error if the given data type is not supported by
257 /// this extension type.
258 fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError>;
259
260 /// Construct this extension type from field metadata and data type.
261 ///
262 /// This is a provided method that extracts extension type information from
263 /// metadata (using [`EXTENSION_TYPE_NAME_KEY`] and
264 /// [`EXTENSION_TYPE_METADATA_KEY`]) and delegates to [`Self::try_new`].
265 ///
266 /// Returns an error if:
267 /// - The extension type name is missing or doesn't match [`Self::NAME`]
268 /// - Metadata deserialization fails
269 /// - The data type is not supported
270 ///
271 /// This method enables extension type checking without requiring a full
272 /// [`Field`] instance, useful when only metadata and data type are available.
273 ///
274 /// [`Field`]: crate::Field
275 fn try_new_from_field_metadata(
276 data_type: &DataType,
277 metadata: &HashMap<String, String>,
278 ) -> Result<Self, ArrowError> {
279 // Check the extension name in the metadata
280 match metadata.get(EXTENSION_TYPE_NAME_KEY).map(|s| s.as_str()) {
281 // It should match the name of the given extension type
282 Some(name) if name == Self::NAME => {
283 // Deserialize the metadata and try to construct the extension type
284 let ext_metadata = metadata
285 .get(EXTENSION_TYPE_METADATA_KEY)
286 .map(|s| s.as_str());
287 let parsed = Self::deserialize_metadata(ext_metadata)?;
288 Self::try_new(data_type, parsed)
289 }
290 // Name mismatch
291 Some(name) => Err(ArrowError::InvalidArgumentError(format!(
292 "Extension type name mismatch: expected {}, got {name}",
293 Self::NAME
294 ))),
295 // Name missing
296 None => Err(ArrowError::InvalidArgumentError(
297 "Extension type name missing".to_string(),
298 )),
299 }
300 }
301}