arrow_schema/extension/canonical/
timestamp_with_offset.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Timestamp with an offset in minutes
19//!
20//! <https://arrow.apache.org/docs/format/CanonicalExtensions.html#timestamp-with-offset>
21
22use crate::{ArrowError, DataType, extension::ExtensionType};
23
/// The extension type for `TimestampWithOffset`.
///
/// Extension name: `arrow.timestamp_with_offset`.
///
/// This type represents a timestamp column that stores potentially different timezone offsets per
/// value. The timestamp is stored in UTC alongside the original timezone offset in minutes. This
/// extension type is intended to be compatible with ANSI SQL's `TIMESTAMP WITH TIME ZONE`, which
/// is supported by multiple database engines.
///
/// The storage type of the extension is a `Struct` with 2 fields, in order:
///
/// - `timestamp`: a non-nullable `Timestamp(time_unit, "UTC")`, where `time_unit` is any Arrow
///   `TimeUnit` (s, ms, us or ns).
/// - `offset_minutes`: a non-nullable signed 16-bit integer (`Int16`) representing the offset in
///   minutes from the UTC timezone. Negative offsets represent time zones west of UTC, while
///   positive offsets represent east. Offsets normally range from -779 (-12:59) to +780
///   (+13:00).
///
/// This type has no type parameters.
///
/// Metadata is either empty or an empty string.
///
/// It is also *permissible* for the `offset_minutes` field to be dictionary-encoded with a
/// preferred (*but not required*) index type of `int8`, or run-end-encoded with a preferred (*but
/// not required*) runs type of `int8`.
///
/// It's worth noting that the data source needs to resolve timezone strings such as `UTC` or
/// `America/Los_Angeles` into an offset in minutes in order to construct a `TimestampWithOffset`.
/// This makes the `TimestampWithOffset` type "lossy" in the sense that any original "unresolved"
/// timezone string gets lost in this conversion. It's a tradeoff for optimizing the row
/// representation and simplifying the client code, which does not need to know how to convert
/// from a timezone string to its corresponding offset in minutes.
///
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#timestamp-with-offset>
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TimestampWithOffset;
58
/// Name required for the first child of the storage struct (the timestamp value).
const TIMESTAMP_FIELD_NAME: &str = "timestamp";
/// Name required for the second child of the storage struct (the offset in minutes).
const OFFSET_FIELD_NAME: &str = "offset_minutes";
61
62impl ExtensionType for TimestampWithOffset {
63    const NAME: &'static str = "arrow.timestamp_with_offset";
64
65    type Metadata = ();
66
67    fn metadata(&self) -> &Self::Metadata {
68        &()
69    }
70
71    fn serialize_metadata(&self) -> Option<String> {
72        None
73    }
74
75    fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
76        metadata.map_or_else(
77            || Ok(()),
78            |v| {
79                if !v.is_empty() {
80                    Err(ArrowError::InvalidArgumentError(
81                        "TimestampWithOffset extension type expects no metadata".to_owned(),
82                    ))
83                } else {
84                    Ok(())
85                }
86            },
87        )
88    }
89
90    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
91        let ok = match data_type {
92            DataType::Struct(fields) => match fields.len() {
93                2 => {
94                    let maybe_timestamp = fields.first().unwrap();
95                    let maybe_offset = fields.get(1).unwrap();
96
97                    let timestamp_type_ok = matches!(maybe_timestamp.data_type(), DataType::Timestamp(_, tz) if {
98                        match tz {
99                            Some(tz) => {
100                                tz.as_ref() == "UTC"
101                            },
102                            None => false
103                        }
104                    });
105
106                    let offset_type_ok = match maybe_offset.data_type() {
107                        DataType::Int16 => true,
108                        DataType::Dictionary(key_type, value_type) => {
109                            key_type.is_dictionary_key_type()
110                                && matches!(value_type.as_ref(), DataType::Int16)
111                        }
112                        DataType::RunEndEncoded(run_ends, values) => {
113                            run_ends.data_type().is_run_ends_type()
114                                && matches!(values.data_type(), DataType::Int16)
115                        }
116                        _ => false,
117                    };
118
119                    maybe_timestamp.name() == TIMESTAMP_FIELD_NAME
120                        && timestamp_type_ok
121                        && !maybe_timestamp.is_nullable()
122                        && maybe_offset.name() == OFFSET_FIELD_NAME
123                        && offset_type_ok
124                        && !maybe_offset.is_nullable()
125                }
126                _ => false,
127            },
128            _ => false,
129        };
130
131        match ok {
132            true => Ok(()),
133            false => Err(ArrowError::InvalidArgumentError(format!(
134                "TimestampWithOffset data type mismatch, expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found {data_type}"
135            ))),
136        }
137    }
138
139    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
140        Self.supports_data_type(data_type).map(|_| Self)
141    }
142}
143
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    #[cfg(feature = "canonical_extension_types")]
    use crate::extension::CanonicalExtensionType;
    use crate::{
        Field, Fields, TimeUnit,
        extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
    };

    use super::*;

    /// Every Arrow time unit; the `timestamp` child may use any of them.
    const TIME_UNITS: [TimeUnit; 4] = [
        TimeUnit::Second,
        TimeUnit::Millisecond,
        TimeUnit::Microsecond,
        TimeUnit::Nanosecond,
    ];

    /// A valid `timestamp` child: non-nullable and in the `"UTC"` timezone.
    fn utc_timestamp_child(time_unit: TimeUnit) -> Field {
        Field::new(
            TIMESTAMP_FIELD_NAME,
            DataType::Timestamp(time_unit, Some("UTC".into())),
            false,
        )
    }

    /// A non-nullable `offset_minutes` child with the given (possibly invalid) data type.
    fn offset_child(data_type: DataType) -> Field {
        Field::new(OFFSET_FIELD_NAME, data_type, false)
    }

    /// A run-end-encoded data type with non-nullable `run_ends` and `values` children.
    fn run_end_encoded(run_ends_type: DataType, values_type: DataType) -> DataType {
        DataType::RunEndEncoded(
            Arc::new(Field::new("run_ends", run_ends_type, false)),
            Arc::new(Field::new("values", values_type, false)),
        )
    }

    /// Wraps the two children in an unnamed, non-nullable struct field.
    fn storage_field(children: [Field; 2]) -> Field {
        Field::new("", DataType::Struct(Fields::from_iter(children)), false)
    }

    fn make_valid_field_primitive(time_unit: TimeUnit) -> Field {
        storage_field([
            utc_timestamp_child(time_unit),
            offset_child(DataType::Int16),
        ])
    }

    fn make_valid_field_dict_encoded(time_unit: TimeUnit, key_type: DataType) -> Field {
        assert!(key_type.is_dictionary_key_type());

        storage_field([
            utc_timestamp_child(time_unit),
            offset_child(DataType::Dictionary(
                Box::new(key_type),
                Box::new(DataType::Int16),
            )),
        ])
    }

    fn make_valid_field_run_end_encoded(time_unit: TimeUnit, run_ends_type: DataType) -> Field {
        assert!(run_ends_type.is_run_ends_type());

        storage_field([
            utc_timestamp_child(time_unit),
            offset_child(run_end_encoded(run_ends_type, DataType::Int16)),
        ])
    }

    /// Attaches the extension type to `field` and reads it back through every accessor.
    fn assert_accepted(mut field: Field) -> Result<(), ArrowError> {
        field.try_with_extension_type(TimestampWithOffset)?;
        field.try_extension_type::<TimestampWithOffset>()?;
        #[cfg(feature = "canonical_extension_types")]
        assert_eq!(
            field.try_canonical_extension_type()?,
            CanonicalExtensionType::TimestampWithOffset(TimestampWithOffset)
        );
        Ok(())
    }

    #[test]
    fn valid_primitive_offsets() -> Result<(), ArrowError> {
        for time_unit in TIME_UNITS {
            assert_accepted(make_valid_field_primitive(time_unit))?;
        }

        Ok(())
    }

    #[test]
    fn valid_dict_encoded_offsets() -> Result<(), ArrowError> {
        let key_types = [
            DataType::UInt8,
            DataType::UInt16,
            DataType::UInt32,
            DataType::UInt64,
            DataType::Int8,
            DataType::Int16,
            DataType::Int32,
            DataType::Int64,
        ];

        for time_unit in TIME_UNITS {
            for key_type in &key_types {
                assert_accepted(make_valid_field_dict_encoded(time_unit, key_type.clone()))?;
            }
        }

        Ok(())
    }

    #[test]
    fn valid_run_end_encoded_offsets() -> Result<(), ArrowError> {
        let run_ends_types = [DataType::Int16, DataType::Int32, DataType::Int64];

        for time_unit in TIME_UNITS {
            for run_ends_type in &run_ends_types {
                assert_accepted(make_valid_field_run_end_encoded(
                    time_unit,
                    run_ends_type.clone(),
                ))?;
            }
        }

        Ok(())
    }

    #[test]
    #[should_panic(expected = "Field extension type name missing")]
    fn missing_name() {
        let field = make_valid_field_primitive(TimeUnit::Second)
            .with_metadata([(EXTENSION_TYPE_METADATA_KEY.to_owned(), "".to_owned())].into());
        field.extension_type::<TimestampWithOffset>();
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Boolean"
    )]
    fn invalid_type_top_level() {
        Field::new("", DataType::Boolean, false).with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_struct_field_count() {
        let data_type =
            DataType::Struct(Fields::from_iter([Field::new("", DataType::Int16, false)]));
        Field::new("", data_type, false).with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_swapped_fields() {
        // Both children are individually valid, but the order (and therefore the name found at
        // each position) is wrong.
        storage_field([
            offset_child(DataType::Int16),
            utc_timestamp_child(TimeUnit::Second),
        ])
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_timestamp_type() {
        storage_field([
            Field::new(TIMESTAMP_FIELD_NAME, DataType::Int16, false),
            offset_child(DataType::Int16),
        ])
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_offset_type() {
        storage_field([
            utc_timestamp_child(TimeUnit::Second),
            offset_child(DataType::UInt64),
        ])
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_offset_key_dict_encoded() {
        storage_field([
            utc_timestamp_child(TimeUnit::Second),
            offset_child(DataType::Dictionary(
                Box::new(DataType::Boolean),
                Box::new(DataType::Int16),
            )),
        ])
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_offset_value_dict_encoded() {
        storage_field([
            utc_timestamp_child(TimeUnit::Second),
            offset_child(DataType::Dictionary(
                Box::new(DataType::UInt8),
                Box::new(DataType::Int32),
            )),
        ])
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_run_ends_run_end_encoded() {
        storage_field([
            utc_timestamp_child(TimeUnit::Second),
            offset_child(run_end_encoded(DataType::Boolean, DataType::Int16)),
        ])
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_values_run_end_encoded() {
        // The run-ends type must itself be valid (Int16/Int32/Int64); otherwise validation
        // fails on the run-ends check and never reaches the values type under test.
        storage_field([
            utc_timestamp_child(TimeUnit::Second),
            offset_child(run_end_encoded(DataType::Int16, DataType::Int32)),
        ])
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_nullable_timestamp() {
        storage_field([
            Field::new(
                TIMESTAMP_FIELD_NAME,
                DataType::Timestamp(TimeUnit::Second, Some("UTC".into())),
                true,
            ),
            offset_child(DataType::Int16),
        ])
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_nullable_offset() {
        storage_field([
            utc_timestamp_child(TimeUnit::Second),
            Field::new(OFFSET_FIELD_NAME, DataType::Int16, true),
        ])
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_no_timezone() {
        storage_field([
            Field::new(
                TIMESTAMP_FIELD_NAME,
                DataType::Timestamp(TimeUnit::Second, None),
                false,
            ),
            offset_child(DataType::Int16),
        ])
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_timezone() {
        storage_field([
            Field::new(
                TIMESTAMP_FIELD_NAME,
                DataType::Timestamp(TimeUnit::Second, Some("Americas/Sao_Paulo".into())),
                false,
            ),
            offset_child(DataType::Int16),
        ])
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    fn no_metadata() {
        let field = make_valid_field_primitive(TimeUnit::Second).with_metadata(
            [(
                EXTENSION_TYPE_NAME_KEY.to_owned(),
                TimestampWithOffset::NAME.to_owned(),
            )]
            .into(),
        );
        // Panics (failing the test) if the extension type cannot be resolved.
        field.extension_type::<TimestampWithOffset>();
    }

    #[test]
    fn empty_metadata() {
        let field = make_valid_field_primitive(TimeUnit::Second).with_metadata(
            [
                (
                    EXTENSION_TYPE_NAME_KEY.to_owned(),
                    TimestampWithOffset::NAME.to_owned(),
                ),
                (EXTENSION_TYPE_METADATA_KEY.to_owned(), String::new()),
            ]
            .into(),
        );
        // Panics (failing the test) if the empty metadata string is rejected.
        field.extension_type::<TimestampWithOffset>();
    }
}