Skip to main content

arrow_schema/extension/canonical/
timestamp_with_offset.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Timestamp with an offset in minutes
19//!
20//! <https://arrow.apache.org/docs/format/CanonicalExtensions.html#timestamp-with-offset>
21
22use crate::{ArrowError, DataType, extension::ExtensionType};
23
/// The extension type for `TimestampWithOffset`.
///
/// Extension name: `arrow.timestamp_with_offset`.
///
/// This type represents a timestamp column that stores potentially different timezone offsets per
/// value. The timestamp is stored in UTC alongside the original timezone offset in minutes. This
/// extension type is intended to be compatible with ANSI SQL's `TIMESTAMP WITH TIME ZONE`, which
/// is supported by multiple database engines.
///
/// The storage type of the extension is a `Struct` with 2 fields, in order:
///
/// - `timestamp`: a non-nullable `Timestamp(time_unit, "UTC")`, where `time_unit` is any Arrow
///   `TimeUnit` (s, ms, us or ns).
/// - `offset_minutes`: a non-nullable signed 16-bit integer (`Int16`) representing the offset in
///   minutes from the UTC timezone. Negative offsets represent time zones west of UTC, while
///   positive offsets represent east. Offsets normally range from -779 (-12:59) to +780
///   (+13:00).
///
/// This type has no type parameters.
///
/// Metadata is either empty or an empty string.
///
/// It is also *permissible* for the `offset_minutes` field to be dictionary-encoded with a
/// preferred (*but not required*) index type of `int8`, or run-end-encoded with a preferred (*but
/// not required*) runs type of `int8`.
///
/// It's worth noting that the data source needs to resolve timezone strings such as `UTC` or
/// `America/Los_Angeles` into an offset in minutes in order to construct a `TimestampWithOffset`.
/// This makes the `TimestampWithOffset` type "lossy" in the sense that any original "unresolved"
/// timezone string gets lost in this conversion. It's a tradeoff for optimizing the row
/// representation and simplifying the client code, which does not need to know how to convert
/// from a timezone string to its corresponding offset in minutes.
///
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#timestamp-with-offset>
// `Eq` and `Hash` are derived alongside `PartialEq` so this unit type can be used as a map or set
// key; both are trivially satisfied because the struct has no fields.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TimestampWithOffset;
58
/// Required name of the first child of the storage struct (the UTC timestamp).
const TIMESTAMP_FIELD_NAME: &str = "timestamp";
/// Required name of the second child of the storage struct (the offset from UTC in minutes).
const OFFSET_FIELD_NAME: &str = "offset_minutes";
61
62impl ExtensionType for TimestampWithOffset {
63    const NAME: &'static str = "arrow.timestamp_with_offset";
64
65    type Metadata = ();
66
67    fn metadata(&self) -> &Self::Metadata {
68        &()
69    }
70
71    fn serialize_metadata(&self) -> Option<String> {
72        None
73    }
74
75    fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
76        metadata.map_or_else(
77            || Ok(()),
78            |v| {
79                if !v.is_empty() {
80                    Err(ArrowError::InvalidArgumentError(
81                        "TimestampWithOffset extension type expects no metadata".to_owned(),
82                    ))
83                } else {
84                    Ok(())
85                }
86            },
87        )
88    }
89
90    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
91        let ok = match data_type {
92            DataType::Struct(fields) => match fields.len() {
93                2 => {
94                    let maybe_timestamp = fields.first().unwrap();
95                    let maybe_offset = fields.get(1).unwrap();
96
97                    let timestamp_type_ok = matches!(maybe_timestamp.data_type(), DataType::Timestamp(_, tz) if {
98                        match tz {
99                            Some(tz) => {
100                                tz.as_ref() == "UTC"
101                            },
102                            None => false
103                        }
104                    });
105
106                    let offset_type_ok = match maybe_offset.data_type() {
107                        DataType::Int16 => true,
108                        DataType::Dictionary(key_type, value_type) => {
109                            key_type.is_dictionary_key_type()
110                                && matches!(value_type.as_ref(), DataType::Int16)
111                        }
112                        DataType::RunEndEncoded(run_ends, values) => {
113                            run_ends.data_type().is_run_ends_type()
114                                && matches!(values.data_type(), DataType::Int16)
115                        }
116                        _ => false,
117                    };
118
119                    maybe_timestamp.name() == TIMESTAMP_FIELD_NAME
120                        && timestamp_type_ok
121                        && !maybe_timestamp.is_nullable()
122                        && maybe_offset.name() == OFFSET_FIELD_NAME
123                        && offset_type_ok
124                        && !maybe_offset.is_nullable()
125                }
126                _ => false,
127            },
128            _ => false,
129        };
130
131        match ok {
132            true => Ok(()),
133            false => Err(ArrowError::InvalidArgumentError(format!(
134                "TimestampWithOffset data type mismatch, expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found {data_type}"
135            ))),
136        }
137    }
138
139    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
140        Self.supports_data_type(data_type).map(|_| Self)
141    }
142
143    fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<(), ArrowError> {
144        Self.supports_data_type(data_type)
145    }
146}
147
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    #[cfg(feature = "canonical_extension_types")]
    use crate::extension::CanonicalExtensionType;
    use crate::{
        Field, Fields, TimeUnit,
        extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
    };

    use super::*;

    /// Every time unit the `timestamp` child is exercised with.
    const TIME_UNITS: [TimeUnit; 4] = [
        TimeUnit::Second,
        TimeUnit::Millisecond,
        TimeUnit::Microsecond,
        TimeUnit::Nanosecond,
    ];

    /// The `Timestamp(time_unit, "UTC")` type used by valid storage structs.
    fn utc_timestamp(time_unit: TimeUnit) -> DataType {
        DataType::Timestamp(time_unit, Some("UTC".into()))
    }

    /// Wraps the two children in an unnamed, non-nullable struct field.
    fn storage_field(timestamp: Field, offset: Field) -> Field {
        Field::new(
            "",
            DataType::Struct(Fields::from_iter([timestamp, offset])),
            false,
        )
    }

    /// A storage field with a valid UTC timestamp child and an offset child of `offset_type`.
    fn field_with_offset_type(time_unit: TimeUnit, offset_type: DataType) -> Field {
        storage_field(
            Field::new(TIMESTAMP_FIELD_NAME, utc_timestamp(time_unit), false),
            Field::new(OFFSET_FIELD_NAME, offset_type, false),
        )
    }

    /// A storage field with a timestamp child of `timestamp_type` and a valid `Int16` offset.
    fn field_with_timestamp_type(timestamp_type: DataType) -> Field {
        storage_field(
            Field::new(TIMESTAMP_FIELD_NAME, timestamp_type, false),
            Field::new(OFFSET_FIELD_NAME, DataType::Int16, false),
        )
    }

    /// A dictionary type with the given key and value types.
    fn dictionary(keys: DataType, values: DataType) -> DataType {
        DataType::Dictionary(Box::new(keys), Box::new(values))
    }

    /// A run-end-encoded type with the given run-ends and values types.
    fn run_end_encoded(run_ends: DataType, values: DataType) -> DataType {
        DataType::RunEndEncoded(
            Arc::new(Field::new("run_ends", run_ends, false)),
            Arc::new(Field::new("values", values, false)),
        )
    }

    fn make_valid_field_primitive(time_unit: TimeUnit) -> Field {
        field_with_offset_type(time_unit, DataType::Int16)
    }

    fn make_valid_field_dict_encoded(time_unit: TimeUnit, key_type: DataType) -> Field {
        assert!(key_type.is_dictionary_key_type());
        field_with_offset_type(time_unit, dictionary(key_type, DataType::Int16))
    }

    fn make_valid_field_run_end_encoded(time_unit: TimeUnit, run_ends_type: DataType) -> Field {
        assert!(run_ends_type.is_run_ends_type());
        field_with_offset_type(time_unit, run_end_encoded(run_ends_type, DataType::Int16))
    }

    /// Attaches the extension type to `field` and reads it back through every accessor.
    fn assert_accepted(mut field: Field) -> Result<(), ArrowError> {
        field.try_with_extension_type(TimestampWithOffset)?;
        field.try_extension_type::<TimestampWithOffset>()?;
        #[cfg(feature = "canonical_extension_types")]
        assert_eq!(
            field.try_canonical_extension_type()?,
            CanonicalExtensionType::TimestampWithOffset(TimestampWithOffset)
        );
        Ok(())
    }

    #[test]
    fn valid_primitive_offsets() -> Result<(), ArrowError> {
        for time_unit in TIME_UNITS {
            assert_accepted(make_valid_field_primitive(time_unit))?;
        }

        Ok(())
    }

    #[test]
    fn valid_dict_encoded_offsets() -> Result<(), ArrowError> {
        let key_types = [
            DataType::UInt8,
            DataType::UInt16,
            DataType::UInt32,
            DataType::UInt64,
            DataType::Int8,
            DataType::Int16,
            DataType::Int32,
            DataType::Int64,
        ];

        for time_unit in TIME_UNITS {
            for key_type in &key_types {
                assert_accepted(make_valid_field_dict_encoded(time_unit, key_type.clone()))?;
            }
        }

        Ok(())
    }

    #[test]
    fn valid_run_end_encoded_offsets() -> Result<(), ArrowError> {
        let run_ends_types = [DataType::Int16, DataType::Int32, DataType::Int64];

        for time_unit in TIME_UNITS {
            for run_ends_type in &run_ends_types {
                assert_accepted(make_valid_field_run_end_encoded(
                    time_unit,
                    run_ends_type.clone(),
                ))?;
            }
        }

        Ok(())
    }

    #[test]
    #[should_panic(expected = "Extension type name missing")]
    fn missing_name() {
        let field = make_valid_field_primitive(TimeUnit::Second)
            .with_metadata([(EXTENSION_TYPE_METADATA_KEY.to_owned(), "".to_owned())].into());
        field.extension_type::<TimestampWithOffset>();
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Boolean"
    )]
    fn invalid_type_top_level() {
        Field::new("", DataType::Boolean, false).with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_struct_field_count() {
        let data_type =
            DataType::Struct(Fields::from_iter([Field::new("", DataType::Int16, false)]));
        Field::new("", data_type, false).with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_timestamp_name() {
        storage_field(
            Field::new("ts", utc_timestamp(TimeUnit::Second), false),
            Field::new(OFFSET_FIELD_NAME, DataType::Int16, false),
        )
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_offset_name() {
        storage_field(
            Field::new(TIMESTAMP_FIELD_NAME, utc_timestamp(TimeUnit::Second), false),
            Field::new("offset", DataType::Int16, false),
        )
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_timestamp_type() {
        field_with_timestamp_type(DataType::Int16).with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_offset_type() {
        field_with_offset_type(TimeUnit::Second, DataType::UInt64)
            .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_offset_key_dict_encoded() {
        field_with_offset_type(
            TimeUnit::Second,
            dictionary(DataType::Boolean, DataType::Int16),
        )
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_offset_value_dict_encoded() {
        field_with_offset_type(
            TimeUnit::Second,
            dictionary(DataType::UInt8, DataType::Int32),
        )
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_run_ends_run_end_encoded() {
        field_with_offset_type(
            TimeUnit::Second,
            run_end_encoded(DataType::Boolean, DataType::Int16),
        )
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_values_run_end_encoded() {
        // The run-ends type is deliberately one of the types the valid-case test accepts, so
        // that the rejection can only come from the `Int32` values type.
        field_with_offset_type(
            TimeUnit::Second,
            run_end_encoded(DataType::Int16, DataType::Int32),
        )
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_nullable_timestamp() {
        storage_field(
            Field::new(TIMESTAMP_FIELD_NAME, utc_timestamp(TimeUnit::Second), true),
            Field::new(OFFSET_FIELD_NAME, DataType::Int16, false),
        )
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_nullable_offset() {
        storage_field(
            Field::new(TIMESTAMP_FIELD_NAME, utc_timestamp(TimeUnit::Second), false),
            Field::new(OFFSET_FIELD_NAME, DataType::Int16, true),
        )
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_no_timezone() {
        field_with_timestamp_type(DataType::Timestamp(TimeUnit::Second, None))
            .with_extension_type(TimestampWithOffset);
    }

    #[test]
    #[should_panic(
        expected = "expected Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
    )]
    fn invalid_type_wrong_timezone() {
        // A real IANA zone name, to show that valid-but-non-UTC timezones are rejected.
        field_with_timestamp_type(DataType::Timestamp(
            TimeUnit::Second,
            Some("America/Sao_Paulo".into()),
        ))
        .with_extension_type(TimestampWithOffset);
    }

    #[test]
    fn no_metadata() {
        let field = make_valid_field_primitive(TimeUnit::Second).with_metadata(
            [(
                EXTENSION_TYPE_NAME_KEY.to_owned(),
                TimestampWithOffset::NAME.to_owned(),
            )]
            .into(),
        );
        field.extension_type::<TimestampWithOffset>();
    }

    #[test]
    fn empty_metadata() {
        let field = make_valid_field_primitive(TimeUnit::Second).with_metadata(
            [
                (
                    EXTENSION_TYPE_NAME_KEY.to_owned(),
                    TimestampWithOffset::NAME.to_owned(),
                ),
                (EXTENSION_TYPE_METADATA_KEY.to_owned(), String::new()),
            ]
            .into(),
        );
        field.extension_type::<TimestampWithOffset>();
    }

    #[test]
    #[should_panic(expected = "TimestampWithOffset extension type expects no metadata")]
    fn non_empty_metadata() {
        let field = make_valid_field_primitive(TimeUnit::Second).with_metadata(
            [
                (
                    EXTENSION_TYPE_NAME_KEY.to_owned(),
                    TimestampWithOffset::NAME.to_owned(),
                ),
                (EXTENSION_TYPE_METADATA_KEY.to_owned(), "{}".to_owned()),
            ]
            .into(),
        );
        field.extension_type::<TimestampWithOffset>();
    }
}