parquet_geospatial/
types.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use arrow_schema::{ArrowError, DataType, extension::ExtensionType};
19use serde::{Deserialize, Serialize};
20
21/// Hints at the likely Parquet geospatial logical type represented by a [`Metadata`].
22///
23/// Based on the `algorithm` field:
24/// - [`Hint::Geometry`]: WKB format with linear/planar edge interpolation
25/// - [`Hint::Geography`]: WKB format with explicit non-linear/non-planar edge interpolation
26///
27/// See the [Parquet Geospatial specification](https://github.com/apache/parquet-format/blob/master/Geospatial.md)
28/// for more details.
29#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
30pub enum Hint {
31    /// Geospatial features in WKB format with linear/planar edge interpolation
32    Geometry,
33    /// Geospatial features in WKB format with explicit non-linear/non-planar edge interpolation
34    Geography,
35}
36
37/// The edge interpolation algorithms used with `GEOMETRY` logical types.
38#[derive(Default, Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
39#[serde(rename_all = "lowercase")]
40pub enum Edges {
41    /// Edges are interpolated as geodesics on a sphere.
42    #[default]
43    Spherical,
44    /// <https://en.wikipedia.org/wiki/Vincenty%27s_formulae>
45    Vincenty,
46    /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970
47    Thomas,
48    /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965.
49    Andoyer,
50    /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55
51    Karney,
52}
53
54/// The metadata associated with a [`WkbType`].
55#[derive(Clone, Debug, Default, Serialize, Deserialize)]
56pub struct Metadata {
57    /// The Coordinate Reference System (CRS) of the [`WkbType`], if present.
58    ///
59    /// This may be a raw string value (e.g., "EPSG:3857") or a JSON object (e.g., PROJJSON).
60    /// Note: Common lon/lat CRS representations (EPSG:4326, OGC:CRS84) are canonicalized
61    /// to `None` during serialization to match Parquet conventions.
62    #[serde(skip_serializing_if = "Option::is_none")]
63    pub crs: Option<serde_json::Value>,
64    /// The edge interpolation algorithm of the [`WkbType`], if present.
65    #[serde(skip_serializing_if = "Option::is_none")]
66    pub algorithm: Option<Edges>,
67}
68
69impl Metadata {
70    /// Constructs a new [`Metadata`] with the given CRS and algorithm.
71    ///
72    /// If a CRS is provided, and can be parsed as JSON, it will be stored as a JSON object instead
73    /// of its string representation.
74    pub fn new(crs: Option<&str>, algorithm: Option<Edges>) -> Self {
75        let crs = crs.map(|c| match serde_json::from_str(c) {
76            Ok(crs) => crs,
77            Err(_) => serde_json::Value::String(c.to_string()),
78        });
79
80        Self { crs, algorithm }
81    }
82
83    /// Returns a [`Hint`] to the likely underlying Logical Type that this [`Metadata`] represents.
84    pub fn type_hint(&self) -> Hint {
85        match &self.algorithm {
86            Some(_) => Hint::Geography,
87            None => Hint::Geometry,
88        }
89    }
90
91    /// Detect if the CRS is a common representation of lon/lat on the standard WGS84 ellipsoid
92    fn crs_is_lon_lat(&self) -> bool {
93        use serde_json::Value;
94
95        let Some(crs) = &self.crs else {
96            return false;
97        };
98
99        match crs {
100            Value::String(s) if s == "EPSG:4326" || s == "OGC:CRS84" => true,
101            Value::Object(_) => match (&crs["id"]["authority"], &crs["id"]["code"]) {
102                (Value::String(auth), Value::String(code)) if auth == "OGC" && code == "CRS84" => {
103                    true
104                }
105                (Value::String(auth), Value::String(code)) if auth == "EPSG" && code == "4326" => {
106                    true
107                }
108                (Value::String(auth), Value::Number(code))
109                    if auth == "EPSG" && code.as_i64() == Some(4326) =>
110                {
111                    true
112                }
113                _ => false,
114            },
115            _ => false,
116        }
117    }
118}
119
120/// Well-Known Binary (WKB) [`ExtensionType`] for geospatial data.
121///
122/// Represents the canonical Arrow Extension Type for storing
123/// [GeoArrow](https://github.com/geoarrow/geoarrow) data.
124#[derive(Debug, Default)]
125pub struct WkbType(Metadata);
126
127impl WkbType {
128    /// Constructs a new [`WkbType`] with the given [`Metadata`].
129    ///
130    /// If `None` is provided, default (empty) metadata is used.
131    pub fn new(metadata: Option<Metadata>) -> Self {
132        Self(metadata.unwrap_or_default())
133    }
134}
135
136type ArrowResult<T> = Result<T, ArrowError>;
137impl ExtensionType for WkbType {
138    const NAME: &'static str = "geoarrow.wkb";
139
140    type Metadata = Metadata;
141
142    fn metadata(&self) -> &Self::Metadata {
143        &self.0
144    }
145
146    fn serialize_metadata(&self) -> Option<String> {
147        let md = if self.0.crs_is_lon_lat() {
148            &Metadata {
149                crs: None, // lon/lat CRS is canonicalized as omitted (None) for Parquet
150                algorithm: self.0.algorithm,
151            }
152        } else {
153            &self.0
154        };
155
156        serde_json::to_string(md).ok()
157    }
158
159    fn deserialize_metadata(metadata: Option<&str>) -> ArrowResult<Self::Metadata> {
160        let Some(metadata) = metadata else {
161            return Ok(Self::Metadata::default());
162        };
163
164        serde_json::from_str(metadata).map_err(|e| ArrowError::JsonError(e.to_string()))
165    }
166
167    fn supports_data_type(&self, data_type: &arrow_schema::DataType) -> ArrowResult<()> {
168        match data_type {
169            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Ok(()),
170            dt => Err(ArrowError::InvalidArgumentError(format!(
171                "Geometry data type mismatch, expected one of Binary, LargeBinary, BinaryView. Found {dt}"
172            ))),
173        }
174    }
175
176    fn try_new(data_type: &arrow_schema::DataType, metadata: Self::Metadata) -> ArrowResult<Self> {
177        let wkb = Self(metadata);
178        wkb.supports_data_type(data_type)?;
179        Ok(wkb)
180    }
181}
182
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::Field;
    use serde_json::Value;

    /// Builds a [`WkbType`] from the given CRS and algorithm and returns its serialized metadata.
    fn serialize(crs: Option<&str>, algorithm: Option<Edges>) -> String {
        WkbType::new(Some(Metadata::new(crs, algorithm)))
            .serialize_metadata()
            .unwrap()
    }

    /// Default (empty) metadata serializes to an empty JSON object and reads back empty
    #[test]
    fn test_metadata_empty_roundtrip() -> ArrowResult<()> {
        let json = WkbType::new(Some(Metadata::default()))
            .serialize_metadata()
            .unwrap();
        assert_eq!(json, "{}");

        let Metadata { crs, algorithm } = WkbType::deserialize_metadata(Some(&json))?;
        assert!(crs.is_none());
        assert!(algorithm.is_none());

        Ok(())
    }

    /// A plain string CRS survives a serialize/deserialize round trip
    #[test]
    fn test_metadata_crs_string_roundtrip() -> ArrowResult<()> {
        let json = serialize(Some("srid:1234"), None);
        assert_eq!(json, r#"{"crs":"srid:1234"}"#);

        let restored = WkbType::deserialize_metadata(Some(&json))?;
        assert_eq!(restored.crs, Some(Value::String("srid:1234".to_string())));
        assert!(restored.algorithm.is_none());

        Ok(())
    }

    /// A JSON object CRS survives a serialize/deserialize round trip
    #[test]
    fn test_metadata_crs_json_object_roundtrip() -> ArrowResult<()> {
        let json = serialize(
            Some(r#"{"type":"custom_json","properties":{"name":"EPSG:4326"}}"#),
            None,
        );

        // Field order is not guaranteed, so inspect the parsed structure instead of the text
        let as_value: Value = serde_json::from_str(&json).unwrap();
        assert_eq!(as_value["crs"]["type"], "custom_json");
        assert_eq!(as_value["crs"]["properties"]["name"], "EPSG:4326");

        // The deserialized CRS must still be an object with the same contents
        let restored = WkbType::deserialize_metadata(Some(&json))?;
        let restored_crs = restored.crs.unwrap();
        assert!(restored_crs.is_object());
        assert_eq!(restored_crs["type"], "custom_json");
        assert_eq!(restored_crs["properties"]["name"], "EPSG:4326");

        Ok(())
    }

    /// The algorithm field survives a serialize/deserialize round trip
    #[test]
    fn test_metadata_algorithm_roundtrip() -> ArrowResult<()> {
        let json = serialize(None, Some(Edges::Spherical));
        assert_eq!(json, r#"{"algorithm":"spherical"}"#);

        let restored = WkbType::deserialize_metadata(Some(&json))?;
        assert!(restored.crs.is_none());
        assert_eq!(restored.algorithm, Some(Edges::Spherical));

        Ok(())
    }

    /// CRS and algorithm together survive a serialize/deserialize round trip
    #[test]
    fn test_metadata_full_roundtrip() -> ArrowResult<()> {
        let json = serialize(Some("srid:1234"), Some(Edges::Spherical));
        assert_eq!(json, r#"{"crs":"srid:1234","algorithm":"spherical"}"#);

        let restored = WkbType::deserialize_metadata(Some(&json))?;
        assert_eq!(restored.crs, Some(Value::String("srid:1234".to_string())));
        assert_eq!(restored.algorithm, Some(Edges::Spherical));

        Ok(())
    }

    /// Absent metadata deserializes to the default metadata
    #[test]
    fn test_metadata_deserialize_none() -> ArrowResult<()> {
        let Metadata { crs, algorithm } = WkbType::deserialize_metadata(None)?;
        assert!(crs.is_none());
        assert!(algorithm.is_none());
        Ok(())
    }

    /// Malformed JSON is reported as a JSON error
    #[test]
    fn test_metadata_deserialize_invalid_json() {
        let outcome = WkbType::deserialize_metadata(Some("not valid json {"));
        assert!(matches!(outcome, Err(ArrowError::JsonError(_))));
    }

    /// Metadata without an algorithm hints at the Geometry type
    #[test]
    fn test_type_hint_geometry() {
        let hint = Metadata::new(None, None).type_hint();
        assert!(matches!(hint, Hint::Geometry));
    }

    /// Metadata with any algorithm hints at the Geography type
    #[test]
    fn test_type_hint_edges_is_geography() {
        for edges in [
            Edges::Spherical,
            Edges::Vincenty,
            Edges::Thomas,
            Edges::Andoyer,
            Edges::Karney,
        ] {
            let hint = Metadata::new(None, Some(edges)).type_hint();
            assert!(matches!(hint, Hint::Geography));
        }
    }

    /// The extension type can be attached to a Field and read back from it
    #[test]
    fn test_extension_type_with_field() -> ArrowResult<()> {
        let wkb_type = WkbType::new(Some(Metadata::new(Some("srid:1234"), None)));

        let mut field = Field::new("geometry", DataType::Binary, false);
        field.try_with_extension_type(wkb_type)?;

        // The extension type recovered from the field carries the same CRS
        let recovered = field.try_extension_type::<WkbType>()?;
        assert_eq!(
            recovered.metadata().crs,
            Some(Value::String("srid:1234".to_string()))
        );

        Ok(())
    }

    /// Only the binary storage types are accepted by the extension type
    #[test]
    fn test_extension_type_support() -> ArrowResult<()> {
        let wkb = WkbType::default();

        // supported types
        for supported in [
            DataType::Binary,
            DataType::LargeBinary,
            DataType::BinaryView,
        ] {
            wkb.supports_data_type(&supported)?;
        }

        // reject unsupported types with an error
        let rejected = wkb.supports_data_type(&DataType::Utf8);
        assert!(matches!(rejected, Err(ArrowError::InvalidArgumentError(_))));

        Ok(())
    }

    /// Common lon/lat CRS representations are dropped on serialization; others are kept
    #[test]
    fn test_crs_canonicalization() -> ArrowResult<()> {
        // Each of these is a recognized lon/lat CRS and should be omitted from the output
        let lon_lat_inputs = [
            // plain strings
            "EPSG:4326",
            "OGC:CRS84",
            // PROJJSON-like object for EPSG:4326, with "4326" as a string
            r#"{"id":{"authority":"EPSG","code":"4326"}}"#,
            // PROJJSON-like object for EPSG:4326, with 4326 as a number
            r#"{"id":{"authority":"EPSG","code":4326}}"#,
            // PROJJSON-like object for OGC:CRS84
            r#"{"id":{"authority":"OGC","code":"CRS84"}}"#,
        ];
        for crs in lon_lat_inputs {
            assert_eq!(serialize(Some(crs), None), "{}");
        }

        // Other input types should be preserved
        assert_eq!(
            serialize(Some("srid:1234"), None),
            r#"{"crs":"srid:1234"}"#
        );

        // Canonicalization should work with algorithm field
        assert_eq!(
            serialize(Some("EPSG:4326"), Some(Edges::Spherical)),
            r#"{"algorithm":"spherical"}"#
        );

        Ok(())
    }
}