parquet_geospatial/
bounding.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::collections::HashSet;
19
20use arrow_schema::ArrowError;
21use geo_traits::{
22    CoordTrait, Dimensions, GeometryCollectionTrait, GeometryTrait, GeometryType, LineStringTrait,
23    MultiLineStringTrait, MultiPointTrait, MultiPolygonTrait, PointTrait, PolygonTrait,
24};
25use wkb::reader::Wkb;
26
27use crate::interval::{Interval, IntervalTrait, WraparoundInterval};
28
29/// Geometry bounder
30///
31/// Utility to accumulate statistics for geometries as they are written.
32/// This bounder is designed to output statistics accumulated according
33/// to the Parquet specification such that the output can be written to
34/// Parquet statistics with minimal modification.
35///
36/// See the [IntervalTrait] for an in-depth discussion of wraparound bounding
37/// (which adds some complexity to this implementation).
38#[derive(Debug)]
39pub struct GeometryBounder {
40    /// Union of all contiguous x intervals to the left of the wraparound midpoint
41    x_left: Interval,
42    /// Union of all contiguous x intervals that intersect the wraparound midpoint
43    x_mid: Interval,
44    /// Union of all contiguous x intervals to the right of the wraparound midpoint
45    x_right: Interval,
46    /// Union of all y intervals
47    y: Interval,
48    /// Union of all z intervals
49    z: Interval,
50    /// Union of all m intervals
51    m: Interval,
52    /// Unique geometry type codes encountered by the bounder
53    ///
54    /// The integer codes are identical to the ISO WKB geometry type codes and
55    /// are documented as part of the Parquet specification:
56    /// <https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types>
57    geometry_types: HashSet<i32>,
58    wraparound_hint: Interval,
59}
60
61impl GeometryBounder {
62    /// Create a new, empty bounder that represents empty input
63    pub fn empty() -> Self {
64        Self {
65            x_left: Interval::empty(),
66            x_mid: Interval::empty(),
67            x_right: Interval::empty(),
68            y: Interval::empty(),
69            z: Interval::empty(),
70            m: Interval::empty(),
71            geometry_types: HashSet::<i32>::default(),
72            wraparound_hint: Interval::empty(),
73        }
74    }
75
76    /// Set the hint to use for generation of potential wraparound xmin/xmax output
77    ///
78    /// Usually this value should be set to (-180, 180), as wraparound is primarily
79    /// targeted at lon/lat coordinate systems where collections of features with
80    /// components at the very far left and very far right of the coordinate system
81    /// are actually very close to each other.
82    ///
83    /// It is safe to set this value even when the actual coordinate system of the
84    /// input is unknown: if the input has coordinate values that are outside the
85    /// range of the wraparound hint, wraparound xmin/xmax values will not be
86    /// generated. If the input has coordinate values that are well inside of the
87    /// range of the wraparound hint, the wraparound xmin/xmax value will be
88    /// substantially wider than the non-wraparound version and will not be returned.
89    pub fn with_wraparound_hint(self, wraparound_hint: impl Into<Interval>) -> Self {
90        Self {
91            wraparound_hint: wraparound_hint.into(),
92            ..self
93        }
94    }
95
96    /// Calculate the final xmin and xmax for geometries encountered by this bounder
97    ///
98    /// The interval returned may wraparound if a hint was set and the input
99    /// encountered by this bounder were exclusively at the far left and far right
100    /// of the input range. See [IntervalTrait] for an in-depth description of
101    /// wraparound intervals.
102    pub fn x(&self) -> WraparoundInterval {
103        let out_all = Interval::empty()
104            .merge_interval(&self.x_left)
105            .merge_interval(&self.x_mid)
106            .merge_interval(&self.x_right);
107
108        // Check if this even makes sense: if anything is covering the midpoint
109        // of the wraparound hint or the bounds don't make sense for the provided
110        // wraparound hint, just return the Cartesian bounds.
111        if !self.x_mid.is_empty() || !self.wraparound_hint.contains_interval(&out_all) {
112            return out_all.into();
113        }
114
115        // Check if our wraparound bounds are any better than our Cartesian bounds
116        // If the Cartesian bounds are tighter, return them.
117        let out_width = (self.x_left.hi() - self.wraparound_hint.lo())
118            + (self.wraparound_hint.hi() - self.x_right.hi());
119        if out_all.width() < out_width {
120            return out_all.into();
121        }
122
123        // Wraparound!
124        WraparoundInterval::new(self.x_right.lo(), self.x_left.hi())
125    }
126
127    /// Calculate the final ymin and ymax for geometries encountered by this bounder
128    pub fn y(&self) -> Interval {
129        self.y
130    }
131
132    /// Calculate the final zmin and zmax for geometries encountered by this bounder
133    pub fn z(&self) -> Interval {
134        self.z
135    }
136
137    /// Calculate the final mmin and mmax values for geometries encountered by this bounder
138    pub fn m(&self) -> Interval {
139        self.m
140    }
141
142    /// Calculate the final geometry type set
143    ///
144    /// Returns a copy of the unique geometry type/dimension combinations encountered
145    /// by this bounder. These identifiers are ISO WKB identifiers (e.g., 1001
146    /// for PointZ). The output is always returned sorted.
147    pub fn geometry_types(&self) -> Vec<i32> {
148        let mut out = self.geometry_types.iter().copied().collect::<Vec<_>>();
149        out.sort();
150        out
151    }
152
153    /// Update this bounder with one WKB-encoded geometry
154    ///
155    /// Parses and accumulates the bounds of one WKB-encoded geometry. This function
156    /// will error for invalid WKB input; however, clients may wish to ignore such
157    /// an error for the purposes of writing statistics.
158    pub fn update_wkb(&mut self, wkb: &[u8]) -> Result<(), ArrowError> {
159        let wkb = Wkb::try_new(wkb).map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
160        self.update_geometry(&wkb)?;
161        Ok(())
162    }
163
164    fn update_geometry(&mut self, geom: &impl GeometryTrait<T = f64>) -> Result<(), ArrowError> {
165        let geometry_type = geometry_type(geom)?;
166        self.geometry_types.insert(geometry_type);
167
168        visit_intervals(geom, 'x', &mut |x| self.update_x(&x))?;
169        visit_intervals(geom, 'y', &mut |y| self.y.update_interval(&y))?;
170        visit_intervals(geom, 'z', &mut |z| self.z.update_interval(&z))?;
171        visit_intervals(geom, 'm', &mut |m| self.m.update_interval(&m))?;
172
173        Ok(())
174    }
175
176    fn update_x(&mut self, x: &Interval) {
177        if x.hi() < self.wraparound_hint.mid() {
178            // If the x interval is completely to the left of the midpoint, merge it
179            // with x_left
180            self.x_left.update_interval(x);
181        } else if x.lo() > self.wraparound_hint.mid() {
182            // If the x interval is completely to the right of the midpoint, merge it
183            // with x_right
184            self.x_right.update_interval(x);
185        } else {
186            // Otherwise, merge it with x_mid
187            self.x_mid.update_interval(x);
188        }
189    }
190}
191
192/// Visit contiguous intervals for a given dimension within a [GeometryTrait]
193///
194/// Here, contiguous intervals refers to intervals that must not be separated
195/// by wraparound bounding. Point components of a geometry are visited as
196/// degenerate intervals of a single value; linestring or polygon ring components
197/// are visited as single intervals.
198fn visit_intervals(
199    geom: &impl GeometryTrait<T = f64>,
200    dimension: char,
201    func: &mut impl FnMut(Interval),
202) -> Result<(), ArrowError> {
203    let n = if let Some(n) = dimension_index(geom.dim(), dimension) {
204        n
205    } else {
206        return Ok(());
207    };
208
209    match geom.as_type() {
210        GeometryType::Point(pt) => {
211            if let Some(coord) = PointTrait::coord(pt) {
212                visit_point(coord, n, func);
213            }
214        }
215        GeometryType::LineString(ls) => {
216            visit_sequence(ls.coords(), n, func);
217        }
218        GeometryType::Polygon(pl) => {
219            if let Some(exterior) = pl.exterior() {
220                visit_sequence(exterior.coords(), n, func);
221            }
222
223            for interior in pl.interiors() {
224                visit_sequence(interior.coords(), n, func);
225            }
226        }
227        GeometryType::MultiPoint(multi_pt) => {
228            visit_collection(multi_pt.points(), dimension, func)?;
229        }
230        GeometryType::MultiLineString(multi_ls) => {
231            visit_collection(multi_ls.line_strings(), dimension, func)?;
232        }
233        GeometryType::MultiPolygon(multi_pl) => {
234            visit_collection(multi_pl.polygons(), dimension, func)?;
235        }
236        GeometryType::GeometryCollection(collection) => {
237            visit_collection(collection.geometries(), dimension, func)?;
238        }
239        _ => {
240            return Err(ArrowError::InvalidArgumentError(
241                "GeometryType not supported for dimension bounds".to_string(),
242            ));
243        }
244    }
245
246    Ok(())
247}
248
249/// Visit a point
250///
251/// Points can be separated by wraparound bounding even if they occur within
252/// the same feature, so we visit them as individual degenerate intervals.
253fn visit_point(coord: impl CoordTrait<T = f64>, n: usize, func: &mut impl FnMut(Interval)) {
254    let val = unsafe { coord.nth_unchecked(n) };
255    func((val, val).into());
256}
257
258/// Visit contiguous sequences
259///
260/// Sequences (e.g., linestrings or polygon rings) must always be considered
261/// together (i.e., are never separated by wraparound bounding).
262fn visit_sequence(
263    coords: impl IntoIterator<Item = impl CoordTrait<T = f64>>,
264    n: usize,
265    func: &mut impl FnMut(Interval),
266) {
267    let mut interval = Interval::empty();
268    for coord in coords {
269        interval.update_value(unsafe { coord.nth_unchecked(n) });
270    }
271
272    func(interval);
273}
274
275/// Visit intervals in a collection of geometries
276fn visit_collection(
277    collection: impl IntoIterator<Item = impl GeometryTrait<T = f64>>,
278    target: char,
279    func: &mut impl FnMut(Interval),
280) -> Result<(), ArrowError> {
281    for geom in collection {
282        visit_intervals(&geom, target, func)?;
283    }
284
285    Ok(())
286}
287
288/// Extract the geometry type code encountered by the bounder
289///
290/// The integer code is a ISO WKB geometry type codes is documented as part
291/// of the Parquet specification:
292/// <https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types>
293///
294/// This can also be derived from bytes 2-5 (possibly endian-swapped according to byte 1)
295/// of the input WKB buffer but is slightly clearer recomputed.
296fn geometry_type(geom: &impl GeometryTrait<T = f64>) -> Result<i32, ArrowError> {
297    let dimension_type = match geom.dim() {
298        Dimensions::Xy => 0,
299        Dimensions::Xyz => 1000,
300        Dimensions::Xym => 2000,
301        Dimensions::Xyzm => 3000,
302        Dimensions::Unknown(_) => {
303            return Err(ArrowError::InvalidArgumentError(
304                "Unsupported dimensions".to_string(),
305            ));
306        }
307    };
308
309    let geometry_type = match geom.as_type() {
310        GeometryType::Point(_) => 1,
311        GeometryType::LineString(_) => 2,
312        GeometryType::Polygon(_) => 3,
313        GeometryType::MultiPoint(_) => 4,
314        GeometryType::MultiLineString(_) => 5,
315        GeometryType::MultiPolygon(_) => 6,
316        GeometryType::GeometryCollection(_) => 7,
317        _ => {
318            return Err(ArrowError::InvalidArgumentError(
319                "GeometryType not supported for dimension bounds".to_string(),
320            ));
321        }
322    };
323
324    Ok(dimension_type + geometry_type)
325}
326
327fn dimension_index(dim: Dimensions, target: char) -> Option<usize> {
328    match target {
329        'x' => return Some(0),
330        'y' => return Some(1),
331        _ => {}
332    }
333
334    match (dim, target) {
335        (Dimensions::Xyz, 'z') => Some(2),
336        (Dimensions::Xym, 'm') => Some(2),
337        (Dimensions::Xyzm, 'z') => Some(2),
338        (Dimensions::Xyzm, 'm') => Some(3),
339        (_, _) => None,
340    }
341}
342
343#[cfg(test)]
344mod test {
345
346    use std::str::FromStr;
347
348    use wkt::Wkt;
349
350    use super::*;
351
352    fn wkt_bounds(
353        wkt_values: impl IntoIterator<Item = impl AsRef<str>>,
354    ) -> Result<GeometryBounder, ArrowError> {
355        wkt_bounds_with_wraparound(wkt_values, Interval::empty())
356    }
357
358    fn wkt_bounds_with_wraparound(
359        wkt_values: impl IntoIterator<Item = impl AsRef<str>>,
360        wraparound: impl Into<Interval>,
361    ) -> Result<GeometryBounder, ArrowError> {
362        let mut bounder = GeometryBounder::empty().with_wraparound_hint(wraparound);
363        for wkt_value in wkt_values {
364            let wkt: Wkt = Wkt::from_str(wkt_value.as_ref())
365                .map_err(|e| ArrowError::InvalidArgumentError(e.to_string()))?;
366            bounder.update_geometry(&wkt)?;
367        }
368        Ok(bounder)
369    }
370
371    #[test]
372    fn test_wkb() {
373        let wkt: Wkt = Wkt::from_str("LINESTRING (0 1, 2 3)").unwrap();
374        let mut wkb = Vec::new();
375        wkb::writer::write_geometry(&mut wkb, &wkt, &Default::default()).unwrap();
376
377        let mut bounds = GeometryBounder::empty();
378        bounds.update_wkb(&wkb).unwrap();
379
380        assert_eq!(bounds.x(), (0, 2).into());
381        assert_eq!(bounds.y(), (1, 3).into());
382    }
383
384    #[test]
385    fn test_geometry_types() {
386        let empties = [
387            "POINT EMPTY",
388            "LINESTRING EMPTY",
389            "POLYGON EMPTY",
390            "MULTIPOINT EMPTY",
391            "MULTILINESTRING EMPTY",
392            "MULTIPOLYGON EMPTY",
393            "GEOMETRYCOLLECTION EMPTY",
394        ];
395
396        assert_eq!(
397            wkt_bounds(empties).unwrap().geometry_types(),
398            vec![1, 2, 3, 4, 5, 6, 7]
399        );
400
401        let empties_z = [
402            "POINT Z EMPTY",
403            "LINESTRING Z EMPTY",
404            "POLYGON Z EMPTY",
405            "MULTIPOINT Z EMPTY",
406            "MULTILINESTRING Z EMPTY",
407            "MULTIPOLYGON Z EMPTY",
408            "GEOMETRYCOLLECTION Z EMPTY",
409        ];
410
411        assert_eq!(
412            wkt_bounds(empties_z).unwrap().geometry_types(),
413            vec![1001, 1002, 1003, 1004, 1005, 1006, 1007]
414        );
415
416        let empties_m = [
417            "POINT M EMPTY",
418            "LINESTRING M EMPTY",
419            "POLYGON M EMPTY",
420            "MULTIPOINT M EMPTY",
421            "MULTILINESTRING M EMPTY",
422            "MULTIPOLYGON M EMPTY",
423            "GEOMETRYCOLLECTION M EMPTY",
424        ];
425
426        assert_eq!(
427            wkt_bounds(empties_m).unwrap().geometry_types(),
428            vec![2001, 2002, 2003, 2004, 2005, 2006, 2007]
429        );
430
431        let empties_zm = [
432            "POINT ZM EMPTY",
433            "LINESTRING ZM EMPTY",
434            "POLYGON ZM EMPTY",
435            "MULTIPOINT ZM EMPTY",
436            "MULTILINESTRING ZM EMPTY",
437            "MULTIPOLYGON ZM EMPTY",
438            "GEOMETRYCOLLECTION ZM EMPTY",
439        ];
440
441        assert_eq!(
442            wkt_bounds(empties_zm).unwrap().geometry_types(),
443            vec![3001, 3002, 3003, 3004, 3005, 3006, 3007]
444        );
445    }
446
447    #[test]
448    fn test_bounds_empty() {
449        let empties = [
450            "POINT EMPTY",
451            "LINESTRING EMPTY",
452            "POLYGON EMPTY",
453            "MULTIPOINT EMPTY",
454            "MULTILINESTRING EMPTY",
455            "MULTIPOLYGON EMPTY",
456            "GEOMETRYCOLLECTION EMPTY",
457        ];
458
459        let bounds = wkt_bounds(empties).unwrap();
460        assert!(bounds.x().is_empty());
461        assert!(bounds.y().is_empty());
462        assert!(bounds.z().is_empty());
463        assert!(bounds.m().is_empty());
464
465        // With wraparound, still empty
466        let bounds = wkt_bounds_with_wraparound(empties, (-180, 180)).unwrap();
467        assert!(bounds.x().is_empty());
468        assert!(bounds.y().is_empty());
469        assert!(bounds.z().is_empty());
470        assert!(bounds.m().is_empty());
471    }
472
473    #[test]
474    fn test_bounds_coord() {
475        let bounds = wkt_bounds(["POINT (0 1)", "POINT (2 3)"]).unwrap();
476        assert_eq!(bounds.x(), (0, 2).into());
477        assert_eq!(bounds.y(), (1, 3).into());
478        assert!(bounds.z().is_empty());
479        assert!(bounds.m().is_empty());
480
481        let bounds = wkt_bounds(["POINT Z (0 1 2)", "POINT Z (3 4 5)"]).unwrap();
482        assert_eq!(bounds.x(), (0, 3).into());
483        assert_eq!(bounds.y(), (1, 4).into());
484        assert_eq!(bounds.z(), (2, 5).into());
485        assert!(bounds.m().is_empty());
486
487        let bounds = wkt_bounds(["POINT M (0 1 2)", "POINT M (3 4 5)"]).unwrap();
488        assert_eq!(bounds.x(), (0, 3).into());
489        assert_eq!(bounds.y(), (1, 4).into());
490        assert!(bounds.z().is_empty());
491        assert_eq!(bounds.m(), (2, 5).into());
492
493        let bounds = wkt_bounds(["POINT ZM (0 1 2 3)", "POINT ZM (4 5 6 7)"]).unwrap();
494        assert_eq!(bounds.x(), (0, 4).into());
495        assert_eq!(bounds.y(), (1, 5).into());
496        assert_eq!(bounds.z(), (2, 6).into());
497        assert_eq!(bounds.m(), (3, 7).into());
498    }
499
500    #[test]
501    fn test_bounds_sequence() {
502        let bounds = wkt_bounds(["LINESTRING (0 1, 2 3)"]).unwrap();
503        assert_eq!(bounds.x(), (0, 2).into());
504        assert_eq!(bounds.y(), (1, 3).into());
505        assert!(bounds.z().is_empty());
506        assert!(bounds.m().is_empty());
507
508        let bounds = wkt_bounds(["LINESTRING Z (0 1 2, 3 4 5)"]).unwrap();
509        assert_eq!(bounds.x(), (0, 3).into());
510        assert_eq!(bounds.y(), (1, 4).into());
511        assert_eq!(bounds.z(), (2, 5).into());
512        assert!(bounds.m().is_empty());
513
514        let bounds = wkt_bounds(["LINESTRING M (0 1 2, 3 4 5)"]).unwrap();
515        assert_eq!(bounds.x(), (0, 3).into());
516        assert_eq!(bounds.y(), (1, 4).into());
517        assert!(bounds.z().is_empty());
518        assert_eq!(bounds.m(), (2, 5).into());
519
520        let bounds = wkt_bounds(["LINESTRING ZM (0 1 2 3, 4 5 6 7)"]).unwrap();
521        assert_eq!(bounds.x(), (0, 4).into());
522        assert_eq!(bounds.y(), (1, 5).into());
523        assert_eq!(bounds.z(), (2, 6).into());
524        assert_eq!(bounds.m(), (3, 7).into());
525    }
526
527    #[test]
528    fn test_bounds_geometry_type() {
529        let bounds = wkt_bounds(["POINT (0 1)", "POINT (2 3)"]).unwrap();
530        assert_eq!(bounds.x(), (0, 2).into());
531        assert_eq!(bounds.y(), (1, 3).into());
532
533        let bounds = wkt_bounds(["LINESTRING (0 1, 2 3)"]).unwrap();
534        assert_eq!(bounds.x(), (0, 2).into());
535        assert_eq!(bounds.y(), (1, 3).into());
536
537        // Normally interiors are supposed to be inside the exterior; however, we
538        // include a poorly formed polygon just to make sure they are considered
539        let bounds =
540            wkt_bounds(["POLYGON ((0 0, 0 1, 1 0, 0 0), (10 10, 10 11, 11 10, 10 10))"]).unwrap();
541        assert_eq!(bounds.x(), (0, 11).into());
542        assert_eq!(bounds.y(), (0, 11).into());
543
544        let bounds = wkt_bounds(["MULTIPOINT ((0 1), (2 3))"]).unwrap();
545        assert_eq!(bounds.x(), (0, 2).into());
546        assert_eq!(bounds.y(), (1, 3).into());
547
548        let bounds = wkt_bounds(["MULTILINESTRING ((0 1, 2 3))"]).unwrap();
549        assert_eq!(bounds.x(), (0, 2).into());
550        assert_eq!(bounds.y(), (1, 3).into());
551
552        let bounds = wkt_bounds(["MULTIPOLYGON (((0 0, 0 1, 1 0, 0 0)))"]).unwrap();
553        assert_eq!(bounds.x(), (0, 1).into());
554        assert_eq!(bounds.y(), (0, 1).into());
555
556        let bounds = wkt_bounds(["GEOMETRYCOLLECTION (POINT (0 1), POINT (2 3))"]).unwrap();
557        assert_eq!(bounds.x(), (0, 2).into());
558        assert_eq!(bounds.y(), (1, 3).into());
559    }
560
561    #[test]
562    fn test_bounds_wrap_basic() {
563        let geoms = ["POINT (-170 0)", "POINT (170 0)"];
564
565        // No wraparound because it was disabled
566        let bounds = wkt_bounds_with_wraparound(geoms, Interval::empty()).unwrap();
567        assert_eq!(bounds.x(), (-170, 170).into());
568
569        // Wraparound that can't happen because something is covering
570        // the midpoint.
571        let mut geoms_with_mid = geoms.to_vec();
572        geoms_with_mid.push("LINESTRING (-10 0, 10 0)");
573        let bounds = wkt_bounds_with_wraparound(geoms_with_mid, (-180, 180)).unwrap();
574        assert_eq!(bounds.x(), (-170, 170).into());
575
576        // Wraparound where the wrapped box is *not* better
577        let bounds = wkt_bounds_with_wraparound(geoms, (-1000, 1000)).unwrap();
578        assert_eq!(bounds.x(), (-170, 170).into());
579
580        // Wraparound where the wrapped box is inappropriate because it is
581        // outside the wrap hint
582        let bounds = wkt_bounds_with_wraparound(geoms, (-10, 10)).unwrap();
583        assert_eq!(bounds.x(), (-170, 170).into());
584
585        // Wraparound where the wrapped box *is* better
586        let bounds = wkt_bounds_with_wraparound(geoms, (-180, 180)).unwrap();
587        assert_eq!(bounds.x(), (170, -170).into());
588    }
589
590    #[test]
591    fn test_bounds_wrap_multipart() {
592        let fiji = "MULTIPOLYGON (
593        ((-180 -15.51, -180 -19.78, -178.61 -21.14, -178.02 -18.22, -178.57 -16.04, -180 -15.51)),
594        ((180 -15.51, 177.98 -16.25, 176.67 -17.14, 177.83 -19.31, 180 -19.78, 180 -15.51))
595        )";
596
597        let bounds = wkt_bounds_with_wraparound([fiji], (-180, 180)).unwrap();
598        assert!(bounds.x().is_wraparound());
599        assert_eq!(bounds.x(), (176.67, -178.02).into());
600        assert_eq!(bounds.y(), (-21.14, -15.51).into());
601    }
602}