parquet/file/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains definitions for working with Parquet statistics.
19//!
//! Although some common methods are available directly on the [`Statistics`] enum,
//! use pattern matching to extract the actual min and max values, as shown below:
22//!
23//! # Examples
24//! ```rust
25//! use parquet::file::statistics::Statistics;
26//!
27//! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true);
28//! assert_eq!(stats.null_count_opt(), Some(3));
29//! assert!(stats.is_min_max_deprecated());
30//! assert!(stats.min_is_exact());
31//! assert!(stats.max_is_exact());
32//!
33//! match stats {
34//!     Statistics::Int32(ref typed) => {
35//!         assert_eq!(typed.min_opt(), Some(&1));
36//!         assert_eq!(typed.max_opt(), Some(&10));
37//!     }
38//!     _ => {}
39//! }
40//! ```
41
42use std::fmt;
43
44use crate::basic::Type;
45use crate::data_type::private::ParquetValueType;
46use crate::data_type::*;
47use crate::errors::{ParquetError, Result};
48use crate::file::metadata::thrift::PageStatistics;
49use crate::util::bit_util::FromBytes;
50
51pub(crate) mod private {
52    use super::*;
53
54    pub trait MakeStatistics {
55        fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
56        where
57            Self: Sized;
58    }
59
60    macro_rules! gen_make_statistics {
61        ($value_ty:ty, $stat:ident) => {
62            impl MakeStatistics for $value_ty {
63                fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
64                where
65                    Self: Sized,
66                {
67                    Statistics::$stat(statistics)
68                }
69            }
70        };
71    }
72
73    gen_make_statistics!(bool, Boolean);
74    gen_make_statistics!(i32, Int32);
75    gen_make_statistics!(i64, Int64);
76    gen_make_statistics!(Int96, Int96);
77    gen_make_statistics!(f32, Float);
78    gen_make_statistics!(f64, Double);
79    gen_make_statistics!(ByteArray, ByteArray);
80    gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
81}
82
/// Generates a named `Statistics` constructor for a single variant.
///
/// Invoked as `statistics_new_func![fn_name, ValueType, Variant]`; the generated
/// function forwards its arguments to `ValueStatistics::new` and wraps the result
/// in `Statistics::Variant`.
macro_rules! statistics_new_func {
    ($ctor:ident, $value:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $ctor(
            min: $value,
            max: $value,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
104
// Dispatches a zero-argument method call to the typed statistics held by
// whichever `Statistics` variant `$receiver` refers to.
macro_rules! statistics_enum_func {
    ($receiver:ident, $method:ident) => {{
        match $receiver {
            Statistics::Boolean(typed) => typed.$method(),
            Statistics::Int32(typed) => typed.$method(),
            Statistics::Int64(typed) => typed.$method(),
            Statistics::Int96(typed) => typed.$method(),
            Statistics::Float(typed) => typed.$method(),
            Statistics::Double(typed) => typed.$method(),
            Statistics::ByteArray(typed) => typed.$method(),
            Statistics::FixedLenByteArray(typed) => typed.$method(),
        }
    }};
}
120
121/// Converts Thrift definition into `Statistics`.
122pub(crate) fn from_thrift_page_stats(
123    physical_type: Type,
124    thrift_stats: Option<PageStatistics>,
125) -> Result<Option<Statistics>> {
126    Ok(match thrift_stats {
127        Some(stats) => {
128            // Number of nulls recorded, when it is not available, we just mark it as 0.
129            // TODO this should be `None` if there is no information about NULLS.
130            // see https://github.com/apache/arrow-rs/pull/6216/files
131            let null_count = stats.null_count.unwrap_or(0);
132
133            if null_count < 0 {
134                return Err(ParquetError::General(format!(
135                    "Statistics null count is negative {null_count}",
136                )));
137            }
138
139            // Generic null count.
140            let null_count = Some(null_count as u64);
141            // Generic distinct count (count of distinct values occurring)
142            let distinct_count = stats.distinct_count.map(|value| value as u64);
143            // Whether or not statistics use deprecated min/max fields.
144            let old_format = stats.min_value.is_none() && stats.max_value.is_none();
145            // Generic min value as bytes.
146            let min = if old_format {
147                stats.min
148            } else {
149                stats.min_value
150            };
151            // Generic max value as bytes.
152            let max = if old_format {
153                stats.max
154            } else {
155                stats.max_value
156            };
157
158            fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
159                if let Some(min) = min {
160                    if min.len() < len {
161                        return Err(ParquetError::General(
162                            "Insufficient bytes to parse min statistic".to_string(),
163                        ));
164                    }
165                }
166                if let Some(max) = max {
167                    if max.len() < len {
168                        return Err(ParquetError::General(
169                            "Insufficient bytes to parse max statistic".to_string(),
170                        ));
171                    }
172                }
173                Ok(())
174            }
175
176            match physical_type {
177                Type::BOOLEAN => check_len(&min, &max, 1),
178                Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
179                Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
180                Type::INT96 => check_len(&min, &max, 12),
181                _ => Ok(()),
182            }?;
183
184            // Values are encoded using PLAIN encoding definition, except that
185            // variable-length byte arrays do not include a length prefix.
186            //
187            // Instead of using actual decoder, we manually convert values.
188            let res = match physical_type {
189                Type::BOOLEAN => Statistics::boolean(
190                    min.map(|data| data[0] != 0),
191                    max.map(|data| data[0] != 0),
192                    distinct_count,
193                    null_count,
194                    old_format,
195                ),
196                Type::INT32 => Statistics::int32(
197                    min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
198                    max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
199                    distinct_count,
200                    null_count,
201                    old_format,
202                ),
203                Type::INT64 => Statistics::int64(
204                    min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
205                    max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
206                    distinct_count,
207                    null_count,
208                    old_format,
209                ),
210                Type::INT96 => {
211                    // INT96 statistics may not be correct, because comparison is signed
212                    let min = if let Some(data) = min {
213                        if data.len() != 12 {
214                            return Err(ParquetError::General(
215                                "Incorrect Int96 min statistics".to_string(),
216                            ));
217                        }
218                        Some(Int96::try_from_le_slice(&data)?)
219                    } else {
220                        None
221                    };
222                    let max = if let Some(data) = max {
223                        if data.len() != 12 {
224                            return Err(ParquetError::General(
225                                "Incorrect Int96 max statistics".to_string(),
226                            ));
227                        }
228                        Some(Int96::try_from_le_slice(&data)?)
229                    } else {
230                        None
231                    };
232                    Statistics::int96(min, max, distinct_count, null_count, old_format)
233                }
234                Type::FLOAT => Statistics::float(
235                    min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
236                    max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
237                    distinct_count,
238                    null_count,
239                    old_format,
240                ),
241                Type::DOUBLE => Statistics::double(
242                    min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
243                    max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
244                    distinct_count,
245                    null_count,
246                    old_format,
247                ),
248                Type::BYTE_ARRAY => Statistics::ByteArray(
249                    ValueStatistics::new(
250                        min.map(ByteArray::from),
251                        max.map(ByteArray::from),
252                        distinct_count,
253                        null_count,
254                        old_format,
255                    )
256                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
257                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
258                ),
259                Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
260                    ValueStatistics::new(
261                        min.map(ByteArray::from).map(FixedLenByteArray::from),
262                        max.map(ByteArray::from).map(FixedLenByteArray::from),
263                        distinct_count,
264                        null_count,
265                        old_format,
266                    )
267                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
268                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
269                ),
270            };
271
272            Some(res)
273        }
274        None => None,
275    })
276}
277
278/// Convert Statistics into Thrift definition.
279pub(crate) fn page_stats_to_thrift(stats: Option<&Statistics>) -> Option<PageStatistics> {
280    let stats = stats?;
281
282    // record null count if it can fit in i64
283    let null_count = stats
284        .null_count_opt()
285        .and_then(|value| i64::try_from(value).ok());
286
287    // record distinct count if it can fit in i64
288    let distinct_count = stats
289        .distinct_count_opt()
290        .and_then(|value| i64::try_from(value).ok());
291
292    let mut thrift_stats = PageStatistics {
293        max: None,
294        min: None,
295        null_count,
296        distinct_count,
297        max_value: None,
298        min_value: None,
299        is_max_value_exact: None,
300        is_min_value_exact: None,
301    };
302
303    // Get min/max if set.
304    let (min, max, min_exact, max_exact) = (
305        stats.min_bytes_opt().map(|x| x.to_vec()),
306        stats.max_bytes_opt().map(|x| x.to_vec()),
307        Some(stats.min_is_exact()),
308        Some(stats.max_is_exact()),
309    );
310    if stats.is_min_max_backwards_compatible() {
311        // Copy to deprecated min, max values for compatibility with older readers
312        thrift_stats.min.clone_from(&min);
313        thrift_stats.max.clone_from(&max);
314    }
315
316    if !stats.is_min_max_deprecated() {
317        thrift_stats.min_value = min;
318        thrift_stats.max_value = max;
319    }
320
321    thrift_stats.is_min_value_exact = min_exact;
322    thrift_stats.is_max_value_exact = max_exact;
323
324    Some(thrift_stats)
325}
326
/// Strongly typed statistics for a column chunk within a row group.
///
/// This structure is a natively typed, in memory representation of the thrift
/// `Statistics` structure in a Parquet file footer. The statistics stored in
/// this structure can be used by query engines to skip decoding pages while
/// reading parquet data.
///
/// Each variant wraps a [`ValueStatistics`] specialised to the native value
/// type used for the corresponding Parquet physical type.
///
/// Page level statistics are stored separately, in [ColumnIndexMetaData].
///
/// [ColumnIndexMetaData]: crate::file::page_index::column_index::ColumnIndexMetaData
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
    /// Statistics for Boolean column
    Boolean(ValueStatistics<bool>),
    /// Statistics for Int32 column
    Int32(ValueStatistics<i32>),
    /// Statistics for Int64 column
    Int64(ValueStatistics<i64>),
    /// Statistics for Int96 column
    Int96(ValueStatistics<Int96>),
    /// Statistics for Float column
    Float(ValueStatistics<f32>),
    /// Statistics for Double column
    Double(ValueStatistics<f64>),
    /// Statistics for ByteArray column
    ByteArray(ValueStatistics<ByteArray>),
    /// Statistics for FixedLenByteArray column
    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
356
357impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
358    fn from(t: ValueStatistics<T>) -> Self {
359        T::make_statistics(t)
360    }
361}
362
363impl Statistics {
364    /// Creates new statistics for a column type
365    pub fn new<T: ParquetValueType>(
366        min: Option<T>,
367        max: Option<T>,
368        distinct_count: Option<u64>,
369        null_count: Option<u64>,
370        is_deprecated: bool,
371    ) -> Self {
372        Self::from(ValueStatistics::new(
373            min,
374            max,
375            distinct_count,
376            null_count,
377            is_deprecated,
378        ))
379    }
380
381    statistics_new_func![boolean, Option<bool>, Boolean];
382
383    statistics_new_func![int32, Option<i32>, Int32];
384
385    statistics_new_func![int64, Option<i64>, Int64];
386
387    statistics_new_func![int96, Option<Int96>, Int96];
388
389    statistics_new_func![float, Option<f32>, Float];
390
391    statistics_new_func![double, Option<f64>, Double];
392
393    statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
394
395    statistics_new_func![
396        fixed_len_byte_array,
397        Option<FixedLenByteArray>,
398        FixedLenByteArray
399    ];
400
401    /// Returns `true` if statistics have old `min` and `max` fields set.
402    /// This means that the column order is likely to be undefined, which, for old files
403    /// could mean a signed sort order of values.
404    ///
405    /// Refer to [`ColumnOrder`](crate::basic::ColumnOrder) and
406    /// [`SortOrder`](crate::basic::SortOrder) for more information.
407    pub fn is_min_max_deprecated(&self) -> bool {
408        statistics_enum_func![self, is_min_max_deprecated]
409    }
410
411    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
412    /// using signed comparison. This resulted in an undefined ordering for unsigned
413    /// quantities, such as booleans and unsigned integers.
414    ///
415    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
416    /// which have a type-defined sort order.
417    ///
418    /// However, not all readers have been updated. For backwards compatibility, this method
419    /// returns `true` if the statistics within this have a signed sort order, that is
420    /// compatible with being stored in the deprecated `min` and `max` fields
421    pub fn is_min_max_backwards_compatible(&self) -> bool {
422        statistics_enum_func![self, is_min_max_backwards_compatible]
423    }
424
425    /// Returns optional value of number of distinct values occurring.
426    /// When it is `None`, the value should be ignored.
427    pub fn distinct_count_opt(&self) -> Option<u64> {
428        statistics_enum_func![self, distinct_count]
429    }
430
431    /// Returns number of null values for the column, if known.
432    /// Note that this includes all nulls when column is part of the complex type.
433    ///
434    /// Note this API returns Some(0) even if the null count was not present
435    /// in the statistics.
436    /// See <https://github.com/apache/arrow-rs/pull/6216/files>
437    pub fn null_count_opt(&self) -> Option<u64> {
438        statistics_enum_func![self, null_count_opt]
439    }
440
441    /// Returns `true` if the min value is set, and is an exact min value.
442    pub fn min_is_exact(&self) -> bool {
443        statistics_enum_func![self, min_is_exact]
444    }
445
446    /// Returns `true` if the max value is set, and is an exact max value.
447    pub fn max_is_exact(&self) -> bool {
448        statistics_enum_func![self, max_is_exact]
449    }
450
451    /// Returns slice of bytes that represent min value, if min value is known.
452    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
453        statistics_enum_func![self, min_bytes_opt]
454    }
455
456    /// Returns slice of bytes that represent max value, if max value is known.
457    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
458        statistics_enum_func![self, max_bytes_opt]
459    }
460
461    /// Returns physical type associated with statistics.
462    pub fn physical_type(&self) -> Type {
463        match self {
464            Statistics::Boolean(_) => Type::BOOLEAN,
465            Statistics::Int32(_) => Type::INT32,
466            Statistics::Int64(_) => Type::INT64,
467            Statistics::Int96(_) => Type::INT96,
468            Statistics::Float(_) => Type::FLOAT,
469            Statistics::Double(_) => Type::DOUBLE,
470            Statistics::ByteArray(_) => Type::BYTE_ARRAY,
471            Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
472        }
473    }
474}
475
476impl fmt::Display for Statistics {
477    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
478        match self {
479            Statistics::Boolean(typed) => write!(f, "{typed}"),
480            Statistics::Int32(typed) => write!(f, "{typed}"),
481            Statistics::Int64(typed) => write!(f, "{typed}"),
482            Statistics::Int96(typed) => write!(f, "{typed}"),
483            Statistics::Float(typed) => write!(f, "{typed}"),
484            Statistics::Double(typed) => write!(f, "{typed}"),
485            Statistics::ByteArray(typed) => write!(f, "{typed}"),
486            Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
487        }
488    }
489}
490
/// Typed implementation for [`Statistics`].
///
/// Alias for [`ValueStatistics`] over the value type `T::T` associated with the
/// [`DataType`] `T`.
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
493
/// Statistics for one column chunk, holding values of type `T`.
///
/// See [`Statistics`] for more details
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    /// Lower bound, if known.
    min: Option<T>,
    /// Upper bound, if known.
    max: Option<T>,
    /// Number of distinct values; may be omitted.
    distinct_count: Option<u64>,
    /// Number of null values, if known.
    null_count: Option<u64>,

    /// Whether `max` is exact rather than truncated.
    is_max_value_exact: bool,
    /// Whether `min` is exact rather than truncated.
    is_min_value_exact: bool,

    /// If `true` populate the deprecated `min` and `max` fields instead of
    /// `min_value` and `max_value`
    is_min_max_deprecated: bool,

    /// If `true` the statistics are compatible with the deprecated `min` and
    /// `max` fields. See [`ValueStatistics::is_min_max_backwards_compatible`]
    is_min_max_backwards_compatible: bool,
}

impl<T> ValueStatistics<T> {
    /// Creates new typed statistics.
    ///
    /// A bound that is present starts out flagged as exact, and backwards
    /// compatibility starts out equal to `is_min_max_deprecated`; use the
    /// `with_*` methods to override either.
    pub fn new(
        min: Option<T>,
        max: Option<T>,
        distinct_count: Option<u64>,
        null_count: Option<u64>,
        is_min_max_deprecated: bool,
    ) -> Self {
        let is_min_value_exact = min.is_some();
        let is_max_value_exact = max.is_some();
        Self {
            min,
            max,
            distinct_count,
            null_count,
            is_max_value_exact,
            is_min_value_exact,
            is_min_max_deprecated,
            is_min_max_backwards_compatible: is_min_max_deprecated,
        }
    }

    /// Sets whether the stored `min` is the exact minimum, or only a bound
    /// on the minimum value.
    ///
    /// see [`Self::min_is_exact`]
    pub fn with_min_is_exact(mut self, is_min_value_exact: bool) -> Self {
        self.is_min_value_exact = is_min_value_exact;
        self
    }

    /// Sets whether the stored `max` is the exact maximum, or only a bound
    /// on the maximum value.
    ///
    /// see [`Self::max_is_exact`]
    pub fn with_max_is_exact(mut self, is_max_value_exact: bool) -> Self {
        self.is_max_value_exact = is_max_value_exact;
        self
    }

    /// Sets whether to write the deprecated `min` and `max` fields
    /// for compatibility with older readers.
    ///
    /// This should only be enabled if the field is signed,
    /// see [`Self::is_min_max_backwards_compatible`]
    pub fn with_backwards_compatible_min_max(mut self, backwards_compatible: bool) -> Self {
        self.is_min_max_backwards_compatible = backwards_compatible;
        self
    }

    /// Returns the min value of the statistics, if known.
    pub fn min_opt(&self) -> Option<&T> {
        self.min.as_ref()
    }

    /// Returns the max value of the statistics, if known.
    pub fn max_opt(&self) -> Option<&T> {
        self.max.as_ref()
    }

    /// Whether both the min and the max value are set.
    /// Normally both min/max values will be set to `Some(value)` or `None`.
    pub(crate) fn _internal_has_min_max_set(&self) -> bool {
        matches!((&self.min, &self.max), (Some(_), Some(_)))
    }

    /// Whether the max value is set and flagged as exact.
    pub fn max_is_exact(&self) -> bool {
        self.is_max_value_exact && self.max.is_some()
    }

    /// Whether the min value is set and flagged as exact.
    pub fn min_is_exact(&self) -> bool {
        self.is_min_value_exact && self.min.is_some()
    }

    /// Returns the number of distinct values occurring, if recorded.
    pub fn distinct_count(&self) -> Option<u64> {
        self.distinct_count
    }

    /// Returns the null count, if recorded.
    pub fn null_count_opt(&self) -> Option<u64> {
        self.null_count
    }

    /// Returns `true` if statistics were created using old min/max fields.
    fn is_min_max_deprecated(&self) -> bool {
        self.is_min_max_deprecated
    }

    /// Returns `true` if these statistics have a signed sort order and can
    /// therefore also be stored in the deprecated `min` and `max` fields.
    ///
    /// Old versions of parquet stored statistics in `min` and `max`, ordered
    /// using signed comparison, which left the ordering undefined for unsigned
    /// quantities such as booleans and unsigned integers. Those fields were
    /// deprecated in favour of `min_value` and `max_value`, which have a
    /// type-defined sort order, but not all readers have been updated.
    pub fn is_min_max_backwards_compatible(&self) -> bool {
        self.is_min_max_backwards_compatible
    }
}
628
629impl<T: AsBytes> ValueStatistics<T> {
630    /// Returns min value as bytes of the statistics, if min value is known.
631    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
632        self.min_opt().map(AsBytes::as_bytes)
633    }
634
635    /// Returns max value as bytes of the statistics, if max value is known.
636    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
637        self.max_opt().map(AsBytes::as_bytes)
638    }
639}
640
641impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
642    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
643        write!(f, "{{")?;
644        write!(f, "min: ")?;
645        match self.min {
646            Some(ref value) => write!(f, "{value}")?,
647            None => write!(f, "N/A")?,
648        }
649        write!(f, ", max: ")?;
650        match self.max {
651            Some(ref value) => write!(f, "{value}")?,
652            None => write!(f, "N/A")?,
653        }
654        write!(f, ", distinct_count: ")?;
655        match self.distinct_count {
656            Some(value) => write!(f, "{value}")?,
657            None => write!(f, "N/A")?,
658        }
659        write!(f, ", null_count: ")?;
660        match self.null_count {
661            Some(value) => write!(f, "{value}")?,
662            None => write!(f, "N/A")?,
663        }
664        write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
665        write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
666        write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
667        write!(f, "}}")
668    }
669}
670
671impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
672    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
673        write!(
674            f,
675            "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
676             min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
677            self.min,
678            self.max,
679            self.distinct_count,
680            self.null_count,
681            self.is_min_max_deprecated,
682            self.is_min_max_backwards_compatible,
683            self.is_max_value_exact,
684            self.is_min_value_exact
685        )
686    }
687}
688
689#[cfg(test)]
690mod tests {
691    use super::*;
692
693    #[test]
694    fn test_statistics_min_max_bytes() {
695        let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
696        assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes()));
697        assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes()));
698
699        let stats = Statistics::byte_array(
700            Some(ByteArray::from(vec![1, 2, 3])),
701            Some(ByteArray::from(vec![3, 4, 5])),
702            None,
703            Some(1),
704            true,
705        );
706        assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
707        assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
708    }
709
710    #[test]
711    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
712    fn test_statistics_negative_null_count() {
713        let thrift_stats = PageStatistics {
714            max: None,
715            min: None,
716            null_count: Some(-10),
717            distinct_count: None,
718            max_value: None,
719            min_value: None,
720            is_max_value_exact: None,
721            is_min_value_exact: None,
722        };
723
724        from_thrift_page_stats(Type::INT32, Some(thrift_stats)).unwrap();
725    }
726
727    #[test]
728    fn test_statistics_thrift_none() {
729        assert_eq!(from_thrift_page_stats(Type::INT32, None).unwrap(), None);
730        assert_eq!(
731            from_thrift_page_stats(Type::BYTE_ARRAY, None).unwrap(),
732            None
733        );
734    }
735
736    #[test]
737    fn test_statistics_debug() {
738        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
739        assert_eq!(
740            format!("{stats:?}"),
741            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
742             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
743        );
744
745        let stats = Statistics::int32(None, None, None, Some(7), false);
746        assert_eq!(
747            format!("{stats:?}"),
748            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
749             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
750        )
751    }
752
753    #[test]
754    fn test_statistics_display() {
755        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
756        assert_eq!(
757            format!("{stats}"),
758            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
759        );
760
761        let stats = Statistics::int64(None, None, None, Some(7), false);
762        assert_eq!(
763            format!("{stats}"),
764            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
765             false, max_value_exact: false, min_value_exact: false}"
766        );
767
768        let stats = Statistics::int96(
769            Some(Int96::from(vec![1, 0, 0])),
770            Some(Int96::from(vec![2, 3, 4])),
771            None,
772            Some(3),
773            true,
774        );
775        assert_eq!(
776            format!("{stats}"),
777            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
778             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
779        );
780
781        let stats = Statistics::ByteArray(
782            ValueStatistics::new(
783                Some(ByteArray::from(vec![1u8])),
784                Some(ByteArray::from(vec![2u8])),
785                Some(5),
786                Some(7),
787                false,
788            )
789            .with_max_is_exact(false)
790            .with_min_is_exact(false),
791        );
792        assert_eq!(
793            format!("{stats}"),
794            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
795        );
796    }
797
798    #[test]
799    fn test_statistics_partial_eq() {
800        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);
801
802        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), true) == expected);
803        assert!(Statistics::int32(Some(11), Some(45), None, Some(11), true) != expected);
804        assert!(Statistics::int32(Some(12), Some(44), None, Some(11), true) != expected);
805        assert!(Statistics::int32(Some(12), Some(45), None, Some(23), true) != expected);
806        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), false) != expected);
807
808        assert!(
809            Statistics::int32(Some(12), Some(45), None, Some(11), false)
810                != Statistics::int64(Some(12), Some(45), None, Some(11), false)
811        );
812
813        assert!(
814            Statistics::boolean(Some(false), Some(true), None, None, true)
815                != Statistics::double(Some(1.2), Some(4.5), None, None, true)
816        );
817
818        assert!(
819            Statistics::byte_array(
820                Some(ByteArray::from(vec![1, 2, 3])),
821                Some(ByteArray::from(vec![1, 2, 3])),
822                None,
823                None,
824                true
825            ) != Statistics::fixed_len_byte_array(
826                Some(ByteArray::from(vec![1, 2, 3]).into()),
827                Some(ByteArray::from(vec![1, 2, 3]).into()),
828                None,
829                None,
830                true,
831            )
832        );
833
834        assert!(
835            Statistics::byte_array(
836                Some(ByteArray::from(vec![1, 2, 3])),
837                Some(ByteArray::from(vec![1, 2, 3])),
838                None,
839                None,
840                true,
841            ) != Statistics::ByteArray(
842                ValueStatistics::new(
843                    Some(ByteArray::from(vec![1, 2, 3])),
844                    Some(ByteArray::from(vec![1, 2, 3])),
845                    None,
846                    None,
847                    true,
848                )
849                .with_max_is_exact(false)
850            )
851        );
852
853        assert!(
854            Statistics::fixed_len_byte_array(
855                Some(FixedLenByteArray::from(vec![1, 2, 3])),
856                Some(FixedLenByteArray::from(vec![1, 2, 3])),
857                None,
858                None,
859                true,
860            ) != Statistics::FixedLenByteArray(
861                ValueStatistics::new(
862                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
863                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
864                    None,
865                    None,
866                    true,
867                )
868                .with_min_is_exact(false)
869            )
870        );
871    }
872
873    #[test]
874    fn test_statistics_from_thrift() {
875        // Helper method to check statistics conversion.
876        fn check_stats(stats: Statistics) {
877            let tpe = stats.physical_type();
878            let thrift_stats = page_stats_to_thrift(Some(&stats));
879            assert_eq!(
880                from_thrift_page_stats(tpe, thrift_stats).unwrap(),
881                Some(stats)
882            );
883        }
884
885        check_stats(Statistics::boolean(
886            Some(false),
887            Some(true),
888            None,
889            Some(7),
890            true,
891        ));
892        check_stats(Statistics::boolean(
893            Some(false),
894            Some(true),
895            None,
896            Some(7),
897            true,
898        ));
899        check_stats(Statistics::boolean(
900            Some(false),
901            Some(true),
902            None,
903            Some(0),
904            false,
905        ));
906        check_stats(Statistics::boolean(
907            Some(true),
908            Some(true),
909            None,
910            Some(7),
911            true,
912        ));
913        check_stats(Statistics::boolean(
914            Some(false),
915            Some(false),
916            None,
917            Some(7),
918            true,
919        ));
920        check_stats(Statistics::boolean(None, None, None, Some(7), true));
921
922        check_stats(Statistics::int32(
923            Some(-100),
924            Some(500),
925            None,
926            Some(7),
927            true,
928        ));
929        check_stats(Statistics::int32(
930            Some(-100),
931            Some(500),
932            None,
933            Some(0),
934            false,
935        ));
936        check_stats(Statistics::int32(None, None, None, Some(7), true));
937
938        check_stats(Statistics::int64(
939            Some(-100),
940            Some(200),
941            None,
942            Some(7),
943            true,
944        ));
945        check_stats(Statistics::int64(
946            Some(-100),
947            Some(200),
948            None,
949            Some(0),
950            false,
951        ));
952        check_stats(Statistics::int64(None, None, None, Some(7), true));
953
954        check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true));
955        check_stats(Statistics::float(
956            Some(1.2),
957            Some(3.4),
958            None,
959            Some(0),
960            false,
961        ));
962        check_stats(Statistics::float(None, None, None, Some(7), true));
963
964        check_stats(Statistics::double(
965            Some(1.2),
966            Some(3.4),
967            None,
968            Some(7),
969            true,
970        ));
971        check_stats(Statistics::double(
972            Some(1.2),
973            Some(3.4),
974            None,
975            Some(0),
976            false,
977        ));
978        check_stats(Statistics::double(None, None, None, Some(7), true));
979
980        check_stats(Statistics::byte_array(
981            Some(ByteArray::from(vec![1, 2, 3])),
982            Some(ByteArray::from(vec![3, 4, 5])),
983            None,
984            Some(7),
985            true,
986        ));
987        check_stats(Statistics::byte_array(None, None, None, Some(7), true));
988
989        check_stats(Statistics::fixed_len_byte_array(
990            Some(ByteArray::from(vec![1, 2, 3]).into()),
991            Some(ByteArray::from(vec![3, 4, 5]).into()),
992            None,
993            Some(7),
994            true,
995        ));
996        check_stats(Statistics::fixed_len_byte_array(
997            None,
998            None,
999            None,
1000            Some(7),
1001            true,
1002        ));
1003    }
1004
1005    #[test]
1006    fn test_count_encoding() {
1007        statistics_count_test(None, None);
1008        statistics_count_test(Some(0), Some(0));
1009        statistics_count_test(Some(100), Some(2000));
1010        statistics_count_test(Some(1), None);
1011        statistics_count_test(None, Some(1));
1012    }
1013
1014    #[test]
1015    fn test_count_encoding_distinct_too_large() {
1016        // statistics are stored using i64, so test trying to store larger values
1017        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
1018        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1019        assert_eq!(thrift_stats.distinct_count, None); // can't store u64 max --> null
1020        assert_eq!(thrift_stats.null_count, Some(100));
1021    }
1022
1023    #[test]
1024    fn test_count_encoding_null_too_large() {
1025        // statistics are stored using i64, so test trying to store larger values
1026        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
1027        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1028        assert_eq!(thrift_stats.distinct_count, Some(100));
1029        assert_eq!(thrift_stats.null_count, None); // can' store u64 max --> null
1030    }
1031
1032    #[test]
1033    fn test_count_decoding_null_invalid() {
1034        let tstatistics = PageStatistics {
1035            null_count: Some(-42),
1036            max: None,
1037            min: None,
1038            distinct_count: None,
1039            max_value: None,
1040            min_value: None,
1041            is_max_value_exact: None,
1042            is_min_value_exact: None,
1043        };
1044        let err = from_thrift_page_stats(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
1045        assert_eq!(
1046            err.to_string(),
1047            "Parquet error: Statistics null count is negative -42"
1048        );
1049    }
1050
1051    /// Writes statistics to thrift and reads them back and ensures:
1052    /// - The statistics are the same
1053    /// - The statistics written to thrift are the same as the original statistics
1054    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
1055        let statistics = make_bool_stats(distinct_count, null_count);
1056
1057        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1058        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
1059        assert_eq!(
1060            thrift_stats.distinct_count.map(|c| c as u64),
1061            distinct_count
1062        );
1063
1064        let round_tripped = from_thrift_page_stats(Type::BOOLEAN, Some(thrift_stats))
1065            .unwrap()
1066            .unwrap();
1067        // TODO: remove branch when we no longer support assuming null_count==None in the thrift
1068        // means null_count = Some(0)
1069        if null_count.is_none() {
1070            assert_ne!(round_tripped, statistics);
1071            assert!(round_tripped.null_count_opt().is_some());
1072            assert_eq!(round_tripped.null_count_opt(), Some(0));
1073            assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
1074            assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
1075            assert_eq!(
1076                round_tripped.distinct_count_opt(),
1077                statistics.distinct_count_opt()
1078            );
1079        } else {
1080            assert_eq!(round_tripped, statistics);
1081        }
1082    }
1083
1084    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
1085        let min = Some(true);
1086        let max = Some(false);
1087        let is_min_max_deprecated = false;
1088
1089        // test is about the counts, so we aren't really testing the min/max values
1090        Statistics::Boolean(ValueStatistics::new(
1091            min,
1092            max,
1093            distinct_count,
1094            null_count,
1095            is_min_max_deprecated,
1096        ))
1097    }
1098
1099    #[test]
1100    fn test_int96_invalid_statistics() {
1101        let mut thrift_stats = PageStatistics {
1102            max: None,
1103            min: Some((0..13).collect()),
1104            null_count: Some(0),
1105            distinct_count: None,
1106            max_value: None,
1107            min_value: None,
1108            is_max_value_exact: None,
1109            is_min_value_exact: None,
1110        };
1111
1112        let err = from_thrift_page_stats(Type::INT96, Some(thrift_stats.clone())).unwrap_err();
1113        assert_eq!(
1114            err.to_string(),
1115            "Parquet error: Incorrect Int96 min statistics"
1116        );
1117
1118        thrift_stats.min = None;
1119        thrift_stats.max = Some((0..13).collect());
1120        let err = from_thrift_page_stats(Type::INT96, Some(thrift_stats)).unwrap_err();
1121        assert_eq!(
1122            err.to_string(),
1123            "Parquet error: Incorrect Int96 max statistics"
1124        );
1125    }
1126
1127    // Ensures that we can call ValueStatistics::min_opt from a
1128    // generic function without reyling on a bound to a private trait.
1129    fn generic_statistics_handler<T: std::fmt::Display>(stats: ValueStatistics<T>) -> String {
1130        match stats.min_opt() {
1131            Some(s) => format!("min: {}", s),
1132            None => "min: NA".to_string(),
1133        }
1134    }
1135
1136    #[test]
1137    fn test_generic_access() {
1138        let stats = Statistics::int32(Some(12), Some(45), None, Some(11), false);
1139
1140        match stats {
1141            Statistics::Int32(v) => {
1142                let stats_string = generic_statistics_handler(v);
1143                assert_eq!(&stats_string, "min: 12");
1144            }
1145            _ => unreachable!(),
1146        }
1147    }
1148}