Skip to main content

parquet/file/
statistics.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
//! Contains definitions for working with Parquet statistics.
//!
//! Although some common methods are available directly on the enum, use pattern
//! matching to extract the actual min and max values from statistics, as shown below:
//!
//! # Examples
//! ```rust
//! use parquet::file::statistics::Statistics;
//!
//! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true);
//! assert_eq!(stats.null_count_opt(), Some(3));
//! assert!(stats.is_min_max_deprecated());
//! assert!(stats.min_is_exact());
//! assert!(stats.max_is_exact());
//!
//! match stats {
//!     Statistics::Int32(ref typed) => {
//!         assert_eq!(typed.min_opt(), Some(&1));
//!         assert_eq!(typed.max_opt(), Some(&10));
//!     }
//!     _ => {}
//! }
//! ```
41
42use std::fmt;
43
44use crate::basic::Type;
45use crate::data_type::private::ParquetValueType;
46use crate::data_type::*;
47use crate::errors::{ParquetError, Result};
48use crate::file::metadata::thrift::PageStatistics;
49use crate::util::bit_util::FromBytes;
50
51pub(crate) mod private {
52    use super::*;
53
54    pub trait MakeStatistics {
55        fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
56        where
57            Self: Sized;
58    }
59
60    macro_rules! gen_make_statistics {
61        ($value_ty:ty, $stat:ident) => {
62            impl MakeStatistics for $value_ty {
63                fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
64                where
65                    Self: Sized,
66                {
67                    Statistics::$stat(statistics)
68                }
69            }
70        };
71    }
72
73    gen_make_statistics!(bool, Boolean);
74    gen_make_statistics!(i32, Int32);
75    gen_make_statistics!(i64, Int64);
76    gen_make_statistics!(Int96, Int96);
77    gen_make_statistics!(f32, Float);
78    gen_make_statistics!(f64, Double);
79    gen_make_statistics!(ByteArray, ByteArray);
80    gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
81}
82
/// Generates a public constructor named `$ctor` that builds typed statistics
/// from its arguments and wraps them in the `Statistics::$variant` variant.
macro_rules! statistics_new_func {
    ($ctor:ident, $value:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $ctor(
            min: $value,
            max: $value,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
104
// Forwards the getter `$getter` to the typed statistics stored in whichever
// `Statistics` variant `$stats` currently holds.
macro_rules! statistics_enum_func {
    ($stats:ident, $getter:ident) => {{
        match $stats {
            Statistics::Boolean(inner) => inner.$getter(),
            Statistics::Int32(inner) => inner.$getter(),
            Statistics::Int64(inner) => inner.$getter(),
            Statistics::Int96(inner) => inner.$getter(),
            Statistics::Float(inner) => inner.$getter(),
            Statistics::Double(inner) => inner.$getter(),
            Statistics::ByteArray(inner) => inner.$getter(),
            Statistics::FixedLenByteArray(inner) => inner.$getter(),
        }
    }};
}
120
121/// Converts Thrift definition into `Statistics`.
122pub(crate) fn from_thrift_page_stats(
123    physical_type: Type,
124    thrift_stats: Option<PageStatistics>,
125) -> Result<Option<Statistics>> {
126    Ok(match thrift_stats {
127        Some(stats) => {
128            // Generic null count.
129            let null_count = stats
130                .null_count
131                .map(|null_count| {
132                    if null_count < 0 {
133                        return Err(ParquetError::General(format!(
134                            "Statistics null count is negative {null_count}",
135                        )));
136                    }
137                    Ok(null_count as u64)
138                })
139                .transpose()?;
140            // Generic distinct count (count of distinct values occurring)
141            let distinct_count = stats.distinct_count.map(|value| value as u64);
142            // Whether or not statistics use deprecated min/max fields.
143            let old_format = stats.min_value.is_none() && stats.max_value.is_none();
144            // Generic min value as bytes.
145            let min = if old_format {
146                stats.min
147            } else {
148                stats.min_value
149            };
150            // Generic max value as bytes.
151            let max = if old_format {
152                stats.max
153            } else {
154                stats.max_value
155            };
156
157            fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
158                if let Some(min) = min {
159                    if min.len() < len {
160                        return Err(ParquetError::General(
161                            "Insufficient bytes to parse min statistic".to_string(),
162                        ));
163                    }
164                }
165                if let Some(max) = max {
166                    if max.len() < len {
167                        return Err(ParquetError::General(
168                            "Insufficient bytes to parse max statistic".to_string(),
169                        ));
170                    }
171                }
172                Ok(())
173            }
174
175            match physical_type {
176                Type::BOOLEAN => check_len(&min, &max, 1),
177                Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
178                Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
179                Type::INT96 => check_len(&min, &max, 12),
180                _ => Ok(()),
181            }?;
182
183            // Values are encoded using PLAIN encoding definition, except that
184            // variable-length byte arrays do not include a length prefix.
185            //
186            // Instead of using actual decoder, we manually convert values.
187            let res = match physical_type {
188                Type::BOOLEAN => Statistics::boolean(
189                    min.map(|data| data[0] != 0),
190                    max.map(|data| data[0] != 0),
191                    distinct_count,
192                    null_count,
193                    old_format,
194                ),
195                Type::INT32 => Statistics::int32(
196                    min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
197                    max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
198                    distinct_count,
199                    null_count,
200                    old_format,
201                ),
202                Type::INT64 => Statistics::int64(
203                    min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
204                    max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
205                    distinct_count,
206                    null_count,
207                    old_format,
208                ),
209                Type::INT96 => {
210                    // INT96 statistics may not be correct, because comparison is signed
211                    let min = if let Some(data) = min {
212                        if data.len() != 12 {
213                            return Err(ParquetError::General(
214                                "Incorrect Int96 min statistics".to_string(),
215                            ));
216                        }
217                        Some(Int96::try_from_le_slice(&data)?)
218                    } else {
219                        None
220                    };
221                    let max = if let Some(data) = max {
222                        if data.len() != 12 {
223                            return Err(ParquetError::General(
224                                "Incorrect Int96 max statistics".to_string(),
225                            ));
226                        }
227                        Some(Int96::try_from_le_slice(&data)?)
228                    } else {
229                        None
230                    };
231                    Statistics::int96(min, max, distinct_count, null_count, old_format)
232                }
233                Type::FLOAT => Statistics::float(
234                    min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
235                    max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
236                    distinct_count,
237                    null_count,
238                    old_format,
239                ),
240                Type::DOUBLE => Statistics::double(
241                    min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
242                    max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
243                    distinct_count,
244                    null_count,
245                    old_format,
246                ),
247                Type::BYTE_ARRAY => Statistics::ByteArray(
248                    ValueStatistics::new(
249                        min.map(ByteArray::from),
250                        max.map(ByteArray::from),
251                        distinct_count,
252                        null_count,
253                        old_format,
254                    )
255                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
256                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
257                ),
258                Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
259                    ValueStatistics::new(
260                        min.map(ByteArray::from).map(FixedLenByteArray::from),
261                        max.map(ByteArray::from).map(FixedLenByteArray::from),
262                        distinct_count,
263                        null_count,
264                        old_format,
265                    )
266                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
267                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
268                ),
269            };
270
271            Some(res)
272        }
273        None => None,
274    })
275}
276
277/// Convert Statistics into Thrift definition.
278pub(crate) fn page_stats_to_thrift(stats: Option<&Statistics>) -> Option<PageStatistics> {
279    let stats = stats?;
280
281    // record null count if it can fit in i64
282    let null_count = stats
283        .null_count_opt()
284        .and_then(|value| i64::try_from(value).ok());
285
286    // record distinct count if it can fit in i64
287    let distinct_count = stats
288        .distinct_count_opt()
289        .and_then(|value| i64::try_from(value).ok());
290
291    let mut thrift_stats = PageStatistics {
292        max: None,
293        min: None,
294        null_count,
295        distinct_count,
296        max_value: None,
297        min_value: None,
298        is_max_value_exact: None,
299        is_min_value_exact: None,
300    };
301
302    // Get min/max if set.
303    let (min, max, min_exact, max_exact) = (
304        stats.min_bytes_opt().map(|x| x.to_vec()),
305        stats.max_bytes_opt().map(|x| x.to_vec()),
306        Some(stats.min_is_exact()),
307        Some(stats.max_is_exact()),
308    );
309    if stats.is_min_max_backwards_compatible() {
310        // Copy to deprecated min, max values for compatibility with older readers
311        thrift_stats.min.clone_from(&min);
312        thrift_stats.max.clone_from(&max);
313    }
314
315    if !stats.is_min_max_deprecated() {
316        thrift_stats.min_value = min;
317        thrift_stats.max_value = max;
318    }
319
320    thrift_stats.is_min_value_exact = min_exact;
321    thrift_stats.is_max_value_exact = max_exact;
322
323    Some(thrift_stats)
324}
325
326/// Strongly typed statistics for a column chunk within a row group.
327///
328/// This structure is a natively typed, in memory representation of the thrift
329/// `Statistics` structure in a Parquet file footer. The statistics stored in
330/// this structure can be used by query engines to skip decoding pages while
331/// reading parquet data.
332///
333/// Page level statistics are stored separately, in [ColumnIndexMetaData].
334///
335/// [ColumnIndexMetaData]: crate::file::page_index::column_index::ColumnIndexMetaData
336#[derive(Debug, Clone, PartialEq)]
337pub enum Statistics {
338    /// Statistics for Boolean column
339    Boolean(ValueStatistics<bool>),
340    /// Statistics for Int32 column
341    Int32(ValueStatistics<i32>),
342    /// Statistics for Int64 column
343    Int64(ValueStatistics<i64>),
344    /// Statistics for Int96 column
345    Int96(ValueStatistics<Int96>),
346    /// Statistics for Float column
347    Float(ValueStatistics<f32>),
348    /// Statistics for Double column
349    Double(ValueStatistics<f64>),
350    /// Statistics for ByteArray column
351    ByteArray(ValueStatistics<ByteArray>),
352    /// Statistics for FixedLenByteArray column
353    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
354}
355
356impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
357    fn from(t: ValueStatistics<T>) -> Self {
358        T::make_statistics(t)
359    }
360}
361
362impl Statistics {
363    /// Creates new statistics for a column type
364    pub fn new<T: ParquetValueType>(
365        min: Option<T>,
366        max: Option<T>,
367        distinct_count: Option<u64>,
368        null_count: Option<u64>,
369        is_deprecated: bool,
370    ) -> Self {
371        Self::from(ValueStatistics::new(
372            min,
373            max,
374            distinct_count,
375            null_count,
376            is_deprecated,
377        ))
378    }
379
380    statistics_new_func![boolean, Option<bool>, Boolean];
381
382    statistics_new_func![int32, Option<i32>, Int32];
383
384    statistics_new_func![int64, Option<i64>, Int64];
385
386    statistics_new_func![int96, Option<Int96>, Int96];
387
388    statistics_new_func![float, Option<f32>, Float];
389
390    statistics_new_func![double, Option<f64>, Double];
391
392    statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
393
394    statistics_new_func![
395        fixed_len_byte_array,
396        Option<FixedLenByteArray>,
397        FixedLenByteArray
398    ];
399
400    /// Returns `true` if statistics have old `min` and `max` fields set.
401    /// This means that the column order is likely to be undefined, which, for old files
402    /// could mean a signed sort order of values.
403    ///
404    /// Refer to [`ColumnOrder`](crate::basic::ColumnOrder) and
405    /// [`SortOrder`](crate::basic::SortOrder) for more information.
406    pub fn is_min_max_deprecated(&self) -> bool {
407        statistics_enum_func![self, is_min_max_deprecated]
408    }
409
410    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
411    /// using signed comparison. This resulted in an undefined ordering for unsigned
412    /// quantities, such as booleans and unsigned integers.
413    ///
414    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
415    /// which have a type-defined sort order.
416    ///
417    /// However, not all readers have been updated. For backwards compatibility, this method
418    /// returns `true` if the statistics within this have a signed sort order, that is
419    /// compatible with being stored in the deprecated `min` and `max` fields
420    pub fn is_min_max_backwards_compatible(&self) -> bool {
421        statistics_enum_func![self, is_min_max_backwards_compatible]
422    }
423
424    /// Returns optional value of number of distinct values occurring.
425    /// When it is `None`, the value should be ignored.
426    pub fn distinct_count_opt(&self) -> Option<u64> {
427        statistics_enum_func![self, distinct_count]
428    }
429
430    /// Returns number of null values for the column, if known.
431    /// Note that this includes all nulls when column is part of the complex type.
432    ///
433    /// Note: Versions of this library prior to `58.1.0` returned `0` if the null count
434    /// was not available. This method now returns `None` in that case.
435    ///
436    /// Also, versions of this library prior to `53.1.0` did not store a null count
437    /// statistic when the null count was `0`.
438    ///
439    /// It is unsound to assume that missing nullcount stats mean the column contains no nulls,
440    /// but code that depends on the old behavior can restore it by defaulting to zero:
441    ///
442    /// ```no_run
443    /// # use parquet::file::statistics::Statistics;
444    /// # let statistics: Statistics = todo!();
445    /// let null_count = statistics.null_count_opt().unwrap_or(0);
446    /// ```
447    pub fn null_count_opt(&self) -> Option<u64> {
448        statistics_enum_func![self, null_count_opt]
449    }
450
451    /// Returns `true` if the min value is set, and is an exact min value.
452    pub fn min_is_exact(&self) -> bool {
453        statistics_enum_func![self, min_is_exact]
454    }
455
456    /// Returns `true` if the max value is set, and is an exact max value.
457    pub fn max_is_exact(&self) -> bool {
458        statistics_enum_func![self, max_is_exact]
459    }
460
461    /// Returns slice of bytes that represent min value, if min value is known.
462    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
463        statistics_enum_func![self, min_bytes_opt]
464    }
465
466    /// Returns slice of bytes that represent max value, if max value is known.
467    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
468        statistics_enum_func![self, max_bytes_opt]
469    }
470
471    /// Returns physical type associated with statistics.
472    pub fn physical_type(&self) -> Type {
473        match self {
474            Statistics::Boolean(_) => Type::BOOLEAN,
475            Statistics::Int32(_) => Type::INT32,
476            Statistics::Int64(_) => Type::INT64,
477            Statistics::Int96(_) => Type::INT96,
478            Statistics::Float(_) => Type::FLOAT,
479            Statistics::Double(_) => Type::DOUBLE,
480            Statistics::ByteArray(_) => Type::BYTE_ARRAY,
481            Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
482        }
483    }
484}
485
486impl fmt::Display for Statistics {
487    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
488        match self {
489            Statistics::Boolean(typed) => write!(f, "{typed}"),
490            Statistics::Int32(typed) => write!(f, "{typed}"),
491            Statistics::Int64(typed) => write!(f, "{typed}"),
492            Statistics::Int96(typed) => write!(f, "{typed}"),
493            Statistics::Float(typed) => write!(f, "{typed}"),
494            Statistics::Double(typed) => write!(f, "{typed}"),
495            Statistics::ByteArray(typed) => write!(f, "{typed}"),
496            Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
497        }
498    }
499}
500
501/// Typed implementation for [`Statistics`].
502pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
503
/// Statistics for a single column chunk, typed by the column's value type.
///
/// See [`Statistics`] for more details
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    /// Minimum value, if known.
    min: Option<T>,
    /// Maximum value, if known.
    max: Option<T>,
    /// Number of distinct values; could be omitted in some cases.
    distinct_count: Option<u64>,
    /// Number of null values, if known.
    null_count: Option<u64>,

    /// Whether the max value is exact, as opposed to truncated.
    is_max_value_exact: bool,
    /// Whether the min value is exact, as opposed to truncated.
    is_min_value_exact: bool,

    /// If `true` populate the deprecated `min` and `max` fields instead of
    /// `min_value` and `max_value`
    is_min_max_deprecated: bool,

    /// If `true` the statistics are compatible with the deprecated `min` and
    /// `max` fields. See [`ValueStatistics::is_min_max_backwards_compatible`]
    is_min_max_backwards_compatible: bool,
}
527
528impl<T> ValueStatistics<T> {
529    /// Creates new typed statistics.
530    pub fn new(
531        min: Option<T>,
532        max: Option<T>,
533        distinct_count: Option<u64>,
534        null_count: Option<u64>,
535        is_min_max_deprecated: bool,
536    ) -> Self {
537        Self {
538            is_max_value_exact: max.is_some(),
539            is_min_value_exact: min.is_some(),
540            min,
541            max,
542            distinct_count,
543            null_count,
544            is_min_max_deprecated,
545            is_min_max_backwards_compatible: is_min_max_deprecated,
546        }
547    }
548
549    /// Set whether the stored `min` field represents the exact
550    /// minimum, or just a bound on the minimum value.
551    ///
552    /// see [`Self::min_is_exact`]
553    pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
554        Self {
555            is_min_value_exact,
556            ..self
557        }
558    }
559
560    /// Set whether the stored `max` field represents the exact
561    /// maximum, or just a bound on the maximum value.
562    ///
563    /// see [`Self::max_is_exact`]
564    pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
565        Self {
566            is_max_value_exact,
567            ..self
568        }
569    }
570
571    /// Set whether to write the deprecated `min` and `max` fields
572    /// for compatibility with older parquet writers
573    ///
574    /// This should only be enabled if the field is signed,
575    /// see [`Self::is_min_max_backwards_compatible`]
576    pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self {
577        Self {
578            is_min_max_backwards_compatible: backwards_compatible,
579            ..self
580        }
581    }
582
583    /// Returns min value of the statistics, if known.
584    pub fn min_opt(&self) -> Option<&T> {
585        self.min.as_ref()
586    }
587
588    /// Returns max value of the statistics, if known.
589    pub fn max_opt(&self) -> Option<&T> {
590        self.max.as_ref()
591    }
592
593    /// Whether or not min and max values are set.
594    /// Normally both min/max values will be set to `Some(value)` or `None`.
595    pub(crate) fn _internal_has_min_max_set(&self) -> bool {
596        self.min.is_some() && self.max.is_some()
597    }
598
599    /// Whether or not max value is set, and is an exact value.
600    pub fn max_is_exact(&self) -> bool {
601        self.max.is_some() && self.is_max_value_exact
602    }
603
604    /// Whether or not min value is set, and is an exact value.
605    pub fn min_is_exact(&self) -> bool {
606        self.min.is_some() && self.is_min_value_exact
607    }
608
609    /// Returns optional value of number of distinct values occurring.
610    pub fn distinct_count(&self) -> Option<u64> {
611        self.distinct_count
612    }
613
614    /// Returns null count.
615    pub fn null_count_opt(&self) -> Option<u64> {
616        self.null_count
617    }
618
619    /// Returns `true` if statistics were created using old min/max fields.
620    fn is_min_max_deprecated(&self) -> bool {
621        self.is_min_max_deprecated
622    }
623
624    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
625    /// using signed comparison. This resulted in an undefined ordering for unsigned
626    /// quantities, such as booleans and unsigned integers.
627    ///
628    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
629    /// which have a type-defined sort order.
630    ///
631    /// However, not all readers have been updated. For backwards compatibility, this method
632    /// returns `true` if the statistics within this have a signed sort order, that is
633    /// compatible with being stored in the deprecated `min` and `max` fields
634    pub fn is_min_max_backwards_compatible(&self) -> bool {
635        self.is_min_max_backwards_compatible
636    }
637}
638
639impl<T: AsBytes> ValueStatistics<T> {
640    /// Returns min value as bytes of the statistics, if min value is known.
641    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
642        self.min_opt().map(AsBytes::as_bytes)
643    }
644
645    /// Returns max value as bytes of the statistics, if max value is known.
646    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
647        self.max_opt().map(AsBytes::as_bytes)
648    }
649}
650
651impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
652    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
653        write!(f, "{{")?;
654        write!(f, "min: ")?;
655        match self.min {
656            Some(ref value) => write!(f, "{value}")?,
657            None => write!(f, "N/A")?,
658        }
659        write!(f, ", max: ")?;
660        match self.max {
661            Some(ref value) => write!(f, "{value}")?,
662            None => write!(f, "N/A")?,
663        }
664        write!(f, ", distinct_count: ")?;
665        match self.distinct_count {
666            Some(value) => write!(f, "{value}")?,
667            None => write!(f, "N/A")?,
668        }
669        write!(f, ", null_count: ")?;
670        match self.null_count {
671            Some(value) => write!(f, "{value}")?,
672            None => write!(f, "N/A")?,
673        }
674        write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
675        write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
676        write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
677        write!(f, "}}")
678    }
679}
680
681impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
682    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
683        write!(
684            f,
685            "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
686             min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
687            self.min,
688            self.max,
689            self.distinct_count,
690            self.null_count,
691            self.is_min_max_deprecated,
692            self.is_min_max_backwards_compatible,
693            self.is_max_value_exact,
694            self.is_min_value_exact
695        )
696    }
697}
698
699#[cfg(test)]
700mod tests {
701    use super::*;
702
703    #[test]
704    fn test_statistics_min_max_bytes() {
705        let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
706        assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes()));
707        assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes()));
708
709        let stats = Statistics::byte_array(
710            Some(ByteArray::from(vec![1, 2, 3])),
711            Some(ByteArray::from(vec![3, 4, 5])),
712            None,
713            Some(1),
714            true,
715        );
716        assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
717        assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
718    }
719
720    #[test]
721    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
722    fn test_statistics_negative_null_count() {
723        let thrift_stats = PageStatistics {
724            max: None,
725            min: None,
726            null_count: Some(-10),
727            distinct_count: None,
728            max_value: None,
729            min_value: None,
730            is_max_value_exact: None,
731            is_min_value_exact: None,
732        };
733
734        from_thrift_page_stats(Type::INT32, Some(thrift_stats)).unwrap();
735    }
736
737    #[test]
738    fn test_statistics_thrift_none() {
739        assert_eq!(from_thrift_page_stats(Type::INT32, None).unwrap(), None);
740        assert_eq!(
741            from_thrift_page_stats(Type::BYTE_ARRAY, None).unwrap(),
742            None
743        );
744    }
745
746    #[test]
747    fn test_statistics_debug() {
748        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
749        assert_eq!(
750            format!("{stats:?}"),
751            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
752             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
753        );
754
755        let stats = Statistics::int32(None, None, None, Some(7), false);
756        assert_eq!(
757            format!("{stats:?}"),
758            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
759             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
760        )
761    }
762
763    #[test]
764    fn test_statistics_display() {
765        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
766        assert_eq!(
767            format!("{stats}"),
768            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
769        );
770
771        let stats = Statistics::int64(None, None, None, Some(7), false);
772        assert_eq!(
773            format!("{stats}"),
774            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
775             false, max_value_exact: false, min_value_exact: false}"
776        );
777
778        let stats = Statistics::int96(
779            Some(Int96::from(vec![1, 0, 0])),
780            Some(Int96::from(vec![2, 3, 4])),
781            None,
782            Some(3),
783            true,
784        );
785        assert_eq!(
786            format!("{stats}"),
787            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
788             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
789        );
790
791        let stats = Statistics::ByteArray(
792            ValueStatistics::new(
793                Some(ByteArray::from(vec![1u8])),
794                Some(ByteArray::from(vec![2u8])),
795                Some(5),
796                Some(7),
797                false,
798            )
799            .with_max_is_exact(false)
800            .with_min_is_exact(false),
801        );
802        assert_eq!(
803            format!("{stats}"),
804            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
805        );
806    }
807
808    #[test]
809    fn test_statistics_partial_eq() {
810        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);
811
812        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), true) == expected);
813        assert!(Statistics::int32(Some(11), Some(45), None, Some(11), true) != expected);
814        assert!(Statistics::int32(Some(12), Some(44), None, Some(11), true) != expected);
815        assert!(Statistics::int32(Some(12), Some(45), None, Some(23), true) != expected);
816        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), false) != expected);
817
818        assert!(
819            Statistics::int32(Some(12), Some(45), None, Some(11), false)
820                != Statistics::int64(Some(12), Some(45), None, Some(11), false)
821        );
822
823        assert!(
824            Statistics::boolean(Some(false), Some(true), None, None, true)
825                != Statistics::double(Some(1.2), Some(4.5), None, None, true)
826        );
827
828        assert!(
829            Statistics::byte_array(
830                Some(ByteArray::from(vec![1, 2, 3])),
831                Some(ByteArray::from(vec![1, 2, 3])),
832                None,
833                None,
834                true
835            ) != Statistics::fixed_len_byte_array(
836                Some(ByteArray::from(vec![1, 2, 3]).into()),
837                Some(ByteArray::from(vec![1, 2, 3]).into()),
838                None,
839                None,
840                true,
841            )
842        );
843
844        assert!(
845            Statistics::byte_array(
846                Some(ByteArray::from(vec![1, 2, 3])),
847                Some(ByteArray::from(vec![1, 2, 3])),
848                None,
849                None,
850                true,
851            ) != Statistics::ByteArray(
852                ValueStatistics::new(
853                    Some(ByteArray::from(vec![1, 2, 3])),
854                    Some(ByteArray::from(vec![1, 2, 3])),
855                    None,
856                    None,
857                    true,
858                )
859                .with_max_is_exact(false)
860            )
861        );
862
863        assert!(
864            Statistics::fixed_len_byte_array(
865                Some(FixedLenByteArray::from(vec![1, 2, 3])),
866                Some(FixedLenByteArray::from(vec![1, 2, 3])),
867                None,
868                None,
869                true,
870            ) != Statistics::FixedLenByteArray(
871                ValueStatistics::new(
872                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
873                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
874                    None,
875                    None,
876                    true,
877                )
878                .with_min_is_exact(false)
879            )
880        );
881    }
882
883    #[test]
884    fn test_statistics_from_thrift() {
885        // Helper method to check statistics conversion.
886        fn check_stats(stats: Statistics) {
887            let tpe = stats.physical_type();
888            let thrift_stats = page_stats_to_thrift(Some(&stats));
889            assert_eq!(
890                from_thrift_page_stats(tpe, thrift_stats).unwrap(),
891                Some(stats)
892            );
893        }
894
895        check_stats(Statistics::boolean(
896            Some(false),
897            Some(true),
898            None,
899            Some(7),
900            true,
901        ));
902        check_stats(Statistics::boolean(
903            Some(false),
904            Some(true),
905            None,
906            Some(7),
907            true,
908        ));
909        check_stats(Statistics::boolean(
910            Some(false),
911            Some(true),
912            None,
913            Some(0),
914            false,
915        ));
916        check_stats(Statistics::boolean(
917            Some(true),
918            Some(true),
919            None,
920            Some(7),
921            true,
922        ));
923        check_stats(Statistics::boolean(
924            Some(false),
925            Some(false),
926            None,
927            Some(7),
928            true,
929        ));
930        check_stats(Statistics::boolean(None, None, None, Some(7), true));
931
932        check_stats(Statistics::int32(
933            Some(-100),
934            Some(500),
935            None,
936            Some(7),
937            true,
938        ));
939        check_stats(Statistics::int32(
940            Some(-100),
941            Some(500),
942            None,
943            Some(0),
944            false,
945        ));
946        check_stats(Statistics::int32(None, None, None, Some(7), true));
947
948        check_stats(Statistics::int64(
949            Some(-100),
950            Some(200),
951            None,
952            Some(7),
953            true,
954        ));
955        check_stats(Statistics::int64(
956            Some(-100),
957            Some(200),
958            None,
959            Some(0),
960            false,
961        ));
962        check_stats(Statistics::int64(None, None, None, Some(7), true));
963
964        check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true));
965        check_stats(Statistics::float(
966            Some(1.2),
967            Some(3.4),
968            None,
969            Some(0),
970            false,
971        ));
972        check_stats(Statistics::float(None, None, None, Some(7), true));
973
974        check_stats(Statistics::double(
975            Some(1.2),
976            Some(3.4),
977            None,
978            Some(7),
979            true,
980        ));
981        check_stats(Statistics::double(
982            Some(1.2),
983            Some(3.4),
984            None,
985            Some(0),
986            false,
987        ));
988        check_stats(Statistics::double(None, None, None, Some(7), true));
989
990        check_stats(Statistics::byte_array(
991            Some(ByteArray::from(vec![1, 2, 3])),
992            Some(ByteArray::from(vec![3, 4, 5])),
993            None,
994            Some(7),
995            true,
996        ));
997        check_stats(Statistics::byte_array(None, None, None, Some(7), true));
998
999        check_stats(Statistics::fixed_len_byte_array(
1000            Some(ByteArray::from(vec![1, 2, 3]).into()),
1001            Some(ByteArray::from(vec![3, 4, 5]).into()),
1002            None,
1003            Some(7),
1004            true,
1005        ));
1006        check_stats(Statistics::fixed_len_byte_array(
1007            None,
1008            None,
1009            None,
1010            Some(7),
1011            true,
1012        ));
1013    }
1014
1015    #[test]
1016    fn test_count_encoding() {
1017        statistics_count_test(None, None);
1018        statistics_count_test(Some(0), Some(0));
1019        statistics_count_test(Some(100), Some(2000));
1020        statistics_count_test(Some(1), None);
1021        statistics_count_test(None, Some(1));
1022    }
1023
1024    #[test]
1025    fn test_count_encoding_distinct_too_large() {
1026        // statistics are stored using i64, so test trying to store larger values
1027        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
1028        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1029        assert_eq!(thrift_stats.distinct_count, None); // can't store u64 max --> null
1030        assert_eq!(thrift_stats.null_count, Some(100));
1031    }
1032
1033    #[test]
1034    fn test_count_encoding_null_too_large() {
1035        // statistics are stored using i64, so test trying to store larger values
1036        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
1037        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1038        assert_eq!(thrift_stats.distinct_count, Some(100));
1039        assert_eq!(thrift_stats.null_count, None); // can' store u64 max --> null
1040    }
1041
1042    #[test]
1043    fn test_count_decoding_null_invalid() {
1044        let tstatistics = PageStatistics {
1045            null_count: Some(-42),
1046            max: None,
1047            min: None,
1048            distinct_count: None,
1049            max_value: None,
1050            min_value: None,
1051            is_max_value_exact: None,
1052            is_min_value_exact: None,
1053        };
1054        let err = from_thrift_page_stats(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
1055        assert_eq!(
1056            err.to_string(),
1057            "Parquet error: Statistics null count is negative -42"
1058        );
1059    }
1060
1061    /// Writes statistics to thrift and reads them back and ensures:
1062    /// - The statistics are the same
1063    /// - The statistics written to thrift are the same as the original statistics
1064    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
1065        let statistics = make_bool_stats(distinct_count, null_count);
1066
1067        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1068        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
1069        assert_eq!(
1070            thrift_stats.distinct_count.map(|c| c as u64),
1071            distinct_count
1072        );
1073
1074        let round_tripped = from_thrift_page_stats(Type::BOOLEAN, Some(thrift_stats))
1075            .unwrap()
1076            .unwrap();
1077        assert_eq!(round_tripped, statistics);
1078    }
1079
1080    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
1081        let min = Some(true);
1082        let max = Some(false);
1083        let is_min_max_deprecated = false;
1084
1085        // test is about the counts, so we aren't really testing the min/max values
1086        Statistics::Boolean(ValueStatistics::new(
1087            min,
1088            max,
1089            distinct_count,
1090            null_count,
1091            is_min_max_deprecated,
1092        ))
1093    }
1094
1095    #[test]
1096    fn test_int96_invalid_statistics() {
1097        let mut thrift_stats = PageStatistics {
1098            max: None,
1099            min: Some((0..13).collect()),
1100            null_count: Some(0),
1101            distinct_count: None,
1102            max_value: None,
1103            min_value: None,
1104            is_max_value_exact: None,
1105            is_min_value_exact: None,
1106        };
1107
1108        let err = from_thrift_page_stats(Type::INT96, Some(thrift_stats.clone())).unwrap_err();
1109        assert_eq!(
1110            err.to_string(),
1111            "Parquet error: Incorrect Int96 min statistics"
1112        );
1113
1114        thrift_stats.min = None;
1115        thrift_stats.max = Some((0..13).collect());
1116        let err = from_thrift_page_stats(Type::INT96, Some(thrift_stats)).unwrap_err();
1117        assert_eq!(
1118            err.to_string(),
1119            "Parquet error: Incorrect Int96 max statistics"
1120        );
1121    }
1122
1123    // Ensures that we can call ValueStatistics::min_opt from a
1124    // generic function without reyling on a bound to a private trait.
1125    fn generic_statistics_handler<T: std::fmt::Display>(stats: ValueStatistics<T>) -> String {
1126        match stats.min_opt() {
1127            Some(s) => format!("min: {}", s),
1128            None => "min: NA".to_string(),
1129        }
1130    }
1131
1132    #[test]
1133    fn test_generic_access() {
1134        let stats = Statistics::int32(Some(12), Some(45), None, Some(11), false);
1135
1136        match stats {
1137            Statistics::Int32(v) => {
1138                let stats_string = generic_statistics_handler(v);
1139                assert_eq!(&stats_string, "min: 12");
1140            }
1141            _ => unreachable!(),
1142        }
1143    }
1144}