parquet/file/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains definitions for working with Parquet statistics.
19//!
//! Though some common methods are available on the [`Statistics`] enum, use pattern
//! matching to extract the actual min and max values from the statistics, as shown below:
22//!
23//! # Examples
24//! ```rust
25//! use parquet::file::statistics::Statistics;
26//!
27//! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true);
28//! assert_eq!(stats.null_count_opt(), Some(3));
29//! assert!(stats.is_min_max_deprecated());
30//! assert!(stats.min_is_exact());
31//! assert!(stats.max_is_exact());
32//!
33//! match stats {
34//!     Statistics::Int32(ref typed) => {
35//!         assert_eq!(typed.min_opt(), Some(&1));
36//!         assert_eq!(typed.max_opt(), Some(&10));
37//!     }
38//!     _ => {}
39//! }
40//! ```
41
42use std::fmt;
43
44use crate::basic::Type;
45use crate::data_type::private::ParquetValueType;
46use crate::data_type::*;
47use crate::errors::{ParquetError, Result};
48use crate::file::metadata::thrift::PageStatistics;
49use crate::util::bit_util::FromBytes;
50
51pub(crate) mod private {
52    use super::*;
53
54    pub trait MakeStatistics {
55        fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
56        where
57            Self: Sized;
58    }
59
60    macro_rules! gen_make_statistics {
61        ($value_ty:ty, $stat:ident) => {
62            impl MakeStatistics for $value_ty {
63                fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
64                where
65                    Self: Sized,
66                {
67                    Statistics::$stat(statistics)
68                }
69            }
70        };
71    }
72
73    gen_make_statistics!(bool, Boolean);
74    gen_make_statistics!(i32, Int32);
75    gen_make_statistics!(i64, Int64);
76    gen_make_statistics!(Int96, Int96);
77    gen_make_statistics!(f32, Float);
78    gen_make_statistics!(f64, Double);
79    gen_make_statistics!(ByteArray, ByteArray);
80    gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
81}
82
/// Macro that generates a public constructor on `Statistics`.
///
/// Takes the constructor name, the type of the min/max arguments, and the
/// `Statistics` variant that wraps the resulting `ValueStatistics`.
macro_rules! statistics_new_func {
    ($ctor:ident, $bound:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $ctor(
            min: $bound,
            max: $bound,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Self::$variant(typed)
        }
    };
}
104
// Macro that forwards the zero-argument method `$func` to the typed statistics
// held by whichever `Statistics` variant `$self` refers to.
macro_rules! statistics_enum_func {
    ($self:ident, $func:ident) => {{
        match $self {
            Statistics::Boolean(inner) => inner.$func(),
            Statistics::Int32(inner) => inner.$func(),
            Statistics::Int64(inner) => inner.$func(),
            Statistics::Int96(inner) => inner.$func(),
            Statistics::Float(inner) => inner.$func(),
            Statistics::Double(inner) => inner.$func(),
            Statistics::ByteArray(inner) => inner.$func(),
            Statistics::FixedLenByteArray(inner) => inner.$func(),
        }
    }};
}
120
121/// Converts Thrift definition into `Statistics`.
122pub(crate) fn from_thrift_page_stats(
123    physical_type: Type,
124    thrift_stats: Option<PageStatistics>,
125) -> Result<Option<Statistics>> {
126    Ok(match thrift_stats {
127        Some(stats) => {
128            // Number of nulls recorded, when it is not available, we just mark it as 0.
129            // TODO this should be `None` if there is no information about NULLS.
130            // see https://github.com/apache/arrow-rs/pull/6216/files
131            let null_count = stats.null_count.unwrap_or(0);
132
133            if null_count < 0 {
134                return Err(ParquetError::General(format!(
135                    "Statistics null count is negative {null_count}",
136                )));
137            }
138
139            // Generic null count.
140            let null_count = Some(null_count as u64);
141            // Generic distinct count (count of distinct values occurring)
142            let distinct_count = stats.distinct_count.map(|value| value as u64);
143            // Whether or not statistics use deprecated min/max fields.
144            let old_format = stats.min_value.is_none() && stats.max_value.is_none();
145            // Generic min value as bytes.
146            let min = if old_format {
147                stats.min
148            } else {
149                stats.min_value
150            };
151            // Generic max value as bytes.
152            let max = if old_format {
153                stats.max
154            } else {
155                stats.max_value
156            };
157
158            fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
159                if let Some(min) = min {
160                    if min.len() < len {
161                        return Err(ParquetError::General(
162                            "Insufficient bytes to parse min statistic".to_string(),
163                        ));
164                    }
165                }
166                if let Some(max) = max {
167                    if max.len() < len {
168                        return Err(ParquetError::General(
169                            "Insufficient bytes to parse max statistic".to_string(),
170                        ));
171                    }
172                }
173                Ok(())
174            }
175
176            match physical_type {
177                Type::BOOLEAN => check_len(&min, &max, 1),
178                Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
179                Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
180                Type::INT96 => check_len(&min, &max, 12),
181                _ => Ok(()),
182            }?;
183
184            // Values are encoded using PLAIN encoding definition, except that
185            // variable-length byte arrays do not include a length prefix.
186            //
187            // Instead of using actual decoder, we manually convert values.
188            let res = match physical_type {
189                Type::BOOLEAN => Statistics::boolean(
190                    min.map(|data| data[0] != 0),
191                    max.map(|data| data[0] != 0),
192                    distinct_count,
193                    null_count,
194                    old_format,
195                ),
196                Type::INT32 => Statistics::int32(
197                    min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
198                    max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
199                    distinct_count,
200                    null_count,
201                    old_format,
202                ),
203                Type::INT64 => Statistics::int64(
204                    min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
205                    max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
206                    distinct_count,
207                    null_count,
208                    old_format,
209                ),
210                Type::INT96 => {
211                    // INT96 statistics may not be correct, because comparison is signed
212                    let min = if let Some(data) = min {
213                        if data.len() != 12 {
214                            return Err(ParquetError::General(
215                                "Incorrect Int96 min statistics".to_string(),
216                            ));
217                        }
218                        Some(Int96::try_from_le_slice(&data)?)
219                    } else {
220                        None
221                    };
222                    let max = if let Some(data) = max {
223                        if data.len() != 12 {
224                            return Err(ParquetError::General(
225                                "Incorrect Int96 max statistics".to_string(),
226                            ));
227                        }
228                        Some(Int96::try_from_le_slice(&data)?)
229                    } else {
230                        None
231                    };
232                    Statistics::int96(min, max, distinct_count, null_count, old_format)
233                }
234                Type::FLOAT => Statistics::float(
235                    min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
236                    max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
237                    distinct_count,
238                    null_count,
239                    old_format,
240                ),
241                Type::DOUBLE => Statistics::double(
242                    min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
243                    max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
244                    distinct_count,
245                    null_count,
246                    old_format,
247                ),
248                Type::BYTE_ARRAY => Statistics::ByteArray(
249                    ValueStatistics::new(
250                        min.map(ByteArray::from),
251                        max.map(ByteArray::from),
252                        distinct_count,
253                        null_count,
254                        old_format,
255                    )
256                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
257                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
258                ),
259                Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
260                    ValueStatistics::new(
261                        min.map(ByteArray::from).map(FixedLenByteArray::from),
262                        max.map(ByteArray::from).map(FixedLenByteArray::from),
263                        distinct_count,
264                        null_count,
265                        old_format,
266                    )
267                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
268                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
269                ),
270            };
271
272            Some(res)
273        }
274        None => None,
275    })
276}
277
278/// Convert Statistics into Thrift definition.
279pub(crate) fn page_stats_to_thrift(stats: Option<&Statistics>) -> Option<PageStatistics> {
280    let stats = stats?;
281
282    // record null count if it can fit in i64
283    let null_count = stats
284        .null_count_opt()
285        .and_then(|value| i64::try_from(value).ok());
286
287    // record distinct count if it can fit in i64
288    let distinct_count = stats
289        .distinct_count_opt()
290        .and_then(|value| i64::try_from(value).ok());
291
292    let mut thrift_stats = PageStatistics {
293        max: None,
294        min: None,
295        null_count,
296        distinct_count,
297        max_value: None,
298        min_value: None,
299        is_max_value_exact: None,
300        is_min_value_exact: None,
301    };
302
303    // Get min/max if set.
304    let (min, max, min_exact, max_exact) = (
305        stats.min_bytes_opt().map(|x| x.to_vec()),
306        stats.max_bytes_opt().map(|x| x.to_vec()),
307        Some(stats.min_is_exact()),
308        Some(stats.max_is_exact()),
309    );
310    if stats.is_min_max_backwards_compatible() {
311        // Copy to deprecated min, max values for compatibility with older readers
312        thrift_stats.min.clone_from(&min);
313        thrift_stats.max.clone_from(&max);
314    }
315
316    if !stats.is_min_max_deprecated() {
317        thrift_stats.min_value = min;
318        thrift_stats.max_value = max;
319    }
320
321    thrift_stats.is_min_value_exact = min_exact;
322    thrift_stats.is_max_value_exact = max_exact;
323
324    Some(thrift_stats)
325}
326
/// Strongly typed statistics for a column chunk within a row group.
///
/// This structure is a natively typed, in memory representation of the thrift
/// `Statistics` structure in a Parquet file footer. The statistics stored in
/// this structure can be used by query engines to skip decoding pages while
/// reading parquet data.
///
/// Each variant wraps the [`ValueStatistics`] for one physical type;
/// [`Statistics::physical_type`] reports which one a value holds.
///
/// Page level statistics are stored separately, in [ColumnIndexMetaData].
///
/// [ColumnIndexMetaData]: crate::file::page_index::column_index::ColumnIndexMetaData
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
    /// Statistics for Boolean column
    Boolean(ValueStatistics<bool>),
    /// Statistics for Int32 column
    Int32(ValueStatistics<i32>),
    /// Statistics for Int64 column
    Int64(ValueStatistics<i64>),
    /// Statistics for Int96 column
    Int96(ValueStatistics<Int96>),
    /// Statistics for Float column
    Float(ValueStatistics<f32>),
    /// Statistics for Double column
    Double(ValueStatistics<f64>),
    /// Statistics for ByteArray column
    ByteArray(ValueStatistics<ByteArray>),
    /// Statistics for FixedLenByteArray column
    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
356
357impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
358    fn from(t: ValueStatistics<T>) -> Self {
359        T::make_statistics(t)
360    }
361}
362
363impl Statistics {
364    /// Creates new statistics for a column type
365    pub fn new<T: ParquetValueType>(
366        min: Option<T>,
367        max: Option<T>,
368        distinct_count: Option<u64>,
369        null_count: Option<u64>,
370        is_deprecated: bool,
371    ) -> Self {
372        Self::from(ValueStatistics::new(
373            min,
374            max,
375            distinct_count,
376            null_count,
377            is_deprecated,
378        ))
379    }
380
381    statistics_new_func![boolean, Option<bool>, Boolean];
382
383    statistics_new_func![int32, Option<i32>, Int32];
384
385    statistics_new_func![int64, Option<i64>, Int64];
386
387    statistics_new_func![int96, Option<Int96>, Int96];
388
389    statistics_new_func![float, Option<f32>, Float];
390
391    statistics_new_func![double, Option<f64>, Double];
392
393    statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
394
395    statistics_new_func![
396        fixed_len_byte_array,
397        Option<FixedLenByteArray>,
398        FixedLenByteArray
399    ];
400
401    /// Returns `true` if statistics have old `min` and `max` fields set.
402    /// This means that the column order is likely to be undefined, which, for old files
403    /// could mean a signed sort order of values.
404    ///
405    /// Refer to [`ColumnOrder`](crate::basic::ColumnOrder) and
406    /// [`SortOrder`](crate::basic::SortOrder) for more information.
407    pub fn is_min_max_deprecated(&self) -> bool {
408        statistics_enum_func![self, is_min_max_deprecated]
409    }
410
411    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
412    /// using signed comparison. This resulted in an undefined ordering for unsigned
413    /// quantities, such as booleans and unsigned integers.
414    ///
415    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
416    /// which have a type-defined sort order.
417    ///
418    /// However, not all readers have been updated. For backwards compatibility, this method
419    /// returns `true` if the statistics within this have a signed sort order, that is
420    /// compatible with being stored in the deprecated `min` and `max` fields
421    pub fn is_min_max_backwards_compatible(&self) -> bool {
422        statistics_enum_func![self, is_min_max_backwards_compatible]
423    }
424
425    /// Returns optional value of number of distinct values occurring.
426    /// When it is `None`, the value should be ignored.
427    pub fn distinct_count_opt(&self) -> Option<u64> {
428        statistics_enum_func![self, distinct_count]
429    }
430
431    /// Returns number of null values for the column, if known.
432    /// Note that this includes all nulls when column is part of the complex type.
433    ///
434    /// Note this API returns Some(0) even if the null count was not present
435    /// in the statistics.
436    /// See <https://github.com/apache/arrow-rs/pull/6216/files>
437    pub fn null_count_opt(&self) -> Option<u64> {
438        statistics_enum_func![self, null_count_opt]
439    }
440
441    /// Returns `true` if the min value is set, and is an exact min value.
442    pub fn min_is_exact(&self) -> bool {
443        statistics_enum_func![self, min_is_exact]
444    }
445
446    /// Returns `true` if the max value is set, and is an exact max value.
447    pub fn max_is_exact(&self) -> bool {
448        statistics_enum_func![self, max_is_exact]
449    }
450
451    /// Returns slice of bytes that represent min value, if min value is known.
452    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
453        statistics_enum_func![self, min_bytes_opt]
454    }
455
456    /// Returns slice of bytes that represent max value, if max value is known.
457    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
458        statistics_enum_func![self, max_bytes_opt]
459    }
460
461    /// Returns physical type associated with statistics.
462    pub fn physical_type(&self) -> Type {
463        match self {
464            Statistics::Boolean(_) => Type::BOOLEAN,
465            Statistics::Int32(_) => Type::INT32,
466            Statistics::Int64(_) => Type::INT64,
467            Statistics::Int96(_) => Type::INT96,
468            Statistics::Float(_) => Type::FLOAT,
469            Statistics::Double(_) => Type::DOUBLE,
470            Statistics::ByteArray(_) => Type::BYTE_ARRAY,
471            Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
472        }
473    }
474}
475
476impl fmt::Display for Statistics {
477    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
478        match self {
479            Statistics::Boolean(typed) => write!(f, "{typed}"),
480            Statistics::Int32(typed) => write!(f, "{typed}"),
481            Statistics::Int64(typed) => write!(f, "{typed}"),
482            Statistics::Int96(typed) => write!(f, "{typed}"),
483            Statistics::Float(typed) => write!(f, "{typed}"),
484            Statistics::Double(typed) => write!(f, "{typed}"),
485            Statistics::ByteArray(typed) => write!(f, "{typed}"),
486            Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
487        }
488    }
489}
490
/// Typed implementation for [`Statistics`].
///
/// Alias for the [`ValueStatistics`] over the native value type (`T::T`) of
/// the Parquet [`DataType`] `T`.
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
493
/// Typed statistics for one column chunk
///
/// See [`Statistics`] for more details
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    // Min value (or a bound on it, see `is_min_value_exact`), if known.
    min: Option<T>,
    // Max value (or a bound on it, see `is_max_value_exact`), if known.
    max: Option<T>,
    // Distinct count could be omitted in some cases
    distinct_count: Option<u64>,
    // Number of null values, if known.
    null_count: Option<u64>,

    // Whether or not the min or max values are exact, or truncated.
    is_max_value_exact: bool,
    is_min_value_exact: bool,

    /// If `true` the statistics use the deprecated `min` and `max` fields
    /// rather than `min_value` and `max_value`: when converted to thrift,
    /// `min_value` and `max_value` are left unset, and the deprecated fields
    /// are filled only if `is_min_max_backwards_compatible` is also `true`.
    is_min_max_deprecated: bool,

    /// If `true` the statistics are compatible with the deprecated `min` and
    /// `max` fields. See [`ValueStatistics::is_min_max_backwards_compatible`]
    is_min_max_backwards_compatible: bool,
}
517
518impl<T: ParquetValueType> ValueStatistics<T> {
519    /// Creates new typed statistics.
520    pub fn new(
521        min: Option<T>,
522        max: Option<T>,
523        distinct_count: Option<u64>,
524        null_count: Option<u64>,
525        is_min_max_deprecated: bool,
526    ) -> Self {
527        Self {
528            is_max_value_exact: max.is_some(),
529            is_min_value_exact: min.is_some(),
530            min,
531            max,
532            distinct_count,
533            null_count,
534            is_min_max_deprecated,
535            is_min_max_backwards_compatible: is_min_max_deprecated,
536        }
537    }
538
539    /// Set whether the stored `min` field represents the exact
540    /// minimum, or just a bound on the minimum value.
541    ///
542    /// see [`Self::min_is_exact`]
543    pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
544        Self {
545            is_min_value_exact,
546            ..self
547        }
548    }
549
550    /// Set whether the stored `max` field represents the exact
551    /// maximum, or just a bound on the maximum value.
552    ///
553    /// see [`Self::max_is_exact`]
554    pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
555        Self {
556            is_max_value_exact,
557            ..self
558        }
559    }
560
561    /// Set whether to write the deprecated `min` and `max` fields
562    /// for compatibility with older parquet writers
563    ///
564    /// This should only be enabled if the field is signed,
565    /// see [`Self::is_min_max_backwards_compatible`]
566    pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self {
567        Self {
568            is_min_max_backwards_compatible: backwards_compatible,
569            ..self
570        }
571    }
572
573    /// Returns min value of the statistics, if known.
574    pub fn min_opt(&self) -> Option<&T> {
575        self.min.as_ref()
576    }
577
578    /// Returns max value of the statistics, if known.
579    pub fn max_opt(&self) -> Option<&T> {
580        self.max.as_ref()
581    }
582
583    /// Returns min value as bytes of the statistics, if min value is known.
584    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
585        self.min_opt().map(AsBytes::as_bytes)
586    }
587
588    /// Returns max value as bytes of the statistics, if max value is known.
589    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
590        self.max_opt().map(AsBytes::as_bytes)
591    }
592
593    /// Whether or not min and max values are set.
594    /// Normally both min/max values will be set to `Some(value)` or `None`.
595    pub(crate) fn _internal_has_min_max_set(&self) -> bool {
596        self.min.is_some() && self.max.is_some()
597    }
598
599    /// Whether or not max value is set, and is an exact value.
600    pub fn max_is_exact(&self) -> bool {
601        self.max.is_some() && self.is_max_value_exact
602    }
603
604    /// Whether or not min value is set, and is an exact value.
605    pub fn min_is_exact(&self) -> bool {
606        self.min.is_some() && self.is_min_value_exact
607    }
608
609    /// Returns optional value of number of distinct values occurring.
610    pub fn distinct_count(&self) -> Option<u64> {
611        self.distinct_count
612    }
613
614    /// Returns null count.
615    pub fn null_count_opt(&self) -> Option<u64> {
616        self.null_count
617    }
618
619    /// Returns `true` if statistics were created using old min/max fields.
620    fn is_min_max_deprecated(&self) -> bool {
621        self.is_min_max_deprecated
622    }
623
624    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
625    /// using signed comparison. This resulted in an undefined ordering for unsigned
626    /// quantities, such as booleans and unsigned integers.
627    ///
628    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
629    /// which have a type-defined sort order.
630    ///
631    /// However, not all readers have been updated. For backwards compatibility, this method
632    /// returns `true` if the statistics within this have a signed sort order, that is
633    /// compatible with being stored in the deprecated `min` and `max` fields
634    pub fn is_min_max_backwards_compatible(&self) -> bool {
635        self.is_min_max_backwards_compatible
636    }
637}
638
639impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
640    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
641        write!(f, "{{")?;
642        write!(f, "min: ")?;
643        match self.min {
644            Some(ref value) => write!(f, "{value}")?,
645            None => write!(f, "N/A")?,
646        }
647        write!(f, ", max: ")?;
648        match self.max {
649            Some(ref value) => write!(f, "{value}")?,
650            None => write!(f, "N/A")?,
651        }
652        write!(f, ", distinct_count: ")?;
653        match self.distinct_count {
654            Some(value) => write!(f, "{value}")?,
655            None => write!(f, "N/A")?,
656        }
657        write!(f, ", null_count: ")?;
658        match self.null_count {
659            Some(value) => write!(f, "{value}")?,
660            None => write!(f, "N/A")?,
661        }
662        write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
663        write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
664        write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
665        write!(f, "}}")
666    }
667}
668
669impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
670    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
671        write!(
672            f,
673            "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
674             min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
675            self.min,
676            self.max,
677            self.distinct_count,
678            self.null_count,
679            self.is_min_max_deprecated,
680            self.is_min_max_backwards_compatible,
681            self.is_max_value_exact,
682            self.is_min_value_exact
683        )
684    }
685}
686
687#[cfg(test)]
688mod tests {
689    use super::*;
690
691    #[test]
692    fn test_statistics_min_max_bytes() {
693        let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
694        assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes()));
695        assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes()));
696
697        let stats = Statistics::byte_array(
698            Some(ByteArray::from(vec![1, 2, 3])),
699            Some(ByteArray::from(vec![3, 4, 5])),
700            None,
701            Some(1),
702            true,
703        );
704        assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
705        assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
706    }
707
708    #[test]
709    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
710    fn test_statistics_negative_null_count() {
711        let thrift_stats = PageStatistics {
712            max: None,
713            min: None,
714            null_count: Some(-10),
715            distinct_count: None,
716            max_value: None,
717            min_value: None,
718            is_max_value_exact: None,
719            is_min_value_exact: None,
720        };
721
722        from_thrift_page_stats(Type::INT32, Some(thrift_stats)).unwrap();
723    }
724
725    #[test]
726    fn test_statistics_thrift_none() {
727        assert_eq!(from_thrift_page_stats(Type::INT32, None).unwrap(), None);
728        assert_eq!(
729            from_thrift_page_stats(Type::BYTE_ARRAY, None).unwrap(),
730            None
731        );
732    }
733
734    #[test]
735    fn test_statistics_debug() {
736        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
737        assert_eq!(
738            format!("{stats:?}"),
739            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
740             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
741        );
742
743        let stats = Statistics::int32(None, None, None, Some(7), false);
744        assert_eq!(
745            format!("{stats:?}"),
746            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
747             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
748        )
749    }
750
751    #[test]
752    fn test_statistics_display() {
753        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
754        assert_eq!(
755            format!("{stats}"),
756            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
757        );
758
759        let stats = Statistics::int64(None, None, None, Some(7), false);
760        assert_eq!(
761            format!("{stats}"),
762            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
763             false, max_value_exact: false, min_value_exact: false}"
764        );
765
766        let stats = Statistics::int96(
767            Some(Int96::from(vec![1, 0, 0])),
768            Some(Int96::from(vec![2, 3, 4])),
769            None,
770            Some(3),
771            true,
772        );
773        assert_eq!(
774            format!("{stats}"),
775            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
776             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
777        );
778
779        let stats = Statistics::ByteArray(
780            ValueStatistics::new(
781                Some(ByteArray::from(vec![1u8])),
782                Some(ByteArray::from(vec![2u8])),
783                Some(5),
784                Some(7),
785                false,
786            )
787            .with_max_is_exact(false)
788            .with_min_is_exact(false),
789        );
790        assert_eq!(
791            format!("{stats}"),
792            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
793        );
794    }
795
796    #[test]
797    fn test_statistics_partial_eq() {
798        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);
799
800        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), true) == expected);
801        assert!(Statistics::int32(Some(11), Some(45), None, Some(11), true) != expected);
802        assert!(Statistics::int32(Some(12), Some(44), None, Some(11), true) != expected);
803        assert!(Statistics::int32(Some(12), Some(45), None, Some(23), true) != expected);
804        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), false) != expected);
805
806        assert!(
807            Statistics::int32(Some(12), Some(45), None, Some(11), false)
808                != Statistics::int64(Some(12), Some(45), None, Some(11), false)
809        );
810
811        assert!(
812            Statistics::boolean(Some(false), Some(true), None, None, true)
813                != Statistics::double(Some(1.2), Some(4.5), None, None, true)
814        );
815
816        assert!(
817            Statistics::byte_array(
818                Some(ByteArray::from(vec![1, 2, 3])),
819                Some(ByteArray::from(vec![1, 2, 3])),
820                None,
821                None,
822                true
823            ) != Statistics::fixed_len_byte_array(
824                Some(ByteArray::from(vec![1, 2, 3]).into()),
825                Some(ByteArray::from(vec![1, 2, 3]).into()),
826                None,
827                None,
828                true,
829            )
830        );
831
832        assert!(
833            Statistics::byte_array(
834                Some(ByteArray::from(vec![1, 2, 3])),
835                Some(ByteArray::from(vec![1, 2, 3])),
836                None,
837                None,
838                true,
839            ) != Statistics::ByteArray(
840                ValueStatistics::new(
841                    Some(ByteArray::from(vec![1, 2, 3])),
842                    Some(ByteArray::from(vec![1, 2, 3])),
843                    None,
844                    None,
845                    true,
846                )
847                .with_max_is_exact(false)
848            )
849        );
850
851        assert!(
852            Statistics::fixed_len_byte_array(
853                Some(FixedLenByteArray::from(vec![1, 2, 3])),
854                Some(FixedLenByteArray::from(vec![1, 2, 3])),
855                None,
856                None,
857                true,
858            ) != Statistics::FixedLenByteArray(
859                ValueStatistics::new(
860                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
861                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
862                    None,
863                    None,
864                    true,
865                )
866                .with_min_is_exact(false)
867            )
868        );
869    }
870
871    #[test]
872    fn test_statistics_from_thrift() {
873        // Helper method to check statistics conversion.
874        fn check_stats(stats: Statistics) {
875            let tpe = stats.physical_type();
876            let thrift_stats = page_stats_to_thrift(Some(&stats));
877            assert_eq!(
878                from_thrift_page_stats(tpe, thrift_stats).unwrap(),
879                Some(stats)
880            );
881        }
882
883        check_stats(Statistics::boolean(
884            Some(false),
885            Some(true),
886            None,
887            Some(7),
888            true,
889        ));
890        check_stats(Statistics::boolean(
891            Some(false),
892            Some(true),
893            None,
894            Some(7),
895            true,
896        ));
897        check_stats(Statistics::boolean(
898            Some(false),
899            Some(true),
900            None,
901            Some(0),
902            false,
903        ));
904        check_stats(Statistics::boolean(
905            Some(true),
906            Some(true),
907            None,
908            Some(7),
909            true,
910        ));
911        check_stats(Statistics::boolean(
912            Some(false),
913            Some(false),
914            None,
915            Some(7),
916            true,
917        ));
918        check_stats(Statistics::boolean(None, None, None, Some(7), true));
919
920        check_stats(Statistics::int32(
921            Some(-100),
922            Some(500),
923            None,
924            Some(7),
925            true,
926        ));
927        check_stats(Statistics::int32(
928            Some(-100),
929            Some(500),
930            None,
931            Some(0),
932            false,
933        ));
934        check_stats(Statistics::int32(None, None, None, Some(7), true));
935
936        check_stats(Statistics::int64(
937            Some(-100),
938            Some(200),
939            None,
940            Some(7),
941            true,
942        ));
943        check_stats(Statistics::int64(
944            Some(-100),
945            Some(200),
946            None,
947            Some(0),
948            false,
949        ));
950        check_stats(Statistics::int64(None, None, None, Some(7), true));
951
952        check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true));
953        check_stats(Statistics::float(
954            Some(1.2),
955            Some(3.4),
956            None,
957            Some(0),
958            false,
959        ));
960        check_stats(Statistics::float(None, None, None, Some(7), true));
961
962        check_stats(Statistics::double(
963            Some(1.2),
964            Some(3.4),
965            None,
966            Some(7),
967            true,
968        ));
969        check_stats(Statistics::double(
970            Some(1.2),
971            Some(3.4),
972            None,
973            Some(0),
974            false,
975        ));
976        check_stats(Statistics::double(None, None, None, Some(7), true));
977
978        check_stats(Statistics::byte_array(
979            Some(ByteArray::from(vec![1, 2, 3])),
980            Some(ByteArray::from(vec![3, 4, 5])),
981            None,
982            Some(7),
983            true,
984        ));
985        check_stats(Statistics::byte_array(None, None, None, Some(7), true));
986
987        check_stats(Statistics::fixed_len_byte_array(
988            Some(ByteArray::from(vec![1, 2, 3]).into()),
989            Some(ByteArray::from(vec![3, 4, 5]).into()),
990            None,
991            Some(7),
992            true,
993        ));
994        check_stats(Statistics::fixed_len_byte_array(
995            None,
996            None,
997            None,
998            Some(7),
999            true,
1000        ));
1001    }
1002
1003    #[test]
1004    fn test_count_encoding() {
1005        statistics_count_test(None, None);
1006        statistics_count_test(Some(0), Some(0));
1007        statistics_count_test(Some(100), Some(2000));
1008        statistics_count_test(Some(1), None);
1009        statistics_count_test(None, Some(1));
1010    }
1011
1012    #[test]
1013    fn test_count_encoding_distinct_too_large() {
1014        // statistics are stored using i64, so test trying to store larger values
1015        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
1016        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1017        assert_eq!(thrift_stats.distinct_count, None); // can't store u64 max --> null
1018        assert_eq!(thrift_stats.null_count, Some(100));
1019    }
1020
1021    #[test]
1022    fn test_count_encoding_null_too_large() {
1023        // statistics are stored using i64, so test trying to store larger values
1024        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
1025        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1026        assert_eq!(thrift_stats.distinct_count, Some(100));
1027        assert_eq!(thrift_stats.null_count, None); // can' store u64 max --> null
1028    }
1029
1030    #[test]
1031    fn test_count_decoding_null_invalid() {
1032        let tstatistics = PageStatistics {
1033            null_count: Some(-42),
1034            max: None,
1035            min: None,
1036            distinct_count: None,
1037            max_value: None,
1038            min_value: None,
1039            is_max_value_exact: None,
1040            is_min_value_exact: None,
1041        };
1042        let err = from_thrift_page_stats(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
1043        assert_eq!(
1044            err.to_string(),
1045            "Parquet error: Statistics null count is negative -42"
1046        );
1047    }
1048
1049    /// Writes statistics to thrift and reads them back and ensures:
1050    /// - The statistics are the same
1051    /// - The statistics written to thrift are the same as the original statistics
1052    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
1053        let statistics = make_bool_stats(distinct_count, null_count);
1054
1055        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1056        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
1057        assert_eq!(
1058            thrift_stats.distinct_count.map(|c| c as u64),
1059            distinct_count
1060        );
1061
1062        let round_tripped = from_thrift_page_stats(Type::BOOLEAN, Some(thrift_stats))
1063            .unwrap()
1064            .unwrap();
1065        // TODO: remove branch when we no longer support assuming null_count==None in the thrift
1066        // means null_count = Some(0)
1067        if null_count.is_none() {
1068            assert_ne!(round_tripped, statistics);
1069            assert!(round_tripped.null_count_opt().is_some());
1070            assert_eq!(round_tripped.null_count_opt(), Some(0));
1071            assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
1072            assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
1073            assert_eq!(
1074                round_tripped.distinct_count_opt(),
1075                statistics.distinct_count_opt()
1076            );
1077        } else {
1078            assert_eq!(round_tripped, statistics);
1079        }
1080    }
1081
1082    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
1083        let min = Some(true);
1084        let max = Some(false);
1085        let is_min_max_deprecated = false;
1086
1087        // test is about the counts, so we aren't really testing the min/max values
1088        Statistics::Boolean(ValueStatistics::new(
1089            min,
1090            max,
1091            distinct_count,
1092            null_count,
1093            is_min_max_deprecated,
1094        ))
1095    }
1096
1097    #[test]
1098    fn test_int96_invalid_statistics() {
1099        let mut thrift_stats = PageStatistics {
1100            max: None,
1101            min: Some((0..13).collect()),
1102            null_count: Some(0),
1103            distinct_count: None,
1104            max_value: None,
1105            min_value: None,
1106            is_max_value_exact: None,
1107            is_min_value_exact: None,
1108        };
1109
1110        let err = from_thrift_page_stats(Type::INT96, Some(thrift_stats.clone())).unwrap_err();
1111        assert_eq!(
1112            err.to_string(),
1113            "Parquet error: Incorrect Int96 min statistics"
1114        );
1115
1116        thrift_stats.min = None;
1117        thrift_stats.max = Some((0..13).collect());
1118        let err = from_thrift_page_stats(Type::INT96, Some(thrift_stats)).unwrap_err();
1119        assert_eq!(
1120            err.to_string(),
1121            "Parquet error: Incorrect Int96 max statistics"
1122        );
1123    }
1124}