parquet/file/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains definitions for working with Parquet statistics.
19//!
//! Though some common methods are available on the [`Statistics`] enum, use pattern
//! matching to extract the actual min and max values from statistics, as shown below:
22//!
23//! # Examples
24//! ```rust
25//! use parquet::file::statistics::Statistics;
26//!
27//! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true);
28//! assert_eq!(stats.null_count_opt(), Some(3));
29//! assert!(stats.is_min_max_deprecated());
30//! assert!(stats.min_is_exact());
31//! assert!(stats.max_is_exact());
32//!
33//! match stats {
34//!     Statistics::Int32(ref typed) => {
35//!         assert_eq!(typed.min_opt(), Some(&1));
36//!         assert_eq!(typed.max_opt(), Some(&10));
37//!     }
38//!     _ => {}
39//! }
40//! ```
41
42use std::fmt;
43
44use crate::basic::Type;
45use crate::data_type::private::ParquetValueType;
46use crate::data_type::*;
47use crate::errors::{ParquetError, Result};
48use crate::file::metadata::thrift_gen::PageStatistics;
49use crate::util::bit_util::FromBytes;
50
51pub(crate) mod private {
52    use super::*;
53
54    pub trait MakeStatistics {
55        fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
56        where
57            Self: Sized;
58    }
59
60    macro_rules! gen_make_statistics {
61        ($value_ty:ty, $stat:ident) => {
62            impl MakeStatistics for $value_ty {
63                fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
64                where
65                    Self: Sized,
66                {
67                    Statistics::$stat(statistics)
68                }
69            }
70        };
71    }
72
73    gen_make_statistics!(bool, Boolean);
74    gen_make_statistics!(i32, Int32);
75    gen_make_statistics!(i64, Int64);
76    gen_make_statistics!(Int96, Int96);
77    gen_make_statistics!(f32, Float);
78    gen_make_statistics!(f64, Double);
79    gen_make_statistics!(ByteArray, ByteArray);
80    gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
81}
82
/// Macro to generate the per-type constructors of `Statistics`.
///
/// `$ctor` is the name of the generated associated function, `$value` is the type
/// of its `min`/`max` arguments and `$variant` is the `Statistics` variant it builds.
macro_rules! statistics_new_func {
    ($ctor:ident, $value:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $ctor(
            min: $value,
            max: $value,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
104
// Macro to generate getter functions for Statistics.
//
// Expands to a `match` over every `Statistics` variant that forwards the
// zero-argument method `$method` to the wrapped typed statistics.
macro_rules! statistics_enum_func {
    ($this:ident, $method:ident) => {{
        match $this {
            Statistics::Boolean(inner) => inner.$method(),
            Statistics::Int32(inner) => inner.$method(),
            Statistics::Int64(inner) => inner.$method(),
            Statistics::Int96(inner) => inner.$method(),
            Statistics::Float(inner) => inner.$method(),
            Statistics::Double(inner) => inner.$method(),
            Statistics::ByteArray(inner) => inner.$method(),
            Statistics::FixedLenByteArray(inner) => inner.$method(),
        }
    }};
}
120
121/// Converts Thrift definition into `Statistics`.
122pub(crate) fn from_thrift_page_stats(
123    physical_type: Type,
124    thrift_stats: Option<PageStatistics>,
125) -> Result<Option<Statistics>> {
126    Ok(match thrift_stats {
127        Some(stats) => {
128            // Number of nulls recorded, when it is not available, we just mark it as 0.
129            // TODO this should be `None` if there is no information about NULLS.
130            // see https://github.com/apache/arrow-rs/pull/6216/files
131            let null_count = stats.null_count.unwrap_or(0);
132
133            if null_count < 0 {
134                return Err(ParquetError::General(format!(
135                    "Statistics null count is negative {null_count}",
136                )));
137            }
138
139            // Generic null count.
140            let null_count = Some(null_count as u64);
141            // Generic distinct count (count of distinct values occurring)
142            let distinct_count = stats.distinct_count.map(|value| value as u64);
143            // Whether or not statistics use deprecated min/max fields.
144            let old_format = stats.min_value.is_none() && stats.max_value.is_none();
145            // Generic min value as bytes.
146            let min = if old_format {
147                stats.min
148            } else {
149                stats.min_value
150            };
151            // Generic max value as bytes.
152            let max = if old_format {
153                stats.max
154            } else {
155                stats.max_value
156            };
157
158            fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
159                if let Some(min) = min {
160                    if min.len() < len {
161                        return Err(ParquetError::General(
162                            "Insufficient bytes to parse min statistic".to_string(),
163                        ));
164                    }
165                }
166                if let Some(max) = max {
167                    if max.len() < len {
168                        return Err(ParquetError::General(
169                            "Insufficient bytes to parse max statistic".to_string(),
170                        ));
171                    }
172                }
173                Ok(())
174            }
175
176            match physical_type {
177                Type::BOOLEAN => check_len(&min, &max, 1),
178                Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
179                Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
180                Type::INT96 => check_len(&min, &max, 12),
181                _ => Ok(()),
182            }?;
183
184            // Values are encoded using PLAIN encoding definition, except that
185            // variable-length byte arrays do not include a length prefix.
186            //
187            // Instead of using actual decoder, we manually convert values.
188            let res = match physical_type {
189                Type::BOOLEAN => Statistics::boolean(
190                    min.map(|data| data[0] != 0),
191                    max.map(|data| data[0] != 0),
192                    distinct_count,
193                    null_count,
194                    old_format,
195                ),
196                Type::INT32 => Statistics::int32(
197                    min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
198                    max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
199                    distinct_count,
200                    null_count,
201                    old_format,
202                ),
203                Type::INT64 => Statistics::int64(
204                    min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
205                    max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
206                    distinct_count,
207                    null_count,
208                    old_format,
209                ),
210                Type::INT96 => {
211                    // INT96 statistics may not be correct, because comparison is signed
212                    let min = if let Some(data) = min {
213                        assert_eq!(data.len(), 12);
214                        Some(Int96::try_from_le_slice(&data)?)
215                    } else {
216                        None
217                    };
218                    let max = if let Some(data) = max {
219                        assert_eq!(data.len(), 12);
220                        Some(Int96::try_from_le_slice(&data)?)
221                    } else {
222                        None
223                    };
224                    Statistics::int96(min, max, distinct_count, null_count, old_format)
225                }
226                Type::FLOAT => Statistics::float(
227                    min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
228                    max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
229                    distinct_count,
230                    null_count,
231                    old_format,
232                ),
233                Type::DOUBLE => Statistics::double(
234                    min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
235                    max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
236                    distinct_count,
237                    null_count,
238                    old_format,
239                ),
240                Type::BYTE_ARRAY => Statistics::ByteArray(
241                    ValueStatistics::new(
242                        min.map(ByteArray::from),
243                        max.map(ByteArray::from),
244                        distinct_count,
245                        null_count,
246                        old_format,
247                    )
248                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
249                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
250                ),
251                Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
252                    ValueStatistics::new(
253                        min.map(ByteArray::from).map(FixedLenByteArray::from),
254                        max.map(ByteArray::from).map(FixedLenByteArray::from),
255                        distinct_count,
256                        null_count,
257                        old_format,
258                    )
259                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
260                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
261                ),
262            };
263
264            Some(res)
265        }
266        None => None,
267    })
268}
269
270/// Convert Statistics into Thrift definition.
271pub(crate) fn page_stats_to_thrift(stats: Option<&Statistics>) -> Option<PageStatistics> {
272    let stats = stats?;
273
274    // record null count if it can fit in i64
275    let null_count = stats
276        .null_count_opt()
277        .and_then(|value| i64::try_from(value).ok());
278
279    // record distinct count if it can fit in i64
280    let distinct_count = stats
281        .distinct_count_opt()
282        .and_then(|value| i64::try_from(value).ok());
283
284    let mut thrift_stats = PageStatistics {
285        max: None,
286        min: None,
287        null_count,
288        distinct_count,
289        max_value: None,
290        min_value: None,
291        is_max_value_exact: None,
292        is_min_value_exact: None,
293    };
294
295    // Get min/max if set.
296    let (min, max, min_exact, max_exact) = (
297        stats.min_bytes_opt().map(|x| x.to_vec()),
298        stats.max_bytes_opt().map(|x| x.to_vec()),
299        Some(stats.min_is_exact()),
300        Some(stats.max_is_exact()),
301    );
302    if stats.is_min_max_backwards_compatible() {
303        // Copy to deprecated min, max values for compatibility with older readers
304        thrift_stats.min.clone_from(&min);
305        thrift_stats.max.clone_from(&max);
306    }
307
308    if !stats.is_min_max_deprecated() {
309        thrift_stats.min_value = min;
310        thrift_stats.max_value = max;
311    }
312
313    thrift_stats.is_min_value_exact = min_exact;
314    thrift_stats.is_max_value_exact = max_exact;
315
316    Some(thrift_stats)
317}
318
/// Strongly typed statistics for a column chunk within a row group.
///
/// This structure is a natively typed, in memory representation of the thrift
/// `Statistics` structure in a Parquet file footer. The statistics stored in
/// this structure can be used by query engines to skip decoding pages while
/// reading parquet data.
///
/// Each variant wraps a [`ValueStatistics`] over the Rust value type used for
/// the corresponding Parquet physical type; see [`Statistics::physical_type`].
///
/// Page level statistics are stored separately, in [ColumnIndexMetaData].
///
/// [ColumnIndexMetaData]: crate::file::page_index::column_index::ColumnIndexMetaData
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
    /// Statistics for Boolean column
    Boolean(ValueStatistics<bool>),
    /// Statistics for Int32 column
    Int32(ValueStatistics<i32>),
    /// Statistics for Int64 column
    Int64(ValueStatistics<i64>),
    /// Statistics for Int96 column
    Int96(ValueStatistics<Int96>),
    /// Statistics for Float column
    Float(ValueStatistics<f32>),
    /// Statistics for Double column
    Double(ValueStatistics<f64>),
    /// Statistics for ByteArray column
    ByteArray(ValueStatistics<ByteArray>),
    /// Statistics for FixedLenByteArray column
    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
348
349impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
350    fn from(t: ValueStatistics<T>) -> Self {
351        T::make_statistics(t)
352    }
353}
354
355impl Statistics {
356    /// Creates new statistics for a column type
357    pub fn new<T: ParquetValueType>(
358        min: Option<T>,
359        max: Option<T>,
360        distinct_count: Option<u64>,
361        null_count: Option<u64>,
362        is_deprecated: bool,
363    ) -> Self {
364        Self::from(ValueStatistics::new(
365            min,
366            max,
367            distinct_count,
368            null_count,
369            is_deprecated,
370        ))
371    }
372
373    statistics_new_func![boolean, Option<bool>, Boolean];
374
375    statistics_new_func![int32, Option<i32>, Int32];
376
377    statistics_new_func![int64, Option<i64>, Int64];
378
379    statistics_new_func![int96, Option<Int96>, Int96];
380
381    statistics_new_func![float, Option<f32>, Float];
382
383    statistics_new_func![double, Option<f64>, Double];
384
385    statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
386
387    statistics_new_func![
388        fixed_len_byte_array,
389        Option<FixedLenByteArray>,
390        FixedLenByteArray
391    ];
392
393    /// Returns `true` if statistics have old `min` and `max` fields set.
394    /// This means that the column order is likely to be undefined, which, for old files
395    /// could mean a signed sort order of values.
396    ///
397    /// Refer to [`ColumnOrder`](crate::basic::ColumnOrder) and
398    /// [`SortOrder`](crate::basic::SortOrder) for more information.
399    pub fn is_min_max_deprecated(&self) -> bool {
400        statistics_enum_func![self, is_min_max_deprecated]
401    }
402
403    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
404    /// using signed comparison. This resulted in an undefined ordering for unsigned
405    /// quantities, such as booleans and unsigned integers.
406    ///
407    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
408    /// which have a type-defined sort order.
409    ///
410    /// However, not all readers have been updated. For backwards compatibility, this method
411    /// returns `true` if the statistics within this have a signed sort order, that is
412    /// compatible with being stored in the deprecated `min` and `max` fields
413    pub fn is_min_max_backwards_compatible(&self) -> bool {
414        statistics_enum_func![self, is_min_max_backwards_compatible]
415    }
416
417    /// Returns optional value of number of distinct values occurring.
418    /// When it is `None`, the value should be ignored.
419    pub fn distinct_count_opt(&self) -> Option<u64> {
420        statistics_enum_func![self, distinct_count]
421    }
422
423    /// Returns number of null values for the column, if known.
424    /// Note that this includes all nulls when column is part of the complex type.
425    ///
426    /// Note this API returns Some(0) even if the null count was not present
427    /// in the statistics.
428    /// See <https://github.com/apache/arrow-rs/pull/6216/files>
429    pub fn null_count_opt(&self) -> Option<u64> {
430        statistics_enum_func![self, null_count_opt]
431    }
432
433    /// Returns `true` if the min value is set, and is an exact min value.
434    pub fn min_is_exact(&self) -> bool {
435        statistics_enum_func![self, min_is_exact]
436    }
437
438    /// Returns `true` if the max value is set, and is an exact max value.
439    pub fn max_is_exact(&self) -> bool {
440        statistics_enum_func![self, max_is_exact]
441    }
442
443    /// Returns slice of bytes that represent min value, if min value is known.
444    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
445        statistics_enum_func![self, min_bytes_opt]
446    }
447
448    /// Returns slice of bytes that represent max value, if max value is known.
449    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
450        statistics_enum_func![self, max_bytes_opt]
451    }
452
453    /// Returns physical type associated with statistics.
454    pub fn physical_type(&self) -> Type {
455        match self {
456            Statistics::Boolean(_) => Type::BOOLEAN,
457            Statistics::Int32(_) => Type::INT32,
458            Statistics::Int64(_) => Type::INT64,
459            Statistics::Int96(_) => Type::INT96,
460            Statistics::Float(_) => Type::FLOAT,
461            Statistics::Double(_) => Type::DOUBLE,
462            Statistics::ByteArray(_) => Type::BYTE_ARRAY,
463            Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
464        }
465    }
466}
467
468impl fmt::Display for Statistics {
469    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
470        match self {
471            Statistics::Boolean(typed) => write!(f, "{typed}"),
472            Statistics::Int32(typed) => write!(f, "{typed}"),
473            Statistics::Int64(typed) => write!(f, "{typed}"),
474            Statistics::Int96(typed) => write!(f, "{typed}"),
475            Statistics::Float(typed) => write!(f, "{typed}"),
476            Statistics::Double(typed) => write!(f, "{typed}"),
477            Statistics::ByteArray(typed) => write!(f, "{typed}"),
478            Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
479        }
480    }
481}
482
/// Typed implementation for [`Statistics`].
///
/// Alias for [`ValueStatistics`] over the value type associated with the
/// data type `T`, i.e. `<T as DataType>::T`.
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
485
/// Typed statistics for one column chunk
///
/// Holds optional min/max values of type `T` together with optional distinct
/// and null counts, plus flags describing how the min/max values should be
/// interpreted and serialized.
///
/// See [`Statistics`] for more details
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    // Minimum value, if known.
    min: Option<T>,
    // Maximum value, if known.
    max: Option<T>,
    // Distinct count could be omitted in some cases
    distinct_count: Option<u64>,
    // Number of null values, if known.
    null_count: Option<u64>,

    // Whether or not the min or max values are exact, or truncated.
    // `ValueStatistics::new` initializes each flag to `true` when the
    // corresponding value is present.
    is_max_value_exact: bool,
    is_min_value_exact: bool,

    /// If `true` populate the deprecated `min` and `max` fields instead of
    /// `min_value` and `max_value`
    is_min_max_deprecated: bool,

    /// If `true` the statistics are compatible with the deprecated `min` and
    /// `max` fields. See [`ValueStatistics::is_min_max_backwards_compatible`]
    is_min_max_backwards_compatible: bool,
}
509
510impl<T: ParquetValueType> ValueStatistics<T> {
511    /// Creates new typed statistics.
512    pub fn new(
513        min: Option<T>,
514        max: Option<T>,
515        distinct_count: Option<u64>,
516        null_count: Option<u64>,
517        is_min_max_deprecated: bool,
518    ) -> Self {
519        Self {
520            is_max_value_exact: max.is_some(),
521            is_min_value_exact: min.is_some(),
522            min,
523            max,
524            distinct_count,
525            null_count,
526            is_min_max_deprecated,
527            is_min_max_backwards_compatible: is_min_max_deprecated,
528        }
529    }
530
531    /// Set whether the stored `min` field represents the exact
532    /// minimum, or just a bound on the minimum value.
533    ///
534    /// see [`Self::min_is_exact`]
535    pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
536        Self {
537            is_min_value_exact,
538            ..self
539        }
540    }
541
542    /// Set whether the stored `max` field represents the exact
543    /// maximum, or just a bound on the maximum value.
544    ///
545    /// see [`Self::max_is_exact`]
546    pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
547        Self {
548            is_max_value_exact,
549            ..self
550        }
551    }
552
553    /// Set whether to write the deprecated `min` and `max` fields
554    /// for compatibility with older parquet writers
555    ///
556    /// This should only be enabled if the field is signed,
557    /// see [`Self::is_min_max_backwards_compatible`]
558    pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self {
559        Self {
560            is_min_max_backwards_compatible: backwards_compatible,
561            ..self
562        }
563    }
564
565    /// Returns min value of the statistics, if known.
566    pub fn min_opt(&self) -> Option<&T> {
567        self.min.as_ref()
568    }
569
570    /// Returns max value of the statistics, if known.
571    pub fn max_opt(&self) -> Option<&T> {
572        self.max.as_ref()
573    }
574
575    /// Returns min value as bytes of the statistics, if min value is known.
576    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
577        self.min_opt().map(AsBytes::as_bytes)
578    }
579
580    /// Returns max value as bytes of the statistics, if max value is known.
581    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
582        self.max_opt().map(AsBytes::as_bytes)
583    }
584
585    /// Whether or not min and max values are set.
586    /// Normally both min/max values will be set to `Some(value)` or `None`.
587    pub(crate) fn _internal_has_min_max_set(&self) -> bool {
588        self.min.is_some() && self.max.is_some()
589    }
590
591    /// Whether or not max value is set, and is an exact value.
592    pub fn max_is_exact(&self) -> bool {
593        self.max.is_some() && self.is_max_value_exact
594    }
595
596    /// Whether or not min value is set, and is an exact value.
597    pub fn min_is_exact(&self) -> bool {
598        self.min.is_some() && self.is_min_value_exact
599    }
600
601    /// Returns optional value of number of distinct values occurring.
602    pub fn distinct_count(&self) -> Option<u64> {
603        self.distinct_count
604    }
605
606    /// Returns null count.
607    pub fn null_count_opt(&self) -> Option<u64> {
608        self.null_count
609    }
610
611    /// Returns `true` if statistics were created using old min/max fields.
612    fn is_min_max_deprecated(&self) -> bool {
613        self.is_min_max_deprecated
614    }
615
616    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
617    /// using signed comparison. This resulted in an undefined ordering for unsigned
618    /// quantities, such as booleans and unsigned integers.
619    ///
620    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
621    /// which have a type-defined sort order.
622    ///
623    /// However, not all readers have been updated. For backwards compatibility, this method
624    /// returns `true` if the statistics within this have a signed sort order, that is
625    /// compatible with being stored in the deprecated `min` and `max` fields
626    pub fn is_min_max_backwards_compatible(&self) -> bool {
627        self.is_min_max_backwards_compatible
628    }
629}
630
631impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
632    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
633        write!(f, "{{")?;
634        write!(f, "min: ")?;
635        match self.min {
636            Some(ref value) => write!(f, "{value}")?,
637            None => write!(f, "N/A")?,
638        }
639        write!(f, ", max: ")?;
640        match self.max {
641            Some(ref value) => write!(f, "{value}")?,
642            None => write!(f, "N/A")?,
643        }
644        write!(f, ", distinct_count: ")?;
645        match self.distinct_count {
646            Some(value) => write!(f, "{value}")?,
647            None => write!(f, "N/A")?,
648        }
649        write!(f, ", null_count: ")?;
650        match self.null_count {
651            Some(value) => write!(f, "{value}")?,
652            None => write!(f, "N/A")?,
653        }
654        write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
655        write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
656        write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
657        write!(f, "}}")
658    }
659}
660
661impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
662    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
663        write!(
664            f,
665            "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
666             min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
667            self.min,
668            self.max,
669            self.distinct_count,
670            self.null_count,
671            self.is_min_max_deprecated,
672            self.is_min_max_backwards_compatible,
673            self.is_max_value_exact,
674            self.is_min_value_exact
675        )
676    }
677}
678
679#[cfg(test)]
680mod tests {
681    use super::*;
682
683    #[test]
684    fn test_statistics_min_max_bytes() {
685        let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
686        assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes()));
687        assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes()));
688
689        let stats = Statistics::byte_array(
690            Some(ByteArray::from(vec![1, 2, 3])),
691            Some(ByteArray::from(vec![3, 4, 5])),
692            None,
693            Some(1),
694            true,
695        );
696        assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
697        assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
698    }
699
700    #[test]
701    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
702    fn test_statistics_negative_null_count() {
703        let thrift_stats = PageStatistics {
704            max: None,
705            min: None,
706            null_count: Some(-10),
707            distinct_count: None,
708            max_value: None,
709            min_value: None,
710            is_max_value_exact: None,
711            is_min_value_exact: None,
712        };
713
714        from_thrift_page_stats(Type::INT32, Some(thrift_stats)).unwrap();
715    }
716
717    #[test]
718    fn test_statistics_thrift_none() {
719        assert_eq!(from_thrift_page_stats(Type::INT32, None).unwrap(), None);
720        assert_eq!(
721            from_thrift_page_stats(Type::BYTE_ARRAY, None).unwrap(),
722            None
723        );
724    }
725
726    #[test]
727    fn test_statistics_debug() {
728        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
729        assert_eq!(
730            format!("{stats:?}"),
731            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
732             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
733        );
734
735        let stats = Statistics::int32(None, None, None, Some(7), false);
736        assert_eq!(
737            format!("{stats:?}"),
738            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
739             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
740        )
741    }
742
743    #[test]
744    fn test_statistics_display() {
745        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
746        assert_eq!(
747            format!("{stats}"),
748            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
749        );
750
751        let stats = Statistics::int64(None, None, None, Some(7), false);
752        assert_eq!(
753            format!("{stats}"),
754            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
755             false, max_value_exact: false, min_value_exact: false}"
756        );
757
758        let stats = Statistics::int96(
759            Some(Int96::from(vec![1, 0, 0])),
760            Some(Int96::from(vec![2, 3, 4])),
761            None,
762            Some(3),
763            true,
764        );
765        assert_eq!(
766            format!("{stats}"),
767            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
768             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
769        );
770
771        let stats = Statistics::ByteArray(
772            ValueStatistics::new(
773                Some(ByteArray::from(vec![1u8])),
774                Some(ByteArray::from(vec![2u8])),
775                Some(5),
776                Some(7),
777                false,
778            )
779            .with_max_is_exact(false)
780            .with_min_is_exact(false),
781        );
782        assert_eq!(
783            format!("{stats}"),
784            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
785        );
786    }
787
788    #[test]
789    fn test_statistics_partial_eq() {
790        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);
791
792        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), true) == expected);
793        assert!(Statistics::int32(Some(11), Some(45), None, Some(11), true) != expected);
794        assert!(Statistics::int32(Some(12), Some(44), None, Some(11), true) != expected);
795        assert!(Statistics::int32(Some(12), Some(45), None, Some(23), true) != expected);
796        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), false) != expected);
797
798        assert!(
799            Statistics::int32(Some(12), Some(45), None, Some(11), false)
800                != Statistics::int64(Some(12), Some(45), None, Some(11), false)
801        );
802
803        assert!(
804            Statistics::boolean(Some(false), Some(true), None, None, true)
805                != Statistics::double(Some(1.2), Some(4.5), None, None, true)
806        );
807
808        assert!(
809            Statistics::byte_array(
810                Some(ByteArray::from(vec![1, 2, 3])),
811                Some(ByteArray::from(vec![1, 2, 3])),
812                None,
813                None,
814                true
815            ) != Statistics::fixed_len_byte_array(
816                Some(ByteArray::from(vec![1, 2, 3]).into()),
817                Some(ByteArray::from(vec![1, 2, 3]).into()),
818                None,
819                None,
820                true,
821            )
822        );
823
824        assert!(
825            Statistics::byte_array(
826                Some(ByteArray::from(vec![1, 2, 3])),
827                Some(ByteArray::from(vec![1, 2, 3])),
828                None,
829                None,
830                true,
831            ) != Statistics::ByteArray(
832                ValueStatistics::new(
833                    Some(ByteArray::from(vec![1, 2, 3])),
834                    Some(ByteArray::from(vec![1, 2, 3])),
835                    None,
836                    None,
837                    true,
838                )
839                .with_max_is_exact(false)
840            )
841        );
842
843        assert!(
844            Statistics::fixed_len_byte_array(
845                Some(FixedLenByteArray::from(vec![1, 2, 3])),
846                Some(FixedLenByteArray::from(vec![1, 2, 3])),
847                None,
848                None,
849                true,
850            ) != Statistics::FixedLenByteArray(
851                ValueStatistics::new(
852                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
853                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
854                    None,
855                    None,
856                    true,
857                )
858                .with_min_is_exact(false)
859            )
860        );
861    }
862
863    #[test]
864    fn test_statistics_from_thrift() {
865        // Helper method to check statistics conversion.
866        fn check_stats(stats: Statistics) {
867            let tpe = stats.physical_type();
868            let thrift_stats = page_stats_to_thrift(Some(&stats));
869            assert_eq!(
870                from_thrift_page_stats(tpe, thrift_stats).unwrap(),
871                Some(stats)
872            );
873        }
874
875        check_stats(Statistics::boolean(
876            Some(false),
877            Some(true),
878            None,
879            Some(7),
880            true,
881        ));
882        check_stats(Statistics::boolean(
883            Some(false),
884            Some(true),
885            None,
886            Some(7),
887            true,
888        ));
889        check_stats(Statistics::boolean(
890            Some(false),
891            Some(true),
892            None,
893            Some(0),
894            false,
895        ));
896        check_stats(Statistics::boolean(
897            Some(true),
898            Some(true),
899            None,
900            Some(7),
901            true,
902        ));
903        check_stats(Statistics::boolean(
904            Some(false),
905            Some(false),
906            None,
907            Some(7),
908            true,
909        ));
910        check_stats(Statistics::boolean(None, None, None, Some(7), true));
911
912        check_stats(Statistics::int32(
913            Some(-100),
914            Some(500),
915            None,
916            Some(7),
917            true,
918        ));
919        check_stats(Statistics::int32(
920            Some(-100),
921            Some(500),
922            None,
923            Some(0),
924            false,
925        ));
926        check_stats(Statistics::int32(None, None, None, Some(7), true));
927
928        check_stats(Statistics::int64(
929            Some(-100),
930            Some(200),
931            None,
932            Some(7),
933            true,
934        ));
935        check_stats(Statistics::int64(
936            Some(-100),
937            Some(200),
938            None,
939            Some(0),
940            false,
941        ));
942        check_stats(Statistics::int64(None, None, None, Some(7), true));
943
944        check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true));
945        check_stats(Statistics::float(
946            Some(1.2),
947            Some(3.4),
948            None,
949            Some(0),
950            false,
951        ));
952        check_stats(Statistics::float(None, None, None, Some(7), true));
953
954        check_stats(Statistics::double(
955            Some(1.2),
956            Some(3.4),
957            None,
958            Some(7),
959            true,
960        ));
961        check_stats(Statistics::double(
962            Some(1.2),
963            Some(3.4),
964            None,
965            Some(0),
966            false,
967        ));
968        check_stats(Statistics::double(None, None, None, Some(7), true));
969
970        check_stats(Statistics::byte_array(
971            Some(ByteArray::from(vec![1, 2, 3])),
972            Some(ByteArray::from(vec![3, 4, 5])),
973            None,
974            Some(7),
975            true,
976        ));
977        check_stats(Statistics::byte_array(None, None, None, Some(7), true));
978
979        check_stats(Statistics::fixed_len_byte_array(
980            Some(ByteArray::from(vec![1, 2, 3]).into()),
981            Some(ByteArray::from(vec![3, 4, 5]).into()),
982            None,
983            Some(7),
984            true,
985        ));
986        check_stats(Statistics::fixed_len_byte_array(
987            None,
988            None,
989            None,
990            Some(7),
991            true,
992        ));
993    }
994
995    #[test]
996    fn test_count_encoding() {
997        statistics_count_test(None, None);
998        statistics_count_test(Some(0), Some(0));
999        statistics_count_test(Some(100), Some(2000));
1000        statistics_count_test(Some(1), None);
1001        statistics_count_test(None, Some(1));
1002    }
1003
1004    #[test]
1005    fn test_count_encoding_distinct_too_large() {
1006        // statistics are stored using i64, so test trying to store larger values
1007        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
1008        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1009        assert_eq!(thrift_stats.distinct_count, None); // can't store u64 max --> null
1010        assert_eq!(thrift_stats.null_count, Some(100));
1011    }
1012
1013    #[test]
1014    fn test_count_encoding_null_too_large() {
1015        // statistics are stored using i64, so test trying to store larger values
1016        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
1017        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1018        assert_eq!(thrift_stats.distinct_count, Some(100));
1019        assert_eq!(thrift_stats.null_count, None); // can' store u64 max --> null
1020    }
1021
1022    #[test]
1023    fn test_count_decoding_null_invalid() {
1024        let tstatistics = PageStatistics {
1025            null_count: Some(-42),
1026            max: None,
1027            min: None,
1028            distinct_count: None,
1029            max_value: None,
1030            min_value: None,
1031            is_max_value_exact: None,
1032            is_min_value_exact: None,
1033        };
1034        let err = from_thrift_page_stats(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
1035        assert_eq!(
1036            err.to_string(),
1037            "Parquet error: Statistics null count is negative -42"
1038        );
1039    }
1040
1041    /// Writes statistics to thrift and reads them back and ensures:
1042    /// - The statistics are the same
1043    /// - The statistics written to thrift are the same as the original statistics
1044    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
1045        let statistics = make_bool_stats(distinct_count, null_count);
1046
1047        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
1048        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
1049        assert_eq!(
1050            thrift_stats.distinct_count.map(|c| c as u64),
1051            distinct_count
1052        );
1053
1054        let round_tripped = from_thrift_page_stats(Type::BOOLEAN, Some(thrift_stats))
1055            .unwrap()
1056            .unwrap();
1057        // TODO: remove branch when we no longer support assuming null_count==None in the thrift
1058        // means null_count = Some(0)
1059        if null_count.is_none() {
1060            assert_ne!(round_tripped, statistics);
1061            assert!(round_tripped.null_count_opt().is_some());
1062            assert_eq!(round_tripped.null_count_opt(), Some(0));
1063            assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
1064            assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
1065            assert_eq!(
1066                round_tripped.distinct_count_opt(),
1067                statistics.distinct_count_opt()
1068            );
1069        } else {
1070            assert_eq!(round_tripped, statistics);
1071        }
1072    }
1073
1074    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
1075        let min = Some(true);
1076        let max = Some(false);
1077        let is_min_max_deprecated = false;
1078
1079        // test is about the counts, so we aren't really testing the min/max values
1080        Statistics::Boolean(ValueStatistics::new(
1081            min,
1082            max,
1083            distinct_count,
1084            null_count,
1085            is_min_max_deprecated,
1086        ))
1087    }
1088}