parquet/file/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains definitions for working with Parquet statistics.
19//!
//! Though some common methods are available on the enum, use pattern matching to extract the
//! actual min and max values from statistics, as shown below:
22//!
23//! # Examples
24//! ```rust
25//! use parquet::file::statistics::Statistics;
26//!
27//! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true);
28//! assert_eq!(stats.null_count_opt(), Some(3));
29//! assert!(stats.is_min_max_deprecated());
30//! assert!(stats.min_is_exact());
31//! assert!(stats.max_is_exact());
32//!
33//! match stats {
34//!     Statistics::Int32(ref typed) => {
35//!         assert_eq!(typed.min_opt(), Some(&1));
36//!         assert_eq!(typed.max_opt(), Some(&10));
37//!     }
38//!     _ => {}
39//! }
40//! ```
41
42use std::fmt;
43
44use crate::format::Statistics as TStatistics;
45
46use crate::basic::Type;
47use crate::data_type::private::ParquetValueType;
48use crate::data_type::*;
49use crate::errors::{ParquetError, Result};
50use crate::util::bit_util::FromBytes;
51
52pub(crate) mod private {
53    use super::*;
54
55    pub trait MakeStatistics {
56        fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
57        where
58            Self: Sized;
59    }
60
61    macro_rules! gen_make_statistics {
62        ($value_ty:ty, $stat:ident) => {
63            impl MakeStatistics for $value_ty {
64                fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
65                where
66                    Self: Sized,
67                {
68                    Statistics::$stat(statistics)
69                }
70            }
71        };
72    }
73
74    gen_make_statistics!(bool, Boolean);
75    gen_make_statistics!(i32, Int32);
76    gen_make_statistics!(i64, Int64);
77    gen_make_statistics!(Int96, Int96);
78    gen_make_statistics!(f32, Float);
79    gen_make_statistics!(f64, Double);
80    gen_make_statistics!(ByteArray, ByteArray);
81    gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
82}
83
/// Generates a typed constructor on [`Statistics`].
///
/// `$func` is the name of the generated method, `$vtype` the type of its
/// `min`/`max` arguments and `$stat` the enum variant that is produced.
macro_rules! statistics_new_func {
    ($func:ident, $vtype:ty, $stat:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($stat), "` column type.")]
        pub fn $func(
            min: $vtype,
            max: $vtype,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$stat(typed)
        }
    };
}
105
// Forwards a zero-argument getter `$func` to the typed statistics held by
// whichever `Statistics` variant `$self` currently is.
macro_rules! statistics_enum_func {
    ($self:ident, $func:ident) => {{
        match &*$self {
            Statistics::Boolean(inner) => inner.$func(),
            Statistics::Int32(inner) => inner.$func(),
            Statistics::Int64(inner) => inner.$func(),
            Statistics::Int96(inner) => inner.$func(),
            Statistics::Float(inner) => inner.$func(),
            Statistics::Double(inner) => inner.$func(),
            Statistics::ByteArray(inner) => inner.$func(),
            Statistics::FixedLenByteArray(inner) => inner.$func(),
        }
    }};
}
121
122/// Converts Thrift definition into `Statistics`.
123pub fn from_thrift(
124    physical_type: Type,
125    thrift_stats: Option<TStatistics>,
126) -> Result<Option<Statistics>> {
127    Ok(match thrift_stats {
128        Some(stats) => {
129            // Number of nulls recorded, when it is not available, we just mark it as 0.
130            // TODO this should be `None` if there is no information about NULLS.
131            // see https://github.com/apache/arrow-rs/pull/6216/files
132            let null_count = stats.null_count.unwrap_or(0);
133
134            if null_count < 0 {
135                return Err(ParquetError::General(format!(
136                    "Statistics null count is negative {null_count}",
137                )));
138            }
139
140            // Generic null count.
141            let null_count = Some(null_count as u64);
142            // Generic distinct count (count of distinct values occurring)
143            let distinct_count = stats.distinct_count.map(|value| value as u64);
144            // Whether or not statistics use deprecated min/max fields.
145            let old_format = stats.min_value.is_none() && stats.max_value.is_none();
146            // Generic min value as bytes.
147            let min = if old_format {
148                stats.min
149            } else {
150                stats.min_value
151            };
152            // Generic max value as bytes.
153            let max = if old_format {
154                stats.max
155            } else {
156                stats.max_value
157            };
158
159            fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
160                if let Some(min) = min {
161                    if min.len() < len {
162                        return Err(ParquetError::General(
163                            "Insufficient bytes to parse min statistic".to_string(),
164                        ));
165                    }
166                }
167                if let Some(max) = max {
168                    if max.len() < len {
169                        return Err(ParquetError::General(
170                            "Insufficient bytes to parse max statistic".to_string(),
171                        ));
172                    }
173                }
174                Ok(())
175            }
176
177            match physical_type {
178                Type::BOOLEAN => check_len(&min, &max, 1),
179                Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
180                Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
181                Type::INT96 => check_len(&min, &max, 12),
182                _ => Ok(()),
183            }?;
184
185            // Values are encoded using PLAIN encoding definition, except that
186            // variable-length byte arrays do not include a length prefix.
187            //
188            // Instead of using actual decoder, we manually convert values.
189            let res = match physical_type {
190                Type::BOOLEAN => Statistics::boolean(
191                    min.map(|data| data[0] != 0),
192                    max.map(|data| data[0] != 0),
193                    distinct_count,
194                    null_count,
195                    old_format,
196                ),
197                Type::INT32 => Statistics::int32(
198                    min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
199                    max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
200                    distinct_count,
201                    null_count,
202                    old_format,
203                ),
204                Type::INT64 => Statistics::int64(
205                    min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
206                    max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
207                    distinct_count,
208                    null_count,
209                    old_format,
210                ),
211                Type::INT96 => {
212                    // INT96 statistics may not be correct, because comparison is signed
213                    let min = if let Some(data) = min {
214                        assert_eq!(data.len(), 12);
215                        Some(Int96::try_from_le_slice(&data)?)
216                    } else {
217                        None
218                    };
219                    let max = if let Some(data) = max {
220                        assert_eq!(data.len(), 12);
221                        Some(Int96::try_from_le_slice(&data)?)
222                    } else {
223                        None
224                    };
225                    Statistics::int96(min, max, distinct_count, null_count, old_format)
226                }
227                Type::FLOAT => Statistics::float(
228                    min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
229                    max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
230                    distinct_count,
231                    null_count,
232                    old_format,
233                ),
234                Type::DOUBLE => Statistics::double(
235                    min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
236                    max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
237                    distinct_count,
238                    null_count,
239                    old_format,
240                ),
241                Type::BYTE_ARRAY => Statistics::ByteArray(
242                    ValueStatistics::new(
243                        min.map(ByteArray::from),
244                        max.map(ByteArray::from),
245                        distinct_count,
246                        null_count,
247                        old_format,
248                    )
249                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
250                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
251                ),
252                Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
253                    ValueStatistics::new(
254                        min.map(ByteArray::from).map(FixedLenByteArray::from),
255                        max.map(ByteArray::from).map(FixedLenByteArray::from),
256                        distinct_count,
257                        null_count,
258                        old_format,
259                    )
260                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
261                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
262                ),
263            };
264
265            Some(res)
266        }
267        None => None,
268    })
269}
270
271/// Convert Statistics into Thrift definition.
272pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
273    let stats = stats?;
274
275    // record null count if it can fit in i64
276    let null_count = stats
277        .null_count_opt()
278        .and_then(|value| i64::try_from(value).ok());
279
280    // record distinct count if it can fit in i64
281    let distinct_count = stats
282        .distinct_count_opt()
283        .and_then(|value| i64::try_from(value).ok());
284
285    let mut thrift_stats = TStatistics {
286        max: None,
287        min: None,
288        null_count,
289        distinct_count,
290        max_value: None,
291        min_value: None,
292        is_max_value_exact: None,
293        is_min_value_exact: None,
294    };
295
296    // Get min/max if set.
297    let (min, max, min_exact, max_exact) = (
298        stats.min_bytes_opt().map(|x| x.to_vec()),
299        stats.max_bytes_opt().map(|x| x.to_vec()),
300        Some(stats.min_is_exact()),
301        Some(stats.max_is_exact()),
302    );
303    if stats.is_min_max_backwards_compatible() {
304        // Copy to deprecated min, max values for compatibility with older readers
305        thrift_stats.min.clone_from(&min);
306        thrift_stats.max.clone_from(&max);
307    }
308
309    if !stats.is_min_max_deprecated() {
310        thrift_stats.min_value = min;
311        thrift_stats.max_value = max;
312    }
313
314    thrift_stats.is_min_value_exact = min_exact;
315    thrift_stats.is_max_value_exact = max_exact;
316
317    Some(thrift_stats)
318}
319
/// Strongly typed statistics for a column chunk within a row group.
///
/// This structure is a natively typed, in memory representation of the
/// [`Statistics`] structure in a parquet file footer. The statistics stored in
/// this structure can be used by query engines to skip decoding pages while
/// reading parquet data.
///
/// Each variant wraps a [`ValueStatistics`] over the native value type of one
/// physical column type. Match on the variant to reach the typed min and max
/// values.
///
/// Page level statistics are stored separately, in [NativeIndex].
///
/// [`Statistics`]: crate::format::Statistics
/// [NativeIndex]: crate::file::page_index::index::NativeIndex
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
    /// Statistics for a `BOOLEAN` column, with `bool` bounds.
    Boolean(ValueStatistics<bool>),
    /// Statistics for an `INT32` column, with `i32` bounds.
    Int32(ValueStatistics<i32>),
    /// Statistics for an `INT64` column, with `i64` bounds.
    Int64(ValueStatistics<i64>),
    /// Statistics for an `INT96` column, with [`Int96`] bounds.
    Int96(ValueStatistics<Int96>),
    /// Statistics for a `FLOAT` column, with `f32` bounds.
    Float(ValueStatistics<f32>),
    /// Statistics for a `DOUBLE` column, with `f64` bounds.
    Double(ValueStatistics<f64>),
    /// Statistics for a `BYTE_ARRAY` column, with [`ByteArray`] bounds.
    ByteArray(ValueStatistics<ByteArray>),
    /// Statistics for a `FIXED_LEN_BYTE_ARRAY` column, with
    /// [`FixedLenByteArray`] bounds.
    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
350
351impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
352    fn from(t: ValueStatistics<T>) -> Self {
353        T::make_statistics(t)
354    }
355}
356
357impl Statistics {
358    /// Creates new statistics for a column type
359    pub fn new<T: ParquetValueType>(
360        min: Option<T>,
361        max: Option<T>,
362        distinct_count: Option<u64>,
363        null_count: Option<u64>,
364        is_deprecated: bool,
365    ) -> Self {
366        Self::from(ValueStatistics::new(
367            min,
368            max,
369            distinct_count,
370            null_count,
371            is_deprecated,
372        ))
373    }
374
375    statistics_new_func![boolean, Option<bool>, Boolean];
376
377    statistics_new_func![int32, Option<i32>, Int32];
378
379    statistics_new_func![int64, Option<i64>, Int64];
380
381    statistics_new_func![int96, Option<Int96>, Int96];
382
383    statistics_new_func![float, Option<f32>, Float];
384
385    statistics_new_func![double, Option<f64>, Double];
386
387    statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
388
389    statistics_new_func![
390        fixed_len_byte_array,
391        Option<FixedLenByteArray>,
392        FixedLenByteArray
393    ];
394
395    /// Returns `true` if statistics have old `min` and `max` fields set.
396    /// This means that the column order is likely to be undefined, which, for old files
397    /// could mean a signed sort order of values.
398    ///
399    /// Refer to [`ColumnOrder`](crate::basic::ColumnOrder) and
400    /// [`SortOrder`](crate::basic::SortOrder) for more information.
401    pub fn is_min_max_deprecated(&self) -> bool {
402        statistics_enum_func![self, is_min_max_deprecated]
403    }
404
405    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
406    /// using signed comparison. This resulted in an undefined ordering for unsigned
407    /// quantities, such as booleans and unsigned integers.
408    ///
409    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
410    /// which have a type-defined sort order.
411    ///
412    /// However, not all readers have been updated. For backwards compatibility, this method
413    /// returns `true` if the statistics within this have a signed sort order, that is
414    /// compatible with being stored in the deprecated `min` and `max` fields
415    pub fn is_min_max_backwards_compatible(&self) -> bool {
416        statistics_enum_func![self, is_min_max_backwards_compatible]
417    }
418
419    /// Returns optional value of number of distinct values occurring.
420    /// When it is `None`, the value should be ignored.
421    pub fn distinct_count_opt(&self) -> Option<u64> {
422        statistics_enum_func![self, distinct_count]
423    }
424
425    /// Returns number of null values for the column, if known.
426    /// Note that this includes all nulls when column is part of the complex type.
427    ///
428    /// Note this API returns Some(0) even if the null count was not present
429    /// in the statistics.
430    /// See <https://github.com/apache/arrow-rs/pull/6216/files>
431    pub fn null_count_opt(&self) -> Option<u64> {
432        statistics_enum_func![self, null_count_opt]
433    }
434
435    /// Returns `true` if the min value is set, and is an exact min value.
436    pub fn min_is_exact(&self) -> bool {
437        statistics_enum_func![self, min_is_exact]
438    }
439
440    /// Returns `true` if the max value is set, and is an exact max value.
441    pub fn max_is_exact(&self) -> bool {
442        statistics_enum_func![self, max_is_exact]
443    }
444
445    /// Returns slice of bytes that represent min value, if min value is known.
446    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
447        statistics_enum_func![self, min_bytes_opt]
448    }
449
450    /// Returns slice of bytes that represent max value, if max value is known.
451    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
452        statistics_enum_func![self, max_bytes_opt]
453    }
454
455    /// Returns physical type associated with statistics.
456    pub fn physical_type(&self) -> Type {
457        match self {
458            Statistics::Boolean(_) => Type::BOOLEAN,
459            Statistics::Int32(_) => Type::INT32,
460            Statistics::Int64(_) => Type::INT64,
461            Statistics::Int96(_) => Type::INT96,
462            Statistics::Float(_) => Type::FLOAT,
463            Statistics::Double(_) => Type::DOUBLE,
464            Statistics::ByteArray(_) => Type::BYTE_ARRAY,
465            Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
466        }
467    }
468}
469
470impl fmt::Display for Statistics {
471    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
472        match self {
473            Statistics::Boolean(typed) => write!(f, "{typed}"),
474            Statistics::Int32(typed) => write!(f, "{typed}"),
475            Statistics::Int64(typed) => write!(f, "{typed}"),
476            Statistics::Int96(typed) => write!(f, "{typed}"),
477            Statistics::Float(typed) => write!(f, "{typed}"),
478            Statistics::Double(typed) => write!(f, "{typed}"),
479            Statistics::ByteArray(typed) => write!(f, "{typed}"),
480            Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
481        }
482    }
483}
484
/// Typed implementation for [`Statistics`].
///
/// Resolves to [`ValueStatistics`] over the associated value type `T::T` of
/// the [`DataType`] implementation `T`.
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
487
/// Typed statistics for one column chunk
///
/// Holds the optional min and max values of type `T`, the optional distinct
/// and null counts, and flags that describe how the min and max values are to
/// be interpreted and written.
///
/// See [`Statistics`] for more details
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    // Min value, `None` if not known
    min: Option<T>,
    // Max value, `None` if not known
    max: Option<T>,
    // Distinct count could be omitted in some cases
    distinct_count: Option<u64>,
    // Number of nulls, `None` if not known
    null_count: Option<u64>,

    // Whether or not the min or max values are exact, or truncated.
    // `ValueStatistics::new` initialises each flag to whether the
    // corresponding value is set.
    is_max_value_exact: bool,
    is_min_value_exact: bool,

    /// If `true` populate the deprecated `min` and `max` fields instead of
    /// `min_value` and `max_value`
    is_min_max_deprecated: bool,

    /// If `true` the statistics are compatible with the deprecated `min` and
    /// `max` fields. See [`ValueStatistics::is_min_max_backwards_compatible`]
    is_min_max_backwards_compatible: bool,
}
511
512impl<T: ParquetValueType> ValueStatistics<T> {
513    /// Creates new typed statistics.
514    pub fn new(
515        min: Option<T>,
516        max: Option<T>,
517        distinct_count: Option<u64>,
518        null_count: Option<u64>,
519        is_min_max_deprecated: bool,
520    ) -> Self {
521        Self {
522            is_max_value_exact: max.is_some(),
523            is_min_value_exact: min.is_some(),
524            min,
525            max,
526            distinct_count,
527            null_count,
528            is_min_max_deprecated,
529            is_min_max_backwards_compatible: is_min_max_deprecated,
530        }
531    }
532
533    /// Set whether the stored `min` field represents the exact
534    /// minimum, or just a bound on the minimum value.
535    ///
536    /// see [`Self::min_is_exact`]
537    pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
538        Self {
539            is_min_value_exact,
540            ..self
541        }
542    }
543
544    /// Set whether the stored `max` field represents the exact
545    /// maximum, or just a bound on the maximum value.
546    ///
547    /// see [`Self::max_is_exact`]
548    pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
549        Self {
550            is_max_value_exact,
551            ..self
552        }
553    }
554
555    /// Set whether to write the deprecated `min` and `max` fields
556    /// for compatibility with older parquet writers
557    ///
558    /// This should only be enabled if the field is signed,
559    /// see [`Self::is_min_max_backwards_compatible`]
560    pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self {
561        Self {
562            is_min_max_backwards_compatible: backwards_compatible,
563            ..self
564        }
565    }
566
567    /// Returns min value of the statistics, if known.
568    pub fn min_opt(&self) -> Option<&T> {
569        self.min.as_ref()
570    }
571
572    /// Returns max value of the statistics, if known.
573    pub fn max_opt(&self) -> Option<&T> {
574        self.max.as_ref()
575    }
576
577    /// Returns min value as bytes of the statistics, if min value is known.
578    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
579        self.min_opt().map(AsBytes::as_bytes)
580    }
581
582    /// Returns max value as bytes of the statistics, if max value is known.
583    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
584        self.max_opt().map(AsBytes::as_bytes)
585    }
586
587    /// Whether or not min and max values are set.
588    /// Normally both min/max values will be set to `Some(value)` or `None`.
589    pub(crate) fn _internal_has_min_max_set(&self) -> bool {
590        self.min.is_some() && self.max.is_some()
591    }
592
593    /// Whether or not max value is set, and is an exact value.
594    pub fn max_is_exact(&self) -> bool {
595        self.max.is_some() && self.is_max_value_exact
596    }
597
598    /// Whether or not min value is set, and is an exact value.
599    pub fn min_is_exact(&self) -> bool {
600        self.min.is_some() && self.is_min_value_exact
601    }
602
603    /// Returns optional value of number of distinct values occurring.
604    pub fn distinct_count(&self) -> Option<u64> {
605        self.distinct_count
606    }
607
608    /// Returns null count.
609    pub fn null_count_opt(&self) -> Option<u64> {
610        self.null_count
611    }
612
613    /// Returns `true` if statistics were created using old min/max fields.
614    fn is_min_max_deprecated(&self) -> bool {
615        self.is_min_max_deprecated
616    }
617
618    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
619    /// using signed comparison. This resulted in an undefined ordering for unsigned
620    /// quantities, such as booleans and unsigned integers.
621    ///
622    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
623    /// which have a type-defined sort order.
624    ///
625    /// However, not all readers have been updated. For backwards compatibility, this method
626    /// returns `true` if the statistics within this have a signed sort order, that is
627    /// compatible with being stored in the deprecated `min` and `max` fields
628    pub fn is_min_max_backwards_compatible(&self) -> bool {
629        self.is_min_max_backwards_compatible
630    }
631}
632
633impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
634    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
635        write!(f, "{{")?;
636        write!(f, "min: ")?;
637        match self.min {
638            Some(ref value) => write!(f, "{value}")?,
639            None => write!(f, "N/A")?,
640        }
641        write!(f, ", max: ")?;
642        match self.max {
643            Some(ref value) => write!(f, "{value}")?,
644            None => write!(f, "N/A")?,
645        }
646        write!(f, ", distinct_count: ")?;
647        match self.distinct_count {
648            Some(value) => write!(f, "{value}")?,
649            None => write!(f, "N/A")?,
650        }
651        write!(f, ", null_count: ")?;
652        match self.null_count {
653            Some(value) => write!(f, "{value}")?,
654            None => write!(f, "N/A")?,
655        }
656        write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
657        write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
658        write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
659        write!(f, "}}")
660    }
661}
662
663impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
664    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
665        write!(
666            f,
667            "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
668             min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
669            self.min,
670            self.max,
671            self.distinct_count,
672            self.null_count,
673            self.is_min_max_deprecated,
674            self.is_min_max_backwards_compatible,
675            self.is_max_value_exact,
676            self.is_min_value_exact
677        )
678    }
679}
680
681#[cfg(test)]
682mod tests {
683    use super::*;
684
685    #[test]
686    fn test_statistics_min_max_bytes() {
687        let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
688        assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes()));
689        assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes()));
690
691        let stats = Statistics::byte_array(
692            Some(ByteArray::from(vec![1, 2, 3])),
693            Some(ByteArray::from(vec![3, 4, 5])),
694            None,
695            Some(1),
696            true,
697        );
698        assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
699        assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
700    }
701
702    #[test]
703    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
704    fn test_statistics_negative_null_count() {
705        let thrift_stats = TStatistics {
706            max: None,
707            min: None,
708            null_count: Some(-10),
709            distinct_count: None,
710            max_value: None,
711            min_value: None,
712            is_max_value_exact: None,
713            is_min_value_exact: None,
714        };
715
716        from_thrift(Type::INT32, Some(thrift_stats)).unwrap();
717    }
718
719    #[test]
720    fn test_statistics_thrift_none() {
721        assert_eq!(from_thrift(Type::INT32, None).unwrap(), None);
722        assert_eq!(from_thrift(Type::BYTE_ARRAY, None).unwrap(), None);
723    }
724
725    #[test]
726    fn test_statistics_debug() {
727        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
728        assert_eq!(
729            format!("{stats:?}"),
730            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
731             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
732        );
733
734        let stats = Statistics::int32(None, None, None, Some(7), false);
735        assert_eq!(
736            format!("{stats:?}"),
737            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
738             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
739        )
740    }
741
742    #[test]
743    fn test_statistics_display() {
744        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
745        assert_eq!(
746            format!("{stats}"),
747            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
748        );
749
750        let stats = Statistics::int64(None, None, None, Some(7), false);
751        assert_eq!(
752            format!("{stats}"),
753            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
754             false, max_value_exact: false, min_value_exact: false}"
755        );
756
757        let stats = Statistics::int96(
758            Some(Int96::from(vec![1, 0, 0])),
759            Some(Int96::from(vec![2, 3, 4])),
760            None,
761            Some(3),
762            true,
763        );
764        assert_eq!(
765            format!("{stats}"),
766            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
767             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
768        );
769
770        let stats = Statistics::ByteArray(
771            ValueStatistics::new(
772                Some(ByteArray::from(vec![1u8])),
773                Some(ByteArray::from(vec![2u8])),
774                Some(5),
775                Some(7),
776                false,
777            )
778            .with_max_is_exact(false)
779            .with_min_is_exact(false),
780        );
781        assert_eq!(
782            format!("{stats}"),
783            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
784        );
785    }
786
787    #[test]
788    fn test_statistics_partial_eq() {
789        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);
790
791        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), true) == expected);
792        assert!(Statistics::int32(Some(11), Some(45), None, Some(11), true) != expected);
793        assert!(Statistics::int32(Some(12), Some(44), None, Some(11), true) != expected);
794        assert!(Statistics::int32(Some(12), Some(45), None, Some(23), true) != expected);
795        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), false) != expected);
796
797        assert!(
798            Statistics::int32(Some(12), Some(45), None, Some(11), false)
799                != Statistics::int64(Some(12), Some(45), None, Some(11), false)
800        );
801
802        assert!(
803            Statistics::boolean(Some(false), Some(true), None, None, true)
804                != Statistics::double(Some(1.2), Some(4.5), None, None, true)
805        );
806
807        assert!(
808            Statistics::byte_array(
809                Some(ByteArray::from(vec![1, 2, 3])),
810                Some(ByteArray::from(vec![1, 2, 3])),
811                None,
812                None,
813                true
814            ) != Statistics::fixed_len_byte_array(
815                Some(ByteArray::from(vec![1, 2, 3]).into()),
816                Some(ByteArray::from(vec![1, 2, 3]).into()),
817                None,
818                None,
819                true,
820            )
821        );
822
823        assert!(
824            Statistics::byte_array(
825                Some(ByteArray::from(vec![1, 2, 3])),
826                Some(ByteArray::from(vec![1, 2, 3])),
827                None,
828                None,
829                true,
830            ) != Statistics::ByteArray(
831                ValueStatistics::new(
832                    Some(ByteArray::from(vec![1, 2, 3])),
833                    Some(ByteArray::from(vec![1, 2, 3])),
834                    None,
835                    None,
836                    true,
837                )
838                .with_max_is_exact(false)
839            )
840        );
841
842        assert!(
843            Statistics::fixed_len_byte_array(
844                Some(FixedLenByteArray::from(vec![1, 2, 3])),
845                Some(FixedLenByteArray::from(vec![1, 2, 3])),
846                None,
847                None,
848                true,
849            ) != Statistics::FixedLenByteArray(
850                ValueStatistics::new(
851                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
852                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
853                    None,
854                    None,
855                    true,
856                )
857                .with_min_is_exact(false)
858            )
859        );
860    }
861
862    #[test]
863    fn test_statistics_from_thrift() {
864        // Helper method to check statistics conversion.
865        fn check_stats(stats: Statistics) {
866            let tpe = stats.physical_type();
867            let thrift_stats = to_thrift(Some(&stats));
868            assert_eq!(from_thrift(tpe, thrift_stats).unwrap(), Some(stats));
869        }
870
871        check_stats(Statistics::boolean(
872            Some(false),
873            Some(true),
874            None,
875            Some(7),
876            true,
877        ));
878        check_stats(Statistics::boolean(
879            Some(false),
880            Some(true),
881            None,
882            Some(7),
883            true,
884        ));
885        check_stats(Statistics::boolean(
886            Some(false),
887            Some(true),
888            None,
889            Some(0),
890            false,
891        ));
892        check_stats(Statistics::boolean(
893            Some(true),
894            Some(true),
895            None,
896            Some(7),
897            true,
898        ));
899        check_stats(Statistics::boolean(
900            Some(false),
901            Some(false),
902            None,
903            Some(7),
904            true,
905        ));
906        check_stats(Statistics::boolean(None, None, None, Some(7), true));
907
908        check_stats(Statistics::int32(
909            Some(-100),
910            Some(500),
911            None,
912            Some(7),
913            true,
914        ));
915        check_stats(Statistics::int32(
916            Some(-100),
917            Some(500),
918            None,
919            Some(0),
920            false,
921        ));
922        check_stats(Statistics::int32(None, None, None, Some(7), true));
923
924        check_stats(Statistics::int64(
925            Some(-100),
926            Some(200),
927            None,
928            Some(7),
929            true,
930        ));
931        check_stats(Statistics::int64(
932            Some(-100),
933            Some(200),
934            None,
935            Some(0),
936            false,
937        ));
938        check_stats(Statistics::int64(None, None, None, Some(7), true));
939
940        check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true));
941        check_stats(Statistics::float(
942            Some(1.2),
943            Some(3.4),
944            None,
945            Some(0),
946            false,
947        ));
948        check_stats(Statistics::float(None, None, None, Some(7), true));
949
950        check_stats(Statistics::double(
951            Some(1.2),
952            Some(3.4),
953            None,
954            Some(7),
955            true,
956        ));
957        check_stats(Statistics::double(
958            Some(1.2),
959            Some(3.4),
960            None,
961            Some(0),
962            false,
963        ));
964        check_stats(Statistics::double(None, None, None, Some(7), true));
965
966        check_stats(Statistics::byte_array(
967            Some(ByteArray::from(vec![1, 2, 3])),
968            Some(ByteArray::from(vec![3, 4, 5])),
969            None,
970            Some(7),
971            true,
972        ));
973        check_stats(Statistics::byte_array(None, None, None, Some(7), true));
974
975        check_stats(Statistics::fixed_len_byte_array(
976            Some(ByteArray::from(vec![1, 2, 3]).into()),
977            Some(ByteArray::from(vec![3, 4, 5]).into()),
978            None,
979            Some(7),
980            true,
981        ));
982        check_stats(Statistics::fixed_len_byte_array(
983            None,
984            None,
985            None,
986            Some(7),
987            true,
988        ));
989    }
990
991    #[test]
992    fn test_count_encoding() {
993        statistics_count_test(None, None);
994        statistics_count_test(Some(0), Some(0));
995        statistics_count_test(Some(100), Some(2000));
996        statistics_count_test(Some(1), None);
997        statistics_count_test(None, Some(1));
998    }
999
1000    #[test]
1001    fn test_count_encoding_distinct_too_large() {
1002        // statistics are stored using i64, so test trying to store larger values
1003        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
1004        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1005        assert_eq!(thrift_stats.distinct_count, None); // can't store u64 max --> null
1006        assert_eq!(thrift_stats.null_count, Some(100));
1007    }
1008
1009    #[test]
1010    fn test_count_encoding_null_too_large() {
1011        // statistics are stored using i64, so test trying to store larger values
1012        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
1013        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1014        assert_eq!(thrift_stats.distinct_count, Some(100));
1015        assert_eq!(thrift_stats.null_count, None); // can' store u64 max --> null
1016    }
1017
1018    #[test]
1019    fn test_count_decoding_null_invalid() {
1020        let tstatistics = TStatistics {
1021            null_count: Some(-42),
1022            ..Default::default()
1023        };
1024        let err = from_thrift(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
1025        assert_eq!(
1026            err.to_string(),
1027            "Parquet error: Statistics null count is negative -42"
1028        );
1029    }
1030
1031    /// Writes statistics to thrift and reads them back and ensures:
1032    /// - The statistics are the same
1033    /// - The statistics written to thrift are the same as the original statistics
1034    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
1035        let statistics = make_bool_stats(distinct_count, null_count);
1036
1037        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1038        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
1039        assert_eq!(
1040            thrift_stats.distinct_count.map(|c| c as u64),
1041            distinct_count
1042        );
1043
1044        let round_tripped = from_thrift(Type::BOOLEAN, Some(thrift_stats))
1045            .unwrap()
1046            .unwrap();
1047        // TODO: remove branch when we no longer support assuming null_count==None in the thrift
1048        // means null_count = Some(0)
1049        if null_count.is_none() {
1050            assert_ne!(round_tripped, statistics);
1051            assert!(round_tripped.null_count_opt().is_some());
1052            assert_eq!(round_tripped.null_count_opt(), Some(0));
1053            assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
1054            assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
1055            assert_eq!(
1056                round_tripped.distinct_count_opt(),
1057                statistics.distinct_count_opt()
1058            );
1059        } else {
1060            assert_eq!(round_tripped, statistics);
1061        }
1062    }
1063
1064    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
1065        let min = Some(true);
1066        let max = Some(false);
1067        let is_min_max_deprecated = false;
1068
1069        // test is about the counts, so we aren't really testing the min/max values
1070        Statistics::Boolean(ValueStatistics::new(
1071            min,
1072            max,
1073            distinct_count,
1074            null_count,
1075            is_min_max_deprecated,
1076        ))
1077    }
1078}