parquet/file/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains definitions for working with Parquet statistics.
19//!
20//! Though some common methods are available on enum, use pattern match to extract
21//! actual min and max values from statistics, see below:
22//!
23//! # Examples
24//! ```rust
25//! use parquet::file::statistics::Statistics;
26//!
27//! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true);
28//! assert_eq!(stats.null_count_opt(), Some(3));
29//! assert!(stats.is_min_max_deprecated());
30//! assert!(stats.min_is_exact());
31//! assert!(stats.max_is_exact());
32//!
33//! match stats {
34//!     Statistics::Int32(ref typed) => {
35//!         assert_eq!(typed.min_opt(), Some(&1));
36//!         assert_eq!(typed.max_opt(), Some(&10));
37//!     }
38//!     _ => {}
39//! }
40//! ```
41
42use std::fmt;
43
44use crate::format::Statistics as TStatistics;
45
46use crate::basic::Type;
47use crate::data_type::private::ParquetValueType;
48use crate::data_type::*;
49use crate::errors::{ParquetError, Result};
50use crate::util::bit_util::FromBytes;
51
52pub(crate) mod private {
53    use super::*;
54
55    pub trait MakeStatistics {
56        fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
57        where
58            Self: Sized;
59    }
60
61    macro_rules! gen_make_statistics {
62        ($value_ty:ty, $stat:ident) => {
63            impl MakeStatistics for $value_ty {
64                fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
65                where
66                    Self: Sized,
67                {
68                    Statistics::$stat(statistics)
69                }
70            }
71        };
72    }
73
74    gen_make_statistics!(bool, Boolean);
75    gen_make_statistics!(i32, Int32);
76    gen_make_statistics!(i64, Int64);
77    gen_make_statistics!(Int96, Int96);
78    gen_make_statistics!(f32, Float);
79    gen_make_statistics!(f64, Double);
80    gen_make_statistics!(ByteArray, ByteArray);
81    gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
82}
83
/// Macro to generate the typed constructor methods of `Statistics`.
///
/// `$ctor` is the name of the generated method, `$value` the type of its `min`
/// and `max` parameters, and `$variant` the `Statistics` variant it builds.
macro_rules! statistics_new_func {
    ($ctor:ident, $value:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $ctor(
            min: $value,
            max: $value,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
105
// Macro to generate getter functions for Statistics.
//
// Expands to a `match` that forwards the zero-argument method `$getter` to the
// typed statistics held by whichever variant `$this` currently is.
macro_rules! statistics_enum_func {
    ($this:ident, $getter:ident) => {{
        match &*$this {
            Statistics::Boolean(inner) => inner.$getter(),
            Statistics::Int32(inner) => inner.$getter(),
            Statistics::Int64(inner) => inner.$getter(),
            Statistics::Int96(inner) => inner.$getter(),
            Statistics::Float(inner) => inner.$getter(),
            Statistics::Double(inner) => inner.$getter(),
            Statistics::ByteArray(inner) => inner.$getter(),
            Statistics::FixedLenByteArray(inner) => inner.$getter(),
        }
    }};
}
121
122/// Converts Thrift definition into `Statistics`.
123pub fn from_thrift(
124    physical_type: Type,
125    thrift_stats: Option<TStatistics>,
126) -> Result<Option<Statistics>> {
127    Ok(match thrift_stats {
128        Some(stats) => {
129            // Number of nulls recorded, when it is not available, we just mark it as 0.
130            // TODO this should be `None` if there is no information about NULLS.
131            // see https://github.com/apache/arrow-rs/pull/6216/files
132            let null_count = stats.null_count.unwrap_or(0);
133
134            if null_count < 0 {
135                return Err(ParquetError::General(format!(
136                    "Statistics null count is negative {}",
137                    null_count
138                )));
139            }
140
141            // Generic null count.
142            let null_count = Some(null_count as u64);
143            // Generic distinct count (count of distinct values occurring)
144            let distinct_count = stats.distinct_count.map(|value| value as u64);
145            // Whether or not statistics use deprecated min/max fields.
146            let old_format = stats.min_value.is_none() && stats.max_value.is_none();
147            // Generic min value as bytes.
148            let min = if old_format {
149                stats.min
150            } else {
151                stats.min_value
152            };
153            // Generic max value as bytes.
154            let max = if old_format {
155                stats.max
156            } else {
157                stats.max_value
158            };
159
160            fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
161                if let Some(min) = min {
162                    if min.len() < len {
163                        return Err(ParquetError::General(
164                            "Insufficient bytes to parse min statistic".to_string(),
165                        ));
166                    }
167                }
168                if let Some(max) = max {
169                    if max.len() < len {
170                        return Err(ParquetError::General(
171                            "Insufficient bytes to parse max statistic".to_string(),
172                        ));
173                    }
174                }
175                Ok(())
176            }
177
178            match physical_type {
179                Type::BOOLEAN => check_len(&min, &max, 1),
180                Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
181                Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
182                Type::INT96 => check_len(&min, &max, 12),
183                _ => Ok(()),
184            }?;
185
186            // Values are encoded using PLAIN encoding definition, except that
187            // variable-length byte arrays do not include a length prefix.
188            //
189            // Instead of using actual decoder, we manually convert values.
190            let res = match physical_type {
191                Type::BOOLEAN => Statistics::boolean(
192                    min.map(|data| data[0] != 0),
193                    max.map(|data| data[0] != 0),
194                    distinct_count,
195                    null_count,
196                    old_format,
197                ),
198                Type::INT32 => Statistics::int32(
199                    min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
200                    max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
201                    distinct_count,
202                    null_count,
203                    old_format,
204                ),
205                Type::INT64 => Statistics::int64(
206                    min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
207                    max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
208                    distinct_count,
209                    null_count,
210                    old_format,
211                ),
212                Type::INT96 => {
213                    // INT96 statistics may not be correct, because comparison is signed
214                    // byte-wise, not actual timestamps. It is recommended to ignore
215                    // min/max statistics for INT96 columns.
216                    let min = if let Some(data) = min {
217                        assert_eq!(data.len(), 12);
218                        Some(Int96::try_from_le_slice(&data)?)
219                    } else {
220                        None
221                    };
222                    let max = if let Some(data) = max {
223                        assert_eq!(data.len(), 12);
224                        Some(Int96::try_from_le_slice(&data)?)
225                    } else {
226                        None
227                    };
228                    Statistics::int96(min, max, distinct_count, null_count, old_format)
229                }
230                Type::FLOAT => Statistics::float(
231                    min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
232                    max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
233                    distinct_count,
234                    null_count,
235                    old_format,
236                ),
237                Type::DOUBLE => Statistics::double(
238                    min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
239                    max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
240                    distinct_count,
241                    null_count,
242                    old_format,
243                ),
244                Type::BYTE_ARRAY => Statistics::ByteArray(
245                    ValueStatistics::new(
246                        min.map(ByteArray::from),
247                        max.map(ByteArray::from),
248                        distinct_count,
249                        null_count,
250                        old_format,
251                    )
252                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
253                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
254                ),
255                Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
256                    ValueStatistics::new(
257                        min.map(ByteArray::from).map(FixedLenByteArray::from),
258                        max.map(ByteArray::from).map(FixedLenByteArray::from),
259                        distinct_count,
260                        null_count,
261                        old_format,
262                    )
263                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
264                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
265                ),
266            };
267
268            Some(res)
269        }
270        None => None,
271    })
272}
273
274/// Convert Statistics into Thrift definition.
275pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
276    let stats = stats?;
277
278    // record null count if it can fit in i64
279    let null_count = stats
280        .null_count_opt()
281        .and_then(|value| i64::try_from(value).ok());
282
283    // record distinct count if it can fit in i64
284    let distinct_count = stats
285        .distinct_count_opt()
286        .and_then(|value| i64::try_from(value).ok());
287
288    let mut thrift_stats = TStatistics {
289        max: None,
290        min: None,
291        null_count,
292        distinct_count,
293        max_value: None,
294        min_value: None,
295        is_max_value_exact: None,
296        is_min_value_exact: None,
297    };
298
299    // Get min/max if set.
300    let (min, max, min_exact, max_exact) = (
301        stats.min_bytes_opt().map(|x| x.to_vec()),
302        stats.max_bytes_opt().map(|x| x.to_vec()),
303        Some(stats.min_is_exact()),
304        Some(stats.max_is_exact()),
305    );
306    if stats.is_min_max_backwards_compatible() {
307        // Copy to deprecated min, max values for compatibility with older readers
308        thrift_stats.min.clone_from(&min);
309        thrift_stats.max.clone_from(&max);
310    }
311
312    if !stats.is_min_max_deprecated() {
313        thrift_stats.min_value = min;
314        thrift_stats.max_value = max;
315    }
316
317    thrift_stats.is_min_value_exact = min_exact;
318    thrift_stats.is_max_value_exact = max_exact;
319
320    Some(thrift_stats)
321}
322
/// Strongly typed statistics for a column chunk within a row group.
///
/// This structure is a natively typed, in memory representation of the
/// [`Statistics`] structure in a parquet file footer. The statistics stored in
/// this structure can be used by query engines to skip decoding pages while
/// reading parquet data.
///
/// Each variant wraps the [`ValueStatistics`] for one physical type; use
/// [`Statistics::physical_type`] to find out which one, or pattern match to get
/// at the typed min and max values.
///
/// Page level statistics are stored separately, in [NativeIndex].
///
/// [`Statistics`]: crate::format::Statistics
/// [NativeIndex]: crate::file::page_index::index::NativeIndex
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
    /// Statistics for Boolean column
    Boolean(ValueStatistics<bool>),
    /// Statistics for Int32 column
    Int32(ValueStatistics<i32>),
    /// Statistics for Int64 column
    Int64(ValueStatistics<i64>),
    /// Statistics for Int96 column
    Int96(ValueStatistics<Int96>),
    /// Statistics for Float column
    Float(ValueStatistics<f32>),
    /// Statistics for Double column
    Double(ValueStatistics<f64>),
    /// Statistics for ByteArray column
    ByteArray(ValueStatistics<ByteArray>),
    /// Statistics for FixedLenByteArray column
    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
353
354impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
355    fn from(t: ValueStatistics<T>) -> Self {
356        T::make_statistics(t)
357    }
358}
359
360impl Statistics {
361    /// Creates new statistics for a column type
362    pub fn new<T: ParquetValueType>(
363        min: Option<T>,
364        max: Option<T>,
365        distinct_count: Option<u64>,
366        null_count: Option<u64>,
367        is_deprecated: bool,
368    ) -> Self {
369        Self::from(ValueStatistics::new(
370            min,
371            max,
372            distinct_count,
373            null_count,
374            is_deprecated,
375        ))
376    }
377
378    statistics_new_func![boolean, Option<bool>, Boolean];
379
380    statistics_new_func![int32, Option<i32>, Int32];
381
382    statistics_new_func![int64, Option<i64>, Int64];
383
384    statistics_new_func![int96, Option<Int96>, Int96];
385
386    statistics_new_func![float, Option<f32>, Float];
387
388    statistics_new_func![double, Option<f64>, Double];
389
390    statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
391
392    statistics_new_func![
393        fixed_len_byte_array,
394        Option<FixedLenByteArray>,
395        FixedLenByteArray
396    ];
397
398    /// Returns `true` if statistics have old `min` and `max` fields set.
399    /// This means that the column order is likely to be undefined, which, for old files
400    /// could mean a signed sort order of values.
401    ///
402    /// Refer to [`ColumnOrder`](crate::basic::ColumnOrder) and
403    /// [`SortOrder`](crate::basic::SortOrder) for more information.
404    pub fn is_min_max_deprecated(&self) -> bool {
405        statistics_enum_func![self, is_min_max_deprecated]
406    }
407
408    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
409    /// using signed comparison. This resulted in an undefined ordering for unsigned
410    /// quantities, such as booleans and unsigned integers.
411    ///
412    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
413    /// which have a type-defined sort order.
414    ///
415    /// However, not all readers have been updated. For backwards compatibility, this method
416    /// returns `true` if the statistics within this have a signed sort order, that is
417    /// compatible with being stored in the deprecated `min` and `max` fields
418    pub fn is_min_max_backwards_compatible(&self) -> bool {
419        statistics_enum_func![self, is_min_max_backwards_compatible]
420    }
421
422    /// Returns optional value of number of distinct values occurring.
423    /// When it is `None`, the value should be ignored.
424    #[deprecated(since = "53.0.0", note = "Use `distinct_count_opt` method instead")]
425    pub fn distinct_count(&self) -> Option<u64> {
426        self.distinct_count_opt()
427    }
428
429    /// Returns optional value of number of distinct values occurring.
430    /// When it is `None`, the value should be ignored.
431    pub fn distinct_count_opt(&self) -> Option<u64> {
432        statistics_enum_func![self, distinct_count]
433    }
434
435    /// Returns number of null values for the column.
436    /// Note that this includes all nulls when column is part of the complex type.
437    ///
438    /// Note this API returns 0 if the null count is not available.
439    #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
440    pub fn null_count(&self) -> u64 {
441        // 0 to remain consistent behavior prior to `null_count_opt`
442        self.null_count_opt().unwrap_or(0)
443    }
444
445    /// Returns `true` if statistics collected any null values, `false` otherwise.
446    #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
447    #[allow(deprecated)]
448    pub fn has_nulls(&self) -> bool {
449        self.null_count() > 0
450    }
451
452    /// Returns number of null values for the column, if known.
453    /// Note that this includes all nulls when column is part of the complex type.
454    ///
455    /// Note this API returns Some(0) even if the null count was not present
456    /// in the statistics.
457    /// See <https://github.com/apache/arrow-rs/pull/6216/files>
458    pub fn null_count_opt(&self) -> Option<u64> {
459        statistics_enum_func![self, null_count_opt]
460    }
461
462    /// Whether or not min and max values are set.
463    /// Normally both min/max values will be set to `Some(value)` or `None`.
464    #[deprecated(
465        since = "53.0.0",
466        note = "Use `min_bytes_opt` and `max_bytes_opt` methods instead"
467    )]
468    pub fn has_min_max_set(&self) -> bool {
469        statistics_enum_func![self, _internal_has_min_max_set]
470    }
471
472    /// Returns `true` if the min value is set, and is an exact min value.
473    pub fn min_is_exact(&self) -> bool {
474        statistics_enum_func![self, min_is_exact]
475    }
476
477    /// Returns `true` if the max value is set, and is an exact max value.
478    pub fn max_is_exact(&self) -> bool {
479        statistics_enum_func![self, max_is_exact]
480    }
481
482    /// Returns slice of bytes that represent min value, if min value is known.
483    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
484        statistics_enum_func![self, min_bytes_opt]
485    }
486
487    /// Returns slice of bytes that represent min value.
488    /// Panics if min value is not set.
489    #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
490    pub fn min_bytes(&self) -> &[u8] {
491        self.min_bytes_opt().unwrap()
492    }
493
494    /// Returns slice of bytes that represent max value, if max value is known.
495    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
496        statistics_enum_func![self, max_bytes_opt]
497    }
498
499    /// Returns slice of bytes that represent max value.
500    /// Panics if max value is not set.
501    #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
502    pub fn max_bytes(&self) -> &[u8] {
503        self.max_bytes_opt().unwrap()
504    }
505
506    /// Returns physical type associated with statistics.
507    pub fn physical_type(&self) -> Type {
508        match self {
509            Statistics::Boolean(_) => Type::BOOLEAN,
510            Statistics::Int32(_) => Type::INT32,
511            Statistics::Int64(_) => Type::INT64,
512            Statistics::Int96(_) => Type::INT96,
513            Statistics::Float(_) => Type::FLOAT,
514            Statistics::Double(_) => Type::DOUBLE,
515            Statistics::ByteArray(_) => Type::BYTE_ARRAY,
516            Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
517        }
518    }
519}
520
521impl fmt::Display for Statistics {
522    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
523        match self {
524            Statistics::Boolean(typed) => write!(f, "{typed}"),
525            Statistics::Int32(typed) => write!(f, "{typed}"),
526            Statistics::Int64(typed) => write!(f, "{typed}"),
527            Statistics::Int96(typed) => write!(f, "{typed}"),
528            Statistics::Float(typed) => write!(f, "{typed}"),
529            Statistics::Double(typed) => write!(f, "{typed}"),
530            Statistics::ByteArray(typed) => write!(f, "{typed}"),
531            Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
532        }
533    }
534}
535
/// Typed implementation for [`Statistics`].
///
/// Shorthand for the [`ValueStatistics`] of the value type `T::T` that belongs
/// to the [`DataType`] `T`.
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
538
/// Typed statistics for one column chunk
///
/// Holds the optional min/max values of type `T` together with the optional
/// distinct and null counts, and flags describing how the min/max values are
/// to be interpreted and written.
///
/// See [`Statistics`] for more details
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    // Minimum value, `None` if not known
    min: Option<T>,
    // Maximum value, `None` if not known
    max: Option<T>,
    // Distinct count could be omitted in some cases
    distinct_count: Option<u64>,
    // Number of null values, `None` if not known
    null_count: Option<u64>,

    // Whether or not the min or max values are exact, or truncated.
    is_max_value_exact: bool,
    is_min_value_exact: bool,

    /// If `true` populate the deprecated `min` and `max` fields instead of
    /// `min_value` and `max_value`
    is_min_max_deprecated: bool,

    /// If `true` the statistics are compatible with the deprecated `min` and
    /// `max` fields. See [`ValueStatistics::is_min_max_backwards_compatible`]
    is_min_max_backwards_compatible: bool,
}
562
563impl<T: ParquetValueType> ValueStatistics<T> {
564    /// Creates new typed statistics.
565    pub fn new(
566        min: Option<T>,
567        max: Option<T>,
568        distinct_count: Option<u64>,
569        null_count: Option<u64>,
570        is_min_max_deprecated: bool,
571    ) -> Self {
572        Self {
573            is_max_value_exact: max.is_some(),
574            is_min_value_exact: min.is_some(),
575            min,
576            max,
577            distinct_count,
578            null_count,
579            is_min_max_deprecated,
580            is_min_max_backwards_compatible: is_min_max_deprecated,
581        }
582    }
583
584    /// Set whether the stored `min` field represents the exact
585    /// minimum, or just a bound on the minimum value.
586    ///
587    /// see [`Self::min_is_exact`]
588    pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
589        Self {
590            is_min_value_exact,
591            ..self
592        }
593    }
594
595    /// Set whether the stored `max` field represents the exact
596    /// maximum, or just a bound on the maximum value.
597    ///
598    /// see [`Self::max_is_exact`]
599    pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
600        Self {
601            is_max_value_exact,
602            ..self
603        }
604    }
605
606    /// Set whether to write the deprecated `min` and `max` fields
607    /// for compatibility with older parquet writers
608    ///
609    /// This should only be enabled if the field is signed,
610    /// see [`Self::is_min_max_backwards_compatible`]
611    pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self {
612        Self {
613            is_min_max_backwards_compatible: backwards_compatible,
614            ..self
615        }
616    }
617
618    /// Returns min value of the statistics.
619    ///
620    /// Panics if min value is not set, e.g. all values are `null`.
621    /// Use `has_min_max_set` method to check that.
622    #[deprecated(since = "53.0.0", note = "Use `min_opt` instead")]
623    pub fn min(&self) -> &T {
624        self.min.as_ref().unwrap()
625    }
626
627    /// Returns min value of the statistics, if known.
628    pub fn min_opt(&self) -> Option<&T> {
629        self.min.as_ref()
630    }
631
632    /// Returns max value of the statistics.
633    ///
634    /// Panics if max value is not set, e.g. all values are `null`.
635    /// Use `has_min_max_set` method to check that.
636    #[deprecated(since = "53.0.0", note = "Use `max_opt` instead")]
637    pub fn max(&self) -> &T {
638        self.max.as_ref().unwrap()
639    }
640
641    /// Returns max value of the statistics, if known.
642    pub fn max_opt(&self) -> Option<&T> {
643        self.max.as_ref()
644    }
645
646    /// Returns min value as bytes of the statistics, if min value is known.
647    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
648        self.min_opt().map(AsBytes::as_bytes)
649    }
650
651    /// Returns min value as bytes of the statistics.
652    ///
653    /// Panics if min value is not set, use `has_min_max_set` method to check
654    /// if values are set.
655    #[deprecated(since = "53.0.0", note = "Use `min_bytes_opt` instead")]
656    pub fn min_bytes(&self) -> &[u8] {
657        self.min_bytes_opt().unwrap()
658    }
659
660    /// Returns max value as bytes of the statistics, if max value is known.
661    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
662        self.max_opt().map(AsBytes::as_bytes)
663    }
664
665    /// Returns max value as bytes of the statistics.
666    ///
667    /// Panics if max value is not set, use `has_min_max_set` method to check
668    /// if values are set.
669    #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
670    pub fn max_bytes(&self) -> &[u8] {
671        self.max_bytes_opt().unwrap()
672    }
673
674    /// Whether or not min and max values are set.
675    /// Normally both min/max values will be set to `Some(value)` or `None`.
676    #[deprecated(since = "53.0.0", note = "Use `min_opt` and `max_opt` methods instead")]
677    pub fn has_min_max_set(&self) -> bool {
678        self._internal_has_min_max_set()
679    }
680
681    /// Whether or not min and max values are set.
682    /// Normally both min/max values will be set to `Some(value)` or `None`.
683    pub(crate) fn _internal_has_min_max_set(&self) -> bool {
684        self.min.is_some() && self.max.is_some()
685    }
686
687    /// Whether or not max value is set, and is an exact value.
688    pub fn max_is_exact(&self) -> bool {
689        self.max.is_some() && self.is_max_value_exact
690    }
691
692    /// Whether or not min value is set, and is an exact value.
693    pub fn min_is_exact(&self) -> bool {
694        self.min.is_some() && self.is_min_value_exact
695    }
696
697    /// Returns optional value of number of distinct values occurring.
698    pub fn distinct_count(&self) -> Option<u64> {
699        self.distinct_count
700    }
701
702    /// Returns number of null values for the column.
703    /// Note that this includes all nulls when column is part of the complex type.
704    #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
705    pub fn null_count(&self) -> u64 {
706        // 0 to remain consistent behavior prior to `null_count_opt`
707        self.null_count_opt().unwrap_or(0)
708    }
709
710    /// Returns null count.
711    pub fn null_count_opt(&self) -> Option<u64> {
712        self.null_count
713    }
714
715    /// Returns `true` if statistics were created using old min/max fields.
716    fn is_min_max_deprecated(&self) -> bool {
717        self.is_min_max_deprecated
718    }
719
720    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
721    /// using signed comparison. This resulted in an undefined ordering for unsigned
722    /// quantities, such as booleans and unsigned integers.
723    ///
724    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
725    /// which have a type-defined sort order.
726    ///
727    /// However, not all readers have been updated. For backwards compatibility, this method
728    /// returns `true` if the statistics within this have a signed sort order, that is
729    /// compatible with being stored in the deprecated `min` and `max` fields
730    pub fn is_min_max_backwards_compatible(&self) -> bool {
731        self.is_min_max_backwards_compatible
732    }
733}
734
735impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
736    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
737        write!(f, "{{")?;
738        write!(f, "min: ")?;
739        match self.min {
740            Some(ref value) => write!(f, "{value}")?,
741            None => write!(f, "N/A")?,
742        }
743        write!(f, ", max: ")?;
744        match self.max {
745            Some(ref value) => write!(f, "{value}")?,
746            None => write!(f, "N/A")?,
747        }
748        write!(f, ", distinct_count: ")?;
749        match self.distinct_count {
750            Some(value) => write!(f, "{value}")?,
751            None => write!(f, "N/A")?,
752        }
753        write!(f, ", null_count: ")?;
754        match self.null_count {
755            Some(value) => write!(f, "{value}")?,
756            None => write!(f, "N/A")?,
757        }
758        write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
759        write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
760        write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
761        write!(f, "}}")
762    }
763}
764
765impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
766    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
767        write!(
768            f,
769            "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
770             min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
771            self.min,
772            self.max,
773            self.distinct_count,
774            self.null_count,
775            self.is_min_max_deprecated,
776            self.is_min_max_backwards_compatible,
777            self.is_max_value_exact,
778            self.is_min_value_exact
779        )
780    }
781}
782
783#[cfg(test)]
784mod tests {
785    use super::*;
786
787    #[test]
788    fn test_statistics_min_max_bytes() {
789        let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
790        assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes()));
791        assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes()));
792
793        let stats = Statistics::byte_array(
794            Some(ByteArray::from(vec![1, 2, 3])),
795            Some(ByteArray::from(vec![3, 4, 5])),
796            None,
797            Some(1),
798            true,
799        );
800        assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
801        assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
802    }
803
804    #[test]
805    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
806    fn test_statistics_negative_null_count() {
807        let thrift_stats = TStatistics {
808            max: None,
809            min: None,
810            null_count: Some(-10),
811            distinct_count: None,
812            max_value: None,
813            min_value: None,
814            is_max_value_exact: None,
815            is_min_value_exact: None,
816        };
817
818        from_thrift(Type::INT32, Some(thrift_stats)).unwrap();
819    }
820
821    #[test]
822    fn test_statistics_thrift_none() {
823        assert_eq!(from_thrift(Type::INT32, None).unwrap(), None);
824        assert_eq!(from_thrift(Type::BYTE_ARRAY, None).unwrap(), None);
825    }
826
827    #[test]
828    fn test_statistics_debug() {
829        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
830        assert_eq!(
831            format!("{stats:?}"),
832            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
833             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
834        );
835
836        let stats = Statistics::int32(None, None, None, Some(7), false);
837        assert_eq!(
838            format!("{stats:?}"),
839            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
840             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
841        )
842    }
843
844    #[test]
845    fn test_statistics_display() {
846        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
847        assert_eq!(
848            format!("{stats}"),
849            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
850        );
851
852        let stats = Statistics::int64(None, None, None, Some(7), false);
853        assert_eq!(
854            format!("{stats}"),
855            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
856             false, max_value_exact: false, min_value_exact: false}"
857        );
858
859        let stats = Statistics::int96(
860            Some(Int96::from(vec![1, 0, 0])),
861            Some(Int96::from(vec![2, 3, 4])),
862            None,
863            Some(3),
864            true,
865        );
866        assert_eq!(
867            format!("{stats}"),
868            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
869             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
870        );
871
872        let stats = Statistics::ByteArray(
873            ValueStatistics::new(
874                Some(ByteArray::from(vec![1u8])),
875                Some(ByteArray::from(vec![2u8])),
876                Some(5),
877                Some(7),
878                false,
879            )
880            .with_max_is_exact(false)
881            .with_min_is_exact(false),
882        );
883        assert_eq!(
884            format!("{stats}"),
885            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
886        );
887    }
888
889    #[test]
890    fn test_statistics_partial_eq() {
891        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);
892
893        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), true) == expected);
894        assert!(Statistics::int32(Some(11), Some(45), None, Some(11), true) != expected);
895        assert!(Statistics::int32(Some(12), Some(44), None, Some(11), true) != expected);
896        assert!(Statistics::int32(Some(12), Some(45), None, Some(23), true) != expected);
897        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), false) != expected);
898
899        assert!(
900            Statistics::int32(Some(12), Some(45), None, Some(11), false)
901                != Statistics::int64(Some(12), Some(45), None, Some(11), false)
902        );
903
904        assert!(
905            Statistics::boolean(Some(false), Some(true), None, None, true)
906                != Statistics::double(Some(1.2), Some(4.5), None, None, true)
907        );
908
909        assert!(
910            Statistics::byte_array(
911                Some(ByteArray::from(vec![1, 2, 3])),
912                Some(ByteArray::from(vec![1, 2, 3])),
913                None,
914                None,
915                true
916            ) != Statistics::fixed_len_byte_array(
917                Some(ByteArray::from(vec![1, 2, 3]).into()),
918                Some(ByteArray::from(vec![1, 2, 3]).into()),
919                None,
920                None,
921                true,
922            )
923        );
924
925        assert!(
926            Statistics::byte_array(
927                Some(ByteArray::from(vec![1, 2, 3])),
928                Some(ByteArray::from(vec![1, 2, 3])),
929                None,
930                None,
931                true,
932            ) != Statistics::ByteArray(
933                ValueStatistics::new(
934                    Some(ByteArray::from(vec![1, 2, 3])),
935                    Some(ByteArray::from(vec![1, 2, 3])),
936                    None,
937                    None,
938                    true,
939                )
940                .with_max_is_exact(false)
941            )
942        );
943
944        assert!(
945            Statistics::fixed_len_byte_array(
946                Some(FixedLenByteArray::from(vec![1, 2, 3])),
947                Some(FixedLenByteArray::from(vec![1, 2, 3])),
948                None,
949                None,
950                true,
951            ) != Statistics::FixedLenByteArray(
952                ValueStatistics::new(
953                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
954                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
955                    None,
956                    None,
957                    true,
958                )
959                .with_min_is_exact(false)
960            )
961        );
962    }
963
964    #[test]
965    fn test_statistics_from_thrift() {
966        // Helper method to check statistics conversion.
967        fn check_stats(stats: Statistics) {
968            let tpe = stats.physical_type();
969            let thrift_stats = to_thrift(Some(&stats));
970            assert_eq!(from_thrift(tpe, thrift_stats).unwrap(), Some(stats));
971        }
972
973        check_stats(Statistics::boolean(
974            Some(false),
975            Some(true),
976            None,
977            Some(7),
978            true,
979        ));
980        check_stats(Statistics::boolean(
981            Some(false),
982            Some(true),
983            None,
984            Some(7),
985            true,
986        ));
987        check_stats(Statistics::boolean(
988            Some(false),
989            Some(true),
990            None,
991            Some(0),
992            false,
993        ));
994        check_stats(Statistics::boolean(
995            Some(true),
996            Some(true),
997            None,
998            Some(7),
999            true,
1000        ));
1001        check_stats(Statistics::boolean(
1002            Some(false),
1003            Some(false),
1004            None,
1005            Some(7),
1006            true,
1007        ));
1008        check_stats(Statistics::boolean(None, None, None, Some(7), true));
1009
1010        check_stats(Statistics::int32(
1011            Some(-100),
1012            Some(500),
1013            None,
1014            Some(7),
1015            true,
1016        ));
1017        check_stats(Statistics::int32(
1018            Some(-100),
1019            Some(500),
1020            None,
1021            Some(0),
1022            false,
1023        ));
1024        check_stats(Statistics::int32(None, None, None, Some(7), true));
1025
1026        check_stats(Statistics::int64(
1027            Some(-100),
1028            Some(200),
1029            None,
1030            Some(7),
1031            true,
1032        ));
1033        check_stats(Statistics::int64(
1034            Some(-100),
1035            Some(200),
1036            None,
1037            Some(0),
1038            false,
1039        ));
1040        check_stats(Statistics::int64(None, None, None, Some(7), true));
1041
1042        check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true));
1043        check_stats(Statistics::float(
1044            Some(1.2),
1045            Some(3.4),
1046            None,
1047            Some(0),
1048            false,
1049        ));
1050        check_stats(Statistics::float(None, None, None, Some(7), true));
1051
1052        check_stats(Statistics::double(
1053            Some(1.2),
1054            Some(3.4),
1055            None,
1056            Some(7),
1057            true,
1058        ));
1059        check_stats(Statistics::double(
1060            Some(1.2),
1061            Some(3.4),
1062            None,
1063            Some(0),
1064            false,
1065        ));
1066        check_stats(Statistics::double(None, None, None, Some(7), true));
1067
1068        check_stats(Statistics::byte_array(
1069            Some(ByteArray::from(vec![1, 2, 3])),
1070            Some(ByteArray::from(vec![3, 4, 5])),
1071            None,
1072            Some(7),
1073            true,
1074        ));
1075        check_stats(Statistics::byte_array(None, None, None, Some(7), true));
1076
1077        check_stats(Statistics::fixed_len_byte_array(
1078            Some(ByteArray::from(vec![1, 2, 3]).into()),
1079            Some(ByteArray::from(vec![3, 4, 5]).into()),
1080            None,
1081            Some(7),
1082            true,
1083        ));
1084        check_stats(Statistics::fixed_len_byte_array(
1085            None,
1086            None,
1087            None,
1088            Some(7),
1089            true,
1090        ));
1091    }
1092
1093    #[test]
1094    fn test_count_encoding() {
1095        statistics_count_test(None, None);
1096        statistics_count_test(Some(0), Some(0));
1097        statistics_count_test(Some(100), Some(2000));
1098        statistics_count_test(Some(1), None);
1099        statistics_count_test(None, Some(1));
1100    }
1101
1102    #[test]
1103    fn test_count_encoding_distinct_too_large() {
1104        // statistics are stored using i64, so test trying to store larger values
1105        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
1106        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1107        assert_eq!(thrift_stats.distinct_count, None); // can't store u64 max --> null
1108        assert_eq!(thrift_stats.null_count, Some(100));
1109    }
1110
1111    #[test]
1112    fn test_count_encoding_null_too_large() {
1113        // statistics are stored using i64, so test trying to store larger values
1114        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
1115        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1116        assert_eq!(thrift_stats.distinct_count, Some(100));
1117        assert_eq!(thrift_stats.null_count, None); // can' store u64 max --> null
1118    }
1119
1120    #[test]
1121    fn test_count_decoding_null_invalid() {
1122        let tstatistics = TStatistics {
1123            null_count: Some(-42),
1124            ..Default::default()
1125        };
1126        let err = from_thrift(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
1127        assert_eq!(
1128            err.to_string(),
1129            "Parquet error: Statistics null count is negative -42"
1130        );
1131    }
1132
1133    /// Writes statistics to thrift and reads them back and ensures:
1134    /// - The statistics are the same
1135    /// - The statistics written to thrift are the same as the original statistics
1136    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
1137        let statistics = make_bool_stats(distinct_count, null_count);
1138
1139        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1140        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
1141        assert_eq!(
1142            thrift_stats.distinct_count.map(|c| c as u64),
1143            distinct_count
1144        );
1145
1146        let round_tripped = from_thrift(Type::BOOLEAN, Some(thrift_stats))
1147            .unwrap()
1148            .unwrap();
1149        // TODO: remove branch when we no longer support assuming null_count==None in the thrift
1150        // means null_count = Some(0)
1151        if null_count.is_none() {
1152            assert_ne!(round_tripped, statistics);
1153            assert!(round_tripped.null_count_opt().is_some());
1154            assert_eq!(round_tripped.null_count_opt(), Some(0));
1155            assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
1156            assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
1157            assert_eq!(
1158                round_tripped.distinct_count_opt(),
1159                statistics.distinct_count_opt()
1160            );
1161        } else {
1162            assert_eq!(round_tripped, statistics);
1163        }
1164    }
1165
1166    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
1167        let min = Some(true);
1168        let max = Some(false);
1169        let is_min_max_deprecated = false;
1170
1171        // test is about the counts, so we aren't really testing the min/max values
1172        Statistics::Boolean(ValueStatistics::new(
1173            min,
1174            max,
1175            distinct_count,
1176            null_count,
1177            is_min_max_deprecated,
1178        ))
1179    }
1180}