parquet/file/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains definitions for working with Parquet statistics.
19//!
20//! Though some common methods are available on enum, use pattern match to extract
21//! actual min and max values from statistics, see below:
22//!
23//! # Examples
24//! ```rust
25//! use parquet::file::statistics::Statistics;
26//!
27//! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true);
28//! assert_eq!(stats.null_count_opt(), Some(3));
29//! assert!(stats.is_min_max_deprecated());
30//! assert!(stats.min_is_exact());
31//! assert!(stats.max_is_exact());
32//!
33//! match stats {
34//!     Statistics::Int32(ref typed) => {
35//!         assert_eq!(typed.min_opt(), Some(&1));
36//!         assert_eq!(typed.max_opt(), Some(&10));
37//!     }
38//!     _ => {}
39//! }
40//! ```
41
42use std::fmt;
43
44use crate::format::Statistics as TStatistics;
45
46use crate::basic::Type;
47use crate::data_type::private::ParquetValueType;
48use crate::data_type::*;
49use crate::errors::{ParquetError, Result};
50use crate::util::bit_util::FromBytes;
51
52pub(crate) mod private {
53    use super::*;
54
55    pub trait MakeStatistics {
56        fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
57        where
58            Self: Sized;
59    }
60
61    macro_rules! gen_make_statistics {
62        ($value_ty:ty, $stat:ident) => {
63            impl MakeStatistics for $value_ty {
64                fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
65                where
66                    Self: Sized,
67                {
68                    Statistics::$stat(statistics)
69                }
70            }
71        };
72    }
73
74    gen_make_statistics!(bool, Boolean);
75    gen_make_statistics!(i32, Int32);
76    gen_make_statistics!(i64, Int64);
77    gen_make_statistics!(Int96, Int96);
78    gen_make_statistics!(f32, Float);
79    gen_make_statistics!(f64, Double);
80    gen_make_statistics!(ByteArray, ByteArray);
81    gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
82}
83
/// Generates a `Statistics` constructor.
///
/// `$ctor` is the name of the generated function, `$value` the type accepted
/// for both `min` and `max`, and `$variant` the `Statistics` variant that the
/// freshly built `ValueStatistics` is wrapped in.
macro_rules! statistics_new_func {
    ($ctor:ident, $value:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $ctor(
            min: $value,
            max: $value,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
105
// Forwards a zero-argument getter to the typed statistics held by whichever
// `Statistics` variant `$self` currently is.
macro_rules! statistics_enum_func {
    ($self:ident, $func:ident) => {{
        match &*$self {
            Statistics::Boolean(inner) => inner.$func(),
            Statistics::Int32(inner) => inner.$func(),
            Statistics::Int64(inner) => inner.$func(),
            Statistics::Int96(inner) => inner.$func(),
            Statistics::Float(inner) => inner.$func(),
            Statistics::Double(inner) => inner.$func(),
            Statistics::ByteArray(inner) => inner.$func(),
            Statistics::FixedLenByteArray(inner) => inner.$func(),
        }
    }};
}
121
122/// Converts Thrift definition into `Statistics`.
123pub fn from_thrift(
124    physical_type: Type,
125    thrift_stats: Option<TStatistics>,
126) -> Result<Option<Statistics>> {
127    Ok(match thrift_stats {
128        Some(stats) => {
129            // Number of nulls recorded, when it is not available, we just mark it as 0.
130            // TODO this should be `None` if there is no information about NULLS.
131            // see https://github.com/apache/arrow-rs/pull/6216/files
132            let null_count = stats.null_count.unwrap_or(0);
133
134            if null_count < 0 {
135                return Err(ParquetError::General(format!(
136                    "Statistics null count is negative {null_count}",
137                )));
138            }
139
140            // Generic null count.
141            let null_count = Some(null_count as u64);
142            // Generic distinct count (count of distinct values occurring)
143            let distinct_count = stats.distinct_count.map(|value| value as u64);
144            // Whether or not statistics use deprecated min/max fields.
145            let old_format = stats.min_value.is_none() && stats.max_value.is_none();
146            // Generic min value as bytes.
147            let min = if old_format {
148                stats.min
149            } else {
150                stats.min_value
151            };
152            // Generic max value as bytes.
153            let max = if old_format {
154                stats.max
155            } else {
156                stats.max_value
157            };
158
159            fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
160                if let Some(min) = min {
161                    if min.len() < len {
162                        return Err(ParquetError::General(
163                            "Insufficient bytes to parse min statistic".to_string(),
164                        ));
165                    }
166                }
167                if let Some(max) = max {
168                    if max.len() < len {
169                        return Err(ParquetError::General(
170                            "Insufficient bytes to parse max statistic".to_string(),
171                        ));
172                    }
173                }
174                Ok(())
175            }
176
177            match physical_type {
178                Type::BOOLEAN => check_len(&min, &max, 1),
179                Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
180                Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
181                Type::INT96 => check_len(&min, &max, 12),
182                _ => Ok(()),
183            }?;
184
185            // Values are encoded using PLAIN encoding definition, except that
186            // variable-length byte arrays do not include a length prefix.
187            //
188            // Instead of using actual decoder, we manually convert values.
189            let res = match physical_type {
190                Type::BOOLEAN => Statistics::boolean(
191                    min.map(|data| data[0] != 0),
192                    max.map(|data| data[0] != 0),
193                    distinct_count,
194                    null_count,
195                    old_format,
196                ),
197                Type::INT32 => Statistics::int32(
198                    min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
199                    max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
200                    distinct_count,
201                    null_count,
202                    old_format,
203                ),
204                Type::INT64 => Statistics::int64(
205                    min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
206                    max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
207                    distinct_count,
208                    null_count,
209                    old_format,
210                ),
211                Type::INT96 => {
212                    // INT96 statistics may not be correct, because comparison is signed
213                    // byte-wise, not actual timestamps. It is recommended to ignore
214                    // min/max statistics for INT96 columns.
215                    let min = if let Some(data) = min {
216                        assert_eq!(data.len(), 12);
217                        Some(Int96::try_from_le_slice(&data)?)
218                    } else {
219                        None
220                    };
221                    let max = if let Some(data) = max {
222                        assert_eq!(data.len(), 12);
223                        Some(Int96::try_from_le_slice(&data)?)
224                    } else {
225                        None
226                    };
227                    Statistics::int96(min, max, distinct_count, null_count, old_format)
228                }
229                Type::FLOAT => Statistics::float(
230                    min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
231                    max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
232                    distinct_count,
233                    null_count,
234                    old_format,
235                ),
236                Type::DOUBLE => Statistics::double(
237                    min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
238                    max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
239                    distinct_count,
240                    null_count,
241                    old_format,
242                ),
243                Type::BYTE_ARRAY => Statistics::ByteArray(
244                    ValueStatistics::new(
245                        min.map(ByteArray::from),
246                        max.map(ByteArray::from),
247                        distinct_count,
248                        null_count,
249                        old_format,
250                    )
251                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
252                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
253                ),
254                Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
255                    ValueStatistics::new(
256                        min.map(ByteArray::from).map(FixedLenByteArray::from),
257                        max.map(ByteArray::from).map(FixedLenByteArray::from),
258                        distinct_count,
259                        null_count,
260                        old_format,
261                    )
262                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
263                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
264                ),
265            };
266
267            Some(res)
268        }
269        None => None,
270    })
271}
272
273/// Convert Statistics into Thrift definition.
274pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
275    let stats = stats?;
276
277    // record null count if it can fit in i64
278    let null_count = stats
279        .null_count_opt()
280        .and_then(|value| i64::try_from(value).ok());
281
282    // record distinct count if it can fit in i64
283    let distinct_count = stats
284        .distinct_count_opt()
285        .and_then(|value| i64::try_from(value).ok());
286
287    let mut thrift_stats = TStatistics {
288        max: None,
289        min: None,
290        null_count,
291        distinct_count,
292        max_value: None,
293        min_value: None,
294        is_max_value_exact: None,
295        is_min_value_exact: None,
296    };
297
298    // Get min/max if set.
299    let (min, max, min_exact, max_exact) = (
300        stats.min_bytes_opt().map(|x| x.to_vec()),
301        stats.max_bytes_opt().map(|x| x.to_vec()),
302        Some(stats.min_is_exact()),
303        Some(stats.max_is_exact()),
304    );
305    if stats.is_min_max_backwards_compatible() {
306        // Copy to deprecated min, max values for compatibility with older readers
307        thrift_stats.min.clone_from(&min);
308        thrift_stats.max.clone_from(&max);
309    }
310
311    if !stats.is_min_max_deprecated() {
312        thrift_stats.min_value = min;
313        thrift_stats.max_value = max;
314    }
315
316    thrift_stats.is_min_value_exact = min_exact;
317    thrift_stats.is_max_value_exact = max_exact;
318
319    Some(thrift_stats)
320}
321
/// Strongly typed statistics for a column chunk within a row group.
///
/// This structure is a natively typed, in memory representation of the
/// [`Statistics`] structure in a parquet file footer. The statistics stored in
/// this structure can be used by query engines to skip decoding pages while
/// reading parquet data.
///
/// Page level statistics are stored separately, in [NativeIndex].
///
/// There is one variant per Parquet physical type, each wrapping a
/// [`ValueStatistics`] of the matching native value type. Match on the
/// variant to reach the typed min and max values.
///
/// [`Statistics`]: crate::format::Statistics
/// [NativeIndex]: crate::file::page_index::index::NativeIndex
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
    /// Statistics for Boolean column
    Boolean(ValueStatistics<bool>),
    /// Statistics for Int32 column
    Int32(ValueStatistics<i32>),
    /// Statistics for Int64 column
    Int64(ValueStatistics<i64>),
    /// Statistics for Int96 column
    Int96(ValueStatistics<Int96>),
    /// Statistics for Float column
    Float(ValueStatistics<f32>),
    /// Statistics for Double column
    Double(ValueStatistics<f64>),
    /// Statistics for ByteArray column
    ByteArray(ValueStatistics<ByteArray>),
    /// Statistics for FixedLenByteArray column
    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
352
353impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
354    fn from(t: ValueStatistics<T>) -> Self {
355        T::make_statistics(t)
356    }
357}
358
359impl Statistics {
360    /// Creates new statistics for a column type
361    pub fn new<T: ParquetValueType>(
362        min: Option<T>,
363        max: Option<T>,
364        distinct_count: Option<u64>,
365        null_count: Option<u64>,
366        is_deprecated: bool,
367    ) -> Self {
368        Self::from(ValueStatistics::new(
369            min,
370            max,
371            distinct_count,
372            null_count,
373            is_deprecated,
374        ))
375    }
376
377    statistics_new_func![boolean, Option<bool>, Boolean];
378
379    statistics_new_func![int32, Option<i32>, Int32];
380
381    statistics_new_func![int64, Option<i64>, Int64];
382
383    statistics_new_func![int96, Option<Int96>, Int96];
384
385    statistics_new_func![float, Option<f32>, Float];
386
387    statistics_new_func![double, Option<f64>, Double];
388
389    statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
390
391    statistics_new_func![
392        fixed_len_byte_array,
393        Option<FixedLenByteArray>,
394        FixedLenByteArray
395    ];
396
397    /// Returns `true` if statistics have old `min` and `max` fields set.
398    /// This means that the column order is likely to be undefined, which, for old files
399    /// could mean a signed sort order of values.
400    ///
401    /// Refer to [`ColumnOrder`](crate::basic::ColumnOrder) and
402    /// [`SortOrder`](crate::basic::SortOrder) for more information.
403    pub fn is_min_max_deprecated(&self) -> bool {
404        statistics_enum_func![self, is_min_max_deprecated]
405    }
406
407    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
408    /// using signed comparison. This resulted in an undefined ordering for unsigned
409    /// quantities, such as booleans and unsigned integers.
410    ///
411    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
412    /// which have a type-defined sort order.
413    ///
414    /// However, not all readers have been updated. For backwards compatibility, this method
415    /// returns `true` if the statistics within this have a signed sort order, that is
416    /// compatible with being stored in the deprecated `min` and `max` fields
417    pub fn is_min_max_backwards_compatible(&self) -> bool {
418        statistics_enum_func![self, is_min_max_backwards_compatible]
419    }
420
421    /// Returns optional value of number of distinct values occurring.
422    /// When it is `None`, the value should be ignored.
423    pub fn distinct_count_opt(&self) -> Option<u64> {
424        statistics_enum_func![self, distinct_count]
425    }
426
427    /// Returns number of null values for the column, if known.
428    /// Note that this includes all nulls when column is part of the complex type.
429    ///
430    /// Note this API returns Some(0) even if the null count was not present
431    /// in the statistics.
432    /// See <https://github.com/apache/arrow-rs/pull/6216/files>
433    pub fn null_count_opt(&self) -> Option<u64> {
434        statistics_enum_func![self, null_count_opt]
435    }
436
437    /// Returns `true` if the min value is set, and is an exact min value.
438    pub fn min_is_exact(&self) -> bool {
439        statistics_enum_func![self, min_is_exact]
440    }
441
442    /// Returns `true` if the max value is set, and is an exact max value.
443    pub fn max_is_exact(&self) -> bool {
444        statistics_enum_func![self, max_is_exact]
445    }
446
447    /// Returns slice of bytes that represent min value, if min value is known.
448    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
449        statistics_enum_func![self, min_bytes_opt]
450    }
451
452    /// Returns slice of bytes that represent max value, if max value is known.
453    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
454        statistics_enum_func![self, max_bytes_opt]
455    }
456
457    /// Returns physical type associated with statistics.
458    pub fn physical_type(&self) -> Type {
459        match self {
460            Statistics::Boolean(_) => Type::BOOLEAN,
461            Statistics::Int32(_) => Type::INT32,
462            Statistics::Int64(_) => Type::INT64,
463            Statistics::Int96(_) => Type::INT96,
464            Statistics::Float(_) => Type::FLOAT,
465            Statistics::Double(_) => Type::DOUBLE,
466            Statistics::ByteArray(_) => Type::BYTE_ARRAY,
467            Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
468        }
469    }
470}
471
472impl fmt::Display for Statistics {
473    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
474        match self {
475            Statistics::Boolean(typed) => write!(f, "{typed}"),
476            Statistics::Int32(typed) => write!(f, "{typed}"),
477            Statistics::Int64(typed) => write!(f, "{typed}"),
478            Statistics::Int96(typed) => write!(f, "{typed}"),
479            Statistics::Float(typed) => write!(f, "{typed}"),
480            Statistics::Double(typed) => write!(f, "{typed}"),
481            Statistics::ByteArray(typed) => write!(f, "{typed}"),
482            Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
483        }
484    }
485}
486
/// Typed implementation for [`Statistics`].
///
/// Alias for [`ValueStatistics`] over the value type `T::T` associated with
/// the [`DataType`] `T`.
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
489
/// Typed statistics for one column chunk
///
/// See [`Statistics`] for more details
///
/// All fields are private; values are supplied through
/// [`ValueStatistics::new`] and the `with_*` builder methods, and read back
/// through the accessor methods.
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    // Minimum value, `None` when not known
    min: Option<T>,
    // Maximum value, `None` when not known
    max: Option<T>,
    // Distinct count could be omitted in some cases
    distinct_count: Option<u64>,
    // Number of null values, `None` when not known
    null_count: Option<u64>,

    // Whether or not the min or max values are exact, or truncated.
    // `new` initializes each flag to whether the corresponding value is set.
    is_max_value_exact: bool,
    is_min_value_exact: bool,

    /// If `true` populate the deprecated `min` and `max` fields instead of
    /// `min_value` and `max_value`
    is_min_max_deprecated: bool,

    /// If `true` the statistics are compatible with the deprecated `min` and
    /// `max` fields. See [`ValueStatistics::is_min_max_backwards_compatible`]
    is_min_max_backwards_compatible: bool,
}
513
514impl<T: ParquetValueType> ValueStatistics<T> {
515    /// Creates new typed statistics.
516    pub fn new(
517        min: Option<T>,
518        max: Option<T>,
519        distinct_count: Option<u64>,
520        null_count: Option<u64>,
521        is_min_max_deprecated: bool,
522    ) -> Self {
523        Self {
524            is_max_value_exact: max.is_some(),
525            is_min_value_exact: min.is_some(),
526            min,
527            max,
528            distinct_count,
529            null_count,
530            is_min_max_deprecated,
531            is_min_max_backwards_compatible: is_min_max_deprecated,
532        }
533    }
534
535    /// Set whether the stored `min` field represents the exact
536    /// minimum, or just a bound on the minimum value.
537    ///
538    /// see [`Self::min_is_exact`]
539    pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
540        Self {
541            is_min_value_exact,
542            ..self
543        }
544    }
545
546    /// Set whether the stored `max` field represents the exact
547    /// maximum, or just a bound on the maximum value.
548    ///
549    /// see [`Self::max_is_exact`]
550    pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
551        Self {
552            is_max_value_exact,
553            ..self
554        }
555    }
556
557    /// Set whether to write the deprecated `min` and `max` fields
558    /// for compatibility with older parquet writers
559    ///
560    /// This should only be enabled if the field is signed,
561    /// see [`Self::is_min_max_backwards_compatible`]
562    pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self {
563        Self {
564            is_min_max_backwards_compatible: backwards_compatible,
565            ..self
566        }
567    }
568
569    /// Returns min value of the statistics, if known.
570    pub fn min_opt(&self) -> Option<&T> {
571        self.min.as_ref()
572    }
573
574    /// Returns max value of the statistics, if known.
575    pub fn max_opt(&self) -> Option<&T> {
576        self.max.as_ref()
577    }
578
579    /// Returns min value as bytes of the statistics, if min value is known.
580    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
581        self.min_opt().map(AsBytes::as_bytes)
582    }
583
584    /// Returns max value as bytes of the statistics, if max value is known.
585    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
586        self.max_opt().map(AsBytes::as_bytes)
587    }
588
589    /// Whether or not min and max values are set.
590    /// Normally both min/max values will be set to `Some(value)` or `None`.
591    pub(crate) fn _internal_has_min_max_set(&self) -> bool {
592        self.min.is_some() && self.max.is_some()
593    }
594
595    /// Whether or not max value is set, and is an exact value.
596    pub fn max_is_exact(&self) -> bool {
597        self.max.is_some() && self.is_max_value_exact
598    }
599
600    /// Whether or not min value is set, and is an exact value.
601    pub fn min_is_exact(&self) -> bool {
602        self.min.is_some() && self.is_min_value_exact
603    }
604
605    /// Returns optional value of number of distinct values occurring.
606    pub fn distinct_count(&self) -> Option<u64> {
607        self.distinct_count
608    }
609
610    /// Returns null count.
611    pub fn null_count_opt(&self) -> Option<u64> {
612        self.null_count
613    }
614
615    /// Returns `true` if statistics were created using old min/max fields.
616    fn is_min_max_deprecated(&self) -> bool {
617        self.is_min_max_deprecated
618    }
619
620    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
621    /// using signed comparison. This resulted in an undefined ordering for unsigned
622    /// quantities, such as booleans and unsigned integers.
623    ///
624    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
625    /// which have a type-defined sort order.
626    ///
627    /// However, not all readers have been updated. For backwards compatibility, this method
628    /// returns `true` if the statistics within this have a signed sort order, that is
629    /// compatible with being stored in the deprecated `min` and `max` fields
630    pub fn is_min_max_backwards_compatible(&self) -> bool {
631        self.is_min_max_backwards_compatible
632    }
633}
634
635impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
636    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
637        write!(f, "{{")?;
638        write!(f, "min: ")?;
639        match self.min {
640            Some(ref value) => write!(f, "{value}")?,
641            None => write!(f, "N/A")?,
642        }
643        write!(f, ", max: ")?;
644        match self.max {
645            Some(ref value) => write!(f, "{value}")?,
646            None => write!(f, "N/A")?,
647        }
648        write!(f, ", distinct_count: ")?;
649        match self.distinct_count {
650            Some(value) => write!(f, "{value}")?,
651            None => write!(f, "N/A")?,
652        }
653        write!(f, ", null_count: ")?;
654        match self.null_count {
655            Some(value) => write!(f, "{value}")?,
656            None => write!(f, "N/A")?,
657        }
658        write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
659        write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
660        write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
661        write!(f, "}}")
662    }
663}
664
665impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
666    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
667        write!(
668            f,
669            "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
670             min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
671            self.min,
672            self.max,
673            self.distinct_count,
674            self.null_count,
675            self.is_min_max_deprecated,
676            self.is_min_max_backwards_compatible,
677            self.is_max_value_exact,
678            self.is_min_value_exact
679        )
680    }
681}
682
683#[cfg(test)]
684mod tests {
685    use super::*;
686
687    #[test]
688    fn test_statistics_min_max_bytes() {
689        let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
690        assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes()));
691        assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes()));
692
693        let stats = Statistics::byte_array(
694            Some(ByteArray::from(vec![1, 2, 3])),
695            Some(ByteArray::from(vec![3, 4, 5])),
696            None,
697            Some(1),
698            true,
699        );
700        assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
701        assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
702    }
703
704    #[test]
705    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
706    fn test_statistics_negative_null_count() {
707        let thrift_stats = TStatistics {
708            max: None,
709            min: None,
710            null_count: Some(-10),
711            distinct_count: None,
712            max_value: None,
713            min_value: None,
714            is_max_value_exact: None,
715            is_min_value_exact: None,
716        };
717
718        from_thrift(Type::INT32, Some(thrift_stats)).unwrap();
719    }
720
721    #[test]
722    fn test_statistics_thrift_none() {
723        assert_eq!(from_thrift(Type::INT32, None).unwrap(), None);
724        assert_eq!(from_thrift(Type::BYTE_ARRAY, None).unwrap(), None);
725    }
726
727    #[test]
728    fn test_statistics_debug() {
729        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
730        assert_eq!(
731            format!("{stats:?}"),
732            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
733             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
734        );
735
736        let stats = Statistics::int32(None, None, None, Some(7), false);
737        assert_eq!(
738            format!("{stats:?}"),
739            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
740             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
741        )
742    }
743
744    #[test]
745    fn test_statistics_display() {
746        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
747        assert_eq!(
748            format!("{stats}"),
749            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
750        );
751
752        let stats = Statistics::int64(None, None, None, Some(7), false);
753        assert_eq!(
754            format!("{stats}"),
755            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
756             false, max_value_exact: false, min_value_exact: false}"
757        );
758
759        let stats = Statistics::int96(
760            Some(Int96::from(vec![1, 0, 0])),
761            Some(Int96::from(vec![2, 3, 4])),
762            None,
763            Some(3),
764            true,
765        );
766        assert_eq!(
767            format!("{stats}"),
768            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
769             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
770        );
771
772        let stats = Statistics::ByteArray(
773            ValueStatistics::new(
774                Some(ByteArray::from(vec![1u8])),
775                Some(ByteArray::from(vec![2u8])),
776                Some(5),
777                Some(7),
778                false,
779            )
780            .with_max_is_exact(false)
781            .with_min_is_exact(false),
782        );
783        assert_eq!(
784            format!("{stats}"),
785            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
786        );
787    }
788
789    #[test]
790    fn test_statistics_partial_eq() {
791        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);
792
793        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), true) == expected);
794        assert!(Statistics::int32(Some(11), Some(45), None, Some(11), true) != expected);
795        assert!(Statistics::int32(Some(12), Some(44), None, Some(11), true) != expected);
796        assert!(Statistics::int32(Some(12), Some(45), None, Some(23), true) != expected);
797        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), false) != expected);
798
799        assert!(
800            Statistics::int32(Some(12), Some(45), None, Some(11), false)
801                != Statistics::int64(Some(12), Some(45), None, Some(11), false)
802        );
803
804        assert!(
805            Statistics::boolean(Some(false), Some(true), None, None, true)
806                != Statistics::double(Some(1.2), Some(4.5), None, None, true)
807        );
808
809        assert!(
810            Statistics::byte_array(
811                Some(ByteArray::from(vec![1, 2, 3])),
812                Some(ByteArray::from(vec![1, 2, 3])),
813                None,
814                None,
815                true
816            ) != Statistics::fixed_len_byte_array(
817                Some(ByteArray::from(vec![1, 2, 3]).into()),
818                Some(ByteArray::from(vec![1, 2, 3]).into()),
819                None,
820                None,
821                true,
822            )
823        );
824
825        assert!(
826            Statistics::byte_array(
827                Some(ByteArray::from(vec![1, 2, 3])),
828                Some(ByteArray::from(vec![1, 2, 3])),
829                None,
830                None,
831                true,
832            ) != Statistics::ByteArray(
833                ValueStatistics::new(
834                    Some(ByteArray::from(vec![1, 2, 3])),
835                    Some(ByteArray::from(vec![1, 2, 3])),
836                    None,
837                    None,
838                    true,
839                )
840                .with_max_is_exact(false)
841            )
842        );
843
844        assert!(
845            Statistics::fixed_len_byte_array(
846                Some(FixedLenByteArray::from(vec![1, 2, 3])),
847                Some(FixedLenByteArray::from(vec![1, 2, 3])),
848                None,
849                None,
850                true,
851            ) != Statistics::FixedLenByteArray(
852                ValueStatistics::new(
853                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
854                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
855                    None,
856                    None,
857                    true,
858                )
859                .with_min_is_exact(false)
860            )
861        );
862    }
863
864    #[test]
865    fn test_statistics_from_thrift() {
866        // Helper method to check statistics conversion.
867        fn check_stats(stats: Statistics) {
868            let tpe = stats.physical_type();
869            let thrift_stats = to_thrift(Some(&stats));
870            assert_eq!(from_thrift(tpe, thrift_stats).unwrap(), Some(stats));
871        }
872
873        check_stats(Statistics::boolean(
874            Some(false),
875            Some(true),
876            None,
877            Some(7),
878            true,
879        ));
880        check_stats(Statistics::boolean(
881            Some(false),
882            Some(true),
883            None,
884            Some(7),
885            true,
886        ));
887        check_stats(Statistics::boolean(
888            Some(false),
889            Some(true),
890            None,
891            Some(0),
892            false,
893        ));
894        check_stats(Statistics::boolean(
895            Some(true),
896            Some(true),
897            None,
898            Some(7),
899            true,
900        ));
901        check_stats(Statistics::boolean(
902            Some(false),
903            Some(false),
904            None,
905            Some(7),
906            true,
907        ));
908        check_stats(Statistics::boolean(None, None, None, Some(7), true));
909
910        check_stats(Statistics::int32(
911            Some(-100),
912            Some(500),
913            None,
914            Some(7),
915            true,
916        ));
917        check_stats(Statistics::int32(
918            Some(-100),
919            Some(500),
920            None,
921            Some(0),
922            false,
923        ));
924        check_stats(Statistics::int32(None, None, None, Some(7), true));
925
926        check_stats(Statistics::int64(
927            Some(-100),
928            Some(200),
929            None,
930            Some(7),
931            true,
932        ));
933        check_stats(Statistics::int64(
934            Some(-100),
935            Some(200),
936            None,
937            Some(0),
938            false,
939        ));
940        check_stats(Statistics::int64(None, None, None, Some(7), true));
941
942        check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true));
943        check_stats(Statistics::float(
944            Some(1.2),
945            Some(3.4),
946            None,
947            Some(0),
948            false,
949        ));
950        check_stats(Statistics::float(None, None, None, Some(7), true));
951
952        check_stats(Statistics::double(
953            Some(1.2),
954            Some(3.4),
955            None,
956            Some(7),
957            true,
958        ));
959        check_stats(Statistics::double(
960            Some(1.2),
961            Some(3.4),
962            None,
963            Some(0),
964            false,
965        ));
966        check_stats(Statistics::double(None, None, None, Some(7), true));
967
968        check_stats(Statistics::byte_array(
969            Some(ByteArray::from(vec![1, 2, 3])),
970            Some(ByteArray::from(vec![3, 4, 5])),
971            None,
972            Some(7),
973            true,
974        ));
975        check_stats(Statistics::byte_array(None, None, None, Some(7), true));
976
977        check_stats(Statistics::fixed_len_byte_array(
978            Some(ByteArray::from(vec![1, 2, 3]).into()),
979            Some(ByteArray::from(vec![3, 4, 5]).into()),
980            None,
981            Some(7),
982            true,
983        ));
984        check_stats(Statistics::fixed_len_byte_array(
985            None,
986            None,
987            None,
988            Some(7),
989            true,
990        ));
991    }
992
993    #[test]
994    fn test_count_encoding() {
995        statistics_count_test(None, None);
996        statistics_count_test(Some(0), Some(0));
997        statistics_count_test(Some(100), Some(2000));
998        statistics_count_test(Some(1), None);
999        statistics_count_test(None, Some(1));
1000    }
1001
1002    #[test]
1003    fn test_count_encoding_distinct_too_large() {
1004        // statistics are stored using i64, so test trying to store larger values
1005        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
1006        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1007        assert_eq!(thrift_stats.distinct_count, None); // can't store u64 max --> null
1008        assert_eq!(thrift_stats.null_count, Some(100));
1009    }
1010
1011    #[test]
1012    fn test_count_encoding_null_too_large() {
1013        // statistics are stored using i64, so test trying to store larger values
1014        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
1015        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1016        assert_eq!(thrift_stats.distinct_count, Some(100));
1017        assert_eq!(thrift_stats.null_count, None); // can' store u64 max --> null
1018    }
1019
1020    #[test]
1021    fn test_count_decoding_null_invalid() {
1022        let tstatistics = TStatistics {
1023            null_count: Some(-42),
1024            ..Default::default()
1025        };
1026        let err = from_thrift(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
1027        assert_eq!(
1028            err.to_string(),
1029            "Parquet error: Statistics null count is negative -42"
1030        );
1031    }
1032
1033    /// Writes statistics to thrift and reads them back and ensures:
1034    /// - The statistics are the same
1035    /// - The statistics written to thrift are the same as the original statistics
1036    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
1037        let statistics = make_bool_stats(distinct_count, null_count);
1038
1039        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1040        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
1041        assert_eq!(
1042            thrift_stats.distinct_count.map(|c| c as u64),
1043            distinct_count
1044        );
1045
1046        let round_tripped = from_thrift(Type::BOOLEAN, Some(thrift_stats))
1047            .unwrap()
1048            .unwrap();
1049        // TODO: remove branch when we no longer support assuming null_count==None in the thrift
1050        // means null_count = Some(0)
1051        if null_count.is_none() {
1052            assert_ne!(round_tripped, statistics);
1053            assert!(round_tripped.null_count_opt().is_some());
1054            assert_eq!(round_tripped.null_count_opt(), Some(0));
1055            assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
1056            assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
1057            assert_eq!(
1058                round_tripped.distinct_count_opt(),
1059                statistics.distinct_count_opt()
1060            );
1061        } else {
1062            assert_eq!(round_tripped, statistics);
1063        }
1064    }
1065
1066    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
1067        let min = Some(true);
1068        let max = Some(false);
1069        let is_min_max_deprecated = false;
1070
1071        // test is about the counts, so we aren't really testing the min/max values
1072        Statistics::Boolean(ValueStatistics::new(
1073            min,
1074            max,
1075            distinct_count,
1076            null_count,
1077            is_min_max_deprecated,
1078        ))
1079    }
1080}