1use std::fmt;
43
44use crate::basic::Type;
45use crate::data_type::private::ParquetValueType;
46use crate::data_type::*;
47use crate::errors::{ParquetError, Result};
48use crate::file::metadata::thrift::PageStatistics;
49use crate::util::bit_util::FromBytes;
50
51pub(crate) mod private {
52 use super::*;
53
54 pub trait MakeStatistics {
55 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
56 where
57 Self: Sized;
58 }
59
60 macro_rules! gen_make_statistics {
61 ($value_ty:ty, $stat:ident) => {
62 impl MakeStatistics for $value_ty {
63 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
64 where
65 Self: Sized,
66 {
67 Statistics::$stat(statistics)
68 }
69 }
70 };
71 }
72
73 gen_make_statistics!(bool, Boolean);
74 gen_make_statistics!(i32, Int32);
75 gen_make_statistics!(i64, Int64);
76 gen_make_statistics!(Int96, Int96);
77 gen_make_statistics!(f32, Float);
78 gen_make_statistics!(f64, Double);
79 gen_make_statistics!(ByteArray, ByteArray);
80 gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
81}
82
// Generates a `Statistics` constructor named `$name` that builds typed value
// statistics from its arguments and wraps them in the `$variant` variant.
macro_rules! statistics_new_func {
    ($name:ident, $value:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $name(
            min: $value,
            max: $value,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
104
// Forwards a no-argument method call to the typed statistics stored in
// whichever `Statistics` variant `$stats` refers to.
macro_rules! statistics_enum_func {
    ($stats:ident, $method:ident) => {{
        match $stats {
            Statistics::Boolean(inner) => inner.$method(),
            Statistics::Int32(inner) => inner.$method(),
            Statistics::Int64(inner) => inner.$method(),
            Statistics::Int96(inner) => inner.$method(),
            Statistics::Float(inner) => inner.$method(),
            Statistics::Double(inner) => inner.$method(),
            Statistics::ByteArray(inner) => inner.$method(),
            Statistics::FixedLenByteArray(inner) => inner.$method(),
        }
    }};
}
120
121pub(crate) fn from_thrift_page_stats(
123 physical_type: Type,
124 thrift_stats: Option<PageStatistics>,
125) -> Result<Option<Statistics>> {
126 Ok(match thrift_stats {
127 Some(stats) => {
128 let null_count = stats.null_count.unwrap_or(0);
132
133 if null_count < 0 {
134 return Err(ParquetError::General(format!(
135 "Statistics null count is negative {null_count}",
136 )));
137 }
138
139 let null_count = Some(null_count as u64);
141 let distinct_count = stats.distinct_count.map(|value| value as u64);
143 let old_format = stats.min_value.is_none() && stats.max_value.is_none();
145 let min = if old_format {
147 stats.min
148 } else {
149 stats.min_value
150 };
151 let max = if old_format {
153 stats.max
154 } else {
155 stats.max_value
156 };
157
158 fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
159 if let Some(min) = min {
160 if min.len() < len {
161 return Err(ParquetError::General(
162 "Insufficient bytes to parse min statistic".to_string(),
163 ));
164 }
165 }
166 if let Some(max) = max {
167 if max.len() < len {
168 return Err(ParquetError::General(
169 "Insufficient bytes to parse max statistic".to_string(),
170 ));
171 }
172 }
173 Ok(())
174 }
175
176 match physical_type {
177 Type::BOOLEAN => check_len(&min, &max, 1),
178 Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
179 Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
180 Type::INT96 => check_len(&min, &max, 12),
181 _ => Ok(()),
182 }?;
183
184 let res = match physical_type {
189 Type::BOOLEAN => Statistics::boolean(
190 min.map(|data| data[0] != 0),
191 max.map(|data| data[0] != 0),
192 distinct_count,
193 null_count,
194 old_format,
195 ),
196 Type::INT32 => Statistics::int32(
197 min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
198 max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
199 distinct_count,
200 null_count,
201 old_format,
202 ),
203 Type::INT64 => Statistics::int64(
204 min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
205 max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
206 distinct_count,
207 null_count,
208 old_format,
209 ),
210 Type::INT96 => {
211 let min = if let Some(data) = min {
213 if data.len() != 12 {
214 return Err(ParquetError::General(
215 "Incorrect Int96 min statistics".to_string(),
216 ));
217 }
218 Some(Int96::try_from_le_slice(&data)?)
219 } else {
220 None
221 };
222 let max = if let Some(data) = max {
223 if data.len() != 12 {
224 return Err(ParquetError::General(
225 "Incorrect Int96 max statistics".to_string(),
226 ));
227 }
228 Some(Int96::try_from_le_slice(&data)?)
229 } else {
230 None
231 };
232 Statistics::int96(min, max, distinct_count, null_count, old_format)
233 }
234 Type::FLOAT => Statistics::float(
235 min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
236 max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
237 distinct_count,
238 null_count,
239 old_format,
240 ),
241 Type::DOUBLE => Statistics::double(
242 min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
243 max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
244 distinct_count,
245 null_count,
246 old_format,
247 ),
248 Type::BYTE_ARRAY => Statistics::ByteArray(
249 ValueStatistics::new(
250 min.map(ByteArray::from),
251 max.map(ByteArray::from),
252 distinct_count,
253 null_count,
254 old_format,
255 )
256 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
257 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
258 ),
259 Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
260 ValueStatistics::new(
261 min.map(ByteArray::from).map(FixedLenByteArray::from),
262 max.map(ByteArray::from).map(FixedLenByteArray::from),
263 distinct_count,
264 null_count,
265 old_format,
266 )
267 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
268 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
269 ),
270 };
271
272 Some(res)
273 }
274 None => None,
275 })
276}
277
278pub(crate) fn page_stats_to_thrift(stats: Option<&Statistics>) -> Option<PageStatistics> {
280 let stats = stats?;
281
282 let null_count = stats
284 .null_count_opt()
285 .and_then(|value| i64::try_from(value).ok());
286
287 let distinct_count = stats
289 .distinct_count_opt()
290 .and_then(|value| i64::try_from(value).ok());
291
292 let mut thrift_stats = PageStatistics {
293 max: None,
294 min: None,
295 null_count,
296 distinct_count,
297 max_value: None,
298 min_value: None,
299 is_max_value_exact: None,
300 is_min_value_exact: None,
301 };
302
303 let (min, max, min_exact, max_exact) = (
305 stats.min_bytes_opt().map(|x| x.to_vec()),
306 stats.max_bytes_opt().map(|x| x.to_vec()),
307 Some(stats.min_is_exact()),
308 Some(stats.max_is_exact()),
309 );
310 if stats.is_min_max_backwards_compatible() {
311 thrift_stats.min.clone_from(&min);
313 thrift_stats.max.clone_from(&max);
314 }
315
316 if !stats.is_min_max_deprecated() {
317 thrift_stats.min_value = min;
318 thrift_stats.max_value = max;
319 }
320
321 thrift_stats.is_min_value_exact = min_exact;
322 thrift_stats.is_max_value_exact = max_exact;
323
324 Some(thrift_stats)
325}
326
/// Typed statistics, with one variant per physical type.
///
/// Each variant wraps the [`ValueStatistics`] for the value type that the
/// physical type named by the variant is stored as.
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
    /// Statistics for `bool` values.
    Boolean(ValueStatistics<bool>),
    /// Statistics for `i32` values.
    Int32(ValueStatistics<i32>),
    /// Statistics for `i64` values.
    Int64(ValueStatistics<i64>),
    /// Statistics for [`Int96`] values.
    Int96(ValueStatistics<Int96>),
    /// Statistics for `f32` values.
    Float(ValueStatistics<f32>),
    /// Statistics for `f64` values.
    Double(ValueStatistics<f64>),
    /// Statistics for [`ByteArray`] values.
    ByteArray(ValueStatistics<ByteArray>),
    /// Statistics for [`FixedLenByteArray`] values.
    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
356
357impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
358 fn from(t: ValueStatistics<T>) -> Self {
359 T::make_statistics(t)
360 }
361}
362
363impl Statistics {
364 pub fn new<T: ParquetValueType>(
366 min: Option<T>,
367 max: Option<T>,
368 distinct_count: Option<u64>,
369 null_count: Option<u64>,
370 is_deprecated: bool,
371 ) -> Self {
372 Self::from(ValueStatistics::new(
373 min,
374 max,
375 distinct_count,
376 null_count,
377 is_deprecated,
378 ))
379 }
380
381 statistics_new_func![boolean, Option<bool>, Boolean];
382
383 statistics_new_func![int32, Option<i32>, Int32];
384
385 statistics_new_func![int64, Option<i64>, Int64];
386
387 statistics_new_func![int96, Option<Int96>, Int96];
388
389 statistics_new_func![float, Option<f32>, Float];
390
391 statistics_new_func![double, Option<f64>, Double];
392
393 statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
394
395 statistics_new_func![
396 fixed_len_byte_array,
397 Option<FixedLenByteArray>,
398 FixedLenByteArray
399 ];
400
401 pub fn is_min_max_deprecated(&self) -> bool {
408 statistics_enum_func![self, is_min_max_deprecated]
409 }
410
411 pub fn is_min_max_backwards_compatible(&self) -> bool {
422 statistics_enum_func![self, is_min_max_backwards_compatible]
423 }
424
425 pub fn distinct_count_opt(&self) -> Option<u64> {
428 statistics_enum_func![self, distinct_count]
429 }
430
431 pub fn null_count_opt(&self) -> Option<u64> {
438 statistics_enum_func![self, null_count_opt]
439 }
440
441 pub fn min_is_exact(&self) -> bool {
443 statistics_enum_func![self, min_is_exact]
444 }
445
446 pub fn max_is_exact(&self) -> bool {
448 statistics_enum_func![self, max_is_exact]
449 }
450
451 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
453 statistics_enum_func![self, min_bytes_opt]
454 }
455
456 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
458 statistics_enum_func![self, max_bytes_opt]
459 }
460
461 pub fn physical_type(&self) -> Type {
463 match self {
464 Statistics::Boolean(_) => Type::BOOLEAN,
465 Statistics::Int32(_) => Type::INT32,
466 Statistics::Int64(_) => Type::INT64,
467 Statistics::Int96(_) => Type::INT96,
468 Statistics::Float(_) => Type::FLOAT,
469 Statistics::Double(_) => Type::DOUBLE,
470 Statistics::ByteArray(_) => Type::BYTE_ARRAY,
471 Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
472 }
473 }
474}
475
476impl fmt::Display for Statistics {
477 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
478 match self {
479 Statistics::Boolean(typed) => write!(f, "{typed}"),
480 Statistics::Int32(typed) => write!(f, "{typed}"),
481 Statistics::Int64(typed) => write!(f, "{typed}"),
482 Statistics::Int96(typed) => write!(f, "{typed}"),
483 Statistics::Float(typed) => write!(f, "{typed}"),
484 Statistics::Double(typed) => write!(f, "{typed}"),
485 Statistics::ByteArray(typed) => write!(f, "{typed}"),
486 Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
487 }
488 }
489}
490
/// [`ValueStatistics`] over the value type associated with the data type `T`.
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
493
/// Statistics over values of a single type `T`.
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    /// Min value, if set.
    min: Option<T>,
    /// Max value, if set.
    max: Option<T>,
    /// Number of distinct values, if set.
    distinct_count: Option<u64>,
    /// Number of null values, if set.
    null_count: Option<u64>,

    /// Whether `max` is flagged as exact.
    is_max_value_exact: bool,
    /// Whether `min` is flagged as exact.
    is_min_value_exact: bool,

    /// Deprecated flag supplied at construction.
    is_min_max_deprecated: bool,

    /// Backwards-compatibility flag; defaults to the deprecated flag.
    is_min_max_backwards_compatible: bool,
}

impl<T> ValueStatistics<T> {
    /// Creates new typed statistics.
    ///
    /// Each bound is flagged as exact precisely when it is present, and the
    /// backwards-compatibility flag starts out equal to `is_min_max_deprecated`.
    pub fn new(
        min: Option<T>,
        max: Option<T>,
        distinct_count: Option<u64>,
        null_count: Option<u64>,
        is_min_max_deprecated: bool,
    ) -> Self {
        let is_min_value_exact = min.is_some();
        let is_max_value_exact = max.is_some();
        Self {
            min,
            max,
            distinct_count,
            null_count,
            is_max_value_exact,
            is_min_value_exact,
            is_min_max_deprecated,
            is_min_max_backwards_compatible: is_min_max_deprecated,
        }
    }

    /// Sets whether the min value is flagged as exact and returns the
    /// updated statistics.
    pub fn with_min_is_exact(mut self, is_min_value_exact: bool) -> Self {
        self.is_min_value_exact = is_min_value_exact;
        self
    }

    /// Sets whether the max value is flagged as exact and returns the
    /// updated statistics.
    pub fn with_max_is_exact(mut self, is_max_value_exact: bool) -> Self {
        self.is_max_value_exact = is_max_value_exact;
        self
    }

    /// Overrides the backwards-compatibility flag and returns the updated
    /// statistics.
    pub fn with_backwards_compatible_min_max(mut self, backwards_compatible: bool) -> Self {
        self.is_min_max_backwards_compatible = backwards_compatible;
        self
    }

    /// Returns a reference to the min value, if set.
    pub fn min_opt(&self) -> Option<&T> {
        self.min.as_ref()
    }

    /// Returns a reference to the max value, if set.
    pub fn max_opt(&self) -> Option<&T> {
        self.max.as_ref()
    }

    /// Returns `true` only when both the min and the max value are set.
    pub(crate) fn _internal_has_min_max_set(&self) -> bool {
        self.min.is_some() && self.max.is_some()
    }

    /// Returns `true` if the max value is set and flagged as exact.
    pub fn max_is_exact(&self) -> bool {
        self.is_max_value_exact && self.max.is_some()
    }

    /// Returns `true` if the min value is set and flagged as exact.
    pub fn min_is_exact(&self) -> bool {
        self.is_min_value_exact && self.min.is_some()
    }

    /// Returns the number of distinct values, if set.
    pub fn distinct_count(&self) -> Option<u64> {
        self.distinct_count
    }

    /// Returns the number of null values, if set.
    pub fn null_count_opt(&self) -> Option<u64> {
        self.null_count
    }

    /// Returns the deprecated flag supplied at construction.
    fn is_min_max_deprecated(&self) -> bool {
        self.is_min_max_deprecated
    }

    /// Returns the backwards-compatibility flag of the min/max values.
    pub fn is_min_max_backwards_compatible(&self) -> bool {
        self.is_min_max_backwards_compatible
    }
}
628
629impl<T: AsBytes> ValueStatistics<T> {
630 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
632 self.min_opt().map(AsBytes::as_bytes)
633 }
634
635 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
637 self.max_opt().map(AsBytes::as_bytes)
638 }
639}
640
641impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
642 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
643 write!(f, "{{")?;
644 write!(f, "min: ")?;
645 match self.min {
646 Some(ref value) => write!(f, "{value}")?,
647 None => write!(f, "N/A")?,
648 }
649 write!(f, ", max: ")?;
650 match self.max {
651 Some(ref value) => write!(f, "{value}")?,
652 None => write!(f, "N/A")?,
653 }
654 write!(f, ", distinct_count: ")?;
655 match self.distinct_count {
656 Some(value) => write!(f, "{value}")?,
657 None => write!(f, "N/A")?,
658 }
659 write!(f, ", null_count: ")?;
660 match self.null_count {
661 Some(value) => write!(f, "{value}")?,
662 None => write!(f, "N/A")?,
663 }
664 write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
665 write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
666 write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
667 write!(f, "}}")
668 }
669}
670
671impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
672 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
673 write!(
674 f,
675 "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
676 min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
677 self.min,
678 self.max,
679 self.distinct_count,
680 self.null_count,
681 self.is_min_max_deprecated,
682 self.is_min_max_backwards_compatible,
683 self.is_max_value_exact,
684 self.is_min_value_exact
685 )
686 }
687}
688
#[cfg(test)]
mod tests {
    use super::*;

    /// Thrift page statistics with every field unset.
    fn empty_thrift_stats() -> PageStatistics {
        PageStatistics {
            max: None,
            min: None,
            null_count: None,
            distinct_count: None,
            max_value: None,
            min_value: None,
            is_max_value_exact: None,
            is_min_value_exact: None,
        }
    }

    #[test]
    fn test_statistics_min_max_bytes() {
        let (min, max) = (-123_i32, 234_i32);
        let stats = Statistics::int32(Some(min), Some(max), None, Some(1), false);
        assert_eq!(stats.min_bytes_opt(), Some(min.as_bytes()));
        assert_eq!(stats.max_bytes_opt(), Some(max.as_bytes()));

        let stats = Statistics::byte_array(
            Some(ByteArray::from(vec![1, 2, 3])),
            Some(ByteArray::from(vec![3, 4, 5])),
            None,
            Some(1),
            true,
        );
        assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
        assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
    }

    #[test]
    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
    fn test_statistics_negative_null_count() {
        let thrift_stats = PageStatistics {
            null_count: Some(-10),
            ..empty_thrift_stats()
        };

        from_thrift_page_stats(Type::INT32, Some(thrift_stats)).unwrap();
    }

    #[test]
    fn test_statistics_thrift_none() {
        for physical_type in [Type::INT32, Type::BYTE_ARRAY] {
            assert_eq!(from_thrift_page_stats(physical_type, None).unwrap(), None);
        }
    }

    #[test]
    fn test_statistics_debug() {
        let cases = [
            (
                Statistics::int32(Some(1), Some(12), None, Some(12), true),
                "Int32({min: Some(1), max: Some(12), distinct_count: None, \
                 null_count: Some(12), min_max_deprecated: true, \
                 min_max_backwards_compatible: true, max_value_exact: true, \
                 min_value_exact: true})",
            ),
            (
                Statistics::int32(None, None, None, Some(7), false),
                "Int32({min: None, max: None, distinct_count: None, \
                 null_count: Some(7), min_max_deprecated: false, \
                 min_max_backwards_compatible: false, max_value_exact: false, \
                 min_value_exact: false})",
            ),
        ];

        for (stats, expected) in cases {
            assert_eq!(format!("{stats:?}"), expected);
        }
    }

    #[test]
    fn test_statistics_display() {
        let cases = vec![
            (
                Statistics::int32(Some(1), Some(12), None, Some(12), true),
                "{min: 1, max: 12, distinct_count: N/A, null_count: 12, \
                 min_max_deprecated: true, max_value_exact: true, min_value_exact: true}",
            ),
            (
                Statistics::int64(None, None, None, Some(7), false),
                "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, \
                 min_max_deprecated: false, max_value_exact: false, min_value_exact: false}",
            ),
            (
                Statistics::int96(
                    Some(Int96::from(vec![1, 0, 0])),
                    Some(Int96::from(vec![2, 3, 4])),
                    None,
                    Some(3),
                    true,
                ),
                "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
                 min_max_deprecated: true, max_value_exact: true, min_value_exact: true}",
            ),
            (
                Statistics::ByteArray(
                    ValueStatistics::new(
                        Some(ByteArray::from(vec![1u8])),
                        Some(ByteArray::from(vec![2u8])),
                        Some(5),
                        Some(7),
                        false,
                    )
                    .with_max_is_exact(false)
                    .with_min_is_exact(false),
                ),
                "{min: [1], max: [2], distinct_count: 5, null_count: 7, \
                 min_max_deprecated: false, max_value_exact: false, min_value_exact: false}",
            ),
        ];

        for (stats, expected) in cases {
            assert_eq!(stats.to_string(), expected);
        }
    }

    #[test]
    fn test_statistics_partial_eq() {
        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);

        // Identical arguments compare equal; changing any single one does not.
        assert_eq!(
            Statistics::int32(Some(12), Some(45), None, Some(11), true),
            expected
        );
        assert_ne!(
            Statistics::int32(Some(11), Some(45), None, Some(11), true),
            expected
        );
        assert_ne!(
            Statistics::int32(Some(12), Some(44), None, Some(11), true),
            expected
        );
        assert_ne!(
            Statistics::int32(Some(12), Some(45), None, Some(23), true),
            expected
        );
        assert_ne!(
            Statistics::int32(Some(12), Some(45), None, Some(11), false),
            expected
        );

        // Different variants never compare equal.
        assert_ne!(
            Statistics::int32(Some(12), Some(45), None, Some(11), false),
            Statistics::int64(Some(12), Some(45), None, Some(11), false)
        );
        assert_ne!(
            Statistics::boolean(Some(false), Some(true), None, None, true),
            Statistics::double(Some(1.2), Some(4.5), None, None, true)
        );

        let bytes = || ByteArray::from(vec![1u8, 2, 3]);
        let fixed = || FixedLenByteArray::from(vec![1u8, 2, 3]);

        assert_ne!(
            Statistics::byte_array(Some(bytes()), Some(bytes()), None, None, true),
            Statistics::fixed_len_byte_array(
                Some(bytes().into()),
                Some(bytes().into()),
                None,
                None,
                true,
            )
        );

        // The exactness flags take part in equality.
        assert_ne!(
            Statistics::byte_array(Some(bytes()), Some(bytes()), None, None, true),
            Statistics::ByteArray(
                ValueStatistics::new(Some(bytes()), Some(bytes()), None, None, true)
                    .with_max_is_exact(false)
            )
        );
        assert_ne!(
            Statistics::fixed_len_byte_array(Some(fixed()), Some(fixed()), None, None, true),
            Statistics::FixedLenByteArray(
                ValueStatistics::new(Some(fixed()), Some(fixed()), None, None, true)
                    .with_min_is_exact(false)
            )
        );
    }

    #[test]
    fn test_statistics_from_thrift() {
        let cases = vec![
            Statistics::boolean(Some(false), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(true), None, Some(0), false),
            Statistics::boolean(Some(true), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(false), None, Some(7), true),
            Statistics::boolean(None, None, None, Some(7), true),
            Statistics::int32(Some(-100), Some(500), None, Some(7), true),
            Statistics::int32(Some(-100), Some(500), None, Some(0), false),
            Statistics::int32(None, None, None, Some(7), true),
            Statistics::int64(Some(-100), Some(200), None, Some(7), true),
            Statistics::int64(Some(-100), Some(200), None, Some(0), false),
            Statistics::int64(None, None, None, Some(7), true),
            Statistics::float(Some(1.2), Some(3.4), None, Some(7), true),
            Statistics::float(Some(1.2), Some(3.4), None, Some(0), false),
            Statistics::float(None, None, None, Some(7), true),
            Statistics::double(Some(1.2), Some(3.4), None, Some(7), true),
            Statistics::double(Some(1.2), Some(3.4), None, Some(0), false),
            Statistics::double(None, None, None, Some(7), true),
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![3, 4, 5])),
                None,
                Some(7),
                true,
            ),
            Statistics::byte_array(None, None, None, Some(7), true),
            Statistics::fixed_len_byte_array(
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                Some(ByteArray::from(vec![3, 4, 5]).into()),
                None,
                Some(7),
                true,
            ),
            Statistics::fixed_len_byte_array(None, None, None, Some(7), true),
        ];

        // Every case must survive a trip through the Thrift representation unchanged.
        for stats in cases {
            let physical_type = stats.physical_type();
            let thrift_stats = page_stats_to_thrift(Some(&stats));
            let decoded = from_thrift_page_stats(physical_type, thrift_stats).unwrap();
            assert_eq!(decoded, Some(stats));
        }
    }

    #[test]
    fn test_count_encoding() {
        let cases = [
            (None, None),
            (Some(0), Some(0)),
            (Some(100), Some(2000)),
            (Some(1), None),
            (None, Some(1)),
        ];
        for (distinct_count, null_count) in cases {
            statistics_count_test(distinct_count, null_count);
        }
    }

    #[test]
    fn test_count_encoding_distinct_too_large() {
        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
        assert_eq!(thrift_stats.distinct_count, None);
        assert_eq!(thrift_stats.null_count, Some(100));
    }

    #[test]
    fn test_count_encoding_null_too_large() {
        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
        assert_eq!(thrift_stats.distinct_count, Some(100));
        assert_eq!(thrift_stats.null_count, None);
    }

    #[test]
    fn test_count_decoding_null_invalid() {
        let tstatistics = PageStatistics {
            null_count: Some(-42),
            ..empty_thrift_stats()
        };
        let err = from_thrift_page_stats(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Parquet error: Statistics null count is negative -42"
        );
    }

    /// Encodes boolean statistics with the given counts to Thrift, checks the
    /// encoded counts, and checks what comes back when decoding them again.
    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
        let statistics = make_bool_stats(distinct_count, null_count);

        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
        assert_eq!(thrift_stats.null_count.map(|count| count as u64), null_count);
        assert_eq!(
            thrift_stats.distinct_count.map(|count| count as u64),
            distinct_count
        );

        let round_tripped = from_thrift_page_stats(Type::BOOLEAN, Some(thrift_stats))
            .unwrap()
            .unwrap();

        if null_count.is_some() {
            assert_eq!(round_tripped, statistics);
            return;
        }

        // A missing null count is decoded as zero, so only the remaining
        // fields match the input.
        assert_ne!(round_tripped, statistics);
        assert!(round_tripped.null_count_opt().is_some());
        assert_eq!(round_tripped.null_count_opt(), Some(0));
        assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
        assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
        assert_eq!(
            round_tripped.distinct_count_opt(),
            statistics.distinct_count_opt()
        );
    }

    /// Builds non-deprecated boolean statistics with fixed min/max values and
    /// the given counts.
    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
        let typed = ValueStatistics::new(Some(true), Some(false), distinct_count, null_count, false);
        Statistics::Boolean(typed)
    }

    #[test]
    fn test_int96_invalid_statistics() {
        // Thirteen bytes: one more than an Int96 value occupies.
        let oversized = || -> Option<Vec<u8>> { Some((0..13).collect()) };

        let bad_min = PageStatistics {
            min: oversized(),
            null_count: Some(0),
            ..empty_thrift_stats()
        };
        let err = from_thrift_page_stats(Type::INT96, Some(bad_min)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Parquet error: Incorrect Int96 min statistics"
        );

        let bad_max = PageStatistics {
            max: oversized(),
            null_count: Some(0),
            ..empty_thrift_stats()
        };
        let err = from_thrift_page_stats(Type::INT96, Some(bad_max)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Parquet error: Incorrect Int96 max statistics"
        );
    }

    /// Formats the min value of any displayable typed statistics.
    fn generic_statistics_handler<T: std::fmt::Display>(stats: ValueStatistics<T>) -> String {
        stats
            .min_opt()
            .map_or_else(|| "min: NA".to_string(), |min| format!("min: {min}"))
    }

    #[test]
    fn test_generic_access() {
        let stats = Statistics::int32(Some(12), Some(45), None, Some(11), false);

        if let Statistics::Int32(typed) = stats {
            let stats_string = generic_statistics_handler(typed);
            assert_eq!(&stats_string, "min: 12");
        } else {
            unreachable!()
        }
    }
}