1use std::fmt;
43
44use crate::basic::Type;
45use crate::data_type::private::ParquetValueType;
46use crate::data_type::*;
47use crate::errors::{ParquetError, Result};
48use crate::file::metadata::thrift::PageStatistics;
49use crate::util::bit_util::FromBytes;
50
51pub(crate) mod private {
52 use super::*;
53
54 pub trait MakeStatistics {
55 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
56 where
57 Self: Sized;
58 }
59
60 macro_rules! gen_make_statistics {
61 ($value_ty:ty, $stat:ident) => {
62 impl MakeStatistics for $value_ty {
63 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
64 where
65 Self: Sized,
66 {
67 Statistics::$stat(statistics)
68 }
69 }
70 };
71 }
72
73 gen_make_statistics!(bool, Boolean);
74 gen_make_statistics!(i32, Int32);
75 gen_make_statistics!(i64, Int64);
76 gen_make_statistics!(Int96, Int96);
77 gen_make_statistics!(f32, Float);
78 gen_make_statistics!(f64, Double);
79 gen_make_statistics!(ByteArray, ByteArray);
80 gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
81}
82
// Generates a typed constructor on `Statistics`.
//
// Arguments: the constructor name, the type of the `min`/`max` parameters, and the
// `Statistics` variant that the constructor produces.
macro_rules! statistics_new_func {
    ($ctor:ident, $bound:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $ctor(
            min: $bound,
            max: $bound,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
104
// Forwards a zero-argument method call to the typed statistics held by whichever
// `Statistics` variant `$target` currently is.
macro_rules! statistics_enum_func {
    ($target:ident, $method:ident) => {{
        match $target {
            Statistics::Boolean(inner) => inner.$method(),
            Statistics::Int32(inner) => inner.$method(),
            Statistics::Int64(inner) => inner.$method(),
            Statistics::Int96(inner) => inner.$method(),
            Statistics::Float(inner) => inner.$method(),
            Statistics::Double(inner) => inner.$method(),
            Statistics::ByteArray(inner) => inner.$method(),
            Statistics::FixedLenByteArray(inner) => inner.$method(),
        }
    }};
}
120
121pub(crate) fn from_thrift_page_stats(
123 physical_type: Type,
124 thrift_stats: Option<PageStatistics>,
125) -> Result<Option<Statistics>> {
126 Ok(match thrift_stats {
127 Some(stats) => {
128 let null_count = stats
130 .null_count
131 .map(|null_count| {
132 if null_count < 0 {
133 return Err(ParquetError::General(format!(
134 "Statistics null count is negative {null_count}",
135 )));
136 }
137 Ok(null_count as u64)
138 })
139 .transpose()?;
140 let distinct_count = stats.distinct_count.map(|value| value as u64);
142 let old_format = stats.min_value.is_none() && stats.max_value.is_none();
144 let min = if old_format {
146 stats.min
147 } else {
148 stats.min_value
149 };
150 let max = if old_format {
152 stats.max
153 } else {
154 stats.max_value
155 };
156
157 fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
158 if let Some(min) = min {
159 if min.len() < len {
160 return Err(ParquetError::General(
161 "Insufficient bytes to parse min statistic".to_string(),
162 ));
163 }
164 }
165 if let Some(max) = max {
166 if max.len() < len {
167 return Err(ParquetError::General(
168 "Insufficient bytes to parse max statistic".to_string(),
169 ));
170 }
171 }
172 Ok(())
173 }
174
175 match physical_type {
176 Type::BOOLEAN => check_len(&min, &max, 1),
177 Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
178 Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
179 Type::INT96 => check_len(&min, &max, 12),
180 _ => Ok(()),
181 }?;
182
183 let res = match physical_type {
188 Type::BOOLEAN => Statistics::boolean(
189 min.map(|data| data[0] != 0),
190 max.map(|data| data[0] != 0),
191 distinct_count,
192 null_count,
193 old_format,
194 ),
195 Type::INT32 => Statistics::int32(
196 min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
197 max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
198 distinct_count,
199 null_count,
200 old_format,
201 ),
202 Type::INT64 => Statistics::int64(
203 min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
204 max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
205 distinct_count,
206 null_count,
207 old_format,
208 ),
209 Type::INT96 => {
210 let min = if let Some(data) = min {
212 if data.len() != 12 {
213 return Err(ParquetError::General(
214 "Incorrect Int96 min statistics".to_string(),
215 ));
216 }
217 Some(Int96::try_from_le_slice(&data)?)
218 } else {
219 None
220 };
221 let max = if let Some(data) = max {
222 if data.len() != 12 {
223 return Err(ParquetError::General(
224 "Incorrect Int96 max statistics".to_string(),
225 ));
226 }
227 Some(Int96::try_from_le_slice(&data)?)
228 } else {
229 None
230 };
231 Statistics::int96(min, max, distinct_count, null_count, old_format)
232 }
233 Type::FLOAT => Statistics::float(
234 min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
235 max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
236 distinct_count,
237 null_count,
238 old_format,
239 ),
240 Type::DOUBLE => Statistics::double(
241 min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
242 max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
243 distinct_count,
244 null_count,
245 old_format,
246 ),
247 Type::BYTE_ARRAY => Statistics::ByteArray(
248 ValueStatistics::new(
249 min.map(ByteArray::from),
250 max.map(ByteArray::from),
251 distinct_count,
252 null_count,
253 old_format,
254 )
255 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
256 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
257 ),
258 Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
259 ValueStatistics::new(
260 min.map(ByteArray::from).map(FixedLenByteArray::from),
261 max.map(ByteArray::from).map(FixedLenByteArray::from),
262 distinct_count,
263 null_count,
264 old_format,
265 )
266 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
267 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
268 ),
269 };
270
271 Some(res)
272 }
273 None => None,
274 })
275}
276
277pub(crate) fn page_stats_to_thrift(stats: Option<&Statistics>) -> Option<PageStatistics> {
279 let stats = stats?;
280
281 let null_count = stats
283 .null_count_opt()
284 .and_then(|value| i64::try_from(value).ok());
285
286 let distinct_count = stats
288 .distinct_count_opt()
289 .and_then(|value| i64::try_from(value).ok());
290
291 let mut thrift_stats = PageStatistics {
292 max: None,
293 min: None,
294 null_count,
295 distinct_count,
296 max_value: None,
297 min_value: None,
298 is_max_value_exact: None,
299 is_min_value_exact: None,
300 };
301
302 let (min, max, min_exact, max_exact) = (
304 stats.min_bytes_opt().map(|x| x.to_vec()),
305 stats.max_bytes_opt().map(|x| x.to_vec()),
306 Some(stats.min_is_exact()),
307 Some(stats.max_is_exact()),
308 );
309 if stats.is_min_max_backwards_compatible() {
310 thrift_stats.min.clone_from(&min);
312 thrift_stats.max.clone_from(&max);
313 }
314
315 if !stats.is_min_max_deprecated() {
316 thrift_stats.min_value = min;
317 thrift_stats.max_value = max;
318 }
319
320 thrift_stats.is_min_value_exact = min_exact;
321 thrift_stats.is_max_value_exact = max_exact;
322
323 Some(thrift_stats)
324}
325
/// Column statistics, with one variant per supported value type.
///
/// Each variant wraps the [`ValueStatistics`] for that value type.
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
    /// Statistics over `bool` values.
    Boolean(ValueStatistics<bool>),
    /// Statistics over `i32` values.
    Int32(ValueStatistics<i32>),
    /// Statistics over `i64` values.
    Int64(ValueStatistics<i64>),
    /// Statistics over [`Int96`] values.
    Int96(ValueStatistics<Int96>),
    /// Statistics over `f32` values.
    Float(ValueStatistics<f32>),
    /// Statistics over `f64` values.
    Double(ValueStatistics<f64>),
    /// Statistics over [`ByteArray`] values.
    ByteArray(ValueStatistics<ByteArray>),
    /// Statistics over [`FixedLenByteArray`] values.
    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
355
356impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
357 fn from(t: ValueStatistics<T>) -> Self {
358 T::make_statistics(t)
359 }
360}
361
362impl Statistics {
363 pub fn new<T: ParquetValueType>(
365 min: Option<T>,
366 max: Option<T>,
367 distinct_count: Option<u64>,
368 null_count: Option<u64>,
369 is_deprecated: bool,
370 ) -> Self {
371 Self::from(ValueStatistics::new(
372 min,
373 max,
374 distinct_count,
375 null_count,
376 is_deprecated,
377 ))
378 }
379
380 statistics_new_func![boolean, Option<bool>, Boolean];
381
382 statistics_new_func![int32, Option<i32>, Int32];
383
384 statistics_new_func![int64, Option<i64>, Int64];
385
386 statistics_new_func![int96, Option<Int96>, Int96];
387
388 statistics_new_func![float, Option<f32>, Float];
389
390 statistics_new_func![double, Option<f64>, Double];
391
392 statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
393
394 statistics_new_func![
395 fixed_len_byte_array,
396 Option<FixedLenByteArray>,
397 FixedLenByteArray
398 ];
399
400 pub fn is_min_max_deprecated(&self) -> bool {
407 statistics_enum_func![self, is_min_max_deprecated]
408 }
409
410 pub fn is_min_max_backwards_compatible(&self) -> bool {
421 statistics_enum_func![self, is_min_max_backwards_compatible]
422 }
423
424 pub fn distinct_count_opt(&self) -> Option<u64> {
427 statistics_enum_func![self, distinct_count]
428 }
429
430 pub fn null_count_opt(&self) -> Option<u64> {
448 statistics_enum_func![self, null_count_opt]
449 }
450
451 pub fn min_is_exact(&self) -> bool {
453 statistics_enum_func![self, min_is_exact]
454 }
455
456 pub fn max_is_exact(&self) -> bool {
458 statistics_enum_func![self, max_is_exact]
459 }
460
461 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
463 statistics_enum_func![self, min_bytes_opt]
464 }
465
466 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
468 statistics_enum_func![self, max_bytes_opt]
469 }
470
471 pub fn physical_type(&self) -> Type {
473 match self {
474 Statistics::Boolean(_) => Type::BOOLEAN,
475 Statistics::Int32(_) => Type::INT32,
476 Statistics::Int64(_) => Type::INT64,
477 Statistics::Int96(_) => Type::INT96,
478 Statistics::Float(_) => Type::FLOAT,
479 Statistics::Double(_) => Type::DOUBLE,
480 Statistics::ByteArray(_) => Type::BYTE_ARRAY,
481 Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
482 }
483 }
484}
485
486impl fmt::Display for Statistics {
487 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
488 match self {
489 Statistics::Boolean(typed) => write!(f, "{typed}"),
490 Statistics::Int32(typed) => write!(f, "{typed}"),
491 Statistics::Int64(typed) => write!(f, "{typed}"),
492 Statistics::Int96(typed) => write!(f, "{typed}"),
493 Statistics::Float(typed) => write!(f, "{typed}"),
494 Statistics::Double(typed) => write!(f, "{typed}"),
495 Statistics::ByteArray(typed) => write!(f, "{typed}"),
496 Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
497 }
498 }
499}
500
/// [`ValueStatistics`] over the value type (`T::T`) associated with the [`DataType`] `T`.
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
503
/// Statistics over values of a single type `T`.
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    /// Minimum value, if recorded.
    min: Option<T>,
    /// Maximum value, if recorded.
    max: Option<T>,
    /// Number of distinct values, if recorded.
    distinct_count: Option<u64>,
    /// Number of nulls, if recorded.
    null_count: Option<u64>,

    /// Whether `max` is flagged as exact.
    is_max_value_exact: bool,
    /// Whether `min` is flagged as exact.
    is_min_value_exact: bool,

    /// Whether min/max are flagged as deprecated.
    is_min_max_deprecated: bool,

    /// Whether min/max are flagged as backwards compatible.
    is_min_max_backwards_compatible: bool,
}

impl<T> ValueStatistics<T> {
    /// Creates new typed statistics.
    ///
    /// Each exactness flag starts out as `true` exactly when the corresponding bound is
    /// present, and the backwards-compatibility flag starts out equal to
    /// `is_min_max_deprecated`.
    pub fn new(
        min: Option<T>,
        max: Option<T>,
        distinct_count: Option<u64>,
        null_count: Option<u64>,
        is_min_max_deprecated: bool,
    ) -> Self {
        let is_min_value_exact = min.is_some();
        let is_max_value_exact = max.is_some();
        let is_min_max_backwards_compatible = is_min_max_deprecated;
        Self {
            min,
            max,
            distinct_count,
            null_count,
            is_max_value_exact,
            is_min_value_exact,
            is_min_max_deprecated,
            is_min_max_backwards_compatible,
        }
    }

    /// Sets whether the min value is flagged as exact and returns the updated statistics.
    pub fn with_min_is_exact(mut self, is_min_value_exact: bool) -> Self {
        self.is_min_value_exact = is_min_value_exact;
        self
    }

    /// Sets whether the max value is flagged as exact and returns the updated statistics.
    pub fn with_max_is_exact(mut self, is_max_value_exact: bool) -> Self {
        self.is_max_value_exact = is_max_value_exact;
        self
    }

    /// Sets the backwards-compatibility flag and returns the updated statistics.
    pub fn with_backwards_compatible_min_max(mut self, backwards_compatible: bool) -> Self {
        self.is_min_max_backwards_compatible = backwards_compatible;
        self
    }

    /// Returns a reference to the min value, if set.
    pub fn min_opt(&self) -> Option<&T> {
        self.min.as_ref()
    }

    /// Returns a reference to the max value, if set.
    pub fn max_opt(&self) -> Option<&T> {
        self.max.as_ref()
    }

    /// Returns `true` only when both the min and the max value are set.
    pub(crate) fn _internal_has_min_max_set(&self) -> bool {
        matches!((&self.min, &self.max), (Some(_), Some(_)))
    }

    /// Returns `true` if a max value is set and it is flagged as exact.
    pub fn max_is_exact(&self) -> bool {
        self.is_max_value_exact && self.max.is_some()
    }

    /// Returns `true` if a min value is set and it is flagged as exact.
    pub fn min_is_exact(&self) -> bool {
        self.is_min_value_exact && self.min.is_some()
    }

    /// Returns the distinct value count, if recorded.
    pub fn distinct_count(&self) -> Option<u64> {
        self.distinct_count
    }

    /// Returns the null count, if recorded.
    pub fn null_count_opt(&self) -> Option<u64> {
        self.null_count
    }

    /// Returns the "min/max deprecated" flag given at construction.
    fn is_min_max_deprecated(&self) -> bool {
        self.is_min_max_deprecated
    }

    /// Returns the "min/max backwards compatible" flag.
    pub fn is_min_max_backwards_compatible(&self) -> bool {
        self.is_min_max_backwards_compatible
    }
}
638
639impl<T: AsBytes> ValueStatistics<T> {
640 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
642 self.min_opt().map(AsBytes::as_bytes)
643 }
644
645 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
647 self.max_opt().map(AsBytes::as_bytes)
648 }
649}
650
651impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
652 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
653 write!(f, "{{")?;
654 write!(f, "min: ")?;
655 match self.min {
656 Some(ref value) => write!(f, "{value}")?,
657 None => write!(f, "N/A")?,
658 }
659 write!(f, ", max: ")?;
660 match self.max {
661 Some(ref value) => write!(f, "{value}")?,
662 None => write!(f, "N/A")?,
663 }
664 write!(f, ", distinct_count: ")?;
665 match self.distinct_count {
666 Some(value) => write!(f, "{value}")?,
667 None => write!(f, "N/A")?,
668 }
669 write!(f, ", null_count: ")?;
670 match self.null_count {
671 Some(value) => write!(f, "{value}")?,
672 None => write!(f, "N/A")?,
673 }
674 write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
675 write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
676 write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
677 write!(f, "}}")
678 }
679}
680
681impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
682 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
683 write!(
684 f,
685 "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
686 min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
687 self.min,
688 self.max,
689 self.distinct_count,
690 self.null_count,
691 self.is_min_max_deprecated,
692 self.is_min_max_backwards_compatible,
693 self.is_max_value_exact,
694 self.is_min_value_exact
695 )
696 }
697}
698
#[cfg(test)]
mod tests {
    use super::*;

    /// Thrift page statistics with every optional field left unset.
    fn unset_page_stats() -> PageStatistics {
        PageStatistics {
            max: None,
            min: None,
            null_count: None,
            distinct_count: None,
            max_value: None,
            min_value: None,
            is_max_value_exact: None,
            is_min_value_exact: None,
        }
    }

    /// Boolean statistics (min `true`, max `false`, not deprecated) with the given counts.
    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
        let typed = ValueStatistics::new(Some(true), Some(false), distinct_count, null_count, false);
        Statistics::Boolean(typed)
    }

    /// Checks that both counts survive encoding to Thrift and a full round trip.
    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
        let statistics = make_bool_stats(distinct_count, null_count);
        let as_unsigned = |count: Option<i64>| count.map(|c| c as u64);

        let encoded = page_stats_to_thrift(Some(&statistics)).unwrap();
        assert_eq!(as_unsigned(encoded.null_count), null_count);
        assert_eq!(as_unsigned(encoded.distinct_count), distinct_count);

        let decoded = from_thrift_page_stats(Type::BOOLEAN, Some(encoded))
            .unwrap()
            .unwrap();
        assert_eq!(decoded, statistics);
    }

    /// Formats the min value of any displayable typed statistics.
    fn generic_statistics_handler<T: std::fmt::Display>(stats: ValueStatistics<T>) -> String {
        stats
            .min_opt()
            .map_or_else(|| "min: NA".to_string(), |s| format!("min: {s}"))
    }

    #[test]
    fn test_statistics_min_max_bytes() {
        let ints = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
        assert_eq!(ints.min_bytes_opt(), Some((-123i32).as_bytes()));
        assert_eq!(ints.max_bytes_opt(), Some(234i32).as_ref().map(|v| v.as_bytes()));

        let bytes = Statistics::byte_array(
            Some(ByteArray::from(vec![1, 2, 3])),
            Some(ByteArray::from(vec![3, 4, 5])),
            None,
            Some(1),
            true,
        );
        assert_eq!(bytes.min_bytes_opt().unwrap(), &[1, 2, 3]);
        assert_eq!(bytes.max_bytes_opt().unwrap(), &[3, 4, 5]);
    }

    #[test]
    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
    fn test_statistics_negative_null_count() {
        let thrift_stats = PageStatistics {
            null_count: Some(-10),
            ..unset_page_stats()
        };

        from_thrift_page_stats(Type::INT32, Some(thrift_stats)).unwrap();
    }

    #[test]
    fn test_statistics_thrift_none() {
        for physical_type in [Type::INT32, Type::BYTE_ARRAY] {
            assert_eq!(from_thrift_page_stats(physical_type, None).unwrap(), None);
        }
    }

    #[test]
    fn test_statistics_debug() {
        let with_bounds = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        assert_eq!(
            format!("{with_bounds:?}"),
            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
             min_max_deprecated: true, min_max_backwards_compatible: true, \
             max_value_exact: true, min_value_exact: true})"
        );

        let without_bounds = Statistics::int32(None, None, None, Some(7), false);
        assert_eq!(
            format!("{without_bounds:?}"),
            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
             min_max_deprecated: false, min_max_backwards_compatible: false, \
             max_value_exact: false, min_value_exact: false})"
        );
    }

    #[test]
    fn test_statistics_display() {
        let int32_stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        assert_eq!(
            int32_stats.to_string(),
            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, \
             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
        );

        let int64_stats = Statistics::int64(None, None, None, Some(7), false);
        assert_eq!(
            int64_stats.to_string(),
            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, \
             min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
        );

        let int96_stats = Statistics::int96(
            Some(Int96::from(vec![1, 0, 0])),
            Some(Int96::from(vec![2, 3, 4])),
            None,
            Some(3),
            true,
        );
        assert_eq!(
            int96_stats.to_string(),
            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
        );

        let inexact_bytes = ValueStatistics::new(
            Some(ByteArray::from(vec![1u8])),
            Some(ByteArray::from(vec![2u8])),
            Some(5),
            Some(7),
            false,
        )
        .with_max_is_exact(false)
        .with_min_is_exact(false);
        assert_eq!(
            Statistics::ByteArray(inexact_bytes).to_string(),
            "{min: [1], max: [2], distinct_count: 5, null_count: 7, \
             min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
        );
    }

    #[test]
    fn test_statistics_partial_eq() {
        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);

        assert_eq!(
            Statistics::int32(Some(12), Some(45), None, Some(11), true),
            expected
        );
        // Changing any single component must break equality.
        let near_misses = vec![
            Statistics::int32(Some(11), Some(45), None, Some(11), true),
            Statistics::int32(Some(12), Some(44), None, Some(11), true),
            Statistics::int32(Some(12), Some(45), None, Some(23), true),
            Statistics::int32(Some(12), Some(45), None, Some(11), false),
        ];
        for other in near_misses {
            assert_ne!(other, expected);
        }

        // Different variants never compare equal, even with matching contents.
        assert_ne!(
            Statistics::int32(Some(12), Some(45), None, Some(11), false),
            Statistics::int64(Some(12), Some(45), None, Some(11), false)
        );
        assert_ne!(
            Statistics::boolean(Some(false), Some(true), None, None, true),
            Statistics::double(Some(1.2), Some(4.5), None, None, true)
        );
        assert_ne!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true
            ),
            Statistics::fixed_len_byte_array(
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                None,
                None,
                true,
            )
        );

        // The exactness flags take part in equality.
        let inexact_max = ValueStatistics::new(
            Some(ByteArray::from(vec![1, 2, 3])),
            Some(ByteArray::from(vec![1, 2, 3])),
            None,
            None,
            true,
        )
        .with_max_is_exact(false);
        assert_ne!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ),
            Statistics::ByteArray(inexact_max)
        );

        let inexact_min = ValueStatistics::new(
            Some(FixedLenByteArray::from(vec![1, 2, 3])),
            Some(FixedLenByteArray::from(vec![1, 2, 3])),
            None,
            None,
            true,
        )
        .with_min_is_exact(false);
        assert_ne!(
            Statistics::fixed_len_byte_array(
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ),
            Statistics::FixedLenByteArray(inexact_min)
        );
    }

    #[test]
    fn test_statistics_from_thrift() {
        // Each case must survive a round trip through the Thrift representation.
        let cases = vec![
            Statistics::boolean(Some(false), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(true), None, Some(0), false),
            Statistics::boolean(Some(true), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(false), None, Some(7), true),
            Statistics::boolean(None, None, None, Some(7), true),
            Statistics::int32(Some(-100), Some(500), None, Some(7), true),
            Statistics::int32(Some(-100), Some(500), None, Some(0), false),
            Statistics::int32(None, None, None, Some(7), true),
            Statistics::int64(Some(-100), Some(200), None, Some(7), true),
            Statistics::int64(Some(-100), Some(200), None, Some(0), false),
            Statistics::int64(None, None, None, Some(7), true),
            Statistics::float(Some(1.2), Some(3.4), None, Some(7), true),
            Statistics::float(Some(1.2), Some(3.4), None, Some(0), false),
            Statistics::float(None, None, None, Some(7), true),
            Statistics::double(Some(1.2), Some(3.4), None, Some(7), true),
            Statistics::double(Some(1.2), Some(3.4), None, Some(0), false),
            Statistics::double(None, None, None, Some(7), true),
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![3, 4, 5])),
                None,
                Some(7),
                true,
            ),
            Statistics::byte_array(None, None, None, Some(7), true),
            Statistics::fixed_len_byte_array(
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                Some(ByteArray::from(vec![3, 4, 5]).into()),
                None,
                Some(7),
                true,
            ),
            Statistics::fixed_len_byte_array(None, None, None, Some(7), true),
        ];

        for original in cases {
            let physical_type = original.physical_type();
            let encoded = page_stats_to_thrift(Some(&original));
            assert_eq!(
                from_thrift_page_stats(physical_type, encoded).unwrap(),
                Some(original)
            );
        }
    }

    #[test]
    fn test_count_encoding() {
        let cases = vec![
            (None, None),
            (Some(0), Some(0)),
            (Some(100), Some(2000)),
            (Some(1), None),
            (None, Some(1)),
        ];
        for (distinct_count, null_count) in cases {
            statistics_count_test(distinct_count, null_count);
        }
    }

    #[test]
    fn test_count_encoding_distinct_too_large() {
        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
        let encoded = page_stats_to_thrift(Some(&statistics)).unwrap();
        // The distinct count does not fit in i64 and is dropped; the null count is kept.
        assert_eq!(encoded.distinct_count, None);
        assert_eq!(encoded.null_count, Some(100));
    }

    #[test]
    fn test_count_encoding_null_too_large() {
        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
        let encoded = page_stats_to_thrift(Some(&statistics)).unwrap();
        // The null count does not fit in i64 and is dropped; the distinct count is kept.
        assert_eq!(encoded.distinct_count, Some(100));
        assert_eq!(encoded.null_count, None);
    }

    #[test]
    fn test_count_decoding_null_invalid() {
        let tstatistics = PageStatistics {
            null_count: Some(-42),
            ..unset_page_stats()
        };
        let err = from_thrift_page_stats(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Parquet error: Statistics null count is negative -42"
        );
    }

    #[test]
    fn test_int96_invalid_statistics() {
        // INT96 values are 12 bytes long, so a 13-byte value must be rejected.
        let oversized: Vec<u8> = (0..13).collect();

        let bad_min = PageStatistics {
            min: Some(oversized.clone()),
            null_count: Some(0),
            ..unset_page_stats()
        };
        let err = from_thrift_page_stats(Type::INT96, Some(bad_min)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Parquet error: Incorrect Int96 min statistics"
        );

        let bad_max = PageStatistics {
            max: Some(oversized),
            null_count: Some(0),
            ..unset_page_stats()
        };
        let err = from_thrift_page_stats(Type::INT96, Some(bad_max)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Parquet error: Incorrect Int96 max statistics"
        );
    }

    #[test]
    fn test_generic_access() {
        let stats = Statistics::int32(Some(12), Some(45), None, Some(11), false);

        if let Statistics::Int32(typed) = stats {
            assert_eq!(generic_statistics_handler(typed), "min: 12");
        } else {
            unreachable!()
        }
    }
}