1use std::fmt;
43
44use crate::format::Statistics as TStatistics;
45
46use crate::basic::Type;
47use crate::data_type::private::ParquetValueType;
48use crate::data_type::*;
49use crate::errors::{ParquetError, Result};
50use crate::util::bit_util::FromBytes;
51
52pub(crate) mod private {
53 use super::*;
54
55 pub trait MakeStatistics {
56 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
57 where
58 Self: Sized;
59 }
60
61 macro_rules! gen_make_statistics {
62 ($value_ty:ty, $stat:ident) => {
63 impl MakeStatistics for $value_ty {
64 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
65 where
66 Self: Sized,
67 {
68 Statistics::$stat(statistics)
69 }
70 }
71 };
72 }
73
74 gen_make_statistics!(bool, Boolean);
75 gen_make_statistics!(i32, Int32);
76 gen_make_statistics!(i64, Int64);
77 gen_make_statistics!(Int96, Int96);
78 gen_make_statistics!(f32, Float);
79 gen_make_statistics!(f64, Double);
80 gen_make_statistics!(ByteArray, ByteArray);
81 gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
82}
83
/// Generates a typed constructor inside `impl Statistics`.
///
/// `$ctor` is the name of the generated function, `$value` the type of its `min` and
/// `max` arguments, and `$variant` the [`Statistics`] variant the result is wrapped in.
macro_rules! statistics_new_func {
    ($ctor:ident, $value:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $ctor(
            min: $value,
            max: $value,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
105
/// Forwards a zero-argument method call to the typed statistics held by whichever
/// [`Statistics`] variant `$stats` refers to.
macro_rules! statistics_enum_func {
    ($stats:ident, $method:ident) => {{
        // Reborrow so that every arm binds the payload by shared reference.
        match &*$stats {
            Statistics::Boolean(inner) => inner.$method(),
            Statistics::Int32(inner) => inner.$method(),
            Statistics::Int64(inner) => inner.$method(),
            Statistics::Int96(inner) => inner.$method(),
            Statistics::Float(inner) => inner.$method(),
            Statistics::Double(inner) => inner.$method(),
            Statistics::ByteArray(inner) => inner.$method(),
            Statistics::FixedLenByteArray(inner) => inner.$method(),
        }
    }};
}
121
122pub fn from_thrift(
124 physical_type: Type,
125 thrift_stats: Option<TStatistics>,
126) -> Result<Option<Statistics>> {
127 Ok(match thrift_stats {
128 Some(stats) => {
129 let null_count = stats.null_count.unwrap_or(0);
133
134 if null_count < 0 {
135 return Err(ParquetError::General(format!(
136 "Statistics null count is negative {null_count}",
137 )));
138 }
139
140 let null_count = Some(null_count as u64);
142 let distinct_count = stats.distinct_count.map(|value| value as u64);
144 let old_format = stats.min_value.is_none() && stats.max_value.is_none();
146 let min = if old_format {
148 stats.min
149 } else {
150 stats.min_value
151 };
152 let max = if old_format {
154 stats.max
155 } else {
156 stats.max_value
157 };
158
159 fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
160 if let Some(min) = min {
161 if min.len() < len {
162 return Err(ParquetError::General(
163 "Insufficient bytes to parse min statistic".to_string(),
164 ));
165 }
166 }
167 if let Some(max) = max {
168 if max.len() < len {
169 return Err(ParquetError::General(
170 "Insufficient bytes to parse max statistic".to_string(),
171 ));
172 }
173 }
174 Ok(())
175 }
176
177 match physical_type {
178 Type::BOOLEAN => check_len(&min, &max, 1),
179 Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
180 Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
181 Type::INT96 => check_len(&min, &max, 12),
182 _ => Ok(()),
183 }?;
184
185 let res = match physical_type {
190 Type::BOOLEAN => Statistics::boolean(
191 min.map(|data| data[0] != 0),
192 max.map(|data| data[0] != 0),
193 distinct_count,
194 null_count,
195 old_format,
196 ),
197 Type::INT32 => Statistics::int32(
198 min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
199 max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
200 distinct_count,
201 null_count,
202 old_format,
203 ),
204 Type::INT64 => Statistics::int64(
205 min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
206 max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
207 distinct_count,
208 null_count,
209 old_format,
210 ),
211 Type::INT96 => {
212 let min = if let Some(data) = min {
216 assert_eq!(data.len(), 12);
217 Some(Int96::try_from_le_slice(&data)?)
218 } else {
219 None
220 };
221 let max = if let Some(data) = max {
222 assert_eq!(data.len(), 12);
223 Some(Int96::try_from_le_slice(&data)?)
224 } else {
225 None
226 };
227 Statistics::int96(min, max, distinct_count, null_count, old_format)
228 }
229 Type::FLOAT => Statistics::float(
230 min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
231 max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
232 distinct_count,
233 null_count,
234 old_format,
235 ),
236 Type::DOUBLE => Statistics::double(
237 min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
238 max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
239 distinct_count,
240 null_count,
241 old_format,
242 ),
243 Type::BYTE_ARRAY => Statistics::ByteArray(
244 ValueStatistics::new(
245 min.map(ByteArray::from),
246 max.map(ByteArray::from),
247 distinct_count,
248 null_count,
249 old_format,
250 )
251 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
252 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
253 ),
254 Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
255 ValueStatistics::new(
256 min.map(ByteArray::from).map(FixedLenByteArray::from),
257 max.map(ByteArray::from).map(FixedLenByteArray::from),
258 distinct_count,
259 null_count,
260 old_format,
261 )
262 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
263 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
264 ),
265 };
266
267 Some(res)
268 }
269 None => None,
270 })
271}
272
273pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
275 let stats = stats?;
276
277 let null_count = stats
279 .null_count_opt()
280 .and_then(|value| i64::try_from(value).ok());
281
282 let distinct_count = stats
284 .distinct_count_opt()
285 .and_then(|value| i64::try_from(value).ok());
286
287 let mut thrift_stats = TStatistics {
288 max: None,
289 min: None,
290 null_count,
291 distinct_count,
292 max_value: None,
293 min_value: None,
294 is_max_value_exact: None,
295 is_min_value_exact: None,
296 };
297
298 let (min, max, min_exact, max_exact) = (
300 stats.min_bytes_opt().map(|x| x.to_vec()),
301 stats.max_bytes_opt().map(|x| x.to_vec()),
302 Some(stats.min_is_exact()),
303 Some(stats.max_is_exact()),
304 );
305 if stats.is_min_max_backwards_compatible() {
306 thrift_stats.min.clone_from(&min);
308 thrift_stats.max.clone_from(&max);
309 }
310
311 if !stats.is_min_max_deprecated() {
312 thrift_stats.min_value = min;
313 thrift_stats.max_value = max;
314 }
315
316 thrift_stats.is_min_value_exact = min_exact;
317 thrift_stats.is_max_value_exact = max_exact;
318
319 Some(thrift_stats)
320}
321
/// Column statistics, with one variant per physical type.
///
/// Each variant wraps the [`ValueStatistics`] for the Rust value type used to
/// represent that physical type.
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
    /// Statistics for a `BOOLEAN` column.
    Boolean(ValueStatistics<bool>),
    /// Statistics for an `INT32` column.
    Int32(ValueStatistics<i32>),
    /// Statistics for an `INT64` column.
    Int64(ValueStatistics<i64>),
    /// Statistics for an `INT96` column.
    Int96(ValueStatistics<Int96>),
    /// Statistics for a `FLOAT` column.
    Float(ValueStatistics<f32>),
    /// Statistics for a `DOUBLE` column.
    Double(ValueStatistics<f64>),
    /// Statistics for a `BYTE_ARRAY` column.
    ByteArray(ValueStatistics<ByteArray>),
    /// Statistics for a `FIXED_LEN_BYTE_ARRAY` column.
    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
352
353impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
354 fn from(t: ValueStatistics<T>) -> Self {
355 T::make_statistics(t)
356 }
357}
358
359impl Statistics {
360 pub fn new<T: ParquetValueType>(
362 min: Option<T>,
363 max: Option<T>,
364 distinct_count: Option<u64>,
365 null_count: Option<u64>,
366 is_deprecated: bool,
367 ) -> Self {
368 Self::from(ValueStatistics::new(
369 min,
370 max,
371 distinct_count,
372 null_count,
373 is_deprecated,
374 ))
375 }
376
377 statistics_new_func![boolean, Option<bool>, Boolean];
378
379 statistics_new_func![int32, Option<i32>, Int32];
380
381 statistics_new_func![int64, Option<i64>, Int64];
382
383 statistics_new_func![int96, Option<Int96>, Int96];
384
385 statistics_new_func![float, Option<f32>, Float];
386
387 statistics_new_func![double, Option<f64>, Double];
388
389 statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
390
391 statistics_new_func![
392 fixed_len_byte_array,
393 Option<FixedLenByteArray>,
394 FixedLenByteArray
395 ];
396
397 pub fn is_min_max_deprecated(&self) -> bool {
404 statistics_enum_func![self, is_min_max_deprecated]
405 }
406
407 pub fn is_min_max_backwards_compatible(&self) -> bool {
418 statistics_enum_func![self, is_min_max_backwards_compatible]
419 }
420
421 pub fn distinct_count_opt(&self) -> Option<u64> {
424 statistics_enum_func![self, distinct_count]
425 }
426
427 pub fn null_count_opt(&self) -> Option<u64> {
434 statistics_enum_func![self, null_count_opt]
435 }
436
437 pub fn min_is_exact(&self) -> bool {
439 statistics_enum_func![self, min_is_exact]
440 }
441
442 pub fn max_is_exact(&self) -> bool {
444 statistics_enum_func![self, max_is_exact]
445 }
446
447 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
449 statistics_enum_func![self, min_bytes_opt]
450 }
451
452 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
454 statistics_enum_func![self, max_bytes_opt]
455 }
456
457 pub fn physical_type(&self) -> Type {
459 match self {
460 Statistics::Boolean(_) => Type::BOOLEAN,
461 Statistics::Int32(_) => Type::INT32,
462 Statistics::Int64(_) => Type::INT64,
463 Statistics::Int96(_) => Type::INT96,
464 Statistics::Float(_) => Type::FLOAT,
465 Statistics::Double(_) => Type::DOUBLE,
466 Statistics::ByteArray(_) => Type::BYTE_ARRAY,
467 Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
468 }
469 }
470}
471
472impl fmt::Display for Statistics {
473 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
474 match self {
475 Statistics::Boolean(typed) => write!(f, "{typed}"),
476 Statistics::Int32(typed) => write!(f, "{typed}"),
477 Statistics::Int64(typed) => write!(f, "{typed}"),
478 Statistics::Int96(typed) => write!(f, "{typed}"),
479 Statistics::Float(typed) => write!(f, "{typed}"),
480 Statistics::Double(typed) => write!(f, "{typed}"),
481 Statistics::ByteArray(typed) => write!(f, "{typed}"),
482 Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
483 }
484 }
485}
486
/// [`ValueStatistics`] for the value type associated with the [`DataType`] `T`.
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
489
/// Statistics for a column whose values are of type `T`.
///
/// All fields are private; use the constructor, the `with_*` builders and the
/// accessor methods.
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    /// Minimum value, if known.
    min: Option<T>,
    /// Maximum value, if known.
    max: Option<T>,
    /// Number of distinct values, if known.
    distinct_count: Option<u64>,
    /// Number of null values, if known.
    null_count: Option<u64>,

    /// Whether `max` is flagged as exact. `max_is_exact()` additionally requires
    /// `max` to be present.
    is_max_value_exact: bool,
    /// Whether `min` is flagged as exact. `min_is_exact()` additionally requires
    /// `min` to be present.
    is_min_value_exact: bool,

    /// Deprecated flag. `from_thrift` sets it when the values came from the legacy
    /// `min`/`max` fields; `to_thrift` omits `min_value`/`max_value` when it is set.
    is_min_max_deprecated: bool,

    /// When set, `to_thrift` also writes the legacy `min`/`max` fields.
    is_min_max_backwards_compatible: bool,
}
513
514impl<T: ParquetValueType> ValueStatistics<T> {
515 pub fn new(
517 min: Option<T>,
518 max: Option<T>,
519 distinct_count: Option<u64>,
520 null_count: Option<u64>,
521 is_min_max_deprecated: bool,
522 ) -> Self {
523 Self {
524 is_max_value_exact: max.is_some(),
525 is_min_value_exact: min.is_some(),
526 min,
527 max,
528 distinct_count,
529 null_count,
530 is_min_max_deprecated,
531 is_min_max_backwards_compatible: is_min_max_deprecated,
532 }
533 }
534
535 pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
540 Self {
541 is_min_value_exact,
542 ..self
543 }
544 }
545
546 pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
551 Self {
552 is_max_value_exact,
553 ..self
554 }
555 }
556
557 pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self {
563 Self {
564 is_min_max_backwards_compatible: backwards_compatible,
565 ..self
566 }
567 }
568
569 pub fn min_opt(&self) -> Option<&T> {
571 self.min.as_ref()
572 }
573
574 pub fn max_opt(&self) -> Option<&T> {
576 self.max.as_ref()
577 }
578
579 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
581 self.min_opt().map(AsBytes::as_bytes)
582 }
583
584 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
586 self.max_opt().map(AsBytes::as_bytes)
587 }
588
589 pub(crate) fn _internal_has_min_max_set(&self) -> bool {
592 self.min.is_some() && self.max.is_some()
593 }
594
595 pub fn max_is_exact(&self) -> bool {
597 self.max.is_some() && self.is_max_value_exact
598 }
599
600 pub fn min_is_exact(&self) -> bool {
602 self.min.is_some() && self.is_min_value_exact
603 }
604
605 pub fn distinct_count(&self) -> Option<u64> {
607 self.distinct_count
608 }
609
610 pub fn null_count_opt(&self) -> Option<u64> {
612 self.null_count
613 }
614
615 fn is_min_max_deprecated(&self) -> bool {
617 self.is_min_max_deprecated
618 }
619
620 pub fn is_min_max_backwards_compatible(&self) -> bool {
631 self.is_min_max_backwards_compatible
632 }
633}
634
635impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
636 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
637 write!(f, "{{")?;
638 write!(f, "min: ")?;
639 match self.min {
640 Some(ref value) => write!(f, "{value}")?,
641 None => write!(f, "N/A")?,
642 }
643 write!(f, ", max: ")?;
644 match self.max {
645 Some(ref value) => write!(f, "{value}")?,
646 None => write!(f, "N/A")?,
647 }
648 write!(f, ", distinct_count: ")?;
649 match self.distinct_count {
650 Some(value) => write!(f, "{value}")?,
651 None => write!(f, "N/A")?,
652 }
653 write!(f, ", null_count: ")?;
654 match self.null_count {
655 Some(value) => write!(f, "{value}")?,
656 None => write!(f, "N/A")?,
657 }
658 write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
659 write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
660 write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
661 write!(f, "}}")
662 }
663}
664
665impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
666 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
667 write!(
668 f,
669 "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
670 min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
671 self.min,
672 self.max,
673 self.distinct_count,
674 self.null_count,
675 self.is_min_max_deprecated,
676 self.is_min_max_backwards_compatible,
677 self.is_max_value_exact,
678 self.is_min_value_exact
679 )
680 }
681}
682
#[cfg(test)]
mod tests {
    use super::*;

    /// Min/max byte accessors return the encoded values for fixed and variable width types.
    #[test]
    fn test_statistics_min_max_bytes() {
        let int_stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
        assert_eq!(int_stats.min_bytes_opt(), Some((-123).as_bytes()));
        assert_eq!(int_stats.max_bytes_opt(), Some(234.as_bytes()));

        let bytes_stats = Statistics::byte_array(
            Some(ByteArray::from(vec![1, 2, 3])),
            Some(ByteArray::from(vec![3, 4, 5])),
            None,
            Some(1),
            true,
        );
        assert_eq!(bytes_stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
        assert_eq!(bytes_stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
    }

    /// Unwrapping the result of decoding a negative null count panics with the error text.
    #[test]
    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
    fn test_statistics_negative_null_count() {
        let thrift_stats = TStatistics {
            null_count: Some(-10),
            max: None,
            min: None,
            distinct_count: None,
            max_value: None,
            min_value: None,
            is_max_value_exact: None,
            is_min_value_exact: None,
        };

        from_thrift(Type::INT32, Some(thrift_stats)).unwrap();
    }

    /// Absent Thrift statistics decode to `None` regardless of physical type.
    #[test]
    fn test_statistics_thrift_none() {
        for physical_type in [Type::INT32, Type::BYTE_ARRAY] {
            assert_eq!(from_thrift(physical_type, None).unwrap(), None);
        }
    }

    /// `Debug` output lists every field, including the compatibility flag.
    #[test]
    fn test_statistics_debug() {
        let with_bounds = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        assert_eq!(
            format!("{:?}", with_bounds),
            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
        );

        let without_bounds = Statistics::int32(None, None, None, Some(7), false);
        assert_eq!(
            format!("{:?}", without_bounds),
            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
        );
    }

    /// `Display` output prints `N/A` for missing values and omits the compatibility flag.
    #[test]
    fn test_statistics_display() {
        let int32_stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        assert_eq!(
            int32_stats.to_string(),
            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
        );

        let int64_stats = Statistics::int64(None, None, None, Some(7), false);
        assert_eq!(
            int64_stats.to_string(),
            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
             false, max_value_exact: false, min_value_exact: false}"
        );

        let int96_stats = Statistics::int96(
            Some(Int96::from(vec![1, 0, 0])),
            Some(Int96::from(vec![2, 3, 4])),
            None,
            Some(3),
            true,
        );
        assert_eq!(
            int96_stats.to_string(),
            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
        );

        let inexact = ValueStatistics::new(
            Some(ByteArray::from(vec![1u8])),
            Some(ByteArray::from(vec![2u8])),
            Some(5),
            Some(7),
            false,
        )
        .with_max_is_exact(false)
        .with_min_is_exact(false);
        let byte_array_stats = Statistics::ByteArray(inexact);
        assert_eq!(
            byte_array_stats.to_string(),
            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
        );
    }

    /// Equality takes values, counts, flags and the variant into account.
    #[test]
    fn test_statistics_partial_eq() {
        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);

        assert_eq!(
            Statistics::int32(Some(12), Some(45), None, Some(11), true),
            expected
        );
        // Changing any single component breaks equality.
        assert_ne!(
            Statistics::int32(Some(11), Some(45), None, Some(11), true),
            expected
        );
        assert_ne!(
            Statistics::int32(Some(12), Some(44), None, Some(11), true),
            expected
        );
        assert_ne!(
            Statistics::int32(Some(12), Some(45), None, Some(23), true),
            expected
        );
        assert_ne!(
            Statistics::int32(Some(12), Some(45), None, Some(11), false),
            expected
        );

        // Different variants never compare equal.
        assert_ne!(
            Statistics::int32(Some(12), Some(45), None, Some(11), false),
            Statistics::int64(Some(12), Some(45), None, Some(11), false)
        );

        assert_ne!(
            Statistics::boolean(Some(false), Some(true), None, None, true),
            Statistics::double(Some(1.2), Some(4.5), None, None, true)
        );

        assert_ne!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true
            ),
            Statistics::fixed_len_byte_array(
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                None,
                None,
                true,
            )
        );

        // The exactness flags take part in the comparison as well.
        assert_ne!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ),
            Statistics::ByteArray(
                ValueStatistics::new(
                    Some(ByteArray::from(vec![1, 2, 3])),
                    Some(ByteArray::from(vec![1, 2, 3])),
                    None,
                    None,
                    true,
                )
                .with_max_is_exact(false)
            )
        );

        assert_ne!(
            Statistics::fixed_len_byte_array(
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ),
            Statistics::FixedLenByteArray(
                ValueStatistics::new(
                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
                    None,
                    None,
                    true,
                )
                .with_min_is_exact(false)
            )
        );
    }

    /// Encoding to Thrift and decoding again yields the original statistics.
    #[test]
    fn test_statistics_from_thrift() {
        let cases = vec![
            Statistics::boolean(Some(false), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(true), None, Some(0), false),
            Statistics::boolean(Some(true), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(false), None, Some(7), true),
            Statistics::boolean(None, None, None, Some(7), true),
            Statistics::int32(Some(-100), Some(500), None, Some(7), true),
            Statistics::int32(Some(-100), Some(500), None, Some(0), false),
            Statistics::int32(None, None, None, Some(7), true),
            Statistics::int64(Some(-100), Some(200), None, Some(7), true),
            Statistics::int64(Some(-100), Some(200), None, Some(0), false),
            Statistics::int64(None, None, None, Some(7), true),
            Statistics::float(Some(1.2), Some(3.4), None, Some(7), true),
            Statistics::float(Some(1.2), Some(3.4), None, Some(0), false),
            Statistics::float(None, None, None, Some(7), true),
            Statistics::double(Some(1.2), Some(3.4), None, Some(7), true),
            Statistics::double(Some(1.2), Some(3.4), None, Some(0), false),
            Statistics::double(None, None, None, Some(7), true),
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![3, 4, 5])),
                None,
                Some(7),
                true,
            ),
            Statistics::byte_array(None, None, None, Some(7), true),
            Statistics::fixed_len_byte_array(
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                Some(ByteArray::from(vec![3, 4, 5]).into()),
                None,
                Some(7),
                true,
            ),
            Statistics::fixed_len_byte_array(None, None, None, Some(7), true),
        ];

        for original in cases {
            let physical_type = original.physical_type();
            let encoded = to_thrift(Some(&original));
            let decoded = from_thrift(physical_type, encoded).unwrap();
            assert_eq!(decoded, Some(original));
        }
    }

    /// Counts that fit in an `i64` are encoded as-is.
    #[test]
    fn test_count_encoding() {
        let cases = [
            (None, None),
            (Some(0), Some(0)),
            (Some(100), Some(2000)),
            (Some(1), None),
            (None, Some(1)),
        ];
        for (distinct_count, null_count) in cases {
            statistics_count_test(distinct_count, null_count);
        }
    }

    /// A distinct count that does not fit in an `i64` is dropped; the null count is kept.
    #[test]
    fn test_count_encoding_distinct_too_large() {
        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
        let encoded = to_thrift(Some(&statistics)).unwrap();
        assert_eq!(encoded.distinct_count, None);
        assert_eq!(encoded.null_count, Some(100));
    }

    /// A null count that does not fit in an `i64` is dropped; the distinct count is kept.
    #[test]
    fn test_count_encoding_null_too_large() {
        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
        let encoded = to_thrift(Some(&statistics)).unwrap();
        assert_eq!(encoded.distinct_count, Some(100));
        assert_eq!(encoded.null_count, None);
    }

    /// A negative null count is rejected with a descriptive error.
    #[test]
    fn test_count_decoding_null_invalid() {
        let tstatistics = TStatistics {
            null_count: Some(-42),
            ..Default::default()
        };
        let message = from_thrift(Type::BOOLEAN, Some(tstatistics))
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            "Parquet error: Statistics null count is negative -42"
        );
    }

    /// Encodes boolean statistics with the given counts, checks the encoded counts and
    /// then checks what survives decoding.
    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
        let statistics = make_bool_stats(distinct_count, null_count);

        let encoded = to_thrift(Some(&statistics)).unwrap();
        assert_eq!(encoded.null_count.map(|c| c as u64), null_count);
        assert_eq!(encoded.distinct_count.map(|c| c as u64), distinct_count);

        let round_tripped = from_thrift(Type::BOOLEAN, Some(encoded))
            .unwrap()
            .unwrap();

        if null_count.is_some() {
            assert_eq!(round_tripped, statistics);
            return;
        }

        // A missing null count decodes as zero, so only the other parts can match.
        assert_ne!(round_tripped, statistics);
        assert!(round_tripped.null_count_opt().is_some());
        assert_eq!(round_tripped.null_count_opt(), Some(0));
        assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
        assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
        assert_eq!(
            round_tripped.distinct_count_opt(),
            statistics.distinct_count_opt()
        );
    }

    /// Builds non-deprecated boolean statistics with fixed bounds and the given counts.
    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
        let typed = ValueStatistics::new(Some(true), Some(false), distinct_count, null_count, false);
        Statistics::Boolean(typed)
    }
}