1use std::fmt;
43
44use crate::format::Statistics as TStatistics;
45
46use crate::basic::Type;
47use crate::data_type::private::ParquetValueType;
48use crate::data_type::*;
49use crate::errors::{ParquetError, Result};
50use crate::util::bit_util::FromBytes;
51
52pub(crate) mod private {
53 use super::*;
54
55 pub trait MakeStatistics {
56 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
57 where
58 Self: Sized;
59 }
60
61 macro_rules! gen_make_statistics {
62 ($value_ty:ty, $stat:ident) => {
63 impl MakeStatistics for $value_ty {
64 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
65 where
66 Self: Sized,
67 {
68 Statistics::$stat(statistics)
69 }
70 }
71 };
72 }
73
74 gen_make_statistics!(bool, Boolean);
75 gen_make_statistics!(i32, Int32);
76 gen_make_statistics!(i64, Int64);
77 gen_make_statistics!(Int96, Int96);
78 gen_make_statistics!(f32, Float);
79 gen_make_statistics!(f64, Double);
80 gen_make_statistics!(ByteArray, ByteArray);
81 gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
82}
83
// Generates a public constructor named `$name` that forwards its arguments to
// `ValueStatistics::new` and wraps the result in `Statistics::$variant`.
// Intended to be invoked inside an `impl Statistics` block (it returns `Self`).
macro_rules! statistics_new_func {
    ($name:ident, $value:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $name(
            min: $value,
            max: $value,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
105
// Expands to a `match` over every `Statistics` variant that calls the
// zero-argument method `$method` on the wrapped typed statistics.
// `$this` must be a reference to a `Statistics` value (typically `self`).
macro_rules! statistics_enum_func {
    ($this:ident, $method:ident) => {
        match $this {
            Statistics::Boolean(inner) => inner.$method(),
            Statistics::Int32(inner) => inner.$method(),
            Statistics::Int64(inner) => inner.$method(),
            Statistics::Int96(inner) => inner.$method(),
            Statistics::Float(inner) => inner.$method(),
            Statistics::Double(inner) => inner.$method(),
            Statistics::ByteArray(inner) => inner.$method(),
            Statistics::FixedLenByteArray(inner) => inner.$method(),
        }
    };
}
121
122pub fn from_thrift(
124 physical_type: Type,
125 thrift_stats: Option<TStatistics>,
126) -> Result<Option<Statistics>> {
127 Ok(match thrift_stats {
128 Some(stats) => {
129 let null_count = stats.null_count.unwrap_or(0);
133
134 if null_count < 0 {
135 return Err(ParquetError::General(format!(
136 "Statistics null count is negative {null_count}",
137 )));
138 }
139
140 let null_count = Some(null_count as u64);
142 let distinct_count = stats.distinct_count.map(|value| value as u64);
144 let old_format = stats.min_value.is_none() && stats.max_value.is_none();
146 let min = if old_format {
148 stats.min
149 } else {
150 stats.min_value
151 };
152 let max = if old_format {
154 stats.max
155 } else {
156 stats.max_value
157 };
158
159 fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
160 if let Some(min) = min {
161 if min.len() < len {
162 return Err(ParquetError::General(
163 "Insufficient bytes to parse min statistic".to_string(),
164 ));
165 }
166 }
167 if let Some(max) = max {
168 if max.len() < len {
169 return Err(ParquetError::General(
170 "Insufficient bytes to parse max statistic".to_string(),
171 ));
172 }
173 }
174 Ok(())
175 }
176
177 match physical_type {
178 Type::BOOLEAN => check_len(&min, &max, 1),
179 Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
180 Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
181 Type::INT96 => check_len(&min, &max, 12),
182 _ => Ok(()),
183 }?;
184
185 let res = match physical_type {
190 Type::BOOLEAN => Statistics::boolean(
191 min.map(|data| data[0] != 0),
192 max.map(|data| data[0] != 0),
193 distinct_count,
194 null_count,
195 old_format,
196 ),
197 Type::INT32 => Statistics::int32(
198 min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
199 max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
200 distinct_count,
201 null_count,
202 old_format,
203 ),
204 Type::INT64 => Statistics::int64(
205 min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
206 max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
207 distinct_count,
208 null_count,
209 old_format,
210 ),
211 Type::INT96 => {
212 let min = if let Some(data) = min {
214 assert_eq!(data.len(), 12);
215 Some(Int96::try_from_le_slice(&data)?)
216 } else {
217 None
218 };
219 let max = if let Some(data) = max {
220 assert_eq!(data.len(), 12);
221 Some(Int96::try_from_le_slice(&data)?)
222 } else {
223 None
224 };
225 Statistics::int96(min, max, distinct_count, null_count, old_format)
226 }
227 Type::FLOAT => Statistics::float(
228 min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
229 max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
230 distinct_count,
231 null_count,
232 old_format,
233 ),
234 Type::DOUBLE => Statistics::double(
235 min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
236 max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
237 distinct_count,
238 null_count,
239 old_format,
240 ),
241 Type::BYTE_ARRAY => Statistics::ByteArray(
242 ValueStatistics::new(
243 min.map(ByteArray::from),
244 max.map(ByteArray::from),
245 distinct_count,
246 null_count,
247 old_format,
248 )
249 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
250 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
251 ),
252 Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
253 ValueStatistics::new(
254 min.map(ByteArray::from).map(FixedLenByteArray::from),
255 max.map(ByteArray::from).map(FixedLenByteArray::from),
256 distinct_count,
257 null_count,
258 old_format,
259 )
260 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
261 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
262 ),
263 };
264
265 Some(res)
266 }
267 None => None,
268 })
269}
270
271pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
273 let stats = stats?;
274
275 let null_count = stats
277 .null_count_opt()
278 .and_then(|value| i64::try_from(value).ok());
279
280 let distinct_count = stats
282 .distinct_count_opt()
283 .and_then(|value| i64::try_from(value).ok());
284
285 let mut thrift_stats = TStatistics {
286 max: None,
287 min: None,
288 null_count,
289 distinct_count,
290 max_value: None,
291 min_value: None,
292 is_max_value_exact: None,
293 is_min_value_exact: None,
294 };
295
296 let (min, max, min_exact, max_exact) = (
298 stats.min_bytes_opt().map(|x| x.to_vec()),
299 stats.max_bytes_opt().map(|x| x.to_vec()),
300 Some(stats.min_is_exact()),
301 Some(stats.max_is_exact()),
302 );
303 if stats.is_min_max_backwards_compatible() {
304 thrift_stats.min.clone_from(&min);
306 thrift_stats.max.clone_from(&max);
307 }
308
309 if !stats.is_min_max_deprecated() {
310 thrift_stats.min_value = min;
311 thrift_stats.max_value = max;
312 }
313
314 thrift_stats.is_min_value_exact = min_exact;
315 thrift_stats.is_max_value_exact = max_exact;
316
317 Some(thrift_stats)
318}
319
320#[derive(Debug, Clone, PartialEq)]
332pub enum Statistics {
333 Boolean(ValueStatistics<bool>),
335 Int32(ValueStatistics<i32>),
337 Int64(ValueStatistics<i64>),
339 Int96(ValueStatistics<Int96>),
341 Float(ValueStatistics<f32>),
343 Double(ValueStatistics<f64>),
345 ByteArray(ValueStatistics<ByteArray>),
347 FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
349}
350
351impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
352 fn from(t: ValueStatistics<T>) -> Self {
353 T::make_statistics(t)
354 }
355}
356
357impl Statistics {
358 pub fn new<T: ParquetValueType>(
360 min: Option<T>,
361 max: Option<T>,
362 distinct_count: Option<u64>,
363 null_count: Option<u64>,
364 is_deprecated: bool,
365 ) -> Self {
366 Self::from(ValueStatistics::new(
367 min,
368 max,
369 distinct_count,
370 null_count,
371 is_deprecated,
372 ))
373 }
374
375 statistics_new_func![boolean, Option<bool>, Boolean];
376
377 statistics_new_func![int32, Option<i32>, Int32];
378
379 statistics_new_func![int64, Option<i64>, Int64];
380
381 statistics_new_func![int96, Option<Int96>, Int96];
382
383 statistics_new_func![float, Option<f32>, Float];
384
385 statistics_new_func![double, Option<f64>, Double];
386
387 statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
388
389 statistics_new_func![
390 fixed_len_byte_array,
391 Option<FixedLenByteArray>,
392 FixedLenByteArray
393 ];
394
395 pub fn is_min_max_deprecated(&self) -> bool {
402 statistics_enum_func![self, is_min_max_deprecated]
403 }
404
405 pub fn is_min_max_backwards_compatible(&self) -> bool {
416 statistics_enum_func![self, is_min_max_backwards_compatible]
417 }
418
419 pub fn distinct_count_opt(&self) -> Option<u64> {
422 statistics_enum_func![self, distinct_count]
423 }
424
425 pub fn null_count_opt(&self) -> Option<u64> {
432 statistics_enum_func![self, null_count_opt]
433 }
434
435 pub fn min_is_exact(&self) -> bool {
437 statistics_enum_func![self, min_is_exact]
438 }
439
440 pub fn max_is_exact(&self) -> bool {
442 statistics_enum_func![self, max_is_exact]
443 }
444
445 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
447 statistics_enum_func![self, min_bytes_opt]
448 }
449
450 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
452 statistics_enum_func![self, max_bytes_opt]
453 }
454
455 pub fn physical_type(&self) -> Type {
457 match self {
458 Statistics::Boolean(_) => Type::BOOLEAN,
459 Statistics::Int32(_) => Type::INT32,
460 Statistics::Int64(_) => Type::INT64,
461 Statistics::Int96(_) => Type::INT96,
462 Statistics::Float(_) => Type::FLOAT,
463 Statistics::Double(_) => Type::DOUBLE,
464 Statistics::ByteArray(_) => Type::BYTE_ARRAY,
465 Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
466 }
467 }
468}
469
470impl fmt::Display for Statistics {
471 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
472 match self {
473 Statistics::Boolean(typed) => write!(f, "{typed}"),
474 Statistics::Int32(typed) => write!(f, "{typed}"),
475 Statistics::Int64(typed) => write!(f, "{typed}"),
476 Statistics::Int96(typed) => write!(f, "{typed}"),
477 Statistics::Float(typed) => write!(f, "{typed}"),
478 Statistics::Double(typed) => write!(f, "{typed}"),
479 Statistics::ByteArray(typed) => write!(f, "{typed}"),
480 Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
481 }
482 }
483}
484
485pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
487
488#[derive(Clone, Eq, PartialEq)]
492pub struct ValueStatistics<T> {
493 min: Option<T>,
494 max: Option<T>,
495 distinct_count: Option<u64>,
497 null_count: Option<u64>,
498
499 is_max_value_exact: bool,
501 is_min_value_exact: bool,
502
503 is_min_max_deprecated: bool,
506
507 is_min_max_backwards_compatible: bool,
510}
511
512impl<T: ParquetValueType> ValueStatistics<T> {
513 pub fn new(
515 min: Option<T>,
516 max: Option<T>,
517 distinct_count: Option<u64>,
518 null_count: Option<u64>,
519 is_min_max_deprecated: bool,
520 ) -> Self {
521 Self {
522 is_max_value_exact: max.is_some(),
523 is_min_value_exact: min.is_some(),
524 min,
525 max,
526 distinct_count,
527 null_count,
528 is_min_max_deprecated,
529 is_min_max_backwards_compatible: is_min_max_deprecated,
530 }
531 }
532
533 pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
538 Self {
539 is_min_value_exact,
540 ..self
541 }
542 }
543
544 pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
549 Self {
550 is_max_value_exact,
551 ..self
552 }
553 }
554
555 pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self {
561 Self {
562 is_min_max_backwards_compatible: backwards_compatible,
563 ..self
564 }
565 }
566
567 pub fn min_opt(&self) -> Option<&T> {
569 self.min.as_ref()
570 }
571
572 pub fn max_opt(&self) -> Option<&T> {
574 self.max.as_ref()
575 }
576
577 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
579 self.min_opt().map(AsBytes::as_bytes)
580 }
581
582 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
584 self.max_opt().map(AsBytes::as_bytes)
585 }
586
587 pub(crate) fn _internal_has_min_max_set(&self) -> bool {
590 self.min.is_some() && self.max.is_some()
591 }
592
593 pub fn max_is_exact(&self) -> bool {
595 self.max.is_some() && self.is_max_value_exact
596 }
597
598 pub fn min_is_exact(&self) -> bool {
600 self.min.is_some() && self.is_min_value_exact
601 }
602
603 pub fn distinct_count(&self) -> Option<u64> {
605 self.distinct_count
606 }
607
608 pub fn null_count_opt(&self) -> Option<u64> {
610 self.null_count
611 }
612
613 fn is_min_max_deprecated(&self) -> bool {
615 self.is_min_max_deprecated
616 }
617
618 pub fn is_min_max_backwards_compatible(&self) -> bool {
629 self.is_min_max_backwards_compatible
630 }
631}
632
633impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
634 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
635 write!(f, "{{")?;
636 write!(f, "min: ")?;
637 match self.min {
638 Some(ref value) => write!(f, "{value}")?,
639 None => write!(f, "N/A")?,
640 }
641 write!(f, ", max: ")?;
642 match self.max {
643 Some(ref value) => write!(f, "{value}")?,
644 None => write!(f, "N/A")?,
645 }
646 write!(f, ", distinct_count: ")?;
647 match self.distinct_count {
648 Some(value) => write!(f, "{value}")?,
649 None => write!(f, "N/A")?,
650 }
651 write!(f, ", null_count: ")?;
652 match self.null_count {
653 Some(value) => write!(f, "{value}")?,
654 None => write!(f, "N/A")?,
655 }
656 write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
657 write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
658 write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
659 write!(f, "}}")
660 }
661}
662
663impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
664 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
665 write!(
666 f,
667 "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
668 min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
669 self.min,
670 self.max,
671 self.distinct_count,
672 self.null_count,
673 self.is_min_max_deprecated,
674 self.is_min_max_backwards_compatible,
675 self.is_max_value_exact,
676 self.is_min_value_exact
677 )
678 }
679}
680
#[cfg(test)]
mod tests {
    use super::*;

    // Min/max byte accessors return the raw bytes of the stored values.
    #[test]
    fn test_statistics_min_max_bytes() {
        let int_stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
        assert_eq!(int_stats.min_bytes_opt(), Some((-123).as_bytes()));
        assert_eq!(int_stats.max_bytes_opt(), Some(234.as_bytes()));

        let low = ByteArray::from(vec![1, 2, 3]);
        let high = ByteArray::from(vec![3, 4, 5]);
        let bytes_stats = Statistics::byte_array(Some(low), Some(high), None, Some(1), true);
        assert_eq!(bytes_stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
        assert_eq!(bytes_stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
    }

    // Unwrapping the error for a negative null count panics with its debug text.
    #[test]
    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
    fn test_statistics_negative_null_count() {
        let thrift_stats = TStatistics {
            null_count: Some(-10),
            ..Default::default()
        };

        from_thrift(Type::INT32, Some(thrift_stats)).unwrap();
    }

    #[test]
    fn test_statistics_thrift_none() {
        for tpe in [Type::INT32, Type::BYTE_ARRAY] {
            assert_eq!(from_thrift(tpe, None).unwrap(), None);
        }
    }

    #[test]
    fn test_statistics_debug() {
        let with_values = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        assert_eq!(
            format!("{with_values:?}"),
            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
        );

        let without_values = Statistics::int32(None, None, None, Some(7), false);
        assert_eq!(
            format!("{without_values:?}"),
            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
        );
    }

    #[test]
    fn test_statistics_display() {
        let int32_stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        assert_eq!(
            int32_stats.to_string(),
            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
        );

        let int64_stats = Statistics::int64(None, None, None, Some(7), false);
        assert_eq!(
            int64_stats.to_string(),
            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
        );

        let int96_stats = Statistics::int96(
            Some(Int96::from(vec![1, 0, 0])),
            Some(Int96::from(vec![2, 3, 4])),
            None,
            Some(3),
            true,
        );
        assert_eq!(
            int96_stats.to_string(),
            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
        );

        let inexact = ValueStatistics::new(
            Some(ByteArray::from(vec![1u8])),
            Some(ByteArray::from(vec![2u8])),
            Some(5),
            Some(7),
            false,
        )
        .with_max_is_exact(false)
        .with_min_is_exact(false);
        assert_eq!(
            Statistics::ByteArray(inexact).to_string(),
            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
        );
    }

    #[test]
    fn test_statistics_partial_eq() {
        // Shorthand for int32 statistics without a distinct count.
        let int32 = |min: i32, max: i32, nulls: u64, deprecated: bool| {
            Statistics::int32(Some(min), Some(max), None, Some(nulls), deprecated)
        };
        let expected = int32(12, 45, 11, true);

        assert_eq!(int32(12, 45, 11, true), expected);
        assert_ne!(int32(11, 45, 11, true), expected);
        assert_ne!(int32(12, 44, 11, true), expected);
        assert_ne!(int32(12, 45, 23, true), expected);
        assert_ne!(int32(12, 45, 11, false), expected);

        // Different variants never compare equal, even with matching payloads.
        assert_ne!(
            int32(12, 45, 11, false),
            Statistics::int64(Some(12), Some(45), None, Some(11), false)
        );
        assert_ne!(
            Statistics::boolean(Some(false), Some(true), None, None, true),
            Statistics::double(Some(1.2), Some(4.5), None, None, true)
        );
        assert_ne!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true
            ),
            Statistics::fixed_len_byte_array(
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                None,
                None,
                true,
            )
        );

        // The exactness flags take part in the comparison.
        let inexact_max = ValueStatistics::new(
            Some(ByteArray::from(vec![1, 2, 3])),
            Some(ByteArray::from(vec![1, 2, 3])),
            None,
            None,
            true,
        )
        .with_max_is_exact(false);
        assert_ne!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ),
            Statistics::ByteArray(inexact_max)
        );

        let inexact_min = ValueStatistics::new(
            Some(FixedLenByteArray::from(vec![1, 2, 3])),
            Some(FixedLenByteArray::from(vec![1, 2, 3])),
            None,
            None,
            true,
        )
        .with_min_is_exact(false);
        assert_ne!(
            Statistics::fixed_len_byte_array(
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ),
            Statistics::FixedLenByteArray(inexact_min)
        );
    }

    // Every case must survive a `to_thrift` / `from_thrift` round trip unchanged.
    #[test]
    fn test_statistics_from_thrift() {
        let cases = vec![
            Statistics::boolean(Some(false), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(true), None, Some(0), false),
            Statistics::boolean(Some(true), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(false), None, Some(7), true),
            Statistics::boolean(None, None, None, Some(7), true),
            Statistics::int32(Some(-100), Some(500), None, Some(7), true),
            Statistics::int32(Some(-100), Some(500), None, Some(0), false),
            Statistics::int32(None, None, None, Some(7), true),
            Statistics::int64(Some(-100), Some(200), None, Some(7), true),
            Statistics::int64(Some(-100), Some(200), None, Some(0), false),
            Statistics::int64(None, None, None, Some(7), true),
            Statistics::float(Some(1.2), Some(3.4), None, Some(7), true),
            Statistics::float(Some(1.2), Some(3.4), None, Some(0), false),
            Statistics::float(None, None, None, Some(7), true),
            Statistics::double(Some(1.2), Some(3.4), None, Some(7), true),
            Statistics::double(Some(1.2), Some(3.4), None, Some(0), false),
            Statistics::double(None, None, None, Some(7), true),
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![3, 4, 5])),
                None,
                Some(7),
                true,
            ),
            Statistics::byte_array(None, None, None, Some(7), true),
            Statistics::fixed_len_byte_array(
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                Some(ByteArray::from(vec![3, 4, 5]).into()),
                None,
                Some(7),
                true,
            ),
            Statistics::fixed_len_byte_array(None, None, None, Some(7), true),
        ];

        for stats in cases {
            let tpe = stats.physical_type();
            let thrift_stats = to_thrift(Some(&stats));
            assert_eq!(from_thrift(tpe, thrift_stats).unwrap(), Some(stats));
        }
    }

    #[test]
    fn test_count_encoding() {
        let cases = [
            (None, None),
            (Some(0), Some(0)),
            (Some(100), Some(2000)),
            (Some(1), None),
            (None, Some(1)),
        ];
        for (distinct_count, null_count) in cases {
            statistics_count_test(distinct_count, null_count);
        }
    }

    // A distinct count above `i64::MAX` is not written; the null count still is.
    #[test]
    fn test_count_encoding_distinct_too_large() {
        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
        assert_eq!(thrift_stats.distinct_count, None);
        assert_eq!(thrift_stats.null_count, Some(100));
    }

    // A null count above `i64::MAX` is not written; the distinct count still is.
    #[test]
    fn test_count_encoding_null_too_large() {
        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
        assert_eq!(thrift_stats.distinct_count, Some(100));
        assert_eq!(thrift_stats.null_count, None);
    }

    #[test]
    fn test_count_decoding_null_invalid() {
        let tstatistics = TStatistics {
            null_count: Some(-42),
            ..Default::default()
        };
        let err = from_thrift(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Parquet error: Statistics null count is negative -42"
        );
    }

    // Encodes boolean statistics with the given counts, checks the encoded counts,
    // and then checks what comes back from decoding.
    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
        let statistics = make_bool_stats(distinct_count, null_count);

        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
        assert_eq!(
            thrift_stats.distinct_count.map(|c| c as u64),
            distinct_count
        );

        let round_tripped = from_thrift(Type::BOOLEAN, Some(thrift_stats))
            .unwrap()
            .unwrap();

        if null_count.is_some() {
            assert_eq!(round_tripped, statistics);
            return;
        }

        // A missing null count is decoded as `Some(0)`, so only the remaining
        // parts can match the input.
        assert_ne!(round_tripped, statistics);
        assert!(round_tripped.null_count_opt().is_some());
        assert_eq!(round_tripped.null_count_opt(), Some(0));
        assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
        assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
        assert_eq!(
            round_tripped.distinct_count_opt(),
            statistics.distinct_count_opt()
        );
    }

    // Boolean statistics with fixed min/max, created with `is_min_max_deprecated = false`.
    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
        let typed = ValueStatistics::new(Some(true), Some(false), distinct_count, null_count, false);
        Statistics::Boolean(typed)
    }
}