1use std::fmt;
43
44use crate::format::Statistics as TStatistics;
45
46use crate::basic::Type;
47use crate::data_type::private::ParquetValueType;
48use crate::data_type::*;
49use crate::errors::{ParquetError, Result};
50use crate::util::bit_util::FromBytes;
51
52pub(crate) mod private {
53 use super::*;
54
55 pub trait MakeStatistics {
56 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
57 where
58 Self: Sized;
59 }
60
61 macro_rules! gen_make_statistics {
62 ($value_ty:ty, $stat:ident) => {
63 impl MakeStatistics for $value_ty {
64 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
65 where
66 Self: Sized,
67 {
68 Statistics::$stat(statistics)
69 }
70 }
71 };
72 }
73
74 gen_make_statistics!(bool, Boolean);
75 gen_make_statistics!(i32, Int32);
76 gen_make_statistics!(i64, Int64);
77 gen_make_statistics!(Int96, Int96);
78 gen_make_statistics!(f32, Float);
79 gen_make_statistics!(f64, Double);
80 gen_make_statistics!(ByteArray, ByteArray);
81 gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
82}
83
// Generates a public constructor named `$name` that wraps a freshly built
// `ValueStatistics` in the `Statistics::$variant` enum variant.
// `$value` is the type of the `min`/`max` arguments (e.g. `Option<i32>`).
macro_rules! statistics_new_func {
    ($name:ident, $value:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $name(
            min: $value,
            max: $value,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
105
// Forwards a zero-argument method call to the typed `ValueStatistics` held by
// whichever `Statistics` variant `$stats` currently is.
macro_rules! statistics_enum_func {
    ($stats:ident, $method:ident) => {{
        match $stats {
            Statistics::Boolean(inner) => inner.$method(),
            Statistics::Int32(inner) => inner.$method(),
            Statistics::Int64(inner) => inner.$method(),
            Statistics::Int96(inner) => inner.$method(),
            Statistics::Float(inner) => inner.$method(),
            Statistics::Double(inner) => inner.$method(),
            Statistics::ByteArray(inner) => inner.$method(),
            Statistics::FixedLenByteArray(inner) => inner.$method(),
        }
    }};
}
121
122pub fn from_thrift(
124 physical_type: Type,
125 thrift_stats: Option<TStatistics>,
126) -> Result<Option<Statistics>> {
127 Ok(match thrift_stats {
128 Some(stats) => {
129 let null_count = stats.null_count.unwrap_or(0);
133
134 if null_count < 0 {
135 return Err(ParquetError::General(format!(
136 "Statistics null count is negative {}",
137 null_count
138 )));
139 }
140
141 let null_count = Some(null_count as u64);
143 let distinct_count = stats.distinct_count.map(|value| value as u64);
145 let old_format = stats.min_value.is_none() && stats.max_value.is_none();
147 let min = if old_format {
149 stats.min
150 } else {
151 stats.min_value
152 };
153 let max = if old_format {
155 stats.max
156 } else {
157 stats.max_value
158 };
159
160 fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> {
161 if let Some(min) = min {
162 if min.len() < len {
163 return Err(ParquetError::General(
164 "Insufficient bytes to parse min statistic".to_string(),
165 ));
166 }
167 }
168 if let Some(max) = max {
169 if max.len() < len {
170 return Err(ParquetError::General(
171 "Insufficient bytes to parse max statistic".to_string(),
172 ));
173 }
174 }
175 Ok(())
176 }
177
178 match physical_type {
179 Type::BOOLEAN => check_len(&min, &max, 1),
180 Type::INT32 | Type::FLOAT => check_len(&min, &max, 4),
181 Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8),
182 Type::INT96 => check_len(&min, &max, 12),
183 _ => Ok(()),
184 }?;
185
186 let res = match physical_type {
191 Type::BOOLEAN => Statistics::boolean(
192 min.map(|data| data[0] != 0),
193 max.map(|data| data[0] != 0),
194 distinct_count,
195 null_count,
196 old_format,
197 ),
198 Type::INT32 => Statistics::int32(
199 min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
200 max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
201 distinct_count,
202 null_count,
203 old_format,
204 ),
205 Type::INT64 => Statistics::int64(
206 min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
207 max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
208 distinct_count,
209 null_count,
210 old_format,
211 ),
212 Type::INT96 => {
213 let min = if let Some(data) = min {
217 assert_eq!(data.len(), 12);
218 Some(Int96::try_from_le_slice(&data)?)
219 } else {
220 None
221 };
222 let max = if let Some(data) = max {
223 assert_eq!(data.len(), 12);
224 Some(Int96::try_from_le_slice(&data)?)
225 } else {
226 None
227 };
228 Statistics::int96(min, max, distinct_count, null_count, old_format)
229 }
230 Type::FLOAT => Statistics::float(
231 min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
232 max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
233 distinct_count,
234 null_count,
235 old_format,
236 ),
237 Type::DOUBLE => Statistics::double(
238 min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
239 max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
240 distinct_count,
241 null_count,
242 old_format,
243 ),
244 Type::BYTE_ARRAY => Statistics::ByteArray(
245 ValueStatistics::new(
246 min.map(ByteArray::from),
247 max.map(ByteArray::from),
248 distinct_count,
249 null_count,
250 old_format,
251 )
252 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
253 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
254 ),
255 Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
256 ValueStatistics::new(
257 min.map(ByteArray::from).map(FixedLenByteArray::from),
258 max.map(ByteArray::from).map(FixedLenByteArray::from),
259 distinct_count,
260 null_count,
261 old_format,
262 )
263 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
264 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
265 ),
266 };
267
268 Some(res)
269 }
270 None => None,
271 })
272}
273
274pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
276 let stats = stats?;
277
278 let null_count = stats
280 .null_count_opt()
281 .and_then(|value| i64::try_from(value).ok());
282
283 let distinct_count = stats
285 .distinct_count_opt()
286 .and_then(|value| i64::try_from(value).ok());
287
288 let mut thrift_stats = TStatistics {
289 max: None,
290 min: None,
291 null_count,
292 distinct_count,
293 max_value: None,
294 min_value: None,
295 is_max_value_exact: None,
296 is_min_value_exact: None,
297 };
298
299 let (min, max, min_exact, max_exact) = (
301 stats.min_bytes_opt().map(|x| x.to_vec()),
302 stats.max_bytes_opt().map(|x| x.to_vec()),
303 Some(stats.min_is_exact()),
304 Some(stats.max_is_exact()),
305 );
306 if stats.is_min_max_backwards_compatible() {
307 thrift_stats.min.clone_from(&min);
309 thrift_stats.max.clone_from(&max);
310 }
311
312 if !stats.is_min_max_deprecated() {
313 thrift_stats.min_value = min;
314 thrift_stats.max_value = max;
315 }
316
317 thrift_stats.is_min_value_exact = min_exact;
318 thrift_stats.is_max_value_exact = max_exact;
319
320 Some(thrift_stats)
321}
322
323#[derive(Debug, Clone, PartialEq)]
335pub enum Statistics {
336 Boolean(ValueStatistics<bool>),
338 Int32(ValueStatistics<i32>),
340 Int64(ValueStatistics<i64>),
342 Int96(ValueStatistics<Int96>),
344 Float(ValueStatistics<f32>),
346 Double(ValueStatistics<f64>),
348 ByteArray(ValueStatistics<ByteArray>),
350 FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
352}
353
354impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
355 fn from(t: ValueStatistics<T>) -> Self {
356 T::make_statistics(t)
357 }
358}
359
360impl Statistics {
361 pub fn new<T: ParquetValueType>(
363 min: Option<T>,
364 max: Option<T>,
365 distinct_count: Option<u64>,
366 null_count: Option<u64>,
367 is_deprecated: bool,
368 ) -> Self {
369 Self::from(ValueStatistics::new(
370 min,
371 max,
372 distinct_count,
373 null_count,
374 is_deprecated,
375 ))
376 }
377
378 statistics_new_func![boolean, Option<bool>, Boolean];
379
380 statistics_new_func![int32, Option<i32>, Int32];
381
382 statistics_new_func![int64, Option<i64>, Int64];
383
384 statistics_new_func![int96, Option<Int96>, Int96];
385
386 statistics_new_func![float, Option<f32>, Float];
387
388 statistics_new_func![double, Option<f64>, Double];
389
390 statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
391
392 statistics_new_func![
393 fixed_len_byte_array,
394 Option<FixedLenByteArray>,
395 FixedLenByteArray
396 ];
397
398 pub fn is_min_max_deprecated(&self) -> bool {
405 statistics_enum_func![self, is_min_max_deprecated]
406 }
407
408 pub fn is_min_max_backwards_compatible(&self) -> bool {
419 statistics_enum_func![self, is_min_max_backwards_compatible]
420 }
421
422 #[deprecated(since = "53.0.0", note = "Use `distinct_count_opt` method instead")]
425 pub fn distinct_count(&self) -> Option<u64> {
426 self.distinct_count_opt()
427 }
428
429 pub fn distinct_count_opt(&self) -> Option<u64> {
432 statistics_enum_func![self, distinct_count]
433 }
434
435 #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
440 pub fn null_count(&self) -> u64 {
441 self.null_count_opt().unwrap_or(0)
443 }
444
445 #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
447 #[allow(deprecated)]
448 pub fn has_nulls(&self) -> bool {
449 self.null_count() > 0
450 }
451
452 pub fn null_count_opt(&self) -> Option<u64> {
459 statistics_enum_func![self, null_count_opt]
460 }
461
462 #[deprecated(
465 since = "53.0.0",
466 note = "Use `min_bytes_opt` and `max_bytes_opt` methods instead"
467 )]
468 pub fn has_min_max_set(&self) -> bool {
469 statistics_enum_func![self, _internal_has_min_max_set]
470 }
471
472 pub fn min_is_exact(&self) -> bool {
474 statistics_enum_func![self, min_is_exact]
475 }
476
477 pub fn max_is_exact(&self) -> bool {
479 statistics_enum_func![self, max_is_exact]
480 }
481
482 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
484 statistics_enum_func![self, min_bytes_opt]
485 }
486
487 #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
490 pub fn min_bytes(&self) -> &[u8] {
491 self.min_bytes_opt().unwrap()
492 }
493
494 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
496 statistics_enum_func![self, max_bytes_opt]
497 }
498
499 #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
502 pub fn max_bytes(&self) -> &[u8] {
503 self.max_bytes_opt().unwrap()
504 }
505
506 pub fn physical_type(&self) -> Type {
508 match self {
509 Statistics::Boolean(_) => Type::BOOLEAN,
510 Statistics::Int32(_) => Type::INT32,
511 Statistics::Int64(_) => Type::INT64,
512 Statistics::Int96(_) => Type::INT96,
513 Statistics::Float(_) => Type::FLOAT,
514 Statistics::Double(_) => Type::DOUBLE,
515 Statistics::ByteArray(_) => Type::BYTE_ARRAY,
516 Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
517 }
518 }
519}
520
521impl fmt::Display for Statistics {
522 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
523 match self {
524 Statistics::Boolean(typed) => write!(f, "{typed}"),
525 Statistics::Int32(typed) => write!(f, "{typed}"),
526 Statistics::Int64(typed) => write!(f, "{typed}"),
527 Statistics::Int96(typed) => write!(f, "{typed}"),
528 Statistics::Float(typed) => write!(f, "{typed}"),
529 Statistics::Double(typed) => write!(f, "{typed}"),
530 Statistics::ByteArray(typed) => write!(f, "{typed}"),
531 Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
532 }
533 }
534}
535
536pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
538
539#[derive(Clone, Eq, PartialEq)]
543pub struct ValueStatistics<T> {
544 min: Option<T>,
545 max: Option<T>,
546 distinct_count: Option<u64>,
548 null_count: Option<u64>,
549
550 is_max_value_exact: bool,
552 is_min_value_exact: bool,
553
554 is_min_max_deprecated: bool,
557
558 is_min_max_backwards_compatible: bool,
561}
562
563impl<T: ParquetValueType> ValueStatistics<T> {
564 pub fn new(
566 min: Option<T>,
567 max: Option<T>,
568 distinct_count: Option<u64>,
569 null_count: Option<u64>,
570 is_min_max_deprecated: bool,
571 ) -> Self {
572 Self {
573 is_max_value_exact: max.is_some(),
574 is_min_value_exact: min.is_some(),
575 min,
576 max,
577 distinct_count,
578 null_count,
579 is_min_max_deprecated,
580 is_min_max_backwards_compatible: is_min_max_deprecated,
581 }
582 }
583
584 pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
589 Self {
590 is_min_value_exact,
591 ..self
592 }
593 }
594
595 pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
600 Self {
601 is_max_value_exact,
602 ..self
603 }
604 }
605
606 pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self {
612 Self {
613 is_min_max_backwards_compatible: backwards_compatible,
614 ..self
615 }
616 }
617
618 #[deprecated(since = "53.0.0", note = "Use `min_opt` instead")]
623 pub fn min(&self) -> &T {
624 self.min.as_ref().unwrap()
625 }
626
627 pub fn min_opt(&self) -> Option<&T> {
629 self.min.as_ref()
630 }
631
632 #[deprecated(since = "53.0.0", note = "Use `max_opt` instead")]
637 pub fn max(&self) -> &T {
638 self.max.as_ref().unwrap()
639 }
640
641 pub fn max_opt(&self) -> Option<&T> {
643 self.max.as_ref()
644 }
645
646 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
648 self.min_opt().map(AsBytes::as_bytes)
649 }
650
651 #[deprecated(since = "53.0.0", note = "Use `min_bytes_opt` instead")]
656 pub fn min_bytes(&self) -> &[u8] {
657 self.min_bytes_opt().unwrap()
658 }
659
660 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
662 self.max_opt().map(AsBytes::as_bytes)
663 }
664
665 #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
670 pub fn max_bytes(&self) -> &[u8] {
671 self.max_bytes_opt().unwrap()
672 }
673
674 #[deprecated(since = "53.0.0", note = "Use `min_opt` and `max_opt` methods instead")]
677 pub fn has_min_max_set(&self) -> bool {
678 self._internal_has_min_max_set()
679 }
680
681 pub(crate) fn _internal_has_min_max_set(&self) -> bool {
684 self.min.is_some() && self.max.is_some()
685 }
686
687 pub fn max_is_exact(&self) -> bool {
689 self.max.is_some() && self.is_max_value_exact
690 }
691
692 pub fn min_is_exact(&self) -> bool {
694 self.min.is_some() && self.is_min_value_exact
695 }
696
697 pub fn distinct_count(&self) -> Option<u64> {
699 self.distinct_count
700 }
701
702 #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
705 pub fn null_count(&self) -> u64 {
706 self.null_count_opt().unwrap_or(0)
708 }
709
710 pub fn null_count_opt(&self) -> Option<u64> {
712 self.null_count
713 }
714
715 fn is_min_max_deprecated(&self) -> bool {
717 self.is_min_max_deprecated
718 }
719
720 pub fn is_min_max_backwards_compatible(&self) -> bool {
731 self.is_min_max_backwards_compatible
732 }
733}
734
735impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
736 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
737 write!(f, "{{")?;
738 write!(f, "min: ")?;
739 match self.min {
740 Some(ref value) => write!(f, "{value}")?,
741 None => write!(f, "N/A")?,
742 }
743 write!(f, ", max: ")?;
744 match self.max {
745 Some(ref value) => write!(f, "{value}")?,
746 None => write!(f, "N/A")?,
747 }
748 write!(f, ", distinct_count: ")?;
749 match self.distinct_count {
750 Some(value) => write!(f, "{value}")?,
751 None => write!(f, "N/A")?,
752 }
753 write!(f, ", null_count: ")?;
754 match self.null_count {
755 Some(value) => write!(f, "{value}")?,
756 None => write!(f, "N/A")?,
757 }
758 write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
759 write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
760 write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
761 write!(f, "}}")
762 }
763}
764
765impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
766 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
767 write!(
768 f,
769 "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
770 min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
771 self.min,
772 self.max,
773 self.distinct_count,
774 self.null_count,
775 self.is_min_max_deprecated,
776 self.is_min_max_backwards_compatible,
777 self.is_max_value_exact,
778 self.is_min_value_exact
779 )
780 }
781}
782
#[cfg(test)]
mod tests {
    use super::*;

    /// Byte accessors expose the raw bytes of the min and max values.
    #[test]
    fn test_statistics_min_max_bytes() {
        let (low, high) = (-123i32, 234i32);
        let stats = Statistics::int32(Some(low), Some(high), None, Some(1), false);
        assert_eq!(stats.min_bytes_opt(), Some(low.as_bytes()));
        assert_eq!(stats.max_bytes_opt(), Some(high.as_bytes()));

        let stats = Statistics::byte_array(
            Some(ByteArray::from(vec![1, 2, 3])),
            Some(ByteArray::from(vec![3, 4, 5])),
            None,
            Some(1),
            true,
        );
        assert_eq!(stats.min_bytes_opt(), Some(&[1u8, 2, 3][..]));
        assert_eq!(stats.max_bytes_opt(), Some(&[3u8, 4, 5][..]));
    }

    /// Unwrapping the conversion of a negative null count panics with the error.
    #[test]
    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
    fn test_statistics_negative_null_count() {
        let thrift_stats = TStatistics {
            max: None,
            min: None,
            null_count: Some(-10),
            distinct_count: None,
            max_value: None,
            min_value: None,
            is_max_value_exact: None,
            is_min_value_exact: None,
        };

        from_thrift(Type::INT32, Some(thrift_stats)).unwrap();
    }

    /// Absent Thrift statistics convert to `None` regardless of the physical type.
    #[test]
    fn test_statistics_thrift_none() {
        for physical_type in [Type::INT32, Type::BYTE_ARRAY] {
            assert_eq!(from_thrift(physical_type, None).unwrap(), None);
        }
    }

    #[test]
    fn test_statistics_debug() {
        let with_bounds = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        let expected = concat!(
            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), ",
            "min_max_deprecated: true, min_max_backwards_compatible: true, ",
            "max_value_exact: true, min_value_exact: true})"
        );
        assert_eq!(format!("{with_bounds:?}"), expected);

        let without_bounds = Statistics::int32(None, None, None, Some(7), false);
        let expected = concat!(
            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), ",
            "min_max_deprecated: false, min_max_backwards_compatible: false, ",
            "max_value_exact: false, min_value_exact: false})"
        );
        assert_eq!(format!("{without_bounds:?}"), expected);
    }

    #[test]
    fn test_statistics_display() {
        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        assert_eq!(
            stats.to_string(),
            concat!(
                "{min: 1, max: 12, distinct_count: N/A, null_count: 12, ",
                "min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
            )
        );

        let stats = Statistics::int64(None, None, None, Some(7), false);
        assert_eq!(
            stats.to_string(),
            concat!(
                "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, ",
                "min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
            )
        );

        let stats = Statistics::int96(
            Some(Int96::from(vec![1, 0, 0])),
            Some(Int96::from(vec![2, 3, 4])),
            None,
            Some(3),
            true,
        );
        assert_eq!(
            stats.to_string(),
            concat!(
                "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, ",
                "min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
            )
        );

        let stats = Statistics::ByteArray(
            ValueStatistics::new(
                Some(ByteArray::from(vec![1u8])),
                Some(ByteArray::from(vec![2u8])),
                Some(5),
                Some(7),
                false,
            )
            .with_max_is_exact(false)
            .with_min_is_exact(false),
        );
        assert_eq!(
            stats.to_string(),
            concat!(
                "{min: [1], max: [2], distinct_count: 5, null_count: 7, ",
                "min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
            )
        );
    }

    #[test]
    fn test_statistics_partial_eq() {
        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);

        // Identical statistics compare equal; changing any single input breaks equality.
        assert_eq!(
            Statistics::int32(Some(12), Some(45), None, Some(11), true),
            expected
        );
        assert_ne!(
            Statistics::int32(Some(11), Some(45), None, Some(11), true),
            expected
        );
        assert_ne!(
            Statistics::int32(Some(12), Some(44), None, Some(11), true),
            expected
        );
        assert_ne!(
            Statistics::int32(Some(12), Some(45), None, Some(23), true),
            expected
        );
        assert_ne!(
            Statistics::int32(Some(12), Some(45), None, Some(11), false),
            expected
        );

        // Different variants never compare equal.
        assert_ne!(
            Statistics::int32(Some(12), Some(45), None, Some(11), false),
            Statistics::int64(Some(12), Some(45), None, Some(11), false)
        );
        assert_ne!(
            Statistics::boolean(Some(false), Some(true), None, None, true),
            Statistics::double(Some(1.2), Some(4.5), None, None, true)
        );
        assert_ne!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true
            ),
            Statistics::fixed_len_byte_array(
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                None,
                None,
                true,
            )
        );

        // The exactness flags take part in the comparison.
        let inexact_max = Statistics::ByteArray(
            ValueStatistics::new(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            )
            .with_max_is_exact(false),
        );
        assert_ne!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ),
            inexact_max
        );

        let inexact_min = Statistics::FixedLenByteArray(
            ValueStatistics::new(
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            )
            .with_min_is_exact(false),
        );
        assert_ne!(
            Statistics::fixed_len_byte_array(
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ),
            inexact_min
        );
    }

    #[test]
    fn test_statistics_from_thrift() {
        /// Asserts that `stats` survives a round trip through the Thrift form.
        fn check_stats(stats: Statistics) {
            let physical_type = stats.physical_type();
            let encoded = to_thrift(Some(&stats));
            assert_eq!(from_thrift(physical_type, encoded).unwrap(), Some(stats));
        }

        let cases = vec![
            Statistics::boolean(Some(false), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(true), None, Some(0), false),
            Statistics::boolean(Some(true), Some(true), None, Some(7), true),
            Statistics::boolean(Some(false), Some(false), None, Some(7), true),
            Statistics::boolean(None, None, None, Some(7), true),
            Statistics::int32(Some(-100), Some(500), None, Some(7), true),
            Statistics::int32(Some(-100), Some(500), None, Some(0), false),
            Statistics::int32(None, None, None, Some(7), true),
            Statistics::int64(Some(-100), Some(200), None, Some(7), true),
            Statistics::int64(Some(-100), Some(200), None, Some(0), false),
            Statistics::int64(None, None, None, Some(7), true),
            Statistics::float(Some(1.2), Some(3.4), None, Some(7), true),
            Statistics::float(Some(1.2), Some(3.4), None, Some(0), false),
            Statistics::float(None, None, None, Some(7), true),
            Statistics::double(Some(1.2), Some(3.4), None, Some(7), true),
            Statistics::double(Some(1.2), Some(3.4), None, Some(0), false),
            Statistics::double(None, None, None, Some(7), true),
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![3, 4, 5])),
                None,
                Some(7),
                true,
            ),
            Statistics::byte_array(None, None, None, Some(7), true),
            Statistics::fixed_len_byte_array(
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                Some(ByteArray::from(vec![3, 4, 5]).into()),
                None,
                Some(7),
                true,
            ),
            Statistics::fixed_len_byte_array(None, None, None, Some(7), true),
        ];

        for stats in cases {
            check_stats(stats);
        }
    }

    #[test]
    fn test_count_encoding() {
        let cases = [
            (None, None),
            (Some(0), Some(0)),
            (Some(100), Some(2000)),
            (Some(1), None),
            (None, Some(1)),
        ];
        for (distinct_count, null_count) in cases {
            statistics_count_test(distinct_count, null_count);
        }
    }

    /// A distinct count that does not fit into an `i64` is not encoded.
    #[test]
    fn test_count_encoding_distinct_too_large() {
        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
        let encoded = to_thrift(Some(&statistics)).unwrap();
        assert_eq!(encoded.distinct_count, None);
        assert_eq!(encoded.null_count, Some(100));
    }

    /// A null count that does not fit into an `i64` is not encoded.
    #[test]
    fn test_count_encoding_null_too_large() {
        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
        let encoded = to_thrift(Some(&statistics)).unwrap();
        assert_eq!(encoded.distinct_count, Some(100));
        assert_eq!(encoded.null_count, None);
    }

    #[test]
    fn test_count_decoding_null_invalid() {
        let encoded = TStatistics {
            null_count: Some(-42),
            ..Default::default()
        };
        let err = from_thrift(Type::BOOLEAN, Some(encoded)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Parquet error: Statistics null count is negative -42"
        );
    }

    /// Encodes boolean statistics with the given counts, checks the encoded counts,
    /// and checks what comes back after decoding.
    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
        let statistics = make_bool_stats(distinct_count, null_count);

        let encoded = to_thrift(Some(&statistics)).unwrap();
        assert_eq!(encoded.null_count.map(|c| c as u64), null_count);
        assert_eq!(encoded.distinct_count.map(|c| c as u64), distinct_count);

        let round_tripped = from_thrift(Type::BOOLEAN, Some(encoded))
            .unwrap()
            .unwrap();

        if null_count.is_some() {
            assert_eq!(round_tripped, statistics);
            return;
        }

        // A missing null count is decoded as `Some(0)`, so only the remaining
        // fields are expected to survive the round trip unchanged.
        assert_ne!(round_tripped, statistics);
        assert!(round_tripped.null_count_opt().is_some());
        assert_eq!(round_tripped.null_count_opt(), Some(0));
        assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
        assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
        assert_eq!(
            round_tripped.distinct_count_opt(),
            statistics.distinct_count_opt()
        );
    }

    /// Builds non-deprecated boolean statistics with fixed bounds and the given counts.
    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
        let typed = ValueStatistics::new(Some(true), Some(false), distinct_count, null_count, false);
        Statistics::Boolean(typed)
    }
}