1mod footer_tail;
94mod memory;
95mod parser;
96mod push_decoder;
97pub(crate) mod reader;
98mod writer;
99
100use crate::basic::{ColumnOrder, Compression, Encoding, Type};
101#[cfg(feature = "encryption")]
102use crate::encryption::{
103 decrypt::FileDecryptor,
104 modules::{create_module_aad, ModuleType},
105};
106use crate::errors::{ParquetError, Result};
107#[cfg(feature = "encryption")]
108use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData};
109pub(crate) use crate::file::metadata::memory::HeapSize;
110use crate::file::page_encoding_stats::{self, PageEncodingStats};
111use crate::file::page_index::index::Index;
112use crate::file::page_index::offset_index::OffsetIndexMetaData;
113use crate::file::statistics::{self, Statistics};
114use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData;
115use crate::format::{
116 BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup,
117 SizeStatistics, SortingColumn,
118};
119use crate::geospatial::statistics as geo_statistics;
120use crate::schema::types::{
121 ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
122 Type as SchemaType,
123};
124#[cfg(feature = "encryption")]
125use crate::thrift::{TCompactSliceInputProtocol, TSerializable};
126pub use footer_tail::FooterTail;
127pub use push_decoder::ParquetMetaDataPushDecoder;
128pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
129use std::ops::Range;
130use std::sync::Arc;
131pub use writer::ParquetMetaDataWriter;
132pub(crate) use writer::ThriftMetadataWriter;
133
/// The "column index" for all column chunks: nested [`Index`] page statistics,
/// populated only when the page index has been read.
pub type ParquetColumnIndex = Vec<Vec<Index>>;
150
/// The "offset index" for all column chunks: nested [`OffsetIndexMetaData`]
/// page locations, populated only when the page index has been read.
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
162
/// Global metadata for a Parquet file: the file-level [`FileMetaData`], the
/// per-row-group [`RowGroupMetaData`], and (optionally) the page indexes.
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File-level metadata (schema, version, key/value pairs, ...).
    file_metadata: FileMetaData,
    /// Metadata for each row group, in file order.
    row_groups: Vec<RowGroupMetaData>,
    /// Page-level column index, if it has been loaded.
    column_index: Option<ParquetColumnIndex>,
    /// Page-level offset index, if it has been loaded.
    offset_index: Option<ParquetOffsetIndex>,
    /// Decryptor used for files with encrypted metadata/data.
    #[cfg(feature = "encryption")]
    file_decryptor: Option<FileDecryptor>,
}
194
impl ParquetMetaData {
    /// Creates Parquet metadata from file metadata and a list of row group metadata.
    /// Page indexes start out unset.
    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
        ParquetMetaData {
            file_metadata,
            row_groups,
            column_index: None,
            offset_index: None,
            #[cfg(feature = "encryption")]
            file_decryptor: None,
        }
    }

    /// Sets (or clears) the file decryptor used for this file.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
        self.file_decryptor = file_decryptor;
    }

    /// Converts this metadata back into a [`ParquetMetaDataBuilder`] for modification.
    pub fn into_builder(self) -> ParquetMetaDataBuilder {
        self.into()
    }

    /// Returns the file-level metadata.
    pub fn file_metadata(&self) -> &FileMetaData {
        &self.file_metadata
    }

    /// Returns the file decryptor, if one has been set.
    #[cfg(feature = "encryption")]
    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
        self.file_decryptor.as_ref()
    }

    /// Returns the number of row groups in the file.
    pub fn num_row_groups(&self) -> usize {
        self.row_groups.len()
    }

    /// Returns metadata for the `i`th row group.
    ///
    /// Panics if `i` is out of bounds.
    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
        &self.row_groups[i]
    }

    /// Returns metadata for all row groups, in file order.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.row_groups
    }

    /// Returns the page-level column index, if it has been loaded.
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.column_index.as_ref()
    }

    /// Returns the page-level offset index, if it has been loaded.
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.offset_index.as_ref()
    }

    /// Estimates the total memory used by this structure in bytes, including
    /// heap allocations, via [`HeapSize`].
    // NOTE(review): does not account for `file_decryptor` when the
    // "encryption" feature is enabled — confirm whether that is intentional.
    pub fn memory_size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.file_metadata.heap_size()
            + self.row_groups.heap_size()
            + self.column_index.heap_size()
            + self.offset_index.heap_size()
    }

    /// Overrides (or clears) the column index.
    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
        self.column_index = index;
    }

    /// Overrides (or clears) the offset index.
    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
        self.offset_index = index;
    }
}
300
/// Builder for [`ParquetMetaData`]; wraps the metadata being assembled.
pub struct ParquetMetaDataBuilder(ParquetMetaData);
339
impl ParquetMetaDataBuilder {
    /// Creates a new builder from file metadata, with no row groups.
    pub fn new(file_meta_data: FileMetaData) -> Self {
        Self(ParquetMetaData::new(file_meta_data, vec![]))
    }

    /// Creates a new builder from existing [`ParquetMetaData`].
    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
        Self(metadata)
    }

    /// Appends a row group to the metadata.
    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
        self.0.row_groups.push(row_group);
        self
    }

    /// Replaces all row groups.
    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
        self.0.row_groups = row_groups;
        self
    }

    /// Takes ownership of the row groups, leaving an empty list in the builder.
    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
        std::mem::take(&mut self.0.row_groups)
    }

    /// Returns the current row groups.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.0.row_groups
    }

    /// Sets (or clears) the column index.
    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
        self.0.column_index = column_index;
        self
    }

    /// Takes ownership of the column index, leaving `None` in the builder.
    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
        std::mem::take(&mut self.0.column_index)
    }

    /// Returns the current column index, if any.
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.0.column_index.as_ref()
    }

    /// Sets (or clears) the offset index.
    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
        self.0.offset_index = offset_index;
        self
    }

    /// Takes ownership of the offset index, leaving `None` in the builder.
    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
        std::mem::take(&mut self.0.offset_index)
    }

    /// Returns the current offset index, if any.
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.0.offset_index.as_ref()
    }

    /// Consumes the builder, producing the final [`ParquetMetaData`].
    pub fn build(self) -> ParquetMetaData {
        let Self(metadata) = self;
        metadata
    }
}
415
416impl From<ParquetMetaData> for ParquetMetaDataBuilder {
417 fn from(meta_data: ParquetMetaData) -> Self {
418 Self(meta_data)
419 }
420}
421
/// Application-defined key/value metadata pair (re-export of the Thrift type).
pub type KeyValue = crate::format::KeyValue;
424
/// Shared, reference-counted pointer to [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
427
/// File-level metadata: version, row count, creator string, key/value pairs,
/// schema, and column orders.
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Format version of the file.
    version: i32,
    /// Total number of rows across all row groups.
    num_rows: i64,
    /// Writer identification string, if present.
    created_by: Option<String>,
    /// Application-defined key/value metadata, if present.
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Descriptor for the file schema.
    schema_descr: SchemaDescPtr,
    /// Sort order for statistics of each column, if present.
    column_orders: Option<Vec<ColumnOrder>>,
}
440
impl FileMetaData {
    /// Creates new file metadata from its component parts.
    pub fn new(
        version: i32,
        num_rows: i64,
        created_by: Option<String>,
        key_value_metadata: Option<Vec<KeyValue>>,
        schema_descr: SchemaDescPtr,
        column_orders: Option<Vec<ColumnOrder>>,
    ) -> Self {
        FileMetaData {
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr,
            column_orders,
        }
    }

    /// Returns the format version of this file.
    pub fn version(&self) -> i32 {
        self.version
    }

    /// Returns the total number of rows in the file.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the writer identification string, if set.
    pub fn created_by(&self) -> Option<&str> {
        self.created_by.as_deref()
    }

    /// Returns the application-defined key/value metadata, if any.
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns the root node of the Parquet schema.
    pub fn schema(&self) -> &SchemaType {
        self.schema_descr.root_schema()
    }

    /// Returns a reference to the schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        &self.schema_descr
    }

    /// Returns a shared pointer to the schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns the column orders for all columns, if set.
    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
        self.column_orders.as_ref()
    }

    /// Returns the column order for the `i`th column, or
    /// [`ColumnOrder::UNDEFINED`] when no column orders are set.
    ///
    /// Panics if column orders are set and `i` is out of bounds.
    pub fn column_order(&self, i: usize) -> ColumnOrder {
        self.column_orders
            .as_ref()
            .map(|data| data[i])
            .unwrap_or(ColumnOrder::UNDEFINED)
    }
}
525
/// Shared, reference-counted pointer to [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
528
/// Metadata for a single row group: its column chunks, row count, sizes, and
/// position within the file.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk, one per schema leaf column.
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group.
    num_rows: i64,
    /// Sort ordering of the rows, if declared by the writer.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total byte size recorded for this row group.
    total_byte_size: i64,
    /// Descriptor for the file schema.
    schema_descr: SchemaDescPtr,
    /// Byte offset of this row group in the file, if recorded.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in the file, if recorded.
    ordinal: Option<i16>,
}
545
546impl RowGroupMetaData {
547 pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
549 RowGroupMetaDataBuilder::new(schema_descr)
550 }
551
552 pub fn num_columns(&self) -> usize {
554 self.columns.len()
555 }
556
557 pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
559 &self.columns[i]
560 }
561
562 pub fn columns(&self) -> &[ColumnChunkMetaData] {
564 &self.columns
565 }
566
567 pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
569 &mut self.columns
570 }
571
572 pub fn num_rows(&self) -> i64 {
574 self.num_rows
575 }
576
577 pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
579 self.sorting_columns.as_ref()
580 }
581
582 pub fn total_byte_size(&self) -> i64 {
584 self.total_byte_size
585 }
586
587 pub fn compressed_size(&self) -> i64 {
589 self.columns.iter().map(|c| c.total_compressed_size).sum()
590 }
591
592 pub fn schema_descr(&self) -> &SchemaDescriptor {
594 self.schema_descr.as_ref()
595 }
596
597 pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
599 self.schema_descr.clone()
600 }
601
602 #[inline(always)]
607 pub fn ordinal(&self) -> Option<i16> {
608 self.ordinal
609 }
610
611 #[inline(always)]
613 pub fn file_offset(&self) -> Option<i64> {
614 self.file_offset
615 }
616
617 #[cfg(feature = "encryption")]
619 fn from_encrypted_thrift(
620 schema_descr: SchemaDescPtr,
621 mut rg: RowGroup,
622 decryptor: Option<&FileDecryptor>,
623 ) -> Result<RowGroupMetaData> {
624 if schema_descr.num_columns() != rg.columns.len() {
625 return Err(general_err!(
626 "Column count mismatch. Schema has {} columns while Row Group has {}",
627 schema_descr.num_columns(),
628 rg.columns.len()
629 ));
630 }
631 let total_byte_size = rg.total_byte_size;
632 let num_rows = rg.num_rows;
633 let mut columns = vec![];
634
635 for (i, (mut c, d)) in rg
636 .columns
637 .drain(0..)
638 .zip(schema_descr.columns())
639 .enumerate()
640 {
641 if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) {
643 let column_decryptor = match c.crypto_metadata.as_ref() {
644 None => {
645 return Err(general_err!(
646 "No crypto_metadata is set for column '{}', which has encrypted metadata",
647 d.path().string()
648 ));
649 }
650 Some(TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => {
651 let column_name = crypto_metadata.path_in_schema.join(".");
652 decryptor.get_column_metadata_decryptor(
653 column_name.as_str(),
654 crypto_metadata.key_metadata.as_deref(),
655 )?
656 }
657 Some(TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => {
658 decryptor.get_footer_decryptor()?
659 }
660 };
661
662 let column_aad = create_module_aad(
663 decryptor.file_aad(),
664 ModuleType::ColumnMetaData,
665 rg.ordinal.unwrap() as usize,
666 i,
667 None,
668 )?;
669
670 let buf = c.encrypted_column_metadata.clone().unwrap();
671 let decrypted_cc_buf = column_decryptor
672 .decrypt(buf.as_slice(), column_aad.as_ref())
673 .map_err(|_| {
674 general_err!(
675 "Unable to decrypt column '{}', perhaps the column key is wrong?",
676 d.path().string()
677 )
678 })?;
679
680 let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice());
681 c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?);
682 }
683 columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
684 }
685
686 let sorting_columns = rg.sorting_columns;
687 Ok(RowGroupMetaData {
688 columns,
689 num_rows,
690 sorting_columns,
691 total_byte_size,
692 schema_descr,
693 file_offset: rg.file_offset,
694 ordinal: rg.ordinal,
695 })
696 }
697
698 pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
700 if schema_descr.num_columns() != rg.columns.len() {
701 return Err(general_err!(
702 "Column count mismatch. Schema has {} columns while Row Group has {}",
703 schema_descr.num_columns(),
704 rg.columns.len()
705 ));
706 }
707 let total_byte_size = rg.total_byte_size;
708 let num_rows = rg.num_rows;
709 let mut columns = vec![];
710
711 for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) {
712 columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
713 }
714
715 let sorting_columns = rg.sorting_columns;
716 Ok(RowGroupMetaData {
717 columns,
718 num_rows,
719 sorting_columns,
720 total_byte_size,
721 schema_descr,
722 file_offset: rg.file_offset,
723 ordinal: rg.ordinal,
724 })
725 }
726
727 pub fn to_thrift(&self) -> RowGroup {
729 RowGroup {
730 columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
731 total_byte_size: self.total_byte_size,
732 num_rows: self.num_rows,
733 sorting_columns: self.sorting_columns().cloned(),
734 file_offset: self.file_offset(),
735 total_compressed_size: Some(self.compressed_size()),
736 ordinal: self.ordinal,
737 }
738 }
739
740 pub fn into_builder(self) -> RowGroupMetaDataBuilder {
742 RowGroupMetaDataBuilder(self)
743 }
744}
745
/// Builder for [`RowGroupMetaData`]; wraps the metadata being assembled.
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
748
impl RowGroupMetaDataBuilder {
    /// Creates a new builder for a row group with the given schema.
    fn new(schema_descr: SchemaDescPtr) -> Self {
        Self(RowGroupMetaData {
            columns: Vec::with_capacity(schema_descr.num_columns()),
            schema_descr,
            file_offset: None,
            num_rows: 0,
            sorting_columns: None,
            total_byte_size: 0,
            ordinal: None,
        })
    }

    /// Sets the number of rows in the row group.
    pub fn set_num_rows(mut self, value: i64) -> Self {
        self.0.num_rows = value;
        self
    }

    /// Sets (or clears) the sorting columns.
    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
        self.0.sorting_columns = value;
        self
    }

    /// Sets the total byte size of the row group.
    pub fn set_total_byte_size(mut self, value: i64) -> Self {
        self.0.total_byte_size = value;
        self
    }

    /// Takes ownership of the column chunk metadata, leaving an empty list.
    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
        std::mem::take(&mut self.0.columns)
    }

    /// Replaces all column chunk metadata.
    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
        self.0.columns = value;
        self
    }

    /// Appends metadata for one column chunk.
    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
        self.0.columns.push(value);
        self
    }

    /// Sets the ordinal position of the row group in the file.
    pub fn set_ordinal(mut self, value: i16) -> Self {
        self.0.ordinal = Some(value);
        self
    }

    /// Sets the byte offset of the row group in the file.
    pub fn set_file_offset(mut self, value: i64) -> Self {
        self.0.file_offset = Some(value);
        self
    }

    /// Builds the row group metadata.
    ///
    /// Errors if the number of columns does not match the schema.
    pub fn build(self) -> Result<RowGroupMetaData> {
        if self.0.schema_descr.num_columns() != self.0.columns.len() {
            return Err(general_err!(
                "Column length mismatch: {} != {}",
                self.0.schema_descr.num_columns(),
                self.0.columns.len()
            ));
        }

        Ok(self.0)
    }
}
827
/// Metadata for a single column chunk within a row group.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor for this leaf column.
    column_descr: ColumnDescPtr,
    /// Encodings used for pages of this chunk.
    encodings: Vec<Encoding>,
    /// Path to the file holding this chunk, if stored externally.
    file_path: Option<String>,
    /// Byte offset recorded in the ColumnChunk Thrift struct.
    file_offset: i64,
    /// Number of values (including nulls) in this chunk.
    num_values: i64,
    /// Compression codec for this chunk.
    compression: Compression,
    /// Total compressed size in bytes.
    total_compressed_size: i64,
    /// Total uncompressed size in bytes.
    total_uncompressed_size: i64,
    /// Offset of the first data page.
    data_page_offset: i64,
    /// Offset of the index page, if present.
    index_page_offset: Option<i64>,
    /// Offset of the dictionary page, if present.
    dictionary_page_offset: Option<i64>,
    /// Min/max/null-count statistics, if present.
    statistics: Option<Statistics>,
    /// Geospatial statistics, if present (boxed to keep this struct small).
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    /// Per-encoding page counts, if present.
    encoding_stats: Option<Vec<PageEncodingStats>>,
    /// Offset and length of the bloom filter, if present.
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    /// Offset and length of the offset index, if present.
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    /// Offset and length of the column index, if present.
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    /// Size statistics: unencoded BYTE_ARRAY bytes and level histograms.
    unencoded_byte_array_data_bytes: Option<i64>,
    repetition_level_histogram: Option<LevelHistogram>,
    definition_level_histogram: Option<LevelHistogram>,
    /// Column encryption metadata, if present.
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<ColumnCryptoMetaData>,
}
857
/// Histogram of repetition or definition levels: `inner[i]` holds the number
/// of values observed at level `i`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    // Counts per level; length is max_level + 1.
    inner: Vec<i64>,
}
870
871impl LevelHistogram {
872 pub fn try_new(max_level: i16) -> Option<Self> {
878 if max_level > 0 {
879 Some(Self {
880 inner: vec![0; max_level as usize + 1],
881 })
882 } else {
883 None
884 }
885 }
886 pub fn values(&self) -> &[i64] {
888 &self.inner
889 }
890
891 pub fn into_inner(self) -> Vec<i64> {
893 self.inner
894 }
895
896 pub fn get(&self, index: usize) -> Option<i64> {
903 self.inner.get(index).copied()
904 }
905
906 pub fn add(&mut self, other: &Self) {
911 assert_eq!(self.len(), other.len());
912 for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
913 *dst += src;
914 }
915 }
916
917 pub fn len(&self) -> usize {
919 self.inner.len()
920 }
921
922 pub fn is_empty(&self) -> bool {
924 self.inner.is_empty()
925 }
926
927 pub fn reset(&mut self) {
929 for value in self.inner.iter_mut() {
930 *value = 0;
931 }
932 }
933
934 pub fn update_from_levels(&mut self, levels: &[i16]) {
940 for &level in levels {
941 self.inner[level as usize] += 1;
942 }
943 }
944}
945
946impl From<Vec<i64>> for LevelHistogram {
947 fn from(inner: Vec<i64>) -> Self {
948 Self { inner }
949 }
950}
951
952impl From<LevelHistogram> for Vec<i64> {
953 fn from(value: LevelHistogram) -> Self {
954 value.into_inner()
955 }
956}
957
impl HeapSize for LevelHistogram {
    /// Heap bytes owned by the histogram's backing vector.
    fn heap_size(&self) -> usize {
        self.inner.heap_size()
    }
}
963
impl ColumnChunkMetaData {
    /// Returns a builder to create column chunk metadata for `column_descr`.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// Returns the path to the external file holding this chunk, if any.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Returns the byte offset recorded in the ColumnChunk Thrift struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Returns the physical type of this column.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Returns the path (dotted name components) of this column in the schema.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Returns a reference to the column descriptor.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Returns a shared pointer to the column descriptor.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// Returns all encodings used by pages of this chunk.
    pub fn encodings(&self) -> &Vec<Encoding> {
        &self.encodings
    }

    /// Returns the number of values (including nulls) in this chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Returns the compression codec for this chunk.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed size of this chunk in bytes.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed size of this chunk in bytes.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset of the first data page.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset of the index page, if present.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset of the dictionary page, if present.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the `(start, length)` byte range of this chunk's data: the
    /// dictionary page offset when present, otherwise the first data page.
    ///
    /// Panics if the recorded start or length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns the min/max/null-count statistics, if present.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns the geospatial statistics, if present.
    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    /// Returns the per-encoding page counts, if present.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    /// Returns the bloom filter offset, if present.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the bloom filter length, if present.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the column index offset, if present.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the column index length, if present.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range of the column index, or `None` if the offset or
    /// length is missing or negative.
    // NOTE(review): `offset + length` is unchecked and could overflow u64 for
    // corrupt metadata — consider `checked_add`; confirm against callers.
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset index offset, if present.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the offset index length, if present.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range of the offset index, or `None` if the offset or
    /// length is missing or negative.
    // NOTE(review): same unchecked `offset + length` as `column_index_range`.
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable-length data before encoding,
    /// if recorded in the size statistics.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram, if recorded.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram, if recorded.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the column encryption metadata, if present.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_ref()
    }

    /// Builds column chunk metadata from its Thrift representation.
    ///
    /// Errors if `cc.meta_data` is absent or any enum/statistics conversion fails.
    pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
        if cc.meta_data.is_none() {
            return Err(general_err!("Expected to have column metadata"));
        }
        let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap();
        let column_type = Type::try_from(col_metadata.type_)?;
        let encodings = col_metadata
            .encodings
            .drain(0..)
            .map(Encoding::try_from)
            .collect::<Result<_>>()?;
        let compression = Compression::try_from(col_metadata.codec)?;
        let file_path = cc.file_path;
        let file_offset = cc.file_offset;
        let num_values = col_metadata.num_values;
        let total_compressed_size = col_metadata.total_compressed_size;
        let total_uncompressed_size = col_metadata.total_uncompressed_size;
        let data_page_offset = col_metadata.data_page_offset;
        let index_page_offset = col_metadata.index_page_offset;
        let dictionary_page_offset = col_metadata.dictionary_page_offset;
        let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?;
        let geo_statistics =
            geo_statistics::from_thrift(col_metadata.geospatial_statistics).map(Box::new);
        let encoding_stats = col_metadata
            .encoding_stats
            .as_ref()
            .map(|vec| {
                vec.iter()
                    .map(page_encoding_stats::try_from_thrift)
                    .collect::<Result<_>>()
            })
            .transpose()?;
        let bloom_filter_offset = col_metadata.bloom_filter_offset;
        let bloom_filter_length = col_metadata.bloom_filter_length;
        let offset_index_offset = cc.offset_index_offset;
        let offset_index_length = cc.offset_index_length;
        let column_index_offset = cc.column_index_offset;
        let column_index_length = cc.column_index_length;
        // Unpack the optional SizeStatistics into its three components.
        let (
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
        ) = if let Some(size_stats) = col_metadata.size_statistics {
            (
                size_stats.unencoded_byte_array_data_bytes,
                size_stats.repetition_level_histogram,
                size_stats.definition_level_histogram,
            )
        } else {
            (None, None, None)
        };

        let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
        let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);

        #[cfg(feature = "encryption")]
        let column_crypto_metadata = if let Some(crypto_metadata) = cc.crypto_metadata {
            Some(column_crypto_metadata::try_from_thrift(&crypto_metadata)?)
        } else {
            None
        };

        let result = ColumnChunkMetaData {
            column_descr,
            encodings,
            file_path,
            file_offset,
            num_values,
            compression,
            total_compressed_size,
            total_uncompressed_size,
            data_page_offset,
            index_page_offset,
            dictionary_page_offset,
            statistics,
            encoding_stats,
            bloom_filter_offset,
            bloom_filter_length,
            offset_index_offset,
            offset_index_length,
            column_index_offset,
            column_index_length,
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
            geo_statistics,
            #[cfg(feature = "encryption")]
            column_crypto_metadata,
        };
        Ok(result)
    }

    /// Converts this metadata to its Thrift [`ColumnChunk`] representation.
    /// The embedded column metadata is always written in plaintext here.
    pub fn to_thrift(&self) -> ColumnChunk {
        let column_metadata = self.to_column_metadata_thrift();

        ColumnChunk {
            file_path: self.file_path().map(|s| s.to_owned()),
            file_offset: self.file_offset,
            meta_data: Some(column_metadata),
            offset_index_offset: self.offset_index_offset,
            offset_index_length: self.offset_index_length,
            column_index_offset: self.column_index_offset,
            column_index_length: self.column_index_length,
            crypto_metadata: self.column_crypto_metadata_thrift(),
            encrypted_column_metadata: None,
        }
    }

    /// Converts this metadata to its Thrift [`ColumnMetaData`] representation.
    pub fn to_column_metadata_thrift(&self) -> ColumnMetaData {
        // Only emit SizeStatistics when at least one component is present.
        let size_statistics = if self.unencoded_byte_array_data_bytes.is_some()
            || self.repetition_level_histogram.is_some()
            || self.definition_level_histogram.is_some()
        {
            let repetition_level_histogram = self
                .repetition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            let definition_level_histogram = self
                .definition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            Some(SizeStatistics {
                unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes,
                repetition_level_histogram,
                definition_level_histogram,
            })
        } else {
            None
        };

        ColumnMetaData {
            type_: self.column_type().into(),
            encodings: self.encodings().iter().map(|&v| v.into()).collect(),
            path_in_schema: self.column_path().as_ref().to_vec(),
            codec: self.compression.into(),
            num_values: self.num_values,
            total_uncompressed_size: self.total_uncompressed_size,
            total_compressed_size: self.total_compressed_size,
            key_value_metadata: None,
            data_page_offset: self.data_page_offset,
            index_page_offset: self.index_page_offset,
            dictionary_page_offset: self.dictionary_page_offset,
            statistics: statistics::to_thrift(self.statistics.as_ref()),
            encoding_stats: self
                .encoding_stats
                .as_ref()
                .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()),
            bloom_filter_offset: self.bloom_filter_offset,
            bloom_filter_length: self.bloom_filter_length,
            size_statistics,
            geospatial_statistics: geo_statistics::to_thrift(
                self.geo_statistics.as_ref().map(|boxed| boxed.as_ref()),
            ),
        }
    }

    /// Converts this metadata into a [`ColumnChunkMetaDataBuilder`].
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }

    /// Converts the column crypto metadata to Thrift, if present.
    #[cfg(feature = "encryption")]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        self.column_crypto_metadata
            .as_ref()
            .map(column_crypto_metadata::to_thrift)
    }

    /// Without the "encryption" feature there is never crypto metadata.
    #[cfg(not(feature = "encryption"))]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        None
    }
}
1335
/// Builder for [`ColumnChunkMetaData`]; wraps the metadata being assembled.
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1355
impl ColumnChunkMetaDataBuilder {
    /// Creates a new builder with zeroed sizes/offsets and no optional fields.
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Vec::new(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
        })
    }

    /// Sets the list of page encodings.
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets the path to an external file holding this chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets the number of values (including nulls).
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets the compression codec.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets the total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets the total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets the offset of the first data page.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets (or clears) the dictionary page offset.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets (or clears) the index page offset.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets the column chunk statistics.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Sets the geospatial statistics.
    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    /// Clears the column chunk statistics.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets the per-encoding page counts.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(value);
        self
    }

    /// Clears the per-encoding page counts.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets (or clears) the bloom filter offset.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets (or clears) the bloom filter length.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets (or clears) the offset index offset.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets (or clears) the offset index length.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets (or clears) the column index offset.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets (or clears) the column index length.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets (or clears) the unencoded byte array data bytes statistic.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets (or clears) the repetition level histogram.
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets (or clears) the definition level histogram.
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    /// Sets (or clears) the column crypto metadata.
    #[cfg(feature = "encryption")]
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value;
        self
    }

    /// Builds the column chunk metadata. Currently infallible.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1540
/// Builder for a Thrift [`ColumnIndex`], accumulating per-page entries.
pub struct ColumnIndexBuilder {
    /// Whether each page contains only nulls.
    null_pages: Vec<bool>,
    /// Encoded min value per page.
    min_values: Vec<Vec<u8>>,
    /// Encoded max value per page.
    max_values: Vec<Vec<u8>>,
    /// Null count per page.
    null_counts: Vec<i64>,
    /// Ordering of min/max values across pages.
    boundary_order: BoundaryOrder,
    /// Concatenated repetition level histograms across pages, if collected.
    repetition_level_histograms: Option<Vec<i64>>,
    /// Concatenated definition level histograms across pages, if collected.
    definition_level_histograms: Option<Vec<i64>>,
    /// Set to `false` via [`Self::to_invalid`] to mark the index unusable;
    /// when false, `append_histograms` becomes a no-op.
    valid: bool,
}
1563
1564impl Default for ColumnIndexBuilder {
1565 fn default() -> Self {
1566 Self::new()
1567 }
1568}
1569
1570impl ColumnIndexBuilder {
1571 pub fn new() -> Self {
1573 ColumnIndexBuilder {
1574 null_pages: Vec::new(),
1575 min_values: Vec::new(),
1576 max_values: Vec::new(),
1577 null_counts: Vec::new(),
1578 boundary_order: BoundaryOrder::UNORDERED,
1579 repetition_level_histograms: None,
1580 definition_level_histograms: None,
1581 valid: true,
1582 }
1583 }
1584
1585 pub fn append(
1587 &mut self,
1588 null_page: bool,
1589 min_value: Vec<u8>,
1590 max_value: Vec<u8>,
1591 null_count: i64,
1592 ) {
1593 self.null_pages.push(null_page);
1594 self.min_values.push(min_value);
1595 self.max_values.push(max_value);
1596 self.null_counts.push(null_count);
1597 }
1598
1599 pub fn append_histograms(
1602 &mut self,
1603 repetition_level_histogram: &Option<LevelHistogram>,
1604 definition_level_histogram: &Option<LevelHistogram>,
1605 ) {
1606 if !self.valid {
1607 return;
1608 }
1609 if let Some(ref rep_lvl_hist) = repetition_level_histogram {
1610 let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1611 hist.reserve(rep_lvl_hist.len());
1612 hist.extend(rep_lvl_hist.values());
1613 }
1614 if let Some(ref def_lvl_hist) = definition_level_histogram {
1615 let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1616 hist.reserve(def_lvl_hist.len());
1617 hist.extend(def_lvl_hist.values());
1618 }
1619 }
1620
1621 pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1623 self.boundary_order = boundary_order;
1624 }
1625
1626 pub fn to_invalid(&mut self) {
1628 self.valid = false;
1629 }
1630
1631 pub fn valid(&self) -> bool {
1633 self.valid
1634 }
1635
1636 pub fn build_to_thrift(self) -> ColumnIndex {
1640 ColumnIndex::new(
1641 self.null_pages,
1642 self.min_values,
1643 self.max_values,
1644 self.boundary_order,
1645 self.null_counts,
1646 self.repetition_level_histograms,
1647 self.definition_level_histograms,
1648 )
1649 }
1650}
1651
1652impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1653 fn from(value: ColumnChunkMetaData) -> Self {
1654 ColumnChunkMetaDataBuilder(value)
1655 }
1656}
1657
/// Builder for a Thrift [`OffsetIndex`]: per-page locations plus optional
/// unencoded byte-array sizes.
pub struct OffsetIndexBuilder {
    // File offset of each page.
    offset_array: Vec<i64>,
    // Compressed size (in bytes) of each page.
    compressed_page_size_array: Vec<i32>,
    // Index of the first row contained in each page.
    first_row_index_array: Vec<i64>,
    // Per-page `unencoded_byte_array_data_bytes`, if recorded.
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    // Running row total; becomes the first-row index of the next page.
    current_first_row_index: i64,
}
1668
1669impl Default for OffsetIndexBuilder {
1670 fn default() -> Self {
1671 Self::new()
1672 }
1673}
1674
1675impl OffsetIndexBuilder {
1676 pub fn new() -> Self {
1678 OffsetIndexBuilder {
1679 offset_array: Vec::new(),
1680 compressed_page_size_array: Vec::new(),
1681 first_row_index_array: Vec::new(),
1682 unencoded_byte_array_data_bytes_array: None,
1683 current_first_row_index: 0,
1684 }
1685 }
1686
1687 pub fn append_row_count(&mut self, row_count: i64) {
1689 let current_page_row_index = self.current_first_row_index;
1690 self.first_row_index_array.push(current_page_row_index);
1691 self.current_first_row_index += row_count;
1692 }
1693
1694 pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1696 self.offset_array.push(offset);
1697 self.compressed_page_size_array.push(compressed_page_size);
1698 }
1699
1700 pub fn append_unencoded_byte_array_data_bytes(
1702 &mut self,
1703 unencoded_byte_array_data_bytes: Option<i64>,
1704 ) {
1705 if let Some(val) = unencoded_byte_array_data_bytes {
1706 self.unencoded_byte_array_data_bytes_array
1707 .get_or_insert(Vec::new())
1708 .push(val);
1709 }
1710 }
1711
1712 pub fn build_to_thrift(self) -> OffsetIndex {
1714 let locations = self
1715 .offset_array
1716 .iter()
1717 .zip(self.compressed_page_size_array.iter())
1718 .zip(self.first_row_index_array.iter())
1719 .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index))
1720 .collect::<Vec<_>>();
1721 OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array)
1722 }
1723}
1724
#[cfg(test)]
mod tests {
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::page_index::index::NativeIndex;

    /// Round-trips `RowGroupMetaData` through its Thrift form and back,
    /// verifying the re-serialized result matches the original.
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        let row_group_exp = row_group_meta.to_thrift();
        let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(row_group_res, row_group_exp);
    }

    /// Building row group metadata without any column chunks must fail:
    /// the test schema declares 2 columns but none are supplied.
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }

    /// Decoding Thrift row group metadata against a schema with a
    /// different column count (3 vs 2) must be rejected with an error.
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Build valid 2-column metadata, then decode it against the
        // 3-column schema to simulate corruption.
        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();

        let err =
            RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
                .unwrap_err()
                .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    /// Round-trips a fully-populated `ColumnChunkMetaData` (all optional
    /// fields set) through Thrift and back.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        let col_chunk_res =
            ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    /// Round-trips a minimal `ColumnChunkMetaData` (builder defaults only)
    /// through Thrift and back.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .build()
            .unwrap();

        let col_chunk_exp = col_metadata.to_thrift();
        let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(col_chunk_res, col_chunk_exp);
    }

    /// `RowGroupMetaData::compressed_size` should equal the sum of the
    /// column chunks' compressed sizes (2 columns x 500 bytes = 1000).
    #[test]
    fn test_compressed_size() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for column_descr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(column_descr.clone())
                .set_total_compressed_size(500)
                .set_total_uncompressed_size(700)
                .build()
                .unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();

        let compressed_size_res: i64 = row_group_meta.compressed_size();
        let compressed_size_exp: i64 = 1000;

        assert_eq!(compressed_size_res, compressed_size_exp);
    }

    /// Checks `ParquetMetaData::memory_size` against hard-coded expected
    /// values, and that adding page indexes increases the reported size.
    ///
    /// NOTE: the expected byte counts are tightly coupled to the in-memory
    /// layout of the metadata structs (and to the `encryption` feature);
    /// they must be updated whenever fields are added or resized.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Same columns, but with populated min/max statistics.
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2344;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2680;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        let mut column_index = ColumnIndexBuilder::new();
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build_to_thrift();
        let native_index = NativeIndex::<bool>::try_new(column_index).unwrap();

        // Two pages' worth of offset index entries.
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build_to_thrift();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![
                OffsetIndexMetaData::try_new(offset_index).unwrap()
            ]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 2848;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3184;

        // The page indexes must add to the memory footprint.
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    /// Builds the shared two-column (`a`, `b`; both INT32) test schema.
    fn get_test_schema_descr() -> SchemaDescPtr {
        let schema = SchemaType::group_type_builder("schema")
            .with_fields(vec![
                Arc::new(
                    SchemaType::primitive_type_builder("a", Type::INT32)
                        .build()
                        .unwrap(),
                ),
                Arc::new(
                    SchemaType::primitive_type_builder("b", Type::INT32)
                        .build()
                        .unwrap(),
                ),
            ])
            .build()
            .unwrap();

        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
    }
}