1mod memory;
94mod push_decoder;
95pub(crate) mod reader;
96mod writer;
97
98use crate::basic::{ColumnOrder, Compression, Encoding, Type};
99#[cfg(feature = "encryption")]
100use crate::encryption::{
101 decrypt::FileDecryptor,
102 modules::{create_module_aad, ModuleType},
103};
104use crate::errors::{ParquetError, Result};
105#[cfg(feature = "encryption")]
106use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData};
107pub(crate) use crate::file::metadata::memory::HeapSize;
108use crate::file::page_encoding_stats::{self, PageEncodingStats};
109use crate::file::page_index::index::Index;
110use crate::file::page_index::offset_index::OffsetIndexMetaData;
111use crate::file::statistics::{self, Statistics};
112use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData;
113use crate::format::{
114 BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup,
115 SizeStatistics, SortingColumn,
116};
117use crate::schema::types::{
118 ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
119 Type as SchemaType,
120};
121#[cfg(feature = "encryption")]
122use crate::thrift::{TCompactSliceInputProtocol, TSerializable};
123pub use push_decoder::ParquetMetaDataPushDecoder;
124pub use reader::{FooterTail, PageIndexPolicy, ParquetMetaDataReader};
125use std::ops::Range;
126use std::sync::Arc;
127pub use writer::ParquetMetaDataWriter;
128pub(crate) use writer::ThriftMetadataWriter;
129
/// Page-level column index: one `Vec<Index>` per row group, one [`Index`] per
/// column within it (see [`ParquetMetaData::column_index`]).
pub type ParquetColumnIndex = Vec<Vec<Index>>;
146
/// Page-level offset index: one `Vec<OffsetIndexMetaData>` per row group, one
/// entry per column within it (see [`ParquetMetaData::offset_index`]).
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
158
/// Global Parquet metadata: file-level metadata, metadata for each row group,
/// and (optionally) the page-level column and offset indexes.
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File-level metadata (version, schema, key/value pairs, ...)
    file_metadata: FileMetaData,
    /// Metadata for each row group in the file
    row_groups: Vec<RowGroupMetaData>,
    /// Page-level column index, if loaded
    column_index: Option<ParquetColumnIndex>,
    /// Page-level offset index, if loaded
    offset_index: Option<ParquetOffsetIndex>,
    /// Decryptor used to read encrypted files, if any
    #[cfg(feature = "encryption")]
    file_decryptor: Option<FileDecryptor>,
}
190
impl ParquetMetaData {
    /// Creates Parquet metadata from file metadata and row group metadata.
    /// The page indexes start out unset.
    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
        ParquetMetaData {
            file_metadata,
            row_groups,
            #[cfg(feature = "encryption")]
            file_decryptor: None,
            column_index: None,
            offset_index: None,
        }
    }

    /// Sets the decryptor used to read encrypted column data.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
        self.file_decryptor = file_decryptor;
    }

    /// Converts self into a [`ParquetMetaDataBuilder`] for modification.
    pub fn into_builder(self) -> ParquetMetaDataBuilder {
        self.into()
    }

    /// Returns a reference to the file-level metadata.
    pub fn file_metadata(&self) -> &FileMetaData {
        &self.file_metadata
    }

    /// Returns the file decryptor, if set.
    #[cfg(feature = "encryption")]
    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
        self.file_decryptor.as_ref()
    }

    /// Returns the number of row groups in this file.
    pub fn num_row_groups(&self) -> usize {
        self.row_groups.len()
    }

    /// Returns metadata for the `i`th row group.
    ///
    /// # Panics
    /// Panics if `i` is out of bounds.
    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
        &self.row_groups[i]
    }

    /// Returns metadata for all row groups.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.row_groups
    }

    /// Returns the page-level column index, if it has been loaded.
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.column_index.as_ref()
    }

    /// Returns the page-level offset index, if it has been loaded.
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.offset_index.as_ref()
    }

    /// Estimates the memory used by this structure: the struct itself plus the
    /// heap allocations of its fields. Note: the file decryptor (`encryption`
    /// feature) is not included in the estimate.
    pub fn memory_size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.file_metadata.heap_size()
            + self.row_groups.heap_size()
            + self.column_index.heap_size()
            + self.offset_index.heap_size()
    }

    /// Overrides the column index (e.g. after it is read from the file).
    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
        self.column_index = index;
    }

    /// Overrides the offset index (e.g. after it is read from the file).
    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
        self.offset_index = index;
    }
}
296
/// Builder for [`ParquetMetaData`]; see also [`ParquetMetaData::into_builder`].
pub struct ParquetMetaDataBuilder(ParquetMetaData);
335
336impl ParquetMetaDataBuilder {
337 pub fn new(file_meta_data: FileMetaData) -> Self {
339 Self(ParquetMetaData::new(file_meta_data, vec![]))
340 }
341
342 pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
344 Self(metadata)
345 }
346
347 pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
349 self.0.row_groups.push(row_group);
350 self
351 }
352
353 pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
355 self.0.row_groups = row_groups;
356 self
357 }
358
359 pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
365 std::mem::take(&mut self.0.row_groups)
366 }
367
368 pub fn row_groups(&self) -> &[RowGroupMetaData] {
370 &self.0.row_groups
371 }
372
373 pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
375 self.0.column_index = column_index;
376 self
377 }
378
379 pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
381 std::mem::take(&mut self.0.column_index)
382 }
383
384 pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
386 self.0.column_index.as_ref()
387 }
388
389 pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
391 self.0.offset_index = offset_index;
392 self
393 }
394
395 pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
397 std::mem::take(&mut self.0.offset_index)
398 }
399
400 pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
402 self.0.offset_index.as_ref()
403 }
404
405 pub fn build(self) -> ParquetMetaData {
407 let Self(metadata) = self;
408 metadata
409 }
410}
411
impl From<ParquetMetaData> for ParquetMetaDataBuilder {
    /// Wraps existing metadata in a builder without modifying it.
    fn from(meta_data: ParquetMetaData) -> Self {
        Self(meta_data)
    }
}
417
/// A key/value metadata pair; re-export of the generated [`crate::format::KeyValue`].
pub type KeyValue = crate::format::KeyValue;
420
/// Reference-counted pointer to [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
423
/// File-level Parquet metadata: version, row count, writer string,
/// key/value pairs, schema, and column orders.
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Format version of the file
    version: i32,
    /// Total number of rows across all row groups
    num_rows: i64,
    /// Free-form string identifying the writer, if present
    created_by: Option<String>,
    /// Optional application-defined key/value pairs
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Schema descriptor, shared via `Arc`
    schema_descr: SchemaDescPtr,
    /// Sort order for values in each column, if written
    column_orders: Option<Vec<ColumnOrder>>,
}
436
impl FileMetaData {
    /// Creates new file metadata from its parts.
    pub fn new(
        version: i32,
        num_rows: i64,
        created_by: Option<String>,
        key_value_metadata: Option<Vec<KeyValue>>,
        schema_descr: SchemaDescPtr,
        column_orders: Option<Vec<ColumnOrder>>,
    ) -> Self {
        FileMetaData {
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr,
            column_orders,
        }
    }

    /// Returns the format version of this file.
    pub fn version(&self) -> i32 {
        self.version
    }

    /// Returns the total number of rows in the file.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the string describing the application that wrote the file, if set.
    pub fn created_by(&self) -> Option<&str> {
        self.created_by.as_deref()
    }

    /// Returns the application-defined key/value metadata, if any.
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns the root schema type for this file.
    pub fn schema(&self) -> &SchemaType {
        self.schema_descr.root_schema()
    }

    /// Returns a reference to the schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        &self.schema_descr
    }

    /// Returns a reference-counted clone of the schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns the column orders for all columns, if written.
    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
        self.column_orders.as_ref()
    }

    /// Returns the order for the `i`th column, or
    /// [`ColumnOrder::UNDEFINED`] when no column orders were written.
    ///
    /// # Panics
    /// Panics if column orders are present and `i` is out of bounds.
    pub fn column_order(&self, i: usize) -> ColumnOrder {
        self.column_orders
            .as_ref()
            .map(|data| data[i])
            .unwrap_or(ColumnOrder::UNDEFINED)
    }
}
521
/// Reference-counted pointer to [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
524
/// Metadata for one row group: its column chunks, row count, size, and
/// position within the file.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk, one per schema leaf column
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group
    num_rows: i64,
    /// Sort ordering of the rows, if declared by the writer
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total byte size reported for this row group's data
    total_byte_size: i64,
    /// Schema descriptor, shared via `Arc`
    schema_descr: SchemaDescPtr,
    /// Byte offset of this row group in the file, if set
    file_offset: Option<i64>,
    /// 0-based position of this row group within the file, if set
    ordinal: Option<i16>,
}
541
542impl RowGroupMetaData {
543 pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
545 RowGroupMetaDataBuilder::new(schema_descr)
546 }
547
548 pub fn num_columns(&self) -> usize {
550 self.columns.len()
551 }
552
553 pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
555 &self.columns[i]
556 }
557
558 pub fn columns(&self) -> &[ColumnChunkMetaData] {
560 &self.columns
561 }
562
563 pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
565 &mut self.columns
566 }
567
568 pub fn num_rows(&self) -> i64 {
570 self.num_rows
571 }
572
573 pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
575 self.sorting_columns.as_ref()
576 }
577
578 pub fn total_byte_size(&self) -> i64 {
580 self.total_byte_size
581 }
582
583 pub fn compressed_size(&self) -> i64 {
585 self.columns.iter().map(|c| c.total_compressed_size).sum()
586 }
587
588 pub fn schema_descr(&self) -> &SchemaDescriptor {
590 self.schema_descr.as_ref()
591 }
592
593 pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
595 self.schema_descr.clone()
596 }
597
598 #[inline(always)]
603 pub fn ordinal(&self) -> Option<i16> {
604 self.ordinal
605 }
606
607 #[inline(always)]
609 pub fn file_offset(&self) -> Option<i64> {
610 self.file_offset
611 }
612
613 #[cfg(feature = "encryption")]
615 fn from_encrypted_thrift(
616 schema_descr: SchemaDescPtr,
617 mut rg: RowGroup,
618 decryptor: Option<&FileDecryptor>,
619 ) -> Result<RowGroupMetaData> {
620 if schema_descr.num_columns() != rg.columns.len() {
621 return Err(general_err!(
622 "Column count mismatch. Schema has {} columns while Row Group has {}",
623 schema_descr.num_columns(),
624 rg.columns.len()
625 ));
626 }
627 let total_byte_size = rg.total_byte_size;
628 let num_rows = rg.num_rows;
629 let mut columns = vec![];
630
631 for (i, (mut c, d)) in rg
632 .columns
633 .drain(0..)
634 .zip(schema_descr.columns())
635 .enumerate()
636 {
637 if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) {
639 let column_decryptor = match c.crypto_metadata.as_ref() {
640 None => {
641 return Err(general_err!(
642 "No crypto_metadata is set for column '{}', which has encrypted metadata",
643 d.path().string()
644 ));
645 }
646 Some(TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => {
647 let column_name = crypto_metadata.path_in_schema.join(".");
648 decryptor.get_column_metadata_decryptor(
649 column_name.as_str(),
650 crypto_metadata.key_metadata.as_deref(),
651 )?
652 }
653 Some(TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => {
654 decryptor.get_footer_decryptor()?
655 }
656 };
657
658 let column_aad = create_module_aad(
659 decryptor.file_aad(),
660 ModuleType::ColumnMetaData,
661 rg.ordinal.unwrap() as usize,
662 i,
663 None,
664 )?;
665
666 let buf = c.encrypted_column_metadata.clone().unwrap();
667 let decrypted_cc_buf = column_decryptor
668 .decrypt(buf.as_slice(), column_aad.as_ref())
669 .map_err(|_| {
670 general_err!(
671 "Unable to decrypt column '{}', perhaps the column key is wrong?",
672 d.path().string()
673 )
674 })?;
675
676 let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice());
677 c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?);
678 }
679 columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
680 }
681
682 let sorting_columns = rg.sorting_columns;
683 Ok(RowGroupMetaData {
684 columns,
685 num_rows,
686 sorting_columns,
687 total_byte_size,
688 schema_descr,
689 file_offset: rg.file_offset,
690 ordinal: rg.ordinal,
691 })
692 }
693
694 pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
696 if schema_descr.num_columns() != rg.columns.len() {
697 return Err(general_err!(
698 "Column count mismatch. Schema has {} columns while Row Group has {}",
699 schema_descr.num_columns(),
700 rg.columns.len()
701 ));
702 }
703 let total_byte_size = rg.total_byte_size;
704 let num_rows = rg.num_rows;
705 let mut columns = vec![];
706
707 for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) {
708 columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
709 }
710
711 let sorting_columns = rg.sorting_columns;
712 Ok(RowGroupMetaData {
713 columns,
714 num_rows,
715 sorting_columns,
716 total_byte_size,
717 schema_descr,
718 file_offset: rg.file_offset,
719 ordinal: rg.ordinal,
720 })
721 }
722
723 pub fn to_thrift(&self) -> RowGroup {
725 RowGroup {
726 columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
727 total_byte_size: self.total_byte_size,
728 num_rows: self.num_rows,
729 sorting_columns: self.sorting_columns().cloned(),
730 file_offset: self.file_offset(),
731 total_compressed_size: Some(self.compressed_size()),
732 ordinal: self.ordinal,
733 }
734 }
735
736 pub fn into_builder(self) -> RowGroupMetaDataBuilder {
738 RowGroupMetaDataBuilder(self)
739 }
740}
741
/// Builder for [`RowGroupMetaData`].
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
744
745impl RowGroupMetaDataBuilder {
746 fn new(schema_descr: SchemaDescPtr) -> Self {
748 Self(RowGroupMetaData {
749 columns: Vec::with_capacity(schema_descr.num_columns()),
750 schema_descr,
751 file_offset: None,
752 num_rows: 0,
753 sorting_columns: None,
754 total_byte_size: 0,
755 ordinal: None,
756 })
757 }
758
759 pub fn set_num_rows(mut self, value: i64) -> Self {
761 self.0.num_rows = value;
762 self
763 }
764
765 pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
767 self.0.sorting_columns = value;
768 self
769 }
770
771 pub fn set_total_byte_size(mut self, value: i64) -> Self {
773 self.0.total_byte_size = value;
774 self
775 }
776
777 pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
783 std::mem::take(&mut self.0.columns)
784 }
785
786 pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
788 self.0.columns = value;
789 self
790 }
791
792 pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
794 self.0.columns.push(value);
795 self
796 }
797
798 pub fn set_ordinal(mut self, value: i16) -> Self {
800 self.0.ordinal = Some(value);
801 self
802 }
803
804 pub fn set_file_offset(mut self, value: i64) -> Self {
806 self.0.file_offset = Some(value);
807 self
808 }
809
810 pub fn build(self) -> Result<RowGroupMetaData> {
812 if self.0.schema_descr.num_columns() != self.0.columns.len() {
813 return Err(general_err!(
814 "Column length mismatch: {} != {}",
815 self.0.schema_descr.num_columns(),
816 self.0.columns.len()
817 ));
818 }
819
820 Ok(self.0)
821 }
822}
823
/// Metadata for one column chunk within a row group.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor of this column in the schema
    column_descr: ColumnDescPtr,
    /// All encodings used in this chunk
    encodings: Vec<Encoding>,
    /// External file holding the chunk data, if any
    file_path: Option<String>,
    /// Byte offset carried from the thrift `ColumnChunk.file_offset` field
    file_offset: i64,
    /// Total number of values in the chunk
    num_values: i64,
    /// Compression codec for the chunk
    compression: Compression,
    /// Compressed size, in bytes
    total_compressed_size: i64,
    /// Uncompressed size, in bytes
    total_uncompressed_size: i64,
    /// Offset of the first data page
    data_page_offset: i64,
    /// Offset of the index page, if present
    index_page_offset: Option<i64>,
    /// Offset of the dictionary page, if present
    dictionary_page_offset: Option<i64>,
    /// Statistics for this chunk, if written
    statistics: Option<Statistics>,
    /// Per-encoding page statistics, if written
    encoding_stats: Option<Vec<PageEncodingStats>>,
    /// Bloom filter offset/length, if present
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    /// Offset index offset/length, if present
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    /// Column index offset/length, if present
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    /// Size statistics: unencoded byte-array data size, if written
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Histogram of repetition levels, if written
    repetition_level_histogram: Option<LevelHistogram>,
    /// Histogram of definition levels, if written
    definition_level_histogram: Option<LevelHistogram>,
    /// Encryption metadata for this column, if encrypted
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<ColumnCryptoMetaData>,
}
852
/// Histogram of repetition or definition levels: `inner[i]` holds the count
/// of values with level `i`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a zeroed histogram with `max_level + 1` buckets, or `None`
    /// when `max_level` is not positive (no histogram is needed).
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level > 0 {
            Some(Self {
                inner: vec![0; max_level as usize + 1],
            })
        } else {
            None
        }
    }

    /// Returns the underlying counts as a slice.
    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    /// Consumes the histogram, returning the raw counts.
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the count for level `index`, or `None` if out of range.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds the counts of `other` to `self`, element-wise.
    ///
    /// # Panics
    /// Panics if the two histograms have different lengths.
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    /// Returns the number of buckets (maximum level + 1).
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns `true` if the histogram has no buckets.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Resets all counts to zero, keeping the allocation.
    pub fn reset(&mut self) {
        // `slice::fill` replaces the previous manual per-element loop.
        self.inner.fill(0);
    }

    /// Tallies each level in `levels` into the histogram.
    ///
    /// # Panics
    /// Panics if any level is outside the histogram's range.
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            self.inner[level as usize] += 1;
        }
    }
}
940
impl From<Vec<i64>> for LevelHistogram {
    /// Wraps raw level counts in a [`LevelHistogram`].
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}
946
impl From<LevelHistogram> for Vec<i64> {
    /// Unwraps the histogram into its raw counts.
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}
952
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        // Only the backing `Vec` allocates on the heap.
        self.inner.heap_size()
    }
}
958
impl ColumnChunkMetaData {
    /// Returns a new [`ColumnChunkMetaDataBuilder`] for the given column.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// Returns the path of the file containing this chunk, if stored externally.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Returns the byte offset recorded for this chunk in the thrift metadata.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Returns the physical type of this column.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Returns the path of this column in the schema.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Returns the descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Returns a reference-counted clone of the column descriptor.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// Returns all encodings used in this column chunk.
    pub fn encodings(&self) -> &Vec<Encoding> {
        &self.encodings
    }

    /// Returns the total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Returns the compression codec for this column chunk.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed size, in bytes.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed size, in bytes.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset of the first data page.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset of the index page, if present.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset of the dictionary page, if present.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns `(start, length)` of this chunk's data: the start is the
    /// dictionary page offset when present, otherwise the first data page.
    ///
    /// # Panics
    /// Panics if the start offset or compressed length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics for this column chunk, if written.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns per-encoding page statistics, if written.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    /// Returns the offset of the bloom filter, if present.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length of the bloom filter, if present.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset of the column index, if present.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length of the column index, if present.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range of the column index; `None` when it is absent
    /// or its offset/length do not fit in `u64`.
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset of the offset index, if present.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length of the offset index, if present.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range of the offset index; `None` when it is absent
    /// or its offset/length do not fit in `u64`.
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the unencoded byte-array data size, if written.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram, if written.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram, if written.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the encryption metadata for this column, if encrypted.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_ref()
    }

    /// Builds [`ColumnChunkMetaData`] from a thrift [`ColumnChunk`].
    ///
    /// Returns an error when `cc.meta_data` is missing or any enum value
    /// cannot be converted.
    pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
        if cc.meta_data.is_none() {
            return Err(general_err!("Expected to have column metadata"));
        }
        let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap();
        let column_type = Type::try_from(col_metadata.type_)?;
        let encodings = col_metadata
            .encodings
            .drain(0..)
            .map(Encoding::try_from)
            .collect::<Result<_>>()?;
        let compression = Compression::try_from(col_metadata.codec)?;
        let file_path = cc.file_path;
        let file_offset = cc.file_offset;
        let num_values = col_metadata.num_values;
        let total_compressed_size = col_metadata.total_compressed_size;
        let total_uncompressed_size = col_metadata.total_uncompressed_size;
        let data_page_offset = col_metadata.data_page_offset;
        let index_page_offset = col_metadata.index_page_offset;
        let dictionary_page_offset = col_metadata.dictionary_page_offset;
        let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?;
        let encoding_stats = col_metadata
            .encoding_stats
            .as_ref()
            .map(|vec| {
                vec.iter()
                    .map(page_encoding_stats::try_from_thrift)
                    .collect::<Result<_>>()
            })
            .transpose()?;
        let bloom_filter_offset = col_metadata.bloom_filter_offset;
        let bloom_filter_length = col_metadata.bloom_filter_length;
        let offset_index_offset = cc.offset_index_offset;
        let offset_index_length = cc.offset_index_length;
        let column_index_offset = cc.column_index_offset;
        let column_index_length = cc.column_index_length;
        // Size statistics are optional; absent means all three parts unset.
        let (
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
        ) = if let Some(size_stats) = col_metadata.size_statistics {
            (
                size_stats.unencoded_byte_array_data_bytes,
                size_stats.repetition_level_histogram,
                size_stats.definition_level_histogram,
            )
        } else {
            (None, None, None)
        };

        let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
        let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);

        #[cfg(feature = "encryption")]
        let column_crypto_metadata = if let Some(crypto_metadata) = cc.crypto_metadata {
            Some(column_crypto_metadata::try_from_thrift(&crypto_metadata)?)
        } else {
            None
        };

        let result = ColumnChunkMetaData {
            column_descr,
            encodings,
            file_path,
            file_offset,
            num_values,
            compression,
            total_compressed_size,
            total_uncompressed_size,
            data_page_offset,
            index_page_offset,
            dictionary_page_offset,
            statistics,
            encoding_stats,
            bloom_filter_offset,
            bloom_filter_length,
            offset_index_offset,
            offset_index_length,
            column_index_offset,
            column_index_length,
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
            #[cfg(feature = "encryption")]
            column_crypto_metadata,
        };
        Ok(result)
    }

    /// Converts this metadata to a thrift [`ColumnChunk`]. The
    /// `encrypted_column_metadata` field is always written as `None`.
    pub fn to_thrift(&self) -> ColumnChunk {
        let column_metadata = self.to_column_metadata_thrift();

        ColumnChunk {
            file_path: self.file_path().map(|s| s.to_owned()),
            file_offset: self.file_offset,
            meta_data: Some(column_metadata),
            offset_index_offset: self.offset_index_offset,
            offset_index_length: self.offset_index_length,
            column_index_offset: self.column_index_offset,
            column_index_length: self.column_index_length,
            crypto_metadata: self.column_crypto_metadata_thrift(),
            encrypted_column_metadata: None,
        }
    }

    /// Converts this metadata to a thrift [`ColumnMetaData`].
    pub fn to_column_metadata_thrift(&self) -> ColumnMetaData {
        // Emit SizeStatistics only when at least one of its parts is set.
        let size_statistics = if self.unencoded_byte_array_data_bytes.is_some()
            || self.repetition_level_histogram.is_some()
            || self.definition_level_histogram.is_some()
        {
            let repetition_level_histogram = self
                .repetition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            let definition_level_histogram = self
                .definition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            Some(SizeStatistics {
                unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes,
                repetition_level_histogram,
                definition_level_histogram,
            })
        } else {
            None
        };

        ColumnMetaData {
            type_: self.column_type().into(),
            encodings: self.encodings().iter().map(|&v| v.into()).collect(),
            path_in_schema: self.column_path().as_ref().to_vec(),
            codec: self.compression.into(),
            num_values: self.num_values,
            total_uncompressed_size: self.total_uncompressed_size,
            total_compressed_size: self.total_compressed_size,
            key_value_metadata: None,
            data_page_offset: self.data_page_offset,
            index_page_offset: self.index_page_offset,
            dictionary_page_offset: self.dictionary_page_offset,
            statistics: statistics::to_thrift(self.statistics.as_ref()),
            encoding_stats: self
                .encoding_stats
                .as_ref()
                .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()),
            bloom_filter_offset: self.bloom_filter_offset,
            bloom_filter_length: self.bloom_filter_length,
            size_statistics,
            geospatial_statistics: None,
        }
    }

    /// Converts this metadata into a [`ColumnChunkMetaDataBuilder`].
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }

    /// Converts the crypto metadata to its thrift form (encryption enabled).
    #[cfg(feature = "encryption")]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        self.column_crypto_metadata
            .as_ref()
            .map(column_crypto_metadata::to_thrift)
    }

    /// Without the `encryption` feature there is never crypto metadata.
    #[cfg(not(feature = "encryption"))]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        None
    }
}
1319
/// Builder for [`ColumnChunkMetaData`].
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1339
1340impl ColumnChunkMetaDataBuilder {
1341 fn new(column_descr: ColumnDescPtr) -> Self {
1345 Self(ColumnChunkMetaData {
1346 column_descr,
1347 encodings: Vec::new(),
1348 file_path: None,
1349 file_offset: 0,
1350 num_values: 0,
1351 compression: Compression::UNCOMPRESSED,
1352 total_compressed_size: 0,
1353 total_uncompressed_size: 0,
1354 data_page_offset: 0,
1355 index_page_offset: None,
1356 dictionary_page_offset: None,
1357 statistics: None,
1358 encoding_stats: None,
1359 bloom_filter_offset: None,
1360 bloom_filter_length: None,
1361 offset_index_offset: None,
1362 offset_index_length: None,
1363 column_index_offset: None,
1364 column_index_length: None,
1365 unencoded_byte_array_data_bytes: None,
1366 repetition_level_histogram: None,
1367 definition_level_histogram: None,
1368 #[cfg(feature = "encryption")]
1369 column_crypto_metadata: None,
1370 })
1371 }
1372
1373 pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
1375 self.0.encodings = encodings;
1376 self
1377 }
1378
1379 pub fn set_file_path(mut self, value: String) -> Self {
1381 self.0.file_path = Some(value);
1382 self
1383 }
1384
1385 pub fn set_num_values(mut self, value: i64) -> Self {
1387 self.0.num_values = value;
1388 self
1389 }
1390
1391 pub fn set_compression(mut self, value: Compression) -> Self {
1393 self.0.compression = value;
1394 self
1395 }
1396
1397 pub fn set_total_compressed_size(mut self, value: i64) -> Self {
1399 self.0.total_compressed_size = value;
1400 self
1401 }
1402
1403 pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
1405 self.0.total_uncompressed_size = value;
1406 self
1407 }
1408
1409 pub fn set_data_page_offset(mut self, value: i64) -> Self {
1411 self.0.data_page_offset = value;
1412 self
1413 }
1414
1415 pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
1417 self.0.dictionary_page_offset = value;
1418 self
1419 }
1420
1421 pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
1423 self.0.index_page_offset = value;
1424 self
1425 }
1426
1427 pub fn set_statistics(mut self, value: Statistics) -> Self {
1429 self.0.statistics = Some(value);
1430 self
1431 }
1432
1433 pub fn clear_statistics(mut self) -> Self {
1435 self.0.statistics = None;
1436 self
1437 }
1438
1439 pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
1441 self.0.encoding_stats = Some(value);
1442 self
1443 }
1444
1445 pub fn clear_page_encoding_stats(mut self) -> Self {
1447 self.0.encoding_stats = None;
1448 self
1449 }
1450
1451 pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
1453 self.0.bloom_filter_offset = value;
1454 self
1455 }
1456
1457 pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
1459 self.0.bloom_filter_length = value;
1460 self
1461 }
1462
1463 pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
1465 self.0.offset_index_offset = value;
1466 self
1467 }
1468
1469 pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
1471 self.0.offset_index_length = value;
1472 self
1473 }
1474
1475 pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
1477 self.0.column_index_offset = value;
1478 self
1479 }
1480
1481 pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
1483 self.0.column_index_length = value;
1484 self
1485 }
1486
1487 pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
1489 self.0.unencoded_byte_array_data_bytes = value;
1490 self
1491 }
1492
1493 pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1495 self.0.repetition_level_histogram = value;
1496 self
1497 }
1498
1499 pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1501 self.0.definition_level_histogram = value;
1502 self
1503 }
1504
1505 #[cfg(feature = "encryption")]
1506 pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
1508 self.0.column_crypto_metadata = value;
1509 self
1510 }
1511
1512 pub fn build(self) -> Result<ColumnChunkMetaData> {
1514 Ok(self.0)
1515 }
1516}
1517
/// Builder for the thrift [`ColumnIndex`] (per-page min/max/null-count index).
pub struct ColumnIndexBuilder {
    /// Whether each page contains only nulls
    null_pages: Vec<bool>,
    /// Encoded minimum value per page
    min_values: Vec<Vec<u8>>,
    /// Encoded maximum value per page
    max_values: Vec<Vec<u8>>,
    /// Null count per page
    null_counts: Vec<i64>,
    /// Ordering of min/max values across pages
    boundary_order: BoundaryOrder,
    /// Concatenated per-page repetition level histograms, if collected
    repetition_level_histograms: Option<Vec<i64>>,
    /// Concatenated per-page definition level histograms, if collected
    definition_level_histograms: Option<Vec<i64>>,
    /// When `false`, no column index should be built/written
    valid: bool,
}
1540
impl Default for ColumnIndexBuilder {
    fn default() -> Self {
        // Same as `ColumnIndexBuilder::new`: empty and valid.
        Self::new()
    }
}
1546
1547impl ColumnIndexBuilder {
1548 pub fn new() -> Self {
1550 ColumnIndexBuilder {
1551 null_pages: Vec::new(),
1552 min_values: Vec::new(),
1553 max_values: Vec::new(),
1554 null_counts: Vec::new(),
1555 boundary_order: BoundaryOrder::UNORDERED,
1556 repetition_level_histograms: None,
1557 definition_level_histograms: None,
1558 valid: true,
1559 }
1560 }
1561
1562 pub fn append(
1564 &mut self,
1565 null_page: bool,
1566 min_value: Vec<u8>,
1567 max_value: Vec<u8>,
1568 null_count: i64,
1569 ) {
1570 self.null_pages.push(null_page);
1571 self.min_values.push(min_value);
1572 self.max_values.push(max_value);
1573 self.null_counts.push(null_count);
1574 }
1575
1576 pub fn append_histograms(
1579 &mut self,
1580 repetition_level_histogram: &Option<LevelHistogram>,
1581 definition_level_histogram: &Option<LevelHistogram>,
1582 ) {
1583 if !self.valid {
1584 return;
1585 }
1586 if let Some(ref rep_lvl_hist) = repetition_level_histogram {
1587 let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1588 hist.reserve(rep_lvl_hist.len());
1589 hist.extend(rep_lvl_hist.values());
1590 }
1591 if let Some(ref def_lvl_hist) = definition_level_histogram {
1592 let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1593 hist.reserve(def_lvl_hist.len());
1594 hist.extend(def_lvl_hist.values());
1595 }
1596 }
1597
1598 pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1600 self.boundary_order = boundary_order;
1601 }
1602
1603 pub fn to_invalid(&mut self) {
1605 self.valid = false;
1606 }
1607
1608 pub fn valid(&self) -> bool {
1610 self.valid
1611 }
1612
1613 pub fn build_to_thrift(self) -> ColumnIndex {
1617 ColumnIndex::new(
1618 self.null_pages,
1619 self.min_values,
1620 self.max_values,
1621 self.boundary_order,
1622 self.null_counts,
1623 self.repetition_level_histograms,
1624 self.definition_level_histograms,
1625 )
1626 }
1627}
1628
1629impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1630 fn from(value: ColumnChunkMetaData) -> Self {
1631 ColumnChunkMetaDataBuilder(value)
1632 }
1633}
1634
/// Builder for a Parquet [`OffsetIndex`], part of the Parquet page index.
///
/// The per-page vectors below are parallel: entry `i` of each describes
/// page `i` of the column chunk being indexed.
pub struct OffsetIndexBuilder {
    /// File offset of each page
    offset_array: Vec<i64>,
    /// Compressed size in bytes of each page
    compressed_page_size_array: Vec<i32>,
    /// Index of the first row of each page within the row group
    first_row_index_array: Vec<i64>,
    /// Unencoded byte array data bytes of each page, if tracked
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running row counter: the first row index to assign to the next page
    current_first_row_index: i64,
}
1645
1646impl Default for OffsetIndexBuilder {
1647 fn default() -> Self {
1648 Self::new()
1649 }
1650}
1651
1652impl OffsetIndexBuilder {
1653 pub fn new() -> Self {
1655 OffsetIndexBuilder {
1656 offset_array: Vec::new(),
1657 compressed_page_size_array: Vec::new(),
1658 first_row_index_array: Vec::new(),
1659 unencoded_byte_array_data_bytes_array: None,
1660 current_first_row_index: 0,
1661 }
1662 }
1663
1664 pub fn append_row_count(&mut self, row_count: i64) {
1666 let current_page_row_index = self.current_first_row_index;
1667 self.first_row_index_array.push(current_page_row_index);
1668 self.current_first_row_index += row_count;
1669 }
1670
1671 pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1673 self.offset_array.push(offset);
1674 self.compressed_page_size_array.push(compressed_page_size);
1675 }
1676
1677 pub fn append_unencoded_byte_array_data_bytes(
1679 &mut self,
1680 unencoded_byte_array_data_bytes: Option<i64>,
1681 ) {
1682 if let Some(val) = unencoded_byte_array_data_bytes {
1683 self.unencoded_byte_array_data_bytes_array
1684 .get_or_insert(Vec::new())
1685 .push(val);
1686 }
1687 }
1688
1689 pub fn build_to_thrift(self) -> OffsetIndex {
1691 let locations = self
1692 .offset_array
1693 .iter()
1694 .zip(self.compressed_page_size_array.iter())
1695 .zip(self.first_row_index_array.iter())
1696 .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index))
1697 .collect::<Vec<_>>();
1698 OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array)
1699 }
1700}
1701
#[cfg(test)]
mod tests {
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::page_index::index::NativeIndex;

    // RowGroupMetaData must survive a round trip through its thrift
    // representation unchanged.
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        let row_group_exp = row_group_meta.to_thrift();
        let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(row_group_res, row_group_exp);
    }

    // Building a row group without column metadata must fail: the column
    // count has to match the schema (2 columns in the test schema).
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }

    // Decoding thrift row group metadata against a schema with a different
    // column count must be rejected with a descriptive error.
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Metadata built against the 2-column schema ...
        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();

        // ... must not decode against the 3-column schema.
        let err =
            RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
                .unwrap_err()
                .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    // ColumnChunkMetaData with (nearly) all optional fields populated must
    // survive a thrift round trip.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        let col_chunk_res =
            ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    // ColumnChunkMetaData with only required fields must also round trip.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .build()
            .unwrap();

        let col_chunk_exp = col_metadata.to_thrift();
        let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(col_chunk_res, col_chunk_exp);
    }

    // RowGroupMetaData::compressed_size() sums the per-column compressed
    // sizes (2 columns x 500 bytes = 1000).
    #[test]
    fn test_compressed_size() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for column_descr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(column_descr.clone())
                .set_total_compressed_size(500)
                .set_total_uncompressed_size(700)
                .build()
                .unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();

        let compressed_size_res: i64 = row_group_meta.compressed_size();
        let compressed_size_exp: i64 = 1000;

        assert_eq!(compressed_size_res, compressed_size_exp);
    }

    // Checks ParquetMetaData::memory_size() against hard-coded expected byte
    // counts, with and without page indexes attached.
    // NOTE: the expected values depend on the in-memory layout of the
    // metadata types and must be updated whenever fields are added.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Second row group variant with populated min/max statistics, so the
        // base size measurement includes heap-allocated statistics values.
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        // The encryption feature adds fields to the metadata structs, hence
        // the larger expected size.
        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2312;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2648;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Attach a column index and an offset index; memory_size() must grow.
        let mut column_index = ColumnIndexBuilder::new();
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build_to_thrift();
        let native_index = NativeIndex::<bool>::try_new(column_index).unwrap();

        // Two pages, one row each.
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build_to_thrift();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![
                OffsetIndexMetaData::try_new(offset_index).unwrap()
            ]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 2816;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3152;

        // Sanity check: indexes must make the metadata strictly larger.
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    // Builds the two-INT32-column ("a", "b") schema shared by the tests above.
    fn get_test_schema_descr() -> SchemaDescPtr {
        let schema = SchemaType::group_type_builder("schema")
            .with_fields(vec![
                Arc::new(
                    SchemaType::primitive_type_builder("a", Type::INT32)
                        .build()
                        .unwrap(),
                ),
                Arc::new(
                    SchemaType::primitive_type_builder("b", Type::INT32)
                        .build()
                        .unwrap(),
                ),
            ])
            .build()
            .unwrap();

        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
    }
}