1mod footer_tail;
90mod memory;
91mod parser;
92mod push_decoder;
93pub(crate) mod reader;
94pub(crate) mod thrift;
95mod writer;
96
97use crate::basic::{EncodingMask, PageType};
98#[cfg(feature = "encryption")]
99use crate::encryption::decrypt::FileDecryptor;
100#[cfg(feature = "encryption")]
101use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
102pub(crate) use crate::file::metadata::memory::HeapSize;
103#[cfg(feature = "encryption")]
104use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
105use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
106use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
107use crate::file::statistics::Statistics;
108use crate::geospatial::statistics as geo_statistics;
109use crate::schema::types::{
110 ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
111 Type as SchemaType,
112};
113use crate::thrift_struct;
114use crate::{
115 basic::BoundaryOrder,
116 errors::{ParquetError, Result},
117};
118use crate::{
119 basic::{ColumnOrder, Compression, Encoding, Type},
120 parquet_thrift::{
121 ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
122 ThriftCompactOutputProtocol, WriteThrift, WriteThriftField,
123 },
124};
125use crate::{
126 data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
127};
128
129pub use footer_tail::FooterTail;
130pub use push_decoder::ParquetMetaDataPushDecoder;
131pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
132use std::io::Write;
133use std::ops::Range;
134use std::sync::Arc;
135pub use writer::ParquetMetaDataWriter;
136pub(crate) use writer::ThriftMetadataWriter;
137
/// Page-level column index for an entire file: one `Vec<ColumnIndexMetaData>`
/// (one entry per column) for each row group.
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;
155
/// Page-level offset index for an entire file: one `Vec<OffsetIndexMetaData>`
/// (one entry per column) for each row group.
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
168
/// Parsed metadata for a Parquet file: the file-level metadata, metadata for
/// each row group, and (optionally) the page-level column/offset indexes.
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    // File-level metadata (schema, version, key/value pairs, ...)
    file_metadata: FileMetaData,
    // Per-row-group metadata, in file order
    row_groups: Vec<RowGroupMetaData>,
    // Page-level column index, if loaded: indexed by row group, then column
    column_index: Option<ParquetColumnIndex>,
    // Page-level offset index, if loaded: indexed by row group, then column
    offset_index: Option<ParquetOffsetIndex>,
    #[cfg(feature = "encryption")]
    // Decryptor used to read encrypted columns, if one has been configured
    file_decryptor: Option<Box<FileDecryptor>>,
}
200
impl ParquetMetaData {
    /// Creates Parquet metadata from file metadata and row group metadata.
    /// The page indexes (and, with the `encryption` feature, the file
    /// decryptor) start out unset.
    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
        ParquetMetaData {
            file_metadata,
            row_groups,
            column_index: None,
            offset_index: None,
            #[cfg(feature = "encryption")]
            file_decryptor: None,
        }
    }

    /// Sets (or clears) the file decryptor used to read encrypted columns.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
        self.file_decryptor = file_decryptor.map(Box::new);
    }

    /// Converts this metadata into a [`ParquetMetaDataBuilder`] for modification.
    pub fn into_builder(self) -> ParquetMetaDataBuilder {
        self.into()
    }

    /// Returns the file-level metadata.
    pub fn file_metadata(&self) -> &FileMetaData {
        &self.file_metadata
    }

    /// Returns the file decryptor, if one has been set.
    #[cfg(feature = "encryption")]
    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
        self.file_decryptor.as_deref()
    }

    /// Returns the number of row groups in the file.
    pub fn num_row_groups(&self) -> usize {
        self.row_groups.len()
    }

    /// Returns metadata for the `i`th row group.
    ///
    /// Panics if `i` is out of bounds.
    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
        &self.row_groups[i]
    }

    /// Returns metadata for all row groups, in file order.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.row_groups
    }

    /// Returns the page-level column index, if it has been loaded.
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.column_index.as_ref()
    }

    /// Returns the page-level offset index, if it has been loaded.
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.offset_index.as_ref()
    }

    /// Estimates the total memory footprint of this metadata in bytes:
    /// the struct itself plus all heap allocations reachable from it, as
    /// reported by the [`HeapSize`] trait.
    pub fn memory_size(&self) -> usize {
        #[cfg(feature = "encryption")]
        let encryption_size = self.file_decryptor.heap_size();
        #[cfg(not(feature = "encryption"))]
        let encryption_size = 0usize;

        std::mem::size_of::<Self>()
            + self.file_metadata.heap_size()
            + self.row_groups.heap_size()
            + self.column_index.heap_size()
            + self.offset_index.heap_size()
            + encryption_size
    }

    /// Sets (or clears) the page-level column index.
    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
        self.column_index = index;
    }

    /// Sets (or clears) the page-level offset index.
    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
        self.offset_index = index;
    }
}
312
/// Builder for [`ParquetMetaData`]; wraps a metadata value that the builder
/// methods mutate in place.
pub struct ParquetMetaDataBuilder(ParquetMetaData);
351
impl ParquetMetaDataBuilder {
    /// Creates a builder from file metadata, with an empty row group list.
    pub fn new(file_meta_data: FileMetaData) -> Self {
        Self(ParquetMetaData::new(file_meta_data, vec![]))
    }

    /// Creates a builder seeded with existing [`ParquetMetaData`].
    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
        Self(metadata)
    }

    /// Appends one row group's metadata.
    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
        self.0.row_groups.push(row_group);
        self
    }

    /// Replaces all row group metadata.
    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
        self.0.row_groups = row_groups;
        self
    }

    /// Takes ownership of the current row groups, leaving an empty list behind.
    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
        std::mem::take(&mut self.0.row_groups)
    }

    /// Returns the row groups currently in the builder.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.0.row_groups
    }

    /// Sets (or clears) the page-level column index.
    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
        self.0.column_index = column_index;
        self
    }

    /// Takes ownership of the current column index, leaving `None` behind.
    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
        std::mem::take(&mut self.0.column_index)
    }

    /// Returns the column index currently in the builder.
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.0.column_index.as_ref()
    }

    /// Sets (or clears) the page-level offset index.
    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
        self.0.offset_index = offset_index;
        self
    }

    /// Takes ownership of the current offset index, leaving `None` behind.
    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
        std::mem::take(&mut self.0.offset_index)
    }

    /// Returns the offset index currently in the builder.
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.0.offset_index.as_ref()
    }

    /// Sets (or clears) the file decryptor.
    #[cfg(feature = "encryption")]
    pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
        self.0.with_file_decryptor(file_decryptor);
        self
    }

    /// Finishes building and returns the [`ParquetMetaData`].
    pub fn build(self) -> ParquetMetaData {
        let Self(metadata) = self;
        metadata
    }
}
434
435impl From<ParquetMetaData> for ParquetMetaDataBuilder {
436 fn from(meta_data: ParquetMetaData) -> Self {
437 Self(meta_data)
438 }
439}
440
// Thrift `KeyValue` struct: one entry of user-defined key/value metadata
// stored in the file footer.
thrift_struct!(
pub struct KeyValue {
  1: required string key
  2: optional string value
}
);
448
449impl KeyValue {
450 pub fn new<F2>(key: String, value: F2) -> KeyValue
452 where
453 F2: Into<Option<String>>,
454 {
455 KeyValue {
456 key,
457 value: value.into(),
458 }
459 }
460}
461
// Thrift `PageEncodingStats` struct: the number of pages with a given
// page type and encoding within a column chunk.
thrift_struct!(
pub struct PageEncodingStats {
  1: required PageType page_type;
  2: required Encoding encoding;
  3: required i32 count;
}
);
470
/// Reference-counted pointer to [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
473
/// File-level metadata read from the Parquet footer: format version, row
/// count, schema, and optional creator/key-value/column-order information.
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    version: i32,
    num_rows: i64,
    created_by: Option<String>,
    key_value_metadata: Option<Vec<KeyValue>>,
    schema_descr: SchemaDescPtr,
    column_orders: Option<Vec<ColumnOrder>>,
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}
490
impl FileMetaData {
    /// Creates new file metadata. Encryption-related fields (present only
    /// with the `encryption` feature) start out unset.
    pub fn new(
        version: i32,
        num_rows: i64,
        created_by: Option<String>,
        key_value_metadata: Option<Vec<KeyValue>>,
        schema_descr: SchemaDescPtr,
        column_orders: Option<Vec<ColumnOrder>>,
    ) -> Self {
        FileMetaData {
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr,
            column_orders,
            #[cfg(feature = "encryption")]
            encryption_algorithm: None,
            #[cfg(feature = "encryption")]
            footer_signing_key_metadata: None,
        }
    }

    /// Sets (or clears) the encryption algorithm, consuming and returning `self`.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_encryption_algorithm(
        mut self,
        encryption_algorithm: Option<EncryptionAlgorithm>,
    ) -> Self {
        self.encryption_algorithm = encryption_algorithm.map(Box::new);
        self
    }

    /// Sets (or clears) the footer signing key metadata, consuming and returning `self`.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_footer_signing_key_metadata(
        mut self,
        footer_signing_key_metadata: Option<Vec<u8>>,
    ) -> Self {
        self.footer_signing_key_metadata = footer_signing_key_metadata;
        self
    }

    /// Returns the Parquet format version recorded in the footer.
    pub fn version(&self) -> i32 {
        self.version
    }

    /// Returns the total number of rows in the file.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the `created_by` string, if present.
    pub fn created_by(&self) -> Option<&str> {
        self.created_by.as_deref()
    }

    /// Returns the user-supplied key/value metadata, if present.
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns the root schema type of the file.
    pub fn schema(&self) -> &SchemaType {
        self.schema_descr.root_schema()
    }

    /// Returns the schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        &self.schema_descr
    }

    /// Returns a shared pointer to the schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns the per-column sort orders, if present.
    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
        self.column_orders.as_ref()
    }

    /// Returns the sort order for column `i`, or [`ColumnOrder::UNDEFINED`]
    /// when the file records no column orders.
    ///
    /// Panics if column orders are present but `i` is out of bounds.
    pub fn column_order(&self, i: usize) -> ColumnOrder {
        self.column_orders
            .as_ref()
            .map(|data| data[i])
            .unwrap_or(ColumnOrder::UNDEFINED)
    }
}
597
// Thrift `SortingColumn` struct: describes one column by which a row group
// is sorted (column index, direction, and null placement).
thrift_struct!(
pub struct SortingColumn {
  1: required i32 column_idx

  2: required bool descending

  3: required bool nulls_first
}
);
612
/// Reference-counted pointer to [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
615
/// Metadata for a single row group: its column chunks plus the row-group
/// level information read from the Parquet footer.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    columns: Vec<ColumnChunkMetaData>,
    num_rows: i64,
    sorting_columns: Option<Vec<SortingColumn>>,
    total_byte_size: i64,
    schema_descr: SchemaDescPtr,
    // Byte offset of this row group in the file, if recorded
    file_offset: Option<i64>,
    // Position of this row group within the file, if recorded
    ordinal: Option<i16>,
}
632
impl RowGroupMetaData {
    /// Returns a builder for row group metadata using the given schema.
    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder::new(schema_descr)
    }

    /// Returns the number of column chunks in this row group.
    pub fn num_columns(&self) -> usize {
        self.columns.len()
    }

    /// Returns metadata for the `i`th column chunk.
    ///
    /// Panics if `i` is out of bounds.
    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
        &self.columns[i]
    }

    /// Returns metadata for all column chunks.
    pub fn columns(&self) -> &[ColumnChunkMetaData] {
        &self.columns
    }

    /// Returns mutable access to the column chunk metadata.
    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
        &mut self.columns
    }

    /// Returns the number of rows in this row group.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the sorting columns, if recorded.
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Returns the total byte size recorded for this row group.
    pub fn total_byte_size(&self) -> i64 {
        self.total_byte_size
    }

    /// Returns the total compressed size, summed over all column chunks.
    pub fn compressed_size(&self) -> i64 {
        self.columns.iter().map(|c| c.total_compressed_size).sum()
    }

    /// Returns the schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        self.schema_descr.as_ref()
    }

    /// Returns a shared pointer to the schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns this row group's position in the file, if recorded.
    #[inline(always)]
    pub fn ordinal(&self) -> Option<i16> {
        self.ordinal
    }

    /// Returns this row group's byte offset in the file, if recorded.
    #[inline(always)]
    pub fn file_offset(&self) -> Option<i64> {
        self.file_offset
    }

    /// Converts this metadata into a builder for modification.
    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder(self)
    }
}
709
/// Builder for [`RowGroupMetaData`]; wraps a metadata value that the builder
/// methods mutate in place.
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
712
impl RowGroupMetaDataBuilder {
    /// Creates a builder with an empty column list pre-sized for the schema.
    fn new(schema_descr: SchemaDescPtr) -> Self {
        Self(RowGroupMetaData {
            columns: Vec::with_capacity(schema_descr.num_columns()),
            schema_descr,
            file_offset: None,
            num_rows: 0,
            sorting_columns: None,
            total_byte_size: 0,
            ordinal: None,
        })
    }

    /// Sets the number of rows.
    pub fn set_num_rows(mut self, value: i64) -> Self {
        self.0.num_rows = value;
        self
    }

    /// Sets (or clears) the sorting columns.
    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
        self.0.sorting_columns = value;
        self
    }

    /// Sets the total byte size.
    pub fn set_total_byte_size(mut self, value: i64) -> Self {
        self.0.total_byte_size = value;
        self
    }

    /// Takes ownership of the column chunk metadata, leaving an empty list behind.
    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
        std::mem::take(&mut self.0.columns)
    }

    /// Replaces all column chunk metadata.
    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
        self.0.columns = value;
        self
    }

    /// Appends metadata for one column chunk.
    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
        self.0.columns.push(value);
        self
    }

    /// Sets the row group's ordinal (its position in the file).
    pub fn set_ordinal(mut self, value: i16) -> Self {
        self.0.ordinal = Some(value);
        self
    }

    /// Sets the row group's byte offset in the file.
    pub fn set_file_offset(mut self, value: i64) -> Self {
        self.0.file_offset = Some(value);
        self
    }

    /// Finishes building, returning an error unless exactly one column chunk
    /// was supplied for every column in the schema.
    pub fn build(self) -> Result<RowGroupMetaData> {
        if self.0.schema_descr.num_columns() != self.0.columns.len() {
            return Err(general_err!(
                "Column length mismatch: {} != {}",
                self.0.schema_descr.num_columns(),
                self.0.columns.len()
            ));
        }

        Ok(self.0)
    }

    /// Finishes building without validating the column count against the schema.
    pub(super) fn build_unchecked(self) -> RowGroupMetaData {
        self.0
    }
}
796
/// Metadata for a single column chunk within a row group, mirroring the
/// Parquet `ColumnChunk`/`ColumnMetaData` Thrift structures.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    column_descr: ColumnDescPtr,
    // All encodings used in this chunk, stored as a bitmask
    encodings: EncodingMask,
    // Path of the file holding this chunk, when stored outside this file
    file_path: Option<String>,
    file_offset: i64,
    num_values: i64,
    compression: Compression,
    total_compressed_size: i64,
    total_uncompressed_size: i64,
    data_page_offset: i64,
    index_page_offset: Option<i64>,
    dictionary_page_offset: Option<i64>,
    statistics: Option<Statistics>,
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    encoding_stats: Option<Vec<PageEncodingStats>>,
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    unencoded_byte_array_data_bytes: Option<i64>,
    repetition_level_histogram: Option<LevelHistogram>,
    definition_level_histogram: Option<LevelHistogram>,
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
}
828
/// Histogram of repetition or definition levels.
///
/// `inner[i]` holds the number of values recorded at level `i`; the vector
/// has `max_level + 1` buckets.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a zeroed histogram able to count levels `0..=max_level`.
    ///
    /// Returns `None` when `max_level` is not positive.
    pub fn try_new(max_level: i16) -> Option<Self> {
        (max_level > 0).then(|| Self {
            inner: vec![0; max_level as usize + 1],
        })
    }

    /// Returns the counts as a slice, indexed by level.
    pub fn values(&self) -> &[i64] {
        self.inner.as_slice()
    }

    /// Consumes the histogram and returns the underlying counts.
    pub fn into_inner(self) -> Vec<i64> {
        let Self { inner } = self;
        inner
    }

    /// Returns the count for `index`, or `None` when out of range.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).map(|count| *count)
    }

    /// Adds `other`'s counts into this histogram, element-wise.
    ///
    /// Panics if the two histograms have different lengths.
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        self.inner
            .iter_mut()
            .zip(&other.inner)
            .for_each(|(dst, src)| *dst += *src);
    }

    /// Returns the number of buckets (`max_level + 1`).
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns `true` when the histogram has no buckets at all.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Zeroes every bucket, keeping the length unchanged.
    pub fn reset(&mut self) {
        self.inner.fill(0);
    }

    /// Increments the bucket for each level in `levels`.
    ///
    /// Panics if any level exceeds the histogram's maximum level.
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        levels
            .iter()
            .for_each(|&level| self.inner[level as usize] += 1);
    }
}

impl From<Vec<i64>> for LevelHistogram {
    fn from(inner: Vec<i64>) -> Self {
        LevelHistogram { inner }
    }
}

impl From<LevelHistogram> for Vec<i64> {
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}
928
// A histogram's heap usage is exactly that of its backing vector.
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        self.inner.heap_size()
    }
}
934
impl ColumnChunkMetaData {
    /// Returns a builder for column chunk metadata using the given column descriptor.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// Returns the path of the file holding this chunk, if it is stored in a
    /// different file than the metadata.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Returns the file offset recorded for this column chunk.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Returns the physical type of this column.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Returns the column's path within the schema.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Returns the column descriptor.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Returns a shared pointer to the column descriptor.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// Returns an iterator over all encodings used in this column chunk.
    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
        self.encodings.encodings()
    }

    /// Returns the encodings as a bitmask.
    pub fn encodings_mask(&self) -> &EncodingMask {
        &self.encodings
    }

    /// Returns the number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Returns the compression codec used for this column chunk.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed size in bytes.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed size in bytes.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the byte offset of the first data page.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the byte offset of the index page, if present.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the byte offset of the dictionary page, if present.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the `(start, length)` byte range of this column chunk's data:
    /// starting at the dictionary page when present (otherwise the first
    /// data page) and spanning the compressed size.
    ///
    /// Panics if the start offset or length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns the statistics for this column chunk, if present.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns the geospatial statistics for this column chunk, if present.
    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    /// Returns the per-page-type/encoding statistics, if present.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    /// Returns the byte offset of the bloom filter, if present.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length of the bloom filter in bytes, if present.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the byte offset of the column index, if present.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length of the column index in bytes, if present.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range of the column index, or `None` when the
    /// offset/length are missing or negative.
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the byte offset of the offset index, if present.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length of the offset index in bytes, if present.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range of the offset index, or `None` when the
    /// offset/length are missing or negative.
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of unencoded byte-array data bytes, if recorded.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram, if recorded.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram, if recorded.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the column crypto metadata, if present.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_deref()
    }

    /// Converts this metadata into a builder for modification.
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}
1138
/// Builder for [`ColumnChunkMetaData`]; wraps a metadata value that the
/// builder methods mutate in place.
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1158
impl ColumnChunkMetaDataBuilder {
    /// Creates a builder with all optional fields unset and numeric fields zeroed.
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Default::default(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
            #[cfg(feature = "encryption")]
            encrypted_column_metadata: None,
        })
    }

    /// Sets the encodings from a list (stored internally as a bitmask).
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
        self
    }

    /// Sets the encodings directly from a bitmask.
    pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets the path of the file holding this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets the number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets the compression codec.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets the total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets the total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets the byte offset of the first data page.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets (or clears) the dictionary page offset.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets (or clears) the index page offset.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets the column chunk statistics.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Sets the geospatial statistics.
    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    /// Clears the column chunk statistics.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets the per-page-type/encoding statistics.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(value);
        self
    }

    /// Clears the per-page-type/encoding statistics.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets (or clears) the bloom filter offset.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets (or clears) the bloom filter length.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets (or clears) the offset index offset.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets (or clears) the offset index length.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets (or clears) the column index offset.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets (or clears) the column index length.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets (or clears) the unencoded byte-array data byte count.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets (or clears) the repetition level histogram.
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets (or clears) the definition level histogram.
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    /// Sets (or clears) the column crypto metadata.
    #[cfg(feature = "encryption")]
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value.map(Box::new);
        self
    }

    /// Sets (or clears) the encrypted column metadata bytes.
    #[cfg(feature = "encryption")]
    pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
        self.0.encrypted_column_metadata = value;
        self
    }

    /// Finishes building the [`ColumnChunkMetaData`]; never returns an error today.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1358
/// Builder for a per-column page ("column") index, accumulating one entry
/// per page.
pub struct ColumnIndexBuilder {
    column_type: Type,
    null_pages: Vec<bool>,
    min_values: Vec<Vec<u8>>,
    max_values: Vec<Vec<u8>>,
    null_counts: Vec<i64>,
    boundary_order: BoundaryOrder,
    // Concatenation of all pages' repetition level histograms, if collected
    repetition_level_histograms: Option<Vec<i64>>,
    // Concatenation of all pages' definition level histograms, if collected
    definition_level_histograms: Option<Vec<i64>>,
    // When false, the accumulated index is unusable and should not be written
    valid: bool,
}
1383
impl ColumnIndexBuilder {
    /// Creates an empty, valid builder for the given physical type.
    pub fn new(column_type: Type) -> Self {
        ColumnIndexBuilder {
            column_type,
            null_pages: Vec::new(),
            min_values: Vec::new(),
            max_values: Vec::new(),
            null_counts: Vec::new(),
            boundary_order: BoundaryOrder::UNORDERED,
            repetition_level_histograms: None,
            definition_level_histograms: None,
            valid: true,
        }
    }

    /// Appends the min/max/null-count entry for one page.
    pub fn append(
        &mut self,
        null_page: bool,
        min_value: Vec<u8>,
        max_value: Vec<u8>,
        null_count: i64,
    ) {
        self.null_pages.push(null_page);
        self.min_values.push(min_value);
        self.max_values.push(max_value);
        self.null_counts.push(null_count);
    }

    /// Appends one page's level histograms, concatenating them onto the
    /// accumulated histogram vectors. Does nothing if the builder has been
    /// marked invalid.
    pub fn append_histograms(
        &mut self,
        repetition_level_histogram: &Option<LevelHistogram>,
        definition_level_histogram: &Option<LevelHistogram>,
    ) {
        if !self.valid {
            return;
        }
        if let Some(rep_lvl_hist) = repetition_level_histogram {
            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(rep_lvl_hist.len());
            hist.extend(rep_lvl_hist.values());
        }
        if let Some(def_lvl_hist) = definition_level_histogram {
            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(def_lvl_hist.len());
            hist.extend(def_lvl_hist.values());
        }
    }

    /// Sets the boundary order of the appended page min/max values.
    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
        self.boundary_order = boundary_order;
    }

    /// Marks this builder invalid, indicating the index should not be written.
    pub fn to_invalid(&mut self) {
        self.valid = false;
    }

    /// Returns whether the builder is still valid.
    pub fn valid(&self) -> bool {
        self.valid
    }

    /// Builds the [`ColumnIndexMetaData`] variant matching the column's
    /// physical type.
    pub fn build(self) -> Result<ColumnIndexMetaData> {
        Ok(match self.column_type {
            Type::BOOLEAN => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::BOOLEAN(index)
            }
            Type::INT32 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT32(index)
            }
            Type::INT64 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT64(index)
            }
            Type::INT96 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT96(index)
            }
            Type::FLOAT => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::FLOAT(index)
            }
            Type::DOUBLE => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::DOUBLE(index)
            }
            Type::BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::BYTE_ARRAY(index)
            }
            Type::FIXED_LEN_BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index)
            }
        })
    }

    // Builds a column index whose min/max bytes are decoded as the primitive `T`.
    fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
    where
        T: ParquetValueType,
    {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();

        PrimitiveColumnIndex::try_new(
            self.null_pages,
            self.boundary_order,
            Some(self.null_counts),
            self.repetition_level_histograms,
            self.definition_level_histograms,
            min_values,
            max_values,
        )
    }

    // Builds a column index for BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY columns.
    fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();

        ByteArrayColumnIndex::try_new(
            self.null_pages,
            self.boundary_order,
            Some(self.null_counts),
            self.repetition_level_histograms,
            self.definition_level_histograms,
            min_values,
            max_values,
        )
    }
}
1526
1527impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1528 fn from(value: ColumnChunkMetaData) -> Self {
1529 ColumnChunkMetaDataBuilder(value)
1530 }
1531}
1532
/// Builder for a per-column offset index (page locations), accumulating one
/// entry per page.
pub struct OffsetIndexBuilder {
    offset_array: Vec<i64>,
    compressed_page_size_array: Vec<i32>,
    first_row_index_array: Vec<i64>,
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    // Running row count; becomes the first row index of the next page
    current_first_row_index: i64,
}
1543
1544impl Default for OffsetIndexBuilder {
1545 fn default() -> Self {
1546 Self::new()
1547 }
1548}
1549
1550impl OffsetIndexBuilder {
1551 pub fn new() -> Self {
1553 OffsetIndexBuilder {
1554 offset_array: Vec::new(),
1555 compressed_page_size_array: Vec::new(),
1556 first_row_index_array: Vec::new(),
1557 unencoded_byte_array_data_bytes_array: None,
1558 current_first_row_index: 0,
1559 }
1560 }
1561
1562 pub fn append_row_count(&mut self, row_count: i64) {
1564 let current_page_row_index = self.current_first_row_index;
1565 self.first_row_index_array.push(current_page_row_index);
1566 self.current_first_row_index += row_count;
1567 }
1568
1569 pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1571 self.offset_array.push(offset);
1572 self.compressed_page_size_array.push(compressed_page_size);
1573 }
1574
1575 pub fn append_unencoded_byte_array_data_bytes(
1577 &mut self,
1578 unencoded_byte_array_data_bytes: Option<i64>,
1579 ) {
1580 if let Some(val) = unencoded_byte_array_data_bytes {
1581 self.unencoded_byte_array_data_bytes_array
1582 .get_or_insert(Vec::new())
1583 .push(val);
1584 }
1585 }
1586
1587 pub fn build(self) -> OffsetIndexMetaData {
1589 let locations = self
1590 .offset_array
1591 .iter()
1592 .zip(self.compressed_page_size_array.iter())
1593 .zip(self.first_row_index_array.iter())
1594 .map(|((offset, size), row_index)| PageLocation {
1595 offset: *offset,
1596 compressed_page_size: *size,
1597 first_row_index: *row_index,
1598 })
1599 .collect::<Vec<_>>();
1600 OffsetIndexMetaData {
1601 page_locations: locations,
1602 unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes_array,
1603 }
1604 }
1605}
1606
#[cfg(test)]
mod tests {
    //! Tests for the Parquet metadata structures in this module:
    //! Thrift compact-protocol round-trips for `RowGroupMetaData` and
    //! `ColumnChunkMetaData`, error paths for malformed/mismatched input,
    //! and the heap-size accounting reported by `ParquetMetaData::memory_size`.
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::metadata::thrift::tests::{read_column_chunk, read_row_group};

    /// Round-trips a `RowGroupMetaData` (two INT32 columns, default column
    /// chunks) through the Thrift compact protocol and checks the decoded
    /// value compares equal to the original.
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        // One default-built column chunk per leaf column in the schema.
        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        // Encode to the Thrift compact protocol...
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta.write_thrift(&mut writer).unwrap();

        // ...then decode with the test helper and compare.
        let row_group_res = read_row_group(&mut buf, schema_descr).unwrap();

        assert_eq!(row_group_res, row_group_meta);
    }

    /// Building a `RowGroupMetaData` with no column chunks must fail: the
    /// builder validates that the column count matches the schema (2 here).
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            // Exact error text is part of the asserted contract.
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }

    /// Decoding a row group that was written against a 2-column schema using
    /// a 3-column schema must fail with a column-count mismatch error rather
    /// than silently producing bad metadata.
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        // Schema with two INT32 leaf columns: "a", "b".
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Same schema plus an extra column "c" — used only for decoding.
        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Write a valid 2-column row group.
        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta_2cols.write_thrift(&mut writer).unwrap();

        // Decoding against the 3-column schema must surface the mismatch.
        let err = read_row_group(&mut buf, schema_descr_3cols)
            .unwrap_err()
            .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    /// Round-trips a `ColumnChunkMetaData` with every optional field populated
    /// (encodings, page stats, bloom filter, page-index offsets/lengths,
    /// size statistics, level histograms) through the Thrift compact protocol.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);
        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        col_metadata.write_thrift(&mut writer).unwrap();
        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    /// Round-trips a `ColumnChunkMetaData` with only defaults set, exercising
    /// the encode/decode paths when all optional fields are absent.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        col_metadata.write_thrift(&mut writer).unwrap();
        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    /// `RowGroupMetaData::compressed_size` should be the sum of the per-column
    /// total compressed sizes (2 columns x 500 = 1000).
    #[test]
    fn test_compressed_size() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for column_descr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(column_descr.clone())
                .set_total_compressed_size(500)
                .set_total_uncompressed_size(700)
                .build()
                .unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();

        let compressed_size_res: i64 = row_group_meta.compressed_size();
        let compressed_size_exp: i64 = 1000;

        assert_eq!(compressed_size_res, compressed_size_exp);
    }

    /// Checks `ParquetMetaData::memory_size` against hard-coded expected
    /// totals, and that adding a column index + offset index strictly grows
    /// the reported size.
    ///
    /// NOTE(review): the expected byte counts are tied to the current struct
    /// layouts and to the exact allocations made below — any field or
    /// allocation-pattern change requires updating these constants. The
    /// encryption feature enlarges some structs, hence the cfg-gated values.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // Columns whose statistics carry no min/max values.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Second set of columns, this time with populated min/max statistics
        // (the Some(0)/Some(100) values add heap allocations to account for).
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        // Baseline expected size; differs with the encryption feature because
        // it adds fields to the metadata structs.
        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2766;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2934;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Build a single-entry BOOLEAN column index...
        let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build().unwrap();
        let native_index = match column_index {
            ColumnIndexMetaData::BOOLEAN(index) => index,
            _ => panic!("wrong type of column index"),
        };

        // ...and a two-page offset index with unencoded byte counts.
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![offset_index]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 3192;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3360;

        // Sanity-check the constants themselves, then the actual accounting.
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    /// With the encryption feature, attaching a `FileDecryptor` must increase
    /// `memory_size()` — the decryptor's keys/AAD state are heap allocations
    /// that the accounting has to include.
    ///
    /// NOTE(review): like `test_memory_size`, the expected byte counts here
    /// are tied to the current struct layouts and must be updated together
    /// with any field changes.
    #[test]
    #[cfg(feature = "encryption")]
    fn test_memory_size_with_decryptor() {
        use crate::encryption::decrypt::FileDecryptionProperties;
        use crate::file::metadata::thrift::encryption::AesGcmV1;

        let schema_descr = get_test_schema_descr();

        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        // Fixed AAD material so the decryptor construction is deterministic.
        let aad_file_unique = vec![1u8; 8];
        let aad_prefix = vec![2u8; 8];
        let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
            aad_prefix: Some(aad_prefix.clone()),
            aad_file_unique: Some(aad_file_unique.clone()),
            supply_aad_prefix: Some(true),
        });
        let footer_key_metadata = Some(vec![3u8; 8]);
        let file_metadata =
            FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None)
                .with_encryption_algorithm(Some(encryption_algorithm))
                .with_footer_signing_key_metadata(footer_key_metadata.clone());

        // Baseline: metadata without a decryptor attached.
        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .build();

        let base_expected_size = 2058;
        assert_eq!(parquet_meta_data.memory_size(), base_expected_size);

        // 16-byte AES keys for the footer and for every column.
        let footer_key = "0123456789012345".as_bytes();
        let column_key = "1234567890123450".as_bytes();
        let mut decryption_properties_builder =
            FileDecryptionProperties::builder(footer_key.to_vec())
                .with_aad_prefix(aad_prefix.clone());
        for column in schema_descr.columns() {
            decryption_properties_builder = decryption_properties_builder
                .with_column_key(&column.path().string(), column_key.to_vec());
        }
        let decryption_properties = decryption_properties_builder.build().unwrap();
        let decryptor = FileDecryptor::new(
            &decryption_properties,
            footer_key_metadata.as_deref(),
            aad_file_unique,
            aad_prefix,
        )
        .unwrap();

        // Same metadata plus the decryptor: size must grow to the new total.
        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .set_file_decryptor(Some(decryptor))
            .build();

        let expected_size_with_decryptor = 3072;
        assert!(expected_size_with_decryptor > base_expected_size);

        assert_eq!(
            parquet_meta_data.memory_size(),
            expected_size_with_decryptor
        );
    }

    /// Builds the shared test schema: a group "schema" with two required
    /// INT32 leaf columns, "a" and "b".
    fn get_test_schema_descr() -> SchemaDescPtr {
        let schema = SchemaType::group_type_builder("schema")
            .with_fields(vec![
                Arc::new(
                    SchemaType::primitive_type_builder("a", Type::INT32)
                        .build()
                        .unwrap(),
                ),
                Arc::new(
                    SchemaType::primitive_type_builder("b", Type::INT32)
                        .build()
                        .unwrap(),
                ),
            ])
            .build()
            .unwrap();

        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
    }
}