mod footer_tail;
mod memory;
mod options;
mod parser;
mod push_decoder;
pub(crate) mod reader;
pub(crate) mod thrift;
mod writer;

use crate::basic::{EncodingMask, PageType};
#[cfg(feature = "encryption")]
use crate::encryption::decrypt::FileDecryptor;
#[cfg(feature = "encryption")]
use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
pub(crate) use crate::file::metadata::memory::HeapSize;
#[cfg(feature = "encryption")]
use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
use crate::file::statistics::Statistics;
use crate::geospatial::statistics as geo_statistics;
use crate::schema::types::{
    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
    Type as SchemaType,
};
use crate::thrift_struct;
use crate::{
    basic::BoundaryOrder,
    errors::{ParquetError, Result},
};
use crate::{
    basic::{ColumnOrder, Compression, Encoding, Type},
    parquet_thrift::{
        ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
        ThriftCompactOutputProtocol, WriteThrift, WriteThriftField,
    },
};
use crate::{
    data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
};

pub use footer_tail::FooterTail;
pub use options::ParquetMetaDataOptions;
pub use push_decoder::ParquetMetaDataPushDecoder;
pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
use std::io::Write;
use std::ops::Range;
use std::sync::Arc;
pub use writer::ParquetMetaDataWriter;
pub(crate) use writer::ThriftMetadataWriter;

pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;

pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;

#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    file_metadata: FileMetaData,
    row_groups: Vec<RowGroupMetaData>,
    column_index: Option<ParquetColumnIndex>,
    offset_index: Option<ParquetOffsetIndex>,
    #[cfg(feature = "encryption")]
    file_decryptor: Option<Box<FileDecryptor>>,
}

impl ParquetMetaData {
    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
        ParquetMetaData {
            file_metadata,
            row_groups,
            column_index: None,
            offset_index: None,
            #[cfg(feature = "encryption")]
            file_decryptor: None,
        }
    }

    #[cfg(feature = "encryption")]
    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
        self.file_decryptor = file_decryptor.map(Box::new);
    }

    pub fn into_builder(self) -> ParquetMetaDataBuilder {
        self.into()
    }

    pub fn file_metadata(&self) -> &FileMetaData {
        &self.file_metadata
    }

    #[cfg(feature = "encryption")]
    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
        self.file_decryptor.as_deref()
    }

    pub fn num_row_groups(&self) -> usize {
        self.row_groups.len()
    }

    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
        &self.row_groups[i]
    }

    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.row_groups
    }

    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.column_index.as_ref()
    }

    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.offset_index.as_ref()
    }

    pub fn memory_size(&self) -> usize {
        #[cfg(feature = "encryption")]
        let encryption_size = self.file_decryptor.heap_size();
        #[cfg(not(feature = "encryption"))]
        let encryption_size = 0usize;

        std::mem::size_of::<Self>()
            + self.file_metadata.heap_size()
            + self.row_groups.heap_size()
            + self.column_index.heap_size()
            + self.offset_index.heap_size()
            + encryption_size
    }

    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
        self.column_index = index;
    }

    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
        self.offset_index = index;
    }
}

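/// A builder for [`ParquetMetaData`].
///
/// A minimal usage sketch; `get_file_metadata` and `get_row_group` stand in for
/// application code that produces the inputs:
/// ```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaDataBuilder, RowGroupMetaData};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!() }
/// # fn get_row_group() -> RowGroupMetaData { unimplemented!() }
/// let file_metadata = get_file_metadata();
/// let row_group = get_row_group();
/// let metadata = ParquetMetaDataBuilder::new(file_metadata)
///     .add_row_group(row_group)
///     .build();
/// assert_eq!(metadata.num_row_groups(), 1);
/// ```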
pub struct ParquetMetaDataBuilder(ParquetMetaData);

impl ParquetMetaDataBuilder {
    pub fn new(file_meta_data: FileMetaData) -> Self {
        Self(ParquetMetaData::new(file_meta_data, vec![]))
    }

    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
        Self(metadata)
    }

    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
        self.0.row_groups.push(row_group);
        self
    }

    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
        self.0.row_groups = row_groups;
        self
    }

    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
        std::mem::take(&mut self.0.row_groups)
    }

    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.0.row_groups
    }

    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
        self.0.column_index = column_index;
        self
    }

    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
        std::mem::take(&mut self.0.column_index)
    }

    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.0.column_index.as_ref()
    }

    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
        self.0.offset_index = offset_index;
        self
    }

    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
        std::mem::take(&mut self.0.offset_index)
    }

    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.0.offset_index.as_ref()
    }

    #[cfg(feature = "encryption")]
    pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
        self.0.with_file_decryptor(file_decryptor);
        self
    }

    pub fn build(self) -> ParquetMetaData {
        let Self(metadata) = self;
        metadata
    }
}

impl From<ParquetMetaData> for ParquetMetaDataBuilder {
    fn from(meta_data: ParquetMetaData) -> Self {
        Self(meta_data)
    }
}

thrift_struct!(
pub struct KeyValue {
  1: required string key
  2: optional string value
}
);

impl KeyValue {
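    /// Creates a new [`KeyValue`] pair.
    ///
    /// A minimal sketch; any value convertible into `Option<String>` is accepted
    /// (the key and value below are illustrative only):
    /// ```
    /// # use parquet::file::metadata::KeyValue;
    /// let with_value = KeyValue::new("writer".to_string(), "parquet-rs".to_string());
    /// let without_value = KeyValue::new("writer".to_string(), None::<String>);
    /// ```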
    pub fn new<F2>(key: String, value: F2) -> KeyValue
    where
        F2: Into<Option<String>>,
    {
        KeyValue {
            key,
            value: value.into(),
        }
    }
}

thrift_struct!(
pub struct PageEncodingStats {
  1: required PageType page_type;
  2: required Encoding encoding;
  3: required i32 count;
}
);

pub type FileMetaDataPtr = Arc<FileMetaData>;

#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    version: i32,
    num_rows: i64,
    created_by: Option<String>,
    key_value_metadata: Option<Vec<KeyValue>>,
    schema_descr: SchemaDescPtr,
    column_orders: Option<Vec<ColumnOrder>>,
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}

impl FileMetaData {
    pub fn new(
        version: i32,
        num_rows: i64,
        created_by: Option<String>,
        key_value_metadata: Option<Vec<KeyValue>>,
        schema_descr: SchemaDescPtr,
        column_orders: Option<Vec<ColumnOrder>>,
    ) -> Self {
        FileMetaData {
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr,
            column_orders,
            #[cfg(feature = "encryption")]
            encryption_algorithm: None,
            #[cfg(feature = "encryption")]
            footer_signing_key_metadata: None,
        }
    }

    #[cfg(feature = "encryption")]
    pub(crate) fn with_encryption_algorithm(
        mut self,
        encryption_algorithm: Option<EncryptionAlgorithm>,
    ) -> Self {
        self.encryption_algorithm = encryption_algorithm.map(Box::new);
        self
    }

    #[cfg(feature = "encryption")]
    pub(crate) fn with_footer_signing_key_metadata(
        mut self,
        footer_signing_key_metadata: Option<Vec<u8>>,
    ) -> Self {
        self.footer_signing_key_metadata = footer_signing_key_metadata;
        self
    }

    pub fn version(&self) -> i32 {
        self.version
    }

    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    pub fn created_by(&self) -> Option<&str> {
        self.created_by.as_deref()
    }

    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    pub fn schema(&self) -> &SchemaType {
        self.schema_descr.root_schema()
    }

    pub fn schema_descr(&self) -> &SchemaDescriptor {
        &self.schema_descr
    }

    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
        self.column_orders.as_ref()
    }

    pub fn column_order(&self, i: usize) -> ColumnOrder {
        self.column_orders
            .as_ref()
            .map(|data| data[i])
            .unwrap_or(ColumnOrder::UNDEFINED)
    }
}

thrift_struct!(
pub struct SortingColumn {
  1: required i32 column_idx

  2: required bool descending

  3: required bool nulls_first
}
);

pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    columns: Vec<ColumnChunkMetaData>,
    num_rows: i64,
    sorting_columns: Option<Vec<SortingColumn>>,
    total_byte_size: i64,
    schema_descr: SchemaDescPtr,
    file_offset: Option<i64>,
    ordinal: Option<i16>,
}

impl RowGroupMetaData {
    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder::new(schema_descr)
    }

    pub fn num_columns(&self) -> usize {
        self.columns.len()
    }

    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
        &self.columns[i]
    }

    pub fn columns(&self) -> &[ColumnChunkMetaData] {
        &self.columns
    }

    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
        &mut self.columns
    }

    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    pub fn total_byte_size(&self) -> i64 {
        self.total_byte_size
    }

    pub fn compressed_size(&self) -> i64 {
        self.columns.iter().map(|c| c.total_compressed_size).sum()
    }

    pub fn schema_descr(&self) -> &SchemaDescriptor {
        self.schema_descr.as_ref()
    }

    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    #[inline(always)]
    pub fn ordinal(&self) -> Option<i16> {
        self.ordinal
    }

    #[inline(always)]
    pub fn file_offset(&self) -> Option<i64> {
        self.file_offset
    }

    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder(self)
    }
}

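/// A builder for [`RowGroupMetaData`].
///
/// A minimal sketch, assuming a schema with a single INT32 column named "a"
/// (the hidden lines build that schema):
/// ```
/// # use std::sync::Arc;
/// # use parquet::basic::Type;
/// # use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData};
/// # use parquet::schema::types::{SchemaDescriptor, Type as SchemaType};
/// # let field = Arc::new(
/// #     SchemaType::primitive_type_builder("a", Type::INT32)
/// #         .build()
/// #         .unwrap(),
/// # );
/// # let schema = SchemaType::group_type_builder("schema")
/// #     .with_fields(vec![field])
/// #     .build()
/// #     .unwrap();
/// # let schema_descr = Arc::new(SchemaDescriptor::new(Arc::new(schema)));
/// let column = ColumnChunkMetaData::builder(schema_descr.column(0))
///     .build()
///     .unwrap();
/// let row_group = RowGroupMetaData::builder(schema_descr.clone())
///     .set_num_rows(100)
///     .set_column_metadata(vec![column])
///     .build()
///     .unwrap();
/// assert_eq!(row_group.num_rows(), 100);
/// ```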
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);

impl RowGroupMetaDataBuilder {
    fn new(schema_descr: SchemaDescPtr) -> Self {
        Self(RowGroupMetaData {
            columns: Vec::with_capacity(schema_descr.num_columns()),
            schema_descr,
            file_offset: None,
            num_rows: 0,
            sorting_columns: None,
            total_byte_size: 0,
            ordinal: None,
        })
    }

    pub fn set_num_rows(mut self, value: i64) -> Self {
        self.0.num_rows = value;
        self
    }

    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
        self.0.sorting_columns = value;
        self
    }

    pub fn set_total_byte_size(mut self, value: i64) -> Self {
        self.0.total_byte_size = value;
        self
    }

    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
        std::mem::take(&mut self.0.columns)
    }

    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
        self.0.columns = value;
        self
    }

    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
        self.0.columns.push(value);
        self
    }

    pub fn set_ordinal(mut self, value: i16) -> Self {
        self.0.ordinal = Some(value);
        self
    }

    pub fn set_file_offset(mut self, value: i64) -> Self {
        self.0.file_offset = Some(value);
        self
    }

    pub fn build(self) -> Result<RowGroupMetaData> {
        if self.0.schema_descr.num_columns() != self.0.columns.len() {
            return Err(general_err!(
                "Column length mismatch: {} != {}",
                self.0.schema_descr.num_columns(),
                self.0.columns.len()
            ));
        }

        Ok(self.0)
    }

    pub(super) fn build_unchecked(self) -> RowGroupMetaData {
        self.0
    }
}

#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    column_descr: ColumnDescPtr,
    encodings: EncodingMask,
    file_path: Option<String>,
    file_offset: i64,
    num_values: i64,
    compression: Compression,
    total_compressed_size: i64,
    total_uncompressed_size: i64,
    data_page_offset: i64,
    index_page_offset: Option<i64>,
    dictionary_page_offset: Option<i64>,
    statistics: Option<Statistics>,
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    encoding_stats: Option<Vec<PageEncodingStats>>,
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    unencoded_byte_array_data_bytes: Option<i64>,
    repetition_level_histogram: Option<LevelHistogram>,
    definition_level_histogram: Option<LevelHistogram>,
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
}

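/// A histogram of repetition or definition levels, with one bucket per level.
///
/// A minimal sketch, assuming a maximum level of 2 (so three buckets):
/// ```
/// # use parquet::file::metadata::LevelHistogram;
/// let mut hist = LevelHistogram::try_new(2).expect("max_level must be > 0");
/// hist.update_from_levels(&[0, 1, 2, 2]);
/// assert_eq!(hist.values(), &[1, 1, 2]);
/// ```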
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level > 0 {
            Some(Self {
                inner: vec![0; max_level as usize + 1],
            })
        } else {
            None
        }
    }

    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    pub fn len(&self) -> usize {
        self.inner.len()
    }

    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    pub fn reset(&mut self) {
        for value in self.inner.iter_mut() {
            *value = 0;
        }
    }

    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            self.inner[level as usize] += 1;
        }
    }
}

impl From<Vec<i64>> for LevelHistogram {
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}

impl From<LevelHistogram> for Vec<i64> {
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}

impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        self.inner.heap_size()
    }
}

impl ColumnChunkMetaData {
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
        self.encodings.encodings()
    }

    pub fn encodings_mask(&self) -> &EncodingMask {
        &self.encodings
    }

    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    pub fn compression(&self) -> Compression {
        self.compression
    }

    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_deref()
    }

    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}

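/// A builder for [`ColumnChunkMetaData`].
///
/// A minimal sketch, assuming a single INT32 column descriptor (built in the
/// hidden lines):
/// ```
/// # use std::sync::Arc;
/// # use parquet::basic::{Compression, Type};
/// # use parquet::file::metadata::ColumnChunkMetaData;
/// # use parquet::schema::types::{SchemaDescriptor, Type as SchemaType};
/// # let field = Arc::new(
/// #     SchemaType::primitive_type_builder("a", Type::INT32)
/// #         .build()
/// #         .unwrap(),
/// # );
/// # let schema = SchemaType::group_type_builder("schema")
/// #     .with_fields(vec![field])
/// #     .build()
/// #     .unwrap();
/// # let schema_descr = Arc::new(SchemaDescriptor::new(Arc::new(schema)));
/// let column = ColumnChunkMetaData::builder(schema_descr.column(0))
///     .set_compression(Compression::SNAPPY)
///     .set_num_values(100)
///     .set_total_compressed_size(512)
///     .set_total_uncompressed_size(1024)
///     .build()
///     .unwrap();
/// assert_eq!(column.compressed_size(), 512);
/// assert_eq!(column.uncompressed_size(), 1024);
/// ```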
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);

impl ColumnChunkMetaDataBuilder {
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Default::default(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
            #[cfg(feature = "encryption")]
            encrypted_column_metadata: None,
        })
    }

    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
        self
    }

    pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
        self.0.encodings = encodings;
        self
    }

    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(value);
        self
    }

    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    #[cfg(feature = "encryption")]
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value.map(Box::new);
        self
    }

    #[cfg(feature = "encryption")]
    pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
        self.0.encrypted_column_metadata = value;
        self
    }

    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}

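/// A builder for the per-page column index (min/max statistics) of a column chunk.
///
/// A minimal sketch recording two INT32 pages; the min/max bytes are assumed to
/// be plain-encoded values (4 little-endian bytes for INT32):
/// ```
/// # use parquet::basic::{BoundaryOrder, Type};
/// # use parquet::file::metadata::ColumnIndexBuilder;
/// let mut builder = ColumnIndexBuilder::new(Type::INT32);
/// builder.append(false, 1_i32.to_le_bytes().to_vec(), 10_i32.to_le_bytes().to_vec(), 0);
/// builder.append(false, 11_i32.to_le_bytes().to_vec(), 20_i32.to_le_bytes().to_vec(), 2);
/// builder.set_boundary_order(BoundaryOrder::ASCENDING);
/// let _column_index = builder.build().expect("a valid INT32 column index");
/// ```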
pub struct ColumnIndexBuilder {
    column_type: Type,
    null_pages: Vec<bool>,
    min_values: Vec<Vec<u8>>,
    max_values: Vec<Vec<u8>>,
    null_counts: Vec<i64>,
    boundary_order: BoundaryOrder,
    repetition_level_histograms: Option<Vec<i64>>,
    definition_level_histograms: Option<Vec<i64>>,
    valid: bool,
}

impl ColumnIndexBuilder {
    pub fn new(column_type: Type) -> Self {
        ColumnIndexBuilder {
            column_type,
            null_pages: Vec::new(),
            min_values: Vec::new(),
            max_values: Vec::new(),
            null_counts: Vec::new(),
            boundary_order: BoundaryOrder::UNORDERED,
            repetition_level_histograms: None,
            definition_level_histograms: None,
            valid: true,
        }
    }

    pub fn append(
        &mut self,
        null_page: bool,
        min_value: Vec<u8>,
        max_value: Vec<u8>,
        null_count: i64,
    ) {
        self.null_pages.push(null_page);
        self.min_values.push(min_value);
        self.max_values.push(max_value);
        self.null_counts.push(null_count);
    }

    pub fn append_histograms(
        &mut self,
        repetition_level_histogram: &Option<LevelHistogram>,
        definition_level_histogram: &Option<LevelHistogram>,
    ) {
        if !self.valid {
            return;
        }
        if let Some(rep_lvl_hist) = repetition_level_histogram {
            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(rep_lvl_hist.len());
            hist.extend(rep_lvl_hist.values());
        }
        if let Some(def_lvl_hist) = definition_level_histogram {
            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(def_lvl_hist.len());
            hist.extend(def_lvl_hist.values());
        }
    }

    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
        self.boundary_order = boundary_order;
    }

    pub fn to_invalid(&mut self) {
        self.valid = false;
    }

    pub fn valid(&self) -> bool {
        self.valid
    }

    pub fn build(self) -> Result<ColumnIndexMetaData> {
        Ok(match self.column_type {
            Type::BOOLEAN => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::BOOLEAN(index)
            }
            Type::INT32 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT32(index)
            }
            Type::INT64 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT64(index)
            }
            Type::INT96 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT96(index)
            }
            Type::FLOAT => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::FLOAT(index)
            }
            Type::DOUBLE => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::DOUBLE(index)
            }
            Type::BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::BYTE_ARRAY(index)
            }
            Type::FIXED_LEN_BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index)
            }
        })
    }

    fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
    where
        T: ParquetValueType,
    {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();

        PrimitiveColumnIndex::try_new(
            self.null_pages,
            self.boundary_order,
            Some(self.null_counts),
            self.repetition_level_histograms,
            self.definition_level_histograms,
            min_values,
            max_values,
        )
    }

    fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();

        ByteArrayColumnIndex::try_new(
            self.null_pages,
            self.boundary_order,
            Some(self.null_counts),
            self.repetition_level_histograms,
            self.definition_level_histograms,
            min_values,
            max_values,
        )
    }
}

impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
    fn from(value: ColumnChunkMetaData) -> Self {
        ColumnChunkMetaDataBuilder(value)
    }
}

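/// A builder for the offset index (page locations) of a column chunk.
///
/// A minimal sketch recording two data pages:
/// ```
/// # use parquet::file::metadata::OffsetIndexBuilder;
/// let mut builder = OffsetIndexBuilder::new();
/// // first data page: starts at byte offset 4, is 100 bytes long, holds 10 rows
/// builder.append_offset_and_size(4, 100);
/// builder.append_row_count(10);
/// // second data page follows immediately after the first
/// builder.append_offset_and_size(104, 100);
/// builder.append_row_count(10);
/// // the result contains one page location per appended page
/// let _offset_index = builder.build();
/// ```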
pub struct OffsetIndexBuilder {
    offset_array: Vec<i64>,
    compressed_page_size_array: Vec<i32>,
    first_row_index_array: Vec<i64>,
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    current_first_row_index: i64,
}

impl Default for OffsetIndexBuilder {
    fn default() -> Self {
        Self::new()
    }
}

impl OffsetIndexBuilder {
    pub fn new() -> Self {
        OffsetIndexBuilder {
            offset_array: Vec::new(),
            compressed_page_size_array: Vec::new(),
            first_row_index_array: Vec::new(),
            unencoded_byte_array_data_bytes_array: None,
            current_first_row_index: 0,
        }
    }

    pub fn append_row_count(&mut self, row_count: i64) {
        let current_page_row_index = self.current_first_row_index;
        self.first_row_index_array.push(current_page_row_index);
        self.current_first_row_index += row_count;
    }

    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
        self.offset_array.push(offset);
        self.compressed_page_size_array.push(compressed_page_size);
    }

    pub fn append_unencoded_byte_array_data_bytes(
        &mut self,
        unencoded_byte_array_data_bytes: Option<i64>,
    ) {
        if let Some(val) = unencoded_byte_array_data_bytes {
            self.unencoded_byte_array_data_bytes_array
                .get_or_insert(Vec::new())
                .push(val);
        }
    }

    pub fn build(self) -> OffsetIndexMetaData {
        let locations = self
            .offset_array
            .iter()
            .zip(self.compressed_page_size_array.iter())
            .zip(self.first_row_index_array.iter())
            .map(|((offset, size), row_index)| PageLocation {
                offset: *offset,
                compressed_page_size: *size,
                first_row_index: *row_index,
            })
            .collect::<Vec<_>>();
        OffsetIndexMetaData {
            page_locations: locations,
            unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes_array,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::metadata::thrift::tests::{read_column_chunk, read_row_group};

    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta.write_thrift(&mut writer).unwrap();

        let row_group_res = read_row_group(&mut buf, schema_descr).unwrap();

        assert_eq!(row_group_res, row_group_meta);
    }

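    // A small round-trip sketch: ParquetMetaData -> ParquetMetaDataBuilder ->
    // ParquetMetaData, reusing the helper schema used by the other tests.
    #[test]
    fn test_parquet_metadata_builder_round_trip() {
        let schema_descr = get_test_schema_descr();
        let columns: Vec<_> = schema_descr
            .columns()
            .iter()
            .map(|c| ColumnChunkMetaData::builder(c.clone()).build().unwrap())
            .collect();
        let row_group = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(100)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let file_metadata = FileMetaData::new(1, 100, None, None, schema_descr, None);

        let metadata = ParquetMetaData::new(file_metadata, vec![row_group]);
        let rebuilt = metadata.clone().into_builder().build();
        assert_eq!(metadata, rebuilt);
    }
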
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }

    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta_2cols.write_thrift(&mut writer).unwrap();

        let err = read_row_group(&mut buf, schema_descr_3cols)
            .unwrap_err()
            .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);
        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        col_metadata.write_thrift(&mut writer).unwrap();
        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        col_metadata.write_thrift(&mut writer).unwrap();
        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    #[test]
    fn test_compressed_size() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for column_descr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(column_descr.clone())
                .set_total_compressed_size(500)
                .set_total_uncompressed_size(700)
                .build()
                .unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();

        let compressed_size_res: i64 = row_group_meta.compressed_size();
        let compressed_size_exp: i64 = 1000;

        assert_eq!(compressed_size_res, compressed_size_exp);
    }

    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2766;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2934;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build().unwrap();
        let native_index = match column_index {
            ColumnIndexMetaData::BOOLEAN(index) => index,
            _ => panic!("wrong type of column index"),
        };

        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![offset_index]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 3192;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3360;

        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    #[test]
    #[cfg(feature = "encryption")]
    fn test_memory_size_with_decryptor() {
        use crate::encryption::decrypt::FileDecryptionProperties;
        use crate::file::metadata::thrift::encryption::AesGcmV1;

        let schema_descr = get_test_schema_descr();

        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let aad_file_unique = vec![1u8; 8];
        let aad_prefix = vec![2u8; 8];
        let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
            aad_prefix: Some(aad_prefix.clone()),
            aad_file_unique: Some(aad_file_unique.clone()),
            supply_aad_prefix: Some(true),
        });
        let footer_key_metadata = Some(vec![3u8; 8]);
        let file_metadata =
            FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None)
                .with_encryption_algorithm(Some(encryption_algorithm))
                .with_footer_signing_key_metadata(footer_key_metadata.clone());

        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .build();

        let base_expected_size = 2058;
        assert_eq!(parquet_meta_data.memory_size(), base_expected_size);

        let footer_key = "0123456789012345".as_bytes();
        let column_key = "1234567890123450".as_bytes();
        let mut decryption_properties_builder =
            FileDecryptionProperties::builder(footer_key.to_vec())
                .with_aad_prefix(aad_prefix.clone());
        for column in schema_descr.columns() {
            decryption_properties_builder = decryption_properties_builder
                .with_column_key(&column.path().string(), column_key.to_vec());
        }
        let decryption_properties = decryption_properties_builder.build().unwrap();
        let decryptor = FileDecryptor::new(
            &decryption_properties,
            footer_key_metadata.as_deref(),
            aad_file_unique,
            aad_prefix,
        )
        .unwrap();

        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .set_file_decryptor(Some(decryptor))
            .build();

        let expected_size_with_decryptor = 3072;
        assert!(expected_size_with_decryptor > base_expected_size);

        assert_eq!(
            parquet_meta_data.memory_size(),
            expected_size_with_decryptor
        );
    }

    fn get_test_schema_descr() -> SchemaDescPtr {
        let schema = SchemaType::group_type_builder("schema")
            .with_fields(vec![
                Arc::new(
                    SchemaType::primitive_type_builder("a", Type::INT32)
                        .build()
                        .unwrap(),
                ),
                Arc::new(
                    SchemaType::primitive_type_builder("b", Type::INT32)
                        .build()
                        .unwrap(),
                ),
            ])
            .build()
            .unwrap();

        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
    }
}