1mod footer_tail;
90mod memory;
91mod options;
92mod parser;
93mod push_decoder;
94pub(crate) mod reader;
95pub(crate) mod thrift;
96mod writer;
97
98use crate::basic::{EncodingMask, PageType};
99#[cfg(feature = "encryption")]
100use crate::encryption::decrypt::FileDecryptor;
101#[cfg(feature = "encryption")]
102use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
103pub(crate) use crate::file::metadata::memory::HeapSize;
104#[cfg(feature = "encryption")]
105use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
106use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
107use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
108use crate::file::statistics::Statistics;
109use crate::geospatial::statistics as geo_statistics;
110use crate::schema::types::{
111 ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
112 Type as SchemaType,
113};
114use crate::thrift_struct;
115use crate::{
116 basic::BoundaryOrder,
117 errors::{ParquetError, Result},
118};
119use crate::{
120 basic::{ColumnOrder, Compression, Encoding, Type},
121 parquet_thrift::{
122 ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
123 ThriftCompactOutputProtocol, WriteThrift, WriteThriftField,
124 },
125};
126use crate::{
127 data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
128};
129
130pub use footer_tail::FooterTail;
131pub use options::{ParquetMetaDataOptions, ParquetStatisticsPolicy};
132pub use push_decoder::ParquetMetaDataPushDecoder;
133pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
134use std::io::Write;
135use std::ops::Range;
136use std::sync::Arc;
137pub use writer::ParquetMetaDataWriter;
138pub(crate) use writer::ThriftMetadataWriter;
139
/// Column index metadata for a whole file, stored as a nested vector of
/// [`ColumnIndexMetaData`].
///
/// NOTE(review): presumably the outer vector is per row group and the inner
/// vector per column chunk — confirm against the reader that populates it.
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;
157
/// Offset index metadata for a whole file, stored as a nested vector of
/// [`OffsetIndexMetaData`].
///
/// NOTE(review): presumably the outer vector is per row group and the inner
/// vector per column chunk — confirm against the reader that populates it.
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
170
/// Metadata for a Parquet file: the file-level metadata, the metadata of
/// every row group, and the optional column/offset page indexes.
///
/// Instances are created with [`ParquetMetaData::new`] or through
/// [`ParquetMetaDataBuilder`].
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata.
    file_metadata: FileMetaData,
    /// Metadata of each row group.
    row_groups: Vec<RowGroupMetaData>,
    /// Column index, if one has been set.
    column_index: Option<ParquetColumnIndex>,
    /// Offset index, if one has been set.
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional file decryptor (only present with the `encryption` feature).
    #[cfg(feature = "encryption")]
    file_decryptor: Option<Box<FileDecryptor>>,
}
202
203impl ParquetMetaData {
204 pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
207 ParquetMetaData {
208 file_metadata,
209 row_groups,
210 column_index: None,
211 offset_index: None,
212 #[cfg(feature = "encryption")]
213 file_decryptor: None,
214 }
215 }
216
217 #[cfg(feature = "encryption")]
220 pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
221 self.file_decryptor = file_decryptor.map(Box::new);
222 }
223
224 pub fn into_builder(self) -> ParquetMetaDataBuilder {
226 self.into()
227 }
228
229 pub fn file_metadata(&self) -> &FileMetaData {
231 &self.file_metadata
232 }
233
234 #[cfg(feature = "encryption")]
236 pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
237 self.file_decryptor.as_deref()
238 }
239
240 pub fn num_row_groups(&self) -> usize {
242 self.row_groups.len()
243 }
244
245 pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
248 &self.row_groups[i]
249 }
250
251 pub fn row_groups(&self) -> &[RowGroupMetaData] {
253 &self.row_groups
254 }
255
256 pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
263 self.column_index.as_ref()
264 }
265
266 pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
273 self.offset_index.as_ref()
274 }
275
276 pub fn memory_size(&self) -> usize {
291 #[cfg(feature = "encryption")]
292 let encryption_size = self.file_decryptor.heap_size();
293 #[cfg(not(feature = "encryption"))]
294 let encryption_size = 0usize;
295
296 std::mem::size_of::<Self>()
297 + self.file_metadata.heap_size()
298 + self.row_groups.heap_size()
299 + self.column_index.heap_size()
300 + self.offset_index.heap_size()
301 + encryption_size
302 }
303
304 pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
306 self.column_index = index;
307 }
308
309 pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
311 self.offset_index = index;
312 }
313}
314
/// Builder for [`ParquetMetaData`].
///
/// Wraps the [`ParquetMetaData`] under construction; the setters consume and
/// return the builder, and [`ParquetMetaDataBuilder::build`] yields the
/// wrapped value.
pub struct ParquetMetaDataBuilder(ParquetMetaData);
353
354impl ParquetMetaDataBuilder {
355 pub fn new(file_meta_data: FileMetaData) -> Self {
357 Self(ParquetMetaData::new(file_meta_data, vec![]))
358 }
359
360 pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
362 Self(metadata)
363 }
364
365 pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
367 self.0.row_groups.push(row_group);
368 self
369 }
370
371 pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
373 self.0.row_groups = row_groups;
374 self
375 }
376
377 pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
383 std::mem::take(&mut self.0.row_groups)
384 }
385
386 pub fn row_groups(&self) -> &[RowGroupMetaData] {
388 &self.0.row_groups
389 }
390
391 pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
393 self.0.column_index = column_index;
394 self
395 }
396
397 pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
399 std::mem::take(&mut self.0.column_index)
400 }
401
402 pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
404 self.0.column_index.as_ref()
405 }
406
407 pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
409 self.0.offset_index = offset_index;
410 self
411 }
412
413 pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
415 std::mem::take(&mut self.0.offset_index)
416 }
417
418 pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
420 self.0.offset_index.as_ref()
421 }
422
423 #[cfg(feature = "encryption")]
425 pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
426 self.0.with_file_decryptor(file_decryptor);
427 self
428 }
429
430 pub fn build(self) -> ParquetMetaData {
432 let Self(metadata) = self;
433 metadata
434 }
435}
436
437impl From<ParquetMetaData> for ParquetMetaDataBuilder {
438 fn from(meta_data: ParquetMetaData) -> Self {
439 Self(meta_data)
440 }
441}
442
thrift_struct!(
// Key/value pair, declared through the crate's `thrift_struct!` macro.
// Plain `//` comments are used here because how the macro treats doc
// comments is not visible from this section.
pub struct KeyValue {
  // Thrift field 1: required string.
  1: required string key
  // Thrift field 2: optional string.
  2: optional string value
}
);
450
451impl KeyValue {
452 pub fn new<F2>(key: String, value: F2) -> KeyValue
454 where
455 F2: Into<Option<String>>,
456 {
457 KeyValue {
458 key,
459 value: value.into(),
460 }
461 }
462}
463
thrift_struct!(
// Per-page-type encoding statistics, declared through the crate's
// `thrift_struct!` macro.
pub struct PageEncodingStats {
  // Thrift field 1: the page type.
  1: required PageType page_type;
  // Thrift field 2: the encoding.
  2: required Encoding encoding;
  // Thrift field 3: a count.
  // NOTE(review): presumably the number of pages with this type and encoding — confirm.
  3: required i32 count;
}
);
472
/// Internal representation of the page encoding statistics of a column
/// chunk: either the full list or a mask.
#[derive(Debug, Clone, PartialEq)]
enum ParquetPageEncodingStats {
    /// The full list of [`PageEncodingStats`].
    Full(Vec<PageEncodingStats>),
    /// An [`EncodingMask`] stored in place of the full statistics.
    Mask(EncodingMask),
}
482
/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
485
/// File level metadata for a Parquet file.
///
/// Created with [`FileMetaData::new`]; the fields are read through the
/// accessor methods.
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Version of the file.
    version: i32,
    /// Number of rows in the file.
    num_rows: i64,
    /// Optional "created by" string.
    created_by: Option<String>,
    /// Optional key/value metadata.
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Schema descriptor for the file.
    schema_descr: SchemaDescPtr,
    /// Optional column orders, read by index in `column_order`.
    column_orders: Option<Vec<ColumnOrder>>,
    /// Optional encryption algorithm (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    /// Optional footer signing key metadata (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}
502
503impl FileMetaData {
504 pub fn new(
506 version: i32,
507 num_rows: i64,
508 created_by: Option<String>,
509 key_value_metadata: Option<Vec<KeyValue>>,
510 schema_descr: SchemaDescPtr,
511 column_orders: Option<Vec<ColumnOrder>>,
512 ) -> Self {
513 FileMetaData {
514 version,
515 num_rows,
516 created_by,
517 key_value_metadata,
518 schema_descr,
519 column_orders,
520 #[cfg(feature = "encryption")]
521 encryption_algorithm: None,
522 #[cfg(feature = "encryption")]
523 footer_signing_key_metadata: None,
524 }
525 }
526
527 #[cfg(feature = "encryption")]
528 pub(crate) fn with_encryption_algorithm(
529 mut self,
530 encryption_algorithm: Option<EncryptionAlgorithm>,
531 ) -> Self {
532 self.encryption_algorithm = encryption_algorithm.map(Box::new);
533 self
534 }
535
536 #[cfg(feature = "encryption")]
537 pub(crate) fn with_footer_signing_key_metadata(
538 mut self,
539 footer_signing_key_metadata: Option<Vec<u8>>,
540 ) -> Self {
541 self.footer_signing_key_metadata = footer_signing_key_metadata;
542 self
543 }
544
545 pub fn version(&self) -> i32 {
547 self.version
548 }
549
550 pub fn num_rows(&self) -> i64 {
552 self.num_rows
553 }
554
555 pub fn created_by(&self) -> Option<&str> {
564 self.created_by.as_deref()
565 }
566
567 pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
569 self.key_value_metadata.as_ref()
570 }
571
572 pub fn schema(&self) -> &SchemaType {
576 self.schema_descr.root_schema()
577 }
578
579 pub fn schema_descr(&self) -> &SchemaDescriptor {
581 &self.schema_descr
582 }
583
584 pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
586 self.schema_descr.clone()
587 }
588
589 pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
597 self.column_orders.as_ref()
598 }
599
600 pub fn column_order(&self, i: usize) -> ColumnOrder {
603 self.column_orders
604 .as_ref()
605 .map(|data| data[i])
606 .unwrap_or(ColumnOrder::UNDEFINED)
607 }
608}
609
thrift_struct!(
// Sort specification for one column, declared through the crate's
// `thrift_struct!` macro.
pub struct SortingColumn {
  // Thrift field 1: index of the column.
  // NOTE(review): presumably an index into the row group's columns — confirm.
  1: required i32 column_idx

  // Thrift field 2: whether the sort is descending.
  2: required bool descending

  // Thrift field 3: whether nulls come first.
  3: required bool nulls_first
}
);
624
/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
627
/// Metadata for a row group.
///
/// Built through [`RowGroupMetaDataBuilder`], obtained from
/// [`RowGroupMetaData::builder`].
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata of each column chunk in this row group.
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group.
    num_rows: i64,
    /// Optional sorting columns.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total byte size of this row group.
    total_byte_size: i64,
    /// Schema descriptor; `build` checks the column count against it.
    schema_descr: SchemaDescPtr,
    /// Optional file offset of this row group.
    file_offset: Option<i64>,
    /// Optional ordinal of this row group.
    /// NOTE(review): presumably its position within the file — confirm.
    ordinal: Option<i16>,
}
644
645impl RowGroupMetaData {
646 pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
648 RowGroupMetaDataBuilder::new(schema_descr)
649 }
650
651 pub fn num_columns(&self) -> usize {
653 self.columns.len()
654 }
655
656 pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
658 &self.columns[i]
659 }
660
661 pub fn columns(&self) -> &[ColumnChunkMetaData] {
663 &self.columns
664 }
665
666 pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
668 &mut self.columns
669 }
670
671 pub fn num_rows(&self) -> i64 {
673 self.num_rows
674 }
675
676 pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
678 self.sorting_columns.as_ref()
679 }
680
681 pub fn total_byte_size(&self) -> i64 {
683 self.total_byte_size
684 }
685
686 pub fn compressed_size(&self) -> i64 {
688 self.columns.iter().map(|c| c.total_compressed_size).sum()
689 }
690
691 pub fn schema_descr(&self) -> &SchemaDescriptor {
693 self.schema_descr.as_ref()
694 }
695
696 pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
698 self.schema_descr.clone()
699 }
700
701 #[inline(always)]
706 pub fn ordinal(&self) -> Option<i16> {
707 self.ordinal
708 }
709
710 #[inline(always)]
712 pub fn file_offset(&self) -> Option<i64> {
713 self.file_offset
714 }
715
716 pub fn into_builder(self) -> RowGroupMetaDataBuilder {
718 RowGroupMetaDataBuilder(self)
719 }
720}
721
/// Builder for [`RowGroupMetaData`].
///
/// Wraps the metadata under construction; `build` checks the column count
/// against the schema before returning it.
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
724
725impl RowGroupMetaDataBuilder {
726 fn new(schema_descr: SchemaDescPtr) -> Self {
728 Self(RowGroupMetaData {
729 columns: Vec::with_capacity(schema_descr.num_columns()),
730 schema_descr,
731 file_offset: None,
732 num_rows: 0,
733 sorting_columns: None,
734 total_byte_size: 0,
735 ordinal: None,
736 })
737 }
738
739 pub fn set_num_rows(mut self, value: i64) -> Self {
741 self.0.num_rows = value;
742 self
743 }
744
745 pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
747 self.0.sorting_columns = value;
748 self
749 }
750
751 pub fn set_total_byte_size(mut self, value: i64) -> Self {
753 self.0.total_byte_size = value;
754 self
755 }
756
757 pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
763 std::mem::take(&mut self.0.columns)
764 }
765
766 pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
768 self.0.columns = value;
769 self
770 }
771
772 pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
774 self.0.columns.push(value);
775 self
776 }
777
778 pub fn set_ordinal(mut self, value: i16) -> Self {
780 self.0.ordinal = Some(value);
781 self
782 }
783
784 pub fn set_file_offset(mut self, value: i64) -> Self {
786 self.0.file_offset = Some(value);
787 self
788 }
789
790 pub fn build(self) -> Result<RowGroupMetaData> {
792 if self.0.schema_descr.num_columns() != self.0.columns.len() {
793 return Err(general_err!(
794 "Column length mismatch: {} != {}",
795 self.0.schema_descr.num_columns(),
796 self.0.columns.len()
797 ));
798 }
799
800 Ok(self.0)
801 }
802
803 pub(super) fn build_unchecked(self) -> RowGroupMetaData {
805 self.0
806 }
807}
808
/// Metadata for a column chunk.
///
/// Built through [`ColumnChunkMetaDataBuilder`]; the fields are read through
/// the accessor methods.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor of the column this chunk belongs to.
    column_descr: ColumnDescPtr,
    /// Mask of encodings for this column chunk.
    encodings: EncodingMask,
    /// Optional file path for this column chunk.
    file_path: Option<String>,
    /// Stored file offset of this column chunk.
    file_offset: i64,
    /// Number of values in this column chunk.
    num_values: i64,
    /// Compression of this column chunk.
    compression: Compression,
    /// Total compressed size; used as the length in `byte_range`.
    total_compressed_size: i64,
    /// Total uncompressed size.
    total_uncompressed_size: i64,
    /// Offset of the data page; the start in `byte_range` when there is no
    /// dictionary page offset.
    data_page_offset: i64,
    /// Optional offset of the index page.
    index_page_offset: Option<i64>,
    /// Optional offset of the dictionary page; the start in `byte_range`
    /// when present.
    dictionary_page_offset: Option<i64>,
    /// Optional statistics.
    statistics: Option<Statistics>,
    /// Optional geospatial statistics.
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    /// Optional page encoding statistics, as a full list or a mask.
    encoding_stats: Option<ParquetPageEncodingStats>,
    /// Optional bloom filter offset and length.
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    /// Optional offset index offset and length (see `offset_index_range`).
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    /// Optional column index offset and length (see `column_index_range`).
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    /// Optional count of unencoded byte array data bytes.
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Optional repetition level histogram.
    repetition_level_histogram: Option<LevelHistogram>,
    /// Optional definition level histogram.
    definition_level_histogram: Option<LevelHistogram>,
    /// Optional column crypto metadata (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    /// Optional encrypted column metadata bytes (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
    /// Plaintext footer flag, `false` by default in the builder.
    /// NOTE(review): its exact use is not visible in this section.
    #[cfg(feature = "encryption")]
    plaintext_footer_mode: bool,
}
845
/// Histogram of level counts, stored as one `i64` counter per level.
///
/// The counter at index `i` is incremented once for every level equal to `i`
/// passed to [`LevelHistogram::update_from_levels`].
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a zeroed histogram with `max_level + 1` counters.
    ///
    /// Returns `None` when `max_level` is zero or negative.
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level <= 0 {
            return None;
        }
        let bucket_count = max_level as usize + 1;
        Some(Self {
            inner: vec![0; bucket_count],
        })
    }

    /// Returns the counters as a slice.
    pub fn values(&self) -> &[i64] {
        self.inner.as_slice()
    }

    /// Consumes the histogram and returns the underlying vector of counters.
    pub fn into_inner(self) -> Vec<i64> {
        let Self { inner } = self;
        inner
    }

    /// Returns the counter at `index`, or `None` if `index` is out of range.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds every counter of `other` to the matching counter of `self`.
    ///
    /// # Panics
    ///
    /// Panics if the two histograms have different lengths.
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        self.inner
            .iter_mut()
            .zip(&other.inner)
            .for_each(|(total, extra)| *total += *extra);
    }

    /// Returns the number of counters in the histogram.
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns `true` if the histogram has no counters.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Sets every counter back to zero, keeping the length unchanged.
    pub fn reset(&mut self) {
        self.inner.fill(0);
    }

    /// Increments the counter of each level in `levels`.
    ///
    /// # Panics
    ///
    /// Panics if a level, cast to `usize`, is not a valid index into the
    /// histogram.
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for level in levels.iter().copied() {
            let bucket = &mut self.inner[level as usize];
            *bucket += 1;
        }
    }
}

impl From<Vec<i64>> for LevelHistogram {
    /// Wraps a vector of counters without validating it.
    fn from(inner: Vec<i64>) -> Self {
        LevelHistogram { inner }
    }
}

impl From<LevelHistogram> for Vec<i64> {
    /// Unwraps the histogram into its vector of counters.
    fn from(value: LevelHistogram) -> Self {
        value.inner
    }
}
945
impl HeapSize for LevelHistogram {
    /// Reports the heap size of the underlying vector of counters.
    fn heap_size(&self) -> usize {
        self.inner.heap_size()
    }
}
951
952impl ColumnChunkMetaData {
954 pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
956 ColumnChunkMetaDataBuilder::new(column_descr)
957 }
958
959 pub fn file_path(&self) -> Option<&str> {
964 self.file_path.as_deref()
965 }
966
967 pub fn file_offset(&self) -> i64 {
974 self.file_offset
975 }
976
977 pub fn column_type(&self) -> Type {
979 self.column_descr.physical_type()
980 }
981
982 pub fn column_path(&self) -> &ColumnPath {
984 self.column_descr.path()
985 }
986
987 pub fn column_descr(&self) -> &ColumnDescriptor {
989 self.column_descr.as_ref()
990 }
991
992 pub fn column_descr_ptr(&self) -> ColumnDescPtr {
994 self.column_descr.clone()
995 }
996
997 pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
999 self.encodings.encodings()
1000 }
1001
1002 pub fn encodings_mask(&self) -> &EncodingMask {
1004 &self.encodings
1005 }
1006
1007 pub fn num_values(&self) -> i64 {
1009 self.num_values
1010 }
1011
1012 pub fn compression(&self) -> Compression {
1014 self.compression
1015 }
1016
1017 pub fn compressed_size(&self) -> i64 {
1019 self.total_compressed_size
1020 }
1021
1022 pub fn uncompressed_size(&self) -> i64 {
1024 self.total_uncompressed_size
1025 }
1026
1027 pub fn data_page_offset(&self) -> i64 {
1029 self.data_page_offset
1030 }
1031
1032 pub fn index_page_offset(&self) -> Option<i64> {
1034 self.index_page_offset
1035 }
1036
1037 pub fn dictionary_page_offset(&self) -> Option<i64> {
1039 self.dictionary_page_offset
1040 }
1041
1042 pub fn byte_range(&self) -> (u64, u64) {
1044 let col_start = match self.dictionary_page_offset() {
1045 Some(dictionary_page_offset) => dictionary_page_offset,
1046 None => self.data_page_offset(),
1047 };
1048 let col_len = self.compressed_size();
1049 assert!(
1050 col_start >= 0 && col_len >= 0,
1051 "column start and length should not be negative"
1052 );
1053 (col_start as u64, col_len as u64)
1054 }
1055
1056 pub fn statistics(&self) -> Option<&Statistics> {
1059 self.statistics.as_ref()
1060 }
1061
1062 pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
1065 self.geo_statistics.as_deref()
1066 }
1067
1068 pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
1075 match self.encoding_stats.as_ref() {
1076 Some(ParquetPageEncodingStats::Full(stats)) => Some(stats),
1077 _ => None,
1078 }
1079 }
1080
1081 pub fn page_encoding_stats_mask(&self) -> Option<&EncodingMask> {
1111 match self.encoding_stats.as_ref() {
1112 Some(ParquetPageEncodingStats::Mask(stats)) => Some(stats),
1113 _ => None,
1114 }
1115 }
1116
1117 pub fn bloom_filter_offset(&self) -> Option<i64> {
1119 self.bloom_filter_offset
1120 }
1121
1122 pub fn bloom_filter_length(&self) -> Option<i32> {
1124 self.bloom_filter_length
1125 }
1126
1127 pub fn column_index_offset(&self) -> Option<i64> {
1129 self.column_index_offset
1130 }
1131
1132 pub fn column_index_length(&self) -> Option<i32> {
1134 self.column_index_length
1135 }
1136
1137 pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
1139 let offset = u64::try_from(self.column_index_offset?).ok()?;
1140 let length = u64::try_from(self.column_index_length?).ok()?;
1141 Some(offset..(offset + length))
1142 }
1143
1144 pub fn offset_index_offset(&self) -> Option<i64> {
1146 self.offset_index_offset
1147 }
1148
1149 pub fn offset_index_length(&self) -> Option<i32> {
1151 self.offset_index_length
1152 }
1153
1154 pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
1156 let offset = u64::try_from(self.offset_index_offset?).ok()?;
1157 let length = u64::try_from(self.offset_index_length?).ok()?;
1158 Some(offset..(offset + length))
1159 }
1160
1161 pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
1166 self.unencoded_byte_array_data_bytes
1167 }
1168
1169 pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
1175 self.repetition_level_histogram.as_ref()
1176 }
1177
1178 pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
1184 self.definition_level_histogram.as_ref()
1185 }
1186
1187 #[cfg(feature = "encryption")]
1189 pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
1190 self.column_crypto_metadata.as_deref()
1191 }
1192
1193 pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
1195 ColumnChunkMetaDataBuilder::from(self)
1196 }
1197}
1198
/// Builder for [`ColumnChunkMetaData`].
///
/// Wraps the metadata under construction; the setters consume and return the
/// builder, and `build` yields the wrapped value.
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1218
1219impl ColumnChunkMetaDataBuilder {
1220 fn new(column_descr: ColumnDescPtr) -> Self {
1224 Self(ColumnChunkMetaData {
1225 column_descr,
1226 encodings: Default::default(),
1227 file_path: None,
1228 file_offset: 0,
1229 num_values: 0,
1230 compression: Compression::UNCOMPRESSED,
1231 total_compressed_size: 0,
1232 total_uncompressed_size: 0,
1233 data_page_offset: 0,
1234 index_page_offset: None,
1235 dictionary_page_offset: None,
1236 statistics: None,
1237 geo_statistics: None,
1238 encoding_stats: None,
1239 bloom_filter_offset: None,
1240 bloom_filter_length: None,
1241 offset_index_offset: None,
1242 offset_index_length: None,
1243 column_index_offset: None,
1244 column_index_length: None,
1245 unencoded_byte_array_data_bytes: None,
1246 repetition_level_histogram: None,
1247 definition_level_histogram: None,
1248 #[cfg(feature = "encryption")]
1249 column_crypto_metadata: None,
1250 #[cfg(feature = "encryption")]
1251 encrypted_column_metadata: None,
1252 #[cfg(feature = "encryption")]
1253 plaintext_footer_mode: false,
1254 })
1255 }
1256
1257 pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
1259 self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
1260 self
1261 }
1262
1263 pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
1265 self.0.encodings = encodings;
1266 self
1267 }
1268
1269 pub fn set_file_path(mut self, value: String) -> Self {
1271 self.0.file_path = Some(value);
1272 self
1273 }
1274
1275 pub fn set_num_values(mut self, value: i64) -> Self {
1277 self.0.num_values = value;
1278 self
1279 }
1280
1281 pub fn set_compression(mut self, value: Compression) -> Self {
1283 self.0.compression = value;
1284 self
1285 }
1286
1287 pub fn set_total_compressed_size(mut self, value: i64) -> Self {
1289 self.0.total_compressed_size = value;
1290 self
1291 }
1292
1293 pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
1295 self.0.total_uncompressed_size = value;
1296 self
1297 }
1298
1299 pub fn set_data_page_offset(mut self, value: i64) -> Self {
1301 self.0.data_page_offset = value;
1302 self
1303 }
1304
1305 pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
1307 self.0.dictionary_page_offset = value;
1308 self
1309 }
1310
1311 pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
1313 self.0.index_page_offset = value;
1314 self
1315 }
1316
1317 pub fn set_statistics(mut self, value: Statistics) -> Self {
1319 self.0.statistics = Some(value);
1320 self
1321 }
1322
1323 pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
1325 self.0.geo_statistics = Some(value);
1326 self
1327 }
1328
1329 pub fn clear_statistics(mut self) -> Self {
1331 self.0.statistics = None;
1332 self
1333 }
1334
1335 pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
1339 self.0.encoding_stats = Some(ParquetPageEncodingStats::Full(value));
1340 self
1341 }
1342
1343 pub fn set_page_encoding_stats_mask(mut self, value: EncodingMask) -> Self {
1347 self.0.encoding_stats = Some(ParquetPageEncodingStats::Mask(value));
1348 self
1349 }
1350
1351 pub fn clear_page_encoding_stats(mut self) -> Self {
1353 self.0.encoding_stats = None;
1354 self
1355 }
1356
1357 pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
1359 self.0.bloom_filter_offset = value;
1360 self
1361 }
1362
1363 pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
1365 self.0.bloom_filter_length = value;
1366 self
1367 }
1368
1369 pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
1371 self.0.offset_index_offset = value;
1372 self
1373 }
1374
1375 pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
1377 self.0.offset_index_length = value;
1378 self
1379 }
1380
1381 pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
1383 self.0.column_index_offset = value;
1384 self
1385 }
1386
1387 pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
1389 self.0.column_index_length = value;
1390 self
1391 }
1392
1393 pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
1395 self.0.unencoded_byte_array_data_bytes = value;
1396 self
1397 }
1398
1399 pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1401 self.0.repetition_level_histogram = value;
1402 self
1403 }
1404
1405 pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1407 self.0.definition_level_histogram = value;
1408 self
1409 }
1410
1411 #[cfg(feature = "encryption")]
1412 pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
1414 self.0.column_crypto_metadata = value.map(Box::new);
1415 self
1416 }
1417
1418 #[cfg(feature = "encryption")]
1419 pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
1421 self.0.encrypted_column_metadata = value;
1422 self
1423 }
1424
1425 pub fn build(self) -> Result<ColumnChunkMetaData> {
1427 Ok(self.0)
1428 }
1429}
1430
/// Builder for a column index: collects per-page values and turns them into
/// [`ColumnIndexMetaData`] in `build`.
pub struct ColumnIndexBuilder {
    /// Physical type of the column; selects the index variant in `build`.
    column_type: Type,
    /// One null-page flag per appended page.
    null_pages: Vec<bool>,
    /// One minimum value (as bytes) per appended page.
    min_values: Vec<Vec<u8>>,
    /// One maximum value (as bytes) per appended page.
    max_values: Vec<Vec<u8>>,
    /// One null count per appended page.
    null_counts: Vec<i64>,
    /// Boundary order, `UNORDERED` until set.
    boundary_order: BoundaryOrder,
    /// Concatenated repetition level histogram values of all appended
    /// histograms; `None` until the first one is appended.
    repetition_level_histograms: Option<Vec<i64>>,
    /// Concatenated definition level histogram values of all appended
    /// histograms; `None` until the first one is appended.
    definition_level_histograms: Option<Vec<i64>>,
    /// Validity flag, `true` until `to_invalid` is called; when `false`,
    /// `append_histograms` does nothing.
    valid: bool,
}
1455
1456impl ColumnIndexBuilder {
1457 pub fn new(column_type: Type) -> Self {
1459 ColumnIndexBuilder {
1460 column_type,
1461 null_pages: Vec::new(),
1462 min_values: Vec::new(),
1463 max_values: Vec::new(),
1464 null_counts: Vec::new(),
1465 boundary_order: BoundaryOrder::UNORDERED,
1466 repetition_level_histograms: None,
1467 definition_level_histograms: None,
1468 valid: true,
1469 }
1470 }
1471
1472 pub fn append(
1474 &mut self,
1475 null_page: bool,
1476 min_value: Vec<u8>,
1477 max_value: Vec<u8>,
1478 null_count: i64,
1479 ) {
1480 self.null_pages.push(null_page);
1481 self.min_values.push(min_value);
1482 self.max_values.push(max_value);
1483 self.null_counts.push(null_count);
1484 }
1485
1486 pub fn append_histograms(
1491 &mut self,
1492 repetition_level_histogram: &Option<LevelHistogram>,
1493 definition_level_histogram: &Option<LevelHistogram>,
1494 ) {
1495 if !self.valid {
1496 return;
1497 }
1498 if let Some(rep_lvl_hist) = repetition_level_histogram {
1499 let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1500 hist.reserve(rep_lvl_hist.len());
1501 hist.extend(rep_lvl_hist.values());
1502 }
1503 if let Some(def_lvl_hist) = definition_level_histogram {
1504 let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1505 hist.reserve(def_lvl_hist.len());
1506 hist.extend(def_lvl_hist.values());
1507 }
1508 }
1509
1510 pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1512 self.boundary_order = boundary_order;
1513 }
1514
1515 pub fn to_invalid(&mut self) {
1517 self.valid = false;
1518 }
1519
1520 pub fn valid(&self) -> bool {
1522 self.valid
1523 }
1524
1525 pub fn build(self) -> Result<ColumnIndexMetaData> {
1529 Ok(match self.column_type {
1530 Type::BOOLEAN => {
1531 let index = self.build_page_index()?;
1532 ColumnIndexMetaData::BOOLEAN(index)
1533 }
1534 Type::INT32 => {
1535 let index = self.build_page_index()?;
1536 ColumnIndexMetaData::INT32(index)
1537 }
1538 Type::INT64 => {
1539 let index = self.build_page_index()?;
1540 ColumnIndexMetaData::INT64(index)
1541 }
1542 Type::INT96 => {
1543 let index = self.build_page_index()?;
1544 ColumnIndexMetaData::INT96(index)
1545 }
1546 Type::FLOAT => {
1547 let index = self.build_page_index()?;
1548 ColumnIndexMetaData::FLOAT(index)
1549 }
1550 Type::DOUBLE => {
1551 let index = self.build_page_index()?;
1552 ColumnIndexMetaData::DOUBLE(index)
1553 }
1554 Type::BYTE_ARRAY => {
1555 let index = self.build_byte_array_index()?;
1556 ColumnIndexMetaData::BYTE_ARRAY(index)
1557 }
1558 Type::FIXED_LEN_BYTE_ARRAY => {
1559 let index = self.build_byte_array_index()?;
1560 ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index)
1561 }
1562 })
1563 }
1564
1565 fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
1566 where
1567 T: ParquetValueType,
1568 {
1569 let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
1570 let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();
1571
1572 PrimitiveColumnIndex::try_new(
1573 self.null_pages,
1574 self.boundary_order,
1575 Some(self.null_counts),
1576 self.repetition_level_histograms,
1577 self.definition_level_histograms,
1578 min_values,
1579 max_values,
1580 )
1581 }
1582
1583 fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
1584 let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
1585 let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();
1586
1587 ByteArrayColumnIndex::try_new(
1588 self.null_pages,
1589 self.boundary_order,
1590 Some(self.null_counts),
1591 self.repetition_level_histograms,
1592 self.definition_level_histograms,
1593 min_values,
1594 max_values,
1595 )
1596 }
1597}
1598
1599impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1600 fn from(value: ColumnChunkMetaData) -> Self {
1601 ColumnChunkMetaDataBuilder(value)
1602 }
1603}
1604
/// Builder for an offset index: collects per-page locations and turns them
/// into [`OffsetIndexMetaData`] in `build`.
pub struct OffsetIndexBuilder {
    /// Offset of each appended page.
    offset_array: Vec<i64>,
    /// Compressed size of each appended page.
    compressed_page_size_array: Vec<i32>,
    /// First row index of each appended page.
    first_row_index_array: Vec<i64>,
    /// Unencoded byte array data bytes per page; `None` until the first
    /// value is appended.
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running total of appended row counts, used as the first row index of
    /// the next page.
    current_first_row_index: i64,
}
1615
1616impl Default for OffsetIndexBuilder {
1617 fn default() -> Self {
1618 Self::new()
1619 }
1620}
1621
1622impl OffsetIndexBuilder {
1623 pub fn new() -> Self {
1625 OffsetIndexBuilder {
1626 offset_array: Vec::new(),
1627 compressed_page_size_array: Vec::new(),
1628 first_row_index_array: Vec::new(),
1629 unencoded_byte_array_data_bytes_array: None,
1630 current_first_row_index: 0,
1631 }
1632 }
1633
1634 pub fn append_row_count(&mut self, row_count: i64) {
1636 let current_page_row_index = self.current_first_row_index;
1637 self.first_row_index_array.push(current_page_row_index);
1638 self.current_first_row_index += row_count;
1639 }
1640
1641 pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1643 self.offset_array.push(offset);
1644 self.compressed_page_size_array.push(compressed_page_size);
1645 }
1646
1647 pub fn append_unencoded_byte_array_data_bytes(
1649 &mut self,
1650 unencoded_byte_array_data_bytes: Option<i64>,
1651 ) {
1652 if let Some(val) = unencoded_byte_array_data_bytes {
1653 self.unencoded_byte_array_data_bytes_array
1654 .get_or_insert(Vec::new())
1655 .push(val);
1656 }
1657 }
1658
1659 pub fn build(self) -> OffsetIndexMetaData {
1661 let locations = self
1662 .offset_array
1663 .iter()
1664 .zip(self.compressed_page_size_array.iter())
1665 .zip(self.first_row_index_array.iter())
1666 .map(|((offset, size), row_index)| PageLocation {
1667 offset: *offset,
1668 compressed_page_size: *size,
1669 first_row_index: *row_index,
1670 })
1671 .collect::<Vec<_>>();
1672 OffsetIndexMetaData {
1673 page_locations: locations,
1674 unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes_array,
1675 }
1676 }
1677}
1678
1679#[cfg(test)]
1680mod tests {
1681 use super::*;
1682 use crate::basic::{PageType, SortOrder};
1683 use crate::file::metadata::thrift::tests::{
1684 read_column_chunk, read_column_chunk_with_options, read_row_group,
1685 };
1686
#[test]
fn test_row_group_metadata_thrift_conversion() {
    // Round trip: build a row group, serialize it with the Thrift compact
    // protocol, read it back and expect an equal value.
    let schema_descr = get_test_schema_descr();

    let columns: Vec<_> = schema_descr
        .columns()
        .iter()
        .map(|descr| ColumnChunkMetaData::builder(descr.clone()).build().unwrap())
        .collect();

    let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
        .set_num_rows(1000)
        .set_total_byte_size(2000)
        .set_column_metadata(columns)
        .set_ordinal(1)
        .build()
        .unwrap();

    let mut encoded = Vec::new();
    {
        // Keep the writer's borrow of `encoded` confined to this scope.
        let mut protocol = ThriftCompactOutputProtocol::new(&mut encoded);
        row_group_meta.write_thrift(&mut protocol).unwrap();
    }

    let decoded = read_row_group(&mut encoded, schema_descr).unwrap();

    assert_eq!(decoded, row_group_meta);
}
1712
#[test]
fn test_row_group_metadata_thrift_conversion_empty() {
    // The test schema has two columns; building a row group without any
    // column metadata must fail with a length-mismatch error.
    let build_result = RowGroupMetaData::builder(get_test_schema_descr()).build();

    assert!(build_result.is_err());
    let message = build_result.unwrap_err().to_string();
    assert_eq!(message, "Parquet error: Column length mismatch: 2 != 0");
}
1727
#[test]
fn test_row_group_metadata_thrift_corrupted() {
    // Builds a group schema named "schema" with one INT32 primitive field per
    // entry in `names`.
    let int32_schema = |names: &[&'static str]| -> SchemaDescPtr {
        let fields: Vec<_> = names
            .iter()
            .map(|&name| {
                Arc::new(
                    SchemaType::primitive_type_builder(name, Type::INT32)
                        .build()
                        .unwrap(),
                )
            })
            .collect();
        let root = SchemaType::group_type_builder("schema")
            .with_fields(fields)
            .build()
            .unwrap();
        Arc::new(SchemaDescriptor::new(Arc::new(root)))
    };

    let schema_descr_2cols = int32_schema(&["a", "b"]);
    let schema_descr_3cols = int32_schema(&["a", "b", "c"]);

    // Row group written against the two-column schema.
    let two_columns: Vec<_> = (0..2)
        .map(|idx| {
            ColumnChunkMetaData::builder(schema_descr_2cols.column(idx))
                .build()
                .unwrap()
        })
        .collect();
    let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
        .set_num_rows(1000)
        .set_total_byte_size(2000)
        .set_column_metadata(two_columns)
        .set_ordinal(1)
        .build()
        .unwrap();

    let mut encoded = Vec::new();
    {
        let mut protocol = ThriftCompactOutputProtocol::new(&mut encoded);
        row_group_meta_2cols.write_thrift(&mut protocol).unwrap();
    }

    // Reading it back against the three-column schema must be rejected.
    let message = read_row_group(&mut encoded, schema_descr_3cols)
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
    );
}
1798
#[test]
fn test_column_chunk_metadata_thrift_conversion() {
    let column_descr = get_test_schema_descr().column(0);

    // Values shared by the written metadata and the expected result.
    let plain_and_rle =
        || EncodingMask::new_from_encodings([Encoding::PLAIN, Encoding::RLE].iter());
    let rep_histogram = || Some(LevelHistogram::from(vec![100, 100]));
    let def_histogram = || Some(LevelHistogram::from(vec![0, 200]));

    let page_stats = vec![
        PageEncodingStats {
            page_type: PageType::DATA_PAGE,
            encoding: Encoding::PLAIN,
            count: 3,
        },
        PageEncodingStats {
            page_type: PageType::DATA_PAGE,
            encoding: Encoding::RLE,
            count: 5,
        },
    ];

    // Metadata as written: carries the full per-page encoding statistics.
    let written = ColumnChunkMetaData::builder(column_descr.clone())
        .set_encodings_mask(plain_and_rle())
        .set_file_path("file_path".to_owned())
        .set_num_values(1000)
        .set_compression(Compression::SNAPPY)
        .set_total_compressed_size(2000)
        .set_total_uncompressed_size(3000)
        .set_data_page_offset(4000)
        .set_dictionary_page_offset(Some(5000))
        .set_page_encoding_stats(page_stats)
        .set_bloom_filter_offset(Some(6000))
        .set_bloom_filter_length(Some(25))
        .set_offset_index_offset(Some(7000))
        .set_offset_index_length(Some(25))
        .set_column_index_offset(Some(8000))
        .set_column_index_length(Some(25))
        .set_unencoded_byte_array_data_bytes(Some(2000))
        .set_repetition_level_histogram(rep_histogram())
        .set_definition_level_histogram(def_histogram())
        .build()
        .unwrap();

    let mut encoded = Vec::new();
    {
        let mut protocol = ThriftCompactOutputProtocol::new(&mut encoded);
        written.write_thrift(&mut protocol).unwrap();
    }
    let decoded = read_column_chunk(&mut encoded, column_descr.clone()).unwrap();

    // Metadata as expected after reading: the page encoding stats appear as
    // an encoding mask rather than the full list.
    let expected = ColumnChunkMetaData::builder(column_descr)
        .set_encodings_mask(plain_and_rle())
        .set_file_path("file_path".to_owned())
        .set_num_values(1000)
        .set_compression(Compression::SNAPPY)
        .set_total_compressed_size(2000)
        .set_total_uncompressed_size(3000)
        .set_data_page_offset(4000)
        .set_dictionary_page_offset(Some(5000))
        .set_page_encoding_stats_mask(plain_and_rle())
        .set_bloom_filter_offset(Some(6000))
        .set_bloom_filter_length(Some(25))
        .set_offset_index_offset(Some(7000))
        .set_offset_index_length(Some(25))
        .set_column_index_offset(Some(8000))
        .set_column_index_length(Some(25))
        .set_unencoded_byte_array_data_bytes(Some(2000))
        .set_repetition_level_histogram(rep_histogram())
        .set_definition_level_histogram(def_histogram())
        .build()
        .unwrap();

    assert_eq!(decoded, expected);
}
1870
#[test]
fn test_column_chunk_metadata_thrift_conversion_full_stats() {
    // With `with_encoding_stats_as_mask(false)` the reader is expected to
    // return the full page encoding stats, so the round trip is lossless.
    let column_descr = get_test_schema_descr().column(0);

    let written = ColumnChunkMetaData::builder(column_descr.clone())
        .set_encodings_mask(EncodingMask::new_from_encodings(
            [Encoding::PLAIN, Encoding::RLE].iter(),
        ))
        .set_num_values(1000)
        .set_compression(Compression::SNAPPY)
        .set_total_compressed_size(2000)
        .set_total_uncompressed_size(3000)
        .set_data_page_offset(4000)
        .set_page_encoding_stats(vec![
            PageEncodingStats {
                page_type: PageType::DATA_PAGE,
                encoding: Encoding::PLAIN,
                count: 3,
            },
            PageEncodingStats {
                page_type: PageType::DATA_PAGE,
                encoding: Encoding::RLE,
                count: 5,
            },
        ])
        .build()
        .unwrap();

    let mut encoded = Vec::new();
    {
        let mut protocol = ThriftCompactOutputProtocol::new(&mut encoded);
        written.write_thrift(&mut protocol).unwrap();
    }

    let read_options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
    let decoded =
        read_column_chunk_with_options(&mut encoded, column_descr, Some(&read_options)).unwrap();

    assert_eq!(decoded, written);
}
1909
#[test]
fn test_column_chunk_metadata_thrift_conversion_empty() {
    // A column chunk built with no optional fields set must survive a
    // write/read round trip unchanged.
    let column_descr = get_test_schema_descr().column(0);
    let written = ColumnChunkMetaData::builder(column_descr.clone())
        .build()
        .unwrap();

    let mut encoded = Vec::new();
    {
        let mut protocol = ThriftCompactOutputProtocol::new(&mut encoded);
        written.write_thrift(&mut protocol).unwrap();
    }
    let decoded = read_column_chunk(&mut encoded, column_descr).unwrap();

    assert_eq!(decoded, written);
}
1925
#[test]
fn test_compressed_size() {
    let schema_descr = get_test_schema_descr();

    // Every column chunk reports 500 compressed bytes.
    let columns: Vec<_> = schema_descr
        .columns()
        .iter()
        .map(|descr| {
            ColumnChunkMetaData::builder(descr.clone())
                .set_total_compressed_size(500)
                .set_total_uncompressed_size(700)
                .build()
                .unwrap()
        })
        .collect();

    let row_group_meta = RowGroupMetaData::builder(schema_descr)
        .set_num_rows(1000)
        .set_column_metadata(columns)
        .build()
        .unwrap();

    // Two columns in the test schema, 500 bytes each.
    let expected: i64 = 1000;
    let actual: i64 = row_group_meta.compressed_size();

    assert_eq!(actual, expected);
}
1950
#[test]
fn test_memory_size() {
    // NOTE(review): the expected sizes below are exact byte counts. They
    // presumably depend on how each container here is allocated, so the
    // construction of every Vec/String/clone is intentionally left as-is;
    // re-verify the constants before changing any of them.
    let schema_descr = get_test_schema_descr();

    // Row group whose column statistics carry no min/max values.
    let columns_without_values = schema_descr
        .columns()
        .iter()
        .map(|descr| {
            ColumnChunkMetaData::builder(descr.clone())
                .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                .build()
        })
        .collect::<Result<Vec<_>>>()
        .unwrap();
    let row_groups_without_values = vec![
        RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns_without_values)
            .build()
            .unwrap(),
    ];

    let file_metadata = FileMetaData::new(
        2,
        1000,
        Some(String::from("test harness")),
        Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]),
        schema_descr.clone(),
        Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]),
    );

    // Row group whose column statistics carry min = 0 and max = 100.
    let columns_with_values = schema_descr
        .columns()
        .iter()
        .map(|descr| {
            ColumnChunkMetaData::builder(descr.clone())
                .set_statistics(Statistics::new::<i32>(
                    Some(0),
                    Some(100),
                    None,
                    None,
                    false,
                ))
                .build()
        })
        .collect::<Result<Vec<_>>>()
        .unwrap();
    let row_groups_with_values = vec![
        RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_values)
            .build()
            .unwrap(),
    ];

    let base_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
        .set_row_groups(row_groups_with_values)
        .build();

    let base_expected_size = if cfg!(feature = "encryption") {
        2934
    } else {
        2766
    };
    assert_eq!(base_meta.memory_size(), base_expected_size);

    // Single-page BOOLEAN column index.
    let mut column_index_builder = ColumnIndexBuilder::new(Type::BOOLEAN);
    column_index_builder.append(false, vec![1u8], vec![2u8, 3u8], 4);
    let column_index = match column_index_builder.build().unwrap() {
        boolean @ ColumnIndexMetaData::BOOLEAN(_) => boolean,
        _ => panic!("wrong type of column index"),
    };

    // Offset index with two identical pages.
    let mut offset_index_builder = OffsetIndexBuilder::new();
    for _ in 0..2 {
        offset_index_builder.append_row_count(1);
        offset_index_builder.append_offset_and_size(2, 3);
        offset_index_builder.append_unencoded_byte_array_data_bytes(Some(10));
    }
    let offset_index = offset_index_builder.build();

    let indexed_meta = ParquetMetaDataBuilder::new(file_metadata)
        .set_row_groups(row_groups_without_values)
        .set_column_index(Some(vec![vec![column_index]]))
        .set_offset_index(Some(vec![vec![offset_index]]))
        .build();

    let bigger_expected_size = if cfg!(feature = "encryption") {
        3360
    } else {
        3192
    };

    // Adding the page indexes must grow the reported size.
    assert!(bigger_expected_size > base_expected_size);
    assert_eq!(indexed_meta.memory_size(), bigger_expected_size);
}
2061
#[test]
#[cfg(feature = "encryption")]
fn test_memory_size_with_decryptor() {
    use crate::encryption::decrypt::FileDecryptionProperties;
    use crate::file::metadata::thrift::encryption::AesGcmV1;

    // NOTE(review): the expected sizes are exact byte counts and presumably
    // depend on how each value is allocated; every `.clone()` and Vec
    // construction below is deliberately kept. Re-verify the constants
    // before altering any of them.
    let schema_descr = get_test_schema_descr();

    let columns = schema_descr
        .columns()
        .iter()
        .map(|descr| ColumnChunkMetaData::builder(descr.clone()).build())
        .collect::<Result<Vec<_>>>()
        .unwrap();
    let row_groups = vec![
        RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap(),
    ];

    let aad_file_unique = vec![1u8; 8];
    let aad_prefix = vec![2u8; 8];
    let footer_key_metadata = Some(vec![3u8; 8]);
    let algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
        aad_prefix: Some(aad_prefix.clone()),
        aad_file_unique: Some(aad_file_unique.clone()),
        supply_aad_prefix: Some(true),
    });
    let file_metadata = FileMetaData::new(2, 1000, None, None, schema_descr.clone(), None)
        .with_encryption_algorithm(Some(algorithm))
        .with_footer_signing_key_metadata(footer_key_metadata.clone());

    // Baseline: metadata without a file decryptor attached.
    let meta_without_decryptor = ParquetMetaDataBuilder::new(file_metadata.clone())
        .set_row_groups(row_groups.clone())
        .build();

    let base_expected_size = 2058;
    assert_eq!(meta_without_decryptor.memory_size(), base_expected_size);

    // Decryption properties with the same key registered for every column.
    let footer_key = "0123456789012345".as_bytes();
    let column_key = "1234567890123450".as_bytes();
    let decryption_properties = schema_descr
        .columns()
        .iter()
        .fold(
            FileDecryptionProperties::builder(footer_key.to_vec())
                .with_aad_prefix(aad_prefix.clone()),
            |builder, column| {
                builder.with_column_key(&column.path().string(), column_key.to_vec())
            },
        )
        .build()
        .unwrap();

    let decryptor = FileDecryptor::new(
        &decryption_properties,
        footer_key_metadata.as_deref(),
        aad_file_unique,
        aad_prefix,
    )
    .unwrap();

    let meta_with_decryptor = ParquetMetaDataBuilder::new(file_metadata.clone())
        .set_row_groups(row_groups.clone())
        .set_file_decryptor(Some(decryptor))
        .build();

    // Attaching the decryptor must grow the reported size.
    let expected_size_with_decryptor = 3072;
    assert!(expected_size_with_decryptor > base_expected_size);

    assert_eq!(
        meta_with_decryptor.memory_size(),
        expected_size_with_decryptor
    );
}
2136
2137 fn get_test_schema_descr() -> SchemaDescPtr {
2139 let schema = SchemaType::group_type_builder("schema")
2140 .with_fields(vec![
2141 Arc::new(
2142 SchemaType::primitive_type_builder("a", Type::INT32)
2143 .build()
2144 .unwrap(),
2145 ),
2146 Arc::new(
2147 SchemaType::primitive_type_builder("b", Type::INT32)
2148 .build()
2149 .unwrap(),
2150 ),
2151 ])
2152 .build()
2153 .unwrap();
2154
2155 Arc::new(SchemaDescriptor::new(Arc::new(schema)))
2156 }
2157}