1mod footer_tail;
90mod memory;
91mod options;
92mod parser;
93mod push_decoder;
94pub(crate) mod reader;
95pub(crate) mod thrift;
96mod writer;
97
98use crate::basic::{
99 BoundaryOrder, ColumnOrder, Compression, CompressionCodec, Encoding, EncodingMask, PageType,
100 Type,
101};
102#[cfg(feature = "encryption")]
103use crate::encryption::decrypt::FileDecryptor;
104use crate::errors::{ParquetError, Result};
105#[cfg(feature = "encryption")]
106use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
107pub(crate) use crate::file::metadata::memory::HeapSize;
108#[cfg(feature = "encryption")]
109use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
110use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
111use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
112use crate::file::statistics::Statistics;
113use crate::geospatial::statistics as geo_statistics;
114use crate::parquet_thrift::{
115 ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
116 WriteThrift, WriteThriftField,
117};
118use crate::schema::types::{
119 ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
120 Type as SchemaType,
121};
122use crate::thrift_struct;
123use crate::{
124 data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
125};
126
127pub use footer_tail::FooterTail;
128pub use options::{ParquetMetaDataOptions, ParquetStatisticsPolicy};
129pub use push_decoder::ParquetMetaDataPushDecoder;
130pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
131use std::io::Write;
132use std::ops::Range;
133use std::sync::Arc;
134pub use writer::ParquetMetaDataWriter;
135pub(crate) use writer::ThriftMetadataWriter;
136
/// Column index metadata for every column chunk: outer `Vec` is per row group,
/// inner `Vec` is per column (presumed ordering mirrors `row_groups`/`columns` — confirm with reader).
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;
154
/// Offset index (page locations) for every column chunk: outer `Vec` is per row group,
/// inner `Vec` is per column (presumed ordering mirrors `row_groups`/`columns` — confirm with reader).
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
167
/// In-memory representation of a Parquet file's footer metadata, including
/// optional page indexes and (with the `encryption` feature) decryption state.
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File-level metadata (version, schema, key/value pairs, ...).
    file_metadata: FileMetaData,
    /// Metadata for each row group, in file order.
    row_groups: Vec<RowGroupMetaData>,
    /// Column index (page-level statistics), if present and read.
    column_index: Option<ParquetColumnIndex>,
    /// Offset index (page locations), if present and read.
    offset_index: Option<ParquetOffsetIndex>,
    /// Decryptor for encrypted files; boxed to keep this struct small.
    #[cfg(feature = "encryption")]
    file_decryptor: Option<Box<FileDecryptor>>,
}
199
impl ParquetMetaData {
    /// Creates metadata from file-level metadata and row groups. Page indexes
    /// start unset, as does the file decryptor when encryption is enabled.
    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
        ParquetMetaData {
            file_metadata,
            row_groups,
            column_index: None,
            offset_index: None,
            #[cfg(feature = "encryption")]
            file_decryptor: None,
        }
    }

    /// Sets (or clears) the decryptor used for this file.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
        self.file_decryptor = file_decryptor.map(Box::new);
    }

    /// Converts this metadata into a [`ParquetMetaDataBuilder`] for modification.
    pub fn into_builder(self) -> ParquetMetaDataBuilder {
        self.into()
    }

    /// Returns the file-level metadata.
    pub fn file_metadata(&self) -> &FileMetaData {
        &self.file_metadata
    }

    /// Returns the file decryptor, if one has been set.
    #[cfg(feature = "encryption")]
    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
        self.file_decryptor.as_deref()
    }

    /// Returns the number of row groups in the file.
    pub fn num_row_groups(&self) -> usize {
        self.row_groups.len()
    }

    /// Returns the metadata for the `i`th row group.
    ///
    /// # Panics
    /// Panics if `i` is out of bounds.
    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
        &self.row_groups[i]
    }

    /// Returns the metadata for all row groups, in file order.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.row_groups
    }

    /// Returns the column index, if it has been read and set.
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.column_index.as_ref()
    }

    /// Returns the offset index, if it has been read and set.
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.offset_index.as_ref()
    }

    /// Estimates the total memory footprint in bytes: the size of this struct
    /// plus all heap allocations reachable from it (via [`HeapSize`]).
    pub fn memory_size(&self) -> usize {
        #[cfg(feature = "encryption")]
        let encryption_size = self.file_decryptor.heap_size();
        // Without the feature the decryptor field does not exist, so it contributes nothing.
        #[cfg(not(feature = "encryption"))]
        let encryption_size = 0usize;

        std::mem::size_of::<Self>()
            + self.file_metadata.heap_size()
            + self.row_groups.heap_size()
            + self.column_index.heap_size()
            + self.offset_index.heap_size()
            + encryption_size
    }

    /// Replaces the stored column index.
    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
        self.column_index = index;
    }

    /// Replaces the stored offset index.
    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
        self.offset_index = index;
    }
}
311
/// Builder for [`ParquetMetaData`]; a thin newtype over the metadata being assembled.
pub struct ParquetMetaDataBuilder(ParquetMetaData);
350
impl ParquetMetaDataBuilder {
    /// Creates a builder from file metadata, with no row groups.
    pub fn new(file_meta_data: FileMetaData) -> Self {
        Self(ParquetMetaData::new(file_meta_data, vec![]))
    }

    /// Creates a builder from existing [`ParquetMetaData`].
    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
        Self(metadata)
    }

    /// Appends one row group's metadata.
    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
        self.0.row_groups.push(row_group);
        self
    }

    /// Replaces all row group metadata.
    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
        self.0.row_groups = row_groups;
        self
    }

    /// Takes ownership of the current row groups, leaving an empty `Vec` behind.
    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
        std::mem::take(&mut self.0.row_groups)
    }

    /// Returns the row groups currently held by the builder.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.0.row_groups
    }

    /// Sets (or clears) the column index.
    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
        self.0.column_index = column_index;
        self
    }

    /// Takes ownership of the column index, leaving `None` behind.
    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
        std::mem::take(&mut self.0.column_index)
    }

    /// Returns the column index currently held by the builder, if any.
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.0.column_index.as_ref()
    }

    /// Sets (or clears) the offset index.
    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
        self.0.offset_index = offset_index;
        self
    }

    /// Takes ownership of the offset index, leaving `None` behind.
    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
        std::mem::take(&mut self.0.offset_index)
    }

    /// Returns the offset index currently held by the builder, if any.
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.0.offset_index.as_ref()
    }

    /// Sets (or clears) the file decryptor.
    #[cfg(feature = "encryption")]
    pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
        self.0.with_file_decryptor(file_decryptor);
        self
    }

    /// Finishes building, yielding the assembled [`ParquetMetaData`].
    pub fn build(self) -> ParquetMetaData {
        let Self(metadata) = self;
        metadata
    }
}
433
434impl From<ParquetMetaData> for ParquetMetaDataBuilder {
435 fn from(meta_data: ParquetMetaData) -> Self {
436 Self(meta_data)
437 }
438}
439
// Thrift `KeyValue` struct: one application-defined key/value metadata pair
// stored in the file footer. Generated by the `thrift_struct!` macro, which
// also derives the Thrift read/write implementations.
thrift_struct!(
pub struct KeyValue {
  1: required string key
  2: optional string value
}
);
447
448impl KeyValue {
449 pub fn new<F2>(key: String, value: F2) -> KeyValue
451 where
452 F2: Into<Option<String>>,
453 {
454 KeyValue {
455 key,
456 value: value.into(),
457 }
458 }
459}
460
// Thrift `PageEncodingStats` struct: how many pages of a given page type were
// written with a given encoding. Generated by the `thrift_struct!` macro.
thrift_struct!(
pub struct PageEncodingStats {
  1: required PageType page_type;
  2: required Encoding encoding;
  3: required i32 count;
}
);
469
/// Internal storage for a column chunk's page encoding statistics: either the
/// full per-page stats, or a compact mask recording only which encodings occurred.
#[derive(Debug, Clone, PartialEq)]
enum ParquetPageEncodingStats {
    /// Full per-page statistics (page type, encoding, count).
    Full(Vec<PageEncodingStats>),
    /// Compact bitmask of encodings seen (page types and counts are not retained).
    Mask(EncodingMask),
}
479
/// Reference-counted pointer to [`FileMetaData`], for cheap sharing across readers.
pub type FileMetaDataPtr = Arc<FileMetaData>;
482
/// File-level metadata decoded from the Parquet footer.
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Format version recorded in the footer.
    version: i32,
    /// Number of rows in the file.
    num_rows: i64,
    /// "created by" application string, if written.
    created_by: Option<String>,
    /// Application-defined key/value metadata, if present.
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Descriptor for the file schema.
    schema_descr: SchemaDescPtr,
    /// Per-column sort orders, if written; indexed by leaf column.
    column_orders: Option<Vec<ColumnOrder>>,
    /// Encryption algorithm from the footer; boxed to keep the struct small.
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    /// Footer signing key metadata — presumably used to verify signed
    /// plaintext footers; confirm against the encryption module.
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}
499
impl FileMetaData {
    /// Creates file metadata; encryption-related fields (when the feature is
    /// enabled) start unset.
    pub fn new(
        version: i32,
        num_rows: i64,
        created_by: Option<String>,
        key_value_metadata: Option<Vec<KeyValue>>,
        schema_descr: SchemaDescPtr,
        column_orders: Option<Vec<ColumnOrder>>,
    ) -> Self {
        FileMetaData {
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr,
            column_orders,
            #[cfg(feature = "encryption")]
            encryption_algorithm: None,
            #[cfg(feature = "encryption")]
            footer_signing_key_metadata: None,
        }
    }

    /// Sets (or clears) the encryption algorithm, boxing it for storage.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_encryption_algorithm(
        mut self,
        encryption_algorithm: Option<EncryptionAlgorithm>,
    ) -> Self {
        self.encryption_algorithm = encryption_algorithm.map(Box::new);
        self
    }

    /// Sets (or clears) the footer signing key metadata.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_footer_signing_key_metadata(
        mut self,
        footer_signing_key_metadata: Option<Vec<u8>>,
    ) -> Self {
        self.footer_signing_key_metadata = footer_signing_key_metadata;
        self
    }

    /// Returns the format version recorded in the footer.
    pub fn version(&self) -> i32 {
        self.version
    }

    /// Returns the number of rows in the file.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the "created by" string, if present.
    pub fn created_by(&self) -> Option<&str> {
        self.created_by.as_deref()
    }

    /// Returns the application-defined key/value metadata, if present.
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns the root schema type.
    pub fn schema(&self) -> &SchemaType {
        self.schema_descr.root_schema()
    }

    /// Returns the schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        &self.schema_descr
    }

    /// Returns a cloned reference-counted pointer to the schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns the per-column sort orders, if written.
    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
        self.column_orders.as_ref()
    }

    /// Returns the sort order for column `i`, or `UNDEFINED` when no orders
    /// were written.
    ///
    /// # Panics
    /// Panics if orders are present and `i` is out of bounds.
    pub fn column_order(&self, i: usize) -> ColumnOrder {
        self.column_orders
            .as_ref()
            .map(|data| data[i])
            .unwrap_or(ColumnOrder::UNDEFINED)
    }
}
606
// Thrift `SortingColumn` struct: one element of a row group's declared sort
// order (which leaf column, direction, and null placement). Generated by the
// `thrift_struct!` macro.
thrift_struct!(
pub struct SortingColumn {
  1: required i32 column_idx

  2: required bool descending

  3: required bool nulls_first
}
);
621
/// Reference-counted pointer to [`RowGroupMetaData`], for cheap sharing.
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
624
/// Metadata for a single row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// One column chunk per leaf column of the schema.
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group.
    num_rows: i64,
    /// Declared sort order of the rows, if written.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total byte size of all column data (distinct from `compressed_size`,
    /// which is derived by summing the columns' compressed sizes).
    total_byte_size: i64,
    /// Schema descriptor this row group conforms to.
    schema_descr: SchemaDescPtr,
    /// Byte offset of this row group in the file, if written.
    file_offset: Option<i64>,
    /// Ordinal position of this row group within the file, if written.
    ordinal: Option<i16>,
}
641
impl RowGroupMetaData {
    /// Returns a builder for a row group conforming to `schema_descr`.
    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder::new(schema_descr)
    }

    /// Returns the number of column chunks in this row group.
    pub fn num_columns(&self) -> usize {
        self.columns.len()
    }

    /// Returns the metadata for the `i`th column chunk.
    ///
    /// # Panics
    /// Panics if `i` is out of bounds.
    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
        &self.columns[i]
    }

    /// Returns the metadata for all column chunks, in schema leaf order.
    pub fn columns(&self) -> &[ColumnChunkMetaData] {
        &self.columns
    }

    /// Returns mutable access to all column chunk metadata.
    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
        &mut self.columns
    }

    /// Returns the number of rows in this row group.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the declared sort order, if any.
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Returns the total byte size recorded for this row group.
    pub fn total_byte_size(&self) -> i64 {
        self.total_byte_size
    }

    /// Returns the total compressed size, computed by summing all column
    /// chunks' compressed sizes.
    pub fn compressed_size(&self) -> i64 {
        self.columns.iter().map(|c| c.total_compressed_size).sum()
    }

    /// Returns the schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        self.schema_descr.as_ref()
    }

    /// Returns a cloned reference-counted pointer to the schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns this row group's ordinal position in the file, if written.
    #[inline(always)]
    pub fn ordinal(&self) -> Option<i16> {
        self.ordinal
    }

    /// Returns this row group's byte offset in the file, if written.
    #[inline(always)]
    pub fn file_offset(&self) -> Option<i64> {
        self.file_offset
    }

    /// Converts this metadata back into a builder for modification.
    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder(self)
    }
}
718
/// Builder for [`RowGroupMetaData`]; a thin newtype over the metadata being assembled.
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
721
impl RowGroupMetaDataBuilder {
    /// Creates a builder for `schema_descr`, preallocating space for one
    /// column chunk per leaf column.
    fn new(schema_descr: SchemaDescPtr) -> Self {
        Self(RowGroupMetaData {
            columns: Vec::with_capacity(schema_descr.num_columns()),
            schema_descr,
            file_offset: None,
            num_rows: 0,
            sorting_columns: None,
            total_byte_size: 0,
            ordinal: None,
        })
    }

    /// Sets the number of rows.
    pub fn set_num_rows(mut self, value: i64) -> Self {
        self.0.num_rows = value;
        self
    }

    /// Sets (or clears) the declared sort order.
    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
        self.0.sorting_columns = value;
        self
    }

    /// Sets the total byte size.
    pub fn set_total_byte_size(mut self, value: i64) -> Self {
        self.0.total_byte_size = value;
        self
    }

    /// Takes ownership of the column chunks added so far, leaving an empty `Vec`.
    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
        std::mem::take(&mut self.0.columns)
    }

    /// Replaces all column chunk metadata.
    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
        self.0.columns = value;
        self
    }

    /// Appends one column chunk's metadata.
    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
        self.0.columns.push(value);
        self
    }

    /// Sets this row group's ordinal position in the file.
    pub fn set_ordinal(mut self, value: i16) -> Self {
        self.0.ordinal = Some(value);
        self
    }

    /// Sets this row group's byte offset in the file.
    pub fn set_file_offset(mut self, value: i64) -> Self {
        self.0.file_offset = Some(value);
        self
    }

    /// Finishes building, validating that exactly one column chunk was
    /// supplied per schema leaf column.
    ///
    /// # Errors
    /// Returns an error on a column-count mismatch.
    pub fn build(self) -> Result<RowGroupMetaData> {
        if self.0.schema_descr.num_columns() != self.0.columns.len() {
            return Err(general_err!(
                "Column length mismatch: {} != {}",
                self.0.schema_descr.num_columns(),
                self.0.columns.len()
            ));
        }

        Ok(self.0)
    }

    /// Finishes building without validating the column count; callers must
    /// guarantee consistency themselves.
    pub(super) fn build_unchecked(self) -> RowGroupMetaData {
        self.0
    }
}
805
/// Metadata for a single column chunk within a row group.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor for this leaf column.
    column_descr: ColumnDescPtr,
    /// Mask of all encodings used anywhere in this chunk.
    encodings: EncodingMask,
    /// Path to an external file holding this chunk's data, if not inline —
    /// presumably relative to this file; confirm against the Parquet spec.
    file_path: Option<String>,
    /// Byte offset of this column chunk's metadata in the file.
    file_offset: i64,
    /// Number of values (including nulls) in this chunk.
    num_values: i64,
    /// Compression codec applied to the pages.
    compression: CompressionCodec,
    /// Total compressed byte size of the chunk.
    total_compressed_size: i64,
    /// Total uncompressed byte size of the chunk.
    total_uncompressed_size: i64,
    /// Byte offset of the first data page.
    data_page_offset: i64,
    /// Byte offset of the index page, if present.
    index_page_offset: Option<i64>,
    /// Byte offset of the dictionary page, if present.
    dictionary_page_offset: Option<i64>,
    /// Chunk-level min/max/null statistics, if written.
    statistics: Option<Statistics>,
    /// Geospatial statistics, if written; boxed to keep this struct small.
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    /// Page encoding statistics (full or compact-mask form), if written.
    encoding_stats: Option<ParquetPageEncodingStats>,
    /// Byte offset of the bloom filter, if present.
    bloom_filter_offset: Option<i64>,
    /// Byte length of the bloom filter, if known.
    bloom_filter_length: Option<i32>,
    /// Byte offset of the offset index, if present.
    offset_index_offset: Option<i64>,
    /// Byte length of the offset index, if known.
    offset_index_length: Option<i32>,
    /// Byte offset of the column index, if present.
    column_index_offset: Option<i64>,
    /// Byte length of the column index, if known.
    column_index_length: Option<i32>,
    /// Total bytes of unencoded (raw) byte-array data, if tracked.
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Histogram of repetition levels across the chunk, if tracked.
    repetition_level_histogram: Option<LevelHistogram>,
    /// Histogram of definition levels across the chunk, if tracked.
    definition_level_histogram: Option<LevelHistogram>,
    /// Per-column encryption metadata; boxed to keep this struct small.
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    /// Raw encrypted column metadata bytes, if present.
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
    /// Whether the footer was written in plaintext mode — confirm exact
    /// semantics against the encryption module.
    #[cfg(feature = "encryption")]
    plaintext_footer_mode: bool,
}
842
/// Histogram of repetition or definition levels: `inner[i]` holds the count
/// of values observed at level `i`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a zeroed histogram covering levels `0..=max_level`.
    ///
    /// Returns `None` when `max_level` is zero or negative (no histogram is
    /// needed for a column without that level).
    pub fn try_new(max_level: i16) -> Option<Self> {
        (max_level > 0).then(|| Self {
            inner: vec![0; max_level as usize + 1],
        })
    }

    /// Borrows the raw counts.
    pub fn values(&self) -> &[i64] {
        self.inner.as_slice()
    }

    /// Consumes the histogram, yielding the raw counts.
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the count at `index`, or `None` if out of range.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Accumulates `other` into `self`, element-wise.
    ///
    /// # Panics
    /// Panics if the two histograms have different lengths.
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        self.inner
            .iter_mut()
            .zip(other.inner.iter())
            .for_each(|(dst, src)| *dst += *src);
    }

    /// Returns the number of level slots (maximum level + 1).
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns `true` when the histogram holds no slots at all.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Resets every count to zero, keeping the allocation.
    pub fn reset(&mut self) {
        self.inner.fill(0);
    }

    /// Adds `count` occurrences of `level`.
    ///
    /// # Panics
    /// Panics if `level` is outside the histogram's range.
    #[inline]
    pub fn increment_by(&mut self, level: i16, count: i64) {
        self.inner[level as usize] += count;
    }

    /// Adds one occurrence for each level in `levels`.
    #[deprecated(since = "58.2.0", note = "Use `increment_by` instead")]
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        levels.iter().for_each(|&level| self.increment_by(level, 1));
    }
}

impl From<Vec<i64>> for LevelHistogram {
    /// Wraps pre-computed counts without validation.
    fn from(inner: Vec<i64>) -> Self {
        LevelHistogram { inner }
    }
}

impl From<LevelHistogram> for Vec<i64> {
    /// Unwraps a histogram into its raw counts.
    fn from(value: LevelHistogram) -> Self {
        value.inner
    }
}
949
impl HeapSize for LevelHistogram {
    /// Heap usage is just the backing `Vec`'s allocation.
    fn heap_size(&self) -> usize {
        self.inner.heap_size()
    }
}
955
impl ColumnChunkMetaData {
    /// Returns a builder for a column chunk described by `column_descr`.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// Returns the external file path holding this chunk's data, if any.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Returns the byte offset of this column chunk's metadata in the file.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Returns the physical type of this column.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Returns the dotted path of this column in the schema.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Returns the column descriptor.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Returns a cloned reference-counted pointer to the column descriptor.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// Iterates over all encodings used in this chunk (decoded from the mask).
    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
        self.encodings.encodings()
    }

    /// Returns the raw encoding mask.
    pub fn encodings_mask(&self) -> &EncodingMask {
        &self.encodings
    }

    /// Returns the number of values (including nulls) in this chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Returns the compression as the public [`Compression`] type.
    pub fn compression(&self) -> Compression {
        self.compression.into()
    }

    /// Returns the raw compression codec.
    pub fn compression_codec(&self) -> CompressionCodec {
        self.compression
    }

    /// Returns the total compressed byte size of this chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed byte size of this chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the byte offset of the first data page.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the byte offset of the index page, if present.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the byte offset of the dictionary page, if present.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns `(start, length)` in bytes of this chunk's page data: starting
    /// at the dictionary page when present, otherwise the first data page.
    ///
    /// # Panics
    /// Panics if the start offset or length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns chunk-level statistics, if written.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns geospatial statistics, if written.
    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    /// Returns the full per-page encoding statistics, or `None` when absent
    /// or stored only in compact-mask form.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        match self.encoding_stats.as_ref() {
            Some(ParquetPageEncodingStats::Full(stats)) => Some(stats),
            _ => None,
        }
    }

    /// Returns the compact encoding-stats mask, or `None` when absent or
    /// stored in full form.
    pub fn page_encoding_stats_mask(&self) -> Option<&EncodingMask> {
        match self.encoding_stats.as_ref() {
            Some(ParquetPageEncodingStats::Mask(stats)) => Some(stats),
            _ => None,
        }
    }

    /// Returns the byte offset of the bloom filter, if present.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the byte length of the bloom filter, if known.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the byte offset of the column index, if present.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the byte length of the column index, if known.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the file byte range of the column index, or `None` when it is
    /// absent or not representable as `u64`.
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the byte offset of the offset index, if present.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the byte length of the offset index, if known.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the file byte range of the offset index, or `None` when it is
    /// absent or not representable as `u64`.
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the total unencoded byte-array data size, if tracked.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the chunk-wide repetition level histogram, if tracked.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the chunk-wide definition level histogram, if tracked.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the column encryption metadata, if present.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_deref()
    }

    /// Converts this metadata back into a builder for modification.
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}
1212
/// Builder for [`ColumnChunkMetaData`]; a thin newtype over the metadata being assembled.
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1232
impl ColumnChunkMetaDataBuilder {
    /// Creates a builder with empty/zero defaults for the given column descriptor.
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Default::default(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: CompressionCodec::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
            #[cfg(feature = "encryption")]
            encrypted_column_metadata: None,
            #[cfg(feature = "encryption")]
            plaintext_footer_mode: false,
        })
    }

    /// Sets the encodings by folding a list into the compact mask form.
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
        self
    }

    /// Sets the encoding mask directly.
    pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets the external file path for the chunk's data.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets the number of values in the chunk.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets the compression from the public [`Compression`] type.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value.into();
        self
    }

    /// Sets the compression codec directly.
    pub fn set_compression_codec(mut self, value: CompressionCodec) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets the total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets the total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets the byte offset of the first data page.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets (or clears) the dictionary page offset.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets (or clears) the index page offset.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets the chunk-level statistics.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Sets the geospatial statistics.
    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    /// Clears the chunk-level statistics.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets the page encoding statistics in full per-page form.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(ParquetPageEncodingStats::Full(value));
        self
    }

    /// Sets the page encoding statistics in compact mask form.
    pub fn set_page_encoding_stats_mask(mut self, value: EncodingMask) -> Self {
        self.0.encoding_stats = Some(ParquetPageEncodingStats::Mask(value));
        self
    }

    /// Clears the page encoding statistics.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets (or clears) the bloom filter offset.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets (or clears) the bloom filter length.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets (or clears) the offset index offset.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets (or clears) the offset index length.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets (or clears) the column index offset.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets (or clears) the column index length.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets (or clears) the unencoded byte-array data size.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets (or clears) the repetition level histogram.
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets (or clears) the definition level histogram.
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    /// Sets (or clears) the column encryption metadata, boxing it for storage.
    #[cfg(feature = "encryption")]
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value.map(Box::new);
        self
    }

    /// Sets (or clears) the raw encrypted column metadata bytes.
    #[cfg(feature = "encryption")]
    pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
        self.0.encrypted_column_metadata = value;
        self
    }

    /// Finishes building. Currently infallible; the `Result` preserves API
    /// room for future validation.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1450
/// Builder for a column index: accumulates per-page statistics and produces a
/// typed [`ColumnIndexMetaData`] on [`build`](Self::build).
pub struct ColumnIndexBuilder {
    /// Physical type of the column; selects the index variant built.
    column_type: Type,
    /// Per page: whether the page contains only nulls.
    null_pages: Vec<bool>,
    /// Per page: serialized min value bytes.
    min_values: Vec<Vec<u8>>,
    /// Per page: serialized max value bytes.
    max_values: Vec<Vec<u8>>,
    /// Per page: null count.
    null_counts: Vec<i64>,
    /// Ordering of min/max values across pages.
    boundary_order: BoundaryOrder,
    /// Flattened per-page repetition level histograms, concatenated.
    repetition_level_histograms: Option<Vec<i64>>,
    /// Flattened per-page definition level histograms, concatenated.
    definition_level_histograms: Option<Vec<i64>>,
    /// Whether the accumulated data is still usable; cleared via `to_invalid`.
    valid: bool,
}
1475
impl ColumnIndexBuilder {
    /// Creates an empty, valid builder for the given physical type.
    pub fn new(column_type: Type) -> Self {
        ColumnIndexBuilder {
            column_type,
            null_pages: Vec::new(),
            min_values: Vec::new(),
            max_values: Vec::new(),
            null_counts: Vec::new(),
            boundary_order: BoundaryOrder::UNORDERED,
            repetition_level_histograms: None,
            definition_level_histograms: None,
            valid: true,
        }
    }

    /// Appends statistics for one page: null-only flag, serialized min/max
    /// bytes, and null count.
    pub fn append(
        &mut self,
        null_page: bool,
        min_value: Vec<u8>,
        max_value: Vec<u8>,
        null_count: i64,
    ) {
        self.null_pages.push(null_page);
        self.min_values.push(min_value);
        self.max_values.push(max_value);
        self.null_counts.push(null_count);
    }

    /// Appends one page's level histograms, flattening their counts onto the
    /// accumulated vectors. No-op when the builder has been marked invalid.
    pub fn append_histograms(
        &mut self,
        repetition_level_histogram: &Option<LevelHistogram>,
        definition_level_histogram: &Option<LevelHistogram>,
    ) {
        if !self.valid {
            return;
        }
        if let Some(rep_lvl_hist) = repetition_level_histogram {
            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(rep_lvl_hist.len());
            hist.extend(rep_lvl_hist.values());
        }
        if let Some(def_lvl_hist) = definition_level_histogram {
            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(def_lvl_hist.len());
            hist.extend(def_lvl_hist.values());
        }
    }

    /// Sets the boundary order of min/max values across pages.
    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
        self.boundary_order = boundary_order;
    }

    /// Marks the accumulated data as invalid (e.g. statistics unusable).
    pub fn to_invalid(&mut self) {
        self.valid = false;
    }

    /// Returns whether the accumulated data is still valid.
    pub fn valid(&self) -> bool {
        self.valid
    }

    /// Consumes the builder, producing the index variant matching the
    /// column's physical type.
    ///
    /// # Errors
    /// Propagates errors from constructing the underlying typed index.
    pub fn build(self) -> Result<ColumnIndexMetaData> {
        Ok(match self.column_type {
            Type::BOOLEAN => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::BOOLEAN(index)
            }
            Type::INT32 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT32(index)
            }
            Type::INT64 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT64(index)
            }
            Type::INT96 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT96(index)
            }
            Type::FLOAT => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::FLOAT(index)
            }
            Type::DOUBLE => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::DOUBLE(index)
            }
            Type::BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::BYTE_ARRAY(index)
            }
            Type::FIXED_LEN_BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index)
            }
        })
    }

    /// Builds a primitive (fixed-width) column index, decoding min/max byte
    /// slices as values of `T`.
    fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
    where
        T: ParquetValueType,
    {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();

        PrimitiveColumnIndex::try_new(
            self.null_pages,
            self.boundary_order,
            Some(self.null_counts),
            self.repetition_level_histograms,
            self.definition_level_histograms,
            min_values,
            max_values,
        )
    }

    /// Builds a byte-array column index, keeping min/max values as raw bytes.
    fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();

        ByteArrayColumnIndex::try_new(
            self.null_pages,
            self.boundary_order,
            Some(self.null_counts),
            self.repetition_level_histograms,
            self.definition_level_histograms,
            min_values,
            max_values,
        )
    }
}
1618
1619impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1620 fn from(value: ColumnChunkMetaData) -> Self {
1621 ColumnChunkMetaDataBuilder(value)
1622 }
1623}
1624
/// Builder for an [`OffsetIndexMetaData`]: accumulates per-page offsets,
/// compressed sizes and first-row indexes in parallel vectors.
pub struct OffsetIndexBuilder {
    /// Per page: byte offset of the page in the file.
    offset_array: Vec<i64>,
    /// Per page: compressed page size in bytes.
    compressed_page_size_array: Vec<i32>,
    /// Per page: index of the page's first row within the row group.
    first_row_index_array: Vec<i64>,
    /// Per page: unencoded byte-array data size, if being tracked.
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running row count; becomes the next page's first row index.
    current_first_row_index: i64,
}
1635
impl Default for OffsetIndexBuilder {
    /// Equivalent to [`OffsetIndexBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
1641
1642impl OffsetIndexBuilder {
1643 pub fn new() -> Self {
1645 OffsetIndexBuilder {
1646 offset_array: Vec::new(),
1647 compressed_page_size_array: Vec::new(),
1648 first_row_index_array: Vec::new(),
1649 unencoded_byte_array_data_bytes_array: None,
1650 current_first_row_index: 0,
1651 }
1652 }
1653
1654 pub fn append_row_count(&mut self, row_count: i64) {
1656 let current_page_row_index = self.current_first_row_index;
1657 self.first_row_index_array.push(current_page_row_index);
1658 self.current_first_row_index += row_count;
1659 }
1660
1661 pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1663 self.offset_array.push(offset);
1664 self.compressed_page_size_array.push(compressed_page_size);
1665 }
1666
1667 pub fn append_unencoded_byte_array_data_bytes(
1669 &mut self,
1670 unencoded_byte_array_data_bytes: Option<i64>,
1671 ) {
1672 if let Some(val) = unencoded_byte_array_data_bytes {
1673 self.unencoded_byte_array_data_bytes_array
1674 .get_or_insert(Vec::new())
1675 .push(val);
1676 }
1677 }
1678
1679 pub fn build(self) -> OffsetIndexMetaData {
1681 let locations = self
1682 .offset_array
1683 .iter()
1684 .zip(self.compressed_page_size_array.iter())
1685 .zip(self.first_row_index_array.iter())
1686 .map(|((offset, size), row_index)| PageLocation {
1687 offset: *offset,
1688 compressed_page_size: *size,
1689 first_row_index: *row_index,
1690 })
1691 .collect::<Vec<_>>();
1692 OffsetIndexMetaData {
1693 page_locations: locations,
1694 unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes_array,
1695 }
1696 }
1697}
1698
#[cfg(test)]
mod tests {
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::metadata::thrift::tests::{
        read_column_chunk, read_column_chunk_with_options, read_row_group,
    };

    // The deprecated `update_from_levels` tallies one count per occurrence of
    // each level value into the histogram buckets.
    #[test]
    #[allow(deprecated)]
    fn test_level_histogram_update_from_levels_compat() {
        let mut histogram = LevelHistogram::try_new(2).unwrap();
        histogram.update_from_levels(&[0, 2, 1, 2, 2]);
        assert_eq!(histogram.values(), &[1, 1, 3]);
    }

    // Round-trips a RowGroupMetaData through the thrift compact protocol:
    // write_thrift then read_row_group must reproduce an equal value.
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta.write_thrift(&mut writer).unwrap();

        let row_group_res = read_row_group(&mut buf, schema_descr).unwrap();

        assert_eq!(row_group_res, row_group_meta);
    }

    // Building row-group metadata without any column chunks must fail with a
    // column-length-mismatch error (schema has 2 columns, builder got 0).
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }

    // Decoding a serialized 2-column row group against a 3-column schema must
    // be rejected with a column-count-mismatch error.
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta_2cols.write_thrift(&mut writer).unwrap();

        // Deserialize with the wrong (3-column) schema descriptor.
        let err = read_row_group(&mut buf, schema_descr_3cols)
            .unwrap_err()
            .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    // Round-trips a fully populated ColumnChunkMetaData. Note the expected
    // value uses `set_page_encoding_stats_mask` rather than the full stats:
    // by default the reader folds per-page encoding stats into an encoding
    // mask (see the `_full_stats` test below for the opt-out).
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);
        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression_codec(CompressionCodec::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        col_metadata.write_thrift(&mut writer).unwrap();
        let col_chunk_res = read_column_chunk(&mut buf, column_descr.clone()).unwrap();

        let expected_metadata = ColumnChunkMetaData::builder(column_descr)
            .set_encodings_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression_codec(CompressionCodec::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        assert_eq!(col_chunk_res, expected_metadata);
    }

    // With `with_encoding_stats_as_mask(false)`, the full per-page encoding
    // stats survive the thrift round trip unchanged.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_full_stats() {
        let column_descr = get_test_schema_descr().column(0);
        let stats = vec![
            PageEncodingStats {
                page_type: PageType::DATA_PAGE,
                encoding: Encoding::PLAIN,
                count: 3,
            },
            PageEncodingStats {
                page_type: PageType::DATA_PAGE,
                encoding: Encoding::RLE,
                count: 5,
            },
        ];
        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_num_values(1000)
            .set_compression_codec(CompressionCodec::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_page_encoding_stats(stats)
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        col_metadata.write_thrift(&mut writer).unwrap();

        let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
        let col_chunk_res =
            read_column_chunk_with_options(&mut buf, column_descr, Some(&options)).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    // A default-built (all optional fields unset) column chunk must also
    // round-trip through thrift.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        col_metadata.write_thrift(&mut writer).unwrap();
        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    // `compressed_size` sums total_compressed_size over the row group's
    // columns: 2 columns * 500 = 1000.
    #[test]
    fn test_compressed_size() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for column_descr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(column_descr.clone())
                .set_total_compressed_size(500)
                .set_total_uncompressed_size(700)
                .build()
                .unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();

        let compressed_size_res: i64 = row_group_meta.compressed_size();
        let compressed_size_exp: i64 = 1000;

        assert_eq!(compressed_size_res, compressed_size_exp);
    }

    // Checks `ParquetMetaData::memory_size` against hard-coded byte counts,
    // and that adding min/max statistics plus column/offset indexes increases
    // the reported size.
    // NOTE(review): the expected values are exact heap-size snapshots and
    // presumably track struct layout — they must be updated whenever metadata
    // structs gain or lose fields.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // Column chunks with empty statistics (no min/max values).
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Column chunks with populated min/max statistics.
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        // Expected size differs with the encryption feature, which adds
        // fields to the metadata structs.
        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2734;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2902;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build().unwrap();
        let native_index = match column_index {
            ColumnIndexMetaData::BOOLEAN(index) => index,
            _ => panic!("wrong type of column index"),
        };

        // Two pages in the offset index.
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![offset_index]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 3160;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3328;

        // Attaching the page indexes must grow the reported memory size.
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    // Same memory-size snapshot check, but with a FileDecryptor attached:
    // the decryptor's heap usage must be included in memory_size.
    // NOTE(review): expected sizes are hard-coded layout snapshots, as above.
    #[test]
    #[cfg(feature = "encryption")]
    fn test_memory_size_with_decryptor() {
        use crate::encryption::decrypt::FileDecryptionProperties;
        use crate::file::metadata::thrift::encryption::AesGcmV1;

        let schema_descr = get_test_schema_descr();

        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let aad_file_unique = vec![1u8; 8];
        let aad_prefix = vec![2u8; 8];
        let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
            aad_prefix: Some(aad_prefix.clone()),
            aad_file_unique: Some(aad_file_unique.clone()),
            supply_aad_prefix: Some(true),
        });
        let footer_key_metadata = Some(vec![3u8; 8]);
        let file_metadata =
            FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None)
                .with_encryption_algorithm(Some(encryption_algorithm))
                .with_footer_signing_key_metadata(footer_key_metadata.clone());

        // Baseline: metadata without a decryptor.
        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .build();

        let base_expected_size = 2042;
        assert_eq!(parquet_meta_data.memory_size(), base_expected_size);

        let footer_key = "0123456789012345".as_bytes();
        let column_key = "1234567890123450".as_bytes();
        let mut decryption_properties_builder =
            FileDecryptionProperties::builder(footer_key.to_vec())
                .with_aad_prefix(aad_prefix.clone());
        for column in schema_descr.columns() {
            decryption_properties_builder = decryption_properties_builder
                .with_column_key(&column.path().string(), column_key.to_vec());
        }
        let decryption_properties = decryption_properties_builder.build().unwrap();
        let decryptor = FileDecryptor::new(
            &decryption_properties,
            footer_key_metadata.as_deref(),
            aad_file_unique,
            aad_prefix,
        )
        .unwrap();

        // Same metadata plus a decryptor: must report a larger size.
        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .set_file_decryptor(Some(decryptor))
            .build();

        let expected_size_with_decryptor = 3056;
        assert!(expected_size_with_decryptor > base_expected_size);

        assert_eq!(
            parquet_meta_data.memory_size(),
            expected_size_with_decryptor
        );
    }

    // Helper: a schema with two required INT32 columns, "a" and "b".
    fn get_test_schema_descr() -> SchemaDescPtr {
        let schema = SchemaType::group_type_builder("schema")
            .with_fields(vec![
                Arc::new(
                    SchemaType::primitive_type_builder("a", Type::INT32)
                        .build()
                        .unwrap(),
                ),
                Arc::new(
                    SchemaType::primitive_type_builder("b", Type::INT32)
                        .build()
                        .unwrap(),
                ),
            ])
            .build()
            .unwrap(),
            ;
        // (no-op trailing formatting removed)
        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
    }
}