1mod memory;
95pub(crate) mod reader;
96mod writer;
97
98use crate::basic::{ColumnOrder, Compression, Encoding, Type};
99#[cfg(feature = "encryption")]
100use crate::encryption::{
101 decrypt::FileDecryptor,
102 modules::{create_module_aad, ModuleType},
103};
104use crate::errors::{ParquetError, Result};
105#[cfg(feature = "encryption")]
106use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData};
107pub(crate) use crate::file::metadata::memory::HeapSize;
108use crate::file::page_encoding_stats::{self, PageEncodingStats};
109use crate::file::page_index::index::Index;
110use crate::file::page_index::offset_index::OffsetIndexMetaData;
111use crate::file::statistics::{self, Statistics};
112use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData;
113use crate::format::{
114 BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup,
115 SizeStatistics, SortingColumn,
116};
117use crate::schema::types::{
118 ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
119 Type as SchemaType,
120};
121#[cfg(feature = "encryption")]
122use crate::thrift::{TCompactSliceInputProtocol, TSerializable};
123pub use reader::{FooterTail, ParquetMetaDataReader};
124use std::ops::Range;
125use std::sync::Arc;
126pub use writer::ParquetMetaDataWriter;
127pub(crate) use writer::ThriftMetadataWriter;
128
/// Page-level statistics (the "column index") for each column chunk of each
/// row group, indexed as `[row_group_index][column_index]`.
pub type ParquetColumnIndex = Vec<Vec<Index>>;

/// Page locations (the "offset index") for each column chunk of each row
/// group, indexed as `[row_group_index][column_index]`.
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
157
/// In-memory representation of a Parquet file footer: the file-level
/// metadata, per-row-group metadata, and (optionally) the page indexes.
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    // File-level metadata (schema, version, key/value pairs, ...)
    file_metadata: FileMetaData,
    // Metadata for each row group, in file order
    row_groups: Vec<RowGroupMetaData>,
    // Page-level statistics index, if it was read from the file
    column_index: Option<ParquetColumnIndex>,
    // Page location index, if it was read from the file
    offset_index: Option<ParquetOffsetIndex>,
    // Decryptor for encrypted metadata/pages, when encryption is enabled
    #[cfg(feature = "encryption")]
    file_decryptor: Option<FileDecryptor>,
}
189
impl ParquetMetaData {
    /// Creates Parquet metadata from file metadata and row group metadata,
    /// with no page indexes and (when enabled) no decryptor.
    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
        ParquetMetaData {
            file_metadata,
            row_groups,
            #[cfg(feature = "encryption")]
            file_decryptor: None,
            column_index: None,
            offset_index: None,
        }
    }

    /// Sets (or clears) the decryptor used to read encrypted column metadata.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
        self.file_decryptor = file_decryptor;
    }

    /// Converts this metadata into a [`ParquetMetaDataBuilder`] for modification.
    pub fn into_builder(self) -> ParquetMetaDataBuilder {
        self.into()
    }

    /// Returns the file-level metadata.
    pub fn file_metadata(&self) -> &FileMetaData {
        &self.file_metadata
    }

    /// Returns the configured decryptor, if any.
    #[cfg(feature = "encryption")]
    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
        self.file_decryptor.as_ref()
    }

    /// Returns the number of row groups in the file.
    pub fn num_row_groups(&self) -> usize {
        self.row_groups.len()
    }

    /// Returns the metadata for the `i`th row group.
    ///
    /// Panics if `i` is out of range.
    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
        &self.row_groups[i]
    }

    /// Returns metadata for all row groups, in file order.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.row_groups
    }

    /// Returns the column index (page statistics), if it was read.
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.column_index.as_ref()
    }

    /// Returns the offset index (page locations), if it was read.
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.offset_index.as_ref()
    }

    /// Estimates the memory used by this struct: its own size plus the heap
    /// allocations reachable from each field.
    ///
    /// NOTE(review): the encryption-only `file_decryptor` field is not
    /// included in this estimate — confirm whether that is intentional.
    pub fn memory_size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.file_metadata.heap_size()
            + self.row_groups.heap_size()
            + self.column_index.heap_size()
            + self.offset_index.heap_size()
    }

    /// Overrides the stored column index.
    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
        self.column_index = index;
    }

    /// Overrides the stored offset index.
    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
        self.offset_index = index;
    }
}
295
/// Builder for [`ParquetMetaData`]; wraps the metadata under construction.
pub struct ParquetMetaDataBuilder(ParquetMetaData);
334
335impl ParquetMetaDataBuilder {
336 pub fn new(file_meta_data: FileMetaData) -> Self {
338 Self(ParquetMetaData::new(file_meta_data, vec![]))
339 }
340
341 pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
343 Self(metadata)
344 }
345
346 pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
348 self.0.row_groups.push(row_group);
349 self
350 }
351
352 pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
354 self.0.row_groups = row_groups;
355 self
356 }
357
358 pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
364 std::mem::take(&mut self.0.row_groups)
365 }
366
367 pub fn row_groups(&self) -> &[RowGroupMetaData] {
369 &self.0.row_groups
370 }
371
372 pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
374 self.0.column_index = column_index;
375 self
376 }
377
378 pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
380 std::mem::take(&mut self.0.column_index)
381 }
382
383 pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
385 self.0.column_index.as_ref()
386 }
387
388 pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
390 self.0.offset_index = offset_index;
391 self
392 }
393
394 pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
396 std::mem::take(&mut self.0.offset_index)
397 }
398
399 pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
401 self.0.offset_index.as_ref()
402 }
403
404 pub fn build(self) -> ParquetMetaData {
406 let Self(metadata) = self;
407 metadata
408 }
409}
410
411impl From<ParquetMetaData> for ParquetMetaDataBuilder {
412 fn from(meta_data: ParquetMetaData) -> Self {
413 Self(meta_data)
414 }
415}
416
/// Key/value metadata pair stored in the footer (re-export of the generated
/// thrift type).
pub type KeyValue = crate::format::KeyValue;

/// Reference-counted pointer to [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
/// File-level metadata from the Parquet footer: version, row count, schema
/// and optional application-defined metadata.
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    // Parquet format version written in the footer
    version: i32,
    // Total number of rows across all row groups
    num_rows: i64,
    // Writer identification string, if present
    created_by: Option<String>,
    // Application-defined key/value pairs, if present
    key_value_metadata: Option<Vec<KeyValue>>,
    // Descriptor for the file schema
    schema_descr: SchemaDescPtr,
    // Sort order for statistics of each leaf column, if written
    column_orders: Option<Vec<ColumnOrder>>,
}
435
436impl FileMetaData {
437 pub fn new(
439 version: i32,
440 num_rows: i64,
441 created_by: Option<String>,
442 key_value_metadata: Option<Vec<KeyValue>>,
443 schema_descr: SchemaDescPtr,
444 column_orders: Option<Vec<ColumnOrder>>,
445 ) -> Self {
446 FileMetaData {
447 version,
448 num_rows,
449 created_by,
450 key_value_metadata,
451 schema_descr,
452 column_orders,
453 }
454 }
455
456 pub fn version(&self) -> i32 {
458 self.version
459 }
460
461 pub fn num_rows(&self) -> i64 {
463 self.num_rows
464 }
465
466 pub fn created_by(&self) -> Option<&str> {
475 self.created_by.as_deref()
476 }
477
478 pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
480 self.key_value_metadata.as_ref()
481 }
482
483 pub fn schema(&self) -> &SchemaType {
487 self.schema_descr.root_schema()
488 }
489
490 pub fn schema_descr(&self) -> &SchemaDescriptor {
492 &self.schema_descr
493 }
494
495 pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
497 self.schema_descr.clone()
498 }
499
500 pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
508 self.column_orders.as_ref()
509 }
510
511 pub fn column_order(&self, i: usize) -> ColumnOrder {
514 self.column_orders
515 .as_ref()
516 .map(|data| data[i])
517 .unwrap_or(ColumnOrder::UNDEFINED)
518 }
519}
520
/// Reference-counted pointer to [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
523
/// Metadata for one row group: its column chunks plus row-group-level sizes
/// and ordering information.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    // Metadata for each column chunk, in schema leaf order
    columns: Vec<ColumnChunkMetaData>,
    // Number of rows in this row group
    num_rows: i64,
    // Columns the data is sorted by, if declared
    sorting_columns: Option<Vec<SortingColumn>>,
    // Total byte size of all data in this row group
    total_byte_size: i64,
    // Descriptor of the file schema
    schema_descr: SchemaDescPtr,
    // Byte offset of this row group in the file, if recorded
    file_offset: Option<i64>,
    // Position of this row group within the file, if recorded
    ordinal: Option<i16>,
}
540
impl RowGroupMetaData {
    /// Returns a new builder for row group metadata with the given schema.
    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder::new(schema_descr)
    }

    /// Returns the number of column chunks in this row group.
    pub fn num_columns(&self) -> usize {
        self.columns.len()
    }

    /// Returns the metadata for the `i`th column chunk.
    ///
    /// Panics if `i` is out of range.
    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
        &self.columns[i]
    }

    /// Returns metadata for all column chunks.
    pub fn columns(&self) -> &[ColumnChunkMetaData] {
        &self.columns
    }

    /// Returns mutable access to the column chunk metadata.
    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
        &mut self.columns
    }

    /// Returns the number of rows in this row group.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the sorting columns declared for this row group, if any.
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Returns the total byte size of all data in this row group.
    pub fn total_byte_size(&self) -> i64 {
        self.total_byte_size
    }

    /// Returns the total compressed size, computed on demand as the sum of
    /// each column chunk's compressed size.
    pub fn compressed_size(&self) -> i64 {
        self.columns.iter().map(|c| c.total_compressed_size).sum()
    }

    /// Returns a reference to the schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        self.schema_descr.as_ref()
    }

    /// Returns a cheap, reference-counted clone of the schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns the ordinal (position) of this row group in the file, if recorded.
    #[inline(always)]
    pub fn ordinal(&self) -> Option<i16> {
        self.ordinal
    }

    /// Returns the byte offset of this row group in the file, if recorded.
    #[inline(always)]
    pub fn file_offset(&self) -> Option<i64> {
        self.file_offset
    }

    /// Builds row group metadata from a thrift `RowGroup`, first decrypting
    /// any encrypted column chunk metadata using `decryptor`.
    ///
    /// NOTE(review): uses `rg.ordinal.unwrap()` when constructing the AAD,
    /// so this panics if an encrypted file omits the row group ordinal —
    /// confirm that writers always set it.
    #[cfg(feature = "encryption")]
    fn from_encrypted_thrift(
        schema_descr: SchemaDescPtr,
        mut rg: RowGroup,
        decryptor: Option<&FileDecryptor>,
    ) -> Result<RowGroupMetaData> {
        if schema_descr.num_columns() != rg.columns.len() {
            return Err(general_err!(
                "Column count mismatch. Schema has {} columns while Row Group has {}",
                schema_descr.num_columns(),
                rg.columns.len()
            ));
        }
        let total_byte_size = rg.total_byte_size;
        let num_rows = rg.num_rows;
        let mut columns = vec![];

        for (i, (mut c, d)) in rg
            .columns
            .drain(0..)
            .zip(schema_descr.columns())
            .enumerate()
        {
            // Only decrypt when the chunk carries encrypted metadata AND a
            // decryptor was provided.
            if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) {
                // Choose a column-specific key or fall back to the footer key.
                let column_decryptor = match c.crypto_metadata.as_ref() {
                    None => {
                        return Err(general_err!(
                            "No crypto_metadata is set for column '{}', which has encrypted metadata",
                            d.path().string()
                        ));
                    }
                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => {
                        let column_name = crypto_metadata.path_in_schema.join(".");
                        decryptor.get_column_metadata_decryptor(
                            column_name.as_str(),
                            crypto_metadata.key_metadata.as_deref(),
                        )?
                    }
                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => {
                        decryptor.get_footer_decryptor()?
                    }
                };

                // The AAD binds the ciphertext to this row group ordinal and
                // column position.
                let column_aad = create_module_aad(
                    decryptor.file_aad(),
                    ModuleType::ColumnMetaData,
                    rg.ordinal.unwrap() as usize,
                    i,
                    None,
                )?;

                let buf = c.encrypted_column_metadata.clone().unwrap();
                let decrypted_cc_buf = column_decryptor
                    .decrypt(buf.as_slice(), column_aad.as_ref())
                    .map_err(|_| {
                        general_err!(
                            "Unable to decrypt column '{}', perhaps the column key is wrong?",
                            d.path().string()
                        )
                    })?;

                // Replace the placeholder metadata with the decrypted thrift
                // payload before converting.
                let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice());
                c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?);
            }
            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
        }

        let sorting_columns = rg.sorting_columns;
        Ok(RowGroupMetaData {
            columns,
            num_rows,
            sorting_columns,
            total_byte_size,
            schema_descr,
            file_offset: rg.file_offset,
            ordinal: rg.ordinal,
        })
    }

    /// Builds row group metadata from a (plaintext) thrift `RowGroup`.
    ///
    /// Returns an error if the thrift column count does not match the schema.
    pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
        if schema_descr.num_columns() != rg.columns.len() {
            return Err(general_err!(
                "Column count mismatch. Schema has {} columns while Row Group has {}",
                schema_descr.num_columns(),
                rg.columns.len()
            ));
        }
        let total_byte_size = rg.total_byte_size;
        let num_rows = rg.num_rows;
        let mut columns = vec![];

        for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) {
            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
        }

        let sorting_columns = rg.sorting_columns;
        Ok(RowGroupMetaData {
            columns,
            num_rows,
            sorting_columns,
            total_byte_size,
            schema_descr,
            file_offset: rg.file_offset,
            ordinal: rg.ordinal,
        })
    }

    /// Converts this metadata back to its thrift representation.
    pub fn to_thrift(&self) -> RowGroup {
        RowGroup {
            columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
            total_byte_size: self.total_byte_size,
            num_rows: self.num_rows,
            sorting_columns: self.sorting_columns().cloned(),
            file_offset: self.file_offset(),
            total_compressed_size: Some(self.compressed_size()),
            ordinal: self.ordinal,
        }
    }

    /// Converts this metadata into a builder for further modification.
    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder(self)
    }
}
740
/// Builder for [`RowGroupMetaData`]; wraps the metadata under construction.
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
743
744impl RowGroupMetaDataBuilder {
745 fn new(schema_descr: SchemaDescPtr) -> Self {
747 Self(RowGroupMetaData {
748 columns: Vec::with_capacity(schema_descr.num_columns()),
749 schema_descr,
750 file_offset: None,
751 num_rows: 0,
752 sorting_columns: None,
753 total_byte_size: 0,
754 ordinal: None,
755 })
756 }
757
758 pub fn set_num_rows(mut self, value: i64) -> Self {
760 self.0.num_rows = value;
761 self
762 }
763
764 pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
766 self.0.sorting_columns = value;
767 self
768 }
769
770 pub fn set_total_byte_size(mut self, value: i64) -> Self {
772 self.0.total_byte_size = value;
773 self
774 }
775
776 pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
782 std::mem::take(&mut self.0.columns)
783 }
784
785 pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
787 self.0.columns = value;
788 self
789 }
790
791 pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
793 self.0.columns.push(value);
794 self
795 }
796
797 pub fn set_ordinal(mut self, value: i16) -> Self {
799 self.0.ordinal = Some(value);
800 self
801 }
802
803 pub fn set_file_offset(mut self, value: i64) -> Self {
805 self.0.file_offset = Some(value);
806 self
807 }
808
809 pub fn build(self) -> Result<RowGroupMetaData> {
811 if self.0.schema_descr.num_columns() != self.0.columns.len() {
812 return Err(general_err!(
813 "Column length mismatch: {} != {}",
814 self.0.schema_descr.num_columns(),
815 self.0.columns.len()
816 ));
817 }
818
819 Ok(self.0)
820 }
821}
822
/// Metadata for a single column chunk within a row group: encodings, sizes,
/// page offsets, statistics and (optionally) page index locations.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    // Descriptor (type, path, levels) of the column
    column_descr: ColumnDescPtr,
    // Encodings used anywhere in this chunk
    encodings: Vec<Encoding>,
    // Path to an external file holding the chunk's data, if any
    file_path: Option<String>,
    // Byte offset recorded in the thrift ColumnChunk
    file_offset: i64,
    // Number of values (including nulls) in this chunk
    num_values: i64,
    // Compression codec used for the chunk's pages
    compression: Compression,
    // Total compressed / uncompressed byte sizes of the chunk
    total_compressed_size: i64,
    total_uncompressed_size: i64,
    // Offset of the first data page
    data_page_offset: i64,
    // Offsets of the index page and dictionary page, if present
    index_page_offset: Option<i64>,
    dictionary_page_offset: Option<i64>,
    // Column chunk statistics, if written
    statistics: Option<Statistics>,
    // Per-encoding page counts, if written
    encoding_stats: Option<Vec<PageEncodingStats>>,
    // Location/length of the bloom filter, if present
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    // Location/length of the offset index, if present
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    // Location/length of the column index, if present
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    // Size statistics from the thrift SizeStatistics struct, if written
    unencoded_byte_array_data_bytes: Option<i64>,
    repetition_level_histogram: Option<LevelHistogram>,
    definition_level_histogram: Option<LevelHistogram>,
    // Column encryption metadata, when the encryption feature is enabled
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<ColumnCryptoMetaData>,
}
851
/// A histogram of repetition or definition levels: `inner[i]` holds the
/// number of values observed at level `i`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a zeroed histogram able to count levels `0..=max_level`.
    ///
    /// Returns `None` when `max_level` is not positive, since no histogram
    /// is needed for a column whose levels are always zero.
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level > 0 {
            Some(Self {
                inner: vec![0; max_level as usize + 1],
            })
        } else {
            None
        }
    }

    /// Returns the underlying counts as a slice.
    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    /// Consumes the histogram, returning the underlying counts.
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the count for `index`, or `None` when out of range.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds `other`'s counts element-wise into `self`.
    ///
    /// # Panics
    /// Panics if the two histograms have different lengths.
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    /// Returns the number of levels tracked (`max_level + 1`).
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns `true` if no levels are tracked.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Resets all counts to zero, keeping the allocation.
    pub fn reset(&mut self) {
        // `slice::fill` replaces the manual element-by-element loop: clearer
        // and compiles to a memset.
        self.inner.fill(0);
    }

    /// Increments the count of each level present in `levels`.
    ///
    /// # Panics
    /// Panics if any level exceeds the histogram's maximum level.
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            self.inner[level as usize] += 1;
        }
    }
}

impl From<Vec<i64>> for LevelHistogram {
    /// Wraps pre-computed counts without validation.
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}

impl From<LevelHistogram> for Vec<i64> {
    /// Unwraps the histogram into its raw counts.
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}
951
// Accounts for the heap allocation of the backing `Vec` when estimating
// metadata memory usage via `ParquetMetaData::memory_size`.
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        self.inner.heap_size()
    }
}
957
impl ColumnChunkMetaData {
    /// Returns a new builder for column chunk metadata with the given descriptor.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// Returns the path to an external file holding this chunk's data, if any.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Returns the byte offset recorded in the thrift `ColumnChunk`.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Returns the physical type of this column.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Returns the path of this column in the schema.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Returns a reference to the column descriptor.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Returns a cheap, reference-counted clone of the column descriptor.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// Returns all encodings used anywhere in this chunk.
    pub fn encodings(&self) -> &Vec<Encoding> {
        &self.encodings
    }

    /// Returns the number of values (including nulls) in this chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Returns the compression codec used for this chunk's pages.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed size of this chunk in bytes.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed size of this chunk in bytes.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset of the first data page.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset of the index page, if present.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset of the dictionary page, if present.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns `(start, length)` of this chunk's data in the file. The start
    /// is the dictionary page offset when present (it precedes data pages),
    /// otherwise the first data page offset.
    ///
    /// # Panics
    /// Panics if the stored start or length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns the statistics written for this chunk, if any.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns the per-encoding page counts, if written.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    /// Returns the offset of the bloom filter, if present.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length of the bloom filter, if present.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset of the column index, if present.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length of the column index, if present.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range of the column index in the file, or `None` if
    /// absent or if offset/length do not fit in `u64`.
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset of the offset index, if present.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length of the offset index, if present.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range of the offset index in the file, or `None` if
    /// absent or if offset/length do not fit in `u64`.
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of unencoded BYTE_ARRAY data bytes, if written.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram for this chunk, if written.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram for this chunk, if written.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the column encryption metadata, if any.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_ref()
    }

    /// Builds column chunk metadata from its thrift representation.
    ///
    /// Fails if `cc.meta_data` is missing or any enum value is unrecognized.
    pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
        if cc.meta_data.is_none() {
            return Err(general_err!("Expected to have column metadata"));
        }
        let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap();
        let column_type = Type::try_from(col_metadata.type_)?;
        let encodings = col_metadata
            .encodings
            .drain(0..)
            .map(Encoding::try_from)
            .collect::<Result<_>>()?;
        let compression = Compression::try_from(col_metadata.codec)?;
        let file_path = cc.file_path;
        let file_offset = cc.file_offset;
        let num_values = col_metadata.num_values;
        let total_compressed_size = col_metadata.total_compressed_size;
        let total_uncompressed_size = col_metadata.total_uncompressed_size;
        let data_page_offset = col_metadata.data_page_offset;
        let index_page_offset = col_metadata.index_page_offset;
        let dictionary_page_offset = col_metadata.dictionary_page_offset;
        let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?;
        let encoding_stats = col_metadata
            .encoding_stats
            .as_ref()
            .map(|vec| {
                vec.iter()
                    .map(page_encoding_stats::try_from_thrift)
                    .collect::<Result<_>>()
            })
            .transpose()?;
        let bloom_filter_offset = col_metadata.bloom_filter_offset;
        let bloom_filter_length = col_metadata.bloom_filter_length;
        let offset_index_offset = cc.offset_index_offset;
        let offset_index_length = cc.offset_index_length;
        let column_index_offset = cc.column_index_offset;
        let column_index_length = cc.column_index_length;
        // Unpack the optional SizeStatistics struct into its three fields
        let (
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
        ) = if let Some(size_stats) = col_metadata.size_statistics {
            (
                size_stats.unencoded_byte_array_data_bytes,
                size_stats.repetition_level_histogram,
                size_stats.definition_level_histogram,
            )
        } else {
            (None, None, None)
        };

        let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
        let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);

        #[cfg(feature = "encryption")]
        let column_crypto_metadata = if let Some(crypto_metadata) = cc.crypto_metadata {
            Some(column_crypto_metadata::try_from_thrift(&crypto_metadata)?)
        } else {
            None
        };

        let result = ColumnChunkMetaData {
            column_descr,
            encodings,
            file_path,
            file_offset,
            num_values,
            compression,
            total_compressed_size,
            total_uncompressed_size,
            data_page_offset,
            index_page_offset,
            dictionary_page_offset,
            statistics,
            encoding_stats,
            bloom_filter_offset,
            bloom_filter_length,
            offset_index_offset,
            offset_index_length,
            column_index_offset,
            column_index_length,
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
            #[cfg(feature = "encryption")]
            column_crypto_metadata,
        };
        Ok(result)
    }

    /// Converts this metadata to its thrift `ColumnChunk` representation.
    /// The metadata is always embedded unencrypted (`encrypted_column_metadata`
    /// is left unset).
    pub fn to_thrift(&self) -> ColumnChunk {
        let column_metadata = self.to_column_metadata_thrift();

        ColumnChunk {
            file_path: self.file_path().map(|s| s.to_owned()),
            file_offset: self.file_offset,
            meta_data: Some(column_metadata),
            offset_index_offset: self.offset_index_offset,
            offset_index_length: self.offset_index_length,
            column_index_offset: self.column_index_offset,
            column_index_length: self.column_index_length,
            crypto_metadata: self.column_crypto_metadata_thrift(),
            encrypted_column_metadata: None,
        }
    }

    /// Converts this metadata to its thrift `ColumnMetaData` representation.
    pub fn to_column_metadata_thrift(&self) -> ColumnMetaData {
        // Emit a SizeStatistics struct only if at least one field is set
        let size_statistics = if self.unencoded_byte_array_data_bytes.is_some()
            || self.repetition_level_histogram.is_some()
            || self.definition_level_histogram.is_some()
        {
            let repetition_level_histogram = self
                .repetition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            let definition_level_histogram = self
                .definition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            Some(SizeStatistics {
                unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes,
                repetition_level_histogram,
                definition_level_histogram,
            })
        } else {
            None
        };

        ColumnMetaData {
            type_: self.column_type().into(),
            encodings: self.encodings().iter().map(|&v| v.into()).collect(),
            path_in_schema: self.column_path().as_ref().to_vec(),
            codec: self.compression.into(),
            num_values: self.num_values,
            total_uncompressed_size: self.total_uncompressed_size,
            total_compressed_size: self.total_compressed_size,
            key_value_metadata: None,
            data_page_offset: self.data_page_offset,
            index_page_offset: self.index_page_offset,
            dictionary_page_offset: self.dictionary_page_offset,
            statistics: statistics::to_thrift(self.statistics.as_ref()),
            encoding_stats: self
                .encoding_stats
                .as_ref()
                .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()),
            bloom_filter_offset: self.bloom_filter_offset,
            bloom_filter_length: self.bloom_filter_length,
            size_statistics,
            geospatial_statistics: None,
        }
    }

    /// Converts this metadata into a builder for further modification.
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }

    // Converts the stored crypto metadata (if any) to thrift.
    #[cfg(feature = "encryption")]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        self.column_crypto_metadata
            .as_ref()
            .map(column_crypto_metadata::to_thrift)
    }

    // Without the encryption feature there is never crypto metadata.
    #[cfg(not(feature = "encryption"))]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        None
    }
}
1318
/// Builder for [`ColumnChunkMetaData`]; wraps the metadata under construction.
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1338
1339impl ColumnChunkMetaDataBuilder {
1340 fn new(column_descr: ColumnDescPtr) -> Self {
1344 Self(ColumnChunkMetaData {
1345 column_descr,
1346 encodings: Vec::new(),
1347 file_path: None,
1348 file_offset: 0,
1349 num_values: 0,
1350 compression: Compression::UNCOMPRESSED,
1351 total_compressed_size: 0,
1352 total_uncompressed_size: 0,
1353 data_page_offset: 0,
1354 index_page_offset: None,
1355 dictionary_page_offset: None,
1356 statistics: None,
1357 encoding_stats: None,
1358 bloom_filter_offset: None,
1359 bloom_filter_length: None,
1360 offset_index_offset: None,
1361 offset_index_length: None,
1362 column_index_offset: None,
1363 column_index_length: None,
1364 unencoded_byte_array_data_bytes: None,
1365 repetition_level_histogram: None,
1366 definition_level_histogram: None,
1367 #[cfg(feature = "encryption")]
1368 column_crypto_metadata: None,
1369 })
1370 }
1371
1372 pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
1374 self.0.encodings = encodings;
1375 self
1376 }
1377
1378 pub fn set_file_path(mut self, value: String) -> Self {
1380 self.0.file_path = Some(value);
1381 self
1382 }
1383
1384 pub fn set_num_values(mut self, value: i64) -> Self {
1386 self.0.num_values = value;
1387 self
1388 }
1389
1390 pub fn set_compression(mut self, value: Compression) -> Self {
1392 self.0.compression = value;
1393 self
1394 }
1395
1396 pub fn set_total_compressed_size(mut self, value: i64) -> Self {
1398 self.0.total_compressed_size = value;
1399 self
1400 }
1401
1402 pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
1404 self.0.total_uncompressed_size = value;
1405 self
1406 }
1407
1408 pub fn set_data_page_offset(mut self, value: i64) -> Self {
1410 self.0.data_page_offset = value;
1411 self
1412 }
1413
1414 pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
1416 self.0.dictionary_page_offset = value;
1417 self
1418 }
1419
1420 pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
1422 self.0.index_page_offset = value;
1423 self
1424 }
1425
1426 pub fn set_statistics(mut self, value: Statistics) -> Self {
1428 self.0.statistics = Some(value);
1429 self
1430 }
1431
1432 pub fn clear_statistics(mut self) -> Self {
1434 self.0.statistics = None;
1435 self
1436 }
1437
1438 pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
1440 self.0.encoding_stats = Some(value);
1441 self
1442 }
1443
1444 pub fn clear_page_encoding_stats(mut self) -> Self {
1446 self.0.encoding_stats = None;
1447 self
1448 }
1449
1450 pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
1452 self.0.bloom_filter_offset = value;
1453 self
1454 }
1455
1456 pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
1458 self.0.bloom_filter_length = value;
1459 self
1460 }
1461
1462 pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
1464 self.0.offset_index_offset = value;
1465 self
1466 }
1467
1468 pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
1470 self.0.offset_index_length = value;
1471 self
1472 }
1473
1474 pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
1476 self.0.column_index_offset = value;
1477 self
1478 }
1479
1480 pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
1482 self.0.column_index_length = value;
1483 self
1484 }
1485
1486 pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
1488 self.0.unencoded_byte_array_data_bytes = value;
1489 self
1490 }
1491
1492 pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1494 self.0.repetition_level_histogram = value;
1495 self
1496 }
1497
1498 pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1500 self.0.definition_level_histogram = value;
1501 self
1502 }
1503
1504 #[cfg(feature = "encryption")]
1505 pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
1507 self.0.column_crypto_metadata = value;
1508 self
1509 }
1510
1511 pub fn build(self) -> Result<ColumnChunkMetaData> {
1513 Ok(self.0)
1514 }
1515}
1516
/// Accumulates per-page statistics for one column chunk and converts them
/// into a thrift `ColumnIndex`.
pub struct ColumnIndexBuilder {
    // One entry per page: whether the page contains only nulls
    null_pages: Vec<bool>,
    // One entry per page: encoded min / max values
    min_values: Vec<Vec<u8>>,
    max_values: Vec<Vec<u8>>,
    // One entry per page: number of nulls
    null_counts: Vec<i64>,
    // Ordering of min/max values across pages
    boundary_order: BoundaryOrder,
    // Concatenated per-page level histograms, if collected
    repetition_level_histograms: Option<Vec<i64>>,
    definition_level_histograms: Option<Vec<i64>>,
    // Set to false once the accumulated index is known to be unusable
    valid: bool,
}
1539
1540impl Default for ColumnIndexBuilder {
1541 fn default() -> Self {
1542 Self::new()
1543 }
1544}
1545
1546impl ColumnIndexBuilder {
1547 pub fn new() -> Self {
1549 ColumnIndexBuilder {
1550 null_pages: Vec::new(),
1551 min_values: Vec::new(),
1552 max_values: Vec::new(),
1553 null_counts: Vec::new(),
1554 boundary_order: BoundaryOrder::UNORDERED,
1555 repetition_level_histograms: None,
1556 definition_level_histograms: None,
1557 valid: true,
1558 }
1559 }
1560
1561 pub fn append(
1563 &mut self,
1564 null_page: bool,
1565 min_value: Vec<u8>,
1566 max_value: Vec<u8>,
1567 null_count: i64,
1568 ) {
1569 self.null_pages.push(null_page);
1570 self.min_values.push(min_value);
1571 self.max_values.push(max_value);
1572 self.null_counts.push(null_count);
1573 }
1574
1575 pub fn append_histograms(
1578 &mut self,
1579 repetition_level_histogram: &Option<LevelHistogram>,
1580 definition_level_histogram: &Option<LevelHistogram>,
1581 ) {
1582 if !self.valid {
1583 return;
1584 }
1585 if let Some(ref rep_lvl_hist) = repetition_level_histogram {
1586 let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1587 hist.reserve(rep_lvl_hist.len());
1588 hist.extend(rep_lvl_hist.values());
1589 }
1590 if let Some(ref def_lvl_hist) = definition_level_histogram {
1591 let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1592 hist.reserve(def_lvl_hist.len());
1593 hist.extend(def_lvl_hist.values());
1594 }
1595 }
1596
1597 pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1599 self.boundary_order = boundary_order;
1600 }
1601
1602 pub fn to_invalid(&mut self) {
1604 self.valid = false;
1605 }
1606
1607 pub fn valid(&self) -> bool {
1609 self.valid
1610 }
1611
1612 pub fn build_to_thrift(self) -> ColumnIndex {
1616 ColumnIndex::new(
1617 self.null_pages,
1618 self.min_values,
1619 self.max_values,
1620 self.boundary_order,
1621 self.null_counts,
1622 self.repetition_level_histograms,
1623 self.definition_level_histograms,
1624 )
1625 }
1626}
1627
1628impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1629 fn from(value: ColumnChunkMetaData) -> Self {
1630 ColumnChunkMetaDataBuilder(value)
1631 }
1632}
1633
/// Builder for the thrift [`OffsetIndex`], which records the location of each
/// page in a column chunk.
///
/// The three `*_array` vectors are parallel: entry `i` describes page `i`.
pub struct OffsetIndexBuilder {
    /// File offset of each page.
    offset_array: Vec<i64>,
    /// Compressed size in bytes of each page.
    compressed_page_size_array: Vec<i32>,
    /// Index of the first row of each page.
    first_row_index_array: Vec<i64>,
    /// Per-page unencoded data byte counts, if tracked (presumably only
    /// meaningful for byte-array columns — mirrors `SizeStatistics`).
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running row total, used as the first row index of the next page.
    current_first_row_index: i64,
}
1644
1645impl Default for OffsetIndexBuilder {
1646 fn default() -> Self {
1647 Self::new()
1648 }
1649}
1650
1651impl OffsetIndexBuilder {
1652 pub fn new() -> Self {
1654 OffsetIndexBuilder {
1655 offset_array: Vec::new(),
1656 compressed_page_size_array: Vec::new(),
1657 first_row_index_array: Vec::new(),
1658 unencoded_byte_array_data_bytes_array: None,
1659 current_first_row_index: 0,
1660 }
1661 }
1662
1663 pub fn append_row_count(&mut self, row_count: i64) {
1665 let current_page_row_index = self.current_first_row_index;
1666 self.first_row_index_array.push(current_page_row_index);
1667 self.current_first_row_index += row_count;
1668 }
1669
1670 pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1672 self.offset_array.push(offset);
1673 self.compressed_page_size_array.push(compressed_page_size);
1674 }
1675
1676 pub fn append_unencoded_byte_array_data_bytes(
1678 &mut self,
1679 unencoded_byte_array_data_bytes: Option<i64>,
1680 ) {
1681 if let Some(val) = unencoded_byte_array_data_bytes {
1682 self.unencoded_byte_array_data_bytes_array
1683 .get_or_insert(Vec::new())
1684 .push(val);
1685 }
1686 }
1687
1688 pub fn build_to_thrift(self) -> OffsetIndex {
1690 let locations = self
1691 .offset_array
1692 .iter()
1693 .zip(self.compressed_page_size_array.iter())
1694 .zip(self.first_row_index_array.iter())
1695 .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index))
1696 .collect::<Vec<_>>();
1697 OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array)
1698 }
1699}
1700
#[cfg(test)]
mod tests {
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::page_index::index::NativeIndex;

    /// Round-trips a populated `RowGroupMetaData` through its thrift
    /// representation and checks the result is unchanged.
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        // One (default-built) column chunk per schema leaf column.
        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        // thrift -> metadata -> thrift must be the identity.
        let row_group_exp = row_group_meta.to_thrift();
        let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(row_group_res, row_group_exp);
    }

    /// Building row group metadata with no column metadata must fail: the test
    /// schema has 2 leaf columns but 0 column chunks were supplied.
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }

    /// Decoding thrift row group metadata against a schema with a different
    /// column count must fail with a descriptive error rather than panic.
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        // Schema with two INT32 leaf columns ("a", "b").
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Mismatching schema with three INT32 leaf columns ("a", "b", "c").
        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();

        // Decode the 2-column row group against the 3-column schema.
        let err =
            RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
                .unwrap_err()
                .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    /// Round-trips a `ColumnChunkMetaData` with every optional field populated
    /// through thrift and checks equality with the original.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        let col_chunk_res =
            ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    /// Round-trips a default (all-optional-fields-unset) `ColumnChunkMetaData`
    /// through thrift.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .build()
            .unwrap();

        let col_chunk_exp = col_metadata.to_thrift();
        let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(col_chunk_res, col_chunk_exp);
    }

    /// `RowGroupMetaData::compressed_size` must be the sum of the per-column
    /// total compressed sizes (2 columns x 500 bytes = 1000).
    #[test]
    fn test_compressed_size() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for column_descr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(column_descr.clone())
                .set_total_compressed_size(500)
                .set_total_uncompressed_size(700)
                .build()
                .unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();

        let compressed_size_res: i64 = row_group_meta.compressed_size();
        let compressed_size_exp: i64 = 1000;

        assert_eq!(compressed_size_res, compressed_size_exp);
    }

    /// Checks `ParquetMetaData::memory_size` against hard-coded expected
    /// values, and that adding statistics plus page indexes strictly increases
    /// the reported size.
    ///
    /// NOTE: the expected sizes are tied to the exact in-memory layout of the
    /// metadata types (and to the `encryption` feature flag); they must be
    /// updated whenever those structs change.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // Columns with empty (all-None) statistics.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Columns with populated min/max statistics (heap-allocated values).
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        // Baseline size; differs with the `encryption` feature because
        // `ParquetMetaData` gains a `file_decryptor` field.
        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2312;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2648;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Build a minimal column index and a two-page offset index to attach.
        let mut column_index = ColumnIndexBuilder::new();
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build_to_thrift();
        let native_index = NativeIndex::<bool>::try_new(column_index).unwrap();

        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build_to_thrift();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![
                OffsetIndexMetaData::try_new(offset_index).unwrap()
            ]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 2816;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3152;

        // Sanity check that the indexes actually add to the reported size.
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    /// Returns a schema descriptor with two INT32 leaf columns ("a", "b"),
    /// shared by the tests above.
    fn get_test_schema_descr() -> SchemaDescPtr {
        let schema = SchemaType::group_type_builder("schema")
            .with_fields(vec![
                Arc::new(
                    SchemaType::primitive_type_builder("a", Type::INT32)
                        .build()
                        .unwrap(),
                ),
                Arc::new(
                    SchemaType::primitive_type_builder("b", Type::INT32)
                        .build()
                        .unwrap(),
                ),
            ])
            .build()
            .unwrap();

        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
    }
}