1mod memory;
95pub(crate) mod reader;
96mod writer;
97
98use crate::basic::{ColumnOrder, Compression, Encoding, Type};
99#[cfg(feature = "encryption")]
100use crate::encryption::{
101 decrypt::FileDecryptor,
102 modules::{create_module_aad, ModuleType},
103};
104use crate::errors::{ParquetError, Result};
105#[cfg(feature = "encryption")]
106use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData};
107pub(crate) use crate::file::metadata::memory::HeapSize;
108use crate::file::page_encoding_stats::{self, PageEncodingStats};
109use crate::file::page_index::index::Index;
110use crate::file::page_index::offset_index::OffsetIndexMetaData;
111use crate::file::statistics::{self, Statistics};
112use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData;
113use crate::format::{
114 BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup,
115 SizeStatistics, SortingColumn,
116};
117use crate::schema::types::{
118 ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
119 Type as SchemaType,
120};
121#[cfg(feature = "encryption")]
122use crate::thrift::{TCompactSliceInputProtocol, TSerializable};
123pub use reader::ParquetMetaDataReader;
124use std::ops::Range;
125use std::sync::Arc;
126pub use writer::ParquetMetaDataWriter;
127pub(crate) use writer::ThriftMetadataWriter;
128
/// Page-level column index: a nested list of [`Index`] values.
/// The outer vector is presumably indexed by row group and the inner by
/// column (mirrors `RowGroupMetaData`'s layout) — confirm against the reader.
pub type ParquetColumnIndex = Vec<Vec<Index>>;
145
/// Page-level offset index: a nested list of [`OffsetIndexMetaData`].
/// The outer vector is presumably indexed by row group and the inner by
/// column (mirrors `RowGroupMetaData`'s layout) — confirm against the reader.
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
157
/// In-memory representation of a Parquet file's global metadata: the
/// file-level metadata, the metadata of every row group, and (optionally)
/// the page indexes.
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File-level metadata (version, schema, key/value pairs, column orders).
    file_metadata: FileMetaData,
    /// Metadata for each row group, in the order they appear in the vector.
    row_groups: Vec<RowGroupMetaData>,
    /// Page-level column index, set only when it was read/built.
    column_index: Option<ParquetColumnIndex>,
    /// Page-level offset index, set only when it was read/built.
    offset_index: Option<ParquetOffsetIndex>,
    /// Decryptor used for encrypted column metadata (``encryption`` feature).
    #[cfg(feature = "encryption")]
    file_decryptor: Option<FileDecryptor>,
}
189
impl ParquetMetaData {
    /// Creates metadata from file metadata and row groups; the page indexes
    /// (and the decryptor, when the `encryption` feature is on) start unset.
    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
        ParquetMetaData {
            file_metadata,
            row_groups,
            #[cfg(feature = "encryption")]
            file_decryptor: None,
            column_index: None,
            offset_index: None,
        }
    }

    /// Sets (or clears) the decryptor used for encrypted column metadata.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
        self.file_decryptor = file_decryptor;
    }

    /// Creates metadata including the optional page indexes.
    #[deprecated(since = "53.1.0", note = "Use ParquetMetaDataBuilder")]
    pub fn new_with_page_index(
        file_metadata: FileMetaData,
        row_groups: Vec<RowGroupMetaData>,
        column_index: Option<ParquetColumnIndex>,
        offset_index: Option<ParquetOffsetIndex>,
    ) -> Self {
        ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_groups)
            .set_column_index(column_index)
            .set_offset_index(offset_index)
            .build()
    }

    /// Converts this metadata into a [`ParquetMetaDataBuilder`] for modification.
    pub fn into_builder(self) -> ParquetMetaDataBuilder {
        self.into()
    }

    /// Returns the file-level metadata.
    pub fn file_metadata(&self) -> &FileMetaData {
        &self.file_metadata
    }

    /// Returns the decryptor, if one has been set.
    #[cfg(feature = "encryption")]
    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
        self.file_decryptor.as_ref()
    }

    /// Returns the number of row groups.
    pub fn num_row_groups(&self) -> usize {
        self.row_groups.len()
    }

    /// Returns the metadata of row group `i`.
    ///
    /// # Panics
    /// Panics if `i` is out of bounds.
    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
        &self.row_groups[i]
    }

    /// Returns the metadata of all row groups.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.row_groups
    }

    /// Returns the page-level column index, if present.
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.column_index.as_ref()
    }

    /// Returns the page-level offset index, if present.
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.offset_index.as_ref()
    }

    /// Estimated in-memory size in bytes: the struct itself plus the heap
    /// allocations of its fields, as reported by [`HeapSize`].
    pub fn memory_size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.file_metadata.heap_size()
            + self.row_groups.heap_size()
            + self.column_index.heap_size()
            + self.offset_index.heap_size()
    }

    /// Overwrites the column index.
    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
        self.column_index = index;
    }

    /// Overwrites the offset index.
    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
        self.offset_index = index;
    }
}
311
/// Builder for [`ParquetMetaData`]; a thin wrapper over the metadata being
/// constructed, consumed by [`ParquetMetaDataBuilder::build`].
pub struct ParquetMetaDataBuilder(ParquetMetaData);
350
351impl ParquetMetaDataBuilder {
352 pub fn new(file_meta_data: FileMetaData) -> Self {
354 Self(ParquetMetaData::new(file_meta_data, vec![]))
355 }
356
357 pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
359 Self(metadata)
360 }
361
362 pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
364 self.0.row_groups.push(row_group);
365 self
366 }
367
368 pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
370 self.0.row_groups = row_groups;
371 self
372 }
373
374 pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
380 std::mem::take(&mut self.0.row_groups)
381 }
382
383 pub fn row_groups(&self) -> &[RowGroupMetaData] {
385 &self.0.row_groups
386 }
387
388 pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
390 self.0.column_index = column_index;
391 self
392 }
393
394 pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
396 std::mem::take(&mut self.0.column_index)
397 }
398
399 pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
401 self.0.column_index.as_ref()
402 }
403
404 pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
406 self.0.offset_index = offset_index;
407 self
408 }
409
410 pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
412 std::mem::take(&mut self.0.offset_index)
413 }
414
415 pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
417 self.0.offset_index.as_ref()
418 }
419
420 pub fn build(self) -> ParquetMetaData {
422 let Self(metadata) = self;
423 metadata
424 }
425}
426
427impl From<ParquetMetaData> for ParquetMetaDataBuilder {
428 fn from(meta_data: ParquetMetaData) -> Self {
429 Self(meta_data)
430 }
431}
432
/// Re-export of the thrift-generated key/value metadata pair.
pub type KeyValue = crate::format::KeyValue;
435
/// Reference-counted pointer to [`FileMetaData`], shared between readers.
pub type FileMetaDataPtr = Arc<FileMetaData>;
438
/// File-level Parquet metadata: format version, total row count, writer
/// string, user key/value metadata, schema, and per-column sort orders.
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Parquet format version number.
    version: i32,
    /// Total number of rows across all row groups.
    num_rows: i64,
    /// Application that wrote the file, if recorded.
    created_by: Option<String>,
    /// Arbitrary user-supplied key/value metadata.
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Descriptor of the file's schema.
    schema_descr: SchemaDescPtr,
    /// Sort order for each leaf column, if recorded.
    column_orders: Option<Vec<ColumnOrder>>,
}
451
impl FileMetaData {
    /// Creates file-level metadata from its parts.
    pub fn new(
        version: i32,
        num_rows: i64,
        created_by: Option<String>,
        key_value_metadata: Option<Vec<KeyValue>>,
        schema_descr: SchemaDescPtr,
        column_orders: Option<Vec<ColumnOrder>>,
    ) -> Self {
        FileMetaData {
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr,
            column_orders,
        }
    }

    /// Returns the Parquet format version.
    pub fn version(&self) -> i32 {
        self.version
    }

    /// Returns the total number of rows in the file.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the writer ("created by") string, if present.
    pub fn created_by(&self) -> Option<&str> {
        self.created_by.as_deref()
    }

    /// Returns the user key/value metadata, if present.
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns the root schema node.
    pub fn schema(&self) -> &SchemaType {
        self.schema_descr.root_schema()
    }

    /// Returns the schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        &self.schema_descr
    }

    /// Returns a shared pointer to the schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns the per-column sort orders, if present.
    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
        self.column_orders.as_ref()
    }

    /// Returns the sort order of column `i`, or `UNDEFINED` when the file
    /// recorded no column orders at all.
    ///
    /// # Panics
    /// Panics if column orders are present and `i` is out of bounds.
    pub fn column_order(&self, i: usize) -> ColumnOrder {
        self.column_orders
            .as_ref()
            .map(|data| data[i])
            .unwrap_or(ColumnOrder::UNDEFINED)
    }
}
536
/// Reference-counted pointer to [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
539
/// Metadata for a single row group: per-column chunk metadata plus row-group
/// level totals and ordering information.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk, one per leaf column of the schema.
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group.
    num_rows: i64,
    /// Columns the data is sorted by, if recorded.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total uncompressed byte size of all column data.
    total_byte_size: i64,
    /// Descriptor of the file schema this row group conforms to.
    schema_descr: SchemaDescPtr,
    /// Byte offset of the row group in the file, if recorded.
    file_offset: Option<i64>,
    /// Zero-based position of this row group in the file, if recorded.
    ordinal: Option<i16>,
}
556
impl RowGroupMetaData {
    /// Returns a builder for row group metadata conforming to `schema_descr`.
    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder::new(schema_descr)
    }

    /// Returns the number of column chunks in this row group.
    pub fn num_columns(&self) -> usize {
        self.columns.len()
    }

    /// Returns the metadata of column chunk `i`.
    ///
    /// # Panics
    /// Panics if `i` is out of bounds.
    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
        &self.columns[i]
    }

    /// Returns the metadata of all column chunks.
    pub fn columns(&self) -> &[ColumnChunkMetaData] {
        &self.columns
    }

    /// Returns mutable access to the column chunk metadata.
    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
        &mut self.columns
    }

    /// Returns the number of rows in this row group.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the sorting columns, if recorded.
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Returns the total uncompressed byte size of this row group.
    pub fn total_byte_size(&self) -> i64 {
        self.total_byte_size
    }

    /// Returns the total compressed size, summed over all column chunks.
    pub fn compressed_size(&self) -> i64 {
        self.columns.iter().map(|c| c.total_compressed_size).sum()
    }

    /// Returns the schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        self.schema_descr.as_ref()
    }

    /// Returns a shared pointer to the schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns this row group's ordinal (position in the file), if recorded.
    #[inline(always)]
    pub fn ordinal(&self) -> Option<i16> {
        self.ordinal
    }

    /// Returns this row group's file offset, if recorded.
    #[inline(always)]
    pub fn file_offset(&self) -> Option<i64> {
        self.file_offset
    }

    /// Builds row group metadata from thrift, decrypting any encrypted
    /// per-column metadata with `decryptor` before conversion.
    #[cfg(feature = "encryption")]
    fn from_encrypted_thrift(
        schema_descr: SchemaDescPtr,
        mut rg: RowGroup,
        decryptor: Option<&FileDecryptor>,
    ) -> Result<RowGroupMetaData> {
        if schema_descr.num_columns() != rg.columns.len() {
            return Err(general_err!(
                "Column count mismatch. Schema has {} columns while Row Group has {}",
                schema_descr.num_columns(),
                rg.columns.len()
            ));
        }
        let total_byte_size = rg.total_byte_size;
        let num_rows = rg.num_rows;
        let mut columns = vec![];

        // Pair each thrift column chunk with its schema column descriptor;
        // the index `i` is needed to build the per-column decryption AAD.
        for (i, (mut c, d)) in rg
            .columns
            .drain(0..)
            .zip(schema_descr.columns())
            .enumerate()
        {
            // Only attempt decryption when the chunk carries encrypted
            // metadata AND a decryptor was supplied.
            if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) {
                // Pick the decryptor: a per-column key or the footer key,
                // depending on the chunk's crypto metadata.
                let column_decryptor = match c.crypto_metadata.as_ref() {
                    None => {
                        return Err(general_err!(
                            "No crypto_metadata is set for column '{}', which has encrypted metadata",
                            d.path().string()
                        ));
                    }
                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => {
                        let column_name = crypto_metadata.path_in_schema.join(".");
                        decryptor.get_column_metadata_decryptor(
                            column_name.as_str(),
                            crypto_metadata.key_metadata.as_deref(),
                        )?
                    }
                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => {
                        decryptor.get_footer_decryptor()?
                    }
                };

                // AAD binds the ciphertext to this row group ordinal and
                // column index; `unwrap` panics if the ordinal is missing.
                let column_aad = create_module_aad(
                    decryptor.file_aad(),
                    ModuleType::ColumnMetaData,
                    rg.ordinal.unwrap() as usize,
                    i,
                    None,
                )?;

                let buf = c.encrypted_column_metadata.clone().unwrap();
                let decrypted_cc_buf = column_decryptor
                    .decrypt(buf.as_slice(), column_aad.as_ref())
                    .map_err(|_| {
                        general_err!(
                            "Unable to decrypt column '{}', perhaps the column key is wrong?",
                            d.path().string()
                        )
                    })?;

                // Replace the (missing/encrypted) metadata with the
                // freshly decrypted thrift ColumnMetaData.
                let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice());
                c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?);
            }
            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
        }

        let sorting_columns = rg.sorting_columns;
        Ok(RowGroupMetaData {
            columns,
            num_rows,
            sorting_columns,
            total_byte_size,
            schema_descr,
            file_offset: rg.file_offset,
            ordinal: rg.ordinal,
        })
    }

    /// Builds row group metadata from a (plaintext) thrift `RowGroup`.
    ///
    /// # Errors
    /// Returns an error if the thrift column count does not match the schema,
    /// or if any column chunk fails to convert.
    pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
        if schema_descr.num_columns() != rg.columns.len() {
            return Err(general_err!(
                "Column count mismatch. Schema has {} columns while Row Group has {}",
                schema_descr.num_columns(),
                rg.columns.len()
            ));
        }
        let total_byte_size = rg.total_byte_size;
        let num_rows = rg.num_rows;
        let mut columns = vec![];

        for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) {
            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
        }

        let sorting_columns = rg.sorting_columns;
        Ok(RowGroupMetaData {
            columns,
            num_rows,
            sorting_columns,
            total_byte_size,
            schema_descr,
            file_offset: rg.file_offset,
            ordinal: rg.ordinal,
        })
    }

    /// Converts this metadata back into its thrift representation.
    pub fn to_thrift(&self) -> RowGroup {
        RowGroup {
            columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
            total_byte_size: self.total_byte_size,
            num_rows: self.num_rows,
            sorting_columns: self.sorting_columns().cloned(),
            file_offset: self.file_offset(),
            total_compressed_size: Some(self.compressed_size()),
            ordinal: self.ordinal,
        }
    }

    /// Converts this metadata into a builder for modification.
    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder(self)
    }
}
756
/// Builder for [`RowGroupMetaData`]; wraps the row group being constructed.
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
759
760impl RowGroupMetaDataBuilder {
761 fn new(schema_descr: SchemaDescPtr) -> Self {
763 Self(RowGroupMetaData {
764 columns: Vec::with_capacity(schema_descr.num_columns()),
765 schema_descr,
766 file_offset: None,
767 num_rows: 0,
768 sorting_columns: None,
769 total_byte_size: 0,
770 ordinal: None,
771 })
772 }
773
774 pub fn set_num_rows(mut self, value: i64) -> Self {
776 self.0.num_rows = value;
777 self
778 }
779
780 pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
782 self.0.sorting_columns = value;
783 self
784 }
785
786 pub fn set_total_byte_size(mut self, value: i64) -> Self {
788 self.0.total_byte_size = value;
789 self
790 }
791
792 pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
798 std::mem::take(&mut self.0.columns)
799 }
800
801 pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
803 self.0.columns = value;
804 self
805 }
806
807 pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
809 self.0.columns.push(value);
810 self
811 }
812
813 pub fn set_ordinal(mut self, value: i16) -> Self {
815 self.0.ordinal = Some(value);
816 self
817 }
818
819 pub fn set_file_offset(mut self, value: i64) -> Self {
821 self.0.file_offset = Some(value);
822 self
823 }
824
825 pub fn build(self) -> Result<RowGroupMetaData> {
827 if self.0.schema_descr.num_columns() != self.0.columns.len() {
828 return Err(general_err!(
829 "Column length mismatch: {} != {}",
830 self.0.schema_descr.num_columns(),
831 self.0.columns.len()
832 ));
833 }
834
835 Ok(self.0)
836 }
837}
838
/// Metadata for a single column chunk within a row group, mirroring the
/// thrift `ColumnChunk` / `ColumnMetaData` structures.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor of the column this chunk belongs to.
    column_descr: ColumnDescPtr,
    /// All encodings used in this chunk.
    encodings: Vec<Encoding>,
    /// External file path, if the chunk lives in another file.
    file_path: Option<String>,
    /// `file_offset` field as stored in the thrift metadata.
    file_offset: i64,
    /// Number of values (including nulls) in this chunk.
    num_values: i64,
    /// Compression codec used for the chunk's pages.
    compression: Compression,
    /// Total compressed size of the chunk, in bytes.
    total_compressed_size: i64,
    /// Total uncompressed size of the chunk, in bytes.
    total_uncompressed_size: i64,
    /// Byte offset of the first data page.
    data_page_offset: i64,
    /// Byte offset of the index page, if any.
    index_page_offset: Option<i64>,
    /// Byte offset of the dictionary page, if any.
    dictionary_page_offset: Option<i64>,
    /// Chunk-level statistics (min/max/null count), if present.
    statistics: Option<Statistics>,
    /// Per-encoding page statistics, if present.
    encoding_stats: Option<Vec<PageEncodingStats>>,
    /// Byte offset of the bloom filter, if any.
    bloom_filter_offset: Option<i64>,
    /// Length in bytes of the bloom filter, if known.
    bloom_filter_length: Option<i32>,
    /// Byte offset of the offset index, if any.
    offset_index_offset: Option<i64>,
    /// Length in bytes of the offset index, if any.
    offset_index_length: Option<i32>,
    /// Byte offset of the column index, if any.
    column_index_offset: Option<i64>,
    /// Length in bytes of the column index, if any.
    column_index_length: Option<i32>,
    /// Size of unencoded BYTE_ARRAY data, if recorded (size statistics).
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Repetition level histogram, if recorded (size statistics).
    repetition_level_histogram: Option<LevelHistogram>,
    /// Definition level histogram, if recorded (size statistics).
    definition_level_histogram: Option<LevelHistogram>,
    /// Column encryption metadata (``encryption`` feature).
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<ColumnCryptoMetaData>,
}
867
/// Histogram of repetition or definition levels: `inner[i]` holds the number
/// of values recorded at level `i`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a zeroed histogram able to count levels `0..=max_level`.
    ///
    /// Returns `None` when `max_level` is not positive (no histogram is
    /// needed for a column without levels).
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level > 0 {
            Some(Self {
                inner: vec![0; max_level as usize + 1],
            })
        } else {
            None
        }
    }

    /// Returns the counts as a slice, indexed by level.
    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    /// Consumes the histogram, returning the underlying counts.
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the count at `index`, or `None` if `index` is out of range.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds `other`'s counts element-wise into `self`.
    ///
    /// # Panics
    /// Panics if the two histograms have different lengths.
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    /// Returns the number of levels tracked (maximum level + 1).
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns `true` if the histogram tracks no levels.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Resets every count to zero, keeping the existing allocation.
    pub fn reset(&mut self) {
        // `slice::fill` is the idiomatic (and memset-friendly) replacement
        // for a manual per-element zeroing loop.
        self.inner.fill(0);
    }

    /// Increments the count of each level present in `levels`.
    ///
    /// # Panics
    /// Panics if any level exceeds the histogram's maximum level.
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            self.inner[level as usize] += 1;
        }
    }
}

impl From<Vec<i64>> for LevelHistogram {
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}

impl From<LevelHistogram> for Vec<i64> {
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}
967
impl HeapSize for LevelHistogram {
    /// Heap usage is just the inner `Vec`'s heap allocation.
    fn heap_size(&self) -> usize {
        self.inner.heap_size()
    }
}
973
974impl ColumnChunkMetaData {
976 pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
978 ColumnChunkMetaDataBuilder::new(column_descr)
979 }
980
981 pub fn file_path(&self) -> Option<&str> {
986 self.file_path.as_deref()
987 }
988
989 pub fn file_offset(&self) -> i64 {
996 self.file_offset
997 }
998
999 pub fn column_type(&self) -> Type {
1001 self.column_descr.physical_type()
1002 }
1003
1004 pub fn column_path(&self) -> &ColumnPath {
1006 self.column_descr.path()
1007 }
1008
1009 pub fn column_descr(&self) -> &ColumnDescriptor {
1011 self.column_descr.as_ref()
1012 }
1013
1014 pub fn column_descr_ptr(&self) -> ColumnDescPtr {
1016 self.column_descr.clone()
1017 }
1018
1019 pub fn encodings(&self) -> &Vec<Encoding> {
1021 &self.encodings
1022 }
1023
1024 pub fn num_values(&self) -> i64 {
1026 self.num_values
1027 }
1028
1029 pub fn compression(&self) -> Compression {
1031 self.compression
1032 }
1033
1034 pub fn compressed_size(&self) -> i64 {
1036 self.total_compressed_size
1037 }
1038
1039 pub fn uncompressed_size(&self) -> i64 {
1041 self.total_uncompressed_size
1042 }
1043
1044 pub fn data_page_offset(&self) -> i64 {
1046 self.data_page_offset
1047 }
1048
1049 pub fn index_page_offset(&self) -> Option<i64> {
1051 self.index_page_offset
1052 }
1053
1054 pub fn dictionary_page_offset(&self) -> Option<i64> {
1056 self.dictionary_page_offset
1057 }
1058
1059 pub fn byte_range(&self) -> (u64, u64) {
1061 let col_start = match self.dictionary_page_offset() {
1062 Some(dictionary_page_offset) => dictionary_page_offset,
1063 None => self.data_page_offset(),
1064 };
1065 let col_len = self.compressed_size();
1066 assert!(
1067 col_start >= 0 && col_len >= 0,
1068 "column start and length should not be negative"
1069 );
1070 (col_start as u64, col_len as u64)
1071 }
1072
1073 pub fn statistics(&self) -> Option<&Statistics> {
1076 self.statistics.as_ref()
1077 }
1078
1079 pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
1082 self.encoding_stats.as_ref()
1083 }
1084
1085 pub fn bloom_filter_offset(&self) -> Option<i64> {
1087 self.bloom_filter_offset
1088 }
1089
1090 pub fn bloom_filter_length(&self) -> Option<i32> {
1092 self.bloom_filter_length
1093 }
1094
1095 pub fn column_index_offset(&self) -> Option<i64> {
1097 self.column_index_offset
1098 }
1099
1100 pub fn column_index_length(&self) -> Option<i32> {
1102 self.column_index_length
1103 }
1104
1105 pub(crate) fn column_index_range(&self) -> Option<Range<usize>> {
1107 let offset = usize::try_from(self.column_index_offset?).ok()?;
1108 let length = usize::try_from(self.column_index_length?).ok()?;
1109 Some(offset..(offset + length))
1110 }
1111
1112 pub fn offset_index_offset(&self) -> Option<i64> {
1114 self.offset_index_offset
1115 }
1116
1117 pub fn offset_index_length(&self) -> Option<i32> {
1119 self.offset_index_length
1120 }
1121
1122 pub(crate) fn offset_index_range(&self) -> Option<Range<usize>> {
1124 let offset = usize::try_from(self.offset_index_offset?).ok()?;
1125 let length = usize::try_from(self.offset_index_length?).ok()?;
1126 Some(offset..(offset + length))
1127 }
1128
1129 pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
1134 self.unencoded_byte_array_data_bytes
1135 }
1136
1137 pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
1143 self.repetition_level_histogram.as_ref()
1144 }
1145
1146 pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
1152 self.definition_level_histogram.as_ref()
1153 }
1154
1155 #[cfg(feature = "encryption")]
1157 pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
1158 self.column_crypto_metadata.as_ref()
1159 }
1160
1161 pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
1163 if cc.meta_data.is_none() {
1164 return Err(general_err!("Expected to have column metadata"));
1165 }
1166 let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap();
1167 let column_type = Type::try_from(col_metadata.type_)?;
1168 let encodings = col_metadata
1169 .encodings
1170 .drain(0..)
1171 .map(Encoding::try_from)
1172 .collect::<Result<_>>()?;
1173 let compression = Compression::try_from(col_metadata.codec)?;
1174 let file_path = cc.file_path;
1175 let file_offset = cc.file_offset;
1176 let num_values = col_metadata.num_values;
1177 let total_compressed_size = col_metadata.total_compressed_size;
1178 let total_uncompressed_size = col_metadata.total_uncompressed_size;
1179 let data_page_offset = col_metadata.data_page_offset;
1180 let index_page_offset = col_metadata.index_page_offset;
1181 let dictionary_page_offset = col_metadata.dictionary_page_offset;
1182 let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?;
1183 let encoding_stats = col_metadata
1184 .encoding_stats
1185 .as_ref()
1186 .map(|vec| {
1187 vec.iter()
1188 .map(page_encoding_stats::try_from_thrift)
1189 .collect::<Result<_>>()
1190 })
1191 .transpose()?;
1192 let bloom_filter_offset = col_metadata.bloom_filter_offset;
1193 let bloom_filter_length = col_metadata.bloom_filter_length;
1194 let offset_index_offset = cc.offset_index_offset;
1195 let offset_index_length = cc.offset_index_length;
1196 let column_index_offset = cc.column_index_offset;
1197 let column_index_length = cc.column_index_length;
1198 let (
1199 unencoded_byte_array_data_bytes,
1200 repetition_level_histogram,
1201 definition_level_histogram,
1202 ) = if let Some(size_stats) = col_metadata.size_statistics {
1203 (
1204 size_stats.unencoded_byte_array_data_bytes,
1205 size_stats.repetition_level_histogram,
1206 size_stats.definition_level_histogram,
1207 )
1208 } else {
1209 (None, None, None)
1210 };
1211
1212 let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
1213 let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);
1214
1215 #[cfg(feature = "encryption")]
1216 let column_crypto_metadata = if let Some(crypto_metadata) = cc.crypto_metadata {
1217 Some(column_crypto_metadata::try_from_thrift(&crypto_metadata)?)
1218 } else {
1219 None
1220 };
1221
1222 let result = ColumnChunkMetaData {
1223 column_descr,
1224 encodings,
1225 file_path,
1226 file_offset,
1227 num_values,
1228 compression,
1229 total_compressed_size,
1230 total_uncompressed_size,
1231 data_page_offset,
1232 index_page_offset,
1233 dictionary_page_offset,
1234 statistics,
1235 encoding_stats,
1236 bloom_filter_offset,
1237 bloom_filter_length,
1238 offset_index_offset,
1239 offset_index_length,
1240 column_index_offset,
1241 column_index_length,
1242 unencoded_byte_array_data_bytes,
1243 repetition_level_histogram,
1244 definition_level_histogram,
1245 #[cfg(feature = "encryption")]
1246 column_crypto_metadata,
1247 };
1248 Ok(result)
1249 }
1250
1251 pub fn to_thrift(&self) -> ColumnChunk {
1253 let column_metadata = self.to_column_metadata_thrift();
1254
1255 ColumnChunk {
1256 file_path: self.file_path().map(|s| s.to_owned()),
1257 file_offset: self.file_offset,
1258 meta_data: Some(column_metadata),
1259 offset_index_offset: self.offset_index_offset,
1260 offset_index_length: self.offset_index_length,
1261 column_index_offset: self.column_index_offset,
1262 column_index_length: self.column_index_length,
1263 crypto_metadata: self.column_crypto_metadata_thrift(),
1264 encrypted_column_metadata: None,
1265 }
1266 }
1267
1268 pub fn to_column_metadata_thrift(&self) -> ColumnMetaData {
1270 let size_statistics = if self.unencoded_byte_array_data_bytes.is_some()
1271 || self.repetition_level_histogram.is_some()
1272 || self.definition_level_histogram.is_some()
1273 {
1274 let repetition_level_histogram = self
1275 .repetition_level_histogram
1276 .as_ref()
1277 .map(|hist| hist.clone().into_inner());
1278
1279 let definition_level_histogram = self
1280 .definition_level_histogram
1281 .as_ref()
1282 .map(|hist| hist.clone().into_inner());
1283
1284 Some(SizeStatistics {
1285 unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes,
1286 repetition_level_histogram,
1287 definition_level_histogram,
1288 })
1289 } else {
1290 None
1291 };
1292
1293 ColumnMetaData {
1294 type_: self.column_type().into(),
1295 encodings: self.encodings().iter().map(|&v| v.into()).collect(),
1296 path_in_schema: self.column_path().as_ref().to_vec(),
1297 codec: self.compression.into(),
1298 num_values: self.num_values,
1299 total_uncompressed_size: self.total_uncompressed_size,
1300 total_compressed_size: self.total_compressed_size,
1301 key_value_metadata: None,
1302 data_page_offset: self.data_page_offset,
1303 index_page_offset: self.index_page_offset,
1304 dictionary_page_offset: self.dictionary_page_offset,
1305 statistics: statistics::to_thrift(self.statistics.as_ref()),
1306 encoding_stats: self
1307 .encoding_stats
1308 .as_ref()
1309 .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()),
1310 bloom_filter_offset: self.bloom_filter_offset,
1311 bloom_filter_length: self.bloom_filter_length,
1312 size_statistics,
1313 }
1314 }
1315
1316 pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
1318 ColumnChunkMetaDataBuilder::from(self)
1319 }
1320
1321 #[cfg(feature = "encryption")]
1322 fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
1323 self.column_crypto_metadata
1324 .as_ref()
1325 .map(column_crypto_metadata::to_thrift)
1326 }
1327
1328 #[cfg(not(feature = "encryption"))]
1329 fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
1330 None
1331 }
1332}
1333
/// Builder for [`ColumnChunkMetaData`]; wraps the chunk being constructed.
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1353
1354impl ColumnChunkMetaDataBuilder {
1355 fn new(column_descr: ColumnDescPtr) -> Self {
1359 Self(ColumnChunkMetaData {
1360 column_descr,
1361 encodings: Vec::new(),
1362 file_path: None,
1363 file_offset: 0,
1364 num_values: 0,
1365 compression: Compression::UNCOMPRESSED,
1366 total_compressed_size: 0,
1367 total_uncompressed_size: 0,
1368 data_page_offset: 0,
1369 index_page_offset: None,
1370 dictionary_page_offset: None,
1371 statistics: None,
1372 encoding_stats: None,
1373 bloom_filter_offset: None,
1374 bloom_filter_length: None,
1375 offset_index_offset: None,
1376 offset_index_length: None,
1377 column_index_offset: None,
1378 column_index_length: None,
1379 unencoded_byte_array_data_bytes: None,
1380 repetition_level_histogram: None,
1381 definition_level_histogram: None,
1382 #[cfg(feature = "encryption")]
1383 column_crypto_metadata: None,
1384 })
1385 }
1386
1387 pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
1389 self.0.encodings = encodings;
1390 self
1391 }
1392
1393 pub fn set_file_path(mut self, value: String) -> Self {
1395 self.0.file_path = Some(value);
1396 self
1397 }
1398
1399 #[deprecated(
1405 since = "53.0.0",
1406 note = "The Parquet specification requires this field to be 0"
1407 )]
1408 pub fn set_file_offset(mut self, value: i64) -> Self {
1409 self.0.file_offset = value;
1410 self
1411 }
1412
1413 pub fn set_num_values(mut self, value: i64) -> Self {
1415 self.0.num_values = value;
1416 self
1417 }
1418
1419 pub fn set_compression(mut self, value: Compression) -> Self {
1421 self.0.compression = value;
1422 self
1423 }
1424
1425 pub fn set_total_compressed_size(mut self, value: i64) -> Self {
1427 self.0.total_compressed_size = value;
1428 self
1429 }
1430
1431 pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
1433 self.0.total_uncompressed_size = value;
1434 self
1435 }
1436
1437 pub fn set_data_page_offset(mut self, value: i64) -> Self {
1439 self.0.data_page_offset = value;
1440 self
1441 }
1442
1443 pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
1445 self.0.dictionary_page_offset = value;
1446 self
1447 }
1448
1449 pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
1451 self.0.index_page_offset = value;
1452 self
1453 }
1454
1455 pub fn set_statistics(mut self, value: Statistics) -> Self {
1457 self.0.statistics = Some(value);
1458 self
1459 }
1460
1461 pub fn clear_statistics(mut self) -> Self {
1463 self.0.statistics = None;
1464 self
1465 }
1466
1467 pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
1469 self.0.encoding_stats = Some(value);
1470 self
1471 }
1472
1473 pub fn clear_page_encoding_stats(mut self) -> Self {
1475 self.0.encoding_stats = None;
1476 self
1477 }
1478
1479 pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
1481 self.0.bloom_filter_offset = value;
1482 self
1483 }
1484
1485 pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
1487 self.0.bloom_filter_length = value;
1488 self
1489 }
1490
1491 pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
1493 self.0.offset_index_offset = value;
1494 self
1495 }
1496
1497 pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
1499 self.0.offset_index_length = value;
1500 self
1501 }
1502
1503 pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
1505 self.0.column_index_offset = value;
1506 self
1507 }
1508
1509 pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
1511 self.0.column_index_length = value;
1512 self
1513 }
1514
1515 pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
1517 self.0.unencoded_byte_array_data_bytes = value;
1518 self
1519 }
1520
1521 pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1523 self.0.repetition_level_histogram = value;
1524 self
1525 }
1526
1527 pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1529 self.0.definition_level_histogram = value;
1530 self
1531 }
1532
1533 #[cfg(feature = "encryption")]
1534 pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
1536 self.0.column_crypto_metadata = value;
1537 self
1538 }
1539
1540 pub fn build(self) -> Result<ColumnChunkMetaData> {
1542 Ok(self.0)
1543 }
1544}
1545
/// Incremental builder for a thrift [`ColumnIndex`], accumulating one entry
/// per page.
pub struct ColumnIndexBuilder {
    /// Whether each page contains only nulls.
    null_pages: Vec<bool>,
    /// Encoded minimum value per page.
    min_values: Vec<Vec<u8>>,
    /// Encoded maximum value per page.
    max_values: Vec<Vec<u8>>,
    /// Null count per page.
    null_counts: Vec<i64>,
    /// Ordering of the min/max values across pages.
    boundary_order: BoundaryOrder,
    /// Concatenated per-page repetition level histograms, if collected.
    repetition_level_histograms: Option<Vec<i64>>,
    /// Concatenated per-page definition level histograms, if collected.
    definition_level_histograms: Option<Vec<i64>>,
    /// Set to false (via `to_invalid`) to stop accumulating histograms.
    valid: bool,
}
1568
1569impl Default for ColumnIndexBuilder {
1570 fn default() -> Self {
1571 Self::new()
1572 }
1573}
1574
1575impl ColumnIndexBuilder {
1576 pub fn new() -> Self {
1578 ColumnIndexBuilder {
1579 null_pages: Vec::new(),
1580 min_values: Vec::new(),
1581 max_values: Vec::new(),
1582 null_counts: Vec::new(),
1583 boundary_order: BoundaryOrder::UNORDERED,
1584 repetition_level_histograms: None,
1585 definition_level_histograms: None,
1586 valid: true,
1587 }
1588 }
1589
1590 pub fn append(
1592 &mut self,
1593 null_page: bool,
1594 min_value: Vec<u8>,
1595 max_value: Vec<u8>,
1596 null_count: i64,
1597 ) {
1598 self.null_pages.push(null_page);
1599 self.min_values.push(min_value);
1600 self.max_values.push(max_value);
1601 self.null_counts.push(null_count);
1602 }
1603
1604 pub fn append_histograms(
1607 &mut self,
1608 repetition_level_histogram: &Option<LevelHistogram>,
1609 definition_level_histogram: &Option<LevelHistogram>,
1610 ) {
1611 if !self.valid {
1612 return;
1613 }
1614 if let Some(ref rep_lvl_hist) = repetition_level_histogram {
1615 let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1616 hist.reserve(rep_lvl_hist.len());
1617 hist.extend(rep_lvl_hist.values());
1618 }
1619 if let Some(ref def_lvl_hist) = definition_level_histogram {
1620 let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1621 hist.reserve(def_lvl_hist.len());
1622 hist.extend(def_lvl_hist.values());
1623 }
1624 }
1625
1626 pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1628 self.boundary_order = boundary_order;
1629 }
1630
1631 pub fn to_invalid(&mut self) {
1633 self.valid = false;
1634 }
1635
1636 pub fn valid(&self) -> bool {
1638 self.valid
1639 }
1640
1641 pub fn build_to_thrift(self) -> ColumnIndex {
1645 ColumnIndex::new(
1646 self.null_pages,
1647 self.min_values,
1648 self.max_values,
1649 self.boundary_order,
1650 self.null_counts,
1651 self.repetition_level_histograms,
1652 self.definition_level_histograms,
1653 )
1654 }
1655}
1656
1657impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1658 fn from(value: ColumnChunkMetaData) -> Self {
1659 ColumnChunkMetaDataBuilder(value)
1660 }
1661}
1662
/// Builder for thrift [`OffsetIndex`] structures, accumulating the location
/// of each page within a column chunk.
pub struct OffsetIndexBuilder {
    /// File offset of each page
    offset_array: Vec<i64>,
    /// Compressed size of each page
    compressed_page_size_array: Vec<i32>,
    /// Index of the first row of each page
    first_row_index_array: Vec<i64>,
    /// Per-page unencoded byte-array sizes, if recorded
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running row count used to derive the next page's first row index
    current_first_row_index: i64,
}
1673
1674impl Default for OffsetIndexBuilder {
1675 fn default() -> Self {
1676 Self::new()
1677 }
1678}
1679
1680impl OffsetIndexBuilder {
1681 pub fn new() -> Self {
1683 OffsetIndexBuilder {
1684 offset_array: Vec::new(),
1685 compressed_page_size_array: Vec::new(),
1686 first_row_index_array: Vec::new(),
1687 unencoded_byte_array_data_bytes_array: None,
1688 current_first_row_index: 0,
1689 }
1690 }
1691
1692 pub fn append_row_count(&mut self, row_count: i64) {
1694 let current_page_row_index = self.current_first_row_index;
1695 self.first_row_index_array.push(current_page_row_index);
1696 self.current_first_row_index += row_count;
1697 }
1698
1699 pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1701 self.offset_array.push(offset);
1702 self.compressed_page_size_array.push(compressed_page_size);
1703 }
1704
1705 pub fn append_unencoded_byte_array_data_bytes(
1707 &mut self,
1708 unencoded_byte_array_data_bytes: Option<i64>,
1709 ) {
1710 if let Some(val) = unencoded_byte_array_data_bytes {
1711 self.unencoded_byte_array_data_bytes_array
1712 .get_or_insert(Vec::new())
1713 .push(val);
1714 }
1715 }
1716
1717 pub fn build_to_thrift(self) -> OffsetIndex {
1719 let locations = self
1720 .offset_array
1721 .iter()
1722 .zip(self.compressed_page_size_array.iter())
1723 .zip(self.first_row_index_array.iter())
1724 .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index))
1725 .collect::<Vec<_>>();
1726 OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array)
1727 }
1728}
1729
1730#[cfg(test)]
1731mod tests {
1732 use super::*;
1733 use crate::basic::{PageType, SortOrder};
1734 use crate::file::page_index::index::NativeIndex;
1735
1736 #[test]
1737 fn test_row_group_metadata_thrift_conversion() {
1738 let schema_descr = get_test_schema_descr();
1739
1740 let mut columns = vec![];
1741 for ptr in schema_descr.columns() {
1742 let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
1743 columns.push(column);
1744 }
1745 let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
1746 .set_num_rows(1000)
1747 .set_total_byte_size(2000)
1748 .set_column_metadata(columns)
1749 .set_ordinal(1)
1750 .build()
1751 .unwrap();
1752
1753 let row_group_exp = row_group_meta.to_thrift();
1754 let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone())
1755 .unwrap()
1756 .to_thrift();
1757
1758 assert_eq!(row_group_res, row_group_exp);
1759 }
1760
1761 #[test]
1762 fn test_row_group_metadata_thrift_conversion_empty() {
1763 let schema_descr = get_test_schema_descr();
1764
1765 let row_group_meta = RowGroupMetaData::builder(schema_descr).build();
1766
1767 assert!(row_group_meta.is_err());
1768 if let Err(e) = row_group_meta {
1769 assert_eq!(
1770 format!("{e}"),
1771 "Parquet error: Column length mismatch: 2 != 0"
1772 );
1773 }
1774 }
1775
1776 #[test]
1778 fn test_row_group_metadata_thrift_corrupted() {
1779 let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
1780 SchemaType::group_type_builder("schema")
1781 .with_fields(vec![
1782 Arc::new(
1783 SchemaType::primitive_type_builder("a", Type::INT32)
1784 .build()
1785 .unwrap(),
1786 ),
1787 Arc::new(
1788 SchemaType::primitive_type_builder("b", Type::INT32)
1789 .build()
1790 .unwrap(),
1791 ),
1792 ])
1793 .build()
1794 .unwrap(),
1795 )));
1796
1797 let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
1798 SchemaType::group_type_builder("schema")
1799 .with_fields(vec![
1800 Arc::new(
1801 SchemaType::primitive_type_builder("a", Type::INT32)
1802 .build()
1803 .unwrap(),
1804 ),
1805 Arc::new(
1806 SchemaType::primitive_type_builder("b", Type::INT32)
1807 .build()
1808 .unwrap(),
1809 ),
1810 Arc::new(
1811 SchemaType::primitive_type_builder("c", Type::INT32)
1812 .build()
1813 .unwrap(),
1814 ),
1815 ])
1816 .build()
1817 .unwrap(),
1818 )));
1819
1820 let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
1821 .set_num_rows(1000)
1822 .set_total_byte_size(2000)
1823 .set_column_metadata(vec![
1824 ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
1825 .build()
1826 .unwrap(),
1827 ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
1828 .build()
1829 .unwrap(),
1830 ])
1831 .set_ordinal(1)
1832 .build()
1833 .unwrap();
1834
1835 let err =
1836 RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
1837 .unwrap_err()
1838 .to_string();
1839 assert_eq!(
1840 err,
1841 "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
1842 );
1843 }
1844
    // Round-trips a fully-populated column chunk (every optional field set)
    // through thrift and back, verifying equality of the decoded metadata.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);

        // Populate every builder field so the thrift conversion is exercised
        // for all optional members, not just the required ones.
        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        // Decoding the thrift form must reproduce the original metadata.
        let col_chunk_res =
            ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }
1887
1888 #[test]
1889 fn test_column_chunk_metadata_thrift_conversion_empty() {
1890 let column_descr = get_test_schema_descr().column(0);
1891
1892 let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1893 .build()
1894 .unwrap();
1895
1896 let col_chunk_exp = col_metadata.to_thrift();
1897 let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone())
1898 .unwrap()
1899 .to_thrift();
1900
1901 assert_eq!(col_chunk_res, col_chunk_exp);
1902 }
1903
1904 #[test]
1905 fn test_compressed_size() {
1906 let schema_descr = get_test_schema_descr();
1907
1908 let mut columns = vec![];
1909 for column_descr in schema_descr.columns() {
1910 let column = ColumnChunkMetaData::builder(column_descr.clone())
1911 .set_total_compressed_size(500)
1912 .set_total_uncompressed_size(700)
1913 .build()
1914 .unwrap();
1915 columns.push(column);
1916 }
1917 let row_group_meta = RowGroupMetaData::builder(schema_descr)
1918 .set_num_rows(1000)
1919 .set_column_metadata(columns)
1920 .build()
1921 .unwrap();
1922
1923 let compressed_size_res: i64 = row_group_meta.compressed_size();
1924 let compressed_size_exp: i64 = 1000;
1925
1926 assert_eq!(compressed_size_res, compressed_size_exp);
1927 }
1928
    // Verifies ParquetMetaData::memory_size against hard-coded expected byte
    // counts, with and without page indexes attached.
    //
    // NOTE(review): the expected sizes below are tied to the exact in-memory
    // layout of the metadata structs (and differ when the `encryption`
    // feature changes struct sizes); they must be updated whenever fields are
    // added or resized.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // Column chunks with "empty" statistics (all None).
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Column chunks with populated min/max statistics, used for the
        // baseline measurement.
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2312;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2640;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Build a one-page column index and a two-page offset index so the
        // second measurement includes page-index heap usage.
        let mut column_index = ColumnIndexBuilder::new();
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build_to_thrift();
        let native_index = NativeIndex::<bool>::try_new(column_index).unwrap();

        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build_to_thrift();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![
                OffsetIndexMetaData::try_new(offset_index).unwrap()
            ]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 2816;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3144;

        // Sanity check then exact check: indexes must strictly grow the size.
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }
2038
2039 fn get_test_schema_descr() -> SchemaDescPtr {
2041 let schema = SchemaType::group_type_builder("schema")
2042 .with_fields(vec![
2043 Arc::new(
2044 SchemaType::primitive_type_builder("a", Type::INT32)
2045 .build()
2046 .unwrap(),
2047 ),
2048 Arc::new(
2049 SchemaType::primitive_type_builder("b", Type::INT32)
2050 .build()
2051 .unwrap(),
2052 ),
2053 ])
2054 .build()
2055 .unwrap();
2056
2057 Arc::new(SchemaDescriptor::new(Arc::new(schema)))
2058 }
2059}