parquet/file/metadata/
memory.rs1use crate::basic::{BoundaryOrder, ColumnOrder, Compression, Encoding, PageType};
22use crate::data_type::private::ParquetValueType;
23use crate::file::metadata::{
24 ColumnChunkMetaData, FileMetaData, KeyValue, PageEncodingStats, ParquetPageEncodingStats,
25 RowGroupMetaData, SortingColumn,
26};
27use crate::file::page_index::column_index::{
28 ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex,
29};
30use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation};
31use crate::file::statistics::{Statistics, ValueStatistics};
32use std::collections::HashMap;
33use std::sync::Arc;
34
/// Reports how many bytes a value owns on the heap.
///
/// The implementations in this file count only memory reached *through* a
/// value (backing buffers, boxed or reference-counted payloads, nested
/// collections). The inline size of `Self` is never included; it is charged
/// by whichever container stores the value.
pub trait HeapSize {
    /// Returns the number of heap bytes attributed to `self`, excluding
    /// `size_of::<Self>()`.
    fn heap_size(&self) -> usize;
}
44
45impl<T: HeapSize> HeapSize for Vec<T> {
46 fn heap_size(&self) -> usize {
47 let item_size = std::mem::size_of::<T>();
48 (self.capacity() * item_size) +
50 self.iter().map(|t| t.heap_size()).sum::<usize>()
52 }
53}
54
55impl<K: HeapSize, V: HeapSize> HeapSize for HashMap<K, V> {
56 fn heap_size(&self) -> usize {
57 let capacity = self.capacity();
58 if capacity == 0 {
59 return 0;
60 }
61
62 let key_val_size = std::mem::size_of::<(K, V)>();
66 let group_size = 16;
68 let metadata_size = 1;
70
71 let buckets = if capacity < 15 {
73 let min_cap = match key_val_size {
74 0..=1 => 14,
75 2..=3 => 7,
76 _ => 3,
77 };
78 let cap = min_cap.max(capacity);
79 if cap < 4 {
80 4
81 } else if cap < 8 {
82 8
83 } else {
84 16
85 }
86 } else {
87 (capacity.saturating_mul(8) / 7).next_power_of_two()
88 };
89
90 group_size
91 + (buckets * (key_val_size + metadata_size))
92 + self.keys().map(|k| k.heap_size()).sum::<usize>()
93 + self.values().map(|v| v.heap_size()).sum::<usize>()
94 }
95}
96
97impl<T: HeapSize> HeapSize for Arc<T> {
98 fn heap_size(&self) -> usize {
99 2 * std::mem::size_of::<usize>() + std::mem::size_of::<T>() + self.as_ref().heap_size()
101 }
102}
103
104impl HeapSize for Arc<dyn HeapSize> {
105 fn heap_size(&self) -> usize {
106 2 * std::mem::size_of::<usize>()
107 + std::mem::size_of_val(self.as_ref())
108 + self.as_ref().heap_size()
109 }
110}
111
112impl<T: HeapSize> HeapSize for Box<T> {
113 fn heap_size(&self) -> usize {
114 std::mem::size_of::<T>() + self.as_ref().heap_size()
115 }
116}
117
118impl<T: HeapSize> HeapSize for Option<T> {
119 fn heap_size(&self) -> usize {
120 self.as_ref().map(|inner| inner.heap_size()).unwrap_or(0)
121 }
122}
123
124impl HeapSize for String {
125 fn heap_size(&self) -> usize {
126 self.capacity()
127 }
128}
129
130impl HeapSize for FileMetaData {
131 fn heap_size(&self) -> usize {
132 #[cfg(feature = "encryption")]
133 let encryption_heap_size =
134 self.encryption_algorithm.heap_size() + self.footer_signing_key_metadata.heap_size();
135 #[cfg(not(feature = "encryption"))]
136 let encryption_heap_size = 0;
137
138 self.created_by.heap_size()
139 + self.key_value_metadata.heap_size()
140 + self.schema_descr.heap_size()
141 + self.column_orders.heap_size()
142 + encryption_heap_size
143 }
144}
145
146impl HeapSize for KeyValue {
147 fn heap_size(&self) -> usize {
148 self.key.heap_size() + self.value.heap_size()
149 }
150}
151
152impl HeapSize for RowGroupMetaData {
153 fn heap_size(&self) -> usize {
154 self.columns.heap_size() + self.sorting_columns.heap_size()
157 }
158}
159
160impl HeapSize for ColumnChunkMetaData {
161 fn heap_size(&self) -> usize {
162 #[cfg(feature = "encryption")]
163 let encryption_heap_size =
164 self.column_crypto_metadata.heap_size() + self.encrypted_column_metadata.heap_size();
165 #[cfg(not(feature = "encryption"))]
166 let encryption_heap_size = 0;
167
168 self.encodings.heap_size()
171 + self.file_path.heap_size()
172 + self.compression.heap_size()
173 + self.statistics.heap_size()
174 + self.encoding_stats.heap_size()
175 + self.unencoded_byte_array_data_bytes.heap_size()
176 + self.repetition_level_histogram.heap_size()
177 + self.definition_level_histogram.heap_size()
178 + self.geo_statistics.heap_size()
179 + encryption_heap_size
180 }
181}
182
183impl HeapSize for Encoding {
184 fn heap_size(&self) -> usize {
185 0 }
187}
188
189impl HeapSize for ParquetPageEncodingStats {
190 fn heap_size(&self) -> usize {
191 match self {
192 Self::Full(v) => v.heap_size(),
193 Self::Mask(_) => 0,
194 }
195 }
196}
197
198impl HeapSize for PageEncodingStats {
199 fn heap_size(&self) -> usize {
200 self.page_type.heap_size() + self.encoding.heap_size()
201 }
202}
203
204impl HeapSize for SortingColumn {
205 fn heap_size(&self) -> usize {
206 0 }
208}
209impl HeapSize for Compression {
210 fn heap_size(&self) -> usize {
211 0 }
213}
214
215impl HeapSize for PageType {
216 fn heap_size(&self) -> usize {
217 0 }
219}
220
221impl HeapSize for Statistics {
222 fn heap_size(&self) -> usize {
223 match self {
224 Statistics::Boolean(value_statistics) => value_statistics.heap_size(),
225 Statistics::Int32(value_statistics) => value_statistics.heap_size(),
226 Statistics::Int64(value_statistics) => value_statistics.heap_size(),
227 Statistics::Int96(value_statistics) => value_statistics.heap_size(),
228 Statistics::Float(value_statistics) => value_statistics.heap_size(),
229 Statistics::Double(value_statistics) => value_statistics.heap_size(),
230 Statistics::ByteArray(value_statistics) => value_statistics.heap_size(),
231 Statistics::FixedLenByteArray(value_statistics) => value_statistics.heap_size(),
232 }
233 }
234}
235
236impl HeapSize for OffsetIndexMetaData {
237 fn heap_size(&self) -> usize {
238 self.page_locations.heap_size() + self.unencoded_byte_array_data_bytes.heap_size()
239 }
240}
241
242impl HeapSize for ColumnIndexMetaData {
243 fn heap_size(&self) -> usize {
244 match self {
245 Self::NONE => 0,
246 Self::BOOLEAN(native_index) => native_index.heap_size(),
247 Self::INT32(native_index) => native_index.heap_size(),
248 Self::INT64(native_index) => native_index.heap_size(),
249 Self::INT96(native_index) => native_index.heap_size(),
250 Self::FLOAT(native_index) => native_index.heap_size(),
251 Self::DOUBLE(native_index) => native_index.heap_size(),
252 Self::BYTE_ARRAY(native_index) => native_index.heap_size(),
253 Self::FIXED_LEN_BYTE_ARRAY(native_index) => native_index.heap_size(),
254 }
255 }
256}
257
258impl HeapSize for ColumnIndex {
259 fn heap_size(&self) -> usize {
260 self.null_pages.heap_size()
261 + self.boundary_order.heap_size()
262 + self.null_counts.heap_size()
263 + self.definition_level_histograms.heap_size()
264 + self.repetition_level_histograms.heap_size()
265 }
266}
267
268impl<T: ParquetValueType> HeapSize for PrimitiveColumnIndex<T> {
269 fn heap_size(&self) -> usize {
270 self.column_index.heap_size() + self.min_values.heap_size() + self.max_values.heap_size()
271 }
272}
273
274impl HeapSize for ByteArrayColumnIndex {
275 fn heap_size(&self) -> usize {
276 self.column_index.heap_size()
277 + self.min_bytes.heap_size()
278 + self.min_offsets.heap_size()
279 + self.max_bytes.heap_size()
280 + self.max_offsets.heap_size()
281 }
282}
283
284impl<T: ParquetValueType> HeapSize for ValueStatistics<T> {
285 fn heap_size(&self) -> usize {
286 self.min_opt().map(T::heap_size).unwrap_or(0)
287 + self.max_opt().map(T::heap_size).unwrap_or(0)
288 }
289}
290impl HeapSize for bool {
291 fn heap_size(&self) -> usize {
292 0 }
294}
295impl HeapSize for u8 {
296 fn heap_size(&self) -> usize {
297 0 }
299}
300impl HeapSize for i32 {
301 fn heap_size(&self) -> usize {
302 0 }
304}
305impl HeapSize for i64 {
306 fn heap_size(&self) -> usize {
307 0 }
309}
310
311impl HeapSize for f32 {
312 fn heap_size(&self) -> usize {
313 0 }
315}
316impl HeapSize for f64 {
317 fn heap_size(&self) -> usize {
318 0 }
320}
321
322impl HeapSize for usize {
323 fn heap_size(&self) -> usize {
324 0 }
326}
327
328impl HeapSize for BoundaryOrder {
329 fn heap_size(&self) -> usize {
330 0 }
332}
333
334impl HeapSize for PageLocation {
335 fn heap_size(&self) -> usize {
336 0 }
338}
339
340impl HeapSize for ColumnOrder {
341 fn heap_size(&self) -> usize {
342 0 }
344}