parquet/file/metadata/
memory.rs1use crate::basic::{BoundaryOrder, ColumnOrder, Compression, Encoding, PageType};
22use crate::data_type::private::ParquetValueType;
23use crate::file::metadata::{
24 ColumnChunkMetaData, FileMetaData, KeyValue, PageEncodingStats, RowGroupMetaData, SortingColumn,
25};
26use crate::file::page_index::column_index::{
27 ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex,
28};
29use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation};
30use crate::file::statistics::{Statistics, ValueStatistics};
31use std::collections::HashMap;
32use std::sync::Arc;
33
/// Reports how much heap memory a value owns.
///
/// The figure covers only memory reached *through* the value; the bytes of
/// the value itself (`size_of::<Self>()`) are not part of it. Containers that
/// place the value on the heap add that inline size themselves.
pub trait HeapSize {
    /// Returns the number of heap bytes owned by `self`.
    fn heap_size(&self) -> usize;
}
43
44impl<T: HeapSize> HeapSize for Vec<T> {
45 fn heap_size(&self) -> usize {
46 let item_size = std::mem::size_of::<T>();
47 (self.capacity() * item_size) +
49 self.iter().map(|t| t.heap_size()).sum::<usize>()
51 }
52}
53
54impl<K: HeapSize, V: HeapSize> HeapSize for HashMap<K, V> {
55 fn heap_size(&self) -> usize {
56 let capacity = self.capacity();
57 if capacity == 0 {
58 return 0;
59 }
60
61 let key_val_size = std::mem::size_of::<(K, V)>();
65 let group_size = 16;
67 let metadata_size = 1;
69
70 let buckets = if capacity < 15 {
72 let min_cap = match key_val_size {
73 0..=1 => 14,
74 2..=3 => 7,
75 _ => 3,
76 };
77 let cap = min_cap.max(capacity);
78 if cap < 4 {
79 4
80 } else if cap < 8 {
81 8
82 } else {
83 16
84 }
85 } else {
86 (capacity.saturating_mul(8) / 7).next_power_of_two()
87 };
88
89 group_size
90 + (buckets * (key_val_size + metadata_size))
91 + self.keys().map(|k| k.heap_size()).sum::<usize>()
92 + self.values().map(|v| v.heap_size()).sum::<usize>()
93 }
94}
95
96impl<T: HeapSize> HeapSize for Arc<T> {
97 fn heap_size(&self) -> usize {
98 2 * std::mem::size_of::<usize>() + std::mem::size_of::<T>() + self.as_ref().heap_size()
100 }
101}
102
103impl HeapSize for Arc<dyn HeapSize> {
104 fn heap_size(&self) -> usize {
105 2 * std::mem::size_of::<usize>()
106 + std::mem::size_of_val(self.as_ref())
107 + self.as_ref().heap_size()
108 }
109}
110
111impl<T: HeapSize> HeapSize for Box<T> {
112 fn heap_size(&self) -> usize {
113 std::mem::size_of::<T>() + self.as_ref().heap_size()
114 }
115}
116
117impl<T: HeapSize> HeapSize for Option<T> {
118 fn heap_size(&self) -> usize {
119 self.as_ref().map(|inner| inner.heap_size()).unwrap_or(0)
120 }
121}
122
123impl HeapSize for String {
124 fn heap_size(&self) -> usize {
125 self.capacity()
126 }
127}
128
129impl HeapSize for FileMetaData {
130 fn heap_size(&self) -> usize {
131 #[cfg(feature = "encryption")]
132 let encryption_heap_size =
133 self.encryption_algorithm.heap_size() + self.footer_signing_key_metadata.heap_size();
134 #[cfg(not(feature = "encryption"))]
135 let encryption_heap_size = 0;
136
137 self.created_by.heap_size()
138 + self.key_value_metadata.heap_size()
139 + self.schema_descr.heap_size()
140 + self.column_orders.heap_size()
141 + encryption_heap_size
142 }
143}
144
145impl HeapSize for KeyValue {
146 fn heap_size(&self) -> usize {
147 self.key.heap_size() + self.value.heap_size()
148 }
149}
150
151impl HeapSize for RowGroupMetaData {
152 fn heap_size(&self) -> usize {
153 self.columns.heap_size() + self.sorting_columns.heap_size()
156 }
157}
158
159impl HeapSize for ColumnChunkMetaData {
160 fn heap_size(&self) -> usize {
161 #[cfg(feature = "encryption")]
162 let encryption_heap_size =
163 self.column_crypto_metadata.heap_size() + self.encrypted_column_metadata.heap_size();
164 #[cfg(not(feature = "encryption"))]
165 let encryption_heap_size = 0;
166
167 self.encodings.heap_size()
170 + self.file_path.heap_size()
171 + self.compression.heap_size()
172 + self.statistics.heap_size()
173 + self.encoding_stats.heap_size()
174 + self.unencoded_byte_array_data_bytes.heap_size()
175 + self.repetition_level_histogram.heap_size()
176 + self.definition_level_histogram.heap_size()
177 + self.geo_statistics.heap_size()
178 + encryption_heap_size
179 }
180}
181
182impl HeapSize for Encoding {
183 fn heap_size(&self) -> usize {
184 0 }
186}
187
188impl HeapSize for PageEncodingStats {
189 fn heap_size(&self) -> usize {
190 self.page_type.heap_size() + self.encoding.heap_size()
191 }
192}
193
194impl HeapSize for SortingColumn {
195 fn heap_size(&self) -> usize {
196 0 }
198}
199impl HeapSize for Compression {
200 fn heap_size(&self) -> usize {
201 0 }
203}
204
205impl HeapSize for PageType {
206 fn heap_size(&self) -> usize {
207 0 }
209}
210
211impl HeapSize for Statistics {
212 fn heap_size(&self) -> usize {
213 match self {
214 Statistics::Boolean(value_statistics) => value_statistics.heap_size(),
215 Statistics::Int32(value_statistics) => value_statistics.heap_size(),
216 Statistics::Int64(value_statistics) => value_statistics.heap_size(),
217 Statistics::Int96(value_statistics) => value_statistics.heap_size(),
218 Statistics::Float(value_statistics) => value_statistics.heap_size(),
219 Statistics::Double(value_statistics) => value_statistics.heap_size(),
220 Statistics::ByteArray(value_statistics) => value_statistics.heap_size(),
221 Statistics::FixedLenByteArray(value_statistics) => value_statistics.heap_size(),
222 }
223 }
224}
225
226impl HeapSize for OffsetIndexMetaData {
227 fn heap_size(&self) -> usize {
228 self.page_locations.heap_size() + self.unencoded_byte_array_data_bytes.heap_size()
229 }
230}
231
232impl HeapSize for ColumnIndexMetaData {
233 fn heap_size(&self) -> usize {
234 match self {
235 Self::NONE => 0,
236 Self::BOOLEAN(native_index) => native_index.heap_size(),
237 Self::INT32(native_index) => native_index.heap_size(),
238 Self::INT64(native_index) => native_index.heap_size(),
239 Self::INT96(native_index) => native_index.heap_size(),
240 Self::FLOAT(native_index) => native_index.heap_size(),
241 Self::DOUBLE(native_index) => native_index.heap_size(),
242 Self::BYTE_ARRAY(native_index) => native_index.heap_size(),
243 Self::FIXED_LEN_BYTE_ARRAY(native_index) => native_index.heap_size(),
244 }
245 }
246}
247
248impl HeapSize for ColumnIndex {
249 fn heap_size(&self) -> usize {
250 self.null_pages.heap_size()
251 + self.boundary_order.heap_size()
252 + self.null_counts.heap_size()
253 + self.definition_level_histograms.heap_size()
254 + self.repetition_level_histograms.heap_size()
255 }
256}
257
258impl<T: ParquetValueType> HeapSize for PrimitiveColumnIndex<T> {
259 fn heap_size(&self) -> usize {
260 self.column_index.heap_size() + self.min_values.heap_size() + self.max_values.heap_size()
261 }
262}
263
264impl HeapSize for ByteArrayColumnIndex {
265 fn heap_size(&self) -> usize {
266 self.column_index.heap_size()
267 + self.min_bytes.heap_size()
268 + self.min_offsets.heap_size()
269 + self.max_bytes.heap_size()
270 + self.max_offsets.heap_size()
271 }
272}
273
274impl<T: ParquetValueType> HeapSize for ValueStatistics<T> {
275 fn heap_size(&self) -> usize {
276 self.min_opt().map(T::heap_size).unwrap_or(0)
277 + self.max_opt().map(T::heap_size).unwrap_or(0)
278 }
279}
280impl HeapSize for bool {
281 fn heap_size(&self) -> usize {
282 0 }
284}
285impl HeapSize for u8 {
286 fn heap_size(&self) -> usize {
287 0 }
289}
290impl HeapSize for i32 {
291 fn heap_size(&self) -> usize {
292 0 }
294}
295impl HeapSize for i64 {
296 fn heap_size(&self) -> usize {
297 0 }
299}
300
301impl HeapSize for f32 {
302 fn heap_size(&self) -> usize {
303 0 }
305}
306impl HeapSize for f64 {
307 fn heap_size(&self) -> usize {
308 0 }
310}
311
312impl HeapSize for usize {
313 fn heap_size(&self) -> usize {
314 0 }
316}
317
318impl HeapSize for BoundaryOrder {
319 fn heap_size(&self) -> usize {
320 0 }
322}
323
324impl HeapSize for PageLocation {
325 fn heap_size(&self) -> usize {
326 0 }
328}
329
330impl HeapSize for ColumnOrder {
331 fn heap_size(&self) -> usize {
332 0 }
334}