parquet/file/metadata/
memory.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
//! Memory calculations for [`ParquetMetaData::memory_size`]
//!
//! [`ParquetMetaData::memory_size`]: crate::file::metadata::ParquetMetaData::memory_size
21use crate::basic::{BoundaryOrder, ColumnOrder, Compression, Encoding, PageType};
22use crate::data_type::private::ParquetValueType;
23use crate::file::metadata::{
24    ColumnChunkMetaData, FileMetaData, KeyValue, PageEncodingStats, RowGroupMetaData, SortingColumn,
25};
26use crate::file::page_index::column_index::{
27    ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex,
28};
29use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation};
30use crate::file::statistics::{Statistics, ValueStatistics};
31use std::collections::HashMap;
32use std::sync::Arc;
33
/// Reports how much heap memory a value owns.
pub trait HeapSize {
    /// Returns the number of heap-allocated bytes owned by this value,
    /// counting heap memory held by nested structures as well.
    ///
    /// The inline size of the value itself is *not* part of the result;
    /// the caller (for example an enclosing container) is expected to add it.
    fn heap_size(&self) -> usize;
}
43
44impl<T: HeapSize> HeapSize for Vec<T> {
45    fn heap_size(&self) -> usize {
46        let item_size = std::mem::size_of::<T>();
47        // account for the contents of the Vec
48        (self.capacity() * item_size) +
49        // add any heap allocations by contents
50        self.iter().map(|t| t.heap_size()).sum::<usize>()
51    }
52}
53
54impl<K: HeapSize, V: HeapSize> HeapSize for HashMap<K, V> {
55    fn heap_size(&self) -> usize {
56        let capacity = self.capacity();
57        if capacity == 0 {
58            return 0;
59        }
60
61        // HashMap doesn't provide a way to get its heap size, so this is an approximation based on
62        // the behavior of hashbrown::HashMap as at version 0.16.0, and may become inaccurate
63        // if the implementation changes.
64        let key_val_size = std::mem::size_of::<(K, V)>();
65        // Overhead for the control tags group, which may be smaller depending on architecture
66        let group_size = 16;
67        // 1 byte of metadata stored per bucket.
68        let metadata_size = 1;
69
70        // Compute the number of buckets for the capacity. Based on hashbrown's capacity_to_buckets
71        let buckets = if capacity < 15 {
72            let min_cap = match key_val_size {
73                0..=1 => 14,
74                2..=3 => 7,
75                _ => 3,
76            };
77            let cap = min_cap.max(capacity);
78            if cap < 4 {
79                4
80            } else if cap < 8 {
81                8
82            } else {
83                16
84            }
85        } else {
86            (capacity.saturating_mul(8) / 7).next_power_of_two()
87        };
88
89        group_size
90            + (buckets * (key_val_size + metadata_size))
91            + self.keys().map(|k| k.heap_size()).sum::<usize>()
92            + self.values().map(|v| v.heap_size()).sum::<usize>()
93    }
94}
95
96impl<T: HeapSize> HeapSize for Arc<T> {
97    fn heap_size(&self) -> usize {
98        // Arc stores weak and strong counts on the heap alongside an instance of T
99        2 * std::mem::size_of::<usize>() + std::mem::size_of::<T>() + self.as_ref().heap_size()
100    }
101}
102
103impl HeapSize for Arc<dyn HeapSize> {
104    fn heap_size(&self) -> usize {
105        2 * std::mem::size_of::<usize>()
106            + std::mem::size_of_val(self.as_ref())
107            + self.as_ref().heap_size()
108    }
109}
110
111impl<T: HeapSize> HeapSize for Box<T> {
112    fn heap_size(&self) -> usize {
113        std::mem::size_of::<T>() + self.as_ref().heap_size()
114    }
115}
116
117impl<T: HeapSize> HeapSize for Option<T> {
118    fn heap_size(&self) -> usize {
119        self.as_ref().map(|inner| inner.heap_size()).unwrap_or(0)
120    }
121}
122
123impl HeapSize for String {
124    fn heap_size(&self) -> usize {
125        self.capacity()
126    }
127}
128
129impl HeapSize for FileMetaData {
130    fn heap_size(&self) -> usize {
131        #[cfg(feature = "encryption")]
132        let encryption_heap_size =
133            self.encryption_algorithm.heap_size() + self.footer_signing_key_metadata.heap_size();
134        #[cfg(not(feature = "encryption"))]
135        let encryption_heap_size = 0;
136
137        self.created_by.heap_size()
138            + self.key_value_metadata.heap_size()
139            + self.schema_descr.heap_size()
140            + self.column_orders.heap_size()
141            + encryption_heap_size
142    }
143}
144
145impl HeapSize for KeyValue {
146    fn heap_size(&self) -> usize {
147        self.key.heap_size() + self.value.heap_size()
148    }
149}
150
151impl HeapSize for RowGroupMetaData {
152    fn heap_size(&self) -> usize {
153        // don't count schema_descr here because it is already
154        // counted in FileMetaData
155        self.columns.heap_size() + self.sorting_columns.heap_size()
156    }
157}
158
159impl HeapSize for ColumnChunkMetaData {
160    fn heap_size(&self) -> usize {
161        #[cfg(feature = "encryption")]
162        let encryption_heap_size =
163            self.column_crypto_metadata.heap_size() + self.encrypted_column_metadata.heap_size();
164        #[cfg(not(feature = "encryption"))]
165        let encryption_heap_size = 0;
166
167        // don't count column_descr here because it is already counted in
168        // FileMetaData
169        self.encodings.heap_size()
170            + self.file_path.heap_size()
171            + self.compression.heap_size()
172            + self.statistics.heap_size()
173            + self.encoding_stats.heap_size()
174            + self.unencoded_byte_array_data_bytes.heap_size()
175            + self.repetition_level_histogram.heap_size()
176            + self.definition_level_histogram.heap_size()
177            + self.geo_statistics.heap_size()
178            + encryption_heap_size
179    }
180}
181
182impl HeapSize for Encoding {
183    fn heap_size(&self) -> usize {
184        0 // no heap allocations
185    }
186}
187
188impl HeapSize for PageEncodingStats {
189    fn heap_size(&self) -> usize {
190        self.page_type.heap_size() + self.encoding.heap_size()
191    }
192}
193
194impl HeapSize for SortingColumn {
195    fn heap_size(&self) -> usize {
196        0 // no heap allocations
197    }
198}
199impl HeapSize for Compression {
200    fn heap_size(&self) -> usize {
201        0 // no heap allocations
202    }
203}
204
205impl HeapSize for PageType {
206    fn heap_size(&self) -> usize {
207        0 // no heap allocations
208    }
209}
210
211impl HeapSize for Statistics {
212    fn heap_size(&self) -> usize {
213        match self {
214            Statistics::Boolean(value_statistics) => value_statistics.heap_size(),
215            Statistics::Int32(value_statistics) => value_statistics.heap_size(),
216            Statistics::Int64(value_statistics) => value_statistics.heap_size(),
217            Statistics::Int96(value_statistics) => value_statistics.heap_size(),
218            Statistics::Float(value_statistics) => value_statistics.heap_size(),
219            Statistics::Double(value_statistics) => value_statistics.heap_size(),
220            Statistics::ByteArray(value_statistics) => value_statistics.heap_size(),
221            Statistics::FixedLenByteArray(value_statistics) => value_statistics.heap_size(),
222        }
223    }
224}
225
226impl HeapSize for OffsetIndexMetaData {
227    fn heap_size(&self) -> usize {
228        self.page_locations.heap_size() + self.unencoded_byte_array_data_bytes.heap_size()
229    }
230}
231
232impl HeapSize for ColumnIndexMetaData {
233    fn heap_size(&self) -> usize {
234        match self {
235            Self::NONE => 0,
236            Self::BOOLEAN(native_index) => native_index.heap_size(),
237            Self::INT32(native_index) => native_index.heap_size(),
238            Self::INT64(native_index) => native_index.heap_size(),
239            Self::INT96(native_index) => native_index.heap_size(),
240            Self::FLOAT(native_index) => native_index.heap_size(),
241            Self::DOUBLE(native_index) => native_index.heap_size(),
242            Self::BYTE_ARRAY(native_index) => native_index.heap_size(),
243            Self::FIXED_LEN_BYTE_ARRAY(native_index) => native_index.heap_size(),
244        }
245    }
246}
247
248impl HeapSize for ColumnIndex {
249    fn heap_size(&self) -> usize {
250        self.null_pages.heap_size()
251            + self.boundary_order.heap_size()
252            + self.null_counts.heap_size()
253            + self.definition_level_histograms.heap_size()
254            + self.repetition_level_histograms.heap_size()
255    }
256}
257
258impl<T: ParquetValueType> HeapSize for PrimitiveColumnIndex<T> {
259    fn heap_size(&self) -> usize {
260        self.column_index.heap_size() + self.min_values.heap_size() + self.max_values.heap_size()
261    }
262}
263
264impl HeapSize for ByteArrayColumnIndex {
265    fn heap_size(&self) -> usize {
266        self.column_index.heap_size()
267            + self.min_bytes.heap_size()
268            + self.min_offsets.heap_size()
269            + self.max_bytes.heap_size()
270            + self.max_offsets.heap_size()
271    }
272}
273
274impl<T: ParquetValueType> HeapSize for ValueStatistics<T> {
275    fn heap_size(&self) -> usize {
276        self.min_opt().map(T::heap_size).unwrap_or(0)
277            + self.max_opt().map(T::heap_size).unwrap_or(0)
278    }
279}
280impl HeapSize for bool {
281    fn heap_size(&self) -> usize {
282        0 // no heap allocations
283    }
284}
285impl HeapSize for u8 {
286    fn heap_size(&self) -> usize {
287        0 // no heap allocations
288    }
289}
290impl HeapSize for i32 {
291    fn heap_size(&self) -> usize {
292        0 // no heap allocations
293    }
294}
295impl HeapSize for i64 {
296    fn heap_size(&self) -> usize {
297        0 // no heap allocations
298    }
299}
300
301impl HeapSize for f32 {
302    fn heap_size(&self) -> usize {
303        0 // no heap allocations
304    }
305}
306impl HeapSize for f64 {
307    fn heap_size(&self) -> usize {
308        0 // no heap allocations
309    }
310}
311
312impl HeapSize for usize {
313    fn heap_size(&self) -> usize {
314        0 // no heap allocations
315    }
316}
317
318impl HeapSize for BoundaryOrder {
319    fn heap_size(&self) -> usize {
320        0 // no heap allocations
321    }
322}
323
324impl HeapSize for PageLocation {
325    fn heap_size(&self) -> usize {
326        0 // no heap allocations
327    }
328}
329
330impl HeapSize for ColumnOrder {
331    fn heap_size(&self) -> usize {
332        0 // no heap allocations in ColumnOrder
333    }
334}