parquet/file/metadata/
memory.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
//! Memory calculations for [`ParquetMetaData::memory_size`]
//!
//! [`ParquetMetaData::memory_size`]: crate::file::metadata::ParquetMetaData::memory_size
21use crate::basic::{BoundaryOrder, ColumnOrder, Compression, Encoding, PageType};
22use crate::data_type::private::ParquetValueType;
23use crate::file::metadata::{
24    ColumnChunkMetaData, FileMetaData, KeyValue, PageEncodingStats, RowGroupMetaData, SortingColumn,
25};
26use crate::file::page_index::column_index::{
27    ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex,
28};
29use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation};
30use crate::file::statistics::{Statistics, ValueStatistics};
31use std::sync::Arc;
32
/// Reports how much heap memory a value owns.
///
/// Implemented for the containers and metadata types in this module so that
/// their heap usage can be summed recursively.
pub trait HeapSize {
    /// Returns the number of bytes this value has allocated on the heap,
    /// counting heap memory owned by anything nested inside it.
    ///
    /// The inline size of `Self` is deliberately excluded: whoever holds the
    /// value (for example an enclosing container) is responsible for adding it.
    fn heap_size(&self) -> usize;
}
42
43impl<T: HeapSize> HeapSize for Vec<T> {
44    fn heap_size(&self) -> usize {
45        let item_size = std::mem::size_of::<T>();
46        // account for the contents of the Vec
47        (self.capacity() * item_size) +
48        // add any heap allocations by contents
49        self.iter().map(|t| t.heap_size()).sum::<usize>()
50    }
51}
52
53impl<T: HeapSize> HeapSize for Arc<T> {
54    fn heap_size(&self) -> usize {
55        self.as_ref().heap_size()
56    }
57}
58
59impl<T: HeapSize> HeapSize for Box<T> {
60    fn heap_size(&self) -> usize {
61        std::mem::size_of::<T>() + self.as_ref().heap_size()
62    }
63}
64
65impl<T: HeapSize> HeapSize for Option<T> {
66    fn heap_size(&self) -> usize {
67        self.as_ref().map(|inner| inner.heap_size()).unwrap_or(0)
68    }
69}
70
71impl HeapSize for String {
72    fn heap_size(&self) -> usize {
73        self.capacity()
74    }
75}
76
77impl HeapSize for FileMetaData {
78    fn heap_size(&self) -> usize {
79        #[cfg(feature = "encryption")]
80        let encryption_heap_size =
81            self.encryption_algorithm.heap_size() + self.footer_signing_key_metadata.heap_size();
82        #[cfg(not(feature = "encryption"))]
83        let encryption_heap_size = 0;
84
85        self.created_by.heap_size()
86            + self.key_value_metadata.heap_size()
87            + self.schema_descr.heap_size()
88            + self.column_orders.heap_size()
89            + encryption_heap_size
90    }
91}
92
93impl HeapSize for KeyValue {
94    fn heap_size(&self) -> usize {
95        self.key.heap_size() + self.value.heap_size()
96    }
97}
98
99impl HeapSize for RowGroupMetaData {
100    fn heap_size(&self) -> usize {
101        // don't count schema_descr here because it is already
102        // counted in FileMetaData
103        self.columns.heap_size() + self.sorting_columns.heap_size()
104    }
105}
106
107impl HeapSize for ColumnChunkMetaData {
108    fn heap_size(&self) -> usize {
109        #[cfg(feature = "encryption")]
110        let encryption_heap_size =
111            self.column_crypto_metadata.heap_size() + self.encrypted_column_metadata.heap_size();
112        #[cfg(not(feature = "encryption"))]
113        let encryption_heap_size = 0;
114
115        // don't count column_descr here because it is already counted in
116        // FileMetaData
117        self.encodings.heap_size()
118            + self.file_path.heap_size()
119            + self.compression.heap_size()
120            + self.statistics.heap_size()
121            + self.encoding_stats.heap_size()
122            + self.unencoded_byte_array_data_bytes.heap_size()
123            + self.repetition_level_histogram.heap_size()
124            + self.definition_level_histogram.heap_size()
125            + self.geo_statistics.heap_size()
126            + encryption_heap_size
127    }
128}
129
130impl HeapSize for Encoding {
131    fn heap_size(&self) -> usize {
132        0 // no heap allocations
133    }
134}
135
136impl HeapSize for PageEncodingStats {
137    fn heap_size(&self) -> usize {
138        self.page_type.heap_size() + self.encoding.heap_size()
139    }
140}
141
142impl HeapSize for SortingColumn {
143    fn heap_size(&self) -> usize {
144        0 // no heap allocations
145    }
146}
147impl HeapSize for Compression {
148    fn heap_size(&self) -> usize {
149        0 // no heap allocations
150    }
151}
152
153impl HeapSize for PageType {
154    fn heap_size(&self) -> usize {
155        0 // no heap allocations
156    }
157}
158
159impl HeapSize for Statistics {
160    fn heap_size(&self) -> usize {
161        match self {
162            Statistics::Boolean(value_statistics) => value_statistics.heap_size(),
163            Statistics::Int32(value_statistics) => value_statistics.heap_size(),
164            Statistics::Int64(value_statistics) => value_statistics.heap_size(),
165            Statistics::Int96(value_statistics) => value_statistics.heap_size(),
166            Statistics::Float(value_statistics) => value_statistics.heap_size(),
167            Statistics::Double(value_statistics) => value_statistics.heap_size(),
168            Statistics::ByteArray(value_statistics) => value_statistics.heap_size(),
169            Statistics::FixedLenByteArray(value_statistics) => value_statistics.heap_size(),
170        }
171    }
172}
173
174impl HeapSize for OffsetIndexMetaData {
175    fn heap_size(&self) -> usize {
176        self.page_locations.heap_size() + self.unencoded_byte_array_data_bytes.heap_size()
177    }
178}
179
180impl HeapSize for ColumnIndexMetaData {
181    fn heap_size(&self) -> usize {
182        match self {
183            Self::NONE => 0,
184            Self::BOOLEAN(native_index) => native_index.heap_size(),
185            Self::INT32(native_index) => native_index.heap_size(),
186            Self::INT64(native_index) => native_index.heap_size(),
187            Self::INT96(native_index) => native_index.heap_size(),
188            Self::FLOAT(native_index) => native_index.heap_size(),
189            Self::DOUBLE(native_index) => native_index.heap_size(),
190            Self::BYTE_ARRAY(native_index) => native_index.heap_size(),
191            Self::FIXED_LEN_BYTE_ARRAY(native_index) => native_index.heap_size(),
192        }
193    }
194}
195
196impl HeapSize for ColumnIndex {
197    fn heap_size(&self) -> usize {
198        self.null_pages.heap_size()
199            + self.boundary_order.heap_size()
200            + self.null_counts.heap_size()
201            + self.definition_level_histograms.heap_size()
202            + self.repetition_level_histograms.heap_size()
203    }
204}
205
206impl<T: ParquetValueType> HeapSize for PrimitiveColumnIndex<T> {
207    fn heap_size(&self) -> usize {
208        self.column_index.heap_size() + self.min_values.heap_size() + self.max_values.heap_size()
209    }
210}
211
212impl HeapSize for ByteArrayColumnIndex {
213    fn heap_size(&self) -> usize {
214        self.column_index.heap_size()
215            + self.min_bytes.heap_size()
216            + self.min_offsets.heap_size()
217            + self.max_bytes.heap_size()
218            + self.max_offsets.heap_size()
219    }
220}
221
222impl<T: ParquetValueType> HeapSize for ValueStatistics<T> {
223    fn heap_size(&self) -> usize {
224        self.min_opt().map(T::heap_size).unwrap_or(0)
225            + self.max_opt().map(T::heap_size).unwrap_or(0)
226    }
227}
228impl HeapSize for bool {
229    fn heap_size(&self) -> usize {
230        0 // no heap allocations
231    }
232}
233impl HeapSize for u8 {
234    fn heap_size(&self) -> usize {
235        0 // no heap allocations
236    }
237}
238impl HeapSize for i32 {
239    fn heap_size(&self) -> usize {
240        0 // no heap allocations
241    }
242}
243impl HeapSize for i64 {
244    fn heap_size(&self) -> usize {
245        0 // no heap allocations
246    }
247}
248
249impl HeapSize for f32 {
250    fn heap_size(&self) -> usize {
251        0 // no heap allocations
252    }
253}
254impl HeapSize for f64 {
255    fn heap_size(&self) -> usize {
256        0 // no heap allocations
257    }
258}
259
260impl HeapSize for usize {
261    fn heap_size(&self) -> usize {
262        0 // no heap allocations
263    }
264}
265
266impl HeapSize for BoundaryOrder {
267    fn heap_size(&self) -> usize {
268        0 // no heap allocations
269    }
270}
271
272impl HeapSize for PageLocation {
273    fn heap_size(&self) -> usize {
274        0 // no heap allocations
275    }
276}
277
278impl HeapSize for ColumnOrder {
279    fn heap_size(&self) -> usize {
280        0 // no heap allocations in ColumnOrder
281    }
282}