parquet/file/metadata/
memory.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
//! Memory calculations for [`ParquetMetaData::memory_size`]
//!
//! [`ParquetMetaData::memory_size`]: crate::file::metadata::ParquetMetaData::memory_size
21use crate::basic::{ColumnOrder, Compression, Encoding, PageType};
22use crate::data_type::private::ParquetValueType;
23use crate::file::metadata::{ColumnChunkMetaData, FileMetaData, KeyValue, RowGroupMetaData};
24use crate::file::page_encoding_stats::PageEncodingStats;
25use crate::file::page_index::index::{Index, NativeIndex, PageIndex};
26use crate::file::page_index::offset_index::OffsetIndexMetaData;
27use crate::file::statistics::{Statistics, ValueStatistics};
28use crate::format::{BoundaryOrder, PageLocation, SortingColumn};
29use std::sync::Arc;
30
/// Reports how much heap memory a value owns.
pub trait HeapSize {
    /// Returns the number of heap-allocated bytes owned by this value,
    /// counting the allocations made by any values nested inside it.
    ///
    /// The inline size of the value itself is deliberately left out of the
    /// result; whoever stores the value (for example, an enclosing
    /// container) is responsible for adding it.
    fn heap_size(&self) -> usize;
}
40
41impl<T: HeapSize> HeapSize for Vec<T> {
42    fn heap_size(&self) -> usize {
43        let item_size = std::mem::size_of::<T>();
44        // account for the contents of the Vec
45        (self.capacity() * item_size) +
46        // add any heap allocations by contents
47        self.iter().map(|t| t.heap_size()).sum::<usize>()
48    }
49}
50
51impl<T: HeapSize> HeapSize for Arc<T> {
52    fn heap_size(&self) -> usize {
53        self.as_ref().heap_size()
54    }
55}
56
57impl<T: HeapSize> HeapSize for Option<T> {
58    fn heap_size(&self) -> usize {
59        self.as_ref().map(|inner| inner.heap_size()).unwrap_or(0)
60    }
61}
62
63impl HeapSize for String {
64    fn heap_size(&self) -> usize {
65        self.capacity()
66    }
67}
68
69impl HeapSize for FileMetaData {
70    fn heap_size(&self) -> usize {
71        self.created_by.heap_size()
72            + self.key_value_metadata.heap_size()
73            + self.schema_descr.heap_size()
74            + self.column_orders.heap_size()
75    }
76}
77
78impl HeapSize for KeyValue {
79    fn heap_size(&self) -> usize {
80        self.key.heap_size() + self.value.heap_size()
81    }
82}
83
84impl HeapSize for RowGroupMetaData {
85    fn heap_size(&self) -> usize {
86        // don't count schema_descr here because it is already
87        // counted in FileMetaData
88        self.columns.heap_size() + self.sorting_columns.heap_size()
89    }
90}
91
92impl HeapSize for ColumnChunkMetaData {
93    fn heap_size(&self) -> usize {
94        // don't count column_descr here because it is already counted in
95        // FileMetaData
96        self.encodings.heap_size()
97            + self.file_path.heap_size()
98            + self.compression.heap_size()
99            + self.statistics.heap_size()
100            + self.encoding_stats.heap_size()
101            + self.unencoded_byte_array_data_bytes.heap_size()
102            + self.repetition_level_histogram.heap_size()
103            + self.definition_level_histogram.heap_size()
104    }
105}
106
107impl HeapSize for Encoding {
108    fn heap_size(&self) -> usize {
109        0 // no heap allocations
110    }
111}
112
113impl HeapSize for PageEncodingStats {
114    fn heap_size(&self) -> usize {
115        self.page_type.heap_size() + self.encoding.heap_size()
116    }
117}
118
119impl HeapSize for SortingColumn {
120    fn heap_size(&self) -> usize {
121        0 // no heap allocations
122    }
123}
124impl HeapSize for Compression {
125    fn heap_size(&self) -> usize {
126        0 // no heap allocations
127    }
128}
129
130impl HeapSize for PageType {
131    fn heap_size(&self) -> usize {
132        0 // no heap allocations
133    }
134}
135impl HeapSize for Statistics {
136    fn heap_size(&self) -> usize {
137        match self {
138            Statistics::Boolean(value_statistics) => value_statistics.heap_size(),
139            Statistics::Int32(value_statistics) => value_statistics.heap_size(),
140            Statistics::Int64(value_statistics) => value_statistics.heap_size(),
141            Statistics::Int96(value_statistics) => value_statistics.heap_size(),
142            Statistics::Float(value_statistics) => value_statistics.heap_size(),
143            Statistics::Double(value_statistics) => value_statistics.heap_size(),
144            Statistics::ByteArray(value_statistics) => value_statistics.heap_size(),
145            Statistics::FixedLenByteArray(value_statistics) => value_statistics.heap_size(),
146        }
147    }
148}
149
150impl HeapSize for OffsetIndexMetaData {
151    fn heap_size(&self) -> usize {
152        self.page_locations.heap_size() + self.unencoded_byte_array_data_bytes.heap_size()
153    }
154}
155
156impl HeapSize for Index {
157    fn heap_size(&self) -> usize {
158        match self {
159            Index::NONE => 0,
160            Index::BOOLEAN(native_index) => native_index.heap_size(),
161            Index::INT32(native_index) => native_index.heap_size(),
162            Index::INT64(native_index) => native_index.heap_size(),
163            Index::INT96(native_index) => native_index.heap_size(),
164            Index::FLOAT(native_index) => native_index.heap_size(),
165            Index::DOUBLE(native_index) => native_index.heap_size(),
166            Index::BYTE_ARRAY(native_index) => native_index.heap_size(),
167            Index::FIXED_LEN_BYTE_ARRAY(native_index) => native_index.heap_size(),
168        }
169    }
170}
171
172impl<T: ParquetValueType> HeapSize for NativeIndex<T> {
173    fn heap_size(&self) -> usize {
174        self.indexes.heap_size() + self.boundary_order.heap_size()
175    }
176}
177
178impl<T: ParquetValueType> HeapSize for PageIndex<T> {
179    fn heap_size(&self) -> usize {
180        self.min.heap_size() + self.max.heap_size() + self.null_count.heap_size()
181    }
182}
183
184impl<T: ParquetValueType> HeapSize for ValueStatistics<T> {
185    fn heap_size(&self) -> usize {
186        self.min_opt().map(T::heap_size).unwrap_or(0)
187            + self.max_opt().map(T::heap_size).unwrap_or(0)
188    }
189}
190impl HeapSize for bool {
191    fn heap_size(&self) -> usize {
192        0 // no heap allocations
193    }
194}
195impl HeapSize for i32 {
196    fn heap_size(&self) -> usize {
197        0 // no heap allocations
198    }
199}
200impl HeapSize for i64 {
201    fn heap_size(&self) -> usize {
202        0 // no heap allocations
203    }
204}
205
206impl HeapSize for f32 {
207    fn heap_size(&self) -> usize {
208        0 // no heap allocations
209    }
210}
211impl HeapSize for f64 {
212    fn heap_size(&self) -> usize {
213        0 // no heap allocations
214    }
215}
216
217impl HeapSize for usize {
218    fn heap_size(&self) -> usize {
219        0 // no heap allocations
220    }
221}
222
223impl HeapSize for BoundaryOrder {
224    fn heap_size(&self) -> usize {
225        0 // no heap allocations
226    }
227}
228
229impl HeapSize for PageLocation {
230    fn heap_size(&self) -> usize {
231        0 // no heap allocations
232    }
233}
234
235impl HeapSize for ColumnOrder {
236    fn heap_size(&self) -> usize {
237        0 // no heap allocations in ColumnOrder
238    }
239}