parquet/util/interner.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::data_type::AsBytes;
19use hashbrown::HashTable;
20
21const DEFAULT_DEDUP_CAPACITY: usize = 4096;
22
23/// Storage trait for [`Interner`]
24pub trait Storage {
25 type Key: Copy;
26
27 type Value: AsBytes + ?Sized;
28
29 /// Gets an element by its key
30 fn get(&self, idx: Self::Key) -> &Self::Value;
31
32 /// Adds a new element, returning the key
33 fn push(&mut self, value: &Self::Value) -> Self::Key;
34
35 /// Return an estimate of the memory used in this storage, in bytes
36 #[allow(dead_code)] // not used in parquet_derive, so is dead there
37 fn estimated_memory_size(&self) -> usize;
38}
39
40/// A generic value interner supporting various different [`Storage`]
41#[derive(Debug, Default)]
42pub struct Interner<S: Storage> {
43 state: ahash::RandomState,
44
45 /// Used to provide a lookup from value to unique value
46 dedup: HashTable<S::Key>,
47
48 storage: S,
49}
50
51impl<S: Storage> Interner<S> {
52 /// Create a new `Interner` with the provided storage
53 pub fn new(storage: S) -> Self {
54 Self {
55 state: Default::default(),
56 dedup: HashTable::with_capacity(DEFAULT_DEDUP_CAPACITY),
57 storage,
58 }
59 }
60
61 /// Intern the value, returning the interned key, and if this was a new value
62 pub fn intern(&mut self, value: &S::Value) -> S::Key {
63 let hash = self.state.hash_one(value.as_bytes());
64
65 *self
66 .dedup
67 .entry(
68 hash,
69 // Compare bytes rather than directly comparing values so NaNs can be interned
70 |index| value.as_bytes() == self.storage.get(*index).as_bytes(),
71 |key| self.state.hash_one(self.storage.get(*key).as_bytes()),
72 )
73 .or_insert_with(|| self.storage.push(value))
74 .get()
75 }
76
77 /// Return estimate of the memory used, in bytes
78 #[allow(dead_code)] // not used in parquet_derive, so is dead there
79 pub fn estimated_memory_size(&self) -> usize {
80 self.storage.estimated_memory_size() +
81 // estimate size of dedup hashmap as just th size of the keys
82 self.dedup.capacity() + std::mem::size_of::<S::Key>()
83 }
84
85 /// Returns the storage for this interner
86 pub fn storage(&self) -> &S {
87 &self.storage
88 }
89
90 /// Unwraps the inner storage
91 #[cfg(feature = "arrow")]
92 pub fn into_inner(self) -> S {
93 self.storage
94 }
95}