parquet/util/
interner.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::data_type::AsBytes;
19use hashbrown::HashTable;
20
21const DEFAULT_DEDUP_CAPACITY: usize = 4096;
22
23/// Storage trait for [`Interner`]
24pub trait Storage {
25    type Key: Copy;
26
27    type Value: AsBytes + ?Sized;
28
29    /// Gets an element by its key
30    fn get(&self, idx: Self::Key) -> &Self::Value;
31
32    /// Adds a new element, returning the key
33    fn push(&mut self, value: &Self::Value) -> Self::Key;
34
35    /// Return an estimate of the memory used in this storage, in bytes
36    #[allow(dead_code)] // not used in parquet_derive, so is dead there
37    fn estimated_memory_size(&self) -> usize;
38}
39
40/// A generic value interner supporting various different [`Storage`]
41#[derive(Debug, Default)]
42pub struct Interner<S: Storage> {
43    state: ahash::RandomState,
44
45    /// Used to provide a lookup from value to unique value
46    dedup: HashTable<S::Key>,
47
48    storage: S,
49}
50
51impl<S: Storage> Interner<S> {
52    /// Create a new `Interner` with the provided storage
53    pub fn new(storage: S) -> Self {
54        Self {
55            state: Default::default(),
56            dedup: HashTable::with_capacity(DEFAULT_DEDUP_CAPACITY),
57            storage,
58        }
59    }
60
61    /// Intern the value, returning the interned key, and if this was a new value
62    pub fn intern(&mut self, value: &S::Value) -> S::Key {
63        let hash = self.state.hash_one(value.as_bytes());
64
65        *self
66            .dedup
67            .entry(
68                hash,
69                // Compare bytes rather than directly comparing values so NaNs can be interned
70                |index| value.as_bytes() == self.storage.get(*index).as_bytes(),
71                |key| self.state.hash_one(self.storage.get(*key).as_bytes()),
72            )
73            .or_insert_with(|| self.storage.push(value))
74            .get()
75    }
76
77    /// Return estimate of the memory used, in bytes
78    #[allow(dead_code)] // not used in parquet_derive, so is dead there
79    pub fn estimated_memory_size(&self) -> usize {
80        self.storage.estimated_memory_size() +
81            // estimate size of dedup hashmap as just th size of the keys
82            self.dedup.capacity() + std::mem::size_of::<S::Key>()
83    }
84
85    /// Returns the storage for this interner
86    pub fn storage(&self) -> &S {
87        &self.storage
88    }
89
90    /// Unwraps the inner storage
91    #[cfg(feature = "arrow")]
92    pub fn into_inner(self) -> S {
93        self.storage
94    }
95}