Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
hash-util.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 // From Apache Impala (incubating) as of 2016-02-22
19 
20 #ifndef ARROW_UTIL_HASH_UTIL_H
21 #define ARROW_UTIL_HASH_UTIL_H
22 
23 #include <cstdint>
24 
25 #include "arrow/util/cpu-info.h"
26 #include "arrow/util/logging.h"
27 #include "arrow/util/macros.h"
28 #include "arrow/util/sse-util.h"
29 
30 namespace arrow {
31 
33 class HashUtil {
34  public:
42  static uint32_t CrcHash(const void* data, int32_t bytes, uint32_t hash) {
43  uint32_t words = static_cast<uint32_t>(bytes / sizeof(uint32_t));
44  bytes = static_cast<int32_t>(bytes % sizeof(uint32_t));
45 
46  const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
47  while (words--) {
48  hash = SSE4_crc32_u32(hash, *p);
49  ++p;
50  }
51 
52  const uint8_t* s = reinterpret_cast<const uint8_t*>(p);
53  while (bytes--) {
54  hash = SSE4_crc32_u8(hash, *s);
55  ++s;
56  }
57 
58  // The lower half of the CRC hash has has poor uniformity, so swap the halves
59  // for anyone who only uses the first several bits of the hash.
60  hash = (hash << 16) | (hash >> 16);
61  return hash;
62  }
63 
65  static inline uint32_t CrcHash1(const void* v, uint32_t hash) {
66  const uint8_t* s = reinterpret_cast<const uint8_t*>(v);
67  hash = SSE4_crc32_u8(hash, *s);
68  hash = (hash << 16) | (hash >> 16);
69  return hash;
70  }
71 
73  static inline uint32_t CrcHash2(const void* v, uint32_t hash) {
74  const uint16_t* s = reinterpret_cast<const uint16_t*>(v);
75  hash = SSE4_crc32_u16(hash, *s);
76  hash = (hash << 16) | (hash >> 16);
77  return hash;
78  }
79 
81  static inline uint32_t CrcHash4(const void* v, uint32_t hash) {
82  const uint32_t* p = reinterpret_cast<const uint32_t*>(v);
83  hash = SSE4_crc32_u32(hash, *p);
84  hash = (hash << 16) | (hash >> 16);
85  return hash;
86  }
87 
89  static inline uint32_t CrcHash8(const void* v, uint32_t hash) {
90  const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
91  hash = SSE4_crc32_u64(hash, *p);
92  hash = (hash << 16) | (hash >> 16);
93  return hash;
94  }
95 
97  static inline uint32_t CrcHash12(const void* v, uint32_t hash) {
98  const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
99  hash = SSE4_crc32_u64(hash, *p);
100  ++p;
101  hash = SSE4_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));
102  hash = (hash << 16) | (hash >> 16);
103  return hash;
104  }
105 
107  static inline uint32_t CrcHash16(const void* v, uint32_t hash) {
108  const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
109  hash = SSE4_crc32_u64(hash, *p);
110  ++p;
111  hash = SSE4_crc32_u64(hash, *p);
112  hash = (hash << 16) | (hash >> 16);
113  return hash;
114  }
115 
116  static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995;
117  static const int MURMUR_R = 47;
118 
120  static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) {
121  uint64_t h = seed ^ (len * MURMUR_PRIME);
122 
123  const uint64_t* data = reinterpret_cast<const uint64_t*>(input);
124  const uint64_t* end = data + (len / sizeof(uint64_t));
125 
126  while (data != end) {
127  uint64_t k = *data++;
128  k *= MURMUR_PRIME;
129  k ^= k >> MURMUR_R;
130  k *= MURMUR_PRIME;
131  h ^= k;
132  h *= MURMUR_PRIME;
133  }
134 
135  const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data);
136  switch (len & 7) {
137  case 7:
138  h ^= uint64_t(data2[6]) << 48;
139  case 6:
140  h ^= uint64_t(data2[5]) << 40;
141  case 5:
142  h ^= uint64_t(data2[4]) << 32;
143  case 4:
144  h ^= uint64_t(data2[3]) << 24;
145  case 3:
146  h ^= uint64_t(data2[2]) << 16;
147  case 2:
148  h ^= uint64_t(data2[1]) << 8;
149  case 1:
150  h ^= uint64_t(data2[0]);
151  h *= MURMUR_PRIME;
152  }
153 
154  h ^= h >> MURMUR_R;
155  h *= MURMUR_PRIME;
156  h ^= h >> MURMUR_R;
157  return h;
158  }
159 
161  static const uint32_t FNV_PRIME = 0x01000193; // 16777619
162  static const uint32_t FNV_SEED = 0x811C9DC5; // 2166136261
163  static const uint64_t FNV64_PRIME = 1099511628211UL;
164  static const uint64_t FNV64_SEED = 14695981039346656037UL;
165 
175  static uint64_t FnvHash64(const void* data, int32_t bytes, uint64_t hash) {
176  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
177  while (bytes--) {
178  hash = (*ptr ^ hash) * FNV64_PRIME;
179  ++ptr;
180  }
181  return hash;
182  }
183 
192  static uint32_t FnvHash64to32(const void* data, int32_t bytes, uint32_t hash) {
193  // IMPALA-2270: this function should never be used for zero-byte inputs.
194  DCHECK_GT(bytes, 0);
195  uint64_t hash_u64 = hash | (static_cast<uint64_t>(hash) << 32);
196  hash_u64 = FnvHash64(data, bytes, hash_u64);
197  return static_cast<uint32_t>((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF));
198  }
199 
200  // With sse4.2
201  template <bool use_sse42 = true>
202  static inline int Hash(const void* data, int32_t bytes, uint32_t seed);
203 
205  static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9;
206 
211  static inline uint32_t HashCombine32(uint32_t value, uint32_t seed) {
212  return seed ^ (HASH_COMBINE_SEED + value + (seed << 6) + (seed >> 2));
213  }
214 
215  // Get 32 more bits of randomness from a 32-bit hash:
216  static inline uint32_t Rehash32to32(const uint32_t hash) {
217  // Constants generated by uuidgen(1) with the -r flag
218  static const uint64_t m = 0x7850f11ec6d14889ull, a = 0x6773610597ca4c63ull;
219  // This is strongly universal hashing following Dietzfelbinger's "Universal hashing
220  // and k-wise independent random variables via integer arithmetic without primes". As
221  // such, for any two distinct uint32_t's hash1 and hash2, the probability (over the
222  // randomness of the constants) that any subset of bit positions of
223  // Rehash32to32(hash1) is equal to the same subset of bit positions
224  // Rehash32to32(hash2) is minimal.
225  return static_cast<uint32_t>((static_cast<uint64_t>(hash) * m + a) >> 32);
226  }
227 
228  static inline uint64_t Rehash32to64(const uint32_t hash) {
229  static const uint64_t m1 = 0x47b6137a44974d91ull, m2 = 0x8824ad5ba2b7289cull,
230  a1 = 0x705495c62df1424aull, a2 = 0x9efc49475c6bfb31ull;
231  const uint64_t hash1 = (static_cast<uint64_t>(hash) * m1 + a1) >> 32;
232  const uint64_t hash2 = (static_cast<uint64_t>(hash) * m2 + a2) >> 32;
233  return hash1 | (hash2 << 32);
234  }
235 };
236 
237 // With sse4.2
238 template <>
239 inline int HashUtil::Hash<true>(const void* data, int32_t bytes, uint32_t seed) {
240  return static_cast<int>(HashUtil::CrcHash(data, bytes, seed));
241 }
242 
243 // Non-sse4 hash
244 template <>
245 inline int HashUtil::Hash<false>(const void* data, int32_t bytes, uint32_t seed) {
246  return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
247 }
248 
249 } // namespace arrow
250 
251 #endif // ARROW_UTIL_HASH_UTIL_H
static uint32_t CrcHash8(const void *v, uint32_t hash)
CrcHash() specialized for 8-byte data.
Definition: hash-util.h:89
static const uint32_t FNV_PRIME
default values recommended by http://isthe.com/chongo/tech/comp/fnv/
Definition: hash-util.h:161
static uint32_t CrcHash16(const void *v, uint32_t hash)
CrcHash() specialized for 16-byte data.
Definition: hash-util.h:107
static const uint64_t MURMUR_PRIME
Definition: hash-util.h:116
static uint64_t MurmurHash2_64(const void *input, int len, uint64_t seed)
Murmur2 hash implementation returning 64-bit hashes.
Definition: hash-util.h:120
static uint32_t CrcHash4(const void *v, uint32_t hash)
CrcHash() specialized for 4-byte data.
Definition: hash-util.h:81
#define DCHECK_GT(val1, val2)
Definition: logging.h:99
static const int MURMUR_R
Definition: hash-util.h:117
static const uint32_t HASH_COMBINE_SEED
The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
Definition: hash-util.h:205
Utility class to compute hash values.
Definition: hash-util.h:33
static const uint32_t FNV_SEED
Definition: hash-util.h:162
static uint32_t CrcHash12(const void *v, uint32_t hash)
CrcHash() specialized for 12-byte data.
Definition: hash-util.h:97
static uint64_t Rehash32to64(const uint32_t hash)
Definition: hash-util.h:228
static uint32_t HashCombine32(uint32_t value, uint32_t seed)
Combine hashes &#39;value&#39; and &#39;seed&#39; to get a new hash value.
Definition: hash-util.h:211
static int Hash(const void *data, int32_t bytes, uint32_t seed)
static uint32_t CrcHash1(const void *v, uint32_t hash)
CrcHash() specialized for 1-byte data.
Definition: hash-util.h:65
Top-level namespace for Apache Arrow C++ API.
Definition: adapter.h:32
static uint64_t FnvHash64(const void *data, int32_t bytes, uint64_t hash)
Implementation of the Fowler-Noll-Vo hash function.
Definition: hash-util.h:175
static uint32_t FnvHash64to32(const void *data, int32_t bytes, uint32_t hash)
Return a 32-bit hash computed by invoking FNV-64 and folding the result to 32-bits.
Definition: hash-util.h:192
static const uint64_t FNV64_SEED
Definition: hash-util.h:164
static uint32_t Rehash32to32(const uint32_t hash)
Definition: hash-util.h:216
static uint32_t CrcHash(const void *data, int32_t bytes, uint32_t hash)
Compute the Crc32 hash for data using SSE4 instructions.
Definition: hash-util.h:42
static const uint64_t FNV64_PRIME
Definition: hash-util.h:163
static uint32_t CrcHash2(const void *v, uint32_t hash)
CrcHash() specialized for 2-byte data.
Definition: hash-util.h:73