Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
hash-util.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 // From Apache Impala (incubating) as of 2016-02-22
19 
20 #ifndef ARROW_UTIL_HASH_UTIL_H
21 #define ARROW_UTIL_HASH_UTIL_H
22 
23 #include <cstdint>
24 
25 #include "arrow/util/cpu-info.h"
26 #include "arrow/util/logging.h"
27 #include "arrow/util/macros.h"
28 #include "arrow/util/sse-util.h"
29 
30 namespace arrow {
31 
33 class HashUtil {
34  public:
42  static uint32_t CrcHash(const void* data, int32_t bytes, uint32_t hash) {
44  uint32_t words = static_cast<uint32_t>(bytes / sizeof(uint32_t));
45  bytes = static_cast<int32_t>(bytes % sizeof(uint32_t));
46 
47  const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
48  while (words--) {
49  hash = SSE4_crc32_u32(hash, *p);
50  ++p;
51  }
52 
53  const uint8_t* s = reinterpret_cast<const uint8_t*>(p);
54  while (bytes--) {
55  hash = SSE4_crc32_u8(hash, *s);
56  ++s;
57  }
58 
59  // The lower half of the CRC hash has has poor uniformity, so swap the halves
60  // for anyone who only uses the first several bits of the hash.
61  hash = (hash << 16) | (hash >> 16);
62  return hash;
63  }
64 
66  static inline uint32_t CrcHash1(const void* v, uint32_t hash) {
68  const uint8_t* s = reinterpret_cast<const uint8_t*>(v);
69  hash = SSE4_crc32_u8(hash, *s);
70  hash = (hash << 16) | (hash >> 16);
71  return hash;
72  }
73 
75  static inline uint32_t CrcHash2(const void* v, uint32_t hash) {
77  const uint16_t* s = reinterpret_cast<const uint16_t*>(v);
78  hash = SSE4_crc32_u16(hash, *s);
79  hash = (hash << 16) | (hash >> 16);
80  return hash;
81  }
82 
84  static inline uint32_t CrcHash4(const void* v, uint32_t hash) {
86  const uint32_t* p = reinterpret_cast<const uint32_t*>(v);
87  hash = SSE4_crc32_u32(hash, *p);
88  hash = (hash << 16) | (hash >> 16);
89  return hash;
90  }
91 
93  static inline uint32_t CrcHash8(const void* v, uint32_t hash) {
95  const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
96  hash = SSE4_crc32_u64(hash, *p);
97  hash = (hash << 16) | (hash >> 16);
98  return hash;
99  }
100 
102  static inline uint32_t CrcHash12(const void* v, uint32_t hash) {
104  const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
105  hash = SSE4_crc32_u64(hash, *p);
106  ++p;
107  hash = SSE4_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));
108  hash = (hash << 16) | (hash >> 16);
109  return hash;
110  }
111 
113  static inline uint32_t CrcHash16(const void* v, uint32_t hash) {
115  const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
116  hash = SSE4_crc32_u64(hash, *p);
117  ++p;
118  hash = SSE4_crc32_u64(hash, *p);
119  hash = (hash << 16) | (hash >> 16);
120  return hash;
121  }
122 
123  static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995;
124  static const int MURMUR_R = 47;
125 
127  static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) {
128  uint64_t h = seed ^ (len * MURMUR_PRIME);
129 
130  const uint64_t* data = reinterpret_cast<const uint64_t*>(input);
131  const uint64_t* end = data + (len / sizeof(uint64_t));
132 
133  while (data != end) {
134  uint64_t k = *data++;
135  k *= MURMUR_PRIME;
136  k ^= k >> MURMUR_R;
137  k *= MURMUR_PRIME;
138  h ^= k;
139  h *= MURMUR_PRIME;
140  }
141 
142  const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data);
143  switch (len & 7) {
144  case 7:
145  h ^= uint64_t(data2[6]) << 48;
146  case 6:
147  h ^= uint64_t(data2[5]) << 40;
148  case 5:
149  h ^= uint64_t(data2[4]) << 32;
150  case 4:
151  h ^= uint64_t(data2[3]) << 24;
152  case 3:
153  h ^= uint64_t(data2[2]) << 16;
154  case 2:
155  h ^= uint64_t(data2[1]) << 8;
156  case 1:
157  h ^= uint64_t(data2[0]);
158  h *= MURMUR_PRIME;
159  }
160 
161  h ^= h >> MURMUR_R;
162  h *= MURMUR_PRIME;
163  h ^= h >> MURMUR_R;
164  return h;
165  }
166 
168  static const uint32_t FNV_PRIME = 0x01000193; // 16777619
169  static const uint32_t FNV_SEED = 0x811C9DC5; // 2166136261
170  static const uint64_t FNV64_PRIME = 1099511628211UL;
171  static const uint64_t FNV64_SEED = 14695981039346656037UL;
172 
182  static uint64_t FnvHash64(const void* data, int32_t bytes, uint64_t hash) {
183  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
184  while (bytes--) {
185  hash = (*ptr ^ hash) * FNV64_PRIME;
186  ++ptr;
187  }
188  return hash;
189  }
190 
199  static uint32_t FnvHash64to32(const void* data, int32_t bytes, uint32_t hash) {
200  // IMPALA-2270: this function should never be used for zero-byte inputs.
201  DCHECK_GT(bytes, 0);
202  uint64_t hash_u64 = hash | (static_cast<uint64_t>(hash) << 32);
203  hash_u64 = FnvHash64(data, bytes, hash_u64);
204  return static_cast<uint32_t>((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF));
205  }
206 
211  static uint32_t Hash(const void* data, int32_t bytes, uint32_t seed) {
212 #ifdef ARROW_USE_SSE
213  if (LIKELY(CpuInfo::IsSupported(CpuInfo::SSE4_2))) {
214  return CrcHash(data, bytes, seed);
215  } else {
216  return MurmurHash2_64(data, bytes, seed);
217  }
218 #else
219  return static_cast<uint32_t>(MurmurHash2_64(data, bytes, seed));
220 #endif
221  }
222 
224  static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9;
225 
230  static inline uint32_t HashCombine32(uint32_t value, uint32_t seed) {
231  return seed ^ (HASH_COMBINE_SEED + value + (seed << 6) + (seed >> 2));
232  }
233 
234  // Get 32 more bits of randomness from a 32-bit hash:
235  static inline uint32_t Rehash32to32(const uint32_t hash) {
236  // Constants generated by uuidgen(1) with the -r flag
237  static const uint64_t m = 0x7850f11ec6d14889ull, a = 0x6773610597ca4c63ull;
238  // This is strongly universal hashing following Dietzfelbinger's "Universal hashing
239  // and k-wise independent random variables via integer arithmetic without primes". As
240  // such, for any two distinct uint32_t's hash1 and hash2, the probability (over the
241  // randomness of the constants) that any subset of bit positions of
242  // Rehash32to32(hash1) is equal to the same subset of bit positions
243  // Rehash32to32(hash2) is minimal.
244  return static_cast<uint32_t>((static_cast<uint64_t>(hash) * m + a) >> 32);
245  }
246 
247  static inline uint64_t Rehash32to64(const uint32_t hash) {
248  static const uint64_t m1 = 0x47b6137a44974d91ull, m2 = 0x8824ad5ba2b7289cull,
249  a1 = 0x705495c62df1424aull, a2 = 0x9efc49475c6bfb31ull;
250  const uint64_t hash1 = (static_cast<uint64_t>(hash) * m1 + a1) >> 32;
251  const uint64_t hash2 = (static_cast<uint64_t>(hash) * m2 + a2) >> 32;
252  return hash1 | (hash2 << 32);
253  }
254 };
255 
256 } // namespace arrow
257 
258 #endif // ARROW_UTIL_HASH_UTIL_H
static uint32_t CrcHash8(const void *v, uint32_t hash)
CrcHash() specialized for 8-byte data.
Definition: hash-util.h:93
static const uint32_t FNV_PRIME
default values recommended by http://isthe.com/chongo/tech/comp/fnv/
Definition: hash-util.h:168
static uint32_t CrcHash16(const void *v, uint32_t hash)
CrcHash() specialized for 16-byte data.
Definition: hash-util.h:113
static const uint64_t MURMUR_PRIME
Definition: hash-util.h:123
static uint64_t MurmurHash2_64(const void *input, int len, uint64_t seed)
Murmur2 hash implementation returning 64-bit hashes.
Definition: hash-util.h:127
static uint32_t CrcHash4(const void *v, uint32_t hash)
CrcHash() specialized for 4-byte data.
Definition: hash-util.h:84
static const int MURMUR_R
Definition: hash-util.h:124
static const int64_t SSE4_2
Definition: cpu-info.h:39
static const uint32_t HASH_COMBINE_SEED
The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
Definition: hash-util.h:224
#define DCHECK_GT(val1, val2)
Definition: logging.h:84
Utility class to compute hash values.
Definition: hash-util.h:33
static const uint32_t FNV_SEED
Definition: hash-util.h:169
static uint32_t CrcHash12(const void *v, uint32_t hash)
CrcHash() specialized for 12-byte data.
Definition: hash-util.h:102
static uint64_t Rehash32to64(const uint32_t hash)
Definition: hash-util.h:247
static uint32_t HashCombine32(uint32_t value, uint32_t seed)
Combine hashes &#39;value&#39; and &#39;seed&#39; to get a new hash value.
Definition: hash-util.h:230
static uint32_t CrcHash1(const void *v, uint32_t hash)
CrcHash() specialized for 1-byte data.
Definition: hash-util.h:66
Top-level namespace for Apache Arrow C++ API.
Definition: allocator.h:29
static uint64_t FnvHash64(const void *data, int32_t bytes, uint64_t hash)
Implementation of the Fowler-Noll-Vo hash function.
Definition: hash-util.h:182
static uint32_t FnvHash64to32(const void *data, int32_t bytes, uint32_t hash)
Return a 32-bit hash computed by invoking FNV-64 and folding the result to 32-bits.
Definition: hash-util.h:199
#define DCHECK(condition)
Definition: logging.h:78
static const uint64_t FNV64_SEED
Definition: hash-util.h:171
static uint32_t Rehash32to32(const uint32_t hash)
Definition: hash-util.h:235
static uint32_t Hash(const void *data, int32_t bytes, uint32_t seed)
Computes the hash value for data.
Definition: hash-util.h:211
static uint32_t CrcHash(const void *data, int32_t bytes, uint32_t hash)
Compute the Crc32 hash for data using SSE4 instructions.
Definition: hash-util.h:42
static bool IsSupported(int64_t flag)
Returns whether of not the cpu supports this flag.
Definition: cpu-info.h:60
static const uint64_t FNV64_PRIME
Definition: hash-util.h:170
static uint32_t CrcHash2(const void *v, uint32_t hash)
CrcHash() specialized for 2-byte data.
Definition: hash-util.h:75