Line |
Branch |
Exec |
Source |
1 |
|
|
/* |
2 |
|
|
* Frozen |
3 |
|
|
* Copyright 2016 QuarksLab |
4 |
|
|
* |
5 |
|
|
* Licensed to the Apache Software Foundation (ASF) under one |
6 |
|
|
* or more contributor license agreements. See the NOTICE file |
7 |
|
|
* distributed with this work for additional information |
8 |
|
|
* regarding copyright ownership. The ASF licenses this file |
9 |
|
|
* to you under the Apache License, Version 2.0 (the |
10 |
|
|
* "License"); you may not use this file except in compliance |
11 |
|
|
* with the License. You may obtain a copy of the License at |
12 |
|
|
* |
13 |
|
|
* http://www.apache.org/licenses/LICENSE-2.0 |
14 |
|
|
* |
15 |
|
|
* Unless required by applicable law or agreed to in writing, |
16 |
|
|
* software distributed under the License is distributed on an |
17 |
|
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
18 |
|
|
* KIND, either express or implied. See the License for the |
19 |
|
|
* specific language governing permissions and limitations |
20 |
|
|
* under the License. |
21 |
|
|
*/ |
22 |
|
|
|
23 |
|
|
// inspired from http://stevehanov.ca/blog/index.php?id=119 |
24 |
|
|
#ifndef FROZEN_LETITGO_PMH_H |
25 |
|
|
#define FROZEN_LETITGO_PMH_H |
26 |
|
|
|
27 |
|
|
#include "frozen/bits/algorithms.h" |
28 |
|
|
#include "frozen/bits/basic_types.h" |
29 |
|
|
|
30 |
|
|
#include <array> |
31 |
|
|
#include <cstddef> |
32 |
|
|
#include <cstdint> |
33 |
|
|
#include <limits> |
34 |
|
|
|
35 |
|
|
namespace frozen { |
36 |
|
|
|
37 |
|
|
namespace bits { |
38 |
|
|
|
39 |
|
|
// Function object for sorting buckets in decreasing order of size |
40 |
|
|
struct bucket_size_compare { |
41 |
|
|
template <typename B> |
42 |
|
|
bool constexpr operator()(B const &b0, |
43 |
|
|
B const &b1) const { |
44 |
|
|
return b0.size() > b1.size(); |
45 |
|
|
} |
46 |
|
|
}; |
47 |
|
|
|
48 |
|
|
// Step One in pmh routine is to take all items and hash them into buckets, |
49 |
|
|
// with some collisions. Then process those buckets further to build a perfect |
50 |
|
|
// hash function. |
51 |
|
|
// pmh_buckets represents the initial placement into buckets. |
52 |
|
|
|
53 |
|
|
template <std::size_t M> |
54 |
|
|
struct pmh_buckets { |
55 |
|
|
// Step 0: Bucket max is 2 * sqrt M |
56 |
|
|
// TODO: Come up with justification for this, should it not be O(log M)? |
57 |
|
|
static constexpr auto bucket_max = 2 * (1u << (log(M) / 2)); |
58 |
|
|
|
59 |
|
|
using bucket_t = cvector<std::size_t, bucket_max>; |
60 |
|
|
carray<bucket_t, M> buckets; |
61 |
|
|
std::uint64_t seed; |
62 |
|
|
|
63 |
|
|
// Represents a reference to a bucket. This is used because the buckets |
64 |
|
|
// have to be sorted, but buckets are big, making it slower than sorting refs |
65 |
|
|
struct bucket_ref { |
66 |
|
|
unsigned hash; |
67 |
|
|
const bucket_t * ptr; |
68 |
|
|
|
69 |
|
|
// Forward some interface of bucket |
70 |
|
|
using value_type = typename bucket_t::value_type; |
71 |
|
|
using const_iterator = typename bucket_t::const_iterator; |
72 |
|
|
|
73 |
|
|
constexpr auto size() const { return ptr->size(); } |
74 |
|
|
constexpr const auto & operator[](std::size_t idx) const { return (*ptr)[idx]; } |
75 |
|
|
constexpr auto begin() const { return ptr->begin(); } |
76 |
|
|
constexpr auto end() const { return ptr->end(); } |
77 |
|
|
}; |
78 |
|
|
|
79 |
|
|
// Make a bucket_ref for each bucket |
80 |
|
|
template <std::size_t... Is> |
81 |
|
|
carray<bucket_ref, M> constexpr make_bucket_refs(std::index_sequence<Is...>) const { |
82 |
|
|
return {{ bucket_ref{Is, &buckets[Is]}... }}; |
83 |
|
|
} |
84 |
|
|
|
85 |
|
|
// Makes a bucket_ref for each bucket and sorts them by size |
86 |
|
|
carray<bucket_ref, M> constexpr get_sorted_buckets() const { |
87 |
|
|
carray<bucket_ref, M> result{this->make_bucket_refs(std::make_index_sequence<M>())}; |
88 |
|
|
bits::quicksort(result.begin(), result.end() - 1, bucket_size_compare{}); |
89 |
|
|
return result; |
90 |
|
|
} |
91 |
|
|
}; |
92 |
|
|
|
93 |
|
|
template <std::size_t M, class Item, std::size_t N, class Hash, class Key, class PRG> |
94 |
|
|
pmh_buckets<M> constexpr make_pmh_buckets(const carray<Item, N> & items, |
95 |
|
|
Hash const & hash, |
96 |
|
|
Key const & key, |
97 |
|
|
PRG & prg) { |
98 |
|
|
using result_t = pmh_buckets<M>; |
99 |
|
|
// Continue until all items are placed without exceeding bucket_max |
100 |
|
|
while (1) { |
101 |
|
|
result_t result{}; |
102 |
|
|
result.seed = prg(); |
103 |
|
|
bool rejected = false; |
104 |
|
|
for (std::size_t i = 0; i < items.size(); ++i) { |
105 |
|
|
auto & bucket = result.buckets[hash(key(items[i]), static_cast<std::size_t>(result.seed)) % M]; |
106 |
|
|
if (bucket.size() >= result_t::bucket_max) { |
107 |
|
|
rejected = true; |
108 |
|
|
break; |
109 |
|
|
} |
110 |
|
|
bucket.push_back(i); |
111 |
|
|
} |
112 |
|
|
if (!rejected) { return result; } |
113 |
|
|
} |
114 |
|
|
} |
115 |
|
|
|
116 |
|
|
// Check if an item appears in a cvector |
117 |
|
|
template<class T, std::size_t N> |
118 |
|
|
constexpr bool all_different_from(cvector<T, N> & data, T & a) { |
119 |
|
|
for (std::size_t i = 0; i < data.size(); ++i) |
120 |
|
|
if (data[i] == a) |
121 |
|
|
return false; |
122 |
|
|
|
123 |
|
|
return true; |
124 |
|
|
} |
125 |
|
|
|
126 |
|
|
// Represents either an index to a data item array, or a seed to be used with |
127 |
|
|
// a hasher. Seed must have high bit of 1, value has high bit of zero. |
128 |
|
|
struct seed_or_index { |
129 |
|
|
using value_type = std::uint64_t; |
130 |
|
|
|
131 |
|
|
private: |
132 |
|
|
static constexpr value_type MINUS_ONE = std::numeric_limits<value_type>::max(); |
133 |
|
|
static constexpr value_type HIGH_BIT = ~(MINUS_ONE >> 1); |
134 |
|
|
|
135 |
|
|
value_type value_ = 0; |
136 |
|
|
|
137 |
|
|
public: |
138 |
|
88149 |
constexpr value_type value() const { return value_; } |
139 |
|
88149 |
constexpr bool is_seed() const { return value_ & HIGH_BIT; } |
140 |
|
|
|
141 |
|
|
constexpr seed_or_index(bool is_seed, value_type value) |
142 |
|
|
: value_(is_seed ? (value | HIGH_BIT) : (value & ~HIGH_BIT)) {} |
143 |
|
|
|
144 |
|
|
constexpr seed_or_index() = default; |
145 |
|
|
constexpr seed_or_index(const seed_or_index &) = default; |
146 |
|
|
constexpr seed_or_index & operator =(const seed_or_index &) = default; |
147 |
|
|
}; |
148 |
|
|
|
149 |
|
|
// Represents the perfect hash function created by pmh algorithm |
150 |
|
|
template <std::size_t M, class Hasher> |
151 |
|
|
struct pmh_tables : private Hasher { |
152 |
|
|
std::uint64_t first_seed_; |
153 |
|
|
carray<seed_or_index, M> first_table_; |
154 |
|
|
carray<std::size_t, M> second_table_; |
155 |
|
|
|
156 |
|
|
constexpr pmh_tables( |
157 |
|
|
std::uint64_t first_seed, |
158 |
|
|
carray<seed_or_index, M> first_table, |
159 |
|
|
carray<std::size_t, M> second_table, |
160 |
|
|
Hasher hash) noexcept |
161 |
|
|
: Hasher(hash) |
162 |
|
|
, first_seed_(first_seed) |
163 |
|
|
, first_table_(first_table) |
164 |
|
|
, second_table_(second_table) |
165 |
|
|
{} |
166 |
|
|
|
167 |
|
129125 |
constexpr Hasher const& hash_function() const noexcept { |
168 |
|
129125 |
return static_cast<Hasher const&>(*this); |
169 |
|
|
} |
170 |
|
|
|
171 |
|
|
template <typename KeyType> |
172 |
|
|
constexpr std::size_t lookup(const KeyType & key) const { |
173 |
|
|
return lookup(key, hash_function()); |
174 |
|
|
} |
175 |
|
|
|
176 |
|
|
// Looks up a given key, to find its expected index in carray<Item, N> |
177 |
|
|
// Always returns a valid index, must use KeyEqual test after to confirm. |
178 |
|
|
template <typename KeyType, typename HasherType> |
179 |
|
88149 |
constexpr std::size_t lookup(const KeyType & key, const HasherType& hasher) const { |
180 |
2/4
✓ Branch 0 taken 23321 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 23852 times.
✗ Branch 3 not taken.
|
88149 |
auto const d = first_table_[hasher(key, static_cast<std::size_t>(first_seed_)) % M]; |
181 |
2/2
✓ Branch 0 taken 41695 times.
✓ Branch 1 taken 5478 times.
|
88149 |
if (!d.is_seed()) { return static_cast<std::size_t>(d.value()); } // this is narrowing std::uint64 -> std::size_t but should be fine |
182 |
2/4
✓ Branch 0 taken 2721 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2757 times.
✗ Branch 3 not taken.
|
9836 |
else { return second_table_[hasher(key, static_cast<std::size_t>(d.value())) % M]; } |
183 |
|
|
} |
184 |
|
|
}; |
185 |
|
|
|
186 |
|
|
// Make pmh tables for given items, hash function, prg, etc. |
187 |
|
|
template <std::size_t M, class Item, std::size_t N, class Hash, class Key, class KeyEqual, class PRG> |
188 |
|
|
pmh_tables<M, Hash> constexpr make_pmh_tables(const carray<Item, N> & |
189 |
|
|
items, |
190 |
|
|
Hash const &hash, |
191 |
|
|
KeyEqual const &equal, |
192 |
|
|
Key const &key, |
193 |
|
|
PRG prg) { |
194 |
|
|
// Step 1: Place all of the keys into buckets |
195 |
|
|
auto step_one = make_pmh_buckets<M>(items, hash, key, prg); |
196 |
|
|
|
197 |
|
|
#ifndef NDEBUG |
198 |
|
|
// Step 1.5: Detect redundant keys. |
199 |
|
|
for(auto const& bucket : step_one.buckets) |
200 |
|
|
for(std::size_t i = 1; i < bucket.size(); ++i) |
201 |
|
|
constexpr_assert(!equal(key(items[0]), key(items[i])), "unique keys"); |
202 |
|
|
#endif |
203 |
|
|
|
204 |
|
|
// Step 2: Sort the buckets to process the ones with the most items first. |
205 |
|
|
auto buckets = step_one.get_sorted_buckets(); |
206 |
|
|
|
207 |
|
|
// Special value for unused slots. This is purposefully the index |
208 |
|
|
// one-past-the-end of 'items' to function as a sentinel value. Both to avoid |
209 |
|
|
// the need to apply the KeyEqual predicate and to be easily convertible to |
210 |
|
|
// end(). |
211 |
|
|
// Unused entries in both hash tables (G and H) have to contain this value. |
212 |
|
|
const auto UNUSED = items.size(); |
213 |
|
|
|
214 |
|
|
// G becomes the first hash table in the resulting pmh function |
215 |
|
|
carray<seed_or_index, M> G({false, UNUSED}); |
216 |
|
|
|
217 |
|
|
// H becomes the second hash table in the resulting pmh function |
218 |
|
|
carray<std::size_t, M> H(UNUSED); |
219 |
|
|
|
220 |
|
|
// Step 3: Map the items in buckets into hash tables. |
221 |
|
|
for (const auto & bucket : buckets) { |
222 |
|
|
auto const bsize = bucket.size(); |
223 |
|
|
|
224 |
|
|
if (bsize == 1) { |
225 |
|
|
// Store index to the (single) item in G |
226 |
|
|
// assert(bucket.hash == hash(key(items[bucket[0]]), step_one.seed) % M); |
227 |
|
|
G[bucket.hash] = {false, static_cast<std::uint64_t>(bucket[0])}; |
228 |
|
|
} else if (bsize > 1) { |
229 |
|
|
|
230 |
|
|
// Repeatedly try different H of d until we find a hash function |
231 |
|
|
// that places all items in the bucket into free slots |
232 |
|
|
seed_or_index d{true, prg()}; |
233 |
|
|
cvector<std::size_t, decltype(step_one)::bucket_max> bucket_slots; |
234 |
|
|
|
235 |
|
|
while (bucket_slots.size() < bsize) { |
236 |
|
|
auto slot = hash(key(items[bucket[bucket_slots.size()]]), static_cast<std::size_t>(d.value())) % M; |
237 |
|
|
|
238 |
|
|
if (H[slot] != UNUSED || !all_different_from(bucket_slots, slot)) { |
239 |
|
|
bucket_slots.clear(); |
240 |
|
|
d = {true, prg()}; |
241 |
|
|
continue; |
242 |
|
|
} |
243 |
|
|
|
244 |
|
|
bucket_slots.push_back(slot); |
245 |
|
|
} |
246 |
|
|
|
247 |
|
|
// Put successful seed in G, and put indices to items in their slots |
248 |
|
|
// assert(bucket.hash == hash(key(items[bucket[0]]), step_one.seed) % M); |
249 |
|
|
G[bucket.hash] = d; |
250 |
|
|
for (std::size_t i = 0; i < bsize; ++i) |
251 |
|
|
H[bucket_slots[i]] = bucket[i]; |
252 |
|
|
} |
253 |
|
|
} |
254 |
|
|
|
255 |
|
|
return {step_one.seed, G, H, hash}; |
256 |
|
|
} |
257 |
|
|
|
258 |
|
|
} // namespace bits |
259 |
|
|
|
260 |
|
|
} // namespace frozen |
261 |
|
|
|
262 |
|
|
#endif |
263 |
|
|
|