| Line |
Branch |
Exec |
Source |
| 1 |
|
|
/* |
| 2 |
|
|
* Frozen |
| 3 |
|
|
* Copyright 2016 QuarksLab |
| 4 |
|
|
* |
| 5 |
|
|
* Licensed to the Apache Software Foundation (ASF) under one |
| 6 |
|
|
* or more contributor license agreements. See the NOTICE file |
| 7 |
|
|
* distributed with this work for additional information |
| 8 |
|
|
* regarding copyright ownership. The ASF licenses this file |
| 9 |
|
|
* to you under the Apache License, Version 2.0 (the |
| 10 |
|
|
* "License"); you may not use this file except in compliance |
| 11 |
|
|
* with the License. You may obtain a copy of the License at |
| 12 |
|
|
* |
| 13 |
|
|
* http://www.apache.org/licenses/LICENSE-2.0 |
| 14 |
|
|
* |
| 15 |
|
|
* Unless required by applicable law or agreed to in writing, |
| 16 |
|
|
* software distributed under the License is distributed on an |
| 17 |
|
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 18 |
|
|
* KIND, either express or implied. See the License for the |
| 19 |
|
|
* specific language governing permissions and limitations |
| 20 |
|
|
* under the License. |
| 21 |
|
|
*/ |
| 22 |
|
|
|
| 23 |
|
|
// inspired from http://stevehanov.ca/blog/index.php?id=119 |
| 24 |
|
|
#ifndef FROZEN_LETITGO_PMH_H |
| 25 |
|
|
#define FROZEN_LETITGO_PMH_H |
| 26 |
|
|
|
| 27 |
|
|
#include "frozen/bits/algorithms.h" |
| 28 |
|
|
#include "frozen/bits/basic_types.h" |
| 29 |
|
|
|
| 30 |
|
|
#include <array> |
| 31 |
|
|
#include <cstddef> |
| 32 |
|
|
#include <cstdint> |
| 33 |
|
|
#include <limits> |
| 34 |
|
|
|
| 35 |
|
|
namespace frozen { |
| 36 |
|
|
|
| 37 |
|
|
namespace bits { |
| 38 |
|
|
|
| 39 |
|
|
// Function object for sorting buckets in decreasing order of size |
| 40 |
|
|
struct bucket_size_compare { |
| 41 |
|
|
template <typename B> |
| 42 |
|
|
bool constexpr operator()(B const &b0, |
| 43 |
|
|
B const &b1) const { |
| 44 |
|
|
return b0.size() > b1.size(); |
| 45 |
|
|
} |
| 46 |
|
|
}; |
| 47 |
|
|
|
| 48 |
|
|
// Step One in pmh routine is to take all items and hash them into buckets, |
| 49 |
|
|
// with some collisions. Then process those buckets further to build a perfect |
| 50 |
|
|
// hash function. |
| 51 |
|
|
// pmh_buckets represents the initial placement into buckets. |
| 52 |
|
|
|
| 53 |
|
|
template <std::size_t M> |
| 54 |
|
|
struct pmh_buckets { |
| 55 |
|
|
// Step 0: Bucket max is 2 * sqrt M |
| 56 |
|
|
// TODO: Come up with justification for this, should it not be O(log M)? |
| 57 |
|
|
static constexpr auto bucket_max = 2 * (1u << (log(M) / 2)); |
| 58 |
|
|
|
| 59 |
|
|
using bucket_t = cvector<std::size_t, bucket_max>; |
| 60 |
|
|
carray<bucket_t, M> buckets; |
| 61 |
|
|
std::uint64_t seed; |
| 62 |
|
|
|
| 63 |
|
|
// Represents a reference to a bucket. This is used because the buckets |
| 64 |
|
|
// have to be sorted, but buckets are big, making it slower than sorting refs |
| 65 |
|
|
struct bucket_ref { |
| 66 |
|
|
unsigned hash; |
| 67 |
|
|
const bucket_t * ptr; |
| 68 |
|
|
|
| 69 |
|
|
// Forward some interface of bucket |
| 70 |
|
|
using value_type = typename bucket_t::value_type; |
| 71 |
|
|
using const_iterator = typename bucket_t::const_iterator; |
| 72 |
|
|
|
| 73 |
|
|
constexpr auto size() const { return ptr->size(); } |
| 74 |
|
|
constexpr const auto & operator[](std::size_t idx) const { return (*ptr)[idx]; } |
| 75 |
|
|
constexpr auto begin() const { return ptr->begin(); } |
| 76 |
|
|
constexpr auto end() const { return ptr->end(); } |
| 77 |
|
|
}; |
| 78 |
|
|
|
| 79 |
|
|
// Make a bucket_ref for each bucket |
| 80 |
|
|
template <std::size_t... Is> |
| 81 |
|
|
carray<bucket_ref, M> constexpr make_bucket_refs(std::index_sequence<Is...>) const { |
| 82 |
|
|
return {{ bucket_ref{Is, &buckets[Is]}... }}; |
| 83 |
|
|
} |
| 84 |
|
|
|
| 85 |
|
|
// Makes a bucket_ref for each bucket and sorts them by size |
| 86 |
|
|
carray<bucket_ref, M> constexpr get_sorted_buckets() const { |
| 87 |
|
|
carray<bucket_ref, M> result{this->make_bucket_refs(std::make_index_sequence<M>())}; |
| 88 |
|
|
bits::quicksort(result.begin(), result.end() - 1, bucket_size_compare{}); |
| 89 |
|
|
return result; |
| 90 |
|
|
} |
| 91 |
|
|
}; |
| 92 |
|
|
|
| 93 |
|
|
template <std::size_t M, class Item, std::size_t N, class Hash, class Key, class PRG> |
| 94 |
|
|
pmh_buckets<M> constexpr make_pmh_buckets(const carray<Item, N> & items, |
| 95 |
|
|
Hash const & hash, |
| 96 |
|
|
Key const & key, |
| 97 |
|
|
PRG & prg) { |
| 98 |
|
|
using result_t = pmh_buckets<M>; |
| 99 |
|
|
// Continue until all items are placed without exceeding bucket_max |
| 100 |
|
|
while (1) { |
| 101 |
|
|
result_t result{}; |
| 102 |
|
|
result.seed = prg(); |
| 103 |
|
|
bool rejected = false; |
| 104 |
|
|
for (std::size_t i = 0; i < items.size(); ++i) { |
| 105 |
|
|
auto & bucket = result.buckets[hash(key(items[i]), static_cast<std::size_t>(result.seed)) % M]; |
| 106 |
|
|
if (bucket.size() >= result_t::bucket_max) { |
| 107 |
|
|
rejected = true; |
| 108 |
|
|
break; |
| 109 |
|
|
} |
| 110 |
|
|
bucket.push_back(i); |
| 111 |
|
|
} |
| 112 |
|
|
if (!rejected) { return result; } |
| 113 |
|
|
} |
| 114 |
|
|
} |
| 115 |
|
|
|
| 116 |
|
|
// Check if an item appears in a cvector |
| 117 |
|
|
template<class T, std::size_t N> |
| 118 |
|
|
constexpr bool all_different_from(cvector<T, N> & data, T & a) { |
| 119 |
|
|
for (std::size_t i = 0; i < data.size(); ++i) |
| 120 |
|
|
if (data[i] == a) |
| 121 |
|
|
return false; |
| 122 |
|
|
|
| 123 |
|
|
return true; |
| 124 |
|
|
} |
| 125 |
|
|
|
| 126 |
|
|
// Represents either an index to a data item array, or a seed to be used with |
| 127 |
|
|
// a hasher. Seed must have high bit of 1, value has high bit of zero. |
| 128 |
|
|
struct seed_or_index { |
| 129 |
|
|
using value_type = std::uint64_t; |
| 130 |
|
|
|
| 131 |
|
|
private: |
| 132 |
|
|
static constexpr value_type MINUS_ONE = std::numeric_limits<value_type>::max(); |
| 133 |
|
|
static constexpr value_type HIGH_BIT = ~(MINUS_ONE >> 1); |
| 134 |
|
|
|
| 135 |
|
|
value_type value_ = 0; |
| 136 |
|
|
|
| 137 |
|
|
public: |
| 138 |
|
88359 |
constexpr value_type value() const { return value_; } |
| 139 |
|
88359 |
constexpr bool is_seed() const { return value_ & HIGH_BIT; } |
| 140 |
|
|
|
| 141 |
|
|
constexpr seed_or_index(bool is_seed, value_type value) |
| 142 |
|
|
: value_(is_seed ? (value | HIGH_BIT) : (value & ~HIGH_BIT)) {} |
| 143 |
|
|
|
| 144 |
|
|
constexpr seed_or_index() = default; |
| 145 |
|
|
constexpr seed_or_index(const seed_or_index &) = default; |
| 146 |
|
|
constexpr seed_or_index & operator =(const seed_or_index &) = default; |
| 147 |
|
|
}; |
| 148 |
|
|
|
| 149 |
|
|
// Represents the perfect hash function created by pmh algorithm |
| 150 |
|
|
template <std::size_t M, class Hasher> |
| 151 |
|
|
struct pmh_tables : private Hasher { |
| 152 |
|
|
std::uint64_t first_seed_; |
| 153 |
|
|
carray<seed_or_index, M> first_table_; |
| 154 |
|
|
carray<std::size_t, M> second_table_; |
| 155 |
|
|
|
| 156 |
|
|
constexpr pmh_tables( |
| 157 |
|
|
std::uint64_t first_seed, |
| 158 |
|
|
carray<seed_or_index, M> first_table, |
| 159 |
|
|
carray<std::size_t, M> second_table, |
| 160 |
|
|
Hasher hash) noexcept |
| 161 |
|
|
: Hasher(hash) |
| 162 |
|
|
, first_seed_(first_seed) |
| 163 |
|
|
, first_table_(first_table) |
| 164 |
|
|
, second_table_(second_table) |
| 165 |
|
|
{} |
| 166 |
|
|
|
| 167 |
|
129431 |
constexpr Hasher const& hash_function() const noexcept { |
| 168 |
|
129431 |
return static_cast<Hasher const&>(*this); |
| 169 |
|
|
} |
| 170 |
|
|
|
| 171 |
|
|
template <typename KeyType> |
| 172 |
|
|
constexpr std::size_t lookup(const KeyType & key) const { |
| 173 |
|
|
return lookup(key, hash_function()); |
| 174 |
|
|
} |
| 175 |
|
|
|
| 176 |
|
|
// Looks up a given key, to find its expected index in carray<Item, N> |
| 177 |
|
|
// Always returns a valid index, must use KeyEqual test after to confirm. |
| 178 |
|
|
template <typename KeyType, typename HasherType> |
| 179 |
|
88359 |
constexpr std::size_t lookup(const KeyType & key, const HasherType& hasher) const { |
| 180 |
2/4
✓ Branch 0 taken 23408 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 23879 times.
✗ Branch 3 not taken.
|
88359 |
auto const d = first_table_[hasher(key, static_cast<std::size_t>(first_seed_)) % M]; |
| 181 |
2/2
✓ Branch 0 taken 41806 times.
✓ Branch 1 taken 5481 times.
|
88359 |
if (!d.is_seed()) { return static_cast<std::size_t>(d.value()); } // this is narrowing std::uint64 -> std::size_t but should be fine |
| 182 |
2/4
✓ Branch 0 taken 2727 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2754 times.
✗ Branch 3 not taken.
|
9836 |
else { return second_table_[hasher(key, static_cast<std::size_t>(d.value())) % M]; } |
| 183 |
|
|
} |
| 184 |
|
|
}; |
| 185 |
|
|
|
| 186 |
|
|
// Make pmh tables for given items, hash function, prg, etc. |
| 187 |
|
|
template <std::size_t M, class Item, std::size_t N, class Hash, class Key, class KeyEqual, class PRG> |
| 188 |
|
|
pmh_tables<M, Hash> constexpr make_pmh_tables(const carray<Item, N> & |
| 189 |
|
|
items, |
| 190 |
|
|
Hash const &hash, |
| 191 |
|
|
KeyEqual const &equal, |
| 192 |
|
|
Key const &key, |
| 193 |
|
|
PRG prg) { |
| 194 |
|
|
// Step 1: Place all of the keys into buckets |
| 195 |
|
|
auto step_one = make_pmh_buckets<M>(items, hash, key, prg); |
| 196 |
|
|
|
| 197 |
|
|
// Step 1.5: Detect redundant keys. |
| 198 |
|
|
for(auto const& bucket : step_one.buckets) |
| 199 |
|
|
for(std::size_t i = 1; i < bucket.size(); ++i) |
| 200 |
|
|
constexpr_assert(!equal(key(items[0]), key(items[i])), "structure keys should be unique"); |
| 201 |
|
|
|
| 202 |
|
|
// Step 2: Sort the buckets to process the ones with the most items first. |
| 203 |
|
|
auto buckets = step_one.get_sorted_buckets(); |
| 204 |
|
|
|
| 205 |
|
|
// Special value for unused slots. This is purposefully the index |
| 206 |
|
|
// one-past-the-end of 'items' to function as a sentinel value. Both to avoid |
| 207 |
|
|
// the need to apply the KeyEqual predicate and to be easily convertible to |
| 208 |
|
|
// end(). |
| 209 |
|
|
// Unused entries in both hash tables (G and H) have to contain this value. |
| 210 |
|
|
const auto UNUSED = items.size(); |
| 211 |
|
|
|
| 212 |
|
|
// G becomes the first hash table in the resulting pmh function |
| 213 |
|
|
carray<seed_or_index, M> G({false, UNUSED}); |
| 214 |
|
|
|
| 215 |
|
|
// H becomes the second hash table in the resulting pmh function |
| 216 |
|
|
carray<std::size_t, M> H(UNUSED); |
| 217 |
|
|
|
| 218 |
|
|
// Step 3: Map the items in buckets into hash tables. |
| 219 |
|
|
for (const auto & bucket : buckets) { |
| 220 |
|
|
auto const bsize = bucket.size(); |
| 221 |
|
|
|
| 222 |
|
|
if (bsize == 1) { |
| 223 |
|
|
// Store index to the (single) item in G |
| 224 |
|
|
// assert(bucket.hash == hash(key(items[bucket[0]]), step_one.seed) % M); |
| 225 |
|
|
G[bucket.hash] = {false, static_cast<std::uint64_t>(bucket[0])}; |
| 226 |
|
|
} else if (bsize > 1) { |
| 227 |
|
|
|
| 228 |
|
|
// Repeatedly try different H of d until we find a hash function |
| 229 |
|
|
// that places all items in the bucket into free slots |
| 230 |
|
|
seed_or_index d{true, prg()}; |
| 231 |
|
|
cvector<std::size_t, decltype(step_one)::bucket_max> bucket_slots; |
| 232 |
|
|
|
| 233 |
|
|
while (bucket_slots.size() < bsize) { |
| 234 |
|
|
auto slot = hash(key(items[bucket[bucket_slots.size()]]), static_cast<std::size_t>(d.value())) % M; |
| 235 |
|
|
|
| 236 |
|
|
if (H[slot] != UNUSED || !all_different_from(bucket_slots, slot)) { |
| 237 |
|
|
bucket_slots.clear(); |
| 238 |
|
|
d = {true, prg()}; |
| 239 |
|
|
continue; |
| 240 |
|
|
} |
| 241 |
|
|
|
| 242 |
|
|
bucket_slots.push_back(slot); |
| 243 |
|
|
} |
| 244 |
|
|
|
| 245 |
|
|
// Put successful seed in G, and put indices to items in their slots |
| 246 |
|
|
// assert(bucket.hash == hash(key(items[bucket[0]]), step_one.seed) % M); |
| 247 |
|
|
G[bucket.hash] = d; |
| 248 |
|
|
for (std::size_t i = 0; i < bsize; ++i) |
| 249 |
|
|
H[bucket_slots[i]] = bucket[i]; |
| 250 |
|
|
} |
| 251 |
|
|
} |
| 252 |
|
|
|
| 253 |
|
|
return {step_one.seed, G, H, hash}; |
| 254 |
|
|
} |
| 255 |
|
|
|
| 256 |
|
|
} // namespace bits |
| 257 |
|
|
|
| 258 |
|
|
} // namespace frozen |
| 259 |
|
|
|
| 260 |
|
|
#endif |
| 261 |
|
|
|