tudocomp – The TU Dortmund Compression Framework
compact_sparse_hash.hpp
#pragma once

#include <memory>
#include <cstdint>
#include <utility>
#include <algorithm>
#include <new>       // placement new (used when constructing bucket elements)
#include <vector>    // std::vector (bucket array below)
#include <sstream>   // std::stringstream (used by debug_state())
#include <string>

#include <tudocomp/ds/IntVector.hpp>
#include <tudocomp/ds/IntPtr.hpp>

namespace tdc {

/*
uint64_t x;

uint64_t xorshift64star(uint64_t state[static 1]) {
    uint64_t x = state[0];
    x ^= x >> 12; // a
    x ^= x << 25; // b
    x ^= x >> 27; // c
    state[0] = x;
    return x * 0x2545F4914F6CDD1D;
}

unsigned long long xor64() {
    static unsigned long long x = 88172645463325252LL;
    x ^= (x << 13); x ^= (x >> 7); return (x ^= (x << 17));
}
*/

inline uint64_t compact_hashfn(uint64_t x, uint64_t w) {
    uint64_t j = (w / 2ull) + 1;
    DCHECK_LT((w / 2ull), j);
    DCHECK_NE(w, 0);

    // NB: Two shifts because a single shift with w == 64 is undefined
    // behavior
    uint64_t w_mask = (1ull << (w - 1ull) << 1ull) - 1ull;

    return (x xor ((x << j) & w_mask)) & w_mask;
}

inline uint64_t compact_reverse_hashfn(uint64_t x, uint64_t w) {
    return compact_hashfn(x, w);
}
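
// Why the same function serves as its own inverse: with j = w/2 + 1, the xor
// term (x << j) & w_mask only touches the upper w - j bits, while the lower
// j bits of x pass through unchanged. Because w - j < j, that xor term is
// computed entirely from unchanged bits, so applying the function twice
// cancels it: f(f(x)) = x ^ ((x << j) & m) ^ ((x << j) & m) = x.
//
// Illustrative check with w = 8 (so j = 5, w_mask = 0xFF):
//
//     x            = 0b10110101
//     (x << 5) & m = 0b10100000
//     f(x)         = 0b00010101
//     f(f(x))      = 0b10110101 == x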

inline uint8_t log2_upper(uint64_t v) {
    uint8_t m = 0;
    uint64_t n = v;
    while(n) {
        n >>= 1;
        m++;
    }
    m--;
    return m;
}
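
// Note: despite the name, this computes floor(log2(v)) for arguments that are
// not powers of two, e.g. log2_upper(8) == 3 but also log2_upper(9) == 3.
// The code below only calls it on power-of-two capacities (enforced via
// is_pot), where floor and ceiling coincide.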

inline bool is_pot(size_t n) {
    return (n > 0ull && ((n & (n - 1ull)) == 0ull));
}

class SizeManager {
    uint8_t m_capacity_log2;
    size_t m_size;

public:

    inline SizeManager(size_t capacity) {
        m_size = 0;
        CHECK(is_pot(capacity));
        m_capacity_log2 = log2_upper(capacity);
    }

    inline size_t& size() {
        return m_size;
    }

    inline size_t const& size() const {
        return m_size;
    }

    inline size_t capacity() const {
        return 1ull << m_capacity_log2;
    }

    inline uint8_t capacity_log2() const {
        return m_capacity_log2;
    }
};

using QuotPtr = IntPtr<dynamic_t>;
inline QuotPtr make_quot_ptr(uint64_t* ptr, size_t quot_width) {
    using namespace int_vector;

    return IntPtrBase<QuotPtr>(ptr, 0, quot_width);
}
template<typename V, typename I>
inline void rotate_end_to(I base, size_t pos, size_t size) {
    auto a = base + pos;
    auto b = base + size;

    V tmp = std::move(*b);
    while(a != b) {
        *b = std::move(*(b - 1));
        b--;
    }
    *a = std::move(tmp);
}
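
// Illustrative trace: for values {a, b, c, d} and rotate_end_to(base, 1, 3),
// tmp = base[3] (= d), then c and b each shift one slot to the right, and
// finally base[1] = d, yielding {a, d, b, c}. In other words, the element at
// index `size` moves to index `pos` and everything in between moves right.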

template<typename val_t>
class BucketElem;

template<typename val_t>
class Bucket {
    uint64_t m_bv = 0;
    void* m_ptr = nullptr;

    struct Layout {
        size_t val_bytes;
        size_t padding;
        size_t bv_bytes;

        inline size_t byte_size() {
            return val_bytes + padding + bv_bytes;
        }

        inline size_t bv_offset() {
            return val_bytes + padding;
        }
    };
    inline static Layout calc_sizes(size_t size, size_t quot_width) {
        size_t val_size = sizeof(val_t) * size;
        size_t bv_size = (size * quot_width + 63ull) / 64ull * sizeof(uint64_t);
        size_t padding = 0;

        DCHECK_EQ(val_size % alignof(val_t), 0);

        size_t rem = val_size % alignof(uint64_t);

        if (rem != 0) {
            padding += (alignof(uint64_t) - rem);
        }

        size_t padded_val_size = val_size + padding;

        DCHECK_EQ(padded_val_size % alignof(uint64_t), 0);

        return Layout {
            val_size,
            padding,
            bv_size,
        };
    }
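
    // Illustrative layout: for val_t = uint32_t, size = 3 and quot_width = 10,
    // val_size = 12 bytes, so rem = 12 % 8 = 4 and padding = 4; the 30
    // quotient bits round up to one uint64_t word, so bv_size = 8 and the
    // bucket allocation is 12 + 4 + 8 = 24 bytes.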

    struct Ptrs {
        val_t* val_ptr;
        QuotPtr quot_ptr;
    };
    inline Ptrs ptrs(size_t quot_width) const {
        size_t size = this->size();

        auto layout = calc_sizes(size, quot_width);

        char* p = (char*) m_ptr;

        val_t* val_ptr = (val_t*)(p);
        uint64_t* bv_ptr = (uint64_t*)(p + layout.bv_offset());

        DCHECK_EQ(uint64_t(val_ptr) % alignof(val_t), 0);
        DCHECK_EQ(uint64_t(bv_ptr) % alignof(uint64_t), 0);

        return Ptrs {
            val_ptr,
            make_quot_ptr(bv_ptr, quot_width)
        };
    }

public:
    inline Bucket(): m_bv(0), m_ptr(nullptr) {}
    explicit inline Bucket(uint64_t bv, size_t quot_width): m_bv(bv) {
        size_t size = this->size();

        if (size == 0) {
            m_ptr = nullptr;
            return;
        }

        auto layout = calc_sizes(size, quot_width);
        size_t malloc_size = layout.byte_size();

        /*std::cout << "size: " << size
            << ", val: "
            << "x" << sizeof(val_t) << "=" << val_size
            << " al " << alignof(val_t)
            << ", bv: "
            << "x" << (quot_width/64ull) << ":" << (quot_width%64ull)
            << "=" << ((size * quot_width)/64ull) << ":" << ((size * quot_width) %64ull)
            << "="
            << (bv_size/sizeof(uint64_t)) << "x" << sizeof(uint64_t) << "=" << bv_size
            << " al " << alignof(uint64_t)
            << ", rem: " << rem
            << ", padding: " << padding
            << ", padded_val_size: " << padded_val_size
            << ", malloc_size: " << malloc_size
            << "\n";*/

        m_ptr = malloc(malloc_size);
    }

    inline size_t size() const {
        return __builtin_popcountll(m_bv);
    }

    inline uint64_t bv() const {
        return m_bv;
    }

    inline void destroy_vals(size_t quot_width) {
        size_t size = this->size();

        val_t* start = ptrs(quot_width).val_ptr;
        val_t* end = start + size;

        for(; start != end; start++) {
            start->~val_t();
        }
    }

    inline void destroy_self() {
        if (m_ptr != nullptr) {
            free(m_ptr);
            m_ptr = nullptr;
        }
    }

    inline ~Bucket() {
        destroy_self();
    }

    inline Bucket(Bucket&& other) {
        m_ptr = other.m_ptr;
        other.m_ptr = nullptr;
        m_bv = other.m_bv;
        other.m_bv = 0;
    }

    inline Bucket& operator=(Bucket&& other) {
        destroy_self();

        m_ptr = other.m_ptr;
        other.m_ptr = nullptr;
        m_bv = other.m_bv;
        other.m_bv = 0;

        return *this;
    }

    inline BucketElem<val_t> at(size_t pos, size_t quot_width) const {
        auto ps = ptrs(quot_width);
        return BucketElem<val_t>(ps.val_ptr + pos, ps.quot_ptr + pos);
    }

    inline void rotate_end_to(size_t pos, size_t quot_width) {
        size_t size = this->size();
        size_t last = size - 1;

        auto ps = ptrs(quot_width);
        tdc::rotate_end_to<val_t>(ps.val_ptr, pos, last);
        tdc::rotate_end_to<uint64_t>(ps.quot_ptr, pos, last);
    }

    inline bool is_empty() {
        return m_ptr == nullptr;
    }
};

template<typename val_t>
class BucketElem {
    val_t* m_val_ptr;
    QuotPtr m_quot_ptr;

    friend class Bucket<val_t>;

    inline BucketElem(val_t* val_ptr,
                      QuotPtr quot_ptr):
        m_val_ptr(val_ptr),
        m_quot_ptr(quot_ptr)
    {
    }

public:
    inline BucketElem():
        m_val_ptr(nullptr) {}

    inline uint64_t get_quotient() {
        return uint64_t(*m_quot_ptr);
    }

    inline void set_quotient(uint64_t v) {
        *m_quot_ptr = v;
    }

    inline void swap_quotient(uint64_t& other) {
        uint64_t tmp = uint64_t(*m_quot_ptr);
        std::swap(other, tmp);
        *m_quot_ptr = tmp;
    }

    inline val_t& val() {
        return *m_val_ptr;
    }

    inline val_t const& val() const {
        return *m_val_ptr;
    }

    inline void increment_ptr() {
        m_quot_ptr++;
        m_val_ptr++;
    }
    inline void decrement_ptr() {
        m_quot_ptr--;
        m_val_ptr--;
    }

    inline bool ptr_eq(BucketElem const& other) {
        return m_val_ptr == other.m_val_ptr;
    }
};

template<typename val_t>
class compact_hash {
    using key_t = uint64_t;

    static constexpr size_t BVS_WIDTH_SHIFT = 6;
    static constexpr size_t BVS_WIDTH_MASK = 0b111111;
    static constexpr size_t DEFAULT_KEY_WIDTH = 16;
    static constexpr bool HIGH_BITS_RANDOM = false;

    SizeManager m_sizing;
    uint8_t m_width;

    // Compact table data
    IntVector<uint_t<2>> m_cv;

    // Sparse table data
    std::vector<Bucket<val_t>> m_buckets;

    inline static constexpr size_t min_size(size_t size) {
        return (size < 2) ? 2 : size;
    }

public:

    inline compact_hash(size_t size, size_t key_width = DEFAULT_KEY_WIDTH):
        m_sizing(min_size(size)),
        m_width(key_width)
    {
        m_cv.reserve(table_size());
        m_cv.resize(table_size());
        // one bucket per 64 table positions, rounded up
        m_buckets.reserve((table_size() + BVS_WIDTH_MASK) >> BVS_WIDTH_SHIFT);
        m_buckets.resize((table_size() + BVS_WIDTH_MASK) >> BVS_WIDTH_SHIFT);
    }

    inline ~compact_hash() {
        destroy_buckets();
    }

    inline compact_hash(compact_hash&& other):
        m_sizing(std::move(other.m_sizing)),
        m_width(std::move(other.m_width)),
        m_cv(std::move(other.m_cv)),
        m_buckets(std::move(other.m_buckets))
    {
    }

    inline compact_hash& operator=(compact_hash&& other) {
        destroy_buckets();

        m_sizing = std::move(other.m_sizing);
        m_width = std::move(other.m_width);
        m_cv = std::move(other.m_cv);
        m_buckets = std::move(other.m_buckets);

        return *this;
    }

private:
    inline uint8_t table_size_log2() {
        return m_sizing.capacity_log2();
    }

    inline size_t table_size() {
        return m_sizing.capacity();
    }

    inline bool get_v(size_t pos) {
        return (m_cv.at(pos) & 0b01) != 0;
    }

    inline bool get_c(size_t pos) {
        return (m_cv.at(pos) & 0b10) != 0;
    }

    inline void set_v(size_t pos, bool v) {
        auto x = m_cv.at(pos) & 0b10;
        m_cv.at(pos) = x | (0b01 * v);
    }

    inline void set_c(size_t pos, bool c) {
        auto x = m_cv.at(pos) & 0b01;
        m_cv.at(pos) = x | (0b10 * c);
    }
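
    // Illustrative reading of the 2-bit m_cv entries: bit 0 (v) marks that at
    // least one stored key has this position as its initial address; bit 1
    // (c) marks the first element of a group of entries sharing an initial
    // address. E.g. m_cv.at(pos) == 0b11 means a group starts at pos and pos
    // is itself an occupied initial address.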

    inline void sparse_drop_bucket(size_t i) {
        size_t qw = quotient_width();
        m_buckets.at(i).destroy_vals(qw);
        m_buckets.at(i) = Bucket<val_t>();
    }

    inline void destroy_buckets() {
        size_t qw = quotient_width();
        for(size_t i = 0; i < m_buckets.size(); i++) {
            if (!m_buckets[i].is_empty()) {
                m_buckets[i].destroy_vals(qw);
            }
        }
    }

    inline void dcheck_key_width(uint64_t key) {
        IF_DEBUG({
            uint64_t key_mask = (1ull << (m_width - 1ull) << 1ull) - 1ull;
            bool key_is_too_large = key & ~key_mask;
            DCHECK(!key_is_too_large) << "Attempt to insert key " << key
                << ", which requires more bits than the currently set maximum of "
                << m_width << " bits";
        });
    }

    // The actual number of bits usable for storing a key
    // is always >= the key width set in m_width.
    inline uint8_t real_width() {
        return std::max(uint8_t(table_size_log2() + 1), m_width);
    }

    struct DecomposedKey {
        size_t initial_address;  // initial address of key in table
        size_t stored_quotient;  // quotient value stored in table
    };

    inline DecomposedKey decompose_key(uint64_t key) {
        dcheck_key_width(key);
        uint64_t hres = compact_hashfn(key, real_width());

        uint64_t shift = table_size_log2();

        if (HIGH_BITS_RANDOM) {
            shift = 64 - shift;
            return DecomposedKey {
                hres >> shift,
                hres & ((1ull << shift) - 1ull),
            };
        } else {
            return DecomposedKey {
                hres & ((1ull << shift) - 1ull),
                hres >> shift,
            };
        }
    }
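
    // Illustrative decomposition (HIGH_BITS_RANDOM == false): with capacity
    // 256 (table_size_log2() == 8) and real_width() == 16, a key is hashed to
    // a 16-bit value hres; the low 8 bits become the initial address and the
    // high 8 bits the stored quotient. compose_key() below reverses the split
    // and applies the (self-inverse) hash again to recover the original key.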

    inline uint64_t compose_key(uint64_t initial_address, uint64_t quotient) {
        uint64_t shift = table_size_log2();

        uint64_t harg;
        if (HIGH_BITS_RANDOM) {
            shift = 64 - shift;
            harg = (initial_address << shift) | quotient;
        } else {
            harg = (quotient << shift) | initial_address;
        }

        uint64_t key = compact_reverse_hashfn(harg, real_width());
        dcheck_key_width(key);
        return key;
    }

    template<typename handler_t>
    inline void shift_insert_handler(size_t from,
                                     size_t to,
                                     uint64_t key,
                                     handler_t&& handler) {
        DCHECK_NE(from, to);

        for(size_t i = to; i != from;) {
            size_t next_i = mod_sub(i, size_t(1));

            set_c(i, get_c(next_i));

            i = next_i;
        }
        shift_insert_sparse_handler(from, to, key, std::move(handler));

        set_c(from, false);
    }

    struct SearchedGroup {
        size_t group_start;       // group that belongs to the key
        size_t group_end;         // it's a half-open range: [start .. end)
        size_t groups_terminator; // next free location
    };

    // Assumption: there exists a group at the location indicated by key.
    // This group is either the group belonging to key,
    // or the one after it in case no group for key exists yet.
    inline SearchedGroup search_existing_group(DecomposedKey const& key) {
        auto ret = SearchedGroup();

        // walk forward from the initial address until we find an empty location
        // TODO: This search could maybe be accelerated by:
        // - checking whole blocks in the bucket bitvector for == or != 0
        size_t cursor = key.initial_address;
        size_t v_counter = 0;
        DCHECK_EQ(get_v(cursor), true);

        for(; sparse_exists(cursor); cursor = mod_add(cursor)) {
            v_counter += get_v(cursor);
        }
        DCHECK_GE(v_counter, 1);
        ret.groups_terminator = cursor;

        // walk back again to find the start of the group belonging
        // to the initial address
        size_t c_counter = v_counter;
        for(; c_counter != 1; cursor = mod_sub(cursor)) {
            c_counter -= get_c(mod_sub(cursor));
        }

        ret.group_end = cursor;

        for(; c_counter != 0; cursor = mod_sub(cursor)) {
            c_counter -= get_c(mod_sub(cursor));
        }

        ret.group_start = cursor;

        return ret;
    }
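
    // Why the counting works: every set v-bit between the initial address and
    // the terminator corresponds to exactly one group stored in that run, and
    // groups appear in the same order as their initial addresses. Walking
    // back from the terminator and subtracting one per set c-bit therefore
    // reaches the end of the key's group when the counter hits 1, and its
    // start when the counter hits 0.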

    inline val_t* search(SearchedGroup const& res, uint64_t stored_quotient) {
        for(size_t i = res.group_start; i != res.group_end; i = mod_add(i)) {
            auto sparse_entry = sparse_get_at(i);

            if (sparse_entry.get_quotient() == stored_quotient) {
                return &sparse_entry.val();
            }
        }
        return nullptr;
    }

    inline val_t* search(uint64_t key) {
        auto dkey = decompose_key(key);
        if (get_v(dkey.initial_address)) {
            return search(search_existing_group(dkey), dkey.stored_quotient);
        }
        return nullptr;
    }

public:

    // TODO: This is not a std interface
    inline void insert(uint64_t key, val_t&& value) {
        insert(key, std::move(value), m_width);
    }
    inline void insert(uint64_t key, val_t&& value, size_t key_width) {
        insert_handler(key, key_width, InsertHandler {
            std::move(value)
        });
    }

    inline val_t& operator[](uint64_t key) {
        return index(key, m_width);
    }

    inline val_t& index(uint64_t key, size_t key_width) {
        val_t* addr = nullptr;

        insert_handler(key, key_width, AddressDefaultHandler {
            &addr
        });

        return *addr;
    }

    inline size_t size() const {
        return m_sizing.size();
    }
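
    // Minimal usage sketch (illustrative):
    //
    //     compact_hash<int> map(16);  // capacity 16, 16-bit keys by default
    //     map.insert(13, 42);         // move-inserts the value 42 at key 13
    //     map[13] = 7;                // default-constructs on a missing key
    //
    // The three-argument insert() can raise the stored key width, which
    // forces a rebuild via grow_if_needed().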

private:
    // Handler for inserting an element that exists as an rvalue reference.
    // This will overwrite an existing element.
    class InsertHandler {
        val_t&& m_value;
    public:
        InsertHandler(val_t&& value): m_value(std::move(value)) {}

        inline auto on_new() {
            struct InsertHandlerOnNew {
                val_t&& m_value;
                inline val_t& get() {
                    return m_value;
                }
                inline void new_location(val_t& value) {
                }
            };

            return InsertHandlerOnNew {
                std::move(m_value),
            };
        }

        inline void on_existing(val_t& value) {
            // overwrite the stored element with the new value
            value = std::move(m_value);
        }
    };

    // Handler for getting the address of an element in the map.
    // If none exists yet, it will be default constructed.
    class AddressDefaultHandler {
        val_t** m_address = nullptr;
    public:
        AddressDefaultHandler(val_t** address): m_address(address) {}

        inline auto on_new() {
            struct AddressDefaultHandlerOnNew {
                val_t m_value;
                val_t** m_address;
                inline val_t& get() {
                    return m_value;
                }
                inline void new_location(val_t& value) {
                    *m_address = &value;
                }
            };

            return AddressDefaultHandlerOnNew {
                val_t(),
                m_address,
            };
        }

        inline void on_existing(val_t& value) {
            *m_address = &value;
        }
    };

    template<typename handler_t>
    inline void insert_after(SearchedGroup const& res,
                             DecomposedKey const& dkey,
                             handler_t&& handler)
    {
        // this will insert the value at the end of the range defined by res

        if (sparse_is_empty(res.group_end)) {
            // if there is no following group, just append the new entry

            sparse_set_at_empty_handler(res.group_end,
                                        dkey.stored_quotient,
                                        std::move(handler));
        } else {
            // else, shift all following elements

            shift_insert_handler(res.group_end,
                                 res.groups_terminator,
                                 dkey.stored_quotient,
                                 std::move(handler));
        }
    }

    template<typename handler_t>
    inline void insert_handler(uint64_t key, size_t key_width, handler_t&& handler) {
        grow_if_needed(key_width);
        auto const dkey = decompose_key(key);

        // cases:
        // - initial address empty
        // - initial address full, v[initial address] = 1
        // - initial address full, v[initial address] = 0

        if (sparse_is_empty(dkey.initial_address)) {
            // check if we can insert directly

            sparse_set_at_empty_handler(dkey.initial_address,
                                        dkey.stored_quotient,
                                        std::move(handler));

            // we created a new group, so update the bitflags
            set_v(dkey.initial_address, true);
            set_c(dkey.initial_address, true);
            m_sizing.size()++;
        } else {
            // check if there already is a group for this key
            bool const group_exists = get_v(dkey.initial_address);

            if (group_exists) {
                auto const res = search_existing_group(dkey);

                // check if the element already exists
                auto p = search(res, dkey.stored_quotient);
                if (p != nullptr) {
                    handler.on_existing(*p);
                } else {
                    insert_after(res, dkey, std::move(handler));
                    m_sizing.size()++;
                }
            } else {
                // insert a new group

                // pretend we already inserted the new group,
                // which makes insert_after() find the group
                // at the location _before_ the new group
                set_v(dkey.initial_address, true);
                auto const res = search_existing_group(dkey);

                // insert the element after the found group
                insert_after(res, dkey, std::move(handler));

                // mark the inserted element as the start of a new group,
                // thus fixing up the v <-> c mapping
                set_c(res.group_end, true);

                m_sizing.size()++;
            }
        }
    }

    template<typename int_t>
    inline int_t mod_add(int_t v, int_t add = 1) {
        size_t mask = table_size() - 1;
        return (v + add) & mask;
    }
    template<typename int_t>
    inline int_t mod_sub(int_t v, int_t sub = 1) {
        size_t mask = table_size() - 1;
        return (v - sub) & mask;
    }
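
    // Illustrative: with table_size() == 8 the mask is 0b111, so
    // mod_add(7) == 0 and mod_sub(0) == 7. The power-of-two capacity turns
    // wrap-around into a single AND instead of a division.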

    struct iter_all_t {
        compact_hash<val_t>& m_self;
        size_t i = 0;
        size_t original_start = 0;
        uint64_t initial_address = 0;
        enum {
            EMPTY_LOCATIONS,
            FULL_LOCATIONS
        } state = EMPTY_LOCATIONS;

        inline iter_all_t(compact_hash<val_t>& self): m_self(self) {
            // first, skip forward to the first empty location,
            // so that iteration can start at the beginning
            // of the first complete group

            i = 0;

            for(;;i++) {
                if (m_self.sparse_is_empty(i)) {
                    break;
                }
            }

            // remember our starting point so that we can recognize it when
            // we wrap around back to it
            original_start = i;

            // we proceed to the next position so that we can iterate until
            // we reach `original_start` again
            i = m_self.mod_add(i);
        }

        inline bool next(uint64_t* out_initial_address, size_t* out_i) {
            while(true) {
                if (state == EMPTY_LOCATIONS) {
                    // skip empty locations
                    for(;;i = m_self.mod_add(i)) {
                        if (m_self.sparse_exists(i)) {
                            // we initialize init-addr at 1 pos before the start of
                            // a group of blocks, so that the blocks iteration logic works
                            initial_address = m_self.mod_sub(i);
                            state = FULL_LOCATIONS;
                            break;
                        }
                        if (i == original_start) {
                            return false;
                        }
                    }
                } else {
                    // process full locations
                    if (m_self.sparse_is_empty(i)) {
                        state = EMPTY_LOCATIONS;
                        continue;
                    }
                    if (m_self.get_c(i)) {
                        // skip the m_v cursor forward
                        // to find the initial address for this block
                        //
                        // this works for the first block because
                        // initial_address starts at 1 before the group
                        initial_address = m_self.mod_add(initial_address);

                        while(!m_self.get_v(initial_address)) {
                            initial_address = m_self.mod_add(initial_address);
                        }
                    }

                    *out_initial_address = initial_address;
                    *out_i = i;

                    i = m_self.mod_add(i);
                    return true;
                }
            }
        }
    };

    inline size_t quotient_width() {
        return real_width() - table_size_log2();
    }

    inline void grow_if_needed(size_t new_width) {
        auto needs_capacity_change = [&]() {
            return (m_sizing.capacity() / 2) <= (m_sizing.size() + 1);
        };

        auto needs_realloc = [&]() {
            return needs_capacity_change() || (new_width != m_width);
        };

        if (needs_realloc()) {
            size_t new_capacity;
            if (needs_capacity_change()) {
                new_capacity = m_sizing.capacity() * 2;
            } else {
                new_capacity = m_sizing.capacity();
            }
            auto new_table = compact_hash(new_capacity, new_width);

            /*
            std::cout
                << "size: " << m_sizing.size()
                << ", grow to cap " << new_table.table_size()
                << ", m_width: " << new_table.m_width
                << ", real_width: " << new_table.real_width()
                << ", quot width: " << new_table.quotient_width()
                << "\n";
            */

            bool start_of_bucket = false;
            size_t bucket = 0;

            uint64_t initial_address;
            size_t i;
            auto iter = iter_all_t(*this);
            while(iter.next(&initial_address, &i)) {
                auto p = sparse_pos(i);

                // drop buckets of the old table as they get emptied out
                if (p.offset == 0) {
                    if (start_of_bucket) {
                        DCHECK_NE(bucket, p.bucket_pos);
                        sparse_drop_bucket(bucket);
                    }

                    start_of_bucket = true;
                    bucket = p.bucket_pos;
                }

                auto kv = sparse_get_at(p);
                auto stored_quotient = kv.get_quotient();
                auto& val = kv.val();
                key_t key = compose_key(initial_address, stored_quotient);

                new_table.insert(key, std::move(val));
            }

            *this = std::move(new_table);
        }

        DCHECK(!needs_realloc());
    }
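
    // Growth policy in short: the table is rebuilt once it is half full
    // (capacity/2 <= size + 1), which keeps displacement runs short, and a
    // change of the key width alone also forces a rebuild because it changes
    // the width of the stored quotients.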

public:
    // for tests:

    inline std::string debug_state() {
        std::stringstream ss;

        bool gap_active = false;
        size_t gap_start;
        size_t gap_end;

        auto print_gap = [&](){
            if (gap_active) {
                gap_active = false;
                ss << "    [" << gap_start << " to " << gap_end << " empty]\n";
            }
        };

        auto add_gap = [&](size_t i){
            if (!gap_active) {
                gap_active = true;
                gap_start = i;
            }
            gap_end = i;
        };

        std::vector<std::string> lines(table_size());

        uint64_t initial_address;
        size_t j;
        auto iter = iter_all_t(*this);
        while(iter.next(&initial_address, &j)) {
            std::stringstream ss2;

            auto kv = sparse_get_at(j);
            auto stored_quotient = kv.get_quotient();
            auto& val = kv.val();
            key_t key = compose_key(initial_address, stored_quotient);

            ss2 << j
                << "\t: v = " << get_v(j)
                << ", c = " << get_c(j)
                << ", quot = " << stored_quotient
                << ", iadr = " << initial_address
                << "\t, key = " << key
                << "\t, value = " << val
                << "\t (" << &val << ")";

            lines.at(j) = ss2.str();
        }

        ss << "[\n";
        for (size_t i = 0; i < table_size(); i++) {
            bool cv_exist = lines.at(i) != "";

            DCHECK_EQ(cv_exist, sparse_exists(i));

            if (cv_exist) {
                print_gap();
                ss << "    "
                   << lines.at(i)
                   << "\n";
            } else {
                add_gap(i);
            }
        }
        print_gap();
        ss << "]";

        return ss.str();
    }

    inline void debug_check_single(uint64_t key, val_t const& val) {
        auto ptr = search(key);
        ASSERT_NE(ptr, nullptr) << "key " << key << " not found!";
        if (ptr != nullptr) {
            ASSERT_EQ(*ptr, val) << "value is " << *ptr << " instead of " << val;
        }
    }

private:
    template<typename handler_t>
    inline void sparse_set_at_empty_handler(size_t pos, key_t quot, handler_t&& handler) {
        auto data = sparse_pos(pos);
        DCHECK(!data.exists);

        // figure out which bucket to access
        auto& bucket = m_buckets[data.bucket_pos];
        size_t qw = quotient_width();
        const size_t size = bucket.size();

        // we will insert a new element
        auto value_handler = handler.on_new();
        auto& val = value_handler.get();

        // TODO: check out different sizing strategies
        // e.g., the known sparse_hash repo uses overallocation for small buckets

        // create a new bucket with enough space for the new element
        auto new_bucket = Bucket<val_t>(bucket.bv() | data.b_mask, qw);

        // move the elements from the old bucket into the new bucket
        for(size_t i = 0; i < size; i++) {
            auto new_elem = new_bucket.at(i, qw);
            auto elem = bucket.at(i, qw);

            new_elem.set_quotient(elem.get_quotient());
            new(&new_elem.val()) val_t(std::move(elem.val()));
        }

        // construct the new element at the end of the new bucket
        auto new_elem = new_bucket.at(size, qw);
        new_elem.set_quotient(quot);
        new(&new_elem.val()) val_t(std::move(val));

        // rotate the new element to the right position
        new_bucket.rotate_end_to(data.offset, qw);

        // notify the handler of the location of the new element
        auto new_loc = new_bucket.at(data.offset, qw);
        value_handler.new_location(new_loc.val());

        // destroy the old bucket's (moved-from) elements
        // and replace it with the new bucket
        bucket.destroy_vals(qw);
        bucket = std::move(new_bucket);
    }

    inline bool sparse_is_empty(size_t i) {
        return !sparse_exists(i);
    }

    inline bool sparse_exists(size_t pos) {
        return sparse_pos(pos).exists;
    }

    // Shifts all elements one to the right,
    // inserts val and quot at the from position,
    // stores the old from element in val and quot,
    // and returns the SparsePos of the from position.
    inline auto sparse_shift(size_t from, size_t to, val_t& val, key_t& quot) {
        DCHECK_LT(from, to);

        // pseudo-iterator for iterating over bucket elements
        // NB: does not wrap around!
        struct iter {
            Bucket<val_t> const* m_bucket;
            BucketElem<val_t> m_b_start;
            BucketElem<val_t> m_b_end;
            size_t m_quotient_width;

            inline void set_bucket_elem_range(size_t end_offset) {
                size_t start_offset = 0;
                DCHECK_LE(start_offset, end_offset);

                m_b_start = m_bucket->at(start_offset, m_quotient_width);
                m_b_end = m_bucket->at(end_offset, m_quotient_width);
            }

            inline iter(compact_hash& table,
                        SparsePos const& pos) {
                m_quotient_width = table.quotient_width();
                m_bucket = &table.m_buckets.at(pos.bucket_pos);

                set_bucket_elem_range(pos.offset);
            }

            inline BucketElem<val_t> get() {
                return m_b_end;
            }

            inline void decrement() {
                if (!m_b_start.ptr_eq(m_b_end)) {
                    m_b_end.decrement_ptr();
                } else {
                    do {
                        --m_bucket;
                    } while(m_bucket->bv() == 0);
                    set_bucket_elem_range(m_bucket->size() - 1);
                }
            }

            inline bool operator!=(iter& other) {
                return !(m_b_end.ptr_eq(other.m_b_end));
            }
        };

        // initialize iterators like this:
        // [         ]
        // ^from   to^
        //          ||
        //    <- src^|
        //    <- dest^

        auto from_loc = sparse_pos(from);
        auto from_iter = iter(*this, from_loc);

        auto last = sparse_pos(to - 1);
        auto src = iter(*this, last);
        auto dst = iter(*this, sparse_pos(to));

        // move the element at the last position to a temporary position
        auto tmp_p = sparse_get_at(last);
        val_t tmp_val = std::move(tmp_p.val());
        key_t tmp_quot = tmp_p.get_quotient();

        // move all elements one to the right
        while(src != from_iter) {
            src.decrement();
            dst.decrement();

            auto& src_val = src.get().val();
            auto& dst_val = dst.get().val();

            auto src_quot = src.get().get_quotient();

            dst_val = std::move(src_val);
            dst.get().set_quotient(src_quot);
        }

        // move the new element into the now-empty from position
        auto from_p = sparse_get_at(from_loc);
        from_p.val() = std::move(val);
        from_p.set_quotient(quot);

        // move the temporary element into the out-parameters
        val = std::move(tmp_val);
        quot = std::move(tmp_quot);

        return from_loc;
    }

    template<typename handler_t>
    inline void shift_insert_sparse_handler(size_t from,
                                            size_t to,
                                            key_t quot,
                                            handler_t&& handler) {
        // move from...to one to the right, then insert at from

        DCHECK(from != to);

        auto value_handler = handler.on_new();
        auto& val = value_handler.get();

        SparsePos loc;
        if (to < from) {
            // if the range wraps around, we decompose it into two ranges:
            // [        |        |        ]
            //        to^        ^from
            // ^start                  end^
            // [   2    ]        [   1    ]
            //
            // NB: because we require from != to, and insert 1 additional element,
            // we are always dealing with a minimum 2 element range,
            // and thus can not end up with a split range with length == 0

            // inserts the new element at the start of the range,
            // and temporarily stores the element at the end of the range
            // in `val` and `quot`
            loc = sparse_shift(from, table_size(), val, quot);
            sparse_shift(0, to, val, quot);
        } else {
            // inserts the new element at the start of the range,
            // and temporarily stores the element at the end of the range
            // in `val` and `quot`
            loc = sparse_shift(from, to, val, quot);
        }

        // insert the element from the end of the range at the free
        // position to the right of it
        auto insert = InsertHandler(std::move(val));
        sparse_set_at_empty_handler(to, quot, std::move(insert));

        // after the previous insert and a potential reallocation,
        // notify the handler about the address of the new value
        value_handler.new_location(sparse_get_at(loc).val());
    }

    struct SparsePos {
    public:
        size_t bucket_pos;
        size_t bit_pos;
    private:
        uint64_t bv;
    public:
        uint64_t b_mask;
        bool exists;
        size_t offset;

        inline SparsePos() {}

        template<typename buckets_t>
        inline SparsePos(size_t pos, buckets_t& m_buckets):
            // bucket index based on position (division by 64 bits)
            bucket_pos(pos >> BVS_WIDTH_SHIFT),

            // remainder position of pos inside the bucket (modulo by 64 bits)
            bit_pos(pos & BVS_WIDTH_MASK),

            // reference to the bitvector for the bucket
            bv(m_buckets[bucket_pos].bv()),

            // mask for the single bit we deal with
            b_mask(1ull << bit_pos),

            // check if the bit is set
            exists(bv & b_mask),

            // calculate the offset of the element in the bucket for the
            // current pos, based on the number of set bits in bv
            offset(__builtin_popcountll(bv & ((1ull << bit_pos) - 1)))
        {}
    };
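
    // Illustrative: pos = 70 gives bucket_pos = 70 >> 6 = 1 and
    // bit_pos = 70 & 63 = 6. If that bucket's bitvector is 0b1011011, then
    // b_mask = 0b1000000, exists = true, and
    // offset = popcount(0b0011011) = 4, i.e. the element is stored at
    // index 4 of the bucket's packed storage.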

    SparsePos sparse_pos(size_t pos) {
        return SparsePos { pos, m_buckets };
    }

    inline BucketElem<val_t> sparse_get_at(SparsePos pos) {
        DCHECK(pos.exists);
        size_t qw = quotient_width();

        return m_buckets[pos.bucket_pos].at(pos.offset, qw);
    }

    inline BucketElem<val_t> sparse_get_at(size_t pos) {
        return sparse_get_at(sparse_pos(pos));
    }
};

}