tudocomp
– The TU Dortmund Compression Framework
meta_blocks.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <tudocomp/util.hpp>
4 #include <tudocomp/Env.hpp>
7 
12 
13 namespace tdc {namespace esp {
14 
15 template<typename Source>
17  RoundContext<Source>* m_parent;
18 public:
20  return *m_parent;
21  }
22 
24 
26  m_parent(&ctx),
27  debug(dbg) {}
28 
29  void push_block(size_t width, size_t type) {
30  debug.block(width, type);
31  rctx().push_back(width, type);
32  }
33 
34  inline void eager_mb13(const Source& src, size_t t) {
35  [&]() {
36  size_t j = src.size();
37  for (size_t i = 0; i < j;) {
38  size_t remaining_len = j - i;
39  switch (remaining_len) {
40  case 4:
41  push_block(2, t);
42  push_block(2, t);
43  return;
44  case 3:
45  push_block(3, t);
46  return;
47  case 2:
48  push_block(2, t);
49  return;
50  case 1:
51  push_block(1, t);
52  //DCHECK_GT(remaining_len, 1);
53  return;
54  case 0:
55  return;
56  default:
57  push_block(3, t);
58  i += 3;
59  }
60  }
61  }();
62  rctx().debug_check_advanced(src.size());
63  }
64 
65  inline void eager_mb2(const Source& src) {
66  auto& ctx = rctx();
67 
68  auto A = src;
69  DCHECK(A.size() > 0);
70  auto type_3_prefix_len = std::min(iter_log(ctx.alphabet_size),
71  A.size());
72 
73  // Handle non-m2 prefix
74  {
75  auto type_3_prefix = A.slice(0, type_3_prefix_len);
76  eager_mb13(type_3_prefix, 3);
77  if (type_3_prefix_len == A.size()) { return; }
78  }
79 
80  auto type_2_suffix_size = src.size() - type_3_prefix_len;
81 
82  // Prepare scratchpad buffer
83  auto& buf = ctx.scratchpad;
84  buf.clear();
85  buf.reserve(A.cend() - A.cbegin());
86  buf.insert(buf.cbegin(), A.cbegin(), A.cend());
87 
88  // Iterate on the buffer by combing each two adjacent elements.
89  // This reduces the size by `iter_log(alphabet_size) == type_3_prefix_len`
90  // the alphabet to size 6.
91  {
92  debug.mb2_initial(buf);
93  debug.mb2_reduce_to_6_start();
94  for (uint shrink_i = 0; shrink_i < type_3_prefix_len; shrink_i++) {
95  for (size_t i = 1; i < buf.size(); i++) {
96  auto left = buf[i - 1];
97  auto right = buf[i];
98  buf[i - 1] = label(left, right);
99  }
100  buf.pop_back();
101 
102  debug.mb2_reduce_to_6_step(buf);
103  }
104 
105  DCHECK_LE(calc_alphabet_size(buf), 6);
106  }
107 
108  // Reduce further to alphabet 3
109  {
110  debug.mb2_reduce_to_3_start();
111 
112  // final pass: reduce to alphabet 3
113  for(uint to_replace = 3; to_replace < 6; to_replace++) {
114  do_for_neighbors(buf, [&](size_t i, ConstGenericView<size_t> neighbors) {
115  auto& e = buf[i];
116  if (e == to_replace) {
117  e = 0;
118  for (auto n : neighbors) { if (n == e) { e++; } }
119  for (auto n : neighbors) { if (n == e) { e++; } }
120  }
121  });
122 
123  debug.mb2_reduce_to_3_step(buf);
124  }
125 
126  DCHECK(calc_alphabet_size(buf) <= 3);
127  DCHECK(no_adjacent_identical(buf));
128  }
129 
130  // find landmarks:
131  {
132  // TODO: Maybe store in high bits of buf to reduce memory?
133  // buf gets reduced to 2 bit values anyway, and stays around long enough
134  IntVector<uint_t<1>> landmarks(buf.size());
135 
136  do_for_neighbors(buf, [&](size_t i, ConstGenericView<size_t> neighbors) {
137  bool is_high_landmark = true;
138  for (auto e : neighbors) {
139  if (e > buf[i]) {
140  is_high_landmark = false;
141  }
142  }
143  if (is_high_landmark) {
144  landmarks[i] = 1;
145  }
146  });
147 
148  debug.mb2_high_landmarks(landmarks);
149 
150  do_for_neighbors(buf, [&](size_t i, ConstGenericView<size_t> neighbors) {
151  bool is_low_landmark = true;
152  for (auto e : neighbors) {
153  if (e < buf[i]) {
154  is_low_landmark = false;
155  }
156  }
157  // if there is a large enough landmark-less gap, mark it as well
158  if (is_low_landmark) {
159  //if (i > 0 && i < buf.size() - 1)
160  if ( (!(i > 0) || (landmarks[i - 1] == 0u))
161  && (!(i < buf.size() - 1) || (landmarks[i + 1] == 0u))
162  ) {
163  landmarks[i] = 1;
164  }
165  }
166  });
167 
168  debug.mb2_high_and_low_landmarks(landmarks);
169 
170  DCHECK(check_landmarks(landmarks, true));
171 
172  // Split at landmarks
173 
175  landmarks.size(),
176  [&](size_t i) {
177  return landmarks[i] == uint_t<1>(1);
178  },
179  [&](size_t left, size_t right) {
180  push_block(right - left + 1, 2);
181  },
182  ctx.behavior_landmarks_tie_to_right
183  );
184  }
185 
186  ctx.debug_check_advanced(type_2_suffix_size);
187  }
188 };
189 
190 }}
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
void mb2_reduce_to_3_step(const T &buf)
uint64_t label(uint64_t left, uint64_t right)
Definition: esp_math.hpp:16
A vector over arbitrary unsigned integer types.
Definition: IntVector.hpp:175
A const view into a slice of memory.
DebugMetablockContext debug
Definition: meta_blocks.hpp:23
unsigned int uint
Definition: characterhash.h:6
void mb2_high_and_low_landmarks(const T &buf)
void block(size_t width, size_t type)
void debug_check_advanced(size_t len)
void mb2_high_landmarks(const T &buf)
void eager_mb2(const Source &src)
Definition: meta_blocks.hpp:65
typename uint_dispatch_t< N >::type uint_t
Definition: uint_t.hpp:165
bool check_landmarks(const T &t, bool allow_long=false)
Definition: landmarks.hpp:8
size_t iter_log(size_t n)
Definition: esp_math.hpp:8
MetablockContext(RoundContext< Source > &ctx, DebugMetablockContext dbg)
Definition: meta_blocks.hpp:25
void push_back(size_t l, size_t type)
bool no_adjacent_identical(const T &t)
Definition: utils.hpp:79
len_compact_t src
Definition: LZSSFactors.hpp:38
void mb2_reduce_to_6_step(const T &buf)
void landmark_spanner(size_t size, LmPred pred, SpanPush push, bool tie)
Definition: landmarks.hpp:30
void push_block(size_t width, size_t type)
Definition: meta_blocks.hpp:29
void eager_mb13(const Source &src, size_t t)
Definition: meta_blocks.hpp:34
void do_for_neighbors(T &t, F f)
Definition: utils.hpp:87
RoundContext< Source > & rctx()
Definition: meta_blocks.hpp:19
uint64_t calc_alphabet_size(const T &t)
Definition: utils.hpp:68