tudocomp
– The TU Dortmund Compression Framework
CedarTrie.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <vector>
5 #include <tudocomp/Algorithm.hpp>
6 
7 #include "cedar.hpp"
8 
9 namespace tdc {
10 namespace lz78 {
11 
12 namespace cedar {
13  class CedarTrieNode: public LZ78TrieNode {
14  size_t m_search_pos;
15  public:
16  inline CedarTrieNode(factorid_t id, bool is_new, size_t search_pos):
17  LZ78TrieNode(id, is_new),m_search_pos(search_pos) {}
18  inline CedarTrieNode():
19  LZ78TrieNode(), m_search_pos(0) {}
20 
21  inline size_t search_pos() const { return m_search_pos; }
22  };
23 }
24 
25 class CedarTrie: public Algorithm, public LZ78Trie<cedar::CedarTrieNode> {
26  using cedar_factorid_t = lz78::factorid_t;
27  // NB: this refers to different cedar namespace than defined in this file
28  using cedar_t = ::cedar::da<cedar_factorid_t>;
29 
30  static constexpr uint8_t NULL_ESCAPE_ESCAPE_BYTE = 255;
31  static constexpr uint8_t NULL_ESCAPE_REPLACEMENT_BYTE = 254;
32 
33  static constexpr cedar_factorid_t NO_VALUE = static_cast<cedar_factorid_t>(
34  cedar_t::error_code::CEDAR_NO_VALUE);
35  static constexpr cedar_factorid_t NO_PATH = static_cast<cedar_factorid_t>(
36  cedar_t::error_code::CEDAR_NO_PATH);
37  static constexpr cedar_factorid_t HIDDEN_ESCAPE_ID = -3; // NOTE: May not be -1 or -2
38 
39  class LzwRootSearchPosMap {
40  std::array<size_t, 256> m_array;
41  public:
42  inline size_t get(uliteral_t c) const {
43  DCHECK(c < m_array.size());
44  return m_array[c];
45  }
46  inline void set(uliteral_t c, size_t v) {
47  DCHECK(c < m_array.size());
48  m_array[c] = v;
49  }
50  };
51 
52  // unique_ptr only needed for reassignment
53  std::unique_ptr<cedar_t> m_trie;
54  cedar_factorid_t m_ids = 0;
55  LzwRootSearchPosMap m_roots;
56 
57  inline node_t _find_or_insert(const node_t& parent, uliteral_t c, bool incr_id) {
58  auto search_pos = parent.search_pos();
59 
60  auto letter = (const char*) &c;
61  auto& from = search_pos;
62  cedar_factorid_t searchResult;
63 
64  {
65  size_t pos = 0;
66  searchResult = m_trie->traverse(letter, from, pos, 1);
67  }
68 
69  if(searchResult != NO_VALUE && searchResult != NO_PATH) {
70  return node_t {
71  searchResult - 1,
72  false,
73  search_pos,
74  };
75  } else {
76  {
77  size_t pos = 0;
78  if (incr_id) {
79  m_trie->update(letter, from, pos, 1, ++m_ids);
80  } else {
81  m_trie->update(letter, from, pos, 1, HIDDEN_ESCAPE_ID);
82  }
83  }
84  return node_t {
85  factorid_t(size() - 1ull),
86  true,
87  search_pos,
88  };
89  }
90  }
91 
92  inline void _print(size_t from, size_t ind) {
93  cedar_t& t = *m_trie;
94 
95  bool prev_empty = false;
96  size_t prev_empty_min = 1;
97  size_t prev_empty_max = 1;
98 
99  auto print_prev_empty = [&]() {
100  if (prev_empty) {
101  DLOG(INFO)
102  << std::setfill(' ')
103  << std::setw(ind)
104  << ""
105  << "["
106  << std::setfill('0')
107  << std::setw(3)
108  << prev_empty_min
109  << "]: "
110  << "-"
111  << "\n";
112  if (prev_empty_min != prev_empty_max) {
113  DLOG(INFO)
114  << std::setfill(' ')
115  << std::setw(ind)
116  << ""
117  << "[...]\n";
118  DLOG(INFO)
119  << std::setfill(' ')
120  << std::setw(ind)
121  << ""
122  << "["
123  << std::setfill('0')
124  << std::setw(3)
125  << prev_empty_max
126  << "]: "
127  << "-"
128  << "\n";
129  }
130  prev_empty = false;
131  }
132  };
133 
134  for (size_t i = 1; i < 256; i++) {
135  auto child_from = from;
136  const char c = uint8_t(i);
137  size_t pos = 0;
138  auto r = t.traverse(&c, child_from, pos, 1);
139  if (r != NO_PATH && r != NO_VALUE) {
140  print_prev_empty();
141  prev_empty_min = i + 1;
142  DLOG(INFO)
143  << std::setfill(' ')
144  << std::setw(ind)
145  << ""
146  << "["
147  << std::setfill('0')
148  << std::setw(3)
149  << i
150  << std::setfill(' ')
151  << "]: "
152  << r
153  << "\n";
154  _print(child_from, ind + 4);
155  } else {
156  prev_empty = true;
157  prev_empty_max = i;
158  }
159  }
160  print_prev_empty();
161  }
162 
163  inline void print() {
164  DLOG(INFO) << "\n";
165  _print(0, 0);
166  DLOG(INFO) << "\n";
167  }
168 
169 public:
170  inline static Meta meta() {
171  Meta m("lz78trie", "cedar", "Lempel-Ziv 78 Cedar Trie");
172  return m;
173  }
174 
175  CedarTrie(Env&& env, const size_t n, const size_t& remaining_characters, factorid_t = 0)
176  : Algorithm(std::move(env))
177  , LZ78Trie(n, remaining_characters)
178  , m_trie(std::make_unique<cedar_t>()) {}
179 
180  inline node_t add_rootnode(const uliteral_t c) {
181  cedar_factorid_t ids = c;
182  DCHECK(m_ids == ids);
183  m_ids++;
184 
185  size_t search_pos;
186 
187  if (c != 0 && c != NULL_ESCAPE_ESCAPE_BYTE) {
188  const char* letter = (const char*) &c;
189  size_t from = 0;
190  size_t pos = 0;
191  m_trie->update(letter, from, pos, 1, ids);
192  DCHECK(pos == 1);
193  search_pos = size_t{ from };
194  } else {
195  const char* letter;
196  size_t from = 0;
197  size_t pos;
198 
199  pos = 0;
200 
201  auto null_escape_byte = NULL_ESCAPE_ESCAPE_BYTE;
202  letter = (const char*) &null_escape_byte;
203  auto res = m_trie->traverse(letter, from, pos, 1);
204 
205  if (res == NO_PATH || res == NO_VALUE) {
206  DCHECK(pos == 0);
207  m_trie->update(letter, from, pos, 1, cedar_factorid_t(HIDDEN_ESCAPE_ID));
208  DCHECK(pos == 1);
209  }
210 
211  pos = 0;
212  char c2 = c;
213  if (c == 0) c2 = NULL_ESCAPE_REPLACEMENT_BYTE;
214  letter = (const char*) &c2;
215  m_trie->update(letter, from, pos, 1, ids);
216  DCHECK(pos == 1);
217 
218  search_pos = size_t{ from };
219  }
220  auto r = node_t(ids, true, search_pos);
221  m_roots.set(c, search_pos);
222  /*
223  DLOG(INFO) << "add rootnode "
224  << "char: " << int(c)
225  << ", factor id: "
226  << r.id() << ", from: "
227  << r.search_pos();
228  print();
229  */
230  return r;
231  }
232 
233  inline node_t get_rootnode(uliteral_t c) const {
234  return node_t(c, false, m_roots.get(c));
235  }
236 
237  inline void clear() {
238  // TODO: cedar seems to have a clear() method, but also
239  // seems to have bugs in its implementation
240  m_trie = std::make_unique<cedar_t>();
241  m_ids = 0;
242  m_roots = LzwRootSearchPosMap();
243  }
244 
245  inline node_t find_or_insert(const node_t& parent, uliteral_t c) {
246  node_t r;
247  /*
248  DLOG(INFO) << "find or insert "
249  << "char: " << int(c)
250  << ", factor id: "
251  << parent.id() << ", from: "
252  << parent.search_pos();
253  */
254  if (c == 0) {
255  auto r1 = _find_or_insert(parent, NULL_ESCAPE_ESCAPE_BYTE, false);
256  auto r2 = _find_or_insert(r1, NULL_ESCAPE_REPLACEMENT_BYTE, true);
257  r = r2;
258  } else if (c == NULL_ESCAPE_ESCAPE_BYTE) {
259  auto r1 = _find_or_insert(parent, NULL_ESCAPE_ESCAPE_BYTE, false);
260  auto r2 = _find_or_insert(r1, NULL_ESCAPE_ESCAPE_BYTE, true);
261  r = r2;
262  } else {
263  r = _find_or_insert(parent, c, true);
264  }
265  //print();
266  return r;
267  }
268 
269  inline size_t size() const {
270  return m_ids;
271  }
272 };
273 
274 }} //ns
275 
node_t find_or_insert(const node_t &parent, uliteral_t c)
Definition: CedarTrie.hpp:245
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
uint8_t uliteral_t
Type to represent signed single literals.
Definition: def.hpp:131
CedarTrieNode(factorid_t id, bool is_new, size_t search_pos)
Definition: CedarTrie.hpp:16
node_t get_rootnode(uliteral_t c) const
Definition: CedarTrie.hpp:233
bool is_new() const
Definition: LZ78Trie.hpp:31
Default return type of find_or_insert.
Definition: LZ78Trie.hpp:22
len_compact_t pos
Definition: LZSSFactors.hpp:38
CedarTrie(Env &&env, const size_t n, const size_t &remaining_characters, factorid_t=0)
Definition: CedarTrie.hpp:175
size_t size() const
Definition: CedarTrie.hpp:269
uint32_t factorid_t
Type for the factor indices, bounded by the number of LZ78 trie nodes.
Definition: LZ78Trie.hpp:11
static Meta meta()
Definition: CedarTrie.hpp:170
Local environment for a compression/encoding/decompression call.
Interface for algorithms.
Definition: Algorithm.hpp:15
node_t add_rootnode(const uliteral_t c)
Definition: CedarTrie.hpp:180