tudocomp
– The TU Dortmund Compression Framework
compressors/esp/HuffmanCoder.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <sstream>
4 #include <iostream>
5 #include <bitset>
6 #include <numeric>
7 
8 #include <tudocomp/Coder.hpp>
9 
10 namespace tdc {namespace esp {
11  namespace huff2 {
12  // TODO: dynamic size in memory, lesser priority
13  using OrderedCodelengths = std::vector<size_t>;
14  using OrderedMapToEffective = std::vector<size_t>; // TODO < use better map than vector for higher value ranges
15  using Codewords = std::vector<size_t>;
16  using Counts = std::vector<size_t>;
17  using MapFromEffective = std::vector<size_t>;
18  using Codelengths = std::vector<size_t>;
19  using Numl = std::vector<size_t>;
20  using OrderedMapFromEffective = std::vector<size_t>;
21  using Firstcode = std::vector<size_t>;
22 
23  inline static MapFromEffective gen_effective_alphabet(const Counts& counts, size_t alphabet_size) {
24  MapFromEffective map_from_effective;
25  map_from_effective.reserve(alphabet_size);
26  map_from_effective.resize(alphabet_size);
27 
28  size_t j = 0;
29  for(size_t i = 0; i < counts.size(); ++i) {
30  if(counts[i] == 0) continue;
31  DCHECK_LT(j, alphabet_size);
32  map_from_effective[j++] = i;
33  }
34  DCHECK_EQ(j, alphabet_size);
35  for(size_t i = 0; i < alphabet_size; ++i) {
36  DCHECK_NE(counts[map_from_effective[i]], 0);
37  }
38  return map_from_effective;
39  }
40 
41  inline static Codelengths gen_codelengths(Counts counts,
42  const MapFromEffective& map_from_effective,
43  size_t alphabet_size) {
44  std::vector<size_t> A;
45  A.reserve(alphabet_size * 2);
46  A.resize(alphabet_size * 2);
47 
48  for(size_t i = 0; i < alphabet_size; i++) {
49  DVLOG(2)
50  << "Char " << map_from_effective[i]
51  << " : " << size_t(counts[map_from_effective[i]])
52  << "\t";
53  A[alphabet_size + i] = counts[map_from_effective[i]];
54  A[i] = alphabet_size + i;
55  }
56 
57  // build a minHeap of A[0..size)
58  auto comp = [&A] (const size_t a, const size_t b) -> bool {
59  return A[a] > A[b];
60  };
61  std::make_heap(&A[0], &A[alphabet_size], comp);
62  DCHECK_LE(A[A[0]], A[A[1]]);
63 
64  DVLOG(2) << "A: " << vec_to_debug_string(A) << "\t";
65 
66  size_t h = alphabet_size - 1;
67  //builds the Huffman tree
68 
69  while(h > 0) {
70  std::pop_heap(A.begin(), A.begin() + h + 1, comp);
71  const size_t m1 = A[h]; // first element from the heap (already popped)
72  --h;
73 
74  std::pop_heap(A.begin(), A.begin() + h + 1, comp);
75  const size_t m2 = A[h]; //second min count
76  DCHECK_GT(m1, h);
77  DCHECK_GT(m2, h);
78 
79  A[h + 1] = A[m1] + A[m2]; // create a new parent node
80  A[h] = h + 1;
81  A[m1] = A[m2] = h + 1; //parent pointer
82  std::push_heap(A.begin(), A.begin() + h + 1, comp);
83  }
84 
85  A[1] = 0;
86  for(size_t i = 2; i < 2 * alphabet_size; ++i) {
87  A[i] = A[A[i]]+ 1;
88  }
89 
90  Codelengths codelengths;
91  codelengths.reserve(alphabet_size);
92  codelengths.resize(alphabet_size);
93  for (size_t i = 0; i < alphabet_size; i++) {
94  DCHECK_LE(A[alphabet_size + i], 64); // the latter representation allows only codewords of length at most 64 bits
95  codelengths[i] = A[alphabet_size+i];
96  DVLOG(2)
97  << "Char " << map_from_effective[i]
98  << " : " << codelengths[i]
99  << "\t";
100  }
101 
102  return codelengths;
103 
104  }
105 
106  inline static Numl gen_numl(const OrderedCodelengths& ordered_codelengths, const size_t alphabet_size, const uint8_t longest) {
107  DCHECK_EQ(longest, *std::max_element(ordered_codelengths.begin(),
108  ordered_codelengths.end()));
109  DCHECK_GT(longest, 0);
110 
111  // numl : length l -> #codewords of length l
112  Numl numl;
113  numl.reserve(longest);
114  numl.resize(longest);
115 
116  for (size_t i = 0; i < alphabet_size; ++i) {
117  DCHECK_LE(ordered_codelengths[i], longest);
118  DCHECK_GT(ordered_codelengths[i], 0);
119  ++numl[ordered_codelengths[i] - 1];
120  }
121 
122  return numl;
123  }
124 
125  inline static Firstcode gen_first_codes(const Numl& numl, const size_t longest) {
126  Firstcode firstcode;
127  firstcode.reserve(longest);
128  firstcode.resize(longest);
129 
130  firstcode[longest - 1] = 0;
131  for(size_t i = longest-1; i > 0; --i) {
132  firstcode[i - 1] = (firstcode[i] + numl[i]) / 2;
133  }
134  return firstcode;
135  }
136 
137  inline static Codewords gen_codewords(const OrderedCodelengths& ordered_codelengths,
138  const size_t alphabet_size,
139  const Numl& numl,
140  const uint8_t longest) {
141  DCHECK_EQ(longest, *std::max_element(ordered_codelengths.begin(),
142  ordered_codelengths.end()));
143  DCHECK_GT(longest,0);
144 
145  auto firstcode = gen_first_codes(numl, longest);
146 
147  Codewords codewords;
148  codewords.reserve(alphabet_size);
149  codewords.resize(alphabet_size);
150  for(size_t i = 0; i < alphabet_size; ++i) {
151  DCHECK_LE(ordered_codelengths[i], longest);
152  DCHECK_GT(ordered_codelengths[i], 0);
153  codewords[i] = firstcode[ordered_codelengths[i]-1]++;
154  DVLOG(2) << "codeword " << i << " : " << std::bitset<64>(codewords[i])
155  << ", length " << ordered_codelengths[i]
156  << "\t";
157  }
158  return codewords;
159  }
160 
161  inline static OrderedMapToEffective gen_ordered_map_to_effective(
162  const OrderedMapFromEffective& ordered_map_from_effective,
163  const size_t alphabet_size,
164  const size_t real_alphabet_size)
165  {
166 
167  size_t max = real_alphabet_size;
168  // TODO Debug
169  //max = std::max(real_alphabet_size, ULITERAL_MAX);
170  OrderedMapToEffective map_to_effective;
171  map_to_effective.reserve(max);
172  map_to_effective.resize(max, size_t(-1));
173 
174  for(size_t i = 0; i < alphabet_size; ++i) {
175  map_to_effective[ordered_map_from_effective[i]] = i;
176  }
177 
178  DVLOG(2) << "ordered_map_from_effective : "
179  << vec_to_debug_string(ordered_map_from_effective)
180  << "\t";
181  DVLOG(2) << "map_to_effective : "
182  << vec_to_debug_string(map_to_effective)
183  << "\t";
184 
185  return map_to_effective;
186  }
187 
188 
189  struct Huffmantable {
193  size_t m_longest;
195  };
196 
200 
201  inline void gen_huffmantable(Counts&& counts, size_t alphabet_size) {
202  DCHECK_GT(alphabet_size, 0);
203 
204  OrderedCodelengths ordered_codelengths;
205  OrderedMapFromEffective ordered_map_from_effective;
206  size_t longest;
207 
208  {
209  auto map_from_effective = gen_effective_alphabet(counts, alphabet_size);
210  auto codelengths = gen_codelengths(std::move(counts),
211  map_from_effective,
212  alphabet_size);
213  std::vector<size_t> codeword_order;
214  codeword_order.reserve(alphabet_size);
215  codeword_order.resize(alphabet_size);
216  std::iota(codeword_order.begin(), codeword_order.end(), 0);
217  std::sort(codeword_order.begin(), codeword_order.end(),
218  [&] (size_t i, size_t j) -> bool {
219  return codelengths[i] < codelengths[j];
220  });
221  longest = *std::max_element(codelengths.begin(),
222  codelengths.end());
223 
224  ordered_codelengths.reserve(alphabet_size);
225  ordered_codelengths.resize(alphabet_size);
226 
227  ordered_map_from_effective.reserve(alphabet_size);
228  ordered_map_from_effective.resize(alphabet_size);
229 
230  for(size_t i = 0; i < alphabet_size; ++i) {
231  ordered_codelengths[i] = codelengths[codeword_order[i]];
232  ordered_map_from_effective[i] = map_from_effective[codeword_order[i]];
233  }
234  }
235 
236  auto numl = gen_numl(ordered_codelengths, alphabet_size, longest);
237  auto codewords = gen_codewords(ordered_codelengths, alphabet_size, numl, longest);
238 
239  m_ordered_map_from_effective = std::move(ordered_map_from_effective);
240  m_codewords = std::move(codewords);
241  m_ordered_codelengths = std::move(ordered_codelengths);
242  m_effective_alphabet_size = std::move(alphabet_size);
243  m_numl = std::move(numl);
244  m_longest = std::move(longest);
245  }
246 
247  template<typename input_t>
248  ExtendedHuffmantable(const input_t& inp) {
249  size_t max = 0;
250  for (size_t i = 0; i < inp.size(); i++) {
251  max = std::max(max, inp[i]);
252  }
253  size_t alphabet_size = max + 1;
254 
255  Counts counts;
256  counts.reserve(alphabet_size);
257  counts.resize(alphabet_size);
258  for (size_t i = 0; i < inp.size(); i++) {
259  counts[inp[i]]++;
260  }
261 
262  m_real_alphabet_size = alphabet_size;
263 
264  size_t shrunk_alphabet_size = 0;
265  for (size_t i = 0; i < counts.size(); i++) {
266  shrunk_alphabet_size += (counts[i] != 0);
267  }
268 
269  if (shrunk_alphabet_size <= 1) {
270  m_effective_alphabet_size = shrunk_alphabet_size;
271  } else {
272  gen_huffmantable(std::move(counts), shrunk_alphabet_size);
273  }
274  }
275  };
276 
277  inline static void huffman_encode(
278  size_t input,
280  const OrderedCodelengths& ordered_codelengths,
281  const OrderedMapToEffective& ordered_map_to_effective,
282  const size_t alphabet_size,
283  const Codewords& codewords)
284  {
285  const size_t char_value = input;
286  const size_t effective_char = ordered_map_to_effective[char_value];
287  DCHECK_LT(effective_char, alphabet_size);
288  os.write_int(codewords[effective_char], ordered_codelengths[effective_char]);
289  DVLOG(2) << " codeword "
290  << codewords[effective_char]
291  << " length " << int(ordered_codelengths[effective_char])
292  << "\t";
293  }
294 
295  // TODO: Only write one huffman table for both D arrays
296  // -> just treat both d arrays as one array
297  inline static void huffmantable_encode(BitOStream& os, const Huffmantable& table) {
300  for(size_t i = 0; i < table.m_longest; ++i) {
301  os.write_compressed_int(table.m_numl[i]);
302  }
304  for(size_t i = 0; i < table.m_effective_alphabet_size; ++i) {
305  // TODO: variable bit width here
306  os.write_int<size_t>(table.m_ordered_map_from_effective[i], bits_for(table.m_real_alphabet_size - 1));
307  }
308  }
309 
310 
311  using PrefixSumLengths = std::vector<size_t>;
312 
313  inline static Huffmantable huffmantable_decode(tdc::io::BitIStream& in) {
314  const size_t real_size = in.read_compressed_int<size_t>();
315  const size_t longest = in.read_compressed_int<size_t>();
316 
317  Numl numl;
318  numl.reserve(longest);
319  numl.resize(longest);
320 
321  for(size_t i = 0; i < longest; ++i) {
322  numl[i] = in.read_compressed_int<size_t>();
323  }
324 
325  const size_t alphabet_size = in.read_compressed_int<size_t>();
326  OrderedMapFromEffective ordered_map_from_effective;
327  ordered_map_from_effective.reserve(alphabet_size);
328  ordered_map_from_effective.resize(alphabet_size);
329  for(size_t i = 0; i < alphabet_size; ++i) {
330  ordered_map_from_effective[i] = in.read_int<size_t>(bits_for(real_size - 1));
331  }
332 
333  return {
334  std::move(ordered_map_from_effective),
335  alphabet_size,
336  std::move(numl),
337  longest,
338  real_size,
339  };
340  }
341 
342  inline static OrderedCodelengths gen_ordered_codelengths(
343  const size_t alphabet_size, const Numl& numl, const size_t longest)
344  {
345  OrderedCodelengths ordered_codelengths;
346  ordered_codelengths.reserve(alphabet_size);
347  ordered_codelengths.resize(alphabet_size);
348  for(size_t i = 0, k = 0; i < longest; ++i) {
349  for(size_t j = 0; j < numl[i]; ++j) {
350  DCHECK_LT(k, alphabet_size);
351  ordered_codelengths[k++] = i + 1;
352  }
353  }
354  return ordered_codelengths;
355  }
356 
357  inline static PrefixSumLengths gen_prefix_sum_lengths(
358  OrderedCodelengths ordered_codelengths,
359  size_t alphabet_size,
360  size_t longest)
361  {
362  PrefixSumLengths prefix_sum_lengths;
363  prefix_sum_lengths.reserve(longest);
364  prefix_sum_lengths.resize(longest);
365 
366  /*
367  std::fill(prefix_sum_lengths.begin(),
368  prefix_sum_lengths.end(),
369  std::numeric_limits<size_t>::max());*/
370 
371  prefix_sum_lengths[ordered_codelengths[0] - 1] = 0;
372  for(size_t i = 1; i < alphabet_size; ++i) {
373  if(ordered_codelengths[i - 1] < ordered_codelengths[i]) {
374  prefix_sum_lengths[ordered_codelengths[i] - 1] = i;
375  }
376  }
377  DVLOG(2) << "ordered_codelengths : "
378  << vec_to_debug_string(ordered_codelengths)
379  << "\t";
380  DVLOG(2) << "prefix_sum_lengths : "
381  << vec_to_debug_string(prefix_sum_lengths)
382  << "\t";
383  return prefix_sum_lengths;
384  }
385 
386  inline static size_t huffman_decode(
388  const OrderedMapFromEffective& ordered_map_from_effective,
389  const PrefixSumLengths& prefix_sum_lengths,
390  const Firstcode& firstcodes)
391  {
392  DCHECK(!is.eof());
393  size_t value = 0;
394  size_t length = 0;
395  do {
396  DCHECK(!is.eof());
397  value = (value << 1) + is.read_bit();
398  ++length;
399  } while(value < firstcodes[length - 1]);
400 
401  DVLOG(2) << " codeword " << value << " length " << length << "\t";
402  --length;
403  // DCHECK_LT(prefix_sum_lengths[length]+ (value - firstcodes[length]), alphabet_size);
404  return ordered_map_from_effective[prefix_sum_lengths[length]+ (value - firstcodes[length]) ];
405  }
406  }
407 
408  using namespace huff2;
409 
412  std::shared_ptr<BitOStream> m_out;
413  ExtendedHuffmantable m_table;
414  OrderedMapToEffective m_ordered_map_to_effective;
415  public:
416  template<typename input_t>
417  HuffmanEncoder(const std::shared_ptr<BitOStream>& out,
418  const input_t& literals):
419  m_out(out),
420  m_table(literals),
421  m_ordered_map_to_effective {
422  m_table.m_codewords.empty()
424  : gen_ordered_map_to_effective(m_table.m_ordered_map_from_effective,
426  m_table.m_real_alphabet_size)
427  }
428  {
429  if (m_table.m_effective_alphabet_size <= 1) {
430  m_out->write_bit(0);
431  } else {
432  m_out->write_bit(1);
433  huffmantable_encode(*m_out, m_table);
434  }
435  }
436 
437  inline void encode(size_t v) {
438  if (m_table.m_effective_alphabet_size == 1) {
439  m_out->write_int<size_t>(v);
440  } else {
441  huffman_encode(v, *m_out, m_table.m_ordered_codelengths, m_ordered_map_to_effective, m_table.m_effective_alphabet_size, m_table.m_codewords);
442  }
443  }
444  };
445 
448  std::shared_ptr<BitIStream> m_in;
450  PrefixSumLengths m_prefix_sum_lengths;
451  Firstcode m_firstcodes;
452 
453  public:
454  HuffmanDecoder(const std::shared_ptr<BitIStream>& in):
455  m_in(in)
456  {
457  if(!m_in->read_bit()) {
458  return;
459  }
460  Huffmantable table = huffmantable_decode(*m_in);
461  m_ordered_map_from_effective = std::move(table.m_ordered_map_from_effective);
462  OrderedCodelengths ordered_codelengths = gen_ordered_codelengths(
464  table.m_numl,
465  table.m_longest);
466  m_prefix_sum_lengths = gen_prefix_sum_lengths(
467  std::move(ordered_codelengths),
469  table.m_longest);
470  m_firstcodes = gen_first_codes(table.m_numl, table.m_longest);
471  }
472 
473  inline size_t decode() {
474  if(m_ordered_map_from_effective.empty()) {
475  return m_in->read_int<size_t>();
476  }
477  return huffman_decode(*m_in,
478  m_ordered_map_from_effective,
479  m_prefix_sum_lengths,
480  m_firstcodes);
481  }
482  };
483 }}
std::vector< size_t > Codelengths
T read_compressed_int(size_t b=7)
Reads a compressed integer from the input.
Definition: BitIStream.hpp:175
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
constexpr uint_fast8_t bits_for(size_t n)
Computes the number of bits required to store the given integer value.
HuffmanEncoder(const std::shared_ptr< BitOStream > &out, const input_t &literals)
std::vector< size_t > Firstcode
std::vector< size_t > OrderedCodelengths
std::string vec_to_debug_string(const T &s, size_t indent=0)
Builds the string representation of a vector of byte values, sorrounded by square brackets ([ and ])...
void gen_huffmantable(Counts &&counts, size_t alphabet_size)
Wrapper for input streams that provides bitwise reading functionality.
Definition: BitIStream.hpp:16
OrderedMapFromEffective m_ordered_map_from_effective
std::vector< size_t > Counts
std::vector< size_t > PrefixSumLengths
std::vector< size_t > Numl
Encodes data to an ASCII character stream.
std::vector< size_t > MapFromEffective
Wrapper for output streams that provides bitwise writing functionality.
Definition: BitOStream.hpp:17
HuffmanDecoder(const std::shared_ptr< BitIStream > &in)
std::vector< size_t > Codewords
void write_compressed_int(T v, size_t b=7)
Writes a compressed integer to the input.
Definition: BitOStream.hpp:151
std::vector< size_t > OrderedMapFromEffective
void write_int(T value, size_t bits=sizeof(T) *CHAR_BIT)
Writes the bit representation of an integer in MSB first order to the output.
Definition: BitOStream.hpp:98
std::vector< size_t > OrderedMapToEffective
bool eof() const
TODO document.
Definition: BitIStream.hpp:191
uint8_t read_bit()
Reads the next single bit from the input.
Definition: BitIStream.hpp:91
T read_int(size_t amount=sizeof(T) *CHAR_BIT)
Reads the integer value of the next amount bits in MSB first order.
Definition: BitIStream.hpp:119
Decodes data from an Huffman character stream.