24 typedef typename std::make_unsigned<typename T::value_type>::type value_type;
25 constexpr
size_t max_literal = std::numeric_limits<value_type>::max();
29 for(
const auto& c : input) {
30 DCHECK_LT(static_cast<value_type>(c), max_literal+1);
31 DCHECK_LT(C[static_cast<value_type>(c)], std::numeric_limits<len_compact_t>::max());
32 ++C[
static_cast<value_type
>(c)];
41 while(input.has_next()) {
44 DCHECK_LT(C[static_cast<uliteral_t>(c)], std::numeric_limits<len_compact_t>::max());
66 if(C[i] == 0u)
continue;
67 DCHECK_LT(j, alphabet_size);
68 map_from_effective[j++] = i;
70 DCHECK_EQ(j, alphabet_size);
71 for(
size_t i = 0; i < alphabet_size; ++i) {
74 DCHECK_NE(C[map_from_effective[i]], 0u);
76 return map_from_effective;
88 inline uint8_t* gen_codelengths(
const len_compact_t*
const C,
const uliteral_t*
const map_from_effective,
const size_t alphabet_size) {
89 size_t A[2*alphabet_size];
90 for(
size_t i=0; i < alphabet_size; i++) {
91 DVLOG(2) <<
"Char " << map_from_effective[i] <<
" : " << size_t(C[map_from_effective[i]]);
92 A[alphabet_size+i] = C[map_from_effective[i]];
93 A[i] = alphabet_size+i;
96 auto comp = [&A] (
const size_t a,
const size_t b) ->
bool {
return A[a] > A[b];};
97 std::make_heap(&A[0], &A[alphabet_size], comp);
98 DCHECK_LE(A[A[0]], A[A[1]]);
99 assert_permutation_offset<size_t*>(A,alphabet_size,alphabet_size);
104 size_t h = alphabet_size-1;
107 std::pop_heap(A, A+h+1, comp);
108 const size_t m1 = A[h];
111 std::pop_heap(A, A+h+1, comp);
112 const size_t m2 = A[h];
113 DCHECK_GT(m1,h); DCHECK_GT(m2,h);
115 A[h+1] = A[m1] + A[m2];
117 A[m1] = A[m2] = h + 1;
118 std::push_heap(A, A+h+1, comp);
122 for(
size_t i = 2; i < 2*alphabet_size; ++i) {
136 uint8_t* codelengths {
new uint8_t[alphabet_size] };
137 for (
size_t i=0; i < alphabet_size; i++) {
138 DCHECK_LE(A[alphabet_size+i], 64);
139 codelengths[i] = A[alphabet_size+i];
140 DVLOG(2) <<
"Char " << map_from_effective[i] <<
" : " << codelengths[i];
147 for(
size_t i=0; i < alphabet_size; ++i) {
148 for(
size_t j=i+1; j < alphabet_size; ++j) {
149 if(codelengths[i] > codelengths[j]) {
150 DCHECK_LE(C[map_from_effective[i]], C[map_from_effective[j]]);
152 else if(codelengths[i] < codelengths[j]) {
153 DCHECK_GE(C[map_from_effective[i]], C[map_from_effective[j]]);
157 const size_t& max_el = *std::max_element(A+alphabet_size,A+2*alphabet_size);
158 DCHECK_LT(max_el,63);
160 for (
size_t i = 0; i < alphabet_size; ++i)
162 sum += 2ULL<<(max_el - A[alphabet_size+i]);
164 DCHECK_EQ(sum, 2ULL<<max_el);
173 inline uliteral_t* gen_numl(
const uint8_t*
const ordered_codelengths,
const size_t alphabet_size,
const uint8_t longest) {
174 DCHECK_EQ(longest, *std::max_element(ordered_codelengths, ordered_codelengths+alphabet_size));
175 DCHECK_GT(longest,0);
179 std::memset(numl,0,
sizeof(
uliteral_t)*longest);
181 for (
size_t i = 0; i < alphabet_size; ++i) {
182 DCHECK_LE(ordered_codelengths[i], longest);
183 DCHECK_GT(ordered_codelengths[i], 0);
184 ++numl[ordered_codelengths[i]-1];
192 inline size_t* gen_first_codes(
const uliteral_t*
const numl,
const size_t longest) {
193 size_t* firstcode =
new size_t[longest];
194 firstcode[longest-1] = 0;
195 for(
size_t i = longest-1; i > 0; --i)
196 firstcode[i-1] = (firstcode[i] + numl[i])/2;
202 inline size_t* gen_codewords(
const uint8_t*
const ordered_codelengths,
const size_t alphabet_size,
const uliteral_t*
const numl,
const uint8_t longest) {
203 DCHECK_EQ(longest, *std::max_element(ordered_codelengths, ordered_codelengths+alphabet_size));
204 DCHECK_GT(longest,0);
207 size_t*
const firstcode = gen_first_codes(numl, longest);
209 size_t*
const codewords =
new size_t[alphabet_size];
210 for(
size_t i = 0; i < alphabet_size; ++i) {
211 DCHECK_LE(ordered_codelengths[i], longest);
212 DCHECK_GT(ordered_codelengths[i], 0);
213 codewords[i] = firstcode[ordered_codelengths[i]-1]++;
214 DVLOG(2) <<
"codeword " << i <<
" : " << std::bitset<64>(codewords[i]) <<
", length " << ordered_codelengths[i];
220 struct huffmantable {
222 const size_t alphabet_size;
228 const uint8_t longest;
231 if(ordered_map_from_effective !=
nullptr)
delete [] ordered_map_from_effective;
232 if(numl !=
nullptr)
delete [] numl;
241 struct extended_huffmantable :
public huffmantable {
242 extended_huffmantable(
243 const uliteral_t*
const _ordered_map_from_effective,
244 const size_t*
const _codewords,
245 const uint8_t*
const _ordered_codelengths,
246 const size_t _alphabet_size,
248 const uint8_t _longest)
249 : huffmantable{_ordered_map_from_effective,_alphabet_size, _numl, _longest},
250 codewords(_codewords),
251 ordered_codelengths(_ordered_codelengths)
253 const size_t*
const codewords;
254 const uint8_t*
const ordered_codelengths;
255 ~extended_huffmantable() {
256 if(codewords !=
nullptr)
delete [] codewords;
257 if(ordered_codelengths !=
nullptr)
delete [] ordered_codelengths;
266 for(
size_t i = 0; i < table.longest; ++i) {
270 for(
size_t i = 0; i < table.alphabet_size; ++i) {
281 for(
size_t i = 0; i < longest; ++i) {
285 uint8_t*
const ordered_map_from_effective {
new uint8_t[alphabet_size] };
286 for(
size_t i = 0; i < alphabet_size; ++i) {
289 return { ordered_map_from_effective, alphabet_size, numl, longest };
294 inline uint8_t* gen_ordered_map_to_effective(
const uint8_t*
const ordered_map_from_effective,
const size_t alphabet_size) {
296 std::memset(map_to_effective, 0xff,
sizeof(map_to_effective)*
sizeof(uint8_t));
297 for(
size_t i = 0; i < alphabet_size; ++i) {
298 map_to_effective[ordered_map_from_effective[i]] = i;
300 DVLOG(2) <<
"ordered_map_from_effective : " <<
arr_to_debug_string(ordered_map_from_effective, alphabet_size);
302 return map_to_effective;
309 inline void huffman_encode(
312 const uint8_t*
const ordered_codelengths,
313 const uint8_t*
const ordered_map_to_effective,
314 const size_t alphabet_size,
315 const size_t*
const codewords
317 const size_t char_value =
static_cast<uliteral_t>(input);
318 const uint8_t& effective_char = ordered_map_to_effective[char_value];
319 DCHECK_LT(effective_char, alphabet_size);
320 os.
write_int(codewords[effective_char], ordered_codelengths[effective_char]);
321 DVLOG(2) <<
" codeword " << codewords[effective_char] <<
" length " << int(ordered_codelengths[effective_char]);
328 inline void huffman_encode(
331 const size_t input_length,
332 const uint8_t*
const ordered_map_from_effective,
333 const uint8_t*
const ordered_codelengths,
334 const size_t alphabet_size,
335 const size_t*
const codewords
337 DCHECK_GT(input_length, 0);
339 const uint8_t*
const ordered_map_to_effective { gen_ordered_map_to_effective(ordered_map_from_effective, alphabet_size) };
344 while(input.get(c)) {
345 huffman_encode(c, os, ordered_codelengths, ordered_map_to_effective, alphabet_size, codewords);
348 delete [] ordered_map_to_effective;
356 inline std::unique_ptr<size_t[]> gen_prefix_sum_lengths(
357 const uint8_t*
const ordered_codelengths,
358 const size_t alphabet_size,
359 const uint8_t longest) {
360 auto prefix_sum_lengths = std::make_unique<size_t[]>(longest);
363 prefix_sum_lengths.get(),
364 prefix_sum_lengths.get() + longest,
365 std::numeric_limits<size_t>::max()
368 prefix_sum_lengths[ordered_codelengths[0]-1] = 0;
369 for(
size_t i = 1; i < alphabet_size; ++i) {
370 if(ordered_codelengths[i-1] < ordered_codelengths[i])
371 prefix_sum_lengths[ordered_codelengths[i]-1] = i;
373 DVLOG(2) <<
"ordered_codelengths : " <<
arr_to_debug_string(ordered_codelengths, alphabet_size);
374 DVLOG(2) <<
"prefix_sum_lengths : " <<
arr_to_debug_string(prefix_sum_lengths.get(), longest);
375 return prefix_sum_lengths;
379 const uliteral_t*
const ordered_map_from_effective,
380 const size_t*
const prefix_sum_lengths,
381 const size_t*
const firstcodes
390 }
while(value < firstcodes[length-1]);
391 DVLOG(2) <<
" codeword " << value <<
" length " << length;
394 return ordered_map_from_effective[prefix_sum_lengths[length]+ (value - firstcodes[length]) ];
400 inline void huffman_decode(
402 std::ostream& output,
403 const uliteral_t*
const ordered_map_from_effective,
404 const uint8_t*
const ordered_codelengths,
405 const size_t alphabet_size,
407 const uint8_t longest) {
409 std::unique_ptr<size_t const[]>
const prefix_sum_lengths { gen_prefix_sum_lengths(ordered_codelengths, alphabet_size, longest) };
412 DCHECK_GT(text_length, 0);
413 const size_t*
const firstcodes = gen_first_codes(numl, longest);
415 size_t num_chars_read = 0;
417 output << huffman_decode(is, ordered_map_from_effective, prefix_sum_lengths.get(), firstcodes);
419 if(num_chars_read == text_length)
break;
421 delete [] firstcodes;
426 inline uint8_t* gen_ordered_codelength(
const size_t alphabet_size,
const uliteral_t*
const numl,
const size_t longest) {
427 uint8_t* ordered_codelengths {
new uint8_t[alphabet_size] };
428 for(
size_t i = 0,k=0; i < longest; ++i) {
429 for(
size_t j = 0; j < numl[i]; ++j) {
430 DCHECK_LT(k, alphabet_size);
431 ordered_codelengths[k++] = i+1;
434 return ordered_codelengths;
442 inline extended_huffmantable gen_huffmantable(
const len_compact_t*
const C) {
443 const size_t alphabet_size = effective_alphabet_size(C);
444 DCHECK_GT(alphabet_size,0);
447 const uliteral_t*
const mapFromEffective = gen_effective_alphabet(C, alphabet_size);
449 const uint8_t*
const codelengths = gen_codelengths(C, mapFromEffective, alphabet_size);
453 size_t*
const codeword_order =
new size_t[alphabet_size];
454 std::iota(codeword_order,codeword_order+alphabet_size,0);
455 std::sort(codeword_order,codeword_order+alphabet_size, [&] (
const uliteral_t& i,
const uliteral_t& j) {
return codelengths[i] < codelengths[j]; });
457 const uint8_t longest = *std::max_element(codelengths, codelengths+alphabet_size);
460 uint8_t* ordered_codelengths {
new uint8_t[alphabet_size] };
462 for(
size_t i = 0; i < alphabet_size; ++i) {
463 ordered_codelengths[i] = codelengths[codeword_order[i]];
464 ordered_map_from_effective[i] = mapFromEffective[codeword_order[i]];
466 delete [] codelengths;
467 delete [] mapFromEffective;
468 delete [] codeword_order;
470 const uliteral_t*
const numl = gen_numl(ordered_codelengths, alphabet_size, longest);
471 const size_t*
const codewords = gen_codewords(ordered_codelengths, alphabet_size, numl, longest);
473 return { ordered_map_from_effective, codewords, ordered_codelengths, alphabet_size, numl, longest };
476 inline extended_huffmantable gen_huffmantable(
const std::string& text) {
478 return gen_huffmantable(C);
485 extended_huffmantable table = gen_huffmantable(C);
486 huffmantable_encode(bit_os, table);
487 io::ViewStream view_stream(iview);
488 auto& is = view_stream.stream();
489 huffman_encode(is, bit_os, iview.size(), table.ordered_map_from_effective, table.ordered_codelengths, table.alphabet_size, table.codewords);
494 huffmantable table = huffmantable_decode(bit_is);
495 const uint8_t*
const ordered_codelengths = gen_ordered_codelength(table.alphabet_size, table.numl, table.longest);
500 table.ordered_map_from_effective,
506 delete [] ordered_codelengths;
515 Meta m(
"coder",
"huff",
"Canonical Huffman Coder");
522 const huff::extended_huffmantable m_table;
523 const uint8_t*
const ordered_map_to_effective;
525 template<
typename literals_t>
526 inline Encoder(
Env&& env, std::shared_ptr<BitOStream> out, literals_t&& literals)
529 if(
tdc_likely(!literals.has_next()))
return huff::extended_huffmantable {
nullptr,
nullptr,
nullptr, 0,
nullptr, 0 };
530 const len_compact_t*
const C = huff::count_alphabet_literals(std::move(literals));
531 const len_t alphabet_size = huff::effective_alphabet_size(C);
534 return huff::extended_huffmantable {
nullptr,
nullptr,
nullptr, 1,
nullptr, 0 };
536 return huff::gen_huffmantable(C);
538 , ordered_map_to_effective { m_table.codewords ==
nullptr ? nullptr : huff::gen_ordered_map_to_effective(m_table.ordered_map_from_effective, m_table.alphabet_size) }
545 huff::huffmantable_encode(*m_out, m_table);
550 if(
tdc_likely(ordered_map_to_effective !=
nullptr)) {
551 delete [] ordered_map_to_effective;
555 template<
typename literals_t>
562 template<
typename value_t>
564 DCHECK_NE(m_table.alphabet_size,0);
566 m_out->write_int(static_cast<uliteral_t>(v),8*
sizeof(
uliteral_t));
568 huff::huffman_encode(v, *m_out, m_table.ordered_codelengths, ordered_map_to_effective, m_table.alphabet_size, m_table.codewords);
574 std::unique_ptr<size_t const[]> prefix_sum_lengths;
575 const size_t* firstcodes;
578 if(
tdc_likely(ordered_map_from_effective !=
nullptr)) {
579 delete [] ordered_map_from_effective;
580 delete [] firstcodes;
588 ordered_map_from_effective =
nullptr;
591 huff::huffmantable table(huff::huffmantable_decode(*m_in) );
592 ordered_map_from_effective = table.ordered_map_from_effective;
593 table.ordered_map_from_effective =
nullptr;
594 const uint8_t*
const ordered_codelengths { huff::gen_ordered_codelength(table.alphabet_size, table.numl, table.longest) };
595 prefix_sum_lengths = huff::gen_prefix_sum_lengths(ordered_codelengths, table.alphabet_size, table.longest);
596 delete [] ordered_codelengths;
597 firstcodes = huff::gen_first_codes(table.numl, table.longest);
606 template<
typename value_t>
610 return huff::huffman_decode(*m_in, ordered_map_from_effective, prefix_sum_lengths.get(), firstcodes);
T read_compressed_int(size_t b=7)
Reads a compressed integer from the input.
Contains the text compression and encoding framework.
Decoder(Env &&env, Input &in)
Represents the range of valid tdc::uliteral_t values.
#define tdc_unlikely(x)
Provides a hint to the compiler that x is expected to resolve to false.
uint8_t uliteral_t
Type to represent signed single literals.
Wrapper for input streams that provides bitwise reading functionality.
Encoder(Env &&env, std::shared_ptr< BitOStream > out, literals_t &&literals)
value_t decode(const LiteralRange &)
void encode(value_t v, const LiteralRange &)
Encoder(Env &&env, Output &out, literals_t &&literals)
constexpr size_t ULITERAL_MAX
The maximum value of uliteral_t.
OutputStream as_stream() const
Creates a stream that allows for character-wise output.
#define tdc_likely(x)
Provides a hint to the compiler that x is expected to resolve to true.
Wrapper for output streams that provides bitwise writing functionality.
An abstraction layer for algorithm output.
uint32_t len_compact_t
Type to represent an bit-compact length value.
fast_t< len_compact_t > len_t
Type to represent an length value.
void write_compressed_int(T v, size_t b=7)
Writes a compressed integer to the input.
Decoder(Env &&env, std::shared_ptr< BitIStream > in)
std::string arr_to_debug_string(const T *s, size_t length)
Builds the string representation of an array of printable values, sorrounded by square brackets ([ an...
value_t decode(const Range &r)
Decodes an arbitrary-range integer value.
void write_int(T value, size_t bits=sizeof(T) *CHAR_BIT)
Writes the bit representation of an integer in MSB first order to the output.
bool eof() const
TODO document.
uint8_t read_bit()
Reads the next single bit from the input.
#define IF_PARANOID(x)
x is compiled only in debug builds and when the PARANOID macro is defined.
Local environment for a compression/encoding/decompression call.
void encode(value_t v, const Range &r)
Encodes an arbitrary-range integer value.
Interface for algorithms.
T read_int(size_t amount=sizeof(T) *CHAR_BIT)
Reads the integer value of the next amount bits in MSB first order.