tudocomp
– The TU Dortmund Compression Framework
LZ78Compressor.hpp
Go to the documentation of this file.
1 #pragma once
2 
5 #include <tudocomp/Range.hpp>
6 
8 
9 // For default params
12 
13 namespace tdc {
14 
15  namespace lz78 {
16  class Decompressor {
17  std::vector<lz78::factorid_t> indices;
18  std::vector<uliteral_t> literals;
19 
20  public:
21  inline void decompress(lz78::factorid_t index, uliteral_t literal, std::ostream& out) {
22  indices.push_back(index);
23  literals.push_back(literal);
24  std::vector<uliteral_t> buffer;
25 
26  while(index != 0) {
27  buffer.push_back(literal);
28  literal = literals[index - 1];
29  index = indices[index - 1];
30  }
31 
32  out << literal;
33  for(size_t i = buffer.size(); i > 0; i--) {
34  out << buffer[i - 1];
35  }
36  }
37 
38  };
39  }//ns
40 
41 
42 template <typename coder_t, typename dict_t>
43 class LZ78Compressor: public Compressor {
44 private:
45  using node_t = typename dict_t::node_t;
46 
48  const lz78::factorid_t m_dict_max_size {0};
49 
50 public:
51  inline LZ78Compressor(Env&& env):
52  Compressor(std::move(env)),
53  m_dict_max_size(env.option("dict_size").as_integer())
54  {}
55 
56  inline static Meta meta() {
57  Meta m("compressor", "lz78", "Lempel-Ziv 78\n\n" LZ78_DICT_SIZE_DESC);
58  m.option("coder").templated<coder_t, BitCoder>("coder");
59  m.option("lz78trie").templated<dict_t, lz78::TernaryTrie>("lz78trie");
60  m.option("dict_size").dynamic(0);
61  return m;
62  }
63 
64  virtual void compress(Input& input, Output& out) override {
65  const size_t n = input.size();
66  const size_t reserved_size = isqrt(n)*2;
67  auto is = input.as_stream();
68 
69  // Stats
70  StatPhase phase1("Lz78 compression");
71 
72  IF_STATS(size_t stat_dictionary_resets = 0);
73  IF_STATS(size_t stat_dict_counter_at_last_reset = 0);
74  IF_STATS(size_t stat_factor_count = 0);
75  size_t factor_count = 0;
76 
77  size_t remaining_characters = n; // position in the text
78  dict_t dict(env().env_for_option("lz78trie"), n, remaining_characters, reserved_size);
79 
80  auto reset_dict = [&dict] () {
81  dict.clear();
82  node_t node = dict.add_rootnode(0);
83  DCHECK_EQ(node.id(), dict.size() - 1);
84  DCHECK_EQ(node.id(), 0);
85  };
86  reset_dict();
87 
88  typename coder_t::Encoder coder(env().env_for_option("coder"), out, NoLiterals());
89 
90  // Define ranges
91  node_t node = dict.get_rootnode(0);
92  node_t parent = node; // parent of node, needed for the last factor
93  DCHECK_EQ(node.id(), 0);
94  DCHECK_EQ(parent.id(), 0);
95 
96  char c;
97  while(is.get(c)) {
98  --remaining_characters;
99  node_t child = dict.find_or_insert(node, static_cast<uliteral_t>(c));
100  if(child.is_new()) {
101  coder.encode(node.id(), Range(factor_count));
102  coder.encode(static_cast<uliteral_t>(c), literal_r);
103  factor_count++;
104  IF_STATS(stat_factor_count++);
105  parent = node = dict.get_rootnode(0); // return to the root
106  DCHECK_EQ(node.id(), 0);
107  DCHECK_EQ(parent.id(), 0);
108  DCHECK_EQ(factor_count+1, dict.size());
109  // dictionary's maximum size was reached
110  if(tdc_unlikely(dict.size() == m_dict_max_size)) { // if m_dict_max_size == 0 this will never happen
111  DCHECK(false); // broken right now
112  reset_dict();
113  factor_count = 0; //coder.dictionary_reset();
114  IF_STATS(stat_dictionary_resets++);
115  IF_STATS(stat_dict_counter_at_last_reset = m_dict_max_size);
116  }
117  } else { // traverse further
118  parent = node;
119  node = child;
120  }
121  }
122 
123  // take care of left-overs. We do not assume that the stream has a sentinel
124  if(node.id() != 0) {
125  coder.encode(parent.id(), Range(factor_count));
126  coder.encode(c, literal_r);
127 // // TODO: this check only works if the trie is not the MonteCarloTrie !
128 // DCHECK_EQ(dict.find_or_insert(parent, static_cast<uliteral_t>(c)).id(), node.id());
129  factor_count++;
130  IF_STATS(stat_factor_count++);
131  }
132 
133  IF_STATS(
134  phase1.log_stat("factor_count", stat_factor_count);
135  phase1.log_stat("dictionary_reset_counter",
136  stat_dictionary_resets);
137  phase1.log_stat("max_factor_counter",
138  stat_dict_counter_at_last_reset);
139  )
140  }
141 
142  virtual void decompress(Input& input, Output& output) override final {
143  auto out = output.as_stream();
144  typename coder_t::Decoder decoder(env().env_for_option("coder"), input);
145 
146  lz78::Decompressor decomp;
147  uint64_t factor_count = 0;
148 
149  while (!decoder.eof()) {
150  const lz78::factorid_t index = decoder.template decode<lz78::factorid_t>(Range(factor_count));
151  const uliteral_t chr = decoder.template decode<uliteral_t>(literal_r);
152  decomp.decompress(index, chr, out);
153  factor_count++;
154  }
155 
156  out.flush();
157  }
158 
159 };
160 
161 
162 }//ns
163 
Represents a generic range of positive integers.
Definition: Range.hpp:16
int_t isqrt(int_t num)
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
void decompress(lz78::factorid_t index, uliteral_t literal, std::ostream &out)
virtual void compress(Input &input, Output &out) override
Compress the given input to the given output.
#define tdc_unlikely(x)
Provides a hint to the compiler that x is expected to resolve to false.
Definition: def.hpp:23
uint8_t uliteral_t
Type to represent signed single literals.
Definition: def.hpp:131
Base for data compressors.
Definition: Compressor.hpp:19
Defines data encoding to and decoding from a stream of binary integer representations.
Definition: BitCoder.hpp:13
virtual void decompress(Input &input, Output &output) override final
Decompress the given input to the given output.
InputStream as_stream() const
Creates a stream that allows for character-wise reading of the input.
Definition: Input.hpp:264
Provides access to runtime and memory measurement in statistics phases.
Definition: StatPhase.hpp:44
#define LZ78_DICT_SIZE_DESC
Definition: LZ78Trie.hpp:35
OutputStream as_stream() const
Creates a stream that allows for character-wise output.
An empty literal iterator that yields no literals whatsoever.
Definition: Literal.hpp:37
LZ78Compressor(Env &&env)
Maximum dictionary size before reset, 0 == unlimited.
constexpr auto literal_r
Global predefined reange for literals.
Definition: Range.hpp:111
An abstraction layer for algorithm output.
Definition: Output.hpp:23
#define IF_STATS(x)
x is compiled only when the STATS_DISABLED macro is undefined.
Definition: def.hpp:59
void log_stat(const char *key, const T &value)
Logs a user statistic for this phase.
Definition: StatPhase.hpp:298
void templated(const std::string &accepted_type)
Declares that this option accepts values of the specified Algorithm type T.
Definition: Meta.hpp:93
size_t size() const
Yields the total amount of characters in the input.
Definition: Input.hpp:233
uint32_t factorid_t
Type for the factor indices, bounded by the number of LZ78 trie nodes.
Definition: LZ78Trie.hpp:11
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
Local environment for a compression/encoding/decompression call.
LZ78 Trie Implementation based on Julius Pettersson (MIT/Expat License.) and Juha Nieminen&#39;s work...
Definition: TernaryTrie.hpp:16
void dynamic()
Declares that this option accepts values of a simple type that can be parsed from a string (e...
Definition: Meta.hpp:150
An abstraction layer for algorithm input.
Definition: Input.hpp:37