tudocomp
– The TU Dortmund Compression Framework
coders/ArithmeticCoder.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <sstream>
4 #include <tudocomp/Coder.hpp>
5 #include <iostream>
6 
7 namespace tdc {
8 
16 class ArithmeticCoder : public Algorithm {
17  using ulong = unsigned long;
18 
19  // TODO: Figure out how to correctly replace this with len_t or len_compact_t
20  using len_fixup_t = uint32_t;
21 
22 public:
25  inline static Meta meta() {
26  Meta m("coder", "arithmetic", "Simple range encoding");
27  return m;
28  }
29 
31  ArithmeticCoder() = delete;
33 
35  class Encoder : public tdc::Encoder {
36  private:
37  std::vector<len_fixup_t> C;
38 
39  ulong lower_bound=0;
40  ulong upper_bound=std::numeric_limits<ulong>::max();
41  uliteral_t codebook_size=0;
42 
43  len_t literal_count = 0;
44  len_t literal_counter = 0;
45  ulong min_range=std::numeric_limits<len_fixup_t>::max();
46 
52  template<class T>
53  std::vector<len_fixup_t> count_alphabet_literals(T&& input) {
54  std::vector<len_fixup_t> C;
55  C.resize(ULITERAL_MAX+1, 0);
56 
57  while(input.has_next()) {
58  uliteral_t c = input.next().c;
59  DCHECK_LT(static_cast<uliteral_t>(c), ULITERAL_MAX+1);
60  DCHECK_LT(C[static_cast<uliteral_t>(c)], std::numeric_limits<len_fixup_t>::max());
61  ++C[static_cast<uliteral_t>(c)];
62  }
63 
64  return C;
65  }
66 
72  void build_intervals(std::vector<len_fixup_t> &c) {
73  if(c[0] != 0u) {
74  codebook_size++;
75  }
76  len_t min=std::numeric_limits<len_fixup_t>::max();
77  //calculates difference to the entry before, searches min and counts entries != 0
78  for(ulong i=1; i<=ULITERAL_MAX; i++) {
79  if(c[i]!=0u) {
80  codebook_size++;
81  min=std::min(min, len_t(c[i]));
82  }
83  c[i] = c[i] + c[i-1];
84  }
85  literal_count = c[ULITERAL_MAX-1];
86 
87  //normalize all Intervals
88  for(ulong i=0; i<=ULITERAL_MAX; i++) {
89  c[i] = c[i] / min;
90  }
91  min_range=c[ULITERAL_MAX-1];
92  }
93 
94 
95  template<typename value_t>
96  inline void setNewBounds(value_t v) {
97  ulong range = upper_bound-lower_bound;
98  //check if all characters can be uniquely mapped to range
99  if(range<min_range){
100  writeCode(lower_bound);
101  lower_bound=0;
102  upper_bound=std::numeric_limits<ulong>::max();
103  range = upper_bound-lower_bound;
104  }
105  DCHECK_NE(lower_bound,upper_bound);
106 
107  const ulong literal_count = C[ULITERAL_MAX];
108 
109  //unsure if this is needed
110  const ulong offset_upper = range <= literal_count ? (range*C[(int) v])/literal_count : (range/literal_count)*C[(int) v];
111  upper_bound=lower_bound+offset_upper;
112  if(v != 0) { //first interval starts at zero
113  const ulong offset_lower = range <= literal_count ? (range*C[(int) v-1])/literal_count : (range/literal_count)*C[(int) v-1];
114  lower_bound=lower_bound+offset_lower;
115  }
116 
117  }
118 
119  inline void writeCodebook() {
126 
127  //write count of expected chars
128  m_out->write_int<len_fixup_t>(literal_count);
129 
130  //write codebook size in outstream
131  m_out->write_int<uliteral_t>(codebook_size);
132 
133  if(C[0]!=0u) {
134  m_out->write_int(uliteral_t(0));
135  m_out->write_int(C[0]);
136  }
137  for(ulong i=1; i<=ULITERAL_MAX;i++) {
138  if(C[i]!=C[i-1]) {
139  m_out->write_int((uliteral_t) i);
140  m_out->write_int(C[i]);
141  }
142 
143  }
144  }
145 
146  inline void writeCode(ulong code){
147  m_out->write_int(code);
148  }
149 
150  //write last code-block
151  void postProcessing() {
152  writeCode(lower_bound);
153  //dummy codeblock - needed to read until no more code-blocks
154  m_out->write_int(std::numeric_limits<ulong>::max());
155  }
156 
157  public:
158  template<typename literals_t>
159  inline Encoder(Env&& env, Output& out, literals_t&& literals)
160  : tdc::Encoder(std::move(env), out, literals),
161  C(count_alphabet_literals(std::move(literals))) {
162  build_intervals(C);
163  writeCodebook();
164  }
165 
166  using tdc::Encoder::encode; // default encoding as fallback
167 
168  template<typename value_t>
169  inline void encode(value_t v, const LiteralRange&) {
170  literal_counter++;
171  setNewBounds(v);
172 
173  if(literal_counter==literal_count){
174  postProcessing();
175  }
176  }
177  };
178 
180  class Decoder : public tdc::Decoder {
181  private:
182  std::vector<std::pair<uliteral_t ,int>> literals;
183  std::string decoded;
184  uliteral_t codebook_size;
185  len_t literal_count = 0;
186  len_t literal_counter = 0;
187  ulong literals_read = 0;
188  ulong min_range=std::numeric_limits<ulong>::max();
189 
190  void decode(ulong code) {
191  ulong lower_bound = 0;
192  ulong upper_bound = std::numeric_limits<ulong>::max();
193  std::ostringstream os;
194  ulong interval_parts = literals[codebook_size -1].second;
195  //count of characters in stream
196 
197  ulong range = upper_bound - lower_bound;
198 
199  //stop decoding code-bllock when range is too small or all literals read
200  while(min_range<=range && literal_counter<literal_count) {
201  ulong interval_lower_bound = lower_bound;
202  //search the right interval
203  for(int i = 0; i < codebook_size ; i++) {
204  const std::pair<uliteral_t, int>& pair=literals[i];
205  const ulong offset = range <= interval_parts ? range*pair.second/interval_parts : range/interval_parts*pair.second;
206  upper_bound = lower_bound + offset;
207  if(code < upper_bound) {
208  //character decoded
209  os << pair.first;
210  lower_bound = interval_lower_bound;
211  break;
212  }
213  interval_lower_bound = upper_bound;
214  }
215  literal_counter++;
216  range = upper_bound - lower_bound;
217  }
218 
219  decoded = os.str();
220  literals_read = 0;
221  }
222 
223  public:
225  //read codebook size
226  literal_count = m_in->read_int<len_fixup_t>();
227  codebook_size = m_in->read_int<uliteral_t>();
228  literals.resize(codebook_size);
229 
230  //read and parse dictionary - is is already "normalized"
231  for (int i =0; i<codebook_size; i++) {
232  uliteral_t c = m_in->read_int<uliteral_t>();
233  int val = m_in->read_int<int>();
234  literals[i]=std::pair<uliteral_t, int>(c, val);
235  }
236 
237  min_range=literals[codebook_size-1].second;
238  }
239 
240  using tdc::Decoder::decode;
241 
242  template<typename value_t>
243  inline value_t decode(const LiteralRange&) {
244  //read code if nothing buffered
245  if(!decoded.size()) {
246  ulong code = m_in->read_int<ulong>();
247  if(code!=std::numeric_limits<ulong>::max()) {
248  //code must not be a dummy-code
249  decode(code);
250  }
251  }
252  value_t val = decoded[literals_read++];
253 
254  //if all buffered literals are read: decode the next buffer
255  if(literals_read == decoded.size()) {
256  ulong code = m_in->read_int<ulong>();
257  if(code!=std::numeric_limits<ulong>::max()) {
258  //code must not be a dummy-code
259  decode(code);
260  }
261  }
262 
263  return val;
264  }
265  };
266 };
267 
268 }
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Represents the range of valid tdc::uliteral_t values.
Definition: Range.hpp:90
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
Encoder(Env &&env, Output &out, literals_t &&literals)
uint8_t uliteral_t
Type to represent signed single literals.
Definition: def.hpp:131
void encode(value_t v, const LiteralRange &)
Decodes data from an Arithmetic character stream.
value_t decode(const LiteralRange &)
constexpr size_t ULITERAL_MAX
The maximum value of uliteral_t.
Definition: def.hpp:134
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
An abstraction layer for algorithm output.
Definition: Output.hpp:23
Defines data encoding to and decoding from a stream of ASCII characters.
fast_t< len_compact_t > len_t
Type to represent an length value.
Definition: def.hpp:114
Encodes data to an ASCII character stream.
std::shared_ptr< BitOStream > m_out
The underlying bit output stream.
Definition: Coder.hpp:18
value_t decode(const Range &r)
Decodes an arbitrary-range integer value.
Definition: Coder.hpp:127
Base for data encoders.
Definition: Coder.hpp:14
static Meta meta()
Yields the coder&#39;s meta information.
Local environment for a compression/encoding/decompression call.
void encode(value_t v, const Range &r)
Encodes an arbitrary-range integer value.
Definition: Coder.hpp:61
Base for data decoders.
Definition: Coder.hpp:87
Interface for algorithms.
Definition: Algorithm.hpp:15