12 static const size_t max_kmer =
sizeof(sym_t) - 1;
13 static const size_t kmer_mask = 0xFFUL << (8UL * max_kmer);
15 static inline bool is_kmer(sym_t x) {
16 return (x & kmer_mask) == kmer_mask;
19 static inline sym_t compile_kmer(
const uliteral_t* kmer,
size_t k) {
22 for(
size_t i = 0; i < k; i++) {
23 kmer_sym |= (sym_t(kmer[k-1-i]) << 8UL * i);
26 return kmer_sym | kmer_mask;
29 static inline void decompile_kmer(sym_t x,
uliteral_t* kmer,
size_t k) {
30 for(
size_t i = 0; i < k; i++) {
31 kmer[k-1-i] = (x >> 8UL * i) & 0xFFUL;
37 Meta m(
"coder",
"sle",
"Static low entropy encoding conforming [Dinklage, 2015]");
54 inline bool kmer_full() {
55 return m_kmer_cur == m_k;
59 bool roll_out = kmer_full();
63 for(
size_t i = 0; i < m_k - 1; i++) m_kmer[i] = m_kmer[i+1];
67 m_kmer[m_kmer_cur++] = c;
72 inline std::string kmer_str() {
74 for(
size_t i = 0; i < m_k; i++) s << m_kmer[i];
79 template<
typename literals_t>
80 inline Encoder(
Env&&
env, std::shared_ptr<BitOStream> out, literals_t&& literals)
84 assert(m_k <= max_kmer);
88 size_t last_literal_pos = 0;
93 while(literals.has_next()) {
98 if(l.
pos != last_literal_pos + 1) {
107 auto x = compile_kmer(m_kmer, m_k);
119 last_literal_pos = l.
pos;
127 size_t eta_add_bits = ((1UL << m_sigma_bits) == sigma) ? 1 : 2;
128 size_t eta = (1UL << (m_sigma_bits + eta_add_bits)) - sigma;
136 alphabet.
setCount(e.first, e.second);
137 if(--eta == 0)
break;
159 m_out->write_compressed_int(sigma);
161 m_out->write_compressed_int(e.first);
168 template<
typename literals_t>
182 inline void flush_kmer() {
184 for(
size_t i = 0; i < m_kmer_cur; i++) {
185 encode_sym(sym_t(m_kmer[i]));
192 inline void encode_sym(sym_t x) {
193 auto r = m_ranking[x];
197 if(m_sigma_bits < 4) {
198 m_out->write_int(r, m_sigma_bits);
199 }
else if(m_sigma_bits < 6) {
202 m_out->write_int(r, 2);
205 m_out->write_int(r, m_sigma_bits);
207 }
else if(m_sigma_bits == 6) {
209 m_out->write_int(0, 2);
210 m_out->write_int(r, 3);
212 m_out->write_int(1, 2);
213 m_out->write_int(r - 8UL, 3);
215 m_out->write_int(2, 2);
216 m_out->write_int(r - 16UL, 4);
218 m_out->write_int(3, 2);
219 m_out->write_int(r, m_sigma_bits);
223 m_out->write_int(0, 3);
224 m_out->write_int(r, 2);
226 m_out->write_int(1, 3);
227 m_out->write_int(r - 4UL, 2);
229 m_out->write_int(2, 3);
230 m_out->write_int(r - 8UL, 2);
232 m_out->write_int(3, 3);
233 m_out->write_int(r - 12UL, 2);
235 m_out->write_int(4, 3);
236 m_out->write_int(r - 16UL, 3);
238 m_out->write_int(5, 3);
239 m_out->write_int(r - 24UL, 3);
241 m_out->write_int(6, 3);
242 m_out->write_int(r - 32UL, 3);
244 m_out->write_int(7, 3);
245 m_out->write_int(r, m_sigma_bits);
251 template<
typename value_t>
256 if(kmer_roll(c, out)) {
258 encode_sym(sym_t(out));
262 sym_t x = compile_kmer(m_kmer, m_k);
263 if(m_ranking.find(x) != m_ranking.end()) {
271 template<
typename value_t>
277 template<
typename value_t>
286 m_out->write_int(v, bits);
289 m_out->write_int(0, 2);
290 m_out->write_int(v, 3);
292 m_out->write_int(1, 2);
293 m_out->write_int(v - value_t(8), 3);
295 m_out->write_int(2, 2);
296 m_out->write_int(v - value_t(16), 4);
298 m_out->write_int(3, 2);
299 m_out->write_int(v, bits);
304 template<
typename value_t>
316 sym_t* m_inv_ranking;
321 inline void reset_kmer() {
322 m_kmer_read = SIZE_MAX;
334 auto sigma = m_in->read_compressed_int<
size_t>();
337 m_inv_ranking =
new sym_t[sigma];
339 for(
size_t rank = 0; rank < sigma; rank++) {
340 auto c = m_in->read_compressed_int<sym_t>();
345 m_inv_ranking[rank] = c;
355 delete[] m_inv_ranking;
359 if(m_kmer_read < m_k) {
368 template<
typename value_t>
370 if(m_kmer_read < m_k) {
372 return value_t(m_kmer[m_kmer_read++]);
378 if(m_sigma_bits < 4) {
379 r = m_in->read_int<
size_t>(m_sigma_bits);
380 }
else if(m_sigma_bits < 6) {
381 auto b = m_in->read_bit();
382 if(b == 0) r = m_in->read_int<
size_t>(2);
383 else r = m_in->read_int<
size_t>(m_sigma_bits);
384 }
else if(m_sigma_bits == 6) {
385 auto x = m_in->read_int<uint8_t>(2);
387 case 0: r = m_in->read_int<
size_t>(3);
break;
388 case 1: r = 8UL + m_in->read_int<
size_t>(3);
break;
389 case 2: r = 16UL + m_in->read_int<
size_t>(4);
break;
390 case 3: r = m_in->read_int<
size_t>(m_sigma_bits);
break;
393 auto x = m_in->read_int<uint8_t>(3);
395 case 0: r = m_in->read_int<
size_t>(2);
break;
396 case 1: r = 4UL + m_in->read_int<
size_t>(2);
break;
397 case 2: r = 8UL + m_in->read_int<
size_t>(2);
break;
398 case 3: r = 12UL + m_in->read_int<
size_t>(2);
break;
399 case 4: r = 16UL + m_in->read_int<
size_t>(3);
break;
400 case 5: r = 24UL + m_in->read_int<
size_t>(3);
break;
401 case 6: r = 32UL + m_in->read_int<
size_t>(3);
break;
402 case 7: r = m_in->read_int<
size_t>(m_sigma_bits);
break;
406 auto x = m_inv_ranking[r];
410 decompile_kmer(x, m_kmer, m_k);
418 template<
typename value_t>
424 template<
typename value_t>
432 v = m_in->read_int<value_t>(bits);
434 auto x = m_in->read_int<uint8_t>(2);
436 case 0: v = m_in->read_int<value_t>(3);
break;
437 case 1: v = value_t(8) + m_in->read_int<value_t>(3);
break;
438 case 2: v = value_t(16) + m_in->read_int<value_t>(4);
break;
439 case 3: v = m_in->read_int<value_t>(bits);
break;
443 return v + value_t(r.
min());
446 template<
typename value_t>
449 return value_t(m_in->read_bit());
Represents a generic range of positive integers.
Contains the text compression and encoding framework.
size_t delta() const
Yields the difference between the range's minimum and maximum values.
constexpr uint_fast8_t bits_for(size_t n)
Computes the number of bits required to store the given integer value.
Represents the range of valid tdc::uliteral_t values.
value_t decode(const MinDistributedRange &r)
Represents a compiler-level fixed range.
void encode(value_t v, const MinDistributedRange &r)
uint8_t uliteral_t
Type to represent signed single literals.
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
Decoder(Env &&env, std::shared_ptr< BitIStream > in)
Wrapper for input streams that provides bitwise reading functionality.
std::vector< std::pair< T, size_t > > getSorted()
Return a list of value-count pairs, sorted with the most-seen first.
A data structure for counting occurences of values of a given type T.
std::shared_ptr< BitIStream > m_in
The underlying bit input stream.
Represents a range of positive integers that tend to be distributed towards the minimum.
uint64_t as_integer() const
size_t getNumItems()
Return how many differnt values have been seen so far.
Decoder(Env &&env, Input &in)
Env & env()
Provides access to the environment that the algorithm works in.
void encode(value_t v, const LiteralRange &)
void increase(const T &item)
Increase the counter for the passed value by one, setting it to 1 if it was not yet seen...
Contains a literal and its location in the input.
Wrapper for output streams that provides bitwise writing functionality.
size_t min() const
Yields the range's minimum value.
An abstraction layer for algorithm output.
ranking_t createRanking(size_t num=SIZE_MAX)
Return a map of the num-most common values, keyed by their common-ness, with map[0] being hte most co...
void encode(value_t v, const Range &r)
value_t decode(const Range &r)
void setCount(const T &item, size_t count)
Set the count of an item to a specific value.
value_t decode(const LiteralRange &)
std::shared_ptr< BitOStream > m_out
The underlying bit output stream.
Encoder(Env &&env, std::shared_ptr< BitOStream > out, literals_t &&literals)
Local environment for a compression/encoding/decompression call.
len_t pos
The position of the literal in the input.
Interface for algorithms.
value_t decode(const BitRange &)
Encoder(Env &&env, Output &out, literals_t &&literals)
void encode(value_t v, const BitRange &)