14 template <
typename coder_t>
17 typedef uint32_t sym_t;
18 typedef uint64_t digram_t;
19 typedef std::vector<digram_t> grammar_t;
20 static const size_t digram_shift = 32UL;
21 static const sym_t sigma = 256;
23 inline static digram_t digram(sym_t l, sym_t r) {
24 return (digram_t(l) << digram_shift) | digram_t(r);
27 inline static sym_t left(digram_t di) {
28 return sym_t(di >> digram_shift);
31 inline static sym_t right(digram_t di) {
35 template<
typename text_t>
43 std::vector<uliteral_t> m_g_literals;
47 inline Literals(
const text_t& text,
50 const grammar_t& grammar)
51 : m_text(&text), m_text_size(text_size), m_next(next),
52 m_pos(0), m_g_pos(0) {
55 for(digram_t di : grammar) {
57 if(l < sigma) m_g_literals.push_back(
uliteral_t(l));
60 if(r < sigma) m_g_literals.push_back(
uliteral_t(r));
64 inline bool has_next()
const {
65 return m_pos < m_text_size || m_g_pos < m_g_literals.size();
71 if(m_pos < m_text_size) {
74 m_pos = m_next[m_pos];
78 auto l =
Literal { m_g_literals[m_g_pos],
79 m_text_size + 2 * m_g_pos };
88 Meta m(
"compressor",
"repair",
"Re-Pair compression");
99 if(max_rules == 0) max_rules = SIZE_MAX;
112 for(
size_t i = 0; i < n; i++) {
121 size_t num_replaced = 0;
126 size_t max_count = 0;
129 std::unordered_map<digram_t, size_t> count;
137 digram_t di = digram(text[i], text[j]);
140 size_t c = count[di] + 1;
156 sym_t new_sym = sigma + grammar.size();
157 grammar.push_back(max);
164 digram_t di = digram(text[i], text[j]);
178 }
while(grammar.size() < max_rules);
208 typename coder_t::Encoder coder(
env().env_for_option(
"coder"),
209 output, Literals<sym_t*>(text, n, next, grammar));
212 coder.encode(grammar.size(),
len_r);
215 auto encode_sym = [&](sym_t x,
const Range& r) {
217 coder.encode(
false,
bit_r);
220 coder.encode(
true,
bit_r);
221 coder.encode(x - sigma, r);
226 size_t num_grammar_terminals = 0;
227 size_t num_grammar_nonterminals = 0;
229 for(
size_t i = 0; i < grammar.size(); i++) {
230 digram_t di = grammar[i];
236 if(l < sigma) ++num_grammar_terminals;
237 else ++num_grammar_nonterminals;
240 if(r < sigma) ++num_grammar_terminals;
241 else ++num_grammar_nonterminals;
243 encode_sym(l, grammar_r);
244 encode_sym(r, grammar_r);
251 size_t num_text_terminals = 0;
252 size_t num_text_nonterminals = 0;
254 Range grammar_r(grammar.size());
255 for(
size_t i = 0; i < n; i = next[i]) {
259 if(x < sigma) ++num_text_terminals;
260 else ++num_text_nonterminals;
262 encode_sym(text[i], grammar_r);
274 inline static void decode(sym_t x,
const grammar_t& grammar, std::ostream& ostream) {
280 digram_t di = grammar[x - sigma];
281 decode(left(di), grammar, ostream);
282 decode(right(di), grammar, ostream);
289 typename coder_t::Decoder decoder(
env().env_for_option(
"coder"), input);
292 auto decode_sym = [&](
const Range& r) {
293 bool is_nonterminal = decoder.template decode<bool>(
bit_r);
295 auto dec = decoder.template decode<sym_t>(r);
298 auto dec = sym_t(decoder.template decode<uliteral_t>(
literal_r));
306 auto num_rules = decoder.template decode<size_t>(
len_r);
308 Range grammar_r(grammar.size());
309 sym_t l = decode_sym(grammar_r);
310 sym_t r = decode_sym(grammar_r);
311 grammar.push_back(digram(l, r));
330 Range grammar_r(grammar.size());
333 while(!decoder.eof()) {
334 decode(decode_sym(grammar_r), grammar, ostream);
Represents a generic range of positive integers.
Contains the text compression and encoding framework.
constexpr auto bit_r
Global predefined range for bits (0 or 1).
RePairCompressor(Env &&env)
uint8_t uliteral_t
Type to represent signed single literals.
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
Base for data compressors.
Defines data encoding to and decoding from a stream of binary integer representations.
size_type size() const
Returns size of the View.
uint64_t as_integer() const
Env & env()
Provides access to the environment that the algorithm works in.
OutputStream as_stream() const
Creates a stream that allows for character-wise output.
Contains a literal and its location in the input.
constexpr auto literal_r
Global predefined reange for literals.
An abstraction layer for algorithm output.
uint32_t len_compact_t
Type to represent an bit-compact length value.
fast_t< len_compact_t > len_t
Type to represent an length value.
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
constexpr auto len_r
Global predefined range for len_t.
virtual void decompress(Input &input, Output &output) override
Decompress the given input to the given output.
Local environment for a compression/encoding/decompression call.
virtual void compress(Input &input, Output &output) override
Compress the given input to the given output.