5 #include <sdsl/cst_fully.hpp> 6 #include <sdsl/cst_sada.hpp> 25 std::vector<lz78::factorid_t> indices;
26 std::vector<uliteral_t> literal_strings;
27 std::vector<size_t> start_literal_strings;
29 std::vector<uliteral_t> buffer;
40 View ls = literal_strings;
42 size_t start = start_literal_strings[i];
44 if ((i + 1) < start_literal_strings.size()) {
45 end = start_literal_strings[i + 1];
50 return ls.
slice(start, end);
54 indices.push_back(index);
55 start_literal_strings.push_back(literal_strings.size());
56 for (
auto c : literals) {
57 literal_strings.push_back(c);
67 auto inv_old_size = buffer.size();
68 for (
size_t i = 0; i < literals.size(); i++) {
69 buffer.push_back(literals[literals.size() - i - 1]);
72 DCHECK_EQ(inv_old_size + literals.size(), buffer.size());
81 std::reverse(buffer.begin(), buffer.end());
89 template<
typename strategy_t,
typename ref_coder_t>
94 using RefEncoder =
typename ref_coder_t::Encoder;
95 using RefDecoder =
typename ref_coder_t::Decoder;
97 using CompressionStrat
98 =
typename strategy_t::template Compression<RefEncoder>;
99 using DecompressionStrat
100 =
typename strategy_t::template Decompression<RefDecoder>;
108 Meta m(
"compressor",
"lz78u",
"Lempel-Ziv 78 U\n\n" );
120 const len_t threshold = env().option(
"threshold").as_integer();
121 phase1.
log_stat(
"threshold", threshold);
129 std::string bad_copy_1 = T.
slice(0, T.
size() - 1);
132 construct_im(backing_cst, bad_copy_1, 1);
139 sdsl::int_vector<> R(ST.internal_nodes,0,
bits_for(max_z));
146 CompressionStrat strategy {
147 env().env_for_option(
"comp"),
148 env().env_for_option(
"coder"),
149 std::make_shared<BitOStream>(out)
152 len_t factor_count = 0;
156 auto output = [&](
len_t begin,
len_t end,
size_t ref) {
158 while(T[end-1] == 0) --end;
161 DVLOG(2) <<
"reference";
162 strategy.encode_ref(ref,
Range(factor_count));
165 if (end-begin >= threshold) {
166 DVLOG(2) <<
"factorized";
169 strategy.encode_sep(
false);
171 for(
len_t pos = begin; pos < end;) {
174 const node_t leaf = ST.select_leaf(ST.cst.csa.isa[pos]);
176 node_t parent = ST.root;
177 node_t node = ST.level_anc(leaf, d);
178 while(!ST.cst.is_leaf(node) && R[ST.nid(node)] != 0) {
180 node = ST.level_anc(leaf, ++d);
182 const len_t depth = ST.str_depth(parent);
184 if(depth < threshold) {
185 DVLOG(2) <<
"sub char";
186 strategy.encode_sep(
false);
187 strategy.encode_char(T[pos]);
190 DVLOG(2) <<
"sub ref";
191 strategy.encode_sep(
true);
192 strategy.encode_ref(R[ST.nid(parent)],
Range(factor_count));
200 DVLOG(2) <<
"special sub ref";
201 strategy.encode_sep(
true);
202 strategy.encode_ref(0,
Range(factor_count));
203 strategy.encode_ref(pos-end,
len_r);
208 DVLOG(2) <<
"sub char term";
209 strategy.encode_sep(
false);
210 strategy.encode_char(0);
216 strategy.encode_sep(
true);
217 strategy.encode_str(T.
slice(begin,end));
223 DVLOG(2) <<
"[ compress ]";
226 while(pos < T.
size() - 1) {
227 const node_t l = ST.select_leaf(ST.cst.csa.isa[pos]);
230 if(ST.parent(l) == ST.root || R[ST.nid(ST.parent(l))] != 0) {
231 const len_t parent_strdepth = ST.str_depth(ST.parent(l));
234 output(pos + parent_strdepth,
235 pos + parent_strdepth + 1,
236 R[ST.nid(ST.parent(l))]);
238 pos += parent_strdepth+1;
241 DCHECK_EQ(z, factor_count);
247 node_t parent = ST.root;
248 node_t node = ST.level_anc(l, d);
251 while(R[ST.nid(node)] != 0) {
253 node = ST.level_anc(l, ++d);
255 pos += ST.str_depth(parent);
259 const len_t begin = leaflabel + ST.str_depth(parent);
260 const len_t end = leaflabel + ST.str_depth(node);
265 R[ST.nid(ST.parent(node))]);
266 R[ST.nid(node)] = ++z;
269 DCHECK_EQ(z, factor_count);
275 DVLOG(2) <<
"[ decompress ]";
276 auto out = output.as_stream();
279 DecompressionStrat strategy {
280 env().env_for_option(
"comp"),
281 env().env_for_option(
"coder"),
282 std::make_shared<BitIStream>(input)
285 uint64_t factor_count = 0;
289 std::vector<uliteral_t> rebuilt_buffer;
291 while (!strategy.eof()) {
292 auto ref = strategy.decode_ref(
Range(factor_count));
294 DVLOG(2) <<
"decode ref: " << ref;
296 bool not_factorized = strategy.decode_sep();
297 DVLOG(2) <<
"decode sep: " << int(not_factorized);
299 if (not_factorized) {
300 auto str = strategy.decode_str();
301 DVLOG(2) <<
"decode str: '" << str <<
"\\0'";
303 << ((ref > 0) ? decomp.
str_at(ref) :
""_v)
310 rebuilt_buffer.clear();
313 bool is_sub_char = !strategy.decode_sep();
314 DVLOG(2) <<
"decode sep: " << int(!is_sub_char);
317 auto sub_chr = strategy.decode_char();
318 DVLOG(2) <<
"decode chr: " << int(sub_chr);
320 rebuilt_buffer.push_back(sub_chr);
322 auto sub_ref = strategy.decode_ref(
Range(factor_count));
323 DVLOG(2) <<
"decode ref: " << sub_ref;
327 auto cut = strategy.decode_ref(
len_r);
328 DVLOG(2) <<
"decode special ref: " << cut;
330 rebuilt_buffer.pop_back();
334 size_t prev_r = sub_ref;
335 size_t old_end = rebuilt_buffer.size();
341 prev_r = decomp.
ref_at(prev_r);
343 for (
size_t j = 0; j < s.
size(); j++) {
344 rebuilt_buffer.push_back(s[s.
size() - j - 1]);
346 }
while (prev_r != 0);
347 DCHECK_EQ(prev_r, 0);
350 std::reverse(rebuilt_buffer.begin() + old_end,
351 rebuilt_buffer.end());
355 if (rebuilt_buffer.size() > 0 && rebuilt_buffer.back() == 0) {
356 rebuilt_buffer.pop_back();
362 << ((ref > 0) ? decomp.
str_at(ref) :
""_v)
364 << rebuilt_buffer <<
"'";
Represents a generic range of positive integers.
Contains the text compression and encoding framework.
constexpr uint_fast8_t bits_for(size_t n)
Computes the number of bits required to store the given integer value.
LZ78UCompressor(Env &&env)
A const view into a slice of memory.
Base for data compressors.
Provides access to runtime and memory measurement in statistics phases.
size_type size() const
Returns size of the View.
An abstraction layer for algorithm output.
ConstGenericView slice(size_type from, size_type to=npos) const
Construct a new View that is a sub view into the current one.
fast_t< len_compact_t > len_t
Type to represent an length value.
virtual void compress(Input &input, Output &out) override
Compress the given input to the given output.
void log_stat(const char *key, const T &value)
Logs a user statistic for this phase.
cst_t::node_type node_type
constexpr auto len_r
Global predefined range for len_t.
lz78::factorid_t ref_at(lz78::factorid_t index) const
uint32_t factorid_t
Type for the factor indices, bounded by the number of LZ78 trie nodes.
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
This is a wrapper class around the sdsl-lite library to get a easier translation between the pseudoco...
View str_at(lz78::factorid_t index) const
Local environment for a compression/encoding/decompression call.
virtual void decompress(Input &input, Output &output) override final
Decompress the given input to the given output.
void decompress(lz78::factorid_t index, View literals, std::ostream &out)