tudocomp
– The TU Dortmund Compression Framework
LFS2Compressor.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 //std includes:
4 #include <vector>
5 #include <tuple>
6 
7 //general includes:
9 #include <tudocomp/util.hpp>
10 #include <tudocomp/io.hpp>
13 
14 //sdsl include stree:
15 #include <sdsl/suffix_trees.hpp>
16 
17 
18 //includes encoding:
21 #include <tudocomp/Literal.hpp>
23 
26 
27 
29 
30 
31 namespace tdc {
32 namespace lfs {
33 
34 template<typename literal_coder_t = HuffmanCoder, typename len_coder_t = EliasGammaCoder >
35 class LFS2Compressor : public Compressor {
36 private:
37 
38 
39 
40 
41  //Suffix Tree type + st
42  typedef sdsl::cst_sct3< sdsl::csa_bitcompressed<> > cst_t;
43  cst_t stree;
44 
45  using node_type = typename cst_t::node_type;
46 
47  //Stores nts_symbols of first layer
48  IntVector<uint> first_layer_nts;
49  // offset to begin of last nts +1. if ==0 no substitution
50  IntVector<uint> fl_offsets;
51  // stores subs in first layer symbols
52  IntVector<uint> second_layer_nts;
53  // dead pos in first layer
54  BitVector second_layer_dead;
55 
56 
57  //pair contains begin pos, length
58  std::vector<std::pair<uint, uint> > non_terminal_symbols;
59 
60 
61  //stores node_ids of corresponding factor length
62  std::vector<std::vector<uint> > bins;
63 
64 
65  //stores beginning positions corresponding to node_ids
66  std::vector<std::vector<uint> > node_begins;
67 
68  bool exact;
69  uint size;
70 
71 
72 
73 
74 public:
75 
76  inline static Meta meta() {
77  Meta m("compressor", "lfs2",
78  "lfs2 with simst");
80  m.option("min_lrf").dynamic(5);
81  m.option("exact").dynamic(0);
82  m.option("lfs2_lit_coder").templated<literal_coder_t, HuffmanCoder>("lfs2_lit_coder");
83  m.option("lfs2_len_coder").templated<len_coder_t, EliasGammaCoder>("lfs2_len_coder");
84 
85  return m;
86  }
87 
88 
89  inline LFS2Compressor(Env&& env):
90  Compressor(std::move(env))
91  {
92  DLOG(INFO) << "Compressor lfs2 instantiated";
93  }
94  inline virtual void compress(Input& input, Output& output) override {
95  uint min_lrf = env().option("min_lrf").as_integer();
96  uint temp = env().option("exact").as_integer();
97  exact=false;
98  if(temp > 0){
99  exact =true;
100  }
101 
102  auto in = input.as_view();
103 
104 
105  //create vectors:
106  first_layer_nts = IntVector<uint>(input.size(), 0);
107  fl_offsets = IntVector<uint>(input.size(), 0);
108  second_layer_nts = IntVector<uint>(input.size(), 0);
109  second_layer_dead = BitVector(input.size(), 0);
110 
111 
112 
113  if(in.size() >= min_lrf){
114 
115 
116 
117 
118  StatPhase::wrap("Constructing ST", [&]{
119  size = in.size();
120  //remove sentinel because sdsl cant handle that
121  while(in[size-1] == 0){
122  size--;
123  }
124 
125 
126 
127  std::string in_string ((const char*) in.data(), size);
128  sdsl::construct_im(stree, in_string , 1);
129  });
130 
131 
132 
133  StatPhase::wrap("Computing LRF", [&]{
134  bins.resize(200);
135  uint node_counter = 0;
136 
137  typedef sdsl::cst_bfs_iterator<cst_t> iterator;
138  iterator begin = iterator(&stree, stree.root());
139  iterator end = iterator(&stree, stree.root(), true, true);
140 
141  StatPhase::wrap("Iterate over ST", [&]{
142  DLOG(INFO)<<"iterate st";
143 
144  for (iterator it = begin; it != end; ++it) {
145 
146  if(!stree.is_leaf(*it)){
147 
148  if(bins.size() <= stree.depth(*it)) {
149 
150  uint resize = bins.size()*2;
151  while (resize<= stree.depth(*it)) {
152  resize*=2;
153  }
154  bins.resize(resize);
155  }
156  bins[stree.depth(*it)].push_back(stree.id(*it));
157  node_counter++;
158  }
159  }
160  });
161  node_begins.resize(node_counter);
162 
163  uint nts_number = 1 ;
164  StatPhase::wrap("Iterate over Node Bins", [&]{
165  //iterate node bins top down
166  DLOG(INFO)<<"iterate over Node Bins";
167  for(uint i = bins.size()-1; i>=min_lrf; i--){
168 
169  //iterate over ids in bin:
170  while(!bins[i].empty()){
171  uint id = bins[i].back();
172  bins[i].pop_back();
173  node_type node = stree.inv_id(id);
174  uint no_leaf_id = id - stree.size();
175 
176  //get bps of node
177 
178  if(node_begins[no_leaf_id].empty()){
179  //get leaves or merge child vectors
180  std::vector<uint> offsets;
181  std::vector<uint> leaf_bps;
182 
183  for (auto & child : stree.children(node)) {
184  if(stree.is_leaf(child)){
185 
186 
187  leaf_bps.push_back(stree.csa[stree.lb(child)]);
188 
189 
190  } else {
191  int child_id = stree.id(child) - stree.size();
192  if(!node_begins[child_id ].empty()){
193  //append child list, remember offset
194  offsets.push_back(node_begins[no_leaf_id].size());
195 
196 
197  node_begins[no_leaf_id].insert(node_begins[no_leaf_id].end(),node_begins[child_id ].begin(), node_begins[child_id].end());
198 
199  node_begins[child_id] = std::vector<uint>();
200 
201  }
202  }
203  }
204  std::sort(leaf_bps.begin(), leaf_bps.end());
205 
206  offsets.push_back(node_begins[no_leaf_id].size());
207  node_begins[no_leaf_id].insert(node_begins[no_leaf_id].end(),leaf_bps.begin(), leaf_bps.end());
208 
209  //inplace merge with offset
210  for(uint k = 0; k < offsets.size()-1; k++){
211  std::inplace_merge(node_begins[no_leaf_id].begin(), node_begins[no_leaf_id].begin()+ offsets[k], node_begins[no_leaf_id].begin()+ offsets[k+1]);
212 
213  }
214  //now inplace merge to end
215  std::inplace_merge(node_begins[no_leaf_id].begin(), node_begins[no_leaf_id].begin()+ offsets.back(), node_begins[no_leaf_id].end());
216 
217 
218 
219  }
220  //if still empty, because everything is substituted...
221  if(node_begins[no_leaf_id].empty()){
222  continue;
223  }
224  //check if viable lrf, else sort higher!
225  if((node_begins[no_leaf_id].size()>=2)){
226 
227  if (( (uint)( node_begins[no_leaf_id].back() - node_begins[no_leaf_id].front() )) >= i ){
228 
229  //greedily iterate over occurences
230  signed long last = 0 - (long) i;
231  std::vector<uint> first_layer_viable;
232  std::vector<uint> second_layer_viable;
233  for(uint occurence : node_begins[no_leaf_id]){
234  //check for viability
235  if( (last+i <= (long) occurence)){
236  if(fl_offsets[occurence] == 0){
237  if(fl_offsets[occurence + i -1] == 0){
238  //Position is firs layer viable
239  first_layer_viable.push_back(occurence);
240  last= occurence;
241  }
242  } else {
243  //find nts number of symbol that corresponds to substitued occ
244  uint parent_nts= first_layer_nts[ occurence - (fl_offsets[occurence] -1) ];
245  auto nts = non_terminal_symbols[parent_nts-1];
246  //if length of parent nts is greater than current len + offset
247  if(nts.second >=fl_offsets[occurence]-1 + i ){
248  second_layer_viable.push_back(occurence);
249  }
250  }
251 
252  }
253 
254  }
255 
256 
257  //and substitute
258 
259  //if at least 2 first level layer occs viable:
260  if(first_layer_viable.size() >=1 &&(first_layer_viable.size() + second_layer_viable.size() >= 2) ) {
261  std::pair<uint,uint> nts = std::make_pair(first_layer_viable.front(), i);
262  non_terminal_symbols.push_back(nts);
263 
264  //iterate over vector, make first layer unviable:
265  for(uint occ : first_layer_viable){
266  first_layer_nts[occ]= nts_number;
267 
268  for(uint nts_length =0; nts_length < i; nts_length++){
269  fl_offsets[occ + nts_length] = nts_length+1;
270  }
271  }
272 
273 
274 
275  for(uint sl_occ :second_layer_viable){
276  uint parent_nts= first_layer_nts[ sl_occ - (fl_offsets[sl_occ] -1) ];
277 
278  auto parent_sym = non_terminal_symbols[parent_nts-1];
279  uint parent_start= parent_sym.first;
280  uint sl_start = (parent_start + fl_offsets[sl_occ] -1);
281  uint sl_end = sl_start+i-1;
282  if(second_layer_dead[sl_start] == (uint)0 && second_layer_dead[sl_end] == (uint)0){
283 
284  second_layer_nts[sl_start]=nts_number;
285 
286  for(uint dead = sl_start; dead<=sl_end;dead++){
287  second_layer_dead[dead]=1;
288  }
289  }
290  }
291 
292  //raise nts number:
293  nts_number++;
294 
295  }
296 
297 
298 
299  } else {
300  if(exact){
301  //readd node if lrf shorter
302  uint min_shorter = node_begins[no_leaf_id].back() - node_begins[no_leaf_id].front();
303  //check if parent subs this lrf
304  node_type parent = stree.parent(node);
305  uint depth = stree.depth(parent);
306  if(depth < (min_shorter)){
307  //just re-add node, if the possible replaceable lrf is longer than dpeth of parent node
308  bins[min_shorter].push_back(stree.id(node));
309  }
310 
311 
312  }
313  continue;
314 
315  }
316  }
317  }
318 
319  }
320 
321  });
322 
323  });
324 
325  DLOG(INFO)<<"Computing symbol depth";
326 
327  IntVector<uint> nts_depth(non_terminal_symbols.size(), 0);
328 
329  for(uint nts_num =0; nts_num<non_terminal_symbols.size(); nts_num++){
330  auto symbol = non_terminal_symbols[nts_num];
331  uint cur_depth = nts_depth[nts_num];
332 
333  for(uint pos = symbol.first; pos < symbol.second + symbol.first ; pos++){
334  if(second_layer_nts[pos]>0){
335 
336  uint symbol_num = second_layer_nts[pos] -1;
337  if(nts_depth[symbol_num]< cur_depth+1){
338  nts_depth[symbol_num]= cur_depth+1;
339  }
340 
341  }
342 
343 
344  }
345 
346  }
347 
348  DLOG(INFO)<<"Computing done";
349 
350  std::sort(nts_depth.begin(), nts_depth.end());
351  if(nts_depth.size()>0){
352 
353  uint max_depth = nts_depth[nts_depth.size()-1];
354 
355  DLOG(INFO)<<"Max CFG Depth: "<< max_depth;
356  DLOG(INFO)<<"Number of CFG rules: "<< non_terminal_symbols.size();
357 
358  if(nts_depth.size()>=4){
359  uint quarter = nts_depth.size() /4;
360 
361  StatPhase::log("25 \% quantil CFG Depth", nts_depth[quarter -1]);
362  StatPhase::log("50 \% quantil CFG Depth", nts_depth[(2*quarter) -1]);
363  StatPhase::log("75 \% quantil CFG Depth", nts_depth[(3*quarter) -1]);
364 
365  }
366  StatPhase::log("Max CFG Depth", max_depth);
367  }
368 
369 
370 
371  //input size end
372  }
373 
374  StatPhase::log("Number of CFG rules", non_terminal_symbols.size());
375 
376  std::stringstream literals;
377 
378  for(uint position = 0; position< in.size(); position++){
379  if(fl_offsets[position]==0){
380  literals << in[position];
381  }
382  }
383  for(uint nts_num = 0; nts_num<non_terminal_symbols.size(); nts_num++){
384 
385  auto symbol = non_terminal_symbols[nts_num];
386 
387  for(uint pos = symbol.first; pos < symbol.second + symbol.first; pos++){
388  if(second_layer_nts[pos] == 0 && pos < in.size()){
389  literals<< in[pos];
390 
391  }
392  }
393  }
394 
395 
396  StatPhase::wrap("Encoding Comp", [&]{
397  // encode dictionary:
398  DLOG(INFO) << "encoding dictionary symbol sizes ";
399 
400  std::shared_ptr<BitOStream> bitout = std::make_shared<BitOStream>(output);
401  typename literal_coder_t::Encoder lit_coder(
402  env().env_for_option("lfs2_lit_coder"),
403  bitout,
404  ViewLiterals(literals.str())
405  );
406  typename len_coder_t::Encoder len_coder(
407  env().env_for_option("lfs2_len_coder"),
408  bitout,
409  NoLiterals()
410  );
411 
412  //encode lengths:
413  DLOG(INFO)<<"number nts: " << non_terminal_symbols.size();
414  Range intrange (0, UINT_MAX);
415  //encode first length:
416  if(non_terminal_symbols.size()>=1){
417  auto symbol = non_terminal_symbols[0];
418  uint last_length=symbol.second;
419  //Range for encoding nts number
420  Range s_length_r (0,last_length);
421  len_coder.encode(last_length,intrange);
422  //encode delta length of following symbols
423  for(uint nts_num = 1; nts_num < non_terminal_symbols.size(); nts_num++){
424  symbol = non_terminal_symbols[nts_num];
425  len_coder.encode(last_length-symbol.second,s_length_r);
426  last_length=symbol.second;
427 
428  }
429  //encode last length, to have zero length last
430  len_coder.encode(symbol.second,s_length_r);
431  }else {
432  len_coder.encode(0,intrange);
433 
434  }
435  Range dict_r(0, non_terminal_symbols.size());
436 
437 
438  long buf_size = bitout->tellp();
439 
440  StatPhase::log("Bytes Length Encoding", buf_size);
441  DLOG(INFO)<<"Bytes Length Encoding: "<< buf_size;
442 
443 
444  DLOG(INFO) << "encoding dictionary symbols";
445  uint dict_literals=0;
446 
447  // encode dictionary strings, backwards, to directly decode strings:
448  if(non_terminal_symbols.size()>=1){
449  std::pair<uint,uint> symbol;
450  for(long nts_num =non_terminal_symbols.size()-1; nts_num >= 0; nts_num--){
451 
452  symbol = non_terminal_symbols[nts_num];
453 
454  for(uint pos = symbol.first; pos < symbol.second + symbol.first ; pos++){
455  if(second_layer_nts[pos] > 0){
456  lit_coder.encode(1, bit_r);
457  lit_coder.encode(second_layer_nts[pos], dict_r);
458  auto symbol = non_terminal_symbols[second_layer_nts[pos] -1];
459 
460 
461 
462  pos += symbol.second - 1;
463 
464  } else {
465  lit_coder.encode(0, bit_r);
466  lit_coder.encode(in[pos],literal_r);
467  dict_literals++;
468 
469  }
470  }
471 
472  }
473  }
474 
475  uint literals=0;
476 
477 
478 
479 
480  buf_size = long(bitout->tellp()) - buf_size;
481  StatPhase::log("Bytes Non-Terminal Symbol Encoding", buf_size);
482 
483 
484  DLOG(INFO)<<"Bytes Non-Terminal Symbol Encoding: "<< buf_size;
485 
486  //encode start symbol
487 
488  DLOG(INFO)<<"encode start symbol";
489  for(uint pos = 0; pos < in.size(); pos++){
490  if(first_layer_nts[pos]>0){
491  lit_coder.encode(1, bit_r);
492  lit_coder.encode(first_layer_nts[pos], dict_r);
493  auto symbol = non_terminal_symbols[first_layer_nts[pos] -1];
494 
495  pos += symbol.second - 1;
496 
497  } else {
498  lit_coder.encode(0, bit_r);
499  lit_coder.encode(in[pos],literal_r);
500  literals++;
501 
502 
503  }
504  }
505 
506  buf_size = long(bitout->tellp()) - buf_size;
507  StatPhase::log("Bytes Start Symbol Encoding", buf_size);
508 
509 
510  DLOG(INFO)<<"Bytes Start Symbol Encoding: "<< buf_size;
511 
512 
513  StatPhase::log("Literals in Dictionary", dict_literals);
514 
515  StatPhase::log("Literals in Start Symbol", literals);
516  StatPhase::log("Literals in Input", in.size());
517 
518  double literal_percent = ((double)dict_literals + (double)literals)/ (double)in.size();
519  StatPhase::log("Literals Encoding / Literals Input", literal_percent);
520 
521 
522  DLOG(INFO)<<"encoding done";
523 
524  });
525  }
526 
527  inline virtual void decompress(Input& input, Output& output) override {
528 
529  DLOG(INFO) << "decompress lfs";
530  std::shared_ptr<BitIStream> bitin = std::make_shared<BitIStream>(input);
531 
532  typename literal_coder_t::Decoder lit_decoder(
533  env().env_for_option("lfs2_lit_coder"),
534  bitin
535  );
536  typename len_coder_t::Decoder len_decoder(
537  env().env_for_option("lfs2_len_coder"),
538  bitin
539  );
540  Range int_r (0,UINT_MAX);
541 
542  uint symbol_length = len_decoder.template decode<uint>(int_r);
543  Range slength_r (0, symbol_length);
544  std::vector<uint> dict_lengths;
545  dict_lengths.reserve(symbol_length);
546  dict_lengths.push_back(symbol_length);
547  while(symbol_length>0){
548 
549  uint current_delta = len_decoder.template decode<uint>(slength_r);
550  symbol_length-=current_delta;
551  dict_lengths.push_back(symbol_length);
552  }
553  dict_lengths.pop_back();
554 
555  DLOG(INFO)<<"decoded number of nts: "<< dict_lengths.size();
556 
557 
558 
559  std::vector<std::string> dictionary;
560  uint dictionary_size = dict_lengths.size();
561 
562  Range dictionary_r (0, dictionary_size);
563 
564 
565  dictionary.resize(dict_lengths.size());
566 
567  std::stringstream ss;
568  uint symbol_number;
569  char c1;
570 
571  DLOG(INFO) << "reading dictionary";
572  for(long i = dict_lengths.size() -1; i>=0 ;i--){
573 
574  ss.str("");
575  ss.clear();
576  long size_cur = (long) dict_lengths[i];
577  while(size_cur > 0){
578  bool bit1 = lit_decoder.template decode<bool>(bit_r);
579 
580 
581  if(bit1){
582  //bit = 1, is nts, decode nts num and copy
583  symbol_number = lit_decoder.template decode<uint>(dictionary_r);
584 
585  symbol_number-=1;
586 
587  if(symbol_number < dictionary.size()){
588 
589  ss << dictionary.at(symbol_number);
590  size_cur-= dict_lengths[symbol_number];
591  } else {
592  break;
593  }
594 
595  } else {
596  //bit = 0, decode literal
597  c1 = lit_decoder.template decode<char>(literal_r); // Dekodiere Literal
598  size_cur--;
599 
600  ss << c1;
601 
602  }
603 
604  }
605 
606  dictionary[i]=ss.str();
607 
608 
609  }
610 
611  auto ostream = output.as_stream();
612  //reading start symbol:
613  while(!lit_decoder.eof()){
614  //decode bit
615  bool bit1 = lit_decoder.template decode<bool>(bit_r);
616  char c1;
617  uint symbol_number;
618  // if bit = 0 its a literal
619  if(!bit1){
620  c1 = lit_decoder.template decode<char>(literal_r); // Dekodiere Literal
621 
622  ostream << c1;
623  } else {
624  //else its a non-terminal
625  symbol_number = lit_decoder.template decode<uint>(dictionary_r); // Dekodiere Literal
626  symbol_number-=1;
627 
628  if(symbol_number < dictionary.size()){
629 
630  ostream << dictionary.at(symbol_number);
631  } else {
632  DLOG(INFO)<< "too large symbol: " << symbol_number;
633  }
634 
635  }
636  }
637  }
638 
639 };
640 
641 //namespaces closing
642 }}
Represents a generic range of positive integers.
Definition: Range.hpp:16
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
constexpr auto bit_r
Global predefined range for bits (0 or 1).
Definition: Range.hpp:108
A vector over arbitrary unsigned integer types.
Definition: IntVector.hpp:175
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
Definition: Env.hpp:44
Base for data compressors.
Definition: Compressor.hpp:19
void push_back(const value_type &val)
Definition: IntVector.hpp:426
IntVector< uint_t< 1 > > BitVector
Represents a bit vector, alias for IntVector with a fixed bit width of 1.
Definition: IntVector.hpp:545
unsigned int uint
Definition: characterhash.h:6
virtual void compress(Input &input, Output &output) override
Compress the given input to the given output.
uint64_t as_integer() const
A literal iterator that yields every character from a View.
Definition: Literal.hpp:41
void needs_sentinel_terminator()
Indicates that this Algorithm requires a null terminator symbol in Input.
Definition: Meta.hpp:271
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
OutputStream as_stream() const
Creates a stream that allows for character-wise output.
An empty literal iterator that yields no literals whatsoever.
Definition: Literal.hpp:37
constexpr auto literal_r
Global predefined reange for literals.
Definition: Range.hpp:111
An abstraction layer for algorithm output.
Definition: Output.hpp:23
len_compact_t pos
Definition: LZSSFactors.hpp:38
Defines data encoding to and decoding from a stream of Elias-Gamma codes.
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
Definition: StatPhase.hpp:218
void templated(const std::string &accepted_type)
Declares that this option accepts values of the specified Algorithm type T.
Definition: Meta.hpp:93
size_t size() const
Yields the total amount of characters in the input.
Definition: Input.hpp:233
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Definition: StatPhase.hpp:143
InputView as_view() const
Provides a view on the input that allows for random access.
Definition: Input.hpp:260
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
Local environment for a compression/encoding/decompression call.
virtual void decompress(Input &input, Output &output) override
Decompress the given input to the given output.
size_type size() const
Definition: IntVector.hpp:284
void dynamic()
Declares that this option accepts values of a simple type that can be parsed from a string (e...
Definition: Meta.hpp:150
An abstraction layer for algorithm input.
Definition: Input.hpp:37