tudocomp
– The TU Dortmund Compression Framework
SimSTStrategy.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <vector>
4 #include <tuple>
5 #include <array>
6 
7 #include <tudocomp/util.hpp>
8 #include <tudocomp/io.hpp>
9 #include <tudocomp/Algorithm.hpp>
10 
11 
13 
14 
15 
16 #include <sdsl/suffix_trees.hpp>
17 
18 
19 
20 
21 namespace tdc {
22 namespace lfs {
23 
24 //template<uint min_lrf = 2 >
25 class SimSTStrategy : public Algorithm {
26 private:
27  typedef sdsl::cst_sct3< sdsl::csa_bitcompressed<> > cst_t;
28  cst_t stree;
29 
30  using node_type = typename cst_t::node_type;
31 
32  // greedily select starting positions and delete them from corresponding vector
33  inline virtual std::vector<uint> select_starting_positions(int node_id, int length){
34 
35  std::vector<uint> selected_starting_positions;
36  std::vector<uint> not_selected_starting_positions;
37 
38  //select occurences greedily non-overlapping:
39  selected_starting_positions.reserve(node_begins[node_id].size());
40 
41 
42  not_selected_starting_positions.reserve(node_begins[node_id].size());
43 
44  long last = 0-length-1;
45  uint current;
46 
47  long min_shorter = 1;
48  for (auto it=node_begins[node_id].begin(); it!=node_begins[node_id].end(); it++){
49 
50  current = *it;
51  if((last + length <= current )&& !dead_positions[current] && !dead_positions[current+length-1]){
52  selected_starting_positions.push_back(current);
53  last = current;
54 
55  } else {
56  not_selected_starting_positions.push_back(current);
57  }
58 
59 
60  if(current < dead_positions.size() && !dead_positions[current] && dead_positions[current+length-1]){
61 
62 
63  while((current+min_shorter < (long) dead_positions.size()) && !dead_positions[current+min_shorter]){
64  min_shorter++;
65  }
66 
67  }
68 
69  }
70 
71 
72  if(min_shorter < length){
73  node_type node = stree.inv_id(node_id + stree.size());
74 
75  if(min_shorter >= (int) min_lrf){
76  //check if parent node is shorter
77 
78  node_type parent = stree.parent(node);
79  uint depth = stree.depth(parent);
80  if(depth < (uint)(min_shorter)){
81 
82  //just re-add node, if the possible replaceable lrf is longer than dpeth of parent node
83  bins[min_shorter].push_back(node_id + stree.size());
84  }
85  }
86  }
87  if(selected_starting_positions.size()>=2){
88  node_begins[node_id].assign(not_selected_starting_positions.begin(), not_selected_starting_positions.end());
89  return selected_starting_positions;
90  } else {
91  return std::vector<uint>();
92  }
93  }
94 
95  //(position in text, non_terminal_symbol_number, length_of_symbol);
96  typedef std::tuple<uint,uint,uint> non_term;
97  typedef std::vector<non_term> non_terminal_symbols;
98  typedef std::vector<std::pair<uint,uint>> rules;
99 
100 
101 
102 
103 
104  BitVector dead_positions;
105  unsigned short min_lrf;
106 
107  //could be node_type
108  std::vector<std::vector<uint> > bins;
109 
110 
111  std::vector<std::vector<uint> > node_begins;
112 
113 public:
114 
115  using Algorithm::Algorithm; //import constructor
116 
117  inline static Meta meta() {
118  Meta m("lfs_comp", "sim_st");
119  m.option("min_lrf").dynamic(2);
120  return m;
121  }
122 
123 
124  inline void compute_rules(io::InputView & input, rules & dictionary, non_terminal_symbols & nts_symbols){
125  min_lrf = env().option("min_lrf").as_integer();
126 
127  //build suffixtree
128  DLOG(INFO)<<"build suffixtree";
129 
130 
131 
132 
133 
134 
135  StatPhase::wrap("Constructing ST", [&]{
136  uint size = input.size();
137  //remove sentinel because sdsl cant handle that (at last pos)
138  //this is just to handle test cases
139  if(input[size-1] == 0){
140  size--;
141  }
142  std::string in_string ((const char*) input.data(), size);
143  sdsl::construct_im(stree, in_string , 1);
144 
145 
146  });
147 
148 
149 
150 
151 
152  DLOG(INFO)<<"computing lrf";
153 
154  //array of vectors for bins of nodes with string depth
155 
156  uint node_counter = 0;
157  uint max_depth =0;
158 
159  StatPhase::wrap("Computing String Depth", [&]{
160  typedef sdsl::cst_bfs_iterator<cst_t> iterator;
161  iterator begin = iterator(&stree, stree.root());
162  iterator end = iterator(&stree, stree.root(), true, true);
163  bins.resize(200);
164  DLOG(INFO)<<"iterate st";
165 
166  for (iterator it = begin; it != end; ++it) {
167 
168  if(!stree.is_leaf(*it)){
169  if(bins.size() <= stree.depth(*it) ){
170  uint resize = bins.size()*2;
171  while (resize<= stree.depth(*it)) {
172  resize*=2;
173  }
174  bins.resize(resize);
175  }
176  bins[stree.depth(*it)].push_back(stree.id(*it));
177  node_counter++;
178  if(max_depth< stree.depth(*it)){
179  max_depth = stree.depth(*it);
180  }
181  }
182  }
183  });
184 
185  StatPhase::log("Number of inner Nodes", stree.nodes() - stree.size());
186  StatPhase::log("Max Depth inner Nodes", max_depth);
187  DLOG(INFO)<<"max depth: "<<max_depth;
188 
189  uint nts_number =0;
190  StatPhase::wrap("Computing LRF Substitution", [&]{
191  node_begins.resize(node_counter);
192  dead_positions = BitVector(input.size(), 0);
193 
194  for(uint i = bins.size()-1; i>=min_lrf; i--){
195  auto bin_it = bins[i].begin();
196  while (bin_it!= bins[i].end()){
197  node_type node = stree.inv_id(*bin_it);
198  uint no_leaf_id = *bin_it - stree.size();
199 
200 
201  if(node_begins[no_leaf_id].empty()){
202 
203  //get leaves or merge child vectors
204  std::vector<uint> offsets;
205  std::vector<uint> leaf_bps;
206 
207  for (auto & child : stree.children(node)) {
208  if(stree.is_leaf(child)){
209  uint temp = stree.csa[stree.lb(child)];
210  if(!dead_positions[temp]){
211 
212  leaf_bps.push_back(temp);
213  }
214 
215  } else {
216  int child_id = stree.id(child) - stree.size();
217  if(!node_begins[child_id ].empty()){
218  //append child list, remember offset
219  offsets.push_back(node_begins[no_leaf_id].size());
220 
221 
222  node_begins[no_leaf_id].insert(node_begins[no_leaf_id].end(),node_begins[child_id ].begin(), node_begins[child_id].end());
223 
224  node_begins[child_id] = std::vector<uint>();
225 
226  }
227  }
228  }
229  std::sort(leaf_bps.begin(), leaf_bps.end());
230 
231  offsets.push_back(node_begins[no_leaf_id].size());
232  node_begins[no_leaf_id].insert(node_begins[no_leaf_id].end(),leaf_bps.begin(), leaf_bps.end());
233 
234  //inplace merge with offset
235  for(uint k = 0; k < offsets.size()-1; k++){
236  std::inplace_merge(node_begins[no_leaf_id].begin(), node_begins[no_leaf_id].begin()+ offsets[k], node_begins[no_leaf_id].begin()+ offsets[k+1]);
237 
238  }
239  //now inplace merge to end
240  std::inplace_merge(node_begins[no_leaf_id].begin(), node_begins[no_leaf_id].begin()+ offsets[offsets.size()-1], node_begins[no_leaf_id].end());
241 
242  //sort bps of leaves
243 
244 
245  }
246  if(node_begins[no_leaf_id].empty()){
247 
248  bin_it++;
249  continue;
250  }
251 
252 
253  if( (node_begins[no_leaf_id].size()>=2) &&
254  ( ( (uint)( node_begins[no_leaf_id].back() - node_begins[no_leaf_id].front() )) < i )){
255  bin_it++;
256  continue;
257  }
258  //do this
259  //Add new rule
260  //and add new non-terminal symbols
261  std::vector<uint> selected_bp = select_starting_positions(no_leaf_id, i);
262  if(! (selected_bp.size() >=2) ){
263  bin_it++;
264  continue;
265  }
266  //vector of text position, length
267  std::pair<uint,uint> rule = std::make_pair(selected_bp.at(0), i);
268  dictionary.push_back(rule);
269 
270  //iterate over selected pos, add non terminal symbols
271  for(auto bp_it = selected_bp.begin(); bp_it != selected_bp.end(); bp_it++){
272  non_term nts = std::make_tuple(*bp_it, nts_number, i);
273  nts_symbols.push_back(nts);
274  //mark as used
275  for(uint pos = 0; pos<i;pos++){
276  dead_positions[pos+ *bp_it] = 1;
277  }
278  }
279  nts_number++;
280 
281  bin_it++;
282 
283  }
284  }
285 
286  });
287 
288  StatPhase::wrap("Sorting occurences", [&]{
289 
290  DLOG(INFO) << "sorting occurences";
291  std::sort(nts_symbols.begin(), nts_symbols.end());
292 
293  });
294  }
295 };
296 }
297 }
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
A vector over arbitrary unsigned integer types.
Definition: IntVector.hpp:175
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
Definition: Env.hpp:44
IntVector< uint_t< 1 > > BitVector
Represents a bit vector, alias for IntVector with a fixed bit width of 1.
Definition: IntVector.hpp:545
unsigned int uint
Definition: characterhash.h:6
void compute_rules(io::InputView &input, rules &dictionary, non_terminal_symbols &nts_symbols)
size_type size() const
Returns size of the View.
uint64_t as_integer() const
Algorithm(Algorithm const &)=default
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
len_compact_t pos
Definition: LZSSFactors.hpp:38
const value_type * data() const noexcept
The backing memory location.
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
Definition: StatPhase.hpp:218
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Definition: StatPhase.hpp:143
Provides a view on the input that allows for random access.
Definition: InputView.hpp:29
OptionBuilder option(const std::string &name)
Declares an accepted option for this algorithm.
Definition: Meta.hpp:216
size_type size() const
Definition: IntVector.hpp:284
Interface for algorithms.
Definition: Algorithm.hpp:15
void dynamic()
Declares that this option accepts values of a simple type that can be parsed from a string (e...
Definition: Meta.hpp:150