tudocomp
– The TU Dortmund Compression Framework
BinarySuffixTree.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 //Original Implementation::
4 // https://gist.github.com/makagonov/f7ed8ce729da72621b321f0ab547debb
5 //from: http://stackoverflow.com/questions/9452701/ukkonens-suffix-tree-algorithm-in-plain-english/9513423#9513423
6 
7 #include <string>
8 #include <map>
9 #include <vector>
10 #include <tuple>
11 #include <memory>
12 
13 #include <sstream>
14 
15 
16 #include <math.h>
17 
19 
20 
21 #include <tudocomp/io.hpp>
22 
23 
24 namespace tdc {
25 
26 
28 private:
29 
30 
31  typedef IntVector<uint> vectortype ;
32 
33  //binary property of tree: DynamicIntVector
34  vectortype first_child;
35  vectortype next_sibling;
36 
37 
38  //information of nodes, stored in array
39 
40 
41  //represents the edge leading to this node
42  vectortype start;
43  vectortype end;
44 
45  //suffix link of node
46  vectortype suffix_link;
47 
48  vectortype suffix;
49 
50 
51  //text added to st
52  const io::InputView& Text;
53  int pos;
54 
55  uint current_suffix;
56  uint new_node;
57 
58  //number of suffixes to be added;
59  uint remainder;
60 
61  //active point, from where to start inserting new suffixes
62  uint active_node;
63  uint8_t active_edge;
64  uint active_length;
65 
66 
67  //saves last added node
68  uint last_added_sl;
69 
70  void add_sl(uint node){
71 
72 
73  if(last_added_sl != 0) {
74  suffix_link[last_added_sl]=node;
75  }
76  last_added_sl=node;
77  }
78 
79  uint create_node(uint s, uint e, char c){
80  new_node++;
81  start[new_node] = s;
82  end[new_node] = e;
83 
84  //init empty node
85 
86  return new_node;
87 
88  }
89 
90 
91  void reserve(){
92 
93  auto size = Text.size() * 2 -1;
94  start= vectortype(size + 1, 0);
95  end = vectortype(size + 1, 0);
96  first_child= vectortype(size, 0);
97  next_sibling= vectortype(size, 0);
98  suffix_link= vectortype(size, 0);
99  suffix= vectortype(size + 1, 0);
100 
101 
102 
103 
104  }
105 
106  void resize(){
107  new_node++;
108  start.resize(new_node);
109  end.resize(new_node);
110  first_child.resize(new_node);
111  next_sibling.resize(new_node);
112 
113  suffix.resize(new_node);
114 
115 
116  start.shrink_to_fit();
117  end.shrink_to_fit();
118  first_child.shrink_to_fit();
119  next_sibling.shrink_to_fit();
120  suffix.shrink_to_fit();
121 
122  //deallocate sl, because not necessary, save 20% space
123  suffix_link=vectortype(0, 0);
124 
125  }
126 
127  void compute(){
128  new_node=0;
129 
130 
131 
132 
133 
134  reserve();
135 
136  //no Text is read
137  pos=-1;
138  remainder=0;
139  current_suffix=0;
140 
141 
142  //init root
143  create_node(0,0,0);
144  //suffix corresponds to start
145 
146 
147  //active start node is root
148  active_node=0;
149  active_length=0;
150 
151  last_added_sl=0;
152  DLOG(INFO)<<"Text size: " << Text.size();
153 
154  for (uint i = 0; i < Text.size(); i++) {
155  uint8_t c = Text[i];
156  add_char(c);
157  }
158 
159  resize();
160  }
161 
162 
163 
164 public:
165 
166 
167  //computes edge length:
169  if(node==0){
170  return 0;
171  }
172 
173  if(end[node] == (uint)0){
174  return pos - start[node]+1;
175  } else {
176  return end[node]- start[node];
177  }
178  }
179 
180 
182 
183  }
184 
185 private:
186  inline void add_char(char c){
187  pos++;
188  remainder++;
189  last_added_sl=0;
190 
191 
192  while(remainder > 0){
193  if(active_length==0){
194  active_edge = c;
195  }
196 
197 
198  //if the active node doesnt have the corresponding edge:
199  //find edge corresponding to added active edge
200  bool found = false;
201  uint child = first_child[active_node];
202  uint previous_sibling = child;
203  if(child != 0){
204  do
205  {
206  if( Text[(start[child])] == active_edge){
207  found = true;
208  break;
209  } else {
210  previous_sibling = child;
211  child=next_sibling[child];
212  }
213  }
214  while (child != 0);
215 
216  }
217 
218  //if not found
219  if(!found){
220  //insert new leaf
221  uint leaf = create_node(pos, 0, active_edge);
222  suffix[leaf] = current_suffix++;
223  if(first_child[active_node]== (uint)0){
224  first_child[active_node] = leaf;
225  } else {
226  next_sibling[previous_sibling] = leaf;
227  }
228  add_sl(active_node);
229 
230  } else {
231  uint next = child ;
232  //if the active length is greater than the edge length:
233  //switch active node to that
234  //walk down
235  if(active_length>= edge_length(next)){
236  active_node = next;
237  active_length -= edge_length(next);
238  active_edge = (char) Text[pos-active_length];
239  continue;
240  }
241 
242  //if that suffix is already in the tree::
243  if( (char) Text[start[next]+active_length] == c){
244  active_length++;
245  add_sl(active_node);
246  break;
247  }
248 
249 
250  //now split edge if the edge is found
251 
252  uint split = create_node(start[next], start[next]+active_length, active_edge);
253 
254 
255  start[next]=start[next]+ active_length;
256 
257 
258  //update relations of nodes
259  if(first_child[active_node] == (uint) 0){
260  first_child[active_node]= split;
261  }
262  if(first_child[active_node] == next){
263  first_child[active_node]= split;
264  } else {
265  next_sibling[previous_sibling] = split;
266  }
267  first_child[split] = next;
268  next_sibling[split] = next_sibling[next];
269 
270  uint leaf = create_node(pos, 0, c);
271  next_sibling[next] = leaf;
272  suffix[leaf] = current_suffix++;
273 
274  add_sl(split);
275  }
276  remainder--;
277  if(active_node==0 && active_length>0){
278  active_length--;
279  active_edge = Text[pos-remainder+1];
280  }else {
281  if(suffix_link[active_node] != (uint)0){
282  active_node = suffix_link[active_node];
283  } else {
284  active_node = 0;
285  }
286 
287  }
288  }
289 
290  }
291 public:
292 
293  inline uint get_size(){
294  return Text.size();
295  }
296 
297  inline uint get_first_child(uint node){
298  return first_child[node];
299  }
300 
301  inline uint get_next_sibling(uint node){
302  return next_sibling[node];
303  }
304  inline uint get_suffix(uint node){
305  return suffix[node];
306  }
307 
308 
309  inline uint get_root(){
310  return 0;
311  }
312 
313  inline uint get_edge_length(uint node){
314  return edge_length(node);
315  }
316 
317  inline std::string get_string_of_edge(uint node){
318  std::stringstream ss ;
319  ss << start[node]<< " " <<start[node] + edge_length(node);
320  return ss.str();
321  }
322 
323  inline uint get_tree_size(){
324  return start.size();
325  }
326 
327 
328 
329  inline void print_tree(uint node, std::string depth){
330 
331  uint child = first_child[node];
332  while (child != 0){
333  DLOG(INFO)<< depth << "edge: " << get_string_of_edge(child) << std::endl;
334  print_tree(child, depth+" ");
335  child=next_sibling[child];
336  }
337  }
338 
339  BinarySuffixTree(io::InputView & input) : Text(input){
340  compute();
341  }
342 
343 };
344 
345 }
346 
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
uint edge_length(uint node)
std::string get_string_of_edge(uint node)
A vector over arbitrary unsigned integer types.
Definition: IntVector.hpp:175
unsigned int uint
Definition: characterhash.h:6
BinarySuffixTree(io::InputView &input)
uint get_next_sibling(uint node)
size_type size() const
Returns size of the View.
void resize(size_type n)
Definition: IntVector.hpp:327
uint get_first_child(uint node)
Provides a view on the input that allows for random access.
Definition: InputView.hpp:29
uint get_suffix(uint node)
size_type size() const
Definition: IntVector.hpp:284
void print_tree(uint node, std::string depth)
uint get_edge_length(uint node)