16 #include <sdsl/suffix_trees.hpp> 27 typedef sdsl::cst_sct3< sdsl::csa_bitcompressed<> > cst_t;
30 using node_type =
typename cst_t::node_type;
33 inline virtual std::vector<uint> select_starting_positions(
int node_id,
int length){
35 std::vector<uint> selected_starting_positions;
36 std::vector<uint> not_selected_starting_positions;
39 selected_starting_positions.reserve(node_begins[node_id].size());
42 not_selected_starting_positions.reserve(node_begins[node_id].size());
44 long last = 0-length-1;
48 for (
auto it=node_begins[node_id].begin(); it!=node_begins[node_id].end(); it++){
51 if((last + length <= current )&& !dead_positions[current] && !dead_positions[current+length-1]){
52 selected_starting_positions.push_back(current);
56 not_selected_starting_positions.push_back(current);
60 if(current < dead_positions.
size() && !dead_positions[current] && dead_positions[current+length-1]){
63 while((current+min_shorter < (
long) dead_positions.
size()) && !dead_positions[current+min_shorter]){
72 if(min_shorter < length){
73 node_type node = stree.inv_id(node_id + stree.size());
75 if(min_shorter >= (
int) min_lrf){
78 node_type parent = stree.parent(node);
79 uint depth = stree.depth(parent);
80 if(depth < (
uint)(min_shorter)){
83 bins[min_shorter].push_back(node_id + stree.size());
87 if(selected_starting_positions.size()>=2){
88 node_begins[node_id].assign(not_selected_starting_positions.begin(), not_selected_starting_positions.end());
89 return selected_starting_positions;
91 return std::vector<uint>();
96 typedef std::tuple<uint,uint,uint> non_term;
97 typedef std::vector<non_term> non_terminal_symbols;
98 typedef std::vector<std::pair<uint,uint>> rules;
105 unsigned short min_lrf;
108 std::vector<std::vector<uint> > bins;
111 std::vector<std::vector<uint> > node_begins;
118 Meta m(
"lfs_comp",
"sim_st");
128 DLOG(INFO)<<
"build suffixtree";
139 if(input[size-1] == 0){
142 std::string in_string ((
const char*) input.
data(), size);
143 sdsl::construct_im(stree, in_string , 1);
152 DLOG(INFO)<<
"computing lrf";
156 uint node_counter = 0;
160 typedef sdsl::cst_bfs_iterator<cst_t> iterator;
161 iterator begin = iterator(&stree, stree.root());
162 iterator end = iterator(&stree, stree.root(),
true,
true);
164 DLOG(INFO)<<
"iterate st";
166 for (iterator it = begin; it != end; ++it) {
168 if(!stree.is_leaf(*it)){
169 if(bins.size() <= stree.depth(*it) ){
170 uint resize = bins.size()*2;
171 while (resize<= stree.depth(*it)) {
176 bins[stree.depth(*it)].push_back(stree.id(*it));
178 if(max_depth< stree.depth(*it)){
179 max_depth = stree.depth(*it);
185 StatPhase::log(
"Number of inner Nodes", stree.nodes() - stree.size());
187 DLOG(INFO)<<
"max depth: "<<max_depth;
191 node_begins.resize(node_counter);
194 for(
uint i = bins.size()-1; i>=min_lrf; i--){
195 auto bin_it = bins[i].
begin();
196 while (bin_it!= bins[i].end()){
197 node_type node = stree.inv_id(*bin_it);
198 uint no_leaf_id = *bin_it - stree.size();
201 if(node_begins[no_leaf_id].empty()){
204 std::vector<uint> offsets;
205 std::vector<uint> leaf_bps;
207 for (
auto & child : stree.children(node)) {
208 if(stree.is_leaf(child)){
209 uint temp = stree.csa[stree.lb(child)];
210 if(!dead_positions[temp]){
212 leaf_bps.push_back(temp);
216 int child_id = stree.id(child) - stree.size();
217 if(!node_begins[child_id ].empty()){
219 offsets.push_back(node_begins[no_leaf_id].size());
222 node_begins[no_leaf_id].insert(node_begins[no_leaf_id].end(),node_begins[child_id ].begin(), node_begins[child_id].end());
224 node_begins[child_id] = std::vector<uint>();
229 std::sort(leaf_bps.begin(), leaf_bps.end());
231 offsets.push_back(node_begins[no_leaf_id].size());
232 node_begins[no_leaf_id].insert(node_begins[no_leaf_id].end(),leaf_bps.begin(), leaf_bps.end());
235 for(
uint k = 0; k < offsets.size()-1; k++){
236 std::inplace_merge(node_begins[no_leaf_id].begin(), node_begins[no_leaf_id].begin()+ offsets[k], node_begins[no_leaf_id].begin()+ offsets[k+1]);
240 std::inplace_merge(node_begins[no_leaf_id].begin(), node_begins[no_leaf_id].begin()+ offsets[offsets.size()-1], node_begins[no_leaf_id].end());
246 if(node_begins[no_leaf_id].empty()){
253 if( (node_begins[no_leaf_id].size()>=2) &&
254 ( ( (
uint)( node_begins[no_leaf_id].back() - node_begins[no_leaf_id].front() )) < i )){
261 std::vector<uint> selected_bp = select_starting_positions(no_leaf_id, i);
262 if(! (selected_bp.size() >=2) ){
267 std::pair<uint,uint> rule = std::make_pair(selected_bp.at(0), i);
268 dictionary.push_back(rule);
271 for(
auto bp_it = selected_bp.begin(); bp_it != selected_bp.end(); bp_it++){
272 non_term nts = std::make_tuple(*bp_it, nts_number, i);
273 nts_symbols.push_back(nts);
276 dead_positions[
pos+ *bp_it] = 1;
290 DLOG(INFO) <<
"sorting occurences";
291 std::sort(nts_symbols.begin(), nts_symbols.end());
Contains the text compression and encoding framework.
A vector over arbitrary unsigned integer types.
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
IntVector< uint_t< 1 > > BitVector
Represents a bit vector, alias for IntVector with a fixed bit width of 1.
void compute_rules(io::InputView &input, rules &dictionary, non_terminal_symbols &nts_symbols)
size_type size() const
Returns size of the View.
uint64_t as_integer() const
Algorithm(Algorithm const &)=default
Env & env()
Provides access to the environment that the algorithm works in.
const value_type * data() const noexcept
The backing memory location.
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Interface for algorithms.