6 #include <unordered_map> 25 typedef std::tuple<uint,uint,uint> non_term;
26 typedef std::vector<non_term> non_terminal_symbols;
27 typedef std::vector<std::pair<uint,uint>> rules;
28 typedef uint node_type;
31 std::unique_ptr<BinarySuffixTree> stree;
36 std::vector<std::vector<uint> > bins;
39 std::unordered_map< uint , std::vector< uint > > beginning_positions;
48 inline virtual void compute_string_depth(
uint node,
uint str_depth){
50 uint child = stree->get_first_child(node);
54 while(str_depth>= bins.size()){
55 bins.
resize(bins.size()*2);
60 if(str_depth>max_depth){
66 bins[str_depth].push_back(node);
69 compute_string_depth(child, str_depth + stree->get_edge_length(child));
70 child=stree->get_next_sibling(child);
84 inline virtual std::vector<uint> select_starting_positions(node_type node,
uint length){
87 std::vector<uint> & starting_positions = beginning_positions[node];
88 std::vector<uint> selected_starting_positions;
89 std::vector<uint> not_selected_starting_positions;
92 selected_starting_positions.reserve(starting_positions.size());
94 long last = 0- (long) length - 1;
96 for (
auto it=starting_positions.begin(); it!=starting_positions.end(); ++it){
99 if(last+length <= current && !dead_positions[current] && !dead_positions[current+length-1]){
101 selected_starting_positions.push_back(current);
106 if( !dead_positions[current] ){
107 not_selected_starting_positions.push_back(current);
115 if(selected_starting_positions.size() >= 2){
116 beginning_positions[node] = not_selected_starting_positions;
120 return selected_starting_positions;
129 Meta m(
"lfs_comp",
"bst");
139 DLOG(INFO)<<
"Constructing ST";
140 stree = std::make_unique<BinarySuffixTree>(input);
150 compute_string_depth(0,0);
152 bins.resize(max_depth +1);
153 bins.shrink_to_fit();
158 DLOG(INFO)<<
"max depth: "<<max_depth;
169 beginning_positions.
reserve(node_count);
174 DLOG(INFO)<<
"Iterating nodes";
176 for(
uint i = bins.size()-1; i>=min_lrf; i--){
178 auto bin_it = bins[i].begin();
179 while (bin_it!= bins[i].end()){
182 node_type node = *bin_it;
185 auto bp = beginning_positions.find(node);
189 if(bp == beginning_positions.end()){
192 std::vector<uint> positions;
194 std::vector<uint> offsets;
195 std::vector<uint> leaf_bps;
197 node_type inner = stree->get_first_child(node);
201 if(stree->get_first_child(inner) == 0){
202 uint temp = stree->get_suffix(inner);
203 if(!dead_positions[temp]){
205 leaf_bps.push_back(stree->get_suffix(inner));
208 auto & child_bp = beginning_positions[inner];
209 if(!child_bp.empty()){
210 offsets.push_back(positions.size());
212 positions.insert(positions.end(), child_bp.begin(), child_bp.end());
215 beginning_positions.erase(inner);
220 inner = stree->get_next_sibling(inner);
223 std::sort(leaf_bps.begin(), leaf_bps.end());
224 offsets.push_back(positions.size());
225 positions.insert(positions.end(),leaf_bps.begin(), leaf_bps.end());
226 for(
uint k = 0; k < offsets.size()-1; k++){
227 std::inplace_merge(positions.begin(), positions.begin()+ offsets[k], positions.begin()+ offsets[k+1]);
231 std::inplace_merge(positions.begin(), positions.begin()+ offsets.back(), positions.end());
233 beginning_positions[node]=positions;
239 std::vector<uint> begin_pos = beginning_positions[node];
241 if(begin_pos.size() >= 2 && ( (begin_pos.back() ) - (begin_pos.front()) >= i)){
244 dead_positions[(begin_pos.back())] ||
245 dead_positions[(begin_pos.front())]
250 std::vector<uint> sel_pos = select_starting_positions(node, i);
252 if(! (sel_pos.size() >=2) ){
257 std::pair<uint,uint> rule = std::make_pair(sel_pos.at(0), i);
258 dictionary.push_back(rule);
261 for(
auto bp_it = sel_pos.begin(); bp_it != sel_pos.end(); bp_it++){
263 non_term nts = std::make_tuple(*bp_it, nts_number, i);
264 nts_symbols.push_back(nts);
267 dead_positions[
pos+ *bp_it] = 1;
288 DLOG(INFO) <<
"sorting occurences";
289 std::sort(nts_symbols.begin(), nts_symbols.end());
Contains the text compression and encoding framework.
A vector over arbitrary unsigned integer types.
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
IntVector< uint_t< 1 > > BitVector
Represents a bit vector, alias for IntVector with a fixed bit width of 1.
void compute_rules(io::InputView &input, rules &dictionary, non_terminal_symbols &nts_symbols)
size_type size() const
Returns size of the View.
uint64_t as_integer() const
Algorithm(Algorithm const &)=default
Env & env()
Provides access to the environment that the algorithm works in.
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
void reserve(size_type n)
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Interface for algorithms.