15 template<
typename text_t = TextDS<> , u
int min_lrf = 2>
20 typedef std::tuple<uint,uint,uint> non_term;
21 typedef std::vector<non_term> non_terminal_symbols;
22 typedef std::vector<std::pair<uint,uint>> rules;
26 inline virtual std::vector<uint> select_starting_positions(std::vector<uint> starting_positions,
uint length){
27 std::vector<uint> selected_starting_positions;
28 std::sort(starting_positions.begin(), starting_positions.end());
30 selected_starting_positions.
reserve(starting_positions.size());
34 for (std::vector<uint>::iterator it=starting_positions.begin(); it!=starting_positions.end(); ++it){
38 if(last+length <= current){
39 selected_starting_positions.push_back(current);
45 return selected_starting_positions;
51 std::vector<std::vector<uint> > lcp_bins;
57 Meta m(
"lfs_comp",
"esa");
70 text_t t(
env().env_for_option(
"textds"), input);
71 DLOG(INFO) <<
"building sa and lcp";
76 auto& sa_t = t.require_sa();
77 auto& lcp_t = t.require_lcp();
86 DLOG(INFO) <<
"iterate over lcp";
92 for(
uint i = 1; i<lcp_t.size(); i++){
95 if(lcp_t[i] >= min_lrf){
98 lcp_bins.resize(max+1);
102 dif = std::abs((
long)(sa_t[i-1] - sa_t[i]));
103 factor_length = lcp_t[i];
104 factor_length = std::min(factor_length, dif);
109 while(j>0 && lcp_t[j]>factor_length ){
110 alt_dif = std::abs((
long)(sa_t[j] - sa_t[i]));
118 factor_length = lcp_t[i];
119 factor_length = std::min(factor_length, dif);
123 lcp_bins[factor_length].push_back(i);
127 DLOG(INFO)<<
"max lcp: "<<max;
128 DLOG(INFO)<<
"lcp bins: "<<lcp_bins.size();
132 if(lcp_bins.size()< min_lrf){
133 DLOG(INFO)<<
"nothing to replace, returning";
140 DLOG(INFO) <<
"computing lrfs";
148 nts_symbols.
reserve(lcp_bins.size());
149 uint non_terminal_symbol_number = 0;
151 for(
uint lcp_len = lcp_bins.size()-1; lcp_len>= min_lrf; lcp_len--){
152 for(
auto bin_it = lcp_bins[lcp_len].begin(); bin_it!=lcp_bins[lcp_len].end(); bin_it++){
157 std::vector<uint> starting_positions;
158 starting_positions.reserve(20);
167 uint shorter_dif = lcp_len;
170 while(i>=0 && ( lcp_t[i])>=lcp_len){
171 if(!dead_positions[sa_t[i-1]] && !dead_positions[sa_t[i-1]+lcp_len-1]){
172 starting_positions.push_back(sa_t[i-1]);
175 if(!dead_positions[sa_t[i-1]] && dead_positions[sa_t[i-1]+lcp_len-1] ){
176 while(!dead_positions[sa_t[i-1]+lcp_len-shorter_dif]){
184 while(i< lcp_t.size() && lcp_t[i]>=lcp_len){
186 if(!dead_positions[sa_t[i]] && !dead_positions[sa_t[i]+lcp_len-1]){
187 starting_positions.push_back(sa_t[i]);
189 if(!dead_positions[sa_t[i]] && dead_positions[sa_t[i]+lcp_len-1] ){
190 while(!dead_positions[sa_t[i]+lcp_len-shorter_dif]){
196 if(starting_positions.size()>=2){
197 std::vector<uint> selected_starting_positions = select_starting_positions(starting_positions, lcp_len);
200 if(selected_starting_positions.size()>=2){
204 uint offset = sa_t[*bin_it];
205 std::pair<uint,uint> longest_repeating_factor(offset, lcp_len);
206 for (std::vector<uint>::iterator it=selected_starting_positions.begin(); it!=selected_starting_positions.end(); ++it){
207 for(
uint k = 0; k<lcp_len; k++){
208 dead_positions[*it+k]=1;
211 uint length_of_symbol = lcp_len;
212 std::tuple<uint,uint,uint> symbol(*it, non_terminal_symbol_number, length_of_symbol);
213 nts_symbols.push_back(symbol);
216 dictionary.push_back(longest_repeating_factor);
217 non_terminal_symbol_number++;
227 DLOG(INFO) <<
"sorting symbols";
228 std::sort(nts_symbols.begin(), nts_symbols.end());
Contains the text compression and encoding framework.
A vector over arbitrary unsigned integer types.
IntVector< uint_t< 1 > > BitVector
Represents a bit vector, alias for IntVector with a fixed bit width of 1.
Algorithm(Algorithm const &)=default
Env & env()
Provides access to the environment that the algorithm works in.
void reserve(size_type n)
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Manages text related data structures.
Interface for algorithms.
void compute_rules(io::InputView &input, rules &dictionary, non_terminal_symbols &nts_symbols)