tudocomp
– The TU Dortmund Compression Framework
PLCPStrategy.hpp
Go to the documentation of this file.
1 #pragma once
2 #include <tudocomp/config.h>
3 
4 #include <vector>
5 
7 
8 #include <tudocomp/Algorithm.hpp>
9 #include <tudocomp/ds/TextDS.hpp>
10 
12 #include <tudocomp/ds/LCPSada.hpp>
13 #include <boost/heap/pairing_heap.hpp>
14 
16 
17 namespace tdc {
18 namespace lcpcomp {
19 
20 class PLCPStrategy : public Algorithm {
21 public:
23 
24  inline static Meta meta() {
25  Meta m("lcpcomp_comp", "plcp", "compressor using PLCP array");
26  return m;
27  }
28 
29  inline static ds::dsflags_t textds_flags() {
30  return ds::SA | ds::ISA;
31  }
32 
33  template<typename text_t>
34  inline void factorize(text_t& text,
35  size_t threshold,
36  lzss::FactorBuffer& factors) {
37 
38  // Construct SA, ISA and LCP
39  auto pplcp = StatPhase::wrap("Construct index ds", [&]{
40  text.require(text_t::SA | text_t::ISA);
41  const auto& sa = text.require_sa();
42  return LCPForwardIterator { (construct_plcp_bitvector(env(), sa, text)) };
43  });
44 
45  const auto& sa = text.require_sa();
46  const auto& isa = text.require_isa();
47  const len_t n = sa.size();
48 
49  struct Poi {
50  len_t pos;
51  len_t lcp;
52  len_t no;
53  Poi(len_t _pos, len_t _lcp, len_t _no) : pos(_pos), lcp(_lcp), no(_no) {}
54  bool operator<(const Poi& o) const {
55  DCHECK_NE(o.pos, this->pos);
56  if(o.lcp == this->lcp) return this->pos > o.pos;
57  return this->lcp < o.lcp;
58  }
59  };
60 
61  StatPhase::wrap("Search Peaks", [&]{
62  boost::heap::pairing_heap<Poi> heap;
63  std::vector<typename boost::heap::pairing_heap<Poi>::handle_type> handles;
64 
65  IF_STATS(len_t max_heap_size = 0);
66 
67  // std::stack<poi> pois; // text positions of interest, i.e., starting positions of factors we want to replace
68 
69  len_t lastpos = 0;
70  len_t lastpos_lcp = 0;
71  for(len_t i = 0; i+1 < n; ++i) {
72  while(pplcp.index() < i) pplcp.advance();
73  const len_t plcp_i = pplcp(); DCHECK_EQ(pplcp.index(), i);
74  if(heap.empty()) {
75  if(plcp_i >= threshold) {
76  handles.emplace_back(heap.emplace(i, plcp_i, handles.size()));
77  lastpos = i;
78  lastpos_lcp = plcp_i;
79  }
80  continue;
81  }
82  if(i - lastpos >= lastpos_lcp || tdc_unlikely(i+1 == n)) {
83  IF_DEBUG(bool first = true);
84  IF_STATS(max_heap_size = std::max<len_t>(max_heap_size, heap.size()));
85  DCHECK_EQ(heap.size(), handles.size());
86  while(!heap.empty()) {
87  const Poi& top = heap.top();
88  const len_t source_position = sa[isa[top.pos]-1];
89  factors.emplace_back(top.pos, source_position, top.lcp);
90  const len_t next_pos = top.pos; // store top, this is the current position that gets factorized
91  IF_DEBUG(if(first) DCHECK_EQ(top.pos, lastpos); first = false;)
92 
93  {
94  len_t newlcp_peak = 0; // a new peak can emerge at top.pos+top.lcp
95  bool peak_exists = false;
96  if(top.pos+top.lcp < i)
97  for(len_t j = top.no+1; j < handles.size(); ++j) { // erase all right peaks that got substituted
98  if( handles[j].node_ == nullptr) continue;
99  const Poi poi = *(handles[j]);
100  DCHECK_LT(next_pos, poi.pos);
101  if(poi.pos < next_pos+top.lcp) {
102  heap.erase(handles[j]);
103  handles[j].node_ = nullptr;
104  if(poi.lcp + poi.pos > next_pos+top.lcp) {
105  const len_t remaining_lcp = poi.lcp+poi.pos - (next_pos+top.lcp);
106  DCHECK_NE(remaining_lcp,0);
107  if(newlcp_peak != 0) DCHECK_LE(remaining_lcp, newlcp_peak);
108  newlcp_peak = std::max(remaining_lcp, newlcp_peak);
109  }
110  } else if( poi.pos == next_pos+top.lcp) { peak_exists=true; }
111  else { break; } // only for performance
112  }
113  #ifdef DEBUG
114  if(peak_exists) { //TODO: DEBUG
115  for(len_t j = top.no+1; j < handles.size(); ++j) {
116  if( handles[j].node_ == nullptr) continue;
117  const Poi& poi = *(handles[j]);
118  if(poi.pos == next_pos+top.lcp) {
119  DCHECK_LE(newlcp_peak, poi.lcp);
120  break;
121  }
122  }
123  }
124  #endif
125  if(!peak_exists && newlcp_peak >= threshold) {
126  len_t j = top.no+1;
127  DCHECK(handles[j].node_ == nullptr);
128  handles[j] = heap.emplace(next_pos+top.lcp, newlcp_peak, j);
129  }
130 
131  }
132  handles[top.no].node_ = nullptr;
133  heap.pop(); // top now gets erased
134 
135  for(auto it = handles.rbegin(); it != handles.rend(); ++it) {
136  if( (*it).node_ == nullptr) continue;
137  Poi& poi = (*(*it));
138  if(poi.pos > next_pos) continue;
139  const len_t newlcp = next_pos - poi.pos;
140  if(newlcp < poi.lcp) {
141  if(newlcp < threshold) {
142  heap.erase(*it);
143  it->node_ = nullptr;
144  } else {
145  poi.lcp = newlcp;
146  heap.decrease(*it);
147 
148  }
149  } else {
150  break;
151  }
152  }
153  }
154  handles.clear();
155  --i;
156  continue;
157  }
158  DCHECK_EQ(pplcp.index(), i);
159  DCHECK_EQ(plcp_i, pplcp());
160  if(plcp_i <= lastpos_lcp) continue;
161  DCHECK_LE(threshold, plcp_i);
162  handles.emplace_back(heap.emplace(i,plcp_i, handles.size()));
163  lastpos = i;
164  // DCHECK_EQ(plcp[lastpos], plcp_i);
165  lastpos_lcp = plcp_i;
166  }
167  IF_STATS(StatPhase::log("max heap size", max_heap_size));
168 
169  });
170  }
171 
172 };
173 
174 }}//ns
175 
bool operator<(const ConstGenericView< uliteral_t > &lhs, const ConstGenericView< uliteral_t > &rhs)
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
#define tdc_unlikely(x)
Provides a hint to the compiler that x is expected to resolve to false.
Definition: def.hpp:23
unsigned int dsflags_t
Definition: TextDSFlags.hpp:8
constexpr dsflags_t ISA
Definition: TextDSFlags.hpp:12
Algorithm(Algorithm const &)=default
void factorize(text_t &text, size_t threshold, lzss::FactorBuffer &factors)
Env & env()
Provides access to the environment that the algorithm works in.
Definition: Algorithm.hpp:51
fast_t< len_compact_t > len_t
Type to represent an length value.
Definition: def.hpp:114
len_compact_t pos
Definition: LZSSFactors.hpp:38
#define IF_STATS(x)
x is compiled only when the STATS_DISABLED macro is undefined.
Definition: def.hpp:59
void emplace_back(len_t fpos, len_t fsrc, len_t flen)
Definition: LZSSFactors.hpp:41
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
Definition: StatPhase.hpp:218
constexpr dsflags_t SA
Definition: TextDSFlags.hpp:11
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Definition: StatPhase.hpp:143
static ds::dsflags_t textds_flags()
Interface for algorithms.
Definition: Algorithm.hpp:15
sdsl::bit_vector construct_plcp_bitvector(Env &, const sa_t &sa, const text_t &text)
Definition: LCPSada.hpp:174
#define IF_DEBUG(x)
x is compiled only in debug builds.
Definition: def.hpp:32