tudocomp
– The TU Dortmund Compression Framework
ArraysComp.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <tudocomp/Algorithm.hpp>
4 #include <tudocomp/ds/TextDS.hpp>
5 #include <tudocomp/def.hpp>
6 
9 
11 
12 namespace tdc {
13 namespace lcpcomp {
14 
22 class ArraysComp : public Algorithm {
23 public:
24  inline static Meta meta() {
25  Meta m("lcpcomp_comp", "arrays");
26  return m;
27  }
28 
29  inline static ds::dsflags_t textds_flags() {
30  return ds::SA | ds::ISA | ds::LCP;
31  }
32 
33  using Algorithm::Algorithm; //import constructor
34 
35  template<typename text_t>
36  inline void factorize(text_t& text, size_t threshold, lzss::FactorBuffer& factors) {
37 
38  // Construct SA, ISA and LCP
39  auto lcp = StatPhase::wrap("Construct Index Data Structures", [&] {
40  text.require(text_t::SA | text_t::ISA | text_t::LCP);
41 
42  auto lcp = text.release_lcp();
43  StatPhase::log("maxlcp", lcp.max_lcp());
44  return lcp;
45  });
46 
47  auto& sa = text.require_sa();
48  auto& isa = text.require_isa();
49 
50  if(lcp.max_lcp()+1 <= threshold) return; // nothing to factorize
51  const size_t cand_length = lcp.max_lcp()+1-threshold;
52  std::vector<len_compact_t>* cand = new std::vector<len_compact_t>[cand_length];
53 
54  StatPhase::wrap("Fill candidates", [&]{
55  for(size_t i = 1; i < sa.size(); ++i) {
56  if(lcp[i] < threshold) continue;
57  cand[lcp[i]-threshold].push_back(i);
58  }
59 
60  StatPhase::log("entries", [&] () {
61  size_t ret = 0;
62  for(size_t i = 0; i < cand_length; ++i) {
63  ret += cand[i].size();
64  }
65  return ret; }());
66  });
67 
68  StatPhase::wrap("Compute Factors", [&]{
69  StatPhase phase(std::string{"Factors at max. LCP value "}
70  + std::to_string(lcp.max_lcp()));
71 
72  for(size_t maxlcp = lcp.max_lcp(); maxlcp >= threshold; --maxlcp) {
73  IF_STATS({
74  const len_t maxlcpbits = bits_for(maxlcp-threshold);
75  if( ((maxlcpbits ^ (1UL<<(bits_for(maxlcpbits)-1))) == 0) && (( (maxlcp-threshold) ^ (1UL<<(maxlcpbits-1))) == 0)) { // only log at certain LCP values
76  phase.split(std::string{"Factors at max. LCP value "}
77  + std::to_string(maxlcp));
78  phase.log_stat("num factors", factors.size());
79  }
80  })
81  std::vector<len_compact_t>& candcol = cand[maxlcp-threshold]; // select the vector specific to the LCP value
82  for(size_t i = 0; i < candcol.size(); ++i) {
83  const len_compact_t& index = candcol[i];
84  const auto& lcp_value = lcp[index];
85  if(lcp_value < maxlcp) { // if it got resized, we push it down
86  if(lcp_value < threshold) continue; // already erased
87  cand[lcp_value-threshold].push_back(index);
88  continue;
89  }
90  //generate factor
91  const len_t pos_target = sa[index];
92  DCHECK_GT(index,0u);
93  const len_t pos_source = sa[index-1];
94  const len_t factor_length = lcp[index];
95 
96  factors.emplace_back(pos_target, pos_source, factor_length);
97 
98  //erase suffixes on the replaced area
99  for(size_t k = 0; k < factor_length; ++k) {
100  lcp[isa[pos_target + k]] = 0;
101  }
102 
103  const len_t max_affect = std::min(factor_length, pos_target); //if pos_target is at the very beginning, we have less to scan
104  //correct intersecting entries
105  for(len_t k = 0; k < max_affect; ++k) {
106  const len_t pos_suffix = pos_target - k - 1; DCHECK_GE(pos_target,k+1);
107  const len_t ind_suffix = isa[pos_suffix];
108  lcp[ind_suffix] = std::min<len_t>(k+1, lcp[ind_suffix]);
109  }
110 
111  }
112  candcol.clear();
113  candcol.shrink_to_fit();
114  }
115  });
116  delete [] cand;
117  }
118 };
119 
120 }}
121 
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
constexpr uint_fast8_t bits_for(size_t n)
Computes the number of bits required to store the given integer value.
Provides meta information about an Algorithm.
Definition: Meta.hpp:34
void factorize(text_t &text, size_t threshold, lzss::FactorBuffer &factors)
Definition: ArraysComp.hpp:36
unsigned int dsflags_t
Definition: TextDSFlags.hpp:8
constexpr dsflags_t ISA
Definition: TextDSFlags.hpp:12
static ds::dsflags_t textds_flags()
Definition: ArraysComp.hpp:29
std::string to_string(tdc::uint_impl_t< N > value)
Definition: uint_t.hpp:241
Provides access to runtime and memory measurement in statistics phases.
Definition: StatPhase.hpp:44
void split(const char *new_title)
Starts a new phase as a sibling, reusing the same object.
Definition: StatPhase.hpp:264
Algorithm(Algorithm const &)=default
Creates arrays instead of an LCP-heap Each array corresponds to one LCP value We do not eagerly invok...
Definition: ArraysComp.hpp:22
uint32_t len_compact_t
Type to represent an bit-compact length value.
Definition: def.hpp:103
fast_t< len_compact_t > len_t
Type to represent an length value.
Definition: def.hpp:114
#define IF_STATS(x)
x is compiled only when the STATS_DISABLED macro is undefined.
Definition: def.hpp:59
void emplace_back(len_t fpos, len_t fsrc, len_t flen)
Definition: LZSSFactors.hpp:41
void log_stat(const char *key, const T &value)
Logs a user statistic for this phase.
Definition: StatPhase.hpp:298
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
Definition: StatPhase.hpp:218
constexpr dsflags_t SA
Definition: TextDSFlags.hpp:11
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Definition: StatPhase.hpp:143
constexpr dsflags_t LCP
Definition: TextDSFlags.hpp:13
Interface for algorithms.
Definition: Algorithm.hpp:15