15 #include <sdsl/suffix_trees.hpp> 34 template<
typename literal_coder_t = HuffmanCoder,
typename len_coder_t = EliasGammaCoder >
42 typedef sdsl::cst_sct3< sdsl::csa_bitcompressed<> > cst_t;
45 using node_type =
typename cst_t::node_type;
58 std::vector<std::pair<uint, uint> > non_terminal_symbols;
62 std::vector<std::vector<uint> > bins;
66 std::vector<std::vector<uint> > node_begins;
77 Meta m(
"compressor",
"lfs2",
92 DLOG(INFO) <<
"Compressor lfs2 instantiated";
113 if(in.size() >= min_lrf){
121 while(in[size-1] == 0){
127 std::string in_string ((
const char*) in.data(), size);
128 sdsl::construct_im(stree, in_string , 1);
135 uint node_counter = 0;
137 typedef sdsl::cst_bfs_iterator<cst_t> iterator;
138 iterator begin = iterator(&stree, stree.root());
139 iterator end = iterator(&stree, stree.root(),
true,
true);
142 DLOG(INFO)<<
"iterate st";
144 for (iterator it = begin; it != end; ++it) {
146 if(!stree.is_leaf(*it)){
148 if(bins.size() <= stree.depth(*it)) {
150 uint resize = bins.size()*2;
151 while (resize<= stree.depth(*it)) {
156 bins[stree.depth(*it)].push_back(stree.id(*it));
161 node_begins.resize(node_counter);
163 uint nts_number = 1 ;
166 DLOG(INFO)<<
"iterate over Node Bins";
167 for(
uint i = bins.size()-1; i>=min_lrf; i--){
170 while(!bins[i].empty()){
171 uint id = bins[i].back();
173 node_type node = stree.inv_id(
id);
174 uint no_leaf_id =
id - stree.size();
178 if(node_begins[no_leaf_id].empty()){
180 std::vector<uint> offsets;
181 std::vector<uint> leaf_bps;
183 for (
auto & child : stree.children(node)) {
184 if(stree.is_leaf(child)){
187 leaf_bps.push_back(stree.csa[stree.lb(child)]);
191 int child_id = stree.id(child) - stree.size();
192 if(!node_begins[child_id ].empty()){
194 offsets.push_back(node_begins[no_leaf_id].size());
197 node_begins[no_leaf_id].insert(node_begins[no_leaf_id].end(),node_begins[child_id ].begin(), node_begins[child_id].end());
199 node_begins[child_id] = std::vector<uint>();
204 std::sort(leaf_bps.begin(), leaf_bps.end());
206 offsets.push_back(node_begins[no_leaf_id].size());
207 node_begins[no_leaf_id].insert(node_begins[no_leaf_id].end(),leaf_bps.begin(), leaf_bps.end());
210 for(
uint k = 0; k < offsets.size()-1; k++){
211 std::inplace_merge(node_begins[no_leaf_id].begin(), node_begins[no_leaf_id].begin()+ offsets[k], node_begins[no_leaf_id].begin()+ offsets[k+1]);
215 std::inplace_merge(node_begins[no_leaf_id].begin(), node_begins[no_leaf_id].begin()+ offsets.back(), node_begins[no_leaf_id].end());
221 if(node_begins[no_leaf_id].empty()){
225 if((node_begins[no_leaf_id].size()>=2)){
227 if (( (
uint)( node_begins[no_leaf_id].back() - node_begins[no_leaf_id].front() )) >= i ){
230 signed long last = 0 - (long) i;
231 std::vector<uint> first_layer_viable;
232 std::vector<uint> second_layer_viable;
233 for(
uint occurence : node_begins[no_leaf_id]){
235 if( (last+i <= (
long) occurence)){
236 if(fl_offsets[occurence] == 0){
237 if(fl_offsets[occurence + i -1] == 0){
239 first_layer_viable.push_back(occurence);
244 uint parent_nts= first_layer_nts[ occurence - (fl_offsets[occurence] -1) ];
245 auto nts = non_terminal_symbols[parent_nts-1];
247 if(nts.second >=fl_offsets[occurence]-1 + i ){
248 second_layer_viable.
push_back(occurence);
260 if(first_layer_viable.size() >=1 &&(first_layer_viable.size() + second_layer_viable.size() >= 2) ) {
261 std::pair<uint,uint> nts = std::make_pair(first_layer_viable.front(), i);
262 non_terminal_symbols.push_back(nts);
265 for(
uint occ : first_layer_viable){
266 first_layer_nts[occ]= nts_number;
268 for(
uint nts_length =0; nts_length < i; nts_length++){
269 fl_offsets[occ + nts_length] = nts_length+1;
275 for(
uint sl_occ :second_layer_viable){
276 uint parent_nts= first_layer_nts[ sl_occ - (fl_offsets[sl_occ] -1) ];
278 auto parent_sym = non_terminal_symbols[parent_nts-1];
279 uint parent_start= parent_sym.first;
280 uint sl_start = (parent_start + fl_offsets[sl_occ] -1);
281 uint sl_end = sl_start+i-1;
282 if(second_layer_dead[sl_start] == (
uint)0 && second_layer_dead[sl_end] == (
uint)0){
284 second_layer_nts[sl_start]=nts_number;
286 for(
uint dead = sl_start; dead<=sl_end;dead++){
287 second_layer_dead[dead]=1;
302 uint min_shorter = node_begins[no_leaf_id].
back() - node_begins[no_leaf_id].front();
304 node_type parent = stree.parent(node);
305 uint depth = stree.depth(parent);
306 if(depth < (min_shorter)){
308 bins[min_shorter].push_back(stree.id(node));
325 DLOG(INFO)<<
"Computing symbol depth";
329 for(
uint nts_num =0; nts_num<non_terminal_symbols.size(); nts_num++){
330 auto symbol = non_terminal_symbols[nts_num];
331 uint cur_depth = nts_depth[nts_num];
333 for(
uint pos = symbol.first;
pos < symbol.second + symbol.first ;
pos++){
334 if(second_layer_nts[
pos]>0){
336 uint symbol_num = second_layer_nts[
pos] -1;
337 if(nts_depth[symbol_num]< cur_depth+1){
338 nts_depth[symbol_num]= cur_depth+1;
348 DLOG(INFO)<<
"Computing done";
350 std::sort(nts_depth.begin(), nts_depth.end());
351 if(nts_depth.size()>0){
353 uint max_depth = nts_depth[nts_depth.size()-1];
355 DLOG(INFO)<<
"Max CFG Depth: "<< max_depth;
356 DLOG(INFO)<<
"Number of CFG rules: "<< non_terminal_symbols.
size();
358 if(nts_depth.size()>=4){
359 uint quarter = nts_depth.size() /4;
362 StatPhase::log(
"50 \% quantil CFG Depth", nts_depth[(2*quarter) -1]);
363 StatPhase::log(
"75 \% quantil CFG Depth", nts_depth[(3*quarter) -1]);
374 StatPhase::log(
"Number of CFG rules", non_terminal_symbols.size());
376 std::stringstream literals;
378 for(
uint position = 0; position< in.size(); position++){
379 if(fl_offsets[position]==0){
380 literals << in[position];
383 for(
uint nts_num = 0; nts_num<non_terminal_symbols.size(); nts_num++){
385 auto symbol = non_terminal_symbols[nts_num];
387 for(
uint pos = symbol.first;
pos < symbol.second + symbol.first;
pos++){
388 if(second_layer_nts[
pos] == 0 &&
pos < in.size()){
398 DLOG(INFO) <<
"encoding dictionary symbol sizes ";
400 std::shared_ptr<BitOStream> bitout = std::make_shared<BitOStream>(output);
401 typename literal_coder_t::Encoder lit_coder(
402 env().env_for_option(
"lfs2_lit_coder"),
406 typename len_coder_t::Encoder len_coder(
407 env().env_for_option(
"lfs2_len_coder"),
413 DLOG(INFO)<<
"number nts: " << non_terminal_symbols.size();
414 Range intrange (0, UINT_MAX);
416 if(non_terminal_symbols.size()>=1){
417 auto symbol = non_terminal_symbols[0];
418 uint last_length=symbol.second;
420 Range s_length_r (0,last_length);
421 len_coder.encode(last_length,intrange);
423 for(
uint nts_num = 1; nts_num < non_terminal_symbols.size(); nts_num++){
424 symbol = non_terminal_symbols[nts_num];
425 len_coder.encode(last_length-symbol.second,s_length_r);
426 last_length=symbol.second;
430 len_coder.encode(symbol.second,s_length_r);
432 len_coder.encode(0,intrange);
435 Range dict_r(0, non_terminal_symbols.size());
438 long buf_size = bitout->tellp();
441 DLOG(INFO)<<
"Bytes Length Encoding: "<< buf_size;
444 DLOG(INFO) <<
"encoding dictionary symbols";
445 uint dict_literals=0;
448 if(non_terminal_symbols.size()>=1){
449 std::pair<uint,uint> symbol;
450 for(
long nts_num =non_terminal_symbols.size()-1; nts_num >= 0; nts_num--){
452 symbol = non_terminal_symbols[nts_num];
454 for(
uint pos = symbol.first;
pos < symbol.second + symbol.first ;
pos++){
455 if(second_layer_nts[
pos] > 0){
456 lit_coder.encode(1,
bit_r);
457 lit_coder.encode(second_layer_nts[
pos], dict_r);
458 auto symbol = non_terminal_symbols[second_layer_nts[
pos] -1];
462 pos += symbol.second - 1;
465 lit_coder.encode(0,
bit_r);
480 buf_size = long(bitout->tellp()) - buf_size;
484 DLOG(INFO)<<
"Bytes Non-Terminal Symbol Encoding: "<< buf_size;
488 DLOG(INFO)<<
"encode start symbol";
490 if(first_layer_nts[
pos]>0){
491 lit_coder.encode(1,
bit_r);
492 lit_coder.encode(first_layer_nts[
pos], dict_r);
493 auto symbol = non_terminal_symbols[first_layer_nts[
pos] -1];
495 pos += symbol.second - 1;
498 lit_coder.encode(0,
bit_r);
506 buf_size = long(bitout->tellp()) - buf_size;
510 DLOG(INFO)<<
"Bytes Start Symbol Encoding: "<< buf_size;
518 double literal_percent = ((double)dict_literals + (
double)literals)/ (
double)in.size();
519 StatPhase::log(
"Literals Encoding / Literals Input", literal_percent);
522 DLOG(INFO)<<
"encoding done";
529 DLOG(INFO) <<
"decompress lfs";
530 std::shared_ptr<BitIStream> bitin = std::make_shared<BitIStream>(input);
532 typename literal_coder_t::Decoder lit_decoder(
533 env().env_for_option(
"lfs2_lit_coder"),
536 typename len_coder_t::Decoder len_decoder(
537 env().env_for_option(
"lfs2_len_coder"),
540 Range int_r (0,UINT_MAX);
542 uint symbol_length = len_decoder.template decode<uint>(int_r);
543 Range slength_r (0, symbol_length);
544 std::vector<uint> dict_lengths;
545 dict_lengths.reserve(symbol_length);
546 dict_lengths.push_back(symbol_length);
547 while(symbol_length>0){
549 uint current_delta = len_decoder.template decode<uint>(slength_r);
550 symbol_length-=current_delta;
551 dict_lengths.push_back(symbol_length);
553 dict_lengths.pop_back();
555 DLOG(INFO)<<
"decoded number of nts: "<< dict_lengths.size();
559 std::vector<std::string> dictionary;
560 uint dictionary_size = dict_lengths.size();
562 Range dictionary_r (0, dictionary_size);
565 dictionary.resize(dict_lengths.size());
567 std::stringstream ss;
571 DLOG(INFO) <<
"reading dictionary";
572 for(
long i = dict_lengths.size() -1; i>=0 ;i--){
576 long size_cur = (long) dict_lengths[i];
578 bool bit1 = lit_decoder.template decode<bool>(
bit_r);
583 symbol_number = lit_decoder.template decode<uint>(dictionary_r);
587 if(symbol_number < dictionary.size()){
589 ss << dictionary.at(symbol_number);
590 size_cur-= dict_lengths[symbol_number];
597 c1 = lit_decoder.template decode<char>(
literal_r);
606 dictionary[i]=ss.str();
613 while(!lit_decoder.eof()){
615 bool bit1 = lit_decoder.template decode<bool>(
bit_r);
620 c1 = lit_decoder.template decode<char>(
literal_r);
625 symbol_number = lit_decoder.template decode<uint>(dictionary_r);
628 if(symbol_number < dictionary.size()){
630 ostream << dictionary.at(symbol_number);
632 DLOG(INFO)<<
"too large symbol: " << symbol_number;
Represents a generic range of positive integers.
Contains the text compression and encoding framework.
constexpr auto bit_r
Global predefined range for bits (0 or 1).
A vector over arbitrary unsigned integer types.
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
Base for data compressors.
void push_back(const value_type &val)
IntVector< uint_t< 1 > > BitVector
Represents a bit vector, alias for IntVector with a fixed bit width of 1.
virtual void compress(Input &input, Output &output) override
Compress the given input to the given output.
uint64_t as_integer() const
A literal iterator that yields every character from a View.
Env & env()
Provides access to the environment that the algorithm works in.
OutputStream as_stream() const
Creates a stream that allows for character-wise output.
An empty literal iterator that yields no literals whatsoever.
constexpr auto literal_r
Global predefined reange for literals.
An abstraction layer for algorithm output.
Defines data encoding to and decoding from a stream of Elias-Gamma codes.
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
LFS2Compressor(Env &&env)
Local environment for a compression/encoding/decompression call.
virtual void decompress(Input &input, Output &output) override
Decompress the given input to the given output.