6 #include <unordered_map> 36 template<
typename literal_coder_t = HuffmanCoder,
typename len_coder_t = EliasGammaCoder >
42 std::unique_ptr<BinarySuffixTree> stree;
58 std::vector<std::pair<uint, uint> > non_terminal_symbols;
62 std::vector<std::vector<uint> > bins;
66 std::unordered_map< uint , std::vector< uint > > node_begins;
68 inline virtual void compute_string_depth(
uint node,
uint str_depth){
70 uint child = stree->get_first_child(node);
74 while(str_depth>= bins.size()){
75 bins.
resize(bins.size()*2);
80 if(str_depth>max_depth){
86 bins[str_depth].push_back(node);
89 compute_string_depth(child, str_depth + stree->get_edge_length(child));
90 child=stree->get_next_sibling(child);
103 Meta m(
"compressor",
"lfs2bst",
117 DLOG(INFO) <<
"Compressor lfs2bst instantiated";
131 DLOG(INFO) <<
"text length: "<<in.
size();
133 if(in.size() >= min_lrf){
137 DLOG(INFO)<<
"Constructing ST";
138 stree = std::make_unique<BinarySuffixTree>(in);
150 compute_string_depth(0,0);
152 bins.resize(max_depth +1);
153 bins.shrink_to_fit();
158 node_begins.reserve(node_count);
160 uint nts_number = 1 ;
163 DLOG(INFO)<<
"iterate over Node Bins";
164 for(
uint i = bins.size()-1; i>=min_lrf; i--){
167 while(!bins[i].empty()){
168 uint no_leaf_id = bins[i].back();
172 auto bp = node_begins.find(no_leaf_id);
176 if(bp == node_begins.end()){
179 std::vector<uint> positions;
181 std::vector<uint> offsets;
182 std::vector<uint> leaf_bps;
184 uint inner = stree->get_first_child(no_leaf_id);
188 if(stree->get_first_child(inner) == 0){
191 leaf_bps.push_back(stree->get_suffix(inner));
194 auto & child_bp = node_begins[inner];
195 if(!child_bp.empty()){
196 offsets.push_back(positions.size());
198 positions.insert(positions.end(), child_bp.begin(), child_bp.end());
201 node_begins.erase(inner);
206 inner = stree->get_next_sibling(inner);
209 std::sort(leaf_bps.begin(), leaf_bps.end());
210 offsets.push_back(positions.size());
211 positions.insert(positions.end(),leaf_bps.begin(), leaf_bps.end());
212 for(
uint k = 0; k < offsets.size()-1; k++){
213 std::inplace_merge(positions.begin(), positions.begin()+ offsets[k], positions.begin()+ offsets[k+1]);
217 std::inplace_merge(positions.begin(), positions.begin()+ offsets.back(), positions.end());
220 if(!std::is_sorted(positions.begin(), positions.end() )){
221 DLOG(INFO)<<
"positions not sorted!!!";
225 node_begins[no_leaf_id]=positions;
232 std::vector<uint> begin_pos = node_begins[no_leaf_id];
234 if(begin_pos.empty()){
238 if(begin_pos.size()>=2){
240 if ( ( begin_pos.back() - begin_pos.front() ) >= i ){
243 signed long last = 0 - (long) i;
244 std::vector<uint> first_layer_viable;
245 std::vector<uint> second_layer_viable;
246 for(
uint occurence : begin_pos){
248 if( (last+i <= (
long) occurence)){
249 if(fl_offsets[occurence] == 0){
250 if(fl_offsets[occurence + i -1] == 0){
252 first_layer_viable.push_back(occurence);
257 uint parent_nts= first_layer_nts[ occurence - (fl_offsets[occurence] -1) ];
258 auto nts = non_terminal_symbols[parent_nts-1];
260 if(nts.second >=fl_offsets[occurence]-1 + i ){
261 second_layer_viable.
push_back(occurence);
272 if(first_layer_viable.size() >=1 &&(first_layer_viable.size() + second_layer_viable.size() >= 2) ) {
273 std::pair<uint,uint> nts = std::make_pair(first_layer_viable.front(), i);
274 non_terminal_symbols.push_back(nts);
277 for(
uint occ : first_layer_viable){
278 first_layer_nts[occ]= nts_number;
280 for(
uint nts_length =0; nts_length < i; nts_length++){
281 fl_offsets[occ + nts_length] = nts_length+1;
287 for(
uint sl_occ :second_layer_viable){
288 uint parent_nts= first_layer_nts[ sl_occ - (fl_offsets[sl_occ] -1) ];
290 auto parent_sym = non_terminal_symbols[parent_nts-1];
291 uint parent_start= parent_sym.first;
292 uint sl_start = (parent_start + fl_offsets[sl_occ] -1);
293 uint sl_end = sl_start+i-1;
294 if(second_layer_dead[sl_start] == (
uint)0 && second_layer_dead[sl_end] == (
uint)0){
296 second_layer_nts[sl_start]=nts_number;
298 for(
uint dead = sl_start; dead<=sl_end;dead++){
299 second_layer_dead[dead]=1;
321 DLOG(INFO)<<
"Computing symbol depth";
325 for(
uint nts_num =0; nts_num<non_terminal_symbols.size(); nts_num++){
326 auto symbol = non_terminal_symbols[nts_num];
327 uint cur_depth = nts_depth[nts_num];
329 for(
uint pos = symbol.first;
pos < symbol.second + symbol.first ;
pos++){
330 if(second_layer_nts[
pos]>0){
332 uint symbol_num = second_layer_nts[
pos] -1;
333 if(nts_depth[symbol_num]< cur_depth+1){
334 nts_depth[symbol_num]= cur_depth+1;
344 DLOG(INFO)<<
"Computing done";
346 std::sort(nts_depth.begin(), nts_depth.end());
347 if(nts_depth.size()>0){
349 uint max_depth = nts_depth[nts_depth.size()-1];
351 DLOG(INFO)<<
"Max CFG Depth: "<< max_depth;
352 DLOG(INFO)<<
"Number of CFG rules: "<< non_terminal_symbols.
size();
354 if(nts_depth.size()>=4){
355 uint quarter = nts_depth.size() /4;
358 StatPhase::log(
"50 \% quantil CFG Depth", nts_depth[(2*quarter) -1]);
359 StatPhase::log(
"75 \% quantil CFG Depth", nts_depth[(3*quarter) -1]);
369 StatPhase::log(
"Number of CFG rules", non_terminal_symbols.size());
371 std::stringstream literals;
373 for(
uint position = 0; position< in.size(); position++){
374 if(fl_offsets[position]==0){
375 literals << in[position];
378 for(
uint nts_num = 0; nts_num<non_terminal_symbols.size(); nts_num++){
380 auto symbol = non_terminal_symbols[nts_num];
382 for(
uint pos = symbol.first;
pos < symbol.second + symbol.first ;
pos++){
384 if(second_layer_nts[
pos] == 0 &&
pos < in.size()){
392 DLOG(INFO)<<
"encoding";
396 DLOG(INFO) <<
"encoding dictionary symbol sizes ";
398 std::shared_ptr<BitOStream> bitout = std::make_shared<BitOStream>(output);
399 typename literal_coder_t::Encoder lit_coder(
400 env().env_for_option(
"lfs2_lit_coder"),
404 typename len_coder_t::Encoder len_coder(
405 env().env_for_option(
"lfs2_len_coder"),
411 DLOG(INFO)<<
"number nts: " << non_terminal_symbols.size();
412 Range intrange (0, UINT_MAX);
414 if(non_terminal_symbols.size()>=1){
415 auto symbol = non_terminal_symbols[0];
416 uint last_length=symbol.second;
418 Range s_length_r (0,last_length);
419 len_coder.encode(last_length,intrange);
421 for(
uint nts_num = 1; nts_num < non_terminal_symbols.size(); nts_num++){
422 symbol = non_terminal_symbols[nts_num];
423 len_coder.encode(last_length-symbol.second,s_length_r);
424 last_length=symbol.second;
428 len_coder.encode(symbol.second,s_length_r);
430 len_coder.encode(0,intrange);
433 Range dict_r(0, non_terminal_symbols.size());
436 long buf_size = bitout->tellp();
439 DLOG(INFO)<<
"Bytes Length Encoding: "<< buf_size;
442 DLOG(INFO) <<
"encoding dictionary symbols";
443 uint dict_literals=0;
446 if(non_terminal_symbols.size()>=1){
447 std::pair<uint,uint> symbol;
448 for(
long nts_num =non_terminal_symbols.size()-1; nts_num >= 0; nts_num--){
449 symbol = non_terminal_symbols[nts_num];
451 for(
uint pos = symbol.first;
pos < symbol.second + symbol.first ;
pos++){
452 if(second_layer_nts[
pos] > 0){
453 lit_coder.encode(1,
bit_r);
454 lit_coder.encode(second_layer_nts[
pos], dict_r);
455 auto symbol = non_terminal_symbols[second_layer_nts[
pos] -1];
459 pos += symbol.second - 1;
462 lit_coder.encode(0,
bit_r);
477 buf_size = long(bitout->tellp()) - buf_size;
481 DLOG(INFO)<<
"Bytes Non-Terminal Symbol Encoding: "<< buf_size;
485 DLOG(INFO)<<
"encode start symbol";
487 if(first_layer_nts[
pos]>0){
488 lit_coder.encode(1,
bit_r);
489 lit_coder.encode(first_layer_nts[
pos], dict_r);
490 auto symbol = non_terminal_symbols[first_layer_nts[
pos] -1];
492 pos += symbol.second - 1;
495 lit_coder.encode(0,
bit_r);
503 buf_size = long(bitout->tellp()) - buf_size;
507 DLOG(INFO)<<
"Bytes Start Symbol Encoding: "<< buf_size;
515 double literal_percent = ((double)dict_literals + (
double)literals)/ (
double)in.size();
516 StatPhase::log(
"Literals Encoding / Literals Input", literal_percent);
518 DLOG(INFO)<<
"encoding done";
525 DLOG(INFO) <<
"decompress lfs";
526 std::shared_ptr<BitIStream> bitin = std::make_shared<BitIStream>(input);
528 typename literal_coder_t::Decoder lit_decoder(
529 env().env_for_option(
"lfs2_lit_coder"),
532 typename len_coder_t::Decoder len_decoder(
533 env().env_for_option(
"lfs2_len_coder"),
536 Range int_r (0,UINT_MAX);
538 uint symbol_length = len_decoder.template decode<uint>(int_r);
539 Range slength_r (0, symbol_length);
540 std::vector<uint> dict_lengths;
541 dict_lengths.reserve(symbol_length);
542 dict_lengths.push_back(symbol_length);
543 while(symbol_length>0){
545 uint current_delta = len_decoder.template decode<uint>(slength_r);
546 symbol_length-=current_delta;
547 dict_lengths.push_back(symbol_length);
549 dict_lengths.pop_back();
551 DLOG(INFO)<<
"decoded number of nts: "<< dict_lengths.size();
555 std::vector<std::string> dictionary;
556 uint dictionary_size = dict_lengths.size();
558 Range dictionary_r (0, dictionary_size);
561 dictionary.resize(dict_lengths.size());
563 std::stringstream ss;
567 DLOG(INFO) <<
"reading dictionary";
568 for(
long i = dict_lengths.size() -1; i>=0 ;i--){
572 long size_cur = (long) dict_lengths[i];
574 bool bit1 = lit_decoder.template decode<bool>(
bit_r);
579 symbol_number = lit_decoder.template decode<uint>(dictionary_r);
583 if(symbol_number < dictionary.size()){
585 ss << dictionary.at(symbol_number);
586 size_cur-= dict_lengths[symbol_number];
593 c1 = lit_decoder.template decode<char>(
literal_r);
602 dictionary[i]=ss.str();
609 while(!lit_decoder.eof()){
611 bool bit1 = lit_decoder.template decode<bool>(
bit_r);
616 c1 = lit_decoder.template decode<char>(
literal_r);
621 symbol_number = lit_decoder.template decode<uint>(dictionary_r);
624 if(symbol_number < dictionary.size()){
626 ostream << dictionary.at(symbol_number);
628 DLOG(INFO)<<
"too large symbol: " << symbol_number;
Represents a generic range of positive integers.
Contains the text compression and encoding framework.
constexpr auto bit_r
Global predefined range for bits (0 or 1).
A vector over arbitrary unsigned integer types.
const OptionValue & option(const std::string &option) const
Get an option of this algorithm.
Base for data compressors.
void push_back(const value_type &val)
IntVector< uint_t< 1 > > BitVector
Represents a bit vector, alias for IntVector with a fixed bit width of 1.
virtual void compress(Input &input, Output &output) override
Compress the given input to the given output.
uint64_t as_integer() const
A literal iterator that yields every character from a View.
LFS2BSTCompressor(Env &&env)
Env & env()
Provides access to the environment that the algorithm works in.
OutputStream as_stream() const
Creates a stream that allows for character-wise output.
An empty literal iterator that yields no literals whatsoever.
constexpr auto literal_r
Global predefined reange for literals.
An abstraction layer for algorithm output.
virtual void decompress(Input &input, Output &output) override
Decompress the given input to the given output.
Defines data encoding to and decoding from a stream of Elias-Gamma codes.
static void log(const char *key, const T &value)
Logs a user statistic for the current phase.
static auto wrap(const char *title, F func) -> typename std::result_of< F(StatPhase &)>::type
Executes a lambda as a single statistics phase.
Local environment for a compression/encoding/decompression call.