tudocomp
– The TU Dortmund Compression Framework
RestrictedBuffer.hpp
#pragma once

#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include <tudocomp/io/IOUtil.hpp>

namespace tdc {namespace io {

class RestrictedBuffer {
public:
    static const size_t npos = -1;
private:
    MMap m_map;
    View m_restricted_data;

    io::InputRestrictions m_restrictions;
    InputSource m_source;

    // mmap needs to be page aligned, so for file mappings
    // we need to store an offset
    //
    // Effectively this means that, for a given mmap instance in this class,
    // instead of
    //     |-----------input--------------|
    //     [       [from_______to]        ]
    // it stores
    //     |-----------input--------------|
    //     [ [offset|from_______to]       ]
    size_t m_mmap_page_offset = 0;

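    // Escapes the byte range [read_begin, read_end) according to the configured
    // restrictions, writing the (possibly longer) result so that it ends exactly
    // at write_end. The data is processed back to front, which allows the source
    // and destination ranges to overlap (escaping in place). Without escape
    // restrictions the bytes are only copied if do_copy is set.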
    template<typename I, typename J>
    inline void escape_with_iters(I read_begin, I read_end, J write_end, bool do_copy = false) {
        if (!m_restrictions.has_no_escape_restrictions()) {
            FastEscapeMap fast_escape_map;
            uint8_t escape_byte;

            {
                EscapeMap em(m_restrictions);
                fast_escape_map = FastEscapeMap(em);
                escape_byte = fast_escape_map.escape_byte();
            }

            while(read_begin != read_end) {
                --read_end;
                --write_end;

                uint8_t current_byte = *read_end;

                *write_end = fast_escape_map.lookup_byte(current_byte);

                if (fast_escape_map.lookup_flag_bool(current_byte)) {
                    --write_end;
                    *write_end = escape_byte;
                }
            }
        } else if (do_copy) {
            while(read_begin != read_end) {
                --read_end;
                --write_end;
                *write_end = *read_end;
            }
        }
    }

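    // Returns how many additional bytes the restricted representation of the
    // range [begin, end) needs: one extra byte for every input byte that has
    // to be escaped, plus one byte if null termination is requested.
    //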
    // NB: The len argument would be redundant, but exists because
    // the istream iterator does not allow efficient slicing of the input file.
    // TODO: Define custom sliceable ifstream iterator
    // TODO: ^ Needed in Input as well
    template<typename T>
    inline size_t extra_size_needed_due_restrictions(T begin, T end, size_t len) {
        size_t extra = 0;

        if (!m_restrictions.has_no_escape_restrictions()) {
            size_t i = 0;

            FastEscapeMap fast_escape_map{EscapeMap(m_restrictions)};

            while((begin != end) && (i < len)) {
                uint8_t current_byte = *begin;

                extra += fast_escape_map.lookup_flag(current_byte);

                ++begin;
                ++i;
            }

            DCHECK_EQ(i, len);
        }

        if (m_restrictions.null_terminate()) {
            extra++;
        }

        return extra;
    }

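    // Reads the slice [m_from, m_to) of the input source into m_map /
    // m_restricted_data and applies the configured restrictions:
    //  - view sources are used directly unless escaping or null termination
    //    forces a copy into a fresh mapping,
    //  - file sources are mmap'd (page aligned, see m_mmap_page_offset) and
    //    escaped in place if necessary,
    //  - stream sources are read into a growing mapping and escaped afterwards.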
    inline void init(size_t m_from, size_t m_to) {
        if (m_source.is_view()) {
            View s;
            if (m_to == npos) {
                s = m_source.view().slice(m_from);
            } else {
                s = m_source.view().slice(m_from, m_to);
            }

            size_t extra_size = extra_size_needed_due_restrictions(
                s.cbegin(), s.cend(), s.size());

            if (extra_size != 0) {
                size_t size = s.size() + extra_size;
                m_map = MMap(size);

                {
                    GenericView<uint8_t> target = m_map.view();
                    size_t noff = m_restrictions.null_terminate() ? 1 : 0;
                    escape_with_iters(s.cbegin(), s.cend(), target.end() - noff, true);
                    // For null termination, the trailing byte is implicitly 0
                }

                m_restricted_data = m_map.view();
            } else {
                m_restricted_data = s;
            }
        } else if (m_source.is_file()) {
            // iterate over the file to check for escapable bytes and nulls

            size_t unrestricted_size;
            if (m_to == npos) {
                unrestricted_size = read_file_size(m_source.file()) - m_from;
            } else {
                unrestricted_size = m_to - m_from;
            }

            auto path = m_source.file();
            auto c_path = path.c_str();

            size_t extra_size = 0;
            {
                auto ifs = create_tdc_ifstream(c_path, m_from);

                std::istreambuf_iterator<char> begin(ifs);
                std::istreambuf_iterator<char> end;

                extra_size = extra_size_needed_due_restrictions(
                    begin, end, unrestricted_size);
            }

            size_t aligned_offset = MMap::next_valid_offset(m_from);
            m_mmap_page_offset = m_from - aligned_offset;

            DCHECK_EQ(aligned_offset + m_mmap_page_offset, m_from);

            size_t map_size = unrestricted_size + extra_size + m_mmap_page_offset;

            if (m_restrictions.has_no_restrictions()) {
                m_map = MMap(path, MMap::Mode::Read, map_size, aligned_offset);

                const auto& m = m_map;
                m_restricted_data = m.view().slice(m_mmap_page_offset);
            } else {
                m_map = MMap(path, MMap::Mode::ReadWrite, map_size, aligned_offset);

                size_t noff = m_restrictions.null_terminate() ? 1 : 0;

                uint8_t* begin_file_data = m_map.view().begin() + m_mmap_page_offset;
                uint8_t* end_file_data = begin_file_data + unrestricted_size;
                uint8_t* end_data = end_file_data + extra_size - noff;
                escape_with_iters(begin_file_data, end_file_data, end_data);
                if (m_restrictions.null_terminate()) {
                    // ensure the last valid byte is actually 0 when null-terminating
                    *end_data = 0;
                }
                m_restricted_data = m_map.view().slice(m_mmap_page_offset);
            }
        } else if (m_source.is_stream()) {
            DCHECK_EQ(m_from, 0);
            DCHECK_EQ(m_to, npos);

            // Start with a typical page size to avoid reallocating
            // as often for small inputs
            size_t capacity = pagesize();
            size_t size = 0;
            size_t extra_size = 0;
            FastEscapeMap fast_escape_map;
            if (!m_restrictions.has_no_escape_restrictions()) {
                fast_escape_map = FastEscapeMap {
                    EscapeMap(m_restrictions)
                };
            }

            size_t noff = m_restrictions.null_terminate() ? 1 : 0;
            extra_size += noff;

            // Initial allocation
            m_map = MMap(capacity);

            // Fill and grow
            {
                std::istream& is = *(m_source.stream());
                bool done = false;

                while(!done) {
                    // fill until capacity
                    uint8_t* ptr = m_map.view().begin() + size;
                    while(size < capacity) {
                        char c;
                        if(!is.get(c)) {
                            done = true;
                            break;
                        } else {
                            *ptr = uint8_t(c);
                            ++ptr;
                            ++size;
                            extra_size += fast_escape_map.lookup_flag(uint8_t(c));
                        }
                    }
                    if (done) break;

                    // realloc to a greater size
                    capacity *= 2;
                    m_map.remap(capacity);
                }

                // Throw away the overallocation.
                // For null termination,
                // a trailing unwritten byte is automatically 0
                m_map.remap(size + extra_size);

                m_restricted_data = m_map.view();
            }

            // Escape
            {
                uint8_t* begin_stream_data = m_map.view().begin();
                uint8_t* end_stream_data = begin_stream_data + size;
                uint8_t* end_data = end_stream_data + extra_size - noff;
                escape_with_iters(begin_stream_data, end_stream_data, end_data);
            }
        } else {
            DCHECK(false) << "This should not happen";
        }
    }

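    // Removes all restrictions from a stream-backed buffer: unescapes the data
    // in place, drops the trailing null terminator if one was added, and
    // shrinks the mapping to the reduced size.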
    inline static RestrictedBuffer unrestrict(RestrictedBuffer&& other) {
        DCHECK(other.source().is_stream());
        if (other.restrictions().has_no_restrictions()) {
            return std::move(other);
        }
        DCHECK(other.restrictions().has_restrictions());
        DCHECK(other.m_mmap_page_offset == 0);

        auto x = std::move(other);

        auto r = x.m_restrictions;

        auto start = x.m_map.view().begin();
        auto end = x.m_map.view().end();

        FastUnescapeMap fast_unescape_map { EscapeMap(r) };

        auto read_p = start;
        auto write_p = start;

        size_t noff = x.m_restrictions.null_terminate() ? 1 : 0;

        auto data_end = end - noff;

        while (read_p != data_end) {
            if (*read_p == fast_unescape_map.escape_byte()) {
                ++read_p;
                *write_p = fast_unescape_map.lookup_byte(*read_p);
            } else {
                *write_p = *read_p;
            }
            ++read_p;
            ++write_p;
        }

        auto old_size = x.m_map.view().size();
        auto reduced_size = (read_p - write_p) + noff;

        x.m_map.remap(old_size - reduced_size);
        x.m_restrictions = InputRestrictions();
        x.m_restricted_data = x.m_map.view();

        return x;
    }

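    // Applies a new set of restrictions to a currently unrestricted,
    // stream-backed buffer: grows the mapping by the required extra size and
    // escapes the data in place, null-terminating it if requested.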
    inline static RestrictedBuffer restrict(RestrictedBuffer&& other,
                                            const io::InputRestrictions& restrictions) {
        DCHECK(other.source().is_stream());
        if (other.restrictions() == restrictions) {
            return std::move(other);
        }
        DCHECK(other.restrictions().has_no_restrictions());
        DCHECK(other.m_mmap_page_offset == 0);

        size_t old_size;
        size_t extra_size;

        // Calculate needed extra size:
        {
            View s = other.view();
            other.m_restrictions = restrictions;
            extra_size = other.extra_size_needed_due_restrictions(
                s.cbegin(), s.cend(), s.size()
            );
            old_size = s.size();
        }

        // If nothing about the actual data changed,
        // return it as is
        if (extra_size == 0) {
            return std::move(other);
        }

        // Else remap and expand the data by escaping:

        other.m_map.remap(old_size + extra_size);
        other.m_restricted_data = other.m_map.view();

        {
            size_t noff = other.m_restrictions.null_terminate() ? 1 : 0;

            uint8_t* start = other.m_map.view().begin();
            uint8_t* old_end = start + old_size;
            uint8_t* new_end = old_end + extra_size - noff;

            other.escape_with_iters(start, old_end, new_end);
            if (other.m_restrictions.null_terminate()) {
                *new_end = 0;
            }

            DCHECK_EQ(new_end + noff, other.m_map.view().end());
        }

        return std::move(other);
    }

public:
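    // Re-encodes the buffer contents for a different set of restrictions by
    // first undoing the current escaping and then applying the new one.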
    inline RestrictedBuffer change_restrictions(
        const io::InputRestrictions& restrictions) &&
    {
        auto& other = *this;
        auto buf = unrestrict(std::move(other));
        auto r = restrict(std::move(buf), restrictions);
        return r;
    }

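    // Constructs a buffer over the slice [from, to) of the given input source,
    // escaped and/or null-terminated as demanded by restrictions.
    // Passing npos as to reads until the end of the input.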
    inline RestrictedBuffer(const InputSource& src,
                            size_t from,
                            size_t to,
                            io::InputRestrictions restrictions):
        m_restrictions(restrictions),
        m_source(src)
    {
        init(from, to);
    }

    inline const InputRestrictions& restrictions() const { return m_restrictions; }
    inline const InputSource& source() const { return m_source; }

    inline View view() const { return m_restricted_data; }

    inline RestrictedBuffer() = delete;
};

}}
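
A minimal usage sketch, not part of the header above: RestrictedBuffer's constructor, view() and change_restrictions() are taken from the listing, but the construction of InputSource and InputRestrictions, as well as the include path, are assumptions that should be checked against the actual tudocomp headers.

#include <cstdint>
#include <sstream>
#include <utility>
#include <vector>

#include <tudocomp/io/RestrictedBuffer.hpp> // assumed include path for this header

using namespace tdc::io;

void restricted_buffer_example() {
    std::istringstream input("hello world");

    // Assumption: InputSource can wrap a std::istream, so the stream branch of
    // init() is used and the whole stream is read into an internal mapping.
    InputSource src(&input);

    // Assumption: InputRestrictions takes the bytes to escape and a
    // null-termination flag; here: escape 0-bytes and null-terminate.
    InputRestrictions restrictions(std::vector<uint8_t>{0}, true);

    // Read the whole stream (from = 0, to = npos) under these restrictions.
    RestrictedBuffer buf(src, 0, RestrictedBuffer::npos, restrictions);
    View escaped = buf.view(); // escaped, null-terminated representation

    // Re-encode without any restrictions; change_restrictions is
    // rvalue-qualified, so the buffer has to be moved. This only works for
    // stream-backed buffers (see the DCHECKs in unrestrict/restrict).
    RestrictedBuffer plain = std::move(buf).change_restrictions(InputRestrictions());
    View raw = plain.view(); // original bytes again
}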