tudocomp
– The TU Dortmund Compression Framework
divsufsort.hpp
Go to the documentation of this file.
1 /*
2  * This file integrates customized parts of divsufsort into tudocomp.
3  * divsufsort is licensed under the MIT License, which follows.
4  *
5  * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person
8  * obtaining a copy of this software and associated documentation
9  * files (the "Software"), to deal in the Software without
10  * restriction, including without limitation the rights to use,
11  * copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following
14  * conditions:
15  *
16  * The above copyright notice and this permission notice shall be
17  * included in all copies or substantial portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
21  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
23  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
24  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26  * OTHER DEALINGS IN THE SOFTWARE.
27  */
28 
29 #pragma once
30 
36 
38 
40 namespace tdc {
41 namespace libdivsufsort {
42 
43 // from divsufsort.c
44 /* Sorts suffixes of type B*. */
// Classifies every suffix of T in a single right-to-left scan, counts the
// suffixes per bucket, and sorts the type B* suffixes inside SA.
//
//   T        - input text; indices 0 .. n-1 are read.
//   SA       - work buffer accessed through operator[]; indices up to n-1
//              may be written. It must be able to hold negative values,
//              because entries are flagged by storing their bitwise
//              complement (~).
//   bucket_A - array of BUCKET_A_SIZE counters, zeroed here.
//   bucket_B - array of BUCKET_B_SIZE counters, zeroed here.
//   n        - length of T. T[n - 1] is read unconditionally, so n >= 1
//              is required.
//
// Returns m, the number of type B* suffixes that were found.
//
// NOTE(review): BUCKET_A, BUCKET_B, BUCKET_BSTAR, ALPHABET_SIZE, sssort and
// trsort are defined outside this listing; confirm their contracts there.
45 template<typename buffer_t>
46 inline saidx_t sort_typeBstar(
47  const sauchar_t *T, buffer_t& SA,
48  saidx_t *bucket_A, saidx_t *bucket_B,
49  saidx_t n) {
50 
51  saidx_t PAb, ISAb, buf;
52  saidx_t i, j, k, t, m, bufsize;
53  saint_t c0, c1;
54 
55  /* Initialize bucket arrays. */
56  for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
57  for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
58 
59  /* Count the number of occurrences of the first one or two characters of each
60  type A, B and B* suffix. Moreover, store the beginning position of all
61  type B* suffixes into the array SA. */
// The scan runs from the last text position to the first. m starts at n and
// is decremented once per type B* suffix.
62  for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
63  /* type A suffix. */
// Stay in the type A run while the character at i is >= its right neighbour.
64  do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
65  if(0 <= i) {
66  /* type B* suffix. */
67  ++BUCKET_BSTAR(c0, c1);
// B* start positions are collected from the top of SA downwards.
68  SA[--m] = i;
69  /* type B suffix. */
// Stay in the type B run while the character at i is <= its right neighbour.
70  for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
71  ++BUCKET_B(c0, c1);
72  }
73  }
74  }
// m was the lowest slot used in SA; turn it into the number of B* suffixes.
75  m = n - m;
76 /*
77 note:
78  A type B* suffix is lexicographically smaller than a type B suffix that
79  begins with the same first two characters.
80 */
81 
82  /* Calculate the index of start/end point of each bucket. */
// i accumulates the type A and type B counts, j accumulates the type B*
// counts, so BUCKET_BSTAR(c0, c1) becomes the end index of that B* bucket
// within SA[0 .. m).
83  for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
84  t = i + BUCKET_A(c0);
85  BUCKET_A(c0) = i + j; /* start point */
86  i = t + BUCKET_B(c0, c0);
87  for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
88  j += BUCKET_BSTAR(c0, c1);
89  BUCKET_BSTAR(c0, c1) = j; /* end point */
90  i += BUCKET_B(c0, c1);
91  }
92  }
93 
94  if(0 < m) {
95  /* Sort the type B* suffixes by their first two characters. */
// PAb is the offset in SA where the B* start positions were stored
// (SA[n - m .. n)); ISAb is the offset of the region that receives the
// ranks computed further below.
96  PAb = n - m; ISAb = m;
// Each insertion pre-decrements the end index of its bucket. The B* suffix
// with index m - 1 is inserted after all others, so it ends up at the front
// of its bucket.
97  for(i = m - 2; 0 <= i; --i) {
98  t = SA[PAb + i], c0 = T[t], c1 = T[t + 1];
99  SA[--BUCKET_BSTAR(c0, c1)] = i;
100  }
101  t = SA[PAb + m - 1], c0 = T[t], c1 = T[t + 1];
102  SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
103 
104  /* Sort the type B* substrings using sssort. */
// buf/bufsize describe SA[m .. n - m), which at this point holds neither the
// bucketed indices (SA[0 .. m)) nor the stored positions (SA[n - m .. n)).
// NOTE(review): presumably sssort uses it as scratch space - confirm.
105  buf = m, bufsize = n - (2 * m);
// Buckets are visited in descending order: j is the end of the current
// bucket and, after each step, is set to that bucket's start i.
106  for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
107  for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
108  i = BUCKET_BSTAR(c0, c1);
// Buckets with fewer than two entries need no sorting.
109  if(1 < (j - i)) {
// The last argument tells sssort whether the first entry of the bucket is
// the B* suffix with index m - 1.
110  sssort(T, SA, PAb, i, j,
111  buf, bufsize, 2, n, SA[i] == (m - 1));
112  }
113  }
114  }
115 
116  /* Compute ranks of type B* substrings. */
// NOTE(review): the loop distinguishes non-negative from negative entries in
// SA[0 .. m); presumably sssort leaves entries complemented to mark
// substrings that compare equal - confirm against sssort.
117  for(i = m - 1; 0 <= i; --i) {
118  if(0 <= SA[i]) {
// Run of non-negative entries: each one gets its own index as rank, then the
// negated run length (i - j) is stored in the first slot of the run.
119  j = i;
120  do { SA[ISAb + SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
121  SA[i + 1] = i - j;
122  if(i <= 0) { break; }
123  }
// Run of negative entries: complement them back and give all of them the
// same rank j (the highest index of the run). The first non-negative entry
// below the run receives rank j as well.
124  j = i;
125  do { SA[i] = ~SA[i]; SA[ISAb + SA[i]] = j; } while(SA[--i] < 0);
126  SA[ISAb + SA[i]] = j;
127  }
128 
129  /* Construct the inverse suffix array of type B* suffixes using trsort. */
130  trsort(SA, ISAb, 0, m, 1);
131 
132  /* Set the sorted order of type B* suffixes. */
// Repeats the classification scan of the first loop (without counting) so
// that the B* positions t are revisited from right to left.
133  for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
134  for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
135  if(0 <= i) {
136  t = i;
137  for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
// SA[ISAb + j] is used as the destination index for this B* suffix. The
// position t is stored complemented unless t == 0 or the type B run to the
// left of t is non-empty (1 < t - i).
138  SA[SA[ISAb + (--j)]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
139  }
140  }
141 
142  /* Calculate the index of start/end point of each bucket. */
143  BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
// k walks down over the sorted B* entries in SA[0 .. m); i walks down over
// the destination slots.
144  for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
145  i = BUCKET_A(c0 + 1) - 1;
146  for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
147  t = i - BUCKET_B(c0, c1);
148  BUCKET_B(c0, c1) = i; /* end point */
149 
150  /* Move all type B* suffixes to the correct position. */
151  for(i = t, j = BUCKET_BSTAR(c0, c1);
152  j <= k;
153  --i, --k) { SA[i] = SA[k]; }
154  }
155  BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
156  BUCKET_B(c0, c0) = i; /* end point */
157  }
158  }
159 
160  return m;
161 }
162 
163 // from divsufsort.c
164 /* Constructs the suffix array by using the sorted order of type B* suffixes. */
// Runs two passes over SA: a right-to-left pass that derives the type B
// suffixes from the sorted type B* suffixes, and a left-to-right pass that
// derives the remaining entries and removes the complement flags.
//
//   T        - input text; indices 0 .. n-1 are read. T[n - 2] is read
//              unconditionally, so n >= 2 is required.
//   SA       - buffer holding the sorted type B* suffixes on entry; it is
//              rewritten in place.
//   bucket_A, bucket_B
//            - bucket indices, read and updated through the BUCKET_A,
//              BUCKET_B and BUCKET_BSTAR macros (defined outside this
//              listing).
//   n        - length of T.
//   m        - number of type B* suffixes; the first pass is skipped when
//              m is 0.
165 template<typename buffer_t>
166 inline void construct_SA(
167  const sauchar_t *T, buffer_t& SA,
168  saidx_t *bucket_A, saidx_t *bucket_B,
169  saidx_t n, saidx_t m) {
170 
171  saidx_t i, j, k;
172  saidx_t s;
173  saint_t c0, c1, c2;
174 
175  if(0 < m) {
176  /* Construct the sorted order of type B suffixes by using
177  the sorted order of type B* suffixes. */
178  for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
179  /* Scan the suffix array from right to left. */
// i is the lower bound of the scan, j the scan position. k and c2 cache the
// insertion index of the bucket (c2, c1) that is currently being filled.
180  for(i = BUCKET_BSTAR(c1, c1 + 1),
181  j = BUCKET_A(c1 + 1) - 1, k = -1, c2 = -1;
182  i <= j;
183  --j) {
// Only positive entries produce a new entry; all others are merely
// complemented in the else-branch.
184  if(0 < (s = SA[j])) {
185  assert(T[s] == c1);
186  assert(((s + 1) < n) && (T[s] <= T[s + 1]));
187  assert(T[s - 1] <= T[s]);
// Complement the processed entry (it becomes negative).
188  SA[j] = ~s;
// Step to the preceding suffix s - 1; c0 is its first character.
189  c0 = T[--s];
// Flag the new entry with ~ when the character before it is larger than c0.
190  if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
// When the leading character changes, write back the insertion index of the
// previous bucket and load the one for (c0, c1).
191  if(c0 != c2) {
192  if(0 <= c2) { BUCKET_B(c2, c1) = k; }
193 
194  k = BUCKET_B(c2 = c0, c1);
195  }
196  assert(k < j);
// Insert and move the insertion index one slot down.
197  SA[k--] = s;
198  } else {
199  assert(((s == 0) && (T[s] == c1)) || (s < 0));
200  SA[j] = ~s;
201  }
202  }
203  }
204  }
205 
206  /* Construct the suffix array by using
207  the sorted order of type B suffixes. */
// Seed the scan with suffix n - 1 at the start of the bucket for its
// character c2; it is stored complemented when T[n - 2] < c2.
208  k = BUCKET_A(c2 = T[n - 1]);
209  SA[k++] = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
210  /* Scan the suffix array from left to right. */
211  for(i = 0, j = n; i < j; ++i) {
212  if(0 < (s = SA[i])) {
213  assert(T[s - 1] >= T[s]);
// Step to the preceding suffix s - 1; c0 is its first character.
214  c0 = T[--s];
// Flag the new entry with ~ when it is suffix 0 or the character before it
// is smaller than c0.
215  if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
// When the leading character changes, write back the insertion index of the
// previous bucket and load the one for c0.
216  if(c0 != c2) {
217  BUCKET_A(c2) = k;
218  k = BUCKET_A(c2 = c0);
219  }
220  assert(i < k);
// Insert and move the insertion index one slot up.
221  SA[k++] = s;
222  } else {
223  assert(s < 0);
// Negative entries only need their complement flag removed.
224  SA[i] = ~s;
225  }
226  }
227 }
228 
229 // the actual divsufsort execution
230 template<typename buffer_t>
231 inline void divsufsort_run(
232  const sauchar_t* T, buffer_t& SA,
233  saidx_t *bucket_A, saidx_t *bucket_B, saidx_t n) {
234 
235  // sign check
236  SA[0] = -1; DCHECK(SA[0] < 0) << "only signed integer buffers are supported";
237 
238  saidx_t m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);
239  construct_SA(T, SA, bucket_A, bucket_B, n, m);
240 }
241 
242 // specialize for DynamicIntVector
243 template<>
244 inline void divsufsort_run<DynamicIntVector>(
245  const sauchar_t* T, DynamicIntVector& SA,
246  saidx_t *bucket_A, saidx_t *bucket_B, saidx_t n) {
247 
248  BufferWrapper<DynamicIntVector> wrapSA(SA);
249  divsufsort_run(T, wrapSA, bucket_A, bucket_B, n);
250 }
251 
252 // from divsufsort.c
253 template<typename buffer_t>
254 inline saint_t divsufsort(const sauchar_t* T, buffer_t& SA, saidx_t n) {
255  saidx_t *bucket_A, *bucket_B;
256  saidx_t m;
257  saint_t err = 0;
258 
259  /* Check arguments. */
260  if((T == NULL) || (n < 0)) { return -1; }
261  else if(n == 0) { return 0; }
262  else if(n == 1) { SA[0] = 0; return 0; }
263  else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; }
264 
265  bucket_A = new saidx_t[BUCKET_A_SIZE];
266  bucket_B = new saidx_t[BUCKET_B_SIZE];
267 
268  /* Suffixsort. */
269  if((bucket_A != NULL) && (bucket_B != NULL)) {
270  divsufsort_run(T, SA, bucket_A, bucket_B, n);
271  } else {
272  err = -2;
273  }
274 
275  delete[] bucket_B;
276  delete[] bucket_A;
277 
278  return err;
279 }
280 
281 } //ns divsufsort
282 
283 using libdivsufsort::saidx_t;
284 using libdivsufsort::divsufsort;
285 
286 } //ns tdc
288 
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
constexpr dsflags_t SA
Definition: TextDSFlags.hpp:11
IntVector< dynamic_t > DynamicIntVector
Represents an integer vector with unspecified (dynamic) bit width.
Definition: IntVector.hpp:553