1 module dkh..string;
2
3 /// Suffix Array Informations
4 struct SA(T) {
5 T[] s; /// base string
6 int[] sa; /// suffix array
7 int[] rsa; /// reverse sa, rsa[sa[i]] == i
8 int[] lcp; /// lcp
9 this(in T[] s) {
10 size_t n = s.length;
11 this.s = s.dup;
12 sa = new int[](n+1);
13 rsa = new int[](n+1);
14 lcp = new int[](n);
15 }
16 }
17
18 int[] sais(T)(in T[] _s, int B = 200) {
19 import std.conv, std.algorithm, std.range;
20 int n = _s.length.to!int;
21 int[] sa = new int[](n+1);
22 if (n == 0) return sa;
23
24 auto s = _s.map!"a+1".array ~ T(0); B++; // add 0 to last
25 // ls
26 bool[] ls = new bool[](n+1);
27 ls[n] = true;
28 foreach_reverse (i; 0..n) {
29 ls[i] = (s[i] == s[i+1]) ? ls[i+1] : (s[i] < s[i+1]);
30 }
31 // sum(l[0], s[0], l[1], s[1], ...)
32 int[] sumL = new int[](B+1), sumS = new int[](B+1);
33 s.each!((i, c) => !ls[i] ? sumS[c]++ : sumL[c+1]++);
34 foreach (i; 0..B) {
35 sumL[i+1] += sumS[i];
36 sumS[i+1] += sumL[i+1];
37 }
38
39 void induce(in int[] lms) {
40 sa[] = -1;
41 auto buf0 = sumS.dup;
42 foreach (d; lms) {
43 sa[buf0[s[d]]++] = d;
44 }
45 auto buf1 = sumL.dup;
46 foreach (v; sa) {
47 if (v >= 1 && !ls[v-1]) {
48 sa[buf1[s[v-1]]++] = v-1;
49 }
50 }
51 auto buf2 = sumL.dup;
52 foreach_reverse (v; sa) {
53 if (v >= 1 && ls[v-1]) {
54 sa[--buf2[s[v-1]+1]] = v-1;
55 }
56 }
57 }
58
59 int[] lms = iota(1, n+1).filter!(i => !ls[i-1] && ls[i]).array;
60 int[] lmsMap = new int[](n+1);
61 lmsMap[] = -1; lms.each!((i, v) => lmsMap[v] = i.to!int);
62
63 induce(lms);
64
65 if (lms.length >= 2) {
66 int m = lms.length.to!int - 1;
67 // sort lms
68 auto lms2 = sa.filter!(v => lmsMap[v] != -1).array;
69 int recN = 1;
70 int[] recS = new int[](m);
71 recS[lmsMap[lms2[1]]] = 1;
72 foreach (i; 2..m+1) {
73 int l = lms2[i-1], r = lms2[i];
74 int nl = lms[lmsMap[l]+1], nr = lms[lmsMap[r]+1];
75 if (cmp(s[l..nl+1], s[r..nr+1])) recN++;
76 recS[lmsMap[lms2[i]]] = recN;
77 }
78 //re induce
79 induce(lms.indexed(sais!int(recS, recN)).array);
80 }
81
82 return sa;
83 }
84
85
86 /// return SA!T. each character must be inside [T(0), T(B)).
87 SA!T suffixArray(T)(in T[] _s, int B = 200) {
88 import std.conv, std.algorithm;
89 int n = _s.length.to!int;
90 auto saInfo = SA!T(_s);
91 if (n == 0) return saInfo;
92
93 with (saInfo) {
94 sa = sais(_s, B);
95 //rsa
96 sa.each!((i, v) => rsa[v] = i.to!int);
97 //lcp
98 int h = 0;
99 foreach (i; 0..n) {
100 int j = sa[rsa[i]-1];
101 if (h > 0) h--;
102 for (; j+h < n && i+h < n; h++) {
103 if (s[j+h] != s[i+h]) break;
104 }
105 lcp[rsa[i]-1] = h;
106 }
107 }
108 return saInfo;
109 }
110
111 ///
112 unittest {
113 import std.algorithm : equal, map;
114 string s = "abracadabra";
115 auto saInfo = s.suffixArray;
116 assert(equal(saInfo.sa.map!(i => s[i..$]), [
117 "",
118 "a",
119 "abra",
120 "abracadabra",
121 "acadabra",
122 "adabra",
123 "bra",
124 "bracadabra",
125 "cadabra",
126 "dabra",
127 "ra",
128 "racadabra",
129 ]));
130 }
131
132 unittest {
133 import dkh.ascii;
134 import std.algorithm, std.conv, std.stdio;
135 import std.random;
136 import std.typecons;
137
138 SA!T naive(T)(in T[] s) {
139 int n = s.length.to!int;
140 auto res = SA!T(s);
141 res.s = s.dup;
142 alias P = Tuple!(string, int);
143 P[] buf = new P[](n+1);
144 foreach (i; 0..n+1) {
145 buf[i] = P(s[i..$].idup, i);
146 }
147 sort(buf);
148 foreach (i, v; buf) {
149 res.sa[i] = v[1];
150 }
151 foreach (i; 0..n+1) {
152 res.rsa[res.sa[i]] = i;
153 }
154 foreach (i; 0..n) {
155 res.lcp[i] = commonPrefix(buf[i][0], buf[i+1][0]).length.to!int;
156 }
157 return res;
158 }
159
160 void f() {
161 int n = uniform(1, 100);
162 ASCIIString s;
163 foreach (j; 0..n) {
164 s ~= uniform('a', 'z');
165 }
166 auto l = naive(s);
167 auto r = suffixArray(s);
168 assert(equal(l.s, r.s));
169 assert(equal(l.sa, r.sa));
170 assert(equal(l.rsa, r.rsa));
171 assert(equal(l.lcp, r.lcp));
172 }
173 import dkh.stopwatch;
174 auto ti = benchmark!f(1000);
175 writeln("SAIS Random1000: ", ti[0].toMsecs);
176 }