1 module dkh..string;
2 
3 /// Suffix Array Informations
4 struct SA(T) {
5     T[] s; /// base string
6     int[] sa; /// suffix array
7     int[] rsa; /// reverse sa, rsa[sa[i]] == i
8     int[] lcp; /// lcp
9     this(in T[] s) {
10         size_t n = s.length;
11         this.s = s.dup;
12         sa = new int[](n+1);
13         rsa = new int[](n+1);
14         lcp = new int[](n);
15     }
16 }
17 
18 int[] sais(T)(in T[] _s, int B = 200) {
19     import std.conv, std.algorithm, std.range;
20     int n = _s.length.to!int;
21     int[] sa = new int[](n+1);
22     if (n == 0) return sa;
23 
24     auto s = _s.map!"a+1".array ~ T(0); B++; // add 0 to last
25     // ls
26     bool[] ls = new bool[](n+1);
27     ls[n] = true;
28     foreach_reverse (i; 0..n) {
29         ls[i] = (s[i] == s[i+1]) ? ls[i+1] : (s[i] < s[i+1]);
30     }
31     // sum(l[0], s[0], l[1], s[1], ...)
32     int[] sumL = new int[](B+1), sumS = new int[](B+1);        
33     s.each!((i, c) => !ls[i] ? sumS[c]++ : sumL[c+1]++);
34     foreach (i; 0..B) {
35         sumL[i+1] += sumS[i];
36         sumS[i+1] += sumL[i+1];
37     }
38 
39     void induce(in int[] lms) {
40         sa[] = -1;
41         auto buf0 = sumS.dup;
42         foreach (d; lms) {
43             sa[buf0[s[d]]++] = d;
44         }
45         auto buf1 = sumL.dup;
46         foreach (v; sa) {
47             if (v >= 1 && !ls[v-1]) {
48                 sa[buf1[s[v-1]]++] = v-1;
49             }
50         }
51         auto buf2 = sumL.dup;
52         foreach_reverse (v; sa) {
53             if (v >= 1 && ls[v-1]) {
54                 sa[--buf2[s[v-1]+1]] = v-1;
55             }
56         }
57     }
58     
59     int[] lms = iota(1, n+1).filter!(i => !ls[i-1] && ls[i]).array;
60     int[] lmsMap = new int[](n+1);
61     lmsMap[] = -1; lms.each!((i, v) => lmsMap[v] = i.to!int);
62 
63     induce(lms);
64     
65     if (lms.length >= 2) {
66         int m = lms.length.to!int - 1;
67         // sort lms
68         auto lms2 = sa.filter!(v => lmsMap[v] != -1).array;
69         int recN = 1;
70         int[] recS = new int[](m);
71         recS[lmsMap[lms2[1]]] = 1;
72         foreach (i; 2..m+1) {
73             int l = lms2[i-1], r = lms2[i];
74             int nl = lms[lmsMap[l]+1], nr = lms[lmsMap[r]+1];
75             if (cmp(s[l..nl+1], s[r..nr+1])) recN++;
76             recS[lmsMap[lms2[i]]] = recN;
77         }
78         //re induce
79         induce(lms.indexed(sais!int(recS, recN)).array);
80     }
81 
82     return sa;
83 }
84 
85 
86 /// return SA!T. each character must be inside [T(0), T(B)).
87 SA!T suffixArray(T)(in T[] _s, int B = 200) {
88     import std.conv, std.algorithm;
89     int n = _s.length.to!int;
90     auto saInfo = SA!T(_s);
91     if (n == 0) return saInfo;
92     
93     with (saInfo) {
94         sa = sais(_s, B);
95         //rsa
96         sa.each!((i, v) => rsa[v] = i.to!int);
97         //lcp
98         int h = 0;
99         foreach (i; 0..n) {
100             int j = sa[rsa[i]-1];
101             if (h > 0) h--;
102             for (; j+h < n && i+h < n; h++) {
103                 if (s[j+h] != s[i+h]) break;
104             }
105             lcp[rsa[i]-1] = h;
106         }
107     }
108     return saInfo;
109 }
110 
111 ///
112 unittest {
113     import std.algorithm : equal, map;
114     string s = "abracadabra";
115     auto saInfo = s.suffixArray;
116     assert(equal(saInfo.sa.map!(i => s[i..$]), [
117         "",
118         "a",
119         "abra",
120         "abracadabra",
121         "acadabra",
122         "adabra",
123         "bra",
124         "bracadabra",
125         "cadabra",
126         "dabra",
127         "ra",
128         "racadabra",
129     ]));
130 }
131 
132 unittest {
133     import dkh.ascii;
134     import std.algorithm, std.conv, std.stdio;
135     import std.random;
136     import std.typecons;
137 
138     SA!T naive(T)(in T[] s) {
139         int n = s.length.to!int;
140         auto res = SA!T(s);
141         res.s = s.dup;
142         alias P = Tuple!(string, int);
143         P[] buf = new P[](n+1);
144         foreach (i; 0..n+1) {
145             buf[i] = P(s[i..$].idup, i);
146         }
147         sort(buf);
148         foreach (i, v; buf) {
149             res.sa[i] = v[1];
150         }
151         foreach (i; 0..n+1) {
152             res.rsa[res.sa[i]] = i;
153         }
154         foreach (i; 0..n) {
155             res.lcp[i] = commonPrefix(buf[i][0], buf[i+1][0]).length.to!int;
156         }
157         return res;
158     }
159 
160     void f() {
161         int n = uniform(1, 100);
162         ASCIIString s;
163         foreach (j; 0..n) {
164             s ~= uniform('a', 'z');
165         }
166         auto l = naive(s);
167         auto r = suffixArray(s);
168         assert(equal(l.s, r.s));
169         assert(equal(l.sa, r.sa));
170         assert(equal(l.rsa, r.rsa));
171         assert(equal(l.lcp, r.lcp));
172     }
173     import dkh.stopwatch;
174     auto ti = benchmark!f(1000);
175     writeln("SAIS Random1000: ", ti[0].toMsecs);
176 }