-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtsort.c
258 lines (233 loc) · 9.23 KB
/
tsort.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
/* tsort.c - topological sort dependency resolver
* vi: tabstop=2 softtabstop=2 shiftwidth=2
*
* Copyright 2024 Ray Gardner <[email protected]>
* License: 0BSD
*
* See https://pubs.opengroup.org/onlinepubs/9699919799/utilities/tsort.html
USE_TSORT(NEWTOY(tsort, ">1", TOYFLAG_USR|TOYFLAG_BIN))
config TSORT
bool "tsort"
default y
help
usage: tsort [FILE]
Topological sort dependency resolver.
Read pairs of input strings indicating before/after dependency relationships
and find an ordering that respects all dependencies. On success output each
string once to stdout, on failure print error and output cycle pairs.
*/
#include "toys.h"
// A topological sort reads a set of relations, i.e. pairs of
// (predecessor, successor) symbols, and writes a list of unique symbols
// such that each predecessor appears before any of its successors.
// This is Knuth's Algorithm T, Topological sort,
// from The Art of Computer Programming Vol 1, sec. 2.2.3.
//
// The algorithm builds a table of symbols such that each entry has a count of
// its predecessors and a linked list of its successors. When all symbols have
// been read, the symbols with a predecessor count of zero are put into a
// queue. The symbol at the front of the queue is removed and output, and then
// the predecessor count of each of its successors is reduced by one. If any
// of those counts are reduced to zero, that successor's symbol is placed at
// the rear of the queue, and the process is repeated for the next symbol at
// the front of the queue. When the queue is empty, the algorithm checks if
// all the symbols have been output. If not, then there is a loop (cycle) in
// the set of relations. The algorithm walks 'backwards' through predecessors
// to print a cycle.
// The original algo expects relations as pairs drawn from consecutive
// integers starting at 1. This version assumes arbitrary symbols, so uses a
// hash table to relate the symbols to integers.
// Symbol table, indexed by integers in the hash table below.
// symoffs: offset of the symbol string in the input data
// pq: (Knuth's COUNT aka QLINK) count of predecessors of a symbol
// and used later as a link in the queue of symbols with no
// remaining predecessors
// succx: (Knuth's TOP) index into linked list of successors in the successors
// table, or 0 if no successors
struct symbols {int symoffs, pq, succx;};
// Successors table, indexed by symbols.succx field.
// symbx: (Knuths's SUC) index into symbols table
// next: (Knuth's NEXT) index to next successor, or 0
struct successors {int symbx, next;};
// Hash table to associate symbols with integer indexes to tables 'symbols'
// and 'successors'. The probe sequence is borrowed from Python dict, using
// the 'perturb' idea to mix in upper bits.
// See https://github.com/python/cpython/blob/main/Objects/dictobject.c
//
// dat is input data area; dat + offs points to symbol to look up.
// hashtbl is the actual hash table. Entries are 0 (unused) or the index into
// symbols table (stbl).
// hashmask is 2**n-1 where n is table size
// symcnt is count of symbols inserted
static int hash_lookup(char *dat, int offs,
int *hashtbl, int hashmask, int *symcnt, struct symbols *stbl)
{
int n, probe;
char *s = dat + offs;
unsigned perturb = 5381; // djb2a hash
while (*s) perturb = perturb * 33 ^ *s++;
probe = perturb & hashmask;
while ((n = hashtbl[probe])) {
if (! strcmp(dat + offs, dat + stbl[n].symoffs)) return n;
probe = (probe * 5 + 1 + (perturb >>= 5)) & hashmask;
}
hashtbl[probe] = ++*symcnt;
stbl[*symcnt].symoffs = offs;
return *symcnt;
}
static void do_tsort(int fd, char *name)
{
struct symbols *stbl;
struct successors *succlist;
int i, j, k;
int nsym, nsym2;
int *hashtbl, hashmask, symcnt = 0;
char *pred = 0, *succ; // pointers to pair symbols (predecessor, successor)
int npred, nsucc; // indexes of pairs in stbl
int succ_cnt = 0; // number of successors added to succlist structure
char *whitespace = " \r\n\t";
int nleft, front, rear = 0;
char *dat; // input area; holds entire input
off_t plen = 0;
// readfd() ensures dat is at least one byte longer than plen
// which makes it safe to store a NUL byte after last symbol.
if (!(dat = readfd(fd, 0, &plen))) return;
if (plen >= INT_MAX) error_exit("input too big");
// Make sure input has no NUL bytes
for (i = 0; i < plen; i++)
if (!dat[i]) dat[i] = ' ';
// T1. Initialize. Get number of pairs, allocate tables.
// Count input symbols (non-unique)
for (i = nsym = 0; i < plen; nsym++) {
i += strspn(dat + i, whitespace);
if (i == plen) break;
i += strcspn(dat + i, whitespace);
}
// nsym == number_of_pairs * 2
if (nsym & 1) error_exit("bad input (not pairs)");
// Worst case is every symbol is unique (a b, c d, ...)
// so need a hash table for at least nsym entries.
// Hash table must be large enough to load to < 100%
// Size must be power of 2; use next above (102% of nsym plus 5).
for (k = 2; k < nsym + nsym / 50 + 5; k *= 2)
;
hashmask = k - 1;
hashtbl = xzalloc(k * sizeof *hashtbl);
// Need a symbols structure to hold (nsym + 1) symbols (stbl)
// plus successors structure (nsym / 2 + 1) successors (succlist).
// The +1 is to accommodate 1-based indexing.
stbl = xzalloc((nsym+1) * sizeof(*stbl) + (nsym/2+1) * sizeof(*succlist));
succlist = (void *)&stbl[nsym+1];
// re-scan the symbols, enter in hashtbl and stbl.
for (i = nsym2 = 0; i < plen; nsym2++) {
// T2. Next relation. Get a relation from input.
i += strspn(dat + i, whitespace);
if (i == plen) break;
j = strcspn(dat + i, whitespace);
dat[i+j] = 0; // NUL terminate symbol. readfd() ensures safe at buffer end
if (! (nsym2 & 1)) {
pred = dat + i;
} else {
succ = dat + i;
npred = hash_lookup(dat, pred-dat, hashtbl, hashmask, &symcnt, stbl);
nsucc = hash_lookup(dat, i, hashtbl, hashmask, &symcnt, stbl);
// T3. Record the relation.
// Slight mod: if self-reference relation, do not record it.
// (If it's self-reference, the symbol is already in the table now.)
// Note that this links the most recent into the head of the successor
// list, reversing their order in the input. If the symbols table kept a
// tail pointer, they could be kept in the list in original order.
if (strcmp(pred, succ)) {
stbl[nsucc].pq++;
succlist[++succ_cnt].symbx = nsucc;
succlist[succ_cnt].next = stbl[npred].succx;
stbl[npred].succx = succ_cnt;
}
}
i += j + 1;
}
free(hashtbl); // Done using hash table.
// T4. Scan for zeros. Build a queue of symbols w/ no predecessor.
// Note Knuth's QLINK is our stbl[].pq.
stbl[0].pq = 0;
for (k = 1; k <= symcnt; k++) {
if (! stbl[k].pq) {
stbl[rear].pq = k;
rear = k;
}
}
front = stbl[0].pq;
// T5. Output front of queue.
nleft = symcnt; // This will count down as symbols are output.
while (front) {
xprintf("%s\n", dat + stbl[front].symoffs);
nleft--;
j = stbl[front].succx;
stbl[front].succx = 0; // Per Knuth 2.2.3 Exercise 23
// T6. Erase relations. Step through successors of symbol we just output,
// decreasing the predecessor count of each one. If that count went to
// zero, put that symbol on the queue.
while (j) {
stbl[succlist[j].symbx].pq--;
if (stbl[succlist[j].symbx].pq == 0) {
stbl[rear].pq = succlist[j].symbx;
rear = succlist[j].symbx;
}
j = succlist[j].next;
}
// T7. Remove the symbol from front of queue.
front = stbl[front].pq;
}
// T8. End, if OK. If any symbols left, there is a loop (cycle)
// in the associated graph. Find and print a loop.
if (nleft) {
int k0, k, kn, knn;
error_msg("input contains a loop:");
// Per TAOCP Knuth 2.2.3 Exercise 23, find and print a loop.
for (k = 1; k <= symcnt; k++) stbl[k].pq = 0;
// T9. (reuses succx and pq fields) For each symbol, set pq field to
// refer to a predecessor.
for (k = 1; k <= symcnt; k++) {
j = stbl[k].succx;
stbl[k].succx = 0;
// T10. Step thru successors, setting corresponding pq field.
while (j) {
if (!stbl[succlist[j].symbx].pq)
stbl[succlist[j].symbx].pq = k;
j = succlist[j].next;
}
}
// T11. Find a symbol with its pq field set.
for (k = 1; k <= symcnt; k++)
if (stbl[k].pq) break;
//if (k > symcnt) error_exit("bug"); // Not needed if algo is correct...
// T12. Step thru graph backwards, flagging symbols until start of loop.
do {
stbl[k].succx = 1;
k = stbl[k].pq;
} while (stbl[k].succx == 0);
// Per note after T14, the loop links backwards. Reverse the loop.
// Not sure if this is the cleanest way... (rdg)
k0 = k;
kn = stbl[k].pq;
do {
knn = stbl[kn].pq;
stbl[kn].pq = k;
k = kn;
kn = knn;
} while (k != k0);
// T13. Step through loop, printing symbols.
do {
xprintf("%s ->\n", dat + stbl[k].symoffs);
stbl[k].succx = 0;
k = stbl[k].pq;
} while (stbl[k].succx == 1);
// T14. Print first element of loop again to tie it off.
xprintf("%s\n", dat + stbl[k].symoffs);
}
free(stbl);
}
void tsort_main(void)
{
loopfiles(toys.optargs, do_tsort);
}