-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlexer.c
115 lines (102 loc) · 3.33 KB
/
lexer.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "anicca.h"
#include "verb.h"
#include "adverb.h"
#include "conjunction.h"
#include "noun.h"
#include "lexer.h"
#include "primitive.h"
/* Dimensions of the dfa table: DROW rows by DCOL columns. */
#define DCOL 9
#define DROW 10
/*
dfa
Transition table driving token_index, which looks entries up as
dfa[current state][ctype(character)].
Each entry pairs the next state with an effect code. token_index acts on
the effect first and then moves to the next state:
EO - nothing is recorded.
EN - the current position becomes the start of a word.
EW - the word in progress is recorded and a new word starts here.
EY - the word in progress is recorded and no new word is started.
EV - as EW, but the word opens or extends a merged run of words.
EZ - as EY, but the word opens or extends a merged run of words.
Each row is labelled with the state it belongs to; the column legend
follows the last row.
NOTE(review): the row order must match the numeric values of the state
constants and the column order the values returned by ctype() - both are
defined outside this file, confirm there.
*/
static ST dfa[DROW][DCOL] = {
/*SS*/ {{SX,EN},{SS,EO},{SA,EN},{SN,EN},{SA,EN},{S9,EN},{SX,EN},{SX,EN},{SQ,EN}},
/*SX*/ {{SX,EW},{SS,EY},{SA,EW},{SN,EW},{SA,EW},{S9,EW},{SX,EO},{SX,EO},{SQ,EW}},
/*SA*/ {{SX,EW},{SS,EY},{SA,EO},{SA,EO},{SA,EO},{SA,EO},{SX,EO},{SX,EO},{SQ,EW}},
/*SN*/ {{SX,EW},{SS,EY},{SA,EO},{SA,EO},{SM,EO},{SA,EO},{SX,EO},{SX,EO},{SQ,EW}},
/*SM*/ {{SX,EW},{SS,EY},{SA,EO},{SA,EO},{SA,EO},{SA,EO},{SO,EO},{SX,EO},{SQ,EW}},
/*SO*/ {{SZ,EO},{SZ,EO},{SZ,EO},{SZ,EO},{SZ,EO},{SZ,EO},{SX,EO},{SX,EO},{SZ,EO}},
/*S9*/ {{SX,EV},{SS,EZ},{S9,EO},{S9,EO},{S9,EO},{S9,EO},{S9,EO},{SX,EO},{SQ,EV}},
/*SQ*/ {{SQ,EO},{SQ,EO},{SQ,EO},{SQ,EO},{SQ,EO},{SQ,EO},{SQ,EO},{SQ,EO},{SC,EO}},
/*SC*/ {{SX,EW},{SS,EY},{SA,EW},{SN,EW},{SA,EW},{S9,EW},{SX,EW},{SX,EW},{SQ,EO}},
/*SZ*/ {{SZ,EO},{SZ,EO},{SZ,EO},{SZ,EO},{SZ,EO},{SZ,EO},{SZ,EO},{SZ,EO},{SZ,EO}}
/* CX CS CA CN CB C9 CD CC CQ */
};
/*
  parse_literal
    Hands a token to gstr with its first and last character left out.
  input:
    n: Length of the token, both outer characters included.
    s: Pointer to the first character of the token.
  output: Whatever gstr builds from the (n-2) characters starting at s+1.
*/
static A parse_literal(I n, C *s) {
    R gstr(n - 2, s + 1);
}
/*
  parse_name
    Hands a token to gnm unchanged.
  input:
    n: Length of the token.
    s: Pointer to the first character of the token.
  output: Whatever gnm builds from the n characters starting at s.
*/
static A parse_name(I n, C *s) {
    R gnm(n, s);
}
/*
token_index
input: Boxed string to be lexed.
output: Array of size 2n (n = number of tokens), in the form:
[start index token 1, length token 1, start index token 2,
length token 2, ..., start index token n, length token n].
*/
static MONAD(token_index) {
C e, sn, t, s=SS, vec=0, *str=CAV(y);
I i, jv, j=0, k=0, n=AN(y), *v;
ST pr;
A z=ga(INT,1,n+n,NULL); v=IAV(z);
DO(n, t=ctype(str[i]); pr=dfa[s][t];
e=pr.effect; sn=pr.new;
switch (e) {
case EO: break;
case EN: { j=i; break; }
case EW: { v[k++]=j; v[k++]=i-j; j=i; break; }
case EY: { v[k++]=j; v[k++]=i-j; j=-1; break; }
case EV: {
if (!vec) { v[k++]=j; v[k]=i-j; jv=j; }
else { v[k]=i-jv; }
j=i; vec=1; break;
}
case EZ: {
if (!vec) { v[k++]=j; v[k]=i-j; jv=j; }
else { v[k]=i-jv; }
j=-1; vec=1; break;
}
case ES: goto end; break;
}
if (vec && sn!=S9 && sn!=SS) { vec=0; k++; }
s=sn;
);
end:
z=ra(z,INT,k); R z;
}
/*
  tokens
    Splits y into words with token_index and turns every word into a value.
  input:
    y: Character array to be tokenized.
  output: Boxed array holding one leading mark, the converted words in
    source order, and four trailing marks.
  notes:
    - Each word is first offered to primitive_lookup; only when that yields
      a MARK-typed result is the word classified by its first character and
      parsed as a noun, a literal or a name.
    - A word that matches none of these is dropped without a diagnostic.
      NOTE(review): z is sized for every word found, so each dropped word
      leaves one slot at the end of z unwritten - confirm how callers cope.
*/
MONAD(tokens) {
    C c, vn, *str=CAV(y), *s;
    A x=token_index(y), z, v, *av;
    I n=AN(x)/2, *indx=IAV(x), j, ws, wl, t;
    /* Room for n words plus one mark in front and four behind. */
    z = ga(BOX,1,n+5,NULL); av = AAV(z); *av++ = mark;
    DO(n, j=i+i; ws=indx[j]; wl=indx[j+1];
        s=&str[ws]; c=*s; t=ctype(c);
        vn=verb_name(wl,s); v=primitive_lookup(vn);
        if (AT(v)&MARK) {
            switch (t) {
            case CS:
            case C9: { *av++ = parse_noun(wl,s); break; }
            case CQ: { *av++ = parse_literal(wl,s); break; }
            case CA: { *av++ = parse_name(wl,s); break; }
            default:
                /* The dfa table has letter columns besides CA (CN and CB -
                   presumably the letters N and B; confirm against ctype).
                   A word that starts with such a letter is still a name
                   and must not be dropped. */
                if (isalpha((unsigned char)c)) { *av++ = parse_name(wl,s); }
                break; /* anything else: error */
            }
        }
        else { *av++=v; }
    );
    DO(4, *av++=mark); R z;
}