表驱动的LL(1)语法分析器和LR(1)语法分析器(JAVA实现)

表驱动的LL(1)语法分析器和LR(1)语法分析器(JAVA实现)

本文主要参考《编译器设计(第二版)》(英文名:Engineering a Compiler)

源码链接:https://github.com/Blore-lzn/Compiler

一、基本数据结构

产生式(Production)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
public static class Production {
public final String left; // 左部符号
public final ArrayList<String> rights = new ArrayList<>(); // 右部符号串
public final int handler;
private static int HANDLER = 0;

public Production(String left, String[] rights) {
this.left = left;
this.rights.addAll(Arrays.asList(rights));
this.handler = HANDLER++;
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
Production that = (Production) o;
return handler == that.handler && Objects.equals(left, that.left) &&
Objects.equals(rights, that.rights);
}

@Override
public int hashCode() {
return Objects.hash(left, rights, handler);
}

@Override
public String toString() {
return left + " -> " + rights;
}
}

文法(Grammar)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
public static class Grammar {
public final ArrayList<String> T = new ArrayList<>(); // 终结符
public final ArrayList<String> NT = new ArrayList<>(); // 非终结符
public final ArrayList<Production> prods = new ArrayList<>(); //产生式
public final HashMap<String, ArrayList<Production>> nt2prods = new HashMap<>();

public void addProduction(Production p) {
prods.add(p);
nt2prods.computeIfAbsent(p.left, k -> new ArrayList<>());
nt2prods.get(p.left).add(p);
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (Production p : prods) {
sb.append(p.toString()).append("\n");
}
return sb.toString();
}
}

二、基础集合构建

1 FIRST集合

对于语法符号 $\alpha$,FIRST($\alpha$)是从$\alpha$推导出的语句开头可能出现的终结符集合(注意$\alpha$可以为单个语法符号也可以为符号串)

单个语法符号

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
public void generateFirstSet() {
firstSet.clear();
/* 终结符的FIRST集是其本身 */
for (String s : grammar.T) {
HashSet<String> set = new HashSet<>();
set.add(s);
firstSet.put(s, set);
}
for (String s : grammar.NT) {
firstSet.put(s, new HashSet<>());
}
boolean changed = true;
while (changed) {
changed = false;

for (Production p : grammar.prods) {
HashSet<String> rhs = new HashSet<>(firstSet.get(p.rights.get(0)));
firstSet.get(p.rights.get(0)).stream().filter(s -> !s.equals(EPS))
.forEach(rhs::add);
int i = 0;
while (i < p.rights.size() - 1 && firstSet.get(p.rights.get(i)).contains(EPS)) {
firstSet.get(p.rights.get(i + 1)).stream().filter(s -> !s.equals(EPS))
.forEach(rhs::add);
i = i + 1;
}
if (i == p.rights.size() - 1 && firstSet.get(p.rights.get(i)).contains(EPS)) {
rhs.add(EPS);
}
rhs.addAll(firstSet.get(p.left));
if (!Objects.equals(firstSet.get(p.left), rhs)) {
changed = true;
firstSet.put(p.left, rhs);
}
}
}
}

符号串

(需先计算单个语法符号的FIRST集合)

1
2
3
4
5
6
7
8
9
10
public HashSet<String> getFirstSetOfStrings(ArrayList<String> symbols) {
HashSet<String> res = new HashSet<>();
for (String s : symbols) {
res.addAll(firstSet.get(s));
if (!firstSet.get(s).contains(EPS)) {
break;
}
}
return res;
}

2 FOLLOW集合

对于非终结符 $\alpha$,FOLLOW($\alpha$)是在语句中紧接着$\alpha$出现的单词的集合

(需先计算FIRST集合)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
public void generateFollowSet() {
followSet.clear();
for (String s : grammar.NT) {
followSet.put(s, new HashSet<>());
}
followSet.get(S).add(EOF);
boolean changed = true;
while (changed) {
changed = false;
for (Production p : grammar.prods) {
HashSet<String> trailer = new HashSet<>(followSet.get(p.left));
for (int i = p.rights.size() - 1; i >= 0; i--) {
String bi = p.rights.get(i);
if (isNT(bi)) {
HashSet<String> newFollowSet = new HashSet<>(trailer);
newFollowSet.addAll(followSet.get(bi));
if (!Objects.equals(newFollowSet, followSet.get(bi))) {
changed = true;
followSet.put(bi, newFollowSet);
}
if (firstSet.get(bi).contains(EPS)) {
firstSet.get(bi).stream().filter(s -> !s.equals(EPS))
.forEach(trailer::add);
} else {
trailer = new HashSet<>(firstSet.get(bi));
}
} else {
trailer = new HashSet<>(firstSet.get(bi));
}
}
}
}
}

3 FIRST+集合

增强的FIRST集合(主要解决FIRST存在空串的情况)

无回溯的语法必定具有的性质:对任何匹配多个产生式的非终结符$A$,$A\rightarrow\beta_1\vert\beta_2\vert\dots\vert\beta_n$

1
2
3
4
5
6
7
8
9
10
11
12
13
14
public void generateFirstPlusSet() {
firstPlusSet.clear();
for (Production p : grammar.prods) {
if (!p.rights.isEmpty()) {
HashSet<String> newFirstPlusSet = getFirstSetOfStrings(p.rights);
if (!newFirstPlusSet.contains(EPS)) {
firstPlusSet.put(p, newFirstPlusSet);
} else {
newFirstPlusSet.addAll(followSet.get(p.left));
firstPlusSet.put(p, newFirstPlusSet);
}
}
}
}

三、表驱动的LL(1)语法分析器

1 构造LL(1)表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
protected HashMap<Pair<String, String>, Production> table = new HashMap<>();

public void makeTable() {
generateFirstSet();
generateFollowSet();
generateFirstPlusSet();
for (Production p : grammar.prods) {
firstPlusSet.get(p).stream().filter(this::isT)
.forEach(w -> table.put(new Pair<>(p.left, w), p));
if (firstPlusSet.get(p).contains(EOF)) {
table.put(new Pair<>(p.left, EOF), p);
}
}
}

2 LL(1)语法分析器框架

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
public boolean parse() {
Stack<String> st = new Stack<>();
String word = lexer.nextToken();
st.push(EOF);
st.push(S);
String focus = st.peek();
while (true) {
if (focus.equals(EOF) && word.equals(EOF)) {
return true;
} else if (isT(focus) || focus.equals(EOF)) {
if (Objects.equals(focus, word)) {
st.pop();
word = lexer.nextToken();
} else {
return false;
}
} else {
Production p = table.get(new Pair<>(focus, word));
if (p != null) {
st.pop();
for (int i = p.rights.size() - 1; i >= 0; i--) {
if (!p.rights.get(i).equals(EPS)) {
st.push(p.rights.get(i));
}
}
} else {
return false;
}
}
focus = st.peek();
}
}

四、表驱动的LR(1)语法分析器

1 构建规范族

(1) LR(1)项结构

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
public static class LR1Item {
public final Production p;
public final int pos; // ·在p.rights[pos]前
public final String next;

public LR1Item(Production p, int pos, String next) {
this.p = p;
this.pos = pos;
this.next = next;
}

public boolean isFinal() {
return p.left.equals(S) && pos == p.rights.size() && next.equals(EOF);
}

public String getSymAfterDot() {
if (pos >= p.rights.size()) {
return null;
} else {
return p.rights.get(pos);
}
}

public boolean hasSymAfterDot() {
return pos < p.rights.size();
}

public boolean dotBeforeNT(Grammar grammar) {
return grammar.NT.contains(getSymAfterDot());
}

public boolean dotBeforeSym(String sym) {
return Objects.equals(sym, getSymAfterDot());
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
LR1Item lr1Item = (LR1Item) o;
return pos == lr1Item.pos && Objects.equals(p, lr1Item.p) &&
Objects.equals(next, lr1Item.next);
}

@Override
public int hashCode() {
return Objects.hash(p, pos, next);
}

@Override
public String toString() {
ArrayList<String> rights = new ArrayList<>(p.rights);
rights.add(pos, "·");
return "[" + p.left + " -> " + rights + ", " + next + "]";
}
}

(2) 规范组结构

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
public static class CCSet {
public HashSet<LR1Item> items = new HashSet<>();
public HashMap<String, CCSet> edges = new HashMap<>();
public int handler = -1;

public void setHandler(int handler) {
this.handler = handler;
}

public void addEdge(String s, CCSet ccSet) {
edges.put(s, ccSet);
}

public HashSet<String> getAllSymsAfterDot() {
HashSet<String> res = new HashSet<>();
for (LR1Item item : items) {
if (item.getSymAfterDot() != null) {
res.add(item.getSymAfterDot());
}
}
return res;
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
CCSet ccSet = (CCSet) o;
return Objects.equals(items, ccSet.items);
}

@Override
public int hashCode() {
return Objects.hash(items);
}

@Override
public String toString() {
return "CC" + handler;
}
}

(3) closure过程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
public CCSet closure(CCSet s) {
boolean changed = true;
while (changed) {
changed = false;
HashSet<LR1Item> newItems = new HashSet<>(s.items);
for (LR1Item item : s.items) {
if (item.dotBeforeNT(grammar)) {
String nt = item.p.rights.get(item.pos);
ArrayList<String> succ = new ArrayList<>();
for (int i = item.pos + 1; i < item.p.rights.size(); i++) {
succ.add(item.p.rights.get(i));
}
succ.add(item.next);
HashSet<String> succFirstSet = getFirstSetOfStrings(succ);
for (Production p : grammar.nt2prods.get(nt)) {
for (String b : succFirstSet) {
LR1Item it = new LR1Item(p, 0, b);
if (!newItems.contains(it)) {
changed = true;
newItems.add(it);
}
}
}
}
}
s.items = newItems;
}

return s;
}

(4) goto过程

1
2
3
4
5
6
7
8
9
public CCSet gotoFunc(CCSet s, String x) {
CCSet moved = new CCSet();
for (LR1Item i : s.items) {
if (i.dotBeforeSym(x)) {
moved.items.add(new LR1Item(i.p, i.pos + 1, i.next));
}
}
return closure(moved);
}

(5) 构建CC

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
public void makeCCSets() {
CCSet cc0 = new CCSet();
for (Production p : grammar.nt2prods.get(S)) {
cc0.items.add(new LR1Item(p, 0, EOF));
}
cc0 = closure(cc0);
allCCSets.add(cc0);
boolean changed = true;
HashSet<CCSet> vis = new HashSet<>();
while (changed) {
changed = false;
ArrayList<CCSet> newAllCCSets = new ArrayList<>(allCCSets);
for (CCSet cc : allCCSets) {
if (!vis.contains(cc)) {
vis.add(cc);
for (String x : cc.getAllSymsAfterDot()) {
CCSet temp = gotoFunc(cc, x);
if (!newAllCCSets.contains(temp)) {
changed = true;
newAllCCSets.add(temp);
}
cc.addEdge(x, temp);
}
}
}
allCCSets = newAllCCSets;
}
// 编号(可省略)
for (int i = 0; i < allCCSets.size(); i++) {
allCCSets.get(i).setHandler(i);
}
}

(6) 填表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
public void makeTable() {
for (CCSet cc : allCCSets) {
for (LR1Item item : cc.items) {
if (item.hasSymAfterDot() && isT(item.getSymAfterDot()) &&
cc.edges.containsKey(item.getSymAfterDot())) {
actionMap.put(new Pair<>(cc, item.getSymAfterDot()),
new Pair<>(SHIFT, cc.edges.get(item.getSymAfterDot())));
} else if (!item.hasSymAfterDot()) {
actionMap.put(new Pair<>(cc, item.next),
new Pair<>(REDUCE, item.p));
}
// 原文伪代码为else if
if (item.isFinal()) {
actionMap.put(new Pair<>(cc, EOF),
new Pair<>(ACCEPTED, null));
}
}

for (String nt : grammar.NT) {
if (cc.edges.containsKey(nt)) {
gotoMap.put(new Pair<>(cc, nt), cc.edges.get(nt));
}
}
}
}

2 LR(1)语法分析器框架

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
public boolean parse() {
Stack<Object> st = new Stack<>();
st.push("$");
st.push(allCCSets.get(0));
CCSet state;
String word = lexer.nextToken();
while (true) {
state = (CCSet) st.peek();
Pair<String, Object> action = actionMap.get(new Pair<>(state, word));
if (action == null) {
return false;
}
if (action.getFirst().equals(REDUCE)) {
Production p = ((Production) action.getSecond());
for (int i = 0; i < p.rights.size(); i++) {
st.pop();
st.pop();
}
state = (CCSet) st.peek();
st.push(p.left);
st.push(gotoMap.get(new Pair<>(state, p.left)));
} else if (action.getFirst().equals(SHIFT)) {
st.push(word);
st.push(action.getSecond());
word = lexer.nextToken();
} else if (action.getFirst().equals(ACCEPTED)) {
break;
}
}

return true;
}