表驱动的LL(1)语法分析器和LR(1)语法分析器(JAVA实现)
本文主要参考《编译器设计(第二版)》(英文名:Engineering a Compiler)
源码链接:https://github.com/Blore-lzn/Compiler
一、基本数据结构 产生式(Production) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 public static class Production { public final String left; public final ArrayList<String> rights = new ArrayList<>(); public final int handler; private static int HANDLER = 0 ; public Production (String left, String[] rights) { this .left = left; this .rights.addAll(Arrays.asList(rights)); this .handler = HANDLER++; } @Override public boolean equals (Object o) { if (this == o) { return true ; } if (o == null || getClass() != o.getClass()) { return false ; } Production that = (Production) o; return handler == that.handler && Objects.equals(left, that.left) && Objects.equals(rights, that.rights); } @Override public int hashCode () { return Objects.hash(left, rights, handler); } @Override public String toString () { return left + " -> " + rights; } }
文法(Grammar) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 public static class Grammar { public final ArrayList<String> T = new ArrayList<>(); public final ArrayList<String> NT = new ArrayList<>(); public final ArrayList<Production> prods = new ArrayList<>(); public final HashMap<String, ArrayList<Production>> nt2prods = new HashMap<>(); public void addProduction (Production p) { prods.add(p); nt2prods.computeIfAbsent(p.left, k -> new ArrayList<>()); nt2prods.get(p.left).add(p); } @Override public String toString () { StringBuilder sb = new StringBuilder(); for (Production p : prods) { sb.append(p.toString()).append("\n" ); } return sb.toString(); } }
二、基础集合构建 1 FIRST集合 对于语法符号 $\alpha$,FIRST($\alpha$)是从$\alpha$推导出的语句开头可能出现的终结符集合(注意$\alpha$可以为单个语法符号也可以为符号串)
单个语法符号 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 public void generateFirstSet () { firstSet.clear(); for (String s : grammar.T) { HashSet<String> set = new HashSet<>(); set.add(s); firstSet.put(s, set); } for (String s : grammar.NT) { firstSet.put(s, new HashSet<>()); } boolean changed = true ; while (changed) { changed = false ; for (Production p : grammar.prods) { HashSet<String> rhs = new HashSet<>(firstSet.get(p.rights.get(0 ))); firstSet.get(p.rights.get(0 )).stream().filter(s -> !s.equals(EPS)) .forEach(rhs::add); int i = 0 ; while (i < p.rights.size() - 1 && firstSet.get(p.rights.get(i)).contains(EPS)) { firstSet.get(p.rights.get(i + 1 )).stream().filter(s -> !s.equals(EPS)) .forEach(rhs::add); i = i + 1 ; } if (i == p.rights.size() - 1 && firstSet.get(p.rights.get(i)).contains(EPS)) { rhs.add(EPS); } rhs.addAll(firstSet.get(p.left)); if (!Objects.equals(firstSet.get(p.left), rhs)) { changed = true ; firstSet.put(p.left, rhs); } } } }
符号串 (需先计算单个语法符号的FIRST集合)
1 2 3 4 5 6 7 8 9 10 public HashSet<String> getFirstSetOfStrings (ArrayList<String> symbols) { HashSet<String> res = new HashSet<>(); for (String s : symbols) { res.addAll(firstSet.get(s)); if (!firstSet.get(s).contains(EPS)) { break ; } } return res; }
2 FOLLOW集合 对于非终结符 $\alpha$,FOLLOW($\alpha$)是在语句中紧接着$\alpha$出现的单词的集合
(需先计算FIRST集合)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 public void generateFollowSet () { followSet.clear(); for (String s : grammar.NT) { followSet.put(s, new HashSet<>()); } followSet.get(S).add(EOF); boolean changed = true ; while (changed) { changed = false ; for (Production p : grammar.prods) { HashSet<String> trailer = new HashSet<>(followSet.get(p.left)); for (int i = p.rights.size() - 1 ; i >= 0 ; i--) { String bi = p.rights.get(i); if (isNT(bi)) { HashSet<String> newFollowSet = new HashSet<>(trailer); newFollowSet.addAll(followSet.get(bi)); if (!Objects.equals(newFollowSet, followSet.get(bi))) { changed = true ; followSet.put(bi, newFollowSet); } if (firstSet.get(bi).contains(EPS)) { firstSet.get(bi).stream().filter(s -> !s.equals(EPS)) .forEach(trailer::add); } else { trailer = new HashSet<>(firstSet.get(bi)); } } else { trailer = new HashSet<>(firstSet.get(bi)); } } } } }
3 FIRST+集合 增强的FIRST集合(主要解决FIRST存在空串的情况)
无回溯的语法必定具有的性质:对任何匹配多个产生式的非终结符$A$,$A\rightarrow\beta_1\vert\beta_2\vert\dots\vert\beta_n$
1 2 3 4 5 6 7 8 9 10 11 12 13 14 public void generateFirstPlusSet () { firstPlusSet.clear(); for (Production p : grammar.prods) { if (!p.rights.isEmpty()) { HashSet<String> newFirstPlusSet = getFirstSetOfStrings(p.rights); if (!newFirstPlusSet.contains(EPS)) { firstPlusSet.put(p, newFirstPlusSet); } else { newFirstPlusSet.addAll(followSet.get(p.left)); firstPlusSet.put(p, newFirstPlusSet); } } } }
三、表驱动的LL(1)语法分析器 1 构造LL(1)表 1 2 3 4 5 6 7 8 9 10 11 12 13 14 protected HashMap<Pair<String, String>, Production> table = new HashMap<>();public void makeTable () { generateFirstSet(); generateFollowSet(); generateFirstPlusSet(); for (Production p : grammar.prods) { firstPlusSet.get(p).stream().filter(this ::isT) .forEach(w -> table.put(new Pair<>(p.left, w), p)); if (firstPlusSet.get(p).contains(EOF)) { table.put(new Pair<>(p.left, EOF), p); } } }
2 LL(1)语法分析器框架 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 public boolean parse () { Stack<String> st = new Stack<>(); String word = lexer.nextToken(); st.push(EOF); st.push(S); String focus = st.peek(); while (true ) { if (focus.equals(EOF) && word.equals(EOF)) { return true ; } else if (isT(focus) || focus.equals(EOF)) { if (Objects.equals(focus, word)) { st.pop(); word = lexer.nextToken(); } else { return false ; } } else { Production p = table.get(new Pair<>(focus, word)); if (p != null ) { st.pop(); for (int i = p.rights.size() - 1 ; i >= 0 ; i--) { if (!p.rights.get(i).equals(EPS)) { st.push(p.rights.get(i)); } } } else { return false ; } } focus = st.peek(); } }
四、表驱动的LR(1)语法分析器 1 构建规范族 (1) LR(1)项结构 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 public static class LR1Item { public final Production p; public final int pos; public final String next; public LR1Item (Production p, int pos, String next) { this .p = p; this .pos = pos; this .next = next; } public boolean isFinal () { return p.left.equals(S) && pos == p.rights.size() && next.equals(EOF); } public String getSymAfterDot () { if (pos >= p.rights.size()) { return null ; } else { return p.rights.get(pos); } } public boolean hasSymAfterDot () { return pos < p.rights.size(); } public boolean dotBeforeNT (Grammar grammar) { return grammar.NT.contains(getSymAfterDot()); } public boolean dotBeforeSym (String sym) { return Objects.equals(sym, getSymAfterDot()); } @Override public boolean equals (Object o) { if (this == o) { return true ; } if (o == null || getClass() != o.getClass()) { return false ; } LR1Item lr1Item = (LR1Item) o; return pos == lr1Item.pos && Objects.equals(p, lr1Item.p) && Objects.equals(next, lr1Item.next); } @Override public int hashCode () { return Objects.hash(p, pos, next); } @Override public String toString () { ArrayList<String> rights = new ArrayList<>(p.rights); rights.add(pos, "·" ); return "[" + p.left + " -> " + rights + ", " + next + "]" ; } }
(2) 规范组结构 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 public static class CCSet { public HashSet<LR1Item> items = new HashSet<>(); public HashMap<String, CCSet> edges = new HashMap<>(); public int handler = -1 ; public void setHandler (int handler) { this .handler = handler; } public void addEdge (String s, CCSet ccSet) { edges.put(s, ccSet); } public HashSet<String> getAllSymsAfterDot () { HashSet<String> res = new HashSet<>(); for (LR1Item item : items) { if (item.getSymAfterDot() != null ) { res.add(item.getSymAfterDot()); } } return res; } @Override public boolean equals (Object o) { if (this == o) { return true ; } if (o == null || getClass() != o.getClass()) { return false ; } CCSet ccSet = (CCSet) o; return Objects.equals(items, ccSet.items); } @Override public int hashCode () { return Objects.hash(items); } @Override public String toString () { return "CC" + handler; } }
(3) closure过程 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 public CCSet closure (CCSet s) { boolean changed = true ; while (changed) { changed = false ; HashSet<LR1Item> newItems = new HashSet<>(s.items); for (LR1Item item : s.items) { if (item.dotBeforeNT(grammar)) { String nt = item.p.rights.get(item.pos); ArrayList<String> succ = new ArrayList<>(); for (int i = item.pos + 1 ; i < item.p.rights.size(); i++) { succ.add(item.p.rights.get(i)); } succ.add(item.next); HashSet<String> succFirstSet = getFirstSetOfStrings(succ); for (Production p : grammar.nt2prods.get(nt)) { for (String b : succFirstSet) { LR1Item it = new LR1Item(p, 0 , b); if (!newItems.contains(it)) { changed = true ; newItems.add(it); } } } } } s.items = newItems; } return s; }
(4) goto过程 1 2 3 4 5 6 7 8 9 public CCSet gotoFunc (CCSet s, String x) { CCSet moved = new CCSet(); for (LR1Item i : s.items) { if (i.dotBeforeSym(x)) { moved.items.add(new LR1Item(i.p, i.pos + 1 , i.next)); } } return closure(moved); }
(5) 构建CC 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 public void makeCCSets () { CCSet cc0 = new CCSet(); for (Production p : grammar.nt2prods.get(S)) { cc0.items.add(new LR1Item(p, 0 , EOF)); } cc0 = closure(cc0); allCCSets.add(cc0); boolean changed = true ; HashSet<CCSet> vis = new HashSet<>(); while (changed) { changed = false ; ArrayList<CCSet> newAllCCSets = new ArrayList<>(allCCSets); for (CCSet cc : allCCSets) { if (!vis.contains(cc)) { vis.add(cc); for (String x : cc.getAllSymsAfterDot()) { CCSet temp = gotoFunc(cc, x); if (!newAllCCSets.contains(temp)) { changed = true ; newAllCCSets.add(temp); } cc.addEdge(x, temp); } } } allCCSets = newAllCCSets; } for (int i = 0 ; i < allCCSets.size(); i++) { allCCSets.get(i).setHandler(i); } }
(6) 填表 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 public void makeTable () { for (CCSet cc : allCCSets) { for (LR1Item item : cc.items) { if (item.hasSymAfterDot() && isT(item.getSymAfterDot()) && cc.edges.containsKey(item.getSymAfterDot())) { actionMap.put(new Pair<>(cc, item.getSymAfterDot()), new Pair<>(SHIFT, cc.edges.get(item.getSymAfterDot()))); } else if (!item.hasSymAfterDot()) { actionMap.put(new Pair<>(cc, item.next), new Pair<>(REDUCE, item.p)); } if (item.isFinal()) { actionMap.put(new Pair<>(cc, EOF), new Pair<>(ACCEPTED, null )); } } for (String nt : grammar.NT) { if (cc.edges.containsKey(nt)) { gotoMap.put(new Pair<>(cc, nt), cc.edges.get(nt)); } } } }
2 LR(1)语法分析器框架 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 public boolean parse () { Stack<Object> st = new Stack<>(); st.push("$" ); st.push(allCCSets.get(0 )); CCSet state; String word = lexer.nextToken(); while (true ) { state = (CCSet) st.peek(); Pair<String, Object> action = actionMap.get(new Pair<>(state, word)); if (action == null ) { return false ; } if (action.getFirst().equals(REDUCE)) { Production p = ((Production) action.getSecond()); for (int i = 0 ; i < p.rights.size(); i++) { st.pop(); st.pop(); } state = (CCSet) st.peek(); st.push(p.left); st.push(gotoMap.get(new Pair<>(state, p.left))); } else if (action.getFirst().equals(SHIFT)) { st.push(word); st.push(action.getSecond()); word = lexer.nextToken(); } else if (action.getFirst().equals(ACCEPTED)) { break ; } } return true ; }