Auto merge of #21452 - bleibig:bison-grammar, r=nikomatsakis

This adds a new lexer/parser combo for the entire Rust language that can be generated with flex and bison, taken from my project at https://github.com/bleibig/rust-grammar. There is also a testing script that runs the generated parser with all *.rs files in the repository (except for tests in compile-fail or ones that are marked as "ignore-test" or "ignore-lexer-test"). If you have flex and bison installed, you can run these tests using the new "check-grammar" make target. This does not depend on or interact with the existing testing code in the grammar, which only provides and tests a lexer specification. OS X users should take note that the version of bison that comes with the Xcode toolchain (2.3) is too old to work with this grammar; they need to download and install version 3.0 or later. The parser builds up an S-expression-based AST, which can be displayed by giving the "-v" argument to parser-lalr (normally it only gives output on error). It is only a rough approximation of what is parsed and doesn't capture every detail and nuance of the program. Hopefully this should be sufficient for issue #2234, or at least a good starting point.
2015-01-24 22:14:14 +00:00 · 2015-01-24 22:14:14 +00:00 · 4e4e8cff16
parent bb7cc4eb26 f39297f991
commit 4e4e8cff16
7 changed files with 2663 additions and 0 deletions
--- a/2
+++ b/2
@ -645,6 +645,8 @@ probe CFG_ISCC             iscc
 probe CFG_JAVAC            javac
 probe CFG_ANTLR4           antlr4
 probe CFG_GRUN             grun
+probe CFG_FLEX             flex
+probe CFG_BISON            bison
 probe CFG_PANDOC           pandoc
 probe CFG_PDFLATEX         pdflatex
 probe CFG_XELATEX          xelatex
--- a/mk/grammar.mk
+++ b/mk/grammar.mk
@ -14,6 +14,11 @@ B = $(CFG_BUILD_DIR)/$(CFG_BUILD)/stage2/
 L = $(B)lib/rustlib/$(CFG_BUILD)/lib
 LD = $(CFG_BUILD)/stage2/lib/rustlib/$(CFG_BUILD)/lib/
 RUSTC = $(STAGE2_T_$(CFG_BUILD)_H_$(CFG_BUILD))
+# Linker flag for flex's support library, used when linking parser-lalr:
+# -ll when CFG_OSTYPE is apple-darwin, -lfl everywhere else.
+# The assignments are indented with spaces, not a tab, so make can never
+# mistake them for recipe lines of a preceding rule.
+ifeq ($(CFG_OSTYPE),apple-darwin)
+  FLEX_LDFLAGS=-ll
+else
+  FLEX_LDFLAGS=-lfl
+endif

 # Run the reference lexer against libsyntax and compare the tokens and spans.
 # If "// ignore-lexer-test" is present in the file, it will be ignored.
@ -67,3 +72,46 @@ $(info cfg: javac not available, skipping lexer test...)
 check-lexer:

 endif
+
+# Generate the C scanner from the flex specification. $(BG) is an order-only
+# prerequisite: the output directory has to exist, but its timestamp (which
+# changes whenever a file is created in it) must not force flex to re-run.
+$(BG)lex.yy.c: $(SG)lexer.l | $(BG)
+	@$(call E, flex: $@)
+	$(Q)$(CFG_FLEX) -o $@ $<
+
+# Compile the flex-generated scanner. The bison-generated header is
+# force-included with -include, which is why it is listed as a prerequisite.
+$(BG)lexer-lalr.o: $(BG)lex.yy.c $(BG)parser-lalr.tab.h
+	@$(call E, cc: $@)
+	$(Q)$(CFG_CC) -include $(BG)parser-lalr.tab.h -c -o $@ $<
+
+# One bison run writes both the parser source (--output) and its header
+# (--defines). Only the .c file is the target of the recipe; listing both
+# files on one rule header would declare two independent rules, letting bison
+# run twice (and race under make -j). $(BG) is order-only: the directory must
+# exist before bison writes into it.
+$(BG)parser-lalr.tab.c: $(SG)parser-lalr.y | $(BG)
+	@$(call E, bison: $@)
+	$(Q)$(CFG_BISON) $< --output=$@ --defines=$(BG)parser-lalr.tab.h \
+		--name-prefix=rs --warnings=error=all
+
+# The header is a by-product of the rule above; the empty recipe just ties
+# its freshness to the generated .c file.
+$(BG)parser-lalr.tab.h: $(BG)parser-lalr.tab.c ;
+
+# Compile the bison-generated parser source.
+$(BG)parser-lalr.o: $(BG)parser-lalr.tab.c
+	@$(call E, cc: $@)
+	$(Q)$(CFG_CC) -c -o $@ $<
+
+# Compile the hand-written driver (rslex, the node helpers and main).
+# It is built as C99.
+$(BG)parser-lalr-main.o: $(SG)parser-lalr-main.c
+	@$(call E, cc: $@)
+	$(Q)$(CFG_CC) -std=c99 -c -o $@ $<
+
+# Link the parser, driver and scanner objects into the parser-lalr
+# executable, adding the flex support library through FLEX_LDFLAGS.
+$(BG)parser-lalr: $(BG)parser-lalr.o $(BG)parser-lalr-main.o $(BG)lexer-lalr.o
+	@$(call E, cc: $@)
+	$(Q)$(CFG_CC) -o $@ $^ $(FLEX_LDFLAGS)
+
+
+# check-grammar: build parser-lalr and run testparser.py with it over the
+# sources under $(S)src. When flex or bison is not configured, the target is
+# defined with no recipe and a notice is printed while the makefile is read.
+# The target names an action rather than a file, so it is declared phony; a
+# stray file called check-grammar cannot make it look up to date.
+.PHONY: check-grammar
+
+ifdef CFG_FLEX
+ifdef CFG_BISON
+check-grammar: $(BG) $(BG)parser-lalr
+	$(info Verifying grammar ...)
+	$(SG)testparser.py -p $(BG)parser-lalr -s $(S)src
+
+else
+$(info cfg: bison not available, skipping parser test...)
+check-grammar:
+
+endif
+else
+$(info cfg: flex not available, skipping parser test...)
+check-grammar:
+
+endif
--- a/src/grammar/lexer.l
+++ b/src/grammar/lexer.l
@ -0,0 +1,342 @@
+%{
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#include <stdio.h>
+#include <ctype.h>
+
+// Scratch counters shared by the hash-delimited raw string rules below.
+static int num_hashes;   // hashes counted after the r / br prefix
+static int end_hashes;   // hashes counted so far after a candidate closing quote
+static int saw_non_hash; // set once a non-hash character is consumed; only read by the rawbytestr rules
+
+%}
+
+%option stack
+%option yylineno
+
+%x str
+%x rawstr
+%x rawstr_esc_begin
+%x rawstr_esc_body
+%x rawstr_esc_end
+%x byte
+%x bytestr
+%x rawbytestr
+%x rawbytestr_nohash
+%x pound
+%x shebang_or_attr
+%x ltorchar
+%x linecomment
+%x doc_line
+%x blockcomment
+%x doc_block
+%x suffix
+
+ident [a-zA-Z\x80-\xff_][a-zA-Z0-9\x80-\xff_]*
+
+%%
+
+<suffix>{ident}            { BEGIN(INITIAL); }
+<suffix>(.|\n)  { yyless(0); BEGIN(INITIAL); }
+
+[ \n\t\r]             { }
+
+\xef\xbb\xbf {
+  // UTF-8 byte order mark (BOM), ignore if in line 1, error otherwise
+  if (yyget_lineno() != 1) {
+    return -1;
+  }
+}
+
+\/\/(\/|\!)           { BEGIN(doc_line); yymore(); }
+<doc_line>\n          { BEGIN(INITIAL);
+                        yyleng--;
+                        yytext[yyleng] = 0;
+                        return ((yytext[2] == '!') ? INNER_DOC_COMMENT : OUTER_DOC_COMMENT);
+                      }
+<doc_line>[^\n]*      { yymore(); }
+
+\/\/|\/\/\/\/         { BEGIN(linecomment); }
+<linecomment>\n       { BEGIN(INITIAL); }
+<linecomment>[^\n]*   { }
+
+\/\*(\*|\!)[^*]       { yy_push_state(INITIAL); yy_push_state(doc_block); yymore(); }
+<doc_block>\/\*       { yy_push_state(doc_block); yymore(); }
+<doc_block>\*\/       {
+    yy_pop_state();
+    if (yy_top_state() == doc_block) {
+        yymore();
+    } else {
+        return ((yytext[2] == '!') ? INNER_DOC_COMMENT : OUTER_DOC_COMMENT);
+    }
+}
+<doc_block>(.|\n)     { yymore(); }
+
+\/\*                  { yy_push_state(blockcomment); }
+<blockcomment>\/\*    { yy_push_state(blockcomment); }
+<blockcomment>\*\/    { yy_pop_state(); }
+<blockcomment>(.|\n)   { }
+
+_        { return UNDERSCORE; }
+as       { return AS; }
+box      { return BOX; }
+break    { return BREAK; }
+const    { return CONST; }
+continue { return CONTINUE; }
+crate    { return CRATE; }
+else     { return ELSE; }
+enum     { return ENUM; }
+extern   { return EXTERN; }
+false    { return FALSE; }
+fn       { return FN; }
+for      { return FOR; }
+if       { return IF; }
+impl     { return IMPL; }
+in       { return IN; }
+let      { return LET; }
+loop     { return LOOP; }
+match    { return MATCH; }
+mod      { return MOD; }
+move     { return MOVE; }
+mut      { return MUT; }
+priv     { return PRIV; }
+proc     { return PROC; }
+pub      { return PUB; }
+ref      { return REF; }
+return   { return RETURN; }
+self     { return SELF; }
+static   { return STATIC; }
+struct   { return STRUCT; }
+trait    { return TRAIT; }
+true     { return TRUE; }
+type     { return TYPE; }
+typeof   { return TYPEOF; }
+unsafe   { return UNSAFE; }
+use      { return USE; }
+where    { return WHERE; }
+while    { return WHILE; }
+
+{ident}  { return IDENT; }
+
+0x[0-9a-fA-F_]+                                    { BEGIN(suffix); return LIT_INTEGER; }
+0o[0-7_]+                                          { /* octal digits are 0-7 only */ BEGIN(suffix); return LIT_INTEGER; }
+0b[01_]+                                           { BEGIN(suffix); return LIT_INTEGER; }
+[0-9][0-9_]*                                       { BEGIN(suffix); return LIT_INTEGER; }
+[0-9][0-9_]*\.(\.|[a-zA-Z])    {
+  // Digits followed by a dot and then a second dot or a letter: hand the
+  // dot and the character after it back so only the digits form the token.
+  yyless(yyleng - 2); BEGIN(suffix); return LIT_INTEGER;
+}
+
+[0-9][0-9_]*\.[0-9_]*([eE][-\+]?[0-9_]+)?          { BEGIN(suffix); return LIT_FLOAT; }
+[0-9][0-9_]*(\.[0-9_]*)?[eE][-\+]?[0-9_]+          { BEGIN(suffix); return LIT_FLOAT; }
+
+;      { return ';'; }
+,      { return ','; }
+\.\.\. { return DOTDOTDOT; }
+\.\.   { return DOTDOT; }
+\.     { return '.'; }
+\(     { return '('; }
+\)     { return ')'; }
+\{     { return '{'; }
+\}     { return '}'; }
+\[     { return '['; }
+\]     { return ']'; }
+@      { return '@'; }
+#      { BEGIN(pound); yymore(); }
+<pound>\! { BEGIN(shebang_or_attr); yymore(); }
+<shebang_or_attr>\[ {
+  BEGIN(INITIAL);
+  yyless(2);
+  return SHEBANG;
+}
+<shebang_or_attr>[^\[\n]*\n {
+  // Since the \n was eaten as part of the token, yylineno will have
+  // been incremented to the value 2 if the shebang was on the first
+  // line. This yyless undoes that, setting yylineno back to 1.
+  yyless(yyleng - 1);
+  if (yyget_lineno() == 1) {
+    BEGIN(INITIAL);
+    return SHEBANG_LINE;
+  } else {
+    BEGIN(INITIAL);
+    yyless(2);
+    return SHEBANG;
+  }
+}
+<pound>. { BEGIN(INITIAL); yyless(1); return '#'; }
+
+\~     { return '~'; }
+::     { return MOD_SEP; }
+:      { return ':'; }
+\$     { return '$'; }
+\?     { return '?'; }
+
+==    { return EQEQ; }
+=>    { return FAT_ARROW; }
+=     { return '='; }
+\!=   { return NE; }
+\!    { return '!'; }
+\<=   { return LE; }
+\<\<  { return SHL; }
+\<\<= { return SHLEQ; }
+\<    { return '<'; }
+\>=   { return GE; }
+\>\>  { return SHR; }
+\>\>= { return SHREQ; }
+\>    { return '>'; }
+
+\x27                                  { BEGIN(ltorchar); yymore(); }
+<ltorchar>static                      { BEGIN(INITIAL); return STATIC_LIFETIME; }
+<ltorchar>{ident}                     { BEGIN(INITIAL); return LIFETIME; }
+<ltorchar>\\[nrt\\\x27\x220]\x27      { BEGIN(suffix); return LIT_CHAR; }
+<ltorchar>\\x[0-9a-fA-F]{2}\x27       { BEGIN(suffix); return LIT_CHAR; }
+<ltorchar>\\u\{[0-9a-fA-F]?{6}\}\x27  { BEGIN(suffix); return LIT_CHAR; }
+<ltorchar>.\x27                       { BEGIN(suffix); return LIT_CHAR; }
+<ltorchar>[\x80-\xff]{2,4}\x27        { BEGIN(suffix); return LIT_CHAR; }
+<ltorchar><<EOF>>                     { BEGIN(INITIAL); return -1; }
+
+b\x22              { BEGIN(bytestr); yymore(); }
+<bytestr>\x22      { BEGIN(suffix); return LIT_BINARY; }
+
+<bytestr><<EOF>>                { return -1; }
+<bytestr>\\[n\nrt\\\x27\x220]   { yymore(); }
+<bytestr>\\x[0-9a-fA-F]{2}      { yymore(); }
+<bytestr>\\u\{[0-9a-fA-F]?{6}\} { yymore(); }
+<bytestr>\\[^n\nrt\\\x27\x220]  { return -1; }
+<bytestr>(.|\n)                 { yymore(); }
+
+br\x22                      { BEGIN(rawbytestr_nohash); yymore(); }
+<rawbytestr_nohash>\x22     { BEGIN(suffix); return LIT_BINARY_RAW; }
+<rawbytestr_nohash>(.|\n)   { yymore(); }
+<rawbytestr_nohash><<EOF>>  { return -1; }
+
+br/# {
+    BEGIN(rawbytestr);
+    yymore();
+    num_hashes = 0;
+    saw_non_hash = 0;
+    end_hashes = 0;
+}
+<rawbytestr># {
+    if (!saw_non_hash) {
+        num_hashes++;
+    } else if (end_hashes != 0) {
+        end_hashes++;
+        if (end_hashes == num_hashes) {
+            BEGIN(INITIAL);
+            return LIT_BINARY_RAW;
+        }
+    }
+    yymore();
+}
+<rawbytestr>\x22# {
+    end_hashes = 1;
+    if (end_hashes == num_hashes) {
+        BEGIN(INITIAL);
+        return LIT_BINARY_RAW;
+    }
+    yymore();
+}
+<rawbytestr>(.|\n) {
+    if (!saw_non_hash) {
+        saw_non_hash = 1;
+    }
+    if (end_hashes != 0) {
+        end_hashes = 0;
+    }
+    yymore();
+}
+<rawbytestr><<EOF>> { return -1; }
+
+b\x27                        { BEGIN(byte); yymore(); }
+<byte>\\[nrt\\\x27\x220]\x27 { BEGIN(INITIAL); return LIT_BYTE; }
+<byte>\\x[0-9a-fA-F]{2}\x27  { BEGIN(INITIAL); return LIT_BYTE; }
+<byte>\\u[0-9a-fA-F]{4}\x27  { BEGIN(INITIAL); return LIT_BYTE; }
+<byte>\\U[0-9a-fA-F]{8}\x27  { BEGIN(INITIAL); return LIT_BYTE; }
+<byte>.\x27                  { BEGIN(INITIAL); return LIT_BYTE; }
+<byte><<EOF>>                { BEGIN(INITIAL); return -1; }
+
+r\x22           { BEGIN(rawstr); yymore(); }
+<rawstr>\x22    { BEGIN(suffix); return LIT_STR_RAW; }
+<rawstr>(.|\n)  { yymore(); }
+<rawstr><<EOF>> { return -1; }
+
+r/#             {
+    // An r directly followed by a hash. The hash is trailing context and is
+    // not consumed here. Reset the counters and start counting hashes.
+    BEGIN(rawstr_esc_begin);
+    yymore();
+    num_hashes = 0;
+    saw_non_hash = 0;
+    end_hashes = 0;
+}
+
+<rawstr_esc_begin># {
+    // One more opening hash.
+    num_hashes++;
+    yymore();
+}
+<rawstr_esc_begin>\x22 {
+    // The opening double quote ends the hash run and starts the body.
+    BEGIN(rawstr_esc_body);
+    yymore();
+}
+<rawstr_esc_begin>(.|\n) { return -1; }
+
+<rawstr_esc_body>\x22/# {
+  // A double quote followed by a hash may close the literal.
+  // Switch to counting the hashes that follow it.
+  BEGIN(rawstr_esc_end);
+  yymore();
+ }
+<rawstr_esc_body>(.|\n) {
+  yymore();
+ }
+
+<rawstr_esc_end># {
+  // The literal is complete once as many closing hashes as opening
+  // hashes have been seen.
+  end_hashes++;
+  if (end_hashes == num_hashes) {
+    BEGIN(INITIAL);
+    return LIT_STR_RAW;
+  }
+  yymore();
+ }
+<rawstr_esc_end>[^#] {
+  // Too few hashes, so the quote was part of the body after all.
+  // NOTE(review): the character matched here is consumed as body text. If
+  // it is itself a double quote that starts the real terminator, that
+  // terminator looks like it would be missed. Confirm with a test input.
+  end_hashes = 0;
+  BEGIN(rawstr_esc_body);
+  yymore();
+ }
+
+<rawstr_esc_begin,rawstr_esc_body,rawstr_esc_end><<EOF>> { return -1; }
+
+\x22                     { BEGIN(str); yymore(); }
+<str>\x22                { BEGIN(suffix); return LIT_STR; }
+
+<str><<EOF>>                { return -1; }
+<str>\\[n\nrt\\\x27\x220]   { yymore(); }
+<str>\\x[0-9a-fA-F]{2}      { yymore(); }
+<str>\\u\{[0-9a-fA-F]?{6}\} { yymore(); }
+<str>\\[^n\nrt\\\x27\x220]  { return -1; }
+<str>(.|\n)                 { yymore(); }
+
+-\>  { return RARROW; }
+-    { return '-'; }
+-=   { return MINUSEQ; }
+&&   { return ANDAND; }
+&    { return '&'; }
+&=   { return ANDEQ; }
+\|\| { return OROR; }
+\|   { return '|'; }
+\|=  { return OREQ; }
+\+   { return '+'; }
+\+=  { return PLUSEQ; }
+\*   { return '*'; }
+\*=  { return STAREQ; }
+\/   { return '/'; }
+\/=  { return SLASHEQ; }
+\^   { return '^'; }
+\^=  { return CARETEQ; }
+%    { return '%'; }
+%=   { return PERCENTEQ; }
+
+<<EOF>> { return 0; }
+
+%%
--- a/src/grammar/parser-lalr-main.c
+++ b/src/grammar/parser-lalr-main.c
@ -0,0 +1,203 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+extern int yylex();
+extern int rsparse();
+
+#define PUSHBACK_LEN 4
+
+static char pushback[PUSHBACK_LEN];
+static int verbose;
+
+// printf-style output to stdout that is produced only when the file-level
+// `verbose` flag is set; otherwise the call does nothing.
+void print(const char* format, ...) {
+  if (!verbose) {
+    return;
+  }
+  va_list ap;
+  va_start(ap, format);
+  vprintf(format, ap);
+  va_end(ap);
+}
+
+// Token source for the parser. While the pushback queue holds characters,
+// the one at the head is removed (the rest shift forward by one slot and the
+// freed last slot is cleared) and returned as the token. With an empty queue
+// the token comes straight from yylex.
+int rslex() {
+  char head = pushback[0];
+  if (head == '\0') {
+    return yylex();
+  }
+  memmove(pushback, pushback + 1, PUSHBACK_LEN - 1);
+  pushback[PUSHBACK_LEN - 1] = '\0';
+  return head;
+}
+
+// Append `c` to the pushback queue by storing it in the first empty slot.
+// Note: a full queue is left untouched and `c` is dropped. As long as an
+// action makes no more than PUSHBACK_LEN consecutive calls to push_back,
+// this shouldn't be a problem.
+void push_back(char c) {
+  int slot = 0;
+  while (slot < PUSHBACK_LEN && pushback[slot] != '\0') {
+    ++slot;
+  }
+  if (slot < PUSHBACK_LEN) {
+    pushback[slot] = c;
+  }
+}
+
+extern int rsdebug;
+
+// One node of the tree built by mk_node/ext_node and dumped by print_node.
+struct node {
+  struct node *next;    // next node in the global `nodes` list
+  struct node *prev;    // previous node in that list (NULL at the head)
+  int own_string;       // nonzero when `name` must be freed together with the node
+  char const *name;     // label printed for this node
+  int n_elems;          // number of entries in `elems`
+  struct node *elems[]; // child nodes (flexible array member)
+};
+
+// Head of the list of all allocated nodes; mk_node and ext_node put nodes at
+// the front, and main frees the whole list before exiting.
+struct node *nodes = NULL;
+// Incremented by mk_node for every node it creates.
+int n_nodes;
+
+// Allocate a node called `name` with `n` children, given as `n` trailing
+// `struct node *` arguments, and push it onto the front of the global `nodes`
+// list so main can free it later. `name` is stored as passed, not copied.
+// Returns the new node; exits with a message if the allocation fails.
+struct node *mk_node(char const *name, int n, ...) {
+  va_list ap;
+  int i = 0;
+  size_t sz = sizeof(struct node) + (n * sizeof(struct node *));
+  struct node *nn, *nd = (struct node *)malloc(sz);
+
+  // Fail with a diagnostic instead of dereferencing NULL below.
+  if (nd == NULL) {
+    fprintf(stderr, "out of memory allocating node %s\n", name);
+    exit(1);
+  }
+
+  print("# New %d-ary node: %s = %p\n", n, name, nd);
+
+  // Link the node in as the new list head.
+  nd->own_string = 0;
+  nd->prev = NULL;
+  nd->next = nodes;
+  if (nodes) {
+    nodes->prev = nd;
+  }
+  nodes = nd;
+
+  nd->name = name;
+  nd->n_elems = n;
+
+  // Copy the children out of the variadic arguments.
+  va_start(ap, n);
+  while (i < n) {
+    nn = va_arg(ap, struct node *);
+    print("#   arg[%d]: %p\n", i, nn);
+    print("#            (%s ...)\n", nn->name);
+    nd->elems[i++] = nn;
+  }
+  va_end(ap);
+  n_nodes++;
+  return nd;
+}
+
+// Create a childless node whose name is a private heap copy of `name`.
+// The node is marked as owning the copy (own_string = 1) so that main frees
+// it. The copy is made with malloc/memcpy rather than strdup: strdup is not
+// part of ISO C, and this file is compiled with -std=c99, under which a C
+// library may leave it undeclared; the call would then be treated as
+// returning int and the pointer could be truncated.
+struct node *mk_atom(char *name) {
+  size_t len = strlen(name) + 1;
+  char *copy = (char *)malloc(len);
+  if (copy == NULL) {
+    fprintf(stderr, "out of memory allocating atom %s\n", name);
+    exit(1);
+  }
+  memcpy(copy, name, len);
+
+  struct node *nd = mk_node((char const *)copy, 0);
+  nd->own_string = 1;
+  return nd;
+}
+
+// Create an atom named "<none>".
+struct node *mk_none() {
+  return mk_atom("<none>");
+}
+
+// Append `n` more children, given as `n` trailing `struct node *` arguments,
+// to `nd`. The node is reallocated, so its address may change: callers must
+// continue with the returned pointer and drop the one they passed in. The
+// node is moved to the front of the global `nodes` list. Exits with a
+// message if the reallocation fails.
+struct node *ext_node(struct node *nd, int n, ...) {
+  va_list ap;
+  int i = 0, c = nd->n_elems + n;
+  size_t sz = sizeof(struct node) + (c * sizeof(struct node *));
+  struct node *nn, *grown;
+
+  // Report the number of children being added (n), not the new total.
+  print("# Extending %d-ary node by %d nodes: %s = %p",
+        nd->n_elems, n, nd->name, nd);
+
+  // Unlink nd before realloc may move it. When nd is the list head, `nodes`
+  // has to advance as well; otherwise it would keep pointing at the old
+  // block and the relinking below would make the node its own successor.
+  if (nd->next) {
+    nd->next->prev = nd->prev;
+  }
+  if (nd->prev) {
+    nd->prev->next = nd->next;
+  }
+  if (nodes == nd) {
+    nodes = nd->next;
+  }
+
+  grown = realloc(nd, sz);
+  if (grown == NULL) {
+    fprintf(stderr, "out of memory extending node %s\n", nd->name);
+    exit(1);
+  }
+  nd = grown;
+
+  // Re-insert the (possibly moved) node as the new head.
+  nd->prev = NULL;
+  nd->next = nodes;
+  if (nodes) {
+    nodes->prev = nd;
+  }
+  nodes = nd;
+
+  print(" ==> %p\n", nd);
+
+  va_start(ap, n);
+  while (i < n) {
+    nn = va_arg(ap, struct node *);
+    print("#   arg[%d]: %p\n", i, nn);
+    print("#            (%s ...)\n", nn->name);
+    nd->elems[nd->n_elems++] = nn;
+    ++i;
+  }
+  va_end(ap);
+  return nd;
+}
+
+int const indent_step = 4;
+
+// Emit `depth` indentation characters through print(), counting down from
+// `depth` to 1: a "|" whenever the counter is a multiple of indent_step,
+// a space otherwise.
+void print_indent(int depth) {
+  for (; depth; --depth) {
+    if (depth % indent_step == 0) {
+      print("|");
+    } else {
+      print(" ");
+    }
+  }
+}
+
+// Dump the subtree rooted at `n` through print(), indented by `depth`
+// columns. A node without children is written as its bare name; any other
+// node as "(name", its children indent_step columns deeper, and a closing
+// ")" on a line of its own.
+void print_node(struct node *n, int depth) {
+  print_indent(depth);
+  if (n->n_elems == 0) {
+    print("%s\n", n->name);
+    return;
+  }
+
+  print("(%s\n", n->name);
+  for (int k = 0; k < n->n_elems; ++k) {
+    print_node(n->elems[k], depth + indent_step);
+  }
+  print_indent(depth);
+  print(")\n");
+}
+
+// Entry point. "-v" as the only argument switches on verbose output (the
+// node trace and the final tree dump all go through print()). Clears the
+// pushback queue, runs the parser, frees every node on the global list and
+// returns rsparse's result as the exit status.
+int main(int argc, char **argv) {
+  verbose = (argc == 2 && strcmp(argv[1], "-v") == 0) ? 1 : 0;
+  memset(pushback, '\0', PUSHBACK_LEN);
+
+  int ret = rsparse();
+  print("--- PARSE COMPLETE: ret:%d, n_nodes:%d ---\n", ret, n_nodes);
+  if (nodes) {
+    print_node(nodes, 0);
+  }
+
+  // Release the whole node list, along with any name a node owns.
+  while (nodes) {
+    struct node *dead = nodes;
+    nodes = dead->next;
+    if (dead->own_string) {
+      free((void*)dead->name);
+    }
+    free(dead);
+  }
+  return ret;
+}
+
+// Error reporter: writes the message `s` and a newline to stderr.
+// NOTE(review): presumably the error callback of the bison-generated parser
+// (bison is run with --name-prefix=rs) -- confirm against parser-lalr.y.
+void rserror(char const *s) {
+  fprintf(stderr, "%s\n", s);
+}
--- a/src/grammar/parser-lalr.y
+++ b/src/grammar/parser-lalr.y
--- a/src/grammar/testparser.py
+++ b/src/grammar/testparser.py
@ -0,0 +1,65 @@
+#!/usr/bin/env python
+#
+# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+import sys
+
+import os
+import subprocess
+import argparse
+
+# usage: testparser.py [-h] [-p PARSER [PARSER ...]] -s SOURCE_DIR
+
+# Parsers should read from stdin and return exit status 0 for a
+# successful parse, and nonzero for an unsuccessful parse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-p', '--parser', nargs='+')
+parser.add_argument('-s', '--source-dir', nargs=1, required=True)
+args = parser.parse_args(sys.argv[1:])
+
+total = 0
+ok = {}
+bad = {}
+for parser in args.parser:
+    ok[parser] = 0
+    bad[parser] = []
+devnull = open(os.devnull, 'w')
+print "\n"
+
+for base, dirs, files in os.walk(args.source_dir[0]):
+    for f in filter(lambda p: p.endswith('.rs'), files):
+        p = os.path.join(base, f)
+        compile_fail = 'compile-fail' in p
+        ignore = any('ignore-test' in line or 'ignore-lexer-test' in line
+                     for line in open(p).readlines())
+        if compile_fail or ignore:
+            continue
+        total += 1
+        for parser in args.parser:
+            if subprocess.call(parser, stdin=open(p), stderr=subprocess.STDOUT, stdout=devnull) == 0:
+                ok[parser] += 1
+            else:
+                bad[parser].append(p)
+        parser_stats = ', '.join(['{}: {}'.format(parser, ok[parser]) for parser in args.parser])
+        sys.stdout.write("\033[K\r total: {}, {}, scanned {}"
+                         .format(total, os.path.relpath(parser_stats), os.path.relpath(p)))
+
+devnull.close()
+
+print "\n"
+
+for parser in args.parser:
+    filename = os.path.basename(parser) + '.bad'
+    print("writing {} files that failed to parse with {} to {}".format(len(bad[parser]), parser, filename))
+    with open(filename, "w") as f:
+          for p in bad[parser]:
+              f.write(p)
+              f.write("\n")
--- a/src/grammar/tokens.h
+++ b/src/grammar/tokens.h
@ -0,0 +1,91 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// Numeric codes for tokens that are not a single character.
+// NOTE(review): the names match those returned by the rules in lexer.l, but
+// the build compiles the scanner against the bison-generated
+// parser-lalr.tab.h -- confirm whether this header is still used.
+enum Token {
+  SHL = 257, // Parser generators reserve 0-256 for char literals
+  SHR,
+  LE,
+  EQEQ,
+  NE,
+  GE,
+  ANDAND,
+  OROR,
+  SHLEQ,
+  SHREQ,
+  MINUSEQ,
+  ANDEQ,
+  OREQ,
+  PLUSEQ,
+  STAREQ,
+  SLASHEQ,
+  CARETEQ,
+  PERCENTEQ,
+  DOTDOT,
+  DOTDOTDOT,
+  MOD_SEP,
+  RARROW,
+  FAT_ARROW,
+  // literals
+  LIT_BYTE,
+  LIT_CHAR,
+  LIT_INTEGER,
+  LIT_FLOAT,
+  LIT_STR,
+  LIT_STR_RAW,
+  LIT_BINARY,
+  LIT_BINARY_RAW,
+  // identifiers and lifetimes
+  IDENT,
+  UNDERSCORE,
+  LIFETIME,
+
+  // keywords
+  SELF,
+  STATIC,
+  AS,
+  BREAK,
+  CRATE,
+  ELSE,
+  ENUM,
+  EXTERN,
+  FALSE,
+  FN,
+  FOR,
+  IF,
+  IMPL,
+  IN,
+  LET,
+  LOOP,
+  MATCH,
+  MOD,
+  MOVE,
+  MUT,
+  PRIV,
+  PUB,
+  REF,
+  RETURN,
+  STRUCT,
+  TRUE,
+  TRAIT,
+  TYPE,
+  UNSAFE,
+  USE,
+  WHILE,
+  CONTINUE,
+  PROC,
+  BOX,
+  CONST,
+  WHERE,
+  TYPEOF,
+  // doc comments
+  INNER_DOC_COMMENT,
+  OUTER_DOC_COMMENT,
+
+  SHEBANG,
+  SHEBANG_LINE,
+  STATIC_LIFETIME
+};