// rust/src/comp/fe/lexer.rs

import std._io.stdio_reader;
import std._str;
import std.map;
import std.map.hashmap;
// Creates an empty hashmap keyed by `str`, hashing keys with `_str.hash`
// and comparing them with `_str.eq`. The value type `V` is chosen by the
// caller.
fn new_str_hash[V]() -> map.hashmap[str,V] {
    let map.eqfn[str] key_eq = _str.eq;
    let map.hashfn[str] key_hash = _str.hash;
    ret map.mk_hashmap[str,V](key_hash, key_eq);
}
// Interface of the character reader handed out by new_reader(): a
// one-character lookahead cursor over an input stream, plus access to the
// keyword and reserved-word tables.
type reader = obj {
    // True once the current character is the end-of-file sentinel.
    fn is_eof() -> bool;
    // Returns the current character without consuming it.
    fn peek() -> char;
    // Advances to the next character of the input.
    fn bump();
    // Returns the current position as (filename, line, column).
    fn get_pos() -> tup(str,uint,uint);
    // Returns the table mapping keyword spellings to their tokens.
    fn get_keywords() -> hashmap[str,token.token];
    // Returns the table of reserved words.
    fn get_reserved() -> hashmap[str,()];
};
// Wraps `rdr` in a `reader` object that tracks the current character and
// its (line, column) position, and carries the keyword / reserved-word
// tables. The first character is read eagerly, and the position starts at
// line 1, column 1.
fn new_reader(stdio_reader rdr, str filename) -> reader
{
    obj reader(stdio_reader rdr,
               str filename,
               mutable char c,
               mutable uint line,
               mutable uint col,
               hashmap[str,token.token] keywords,
               hashmap[str,()] reserved)
    {
        // The stream signals end-of-file with -1, seen here as a char.
        fn is_eof() -> bool {
            auto eof = (-1) as char;
            ret c == eof;
        }

        fn get_pos() -> tup(str,uint,uint) {
            ret tup(filename, line, col);
        }

        fn peek() -> char {
            ret c;
        }

        // Reads the next character and updates the position: a newline
        // starts a new line at column 0, anything else moves one column on.
        fn bump() {
            c = rdr.getc() as char;
            if (c != '\n') {
                col += 1u;
            } else {
                line += 1u;
                col = 0u;
            }
        }

        fn get_keywords() -> hashmap[str,token.token] {
            ret keywords;
        }

        fn get_reserved() -> hashmap[str,()] {
            ret reserved;
        }
    }

    auto kw_table = new_str_hash[token.token]();
    auto reserved_table = new_str_hash[()]();

    // Module-level and declaration keywords.
    kw_table.insert("mod", token.MOD());
    kw_table.insert("use", token.USE());
    kw_table.insert("meta", token.META());
    kw_table.insert("auth", token.AUTH());
    kw_table.insert("syntax", token.SYNTAX());

    // Control-flow keywords.
    kw_table.insert("if", token.IF());
    kw_table.insert("else", token.ELSE());
    kw_table.insert("while", token.WHILE());
    kw_table.insert("do", token.DO());
    kw_table.insert("alt", token.ALT());
    kw_table.insert("case", token.CASE());
    kw_table.insert("for", token.FOR());
    kw_table.insert("each", token.EACH());

    // Keywords that leave a function or iterator.
    kw_table.insert("put", token.PUT());
    kw_table.insert("ret", token.RET());
    kw_table.insert("be", token.BE());

    auto first = rdr.getc() as char;
    ret reader(rdr, filename, first, 1u, 1u,
               kw_table, reserved_table);
}
// Character classification helpers.
fn in_range(char c, char lo, char hi) -> bool {
ret lo <= c && c <= hi;
2010-08-19 18:42:17 -07:00
}
fn is_alpha(char c) -> bool {
ret in_range(c, 'a', 'z') ||
in_range(c, 'A', 'Z');
}
fn is_dec_digit(char c) -> bool {
ret in_range(c, '0', '9');
}
fn is_hex_digit(char c) -> bool {
ret in_range(c, '0', '9') ||
in_range(c, 'a', 'f') ||
in_range(c, 'A', 'F');
}
fn is_bin_digit(char c) -> bool {
ret c == '0' || c == '1';
}
fn is_whitespace(char c) -> bool {
ret c == ' ' || c == '\t' || c == '\r' || c == '\n';
2010-08-19 18:42:17 -07:00
}
// Skips a run of whitespace starting at `c`, reading further characters
// from `rdr` as needed, then tail-calls consume_any_line_comment() with the
// first non-whitespace character so that a following `//` comment is
// skipped as well. Returns whatever that call returns.
fn consume_any_whitespace(stdio_reader rdr, char c) -> char {
    auto cur = c;
    while (is_whitespace(cur)) {
        cur = rdr.getc() as char;
    }
    be consume_any_line_comment(rdr, cur);
}
fn consume_any_line_comment(stdio_reader rdr, char c) -> char {
auto c1 = c;
if (c1 == '/') {
auto c2 = rdr.getc() as char;
if (c2 == '/') {
while (c1 != '\n') {
c1 = rdr.getc() as char;
}
// Restart whitespace munch.
be consume_any_whitespace(rdr, c1);
}
}
ret c;
}
// Reads the next token from `rdr`.
//
// Leading whitespace and `//` line comments are skipped first. The result
// is then one of:
//   - token.EOF() at end of input;
//   - token.IDENT(s) for a run of ASCII letters;
//   - token.LIT_INT(n) for a run of decimal digits;
//   - token.LIT_CHAR(c) / token.LIT_STR(s) for quoted literals, with the
//     escapes \n \r \t \\ and the matching quote character;
//   - a punctuation or operator token.
// Malformed character or string literals log a message and `fail`. Any
// other unrecognised character is logged and reported as token.EOF().
fn next_token(stdio_reader rdr) -> token.token {
    auto eof = (-1) as char;
    auto c = rdr.getc() as char;
    auto accum_str = "";
    auto accum_int = 0;

    // Reads one more character from the stream.
    fn next(stdio_reader rdr) -> char {
        ret rdr.getc() as char;
    }

    // Pushes a lookahead character back onto the stream.
    fn forget(stdio_reader rdr, char c) {
        rdr.ungetc(c as int);
    }

    c = consume_any_whitespace(rdr, c);

    if (c == eof) { ret token.EOF(); }

    if (is_alpha(c)) {
        while (is_alpha(c)) {
            accum_str += (c as u8);
            c = next(rdr);
        }
        forget(rdr, c);
        ret token.IDENT(accum_str);
    }

    if (is_dec_digit(c)) {
        // Literals with a leading '0' are lexed as plain decimal, so "0"
        // yields LIT_INT(0) instead of falling through unhandled.
        // FIXME: hexadecimal (0x...) and binary (0b...) literals.
        while (is_dec_digit(c)) {
            accum_int *= 10;
            accum_int += (c as int) - ('0' as int);
            c = next(rdr);
        }
        forget(rdr, c);
        ret token.LIT_INT(accum_int);
    }

    // Given the character `c2` that follows an operator, yields the
    // compound-assignment token when it is '=', and otherwise pushes `c2`
    // back and yields the plain operator.
    fn op_or_opeq(stdio_reader rdr, char c2,
                  token.op op) -> token.token {
        if (c2 == '=') {
            ret token.OPEQ(op);
        } else {
            forget(rdr, c2);
            ret token.OP(op);
        }
    }

    alt (c) {
        // One-byte tokens.
        case (';') { ret token.SEMI(); }
        case (',') { ret token.COMMA(); }
        case ('.') { ret token.DOT(); }
        case ('(') { ret token.LPAREN(); }
        case (')') { ret token.RPAREN(); }
        case ('{') { ret token.LBRACE(); }
        case ('}') { ret token.RBRACE(); }
        case ('[') { ret token.LBRACKET(); }
        case (']') { ret token.RBRACKET(); }
        case ('@') { ret token.AT(); }
        case ('#') { ret token.POUND(); }

        // Multi-byte tokens.
        case ('=') {
            auto c2 = next(rdr);
            if (c2 == '=') {
                ret token.OP(token.EQEQ());
            } else {
                forget(rdr, c2);
                ret token.OP(token.EQ());
            }
        }

        case ('\'') {
            // FIXME: general utf8-consumption support.
            auto c2 = next(rdr);
            if (c2 == '\\') {
                c2 = next(rdr);
                alt (c2) {
                    case ('n') { c2 = '\n'; }
                    case ('r') { c2 = '\r'; }
                    case ('t') { c2 = '\t'; }
                    case ('\\') { c2 = '\\'; }
                    case ('\'') { c2 = '\''; }
                    // FIXME: unicode numeric escapes.
                    case (_) {
                        log "unknown character escape";
                        log c2;
                        fail;
                    }
                }
            }
            if (next(rdr) != '\'') {
                log "unterminated character constant";
                fail;
            }
            ret token.LIT_CHAR(c2);
        }

        case ('"') {
            // FIXME: general utf8-consumption support.
            auto c2 = next(rdr);
            while (c2 != '"') {
                // Without this check a missing closing quote would loop
                // forever once the input is exhausted.
                if (c2 == eof) {
                    log "unterminated string constant";
                    fail;
                }
                alt (c2) {
                    case ('\\') {
                        c2 = next(rdr);
                        alt (c2) {
                            case ('n') { accum_str += '\n' as u8; }
                            case ('r') { accum_str += '\r' as u8; }
                            case ('t') { accum_str += '\t' as u8; }
                            case ('\\') { accum_str += '\\' as u8; }
                            case ('"') { accum_str += '"' as u8; }
                            // FIXME: unicode numeric escapes.
                            case (_) {
                                log "unknown string escape";
                                log c2;
                                fail;
                            }
                        }
                    }
                    case (_) {
                        accum_str += c2 as u8;
                    }
                }
                c2 = next(rdr);
            }
            ret token.LIT_STR(accum_str);
        }

        case ('-') {
            auto c2 = next(rdr);
            if (c2 == '>') {
                ret token.RARROW();
            } else {
                ret op_or_opeq(rdr, c2, token.MINUS());
            }
        }

        case ('&') {
            auto c2 = next(rdr);
            if (c2 == '&') {
                ret token.OP(token.ANDAND());
            } else {
                ret op_or_opeq(rdr, c2, token.AND());
            }
        }

        case ('+') {
            ret op_or_opeq(rdr, next(rdr), token.PLUS());
        }

        case ('*') {
            ret op_or_opeq(rdr, next(rdr), token.STAR());
        }

        case ('/') {
            // Division has its own operator; this case previously produced
            // the multiplication token.
            // NOTE(review): assumes the token module defines SLASH() -- confirm.
            ret op_or_opeq(rdr, next(rdr), token.SLASH());
        }

        case ('!') {
            ret op_or_opeq(rdr, next(rdr), token.NOT());
        }

        case ('^') {
            ret op_or_opeq(rdr, next(rdr), token.CARET());
        }

        case ('%') {
            ret op_or_opeq(rdr, next(rdr), token.PERCENT());
        }
    }

    log "lexer stopping at ";
    log c;
    ret token.EOF();
}
//
// Local Variables:
// mode: rust
// fill-column: 78;
// indent-tabs-mode: nil
// c-basic-offset: 4
// buffer-file-coding-system: utf-8-unix
// compile-command: "make -k -C ../.. 2>&1 | sed -e 's/\\/x\\//x:\\//g'";
// End:
//