From 4a0f4f5e311b7443b29ea31ad0e68e6b4aa63736 Mon Sep 17 00:00:00 2001 From: Brian Anderson Date: Wed, 14 Nov 2012 22:27:53 -0800 Subject: [PATCH] Refactor the lexer to use FilePos types --- src/libsyntax/codemap.rs | 19 ++++++--- src/libsyntax/ext/source_util.rs | 11 ++++-- src/libsyntax/parse.rs | 22 +++++++---- src/libsyntax/parse/comments.rs | 10 ++--- src/libsyntax/parse/eval.rs | 9 +---- src/libsyntax/parse/lexer.rs | 68 ++++++++++++++++++++------------ 6 files changed, 85 insertions(+), 54 deletions(-) diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index f3b0611e463..e1c5eb07eb8 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -200,11 +200,8 @@ pub impl FileMap { start_pos); } - fn next_line(@self, +chpos: CharPos, +byte_pos: BytePos) { - self.lines.push(FilePos { - ch: chpos, - byte: byte_pos + self.start_pos.byte - }); + fn next_line(@self, +pos: FilePos) { + self.lines.push(pos); } pub fn get_line(@self, line: int) -> ~str unsafe { @@ -231,6 +228,18 @@ pub impl CodeMap { } pub fn add_filemap(@self, filemap: @FileMap) { + let expected_byte_pos = if self.files.len() == 0 { + 0 + } else { + let last_start = self.files.last().start_pos.byte.to_uint(); + let last_len = self.files.last().src.len(); + last_start + last_len + }; + let actual_byte_pos = filemap.start_pos.byte.to_uint(); + debug!("codemap: adding filemap: %s", filemap.name); + debug!("codemap: expected offset: %u", expected_byte_pos); + debug!("codemap: actual offset: %u", actual_byte_pos); + assert expected_byte_pos == actual_byte_pos; self.files.push(filemap); } diff --git a/src/libsyntax/ext/source_util.rs b/src/libsyntax/ext/source_util.rs index 3fed340904a..726dbbb56e2 100644 --- a/src/libsyntax/ext/source_util.rs +++ b/src/libsyntax/ext/source_util.rs @@ -58,10 +58,13 @@ fn expand_include(cx: ext_ctxt, sp: span, arg: ast::mac_arg, _body: ast::mac_body) -> @ast::expr { let args = get_mac_args(cx, sp, arg, 1u, option::Some(1u), ~"include"); let file = 
expr_to_str(cx, args[0], ~"#include_str requires a string"); - let p = parse::new_parser_from_file(cx.parse_sess(), cx.cfg(), - &res_rel_file(cx, sp, &Path(file)), - parse::parser::SOURCE_FILE); - return p.parse_expr(); + let (p, rdr) = parse::new_parser_etc_from_file( + cx.parse_sess(), cx.cfg(), + &res_rel_file(cx, sp, &Path(file)), + parse::parser::SOURCE_FILE); + let e = p.parse_expr(); + parse::update_parse_sess_position(&cx.parse_sess(), &rdr); + return e; } fn expand_include_str(cx: ext_ctxt, sp: codemap::span, arg: ast::mac_arg, diff --git a/src/libsyntax/parse.rs b/src/libsyntax/parse.rs index 97ca8568cfe..c5ffbb2ab0a 100644 --- a/src/libsyntax/parse.rs +++ b/src/libsyntax/parse.rs @@ -11,6 +11,7 @@ export parse_crate_from_source_str; export parse_expr_from_source_str, parse_item_from_source_str; export parse_stmt_from_source_str; export parse_from_source_str; +export update_parse_sess_position; use parser::Parser; use attr::parser_attr; @@ -76,7 +77,7 @@ fn parse_crate_from_crate_file(input: &Path, cfg: ast::crate_cfg, let leading_attrs = p.parse_inner_attrs_and_next(); let { inner: crate_attrs, next: first_cdir_attr } = leading_attrs; let cdirs = p.parse_crate_directives(token::EOF, first_cdir_attr); - eval::update_parse_sess_position(&sess, &rdr); + update_parse_sess_position(&sess, &rdr); let cx = @{sess: sess, cfg: /* FIXME (#2543) */ copy p.cfg}; let companionmod = input.filestem().map(|s| Path(*s)); let (m, attrs) = eval::eval_crate_directives_to_mod( @@ -96,7 +97,7 @@ fn parse_crate_from_source_file(input: &Path, cfg: ast::crate_cfg, let (p, rdr) = new_parser_etc_from_file(sess, cfg, input, parser::SOURCE_FILE); let r = p.parse_crate_mod(cfg); - eval::update_parse_sess_position(&sess, &rdr); + update_parse_sess_position(&sess, &rdr); return r; } @@ -106,7 +107,7 @@ fn parse_crate_from_source_str(name: ~str, source: @~str, cfg: ast::crate_cfg, codemap::FssNone, source); let r = p.parse_crate_mod(cfg); p.abort_if_errors(); - 
eval::update_parse_sess_position(&sess, &rdr); + update_parse_sess_position(&sess, &rdr); return r; } @@ -116,7 +117,7 @@ fn parse_expr_from_source_str(name: ~str, source: @~str, cfg: ast::crate_cfg, codemap::FssNone, source); let r = p.parse_expr(); p.abort_if_errors(); - eval::update_parse_sess_position(&sess, &rdr); + update_parse_sess_position(&sess, &rdr); return r; } @@ -127,7 +128,7 @@ fn parse_item_from_source_str(name: ~str, source: @~str, cfg: ast::crate_cfg, codemap::FssNone, source); let r = p.parse_item(attrs); p.abort_if_errors(); - eval::update_parse_sess_position(&sess, &rdr); + update_parse_sess_position(&sess, &rdr); return r; } @@ -138,7 +139,7 @@ fn parse_stmt_from_source_str(name: ~str, source: @~str, cfg: ast::crate_cfg, codemap::FssNone, source); let r = p.parse_stmt(attrs); p.abort_if_errors(); - eval::update_parse_sess_position(&sess, &rdr); + update_parse_sess_position(&sess, &rdr); return r; } @@ -155,7 +156,7 @@ fn parse_from_source_str(f: fn (p: Parser) -> T, p.reader.fatal(~"expected end-of-string"); } p.abort_if_errors(); - eval::update_parse_sess_position(&sess, &rdr); + update_parse_sess_position(&sess, &rdr); move r } @@ -216,3 +217,10 @@ fn new_parser_from_tt(sess: parse_sess, cfg: ast::crate_cfg, None, tt); return Parser(sess, cfg, trdr as reader, parser::SOURCE_FILE) } + +fn update_parse_sess_position(sess: &parse_sess, r: &lexer::string_reader) { + sess.pos = FilePos { + ch: r.last_pos.ch, + byte: r.last_pos.byte + }; +} diff --git a/src/libsyntax/parse/comments.rs b/src/libsyntax/parse/comments.rs index ba36e6f88e2..92736b9f361 100644 --- a/src/libsyntax/parse/comments.rs +++ b/src/libsyntax/parse/comments.rs @@ -131,7 +131,7 @@ fn consume_non_eol_whitespace(rdr: string_reader) { fn push_blank_line_comment(rdr: string_reader, comments: &mut ~[cmnt]) { debug!(">>> blank-line comment"); let v: ~[~str] = ~[]; - comments.push({style: blank_line, lines: v, pos: rdr.chpos}); + comments.push({style: blank_line, lines: v, pos: 
rdr.last_pos.ch}); } fn consume_whitespace_counting_blank_lines(rdr: string_reader, @@ -148,7 +148,7 @@ fn consume_whitespace_counting_blank_lines(rdr: string_reader, fn read_shebang_comment(rdr: string_reader, code_to_the_left: bool, comments: &mut ~[cmnt]) { debug!(">>> shebang comment"); - let p = rdr.chpos; + let p = rdr.last_pos.ch; debug!("<<< shebang comment"); comments.push({ style: if code_to_the_left { trailing } else { isolated }, @@ -160,7 +160,7 @@ fn read_shebang_comment(rdr: string_reader, code_to_the_left: bool, fn read_line_comments(rdr: string_reader, code_to_the_left: bool, comments: &mut ~[cmnt]) { debug!(">>> line comments"); - let p = rdr.chpos; + let p = rdr.last_pos.ch; let mut lines: ~[~str] = ~[]; while rdr.curr == '/' && nextch(rdr) == '/' { let line = read_one_line_comment(rdr); @@ -209,7 +209,7 @@ fn trim_whitespace_prefix_and_push_line(lines: &mut ~[~str], fn read_block_comment(rdr: string_reader, code_to_the_left: bool, comments: &mut ~[cmnt]) { debug!(">>> block comment"); - let p = rdr.chpos; + let p = rdr.last_pos.ch; let mut lines: ~[~str] = ~[]; let mut col: CharPos = rdr.col; bump(rdr); @@ -319,7 +319,7 @@ fn gather_comments_and_literals(span_diagnostic: diagnostic::span_handler, } - let bstart = rdr.pos; + let bstart = rdr.pos.byte; rdr.next_token(); //discard, and look ahead; we're working with internal state let {tok: tok, sp: sp} = rdr.peek(); diff --git a/src/libsyntax/parse/eval.rs b/src/libsyntax/parse/eval.rs index bfbec5c530f..47dbc0cd6ee 100644 --- a/src/libsyntax/parse/eval.rs +++ b/src/libsyntax/parse/eval.rs @@ -1,8 +1,8 @@ +use parse::update_parse_sess_position; use parser::{Parser, SOURCE_FILE}; use attr::parser_attr; export eval_crate_directives_to_mod; -export update_parse_sess_position; type ctx = @{sess: parse::parse_sess, @@ -74,13 +74,6 @@ fn parse_companion_mod(cx: ctx, prefix: &Path, suffix: &Option) } } -fn update_parse_sess_position(sess: &parse_sess, r: &lexer::string_reader) { - sess.pos = FilePos { - 
ch: r.chpos, - byte: sess.pos.byte + r.pos - }; -} - fn cdir_path_opt(default: ~str, attrs: ~[ast::attribute]) -> ~str { match ::attr::first_attr_value_str_by_name(attrs, ~"path") { Some(d) => d, diff --git a/src/libsyntax/parse/lexer.rs b/src/libsyntax/parse/lexer.rs index 178d772ecc6..459935def67 100644 --- a/src/libsyntax/parse/lexer.rs +++ b/src/libsyntax/parse/lexer.rs @@ -1,5 +1,5 @@ use diagnostic::span_handler; -use codemap::{span, CodeMap, CharPos, BytePos}; +use codemap::{span, CodeMap, CharPos, BytePos, FilePos}; use ext::tt::transcribe::{tt_reader, new_tt_reader, dup_tt_reader, tt_next_token}; @@ -21,10 +21,10 @@ trait reader { type string_reader = @{ span_diagnostic: span_handler, src: @~str, + mut pos: FilePos, + mut last_pos: FilePos, mut col: CharPos, - mut pos: BytePos, mut curr: char, - mut chpos: CharPos, filemap: @codemap::FileMap, interner: @token::ident_interner, /* cached: */ @@ -48,9 +48,10 @@ fn new_low_level_string_reader(span_diagnostic: span_handler, // Force the initial reader bump to start on a fresh line let initial_char = '\n'; let r = @{span_diagnostic: span_diagnostic, src: filemap.src, - mut col: CharPos(0), mut pos: BytePos(0), + mut pos: filemap.start_pos, + mut last_pos: filemap.start_pos, + mut col: CharPos(0), mut curr: initial_char, - mut chpos: filemap.start_pos.ch, filemap: filemap, interner: itr, /* dummy values; not read */ mut peek_tok: token::EOF, @@ -61,7 +62,9 @@ fn new_low_level_string_reader(span_diagnostic: span_handler, fn dup_string_reader(&&r: string_reader) -> string_reader { @{span_diagnostic: r.span_diagnostic, src: r.src, - mut col: r.col, mut pos: r.pos, mut curr: r.curr, mut chpos: r.chpos, + mut pos: r.pos, + mut last_pos: r.last_pos, + mut col: r.col, mut curr: r.curr, filemap: r.filemap, interner: r.interner, mut peek_tok: r.peek_tok, mut peek_span: r.peek_span} } @@ -116,34 +119,48 @@ fn string_advance_token(&&r: string_reader) { if is_eof(r) { r.peek_tok = token::EOF; } else { - let start_chpos = 
r.chpos; + let start_chpos = r.last_pos.ch; r.peek_tok = next_token_inner(r); - r.peek_span = ast_util::mk_sp(start_chpos, r.chpos); + r.peek_span = ast_util::mk_sp(start_chpos, r.last_pos.ch); }; } +fn byte_offset(rdr: string_reader) -> BytePos { + (rdr.pos.byte - rdr.filemap.start_pos.byte) +} + fn get_str_from(rdr: string_reader, start: BytePos) -> ~str unsafe { // I'm pretty skeptical about this subtraction. What if there's a // multi-byte character before the mark? - return str::slice(*rdr.src, start.to_uint() - 1u, rdr.pos.to_uint() - 1u); + return str::slice(*rdr.src, start.to_uint() - 1u, + byte_offset(rdr).to_uint() - 1u); } fn bump(rdr: string_reader) { - if rdr.pos.to_uint() < (*rdr.src).len() { + rdr.last_pos = rdr.pos; + let current_byte_offset = byte_offset(rdr).to_uint(); + if current_byte_offset < (*rdr.src).len() { + let last_char = rdr.curr; + let next = str::char_range_at(*rdr.src, current_byte_offset); + let byte_offset_diff = next.next - current_byte_offset; + rdr.pos = FilePos { + ch: rdr.pos.ch + CharPos(1u), + byte: rdr.pos.byte + BytePos(byte_offset_diff) + }; + rdr.curr = next.ch; rdr.col += CharPos(1u); - rdr.chpos += CharPos(1u); - if rdr.curr == '\n' { - rdr.filemap.next_line(rdr.chpos, rdr.pos); + if last_char == '\n' { + rdr.filemap.next_line(rdr.last_pos); rdr.col = CharPos(0u); } - let next = str::char_range_at(*rdr.src, rdr.pos.to_uint()); - rdr.pos = BytePos(next.next); - rdr.curr = next.ch; } else { // XXX: What does this accomplish? 
if (rdr.curr != -1 as char) { - rdr.chpos += CharPos(1u); + rdr.pos = FilePos { + ch: rdr.pos.ch + CharPos(1u), + byte: rdr.pos.byte + BytePos(1u) + }; rdr.col += CharPos(1u); rdr.curr = -1 as char; } @@ -153,8 +170,9 @@ fn is_eof(rdr: string_reader) -> bool { rdr.curr == -1 as char } fn nextch(rdr: string_reader) -> char { - if rdr.pos.to_uint() < (*rdr.src).len() { - return str::char_at(*rdr.src, rdr.pos.to_uint()); + let offset = byte_offset(rdr).to_uint(); + if offset < (*rdr.src).len() { + return str::char_at(*rdr.src, offset); } else { return -1 as char; } } @@ -211,7 +229,7 @@ fn consume_any_line_comment(rdr: string_reader) bump(rdr); // line comments starting with "///" or "//!" are doc-comments if rdr.curr == '/' || rdr.curr == '!' { - let start_chpos = rdr.chpos - CharPos(2u); + let start_chpos = rdr.pos.ch - CharPos(2u); let mut acc = ~"//"; while rdr.curr != '\n' && !is_eof(rdr) { str::push_char(&mut acc, rdr.curr); @@ -219,7 +237,7 @@ fn consume_any_line_comment(rdr: string_reader) } return Some({ tok: token::DOC_COMMENT(rdr.interner.intern(@acc)), - sp: ast_util::mk_sp(start_chpos, rdr.chpos) + sp: ast_util::mk_sp(start_chpos, rdr.pos.ch) }); } else { while rdr.curr != '\n' && !is_eof(rdr) { bump(rdr); } @@ -234,7 +252,7 @@ fn consume_any_line_comment(rdr: string_reader) if nextch(rdr) == '!' { let cmap = @CodeMap::new(); (*cmap).files.push(rdr.filemap); - let loc = cmap.lookup_char_pos_adj(rdr.chpos); + let loc = cmap.lookup_char_pos_adj(rdr.last_pos.ch); if loc.line == 1u && loc.col == CharPos(0u) { while rdr.curr != '\n' && !is_eof(rdr) { bump(rdr); } return consume_whitespace_and_comments(rdr); @@ -250,7 +268,7 @@ fn consume_block_comment(rdr: string_reader) // block comments starting with "/**" or "/*!" are doc-comments if rdr.curr == '*' || rdr.curr == '!' 
{ - let start_chpos = rdr.chpos - CharPos(2u); + let start_chpos = rdr.pos.ch - CharPos(2u); let mut acc = ~"/*"; while !(rdr.curr == '*' && nextch(rdr) == '/') && !is_eof(rdr) { str::push_char(&mut acc, rdr.curr); @@ -264,7 +282,7 @@ fn consume_block_comment(rdr: string_reader) bump(rdr); return Some({ tok: token::DOC_COMMENT(rdr.interner.intern(@acc)), - sp: ast_util::mk_sp(start_chpos, rdr.chpos) + sp: ast_util::mk_sp(start_chpos, rdr.pos.ch) }); } } else { @@ -584,7 +602,7 @@ fn next_token_inner(rdr: string_reader) -> token::Token { return token::LIT_INT(c2 as i64, ast::ty_char); } '"' => { - let n = rdr.pos; + let n = byte_offset(rdr); bump(rdr); while rdr.curr != '"' { if is_eof(rdr) {