// Copyright 2014 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. #![feature(plugin, rustc_private)] extern crate syntax; extern crate syntax_pos; extern crate rustc; #[macro_use] extern crate log; use std::collections::HashMap; use std::env; use std::fs::File; use std::io::{BufRead, Read}; use std::path::Path; use syntax::parse::lexer; use rustc::dep_graph::DepGraph; use rustc::session::{self, config}; use rustc::middle::cstore::DummyCrateStore; use std::rc::Rc; use syntax::ast; use syntax::codemap; use syntax::parse::token::{self, BinOpToken, DelimToken, Lit, Token}; use syntax::parse::lexer::TokenAndSpan; use syntax_pos::Pos; use syntax::symbol::{Symbol, keywords}; fn parse_token_list(file: &str) -> HashMap { fn id() -> token::Token { Token::Ident(ast::Ident::with_empty_ctxt(keywords::Invalid.name())) } let mut res = HashMap::new(); res.insert("-1".to_string(), Token::Eof); for line in file.split('\n') { let eq = match line.trim().rfind('=') { Some(val) => val, None => continue }; let val = &line[..eq]; let num = &line[eq + 1..]; let tok = match val { "SHR" => Token::BinOp(BinOpToken::Shr), "DOLLAR" => Token::Dollar, "LT" => Token::Lt, "STAR" => Token::BinOp(BinOpToken::Star), "FLOAT_SUFFIX" => id(), "INT_SUFFIX" => id(), "SHL" => Token::BinOp(BinOpToken::Shl), "LBRACE" => Token::OpenDelim(DelimToken::Brace), "RARROW" => Token::RArrow, "LIT_STR" => Token::Literal(Lit::Str_(keywords::Invalid.name()), None), "DOTDOT" => Token::DotDot, "MOD_SEP" => Token::ModSep, "DOTDOTDOT" => Token::DotDotDot, "NOT" => Token::Not, "AND" => Token::BinOp(BinOpToken::And), "LPAREN" => Token::OpenDelim(DelimToken::Paren), "ANDAND" => Token::AndAnd, "AT" => Token::At, "LBRACKET" => Token::OpenDelim(DelimToken::Bracket), "LIT_STR_RAW" => Token::Literal(Lit::StrRaw(keywords::Invalid.name(), 0), None), "RPAREN" => Token::CloseDelim(DelimToken::Paren), "SLASH" => Token::BinOp(BinOpToken::Slash), "COMMA" => Token::Comma, "LIFETIME" => Token::Lifetime( ast::Ident::with_empty_ctxt(keywords::Invalid.name())), "CARET" => Token::BinOp(BinOpToken::Caret), "TILDE" => Token::Tilde, "IDENT" => id(), "PLUS" => Token::BinOp(BinOpToken::Plus), "LIT_CHAR" => Token::Literal(Lit::Char(keywords::Invalid.name()), None), "LIT_BYTE" => Token::Literal(Lit::Byte(keywords::Invalid.name()), None), "EQ" => Token::Eq, "RBRACKET" => Token::CloseDelim(DelimToken::Bracket), "COMMENT" => Token::Comment, "DOC_COMMENT" => Token::DocComment(keywords::Invalid.name()), "DOT" => Token::Dot, "EQEQ" => Token::EqEq, "NE" => Token::Ne, "GE" => Token::Ge, "PERCENT" => Token::BinOp(BinOpToken::Percent), "RBRACE" => Token::CloseDelim(DelimToken::Brace), "BINOP" => Token::BinOp(BinOpToken::Plus), "POUND" => Token::Pound, "OROR" => Token::OrOr, "LIT_INTEGER" => Token::Literal(Lit::Integer(keywords::Invalid.name()), None), "BINOPEQ" => Token::BinOpEq(BinOpToken::Plus), "LIT_FLOAT" => Token::Literal(Lit::Float(keywords::Invalid.name()), None), "WHITESPACE" => Token::Whitespace, "UNDERSCORE" => Token::Underscore, "MINUS" => Token::BinOp(BinOpToken::Minus), "SEMI" => Token::Semi, "COLON" => Token::Colon, "FAT_ARROW" => Token::FatArrow, "OR" => Token::BinOp(BinOpToken::Or), "GT" => Token::Gt, "LE" => Token::Le, "LIT_BINARY" => Token::Literal(Lit::ByteStr(keywords::Invalid.name()), None), "LIT_BINARY_RAW" => Token::Literal( Lit::ByteStrRaw(keywords::Invalid.name(), 0), None), "QUESTION" => Token::Question, "SHEBANG" => Token::Shebang(keywords::Invalid.name()), _ => continue, }; res.insert(num.to_string(), tok); } debug!("Token map: {:?}", res); res } fn str_to_binop(s: &str) -> token::BinOpToken { match s { "+" => BinOpToken::Plus, "/" => BinOpToken::Slash, "-" => BinOpToken::Minus, "*" => BinOpToken::Star, "%" => BinOpToken::Percent, "^" => BinOpToken::Caret, "&" => BinOpToken::And, "|" => BinOpToken::Or, "<<" => BinOpToken::Shl, ">>" => BinOpToken::Shr, _ => panic!("Bad binop str `{}`", s), } } /// Assuming a string/byte string literal, strip out the leading/trailing /// hashes and surrounding quotes/raw/byte prefix. fn fix(mut lit: &str) -> ast::Name { let prefix: Vec = lit.chars().take(2).collect(); if prefix[0] == 'r' { if prefix[1] == 'b' { lit = &lit[2..] } else { lit = &lit[1..]; } } else if prefix[0] == 'b' { lit = &lit[1..]; } let leading_hashes = count(lit); // +1/-1 to adjust for single quotes Symbol::intern(&lit[leading_hashes + 1..lit.len() - leading_hashes - 1]) } /// Assuming a char/byte literal, strip the 'b' prefix and the single quotes. fn fixchar(mut lit: &str) -> ast::Name { let prefix = lit.chars().next().unwrap(); if prefix == 'b' { lit = &lit[1..]; } Symbol::intern(&lit[1..lit.len() - 1]) } fn count(lit: &str) -> usize { lit.chars().take_while(|c| *c == '#').count() } fn parse_antlr_token(s: &str, tokens: &HashMap, surrogate_pairs_pos: &[usize], has_bom: bool) -> TokenAndSpan { // old regex: // \[@(?P\d+),(?P\d+):(?P\d+)='(?P.+?)',<(?P-?\d+)>,\d+:\d+] let start = s.find("[@").unwrap(); let comma = start + s[start..].find(",").unwrap(); let colon = comma + s[comma..].find(":").unwrap(); let content_start = colon + s[colon..].find("='").unwrap(); // Use rfind instead of find, because we don't want to stop at the content let content_end = content_start + s[content_start..].rfind("',<").unwrap(); let toknum_end = content_end + s[content_end..].find(">,").unwrap(); let start = &s[comma + 1 .. colon]; let end = &s[colon + 1 .. content_start]; let content = &s[content_start + 2 .. content_end]; let toknum = &s[content_end + 3 .. toknum_end]; let not_found = format!("didn't find token {:?} in the map", toknum); let proto_tok = tokens.get(toknum).expect(¬_found[..]); let nm = Symbol::intern(content); debug!("What we got: content (`{}`), proto: {:?}", content, proto_tok); let real_tok = match *proto_tok { Token::BinOp(..) => Token::BinOp(str_to_binop(content)), Token::BinOpEq(..) => Token::BinOpEq(str_to_binop(&content[..content.len() - 1])), Token::Literal(Lit::Str_(..), n) => Token::Literal(Lit::Str_(fix(content)), n), Token::Literal(Lit::StrRaw(..), n) => Token::Literal(Lit::StrRaw(fix(content), count(content)), n), Token::Literal(Lit::Char(..), n) => Token::Literal(Lit::Char(fixchar(content)), n), Token::Literal(Lit::Byte(..), n) => Token::Literal(Lit::Byte(fixchar(content)), n), Token::DocComment(..) => Token::DocComment(nm), Token::Literal(Lit::Integer(..), n) => Token::Literal(Lit::Integer(nm), n), Token::Literal(Lit::Float(..), n) => Token::Literal(Lit::Float(nm), n), Token::Literal(Lit::ByteStr(..), n) => Token::Literal(Lit::ByteStr(nm), n), Token::Literal(Lit::ByteStrRaw(..), n) => Token::Literal(Lit::ByteStrRaw(fix(content), count(content)), n), Token::Ident(..) => Token::Ident(ast::Ident::with_empty_ctxt(nm)), Token::Lifetime(..) => Token::Lifetime(ast::Ident::with_empty_ctxt(nm)), ref t => t.clone() }; let start_offset = if real_tok == Token::Eof { 1 } else { 0 }; let offset = if has_bom { 1 } else { 0 }; let mut lo = start.parse::().unwrap() - start_offset - offset; let mut hi = end.parse::().unwrap() + 1 - offset; // Adjust the span: For each surrogate pair already encountered, subtract one position. lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32; hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32; let sp = syntax_pos::Span { lo: syntax_pos::BytePos(lo), hi: syntax_pos::BytePos(hi), expn_id: syntax_pos::NO_EXPANSION }; TokenAndSpan { tok: real_tok, sp: sp } } fn tok_cmp(a: &token::Token, b: &token::Token) -> bool { match a { &Token::Ident(id) => match b { &Token::Ident(id2) => id == id2, _ => false }, _ => a == b } } fn span_cmp(antlr_sp: codemap::Span, rust_sp: codemap::Span, cm: &codemap::CodeMap) -> bool { antlr_sp.expn_id == rust_sp.expn_id && antlr_sp.lo.to_usize() == cm.bytepos_to_file_charpos(rust_sp.lo).to_usize() && antlr_sp.hi.to_usize() == cm.bytepos_to_file_charpos(rust_sp.hi).to_usize() } fn main() { fn next(r: &mut lexer::StringReader) -> TokenAndSpan { use syntax::parse::lexer::Reader; r.next_token() } let mut args = env::args().skip(1); let filename = args.next().unwrap(); if filename.find("parse-fail").is_some() { return; } // Rust's lexer let mut code = String::new(); File::open(&Path::new(&filename)).unwrap().read_to_string(&mut code).unwrap(); let surrogate_pairs_pos: Vec = code.chars().enumerate() .filter(|&(_, c)| c as usize > 0xFFFF) .map(|(n, _)| n) .enumerate() .map(|(x, n)| x + n) .collect(); let has_bom = code.starts_with("\u{feff}"); debug!("Pairs: {:?}", surrogate_pairs_pos); let options = config::basic_options(); let session = session::build_session(options, &DepGraph::new(false), None, syntax::errors::registry::Registry::new(&[]), Rc::new(DummyCrateStore)); let filemap = session.parse_sess.codemap() .new_filemap("".to_string(), None, code); let mut lexer = lexer::StringReader::new(session.diagnostic(), filemap); let cm = session.codemap(); // ANTLR let mut token_file = File::open(&Path::new(&args.next().unwrap())).unwrap(); let mut token_list = String::new(); token_file.read_to_string(&mut token_list).unwrap(); let token_map = parse_token_list(&token_list[..]); let stdin = std::io::stdin(); let lock = stdin.lock(); let lines = lock.lines(); let antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(), &token_map, &surrogate_pairs_pos[..], has_bom)); for antlr_tok in antlr_tokens { let rustc_tok = next(&mut lexer); if rustc_tok.tok == Token::Eof && antlr_tok.tok == Token::Eof { continue } assert!(span_cmp(antlr_tok.sp, rustc_tok.sp, cm), "{:?} and {:?} have different spans", rustc_tok, antlr_tok); macro_rules! matches { ( $($x:pat),+ ) => ( match rustc_tok.tok { $($x => match antlr_tok.tok { $x => { if !tok_cmp(&rustc_tok.tok, &antlr_tok.tok) { // FIXME #15677: needs more robust escaping in // antlr warn!("Different names for {:?} and {:?}", rustc_tok, antlr_tok); } } _ => panic!("{:?} is not {:?}", antlr_tok, rustc_tok) },)* ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", antlr_tok, rustc_tok) } ) } matches!( Token::Literal(Lit::Byte(..), _), Token::Literal(Lit::Char(..), _), Token::Literal(Lit::Integer(..), _), Token::Literal(Lit::Float(..), _), Token::Literal(Lit::Str_(..), _), Token::Literal(Lit::StrRaw(..), _), Token::Literal(Lit::ByteStr(..), _), Token::Literal(Lit::ByteStrRaw(..), _), Token::Ident(..), Token::Lifetime(..), Token::Interpolated(..), Token::DocComment(..), Token::Shebang(..) ); } }