From bccdba02960b3cd428addbc2c856065ebb81eb04 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 6 Jun 2014 16:04:04 +0100 Subject: [PATCH 1/7] Add a b'x' byte literal of type u8. --- src/librustc/middle/const_eval.rs | 1 + src/librustc/middle/lint.rs | 1 + src/librustc/middle/trans/consts.rs | 1 + src/librustc/middle/typeck/check/mod.rs | 1 + src/librustdoc/clean/mod.rs | 8 +++ src/librustdoc/html/highlight.rs | 2 +- src/libsyntax/ast.rs | 1 + src/libsyntax/ext/concat.rs | 1 + src/libsyntax/ext/quote.rs | 6 +++ src/libsyntax/parse/lexer/mod.rs | 68 +++++++++++++++++++++++-- src/libsyntax/parse/parser.rs | 3 +- src/libsyntax/parse/token.rs | 11 ++++ src/libsyntax/print/pprust.rs | 6 +++ src/test/compile-fail/byte-literals.rs | 25 +++++++++ src/test/compile-fail/concat.rs | 1 + src/test/run-pass/byte-literals.rs | 38 ++++++++++++++ 16 files changed, 169 insertions(+), 5 deletions(-) create mode 100644 src/test/compile-fail/byte-literals.rs create mode 100644 src/test/run-pass/byte-literals.rs diff --git a/src/librustc/middle/const_eval.rs b/src/librustc/middle/const_eval.rs index 13d0443a00f..3c5b0664f03 100644 --- a/src/librustc/middle/const_eval.rs +++ b/src/librustc/middle/const_eval.rs @@ -506,6 +506,7 @@ pub fn lit_to_const(lit: &Lit) -> const_val { LitBinary(ref data) => { const_binary(Rc::new(data.iter().map(|x| *x).collect())) } + LitByte(n) => const_uint(n as u64), LitChar(n) => const_uint(n as u64), LitInt(n, _) => const_int(n), LitUint(n, _) => const_uint(n), diff --git a/src/librustc/middle/lint.rs b/src/librustc/middle/lint.rs index 392821a6ad3..4c11693e7a6 100644 --- a/src/librustc/middle/lint.rs +++ b/src/librustc/middle/lint.rs @@ -805,6 +805,7 @@ fn check_type_limits(cx: &Context, e: &ast::Expr) { } else { t }; let (min, max) = uint_ty_range(uint_type); let lit_val: u64 = match lit.node { + ast::LitByte(_v) => return, // _v is u8, within range by definition ast::LitInt(v, _) => v as u64, ast::LitUint(v, _) => v, ast::LitIntUnsuffixed(v) => v as 
u64, diff --git a/src/librustc/middle/trans/consts.rs b/src/librustc/middle/trans/consts.rs index 45019edc58b..f5e66a527e7 100644 --- a/src/librustc/middle/trans/consts.rs +++ b/src/librustc/middle/trans/consts.rs @@ -43,6 +43,7 @@ pub fn const_lit(cx: &CrateContext, e: &ast::Expr, lit: ast::Lit) -> ValueRef { let _icx = push_ctxt("trans_lit"); match lit.node { + ast::LitByte(b) => C_integral(Type::uint_from_ty(cx, ast::TyU8), b as u64, false), ast::LitChar(i) => C_integral(Type::char(cx), i as u64, false), ast::LitInt(i, t) => C_integral(Type::int_from_ty(cx, t), i as u64, true), ast::LitUint(u, t) => C_integral(Type::uint_from_ty(cx, t), u, false), diff --git a/src/librustc/middle/typeck/check/mod.rs b/src/librustc/middle/typeck/check/mod.rs index 73b92e5b868..2516a00ff76 100644 --- a/src/librustc/middle/typeck/check/mod.rs +++ b/src/librustc/middle/typeck/check/mod.rs @@ -1715,6 +1715,7 @@ pub fn check_lit(fcx: &FnCtxt, lit: &ast::Lit) -> ty::t { ast::LitBinary(..) => { ty::mk_slice(tcx, ty::ReStatic, ty::mt{ ty: ty::mk_u8(), mutbl: ast::MutImmutable }) } + ast::LitByte(_) => ty::mk_u8(), ast::LitChar(_) => ty::mk_char(), ast::LitInt(_, t) => ty::mk_mach_int(t), ast::LitUint(_, t) => ty::mk_mach_uint(t), diff --git a/src/librustdoc/clean/mod.rs b/src/librustdoc/clean/mod.rs index 823e0f6a1b3..5e84a90121f 100644 --- a/src/librustdoc/clean/mod.rs +++ b/src/librustdoc/clean/mod.rs @@ -1924,6 +1924,14 @@ fn lit_to_str(lit: &ast::Lit) -> String { match lit.node { ast::LitStr(ref st, _) => st.get().to_string(), ast::LitBinary(ref data) => format!("{:?}", data.as_slice()), + ast::LitByte(b) => { + let mut res = String::from_str("b'"); + (b as char).escape_default(|c| { + res.push_char(c); + }); + res.push_char('\''); + res + }, ast::LitChar(c) => format!("'{}'", c), ast::LitInt(i, _t) => i.to_str(), ast::LitUint(u, _t) => u.to_str(), diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index f0d7b029deb..8a63b55afed 100644 --- 
a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -140,7 +140,7 @@ fn doit(sess: &parse::ParseSess, mut lexer: lexer::StringReader, } // text literals - t::LIT_CHAR(..) | t::LIT_STR(..) | t::LIT_STR_RAW(..) => "string", + t::LIT_BYTE(..) | t::LIT_CHAR(..) | t::LIT_STR(..) | t::LIT_STR_RAW(..) => "string", // number literals t::LIT_INT(..) | t::LIT_UINT(..) | t::LIT_INT_UNSUFFIXED(..) | diff --git a/src/libsyntax/ast.rs b/src/libsyntax/ast.rs index 86dd736ceea..aeafc0e306c 100644 --- a/src/libsyntax/ast.rs +++ b/src/libsyntax/ast.rs @@ -616,6 +616,7 @@ pub type Lit = Spanned; pub enum Lit_ { LitStr(InternedString, StrStyle), LitBinary(Rc >), + LitByte(u8), LitChar(char), LitInt(i64, IntTy), LitUint(u64, UintTy), diff --git a/src/libsyntax/ext/concat.rs b/src/libsyntax/ext/concat.rs index 83f45ca9f16..670e38327d6 100644 --- a/src/libsyntax/ext/concat.rs +++ b/src/libsyntax/ext/concat.rs @@ -47,6 +47,7 @@ pub fn expand_syntax_ext(cx: &mut base::ExtCtxt, ast::LitBool(b) => { accumulator.push_str(format!("{}", b).as_slice()); } + ast::LitByte(..) | ast::LitBinary(..) 
=> { cx.span_err(e.span, "cannot concatenate a binary literal"); } diff --git a/src/libsyntax/ext/quote.rs b/src/libsyntax/ext/quote.rs index 6514d8fa418..407715ab4da 100644 --- a/src/libsyntax/ext/quote.rs +++ b/src/libsyntax/ext/quote.rs @@ -436,6 +436,12 @@ fn mk_token(cx: &ExtCtxt, sp: Span, tok: &token::Token) -> Gc { vec!(mk_binop(cx, sp, binop))); } + LIT_BYTE(i) => { + let e_byte = cx.expr_lit(sp, ast::LitByte(i)); + + return cx.expr_call(sp, mk_token_path(cx, sp, "LIT_BYTE"), vec!(e_byte)); + } + LIT_CHAR(i) => { let e_char = cx.expr_lit(sp, ast::LitChar(i)); diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index f7eac0b323f..7e4cb195cea 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -650,10 +650,13 @@ impl<'a> StringReader<'a> { /// token, and updates the interner fn next_token_inner(&mut self) -> token::Token { let c = self.curr; - if ident_start(c) && !self.nextch_is('"') && !self.nextch_is('#') { + if ident_start(c) && match (c.unwrap(), self.nextch()) { // Note: r as in r" or r#" is part of a raw string literal, - // not an identifier, and is handled further down. - + // b as in b' is part of a byte literal. + // They are not identifiers, and are handled further down. 
+ ('r', Some('"')) | ('r', Some('#')) | ('b', Some('\'')) => false, + _ => true + } { let start = self.last_pos; while ident_continue(self.curr) { self.bump(); @@ -854,6 +857,65 @@ impl<'a> StringReader<'a> { self.bump(); // advance curr past token return token::LIT_CHAR(c2); } + 'b' => { + self.bump(); + assert!(self.curr_is('\''), "Should have been a token::IDENT"); + self.bump(); + let start = self.last_pos; + + // the eof will be picked up by the final `'` check below + let mut c2 = self.curr.unwrap_or('\x00'); + self.bump(); + + match c2 { + '\\' => { + // '\X' for some X must be a character constant: + let escaped = self.curr; + let escaped_pos = self.last_pos; + self.bump(); + match escaped { + None => {} + Some(e) => { + c2 = match e { + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '"' => '"', + '0' => '\x00', + 'x' => self.scan_numeric_escape(2u, '\''), + c2 => { + self.err_span_char(escaped_pos, self.last_pos, + "unknown byte escape", c2); + c2 + } + } + } + } + } + '\t' | '\n' | '\r' | '\'' => { + self.err_span_char( start, self.last_pos, + "byte constant must be escaped", c2); + } + _ if c2 > '\x7F' => { + self.err_span_char( start, self.last_pos, + "byte constant must be ASCII. \ + Use a \\xHH escape for a non-ASCII byte", c2); + } + _ => {} + } + if !self.curr_is('\'') { + self.fatal_span_verbose( + // Byte offsetting here is okay because the + // character before position `start` are an + // ascii single quote and ascii 'b'. 
+ start - BytePos(2), self.last_pos, + "unterminated byte constant".to_string()); + } + self.bump(); // advance curr past token + return token::LIT_BYTE(c2 as u8); + } '"' => { let mut accum_str = String::new(); let start_bpos = self.last_pos; diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index bbe0680ef14..0bd47ede214 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -33,7 +33,7 @@ use ast::{ForeignItem, ForeignItemStatic, ForeignItemFn, ForeignMod}; use ast::{Ident, NormalFn, Inherited, Item, Item_, ItemStatic}; use ast::{ItemEnum, ItemFn, ItemForeignMod, ItemImpl}; use ast::{ItemMac, ItemMod, ItemStruct, ItemTrait, ItemTy, Lit, Lit_}; -use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar}; +use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar, LitByte}; use ast::{LitIntUnsuffixed, LitNil, LitStr, LitUint, Local, LocalLet}; use ast::{MutImmutable, MutMutable, Mac_, MacInvocTT, Matcher, MatchNonterminal}; use ast::{MatchSeq, MatchTok, Method, MutTy, BiMul, Mutability}; @@ -1512,6 +1512,7 @@ impl<'a> Parser<'a> { // matches token_lit = LIT_INT | ... 
pub fn lit_from_token(&mut self, tok: &token::Token) -> Lit_ { match *tok { + token::LIT_BYTE(i) => LitByte(i), token::LIT_CHAR(i) => LitChar(i), token::LIT_INT(i, it) => LitInt(i, it), token::LIT_UINT(u, ut) => LitUint(u, ut), diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs index a4a022708d9..b8f13624a32 100644 --- a/src/libsyntax/parse/token.rs +++ b/src/libsyntax/parse/token.rs @@ -78,6 +78,7 @@ pub enum Token { DOLLAR, /* Literals */ + LIT_BYTE(u8), LIT_CHAR(char), LIT_INT(i64, ast::IntTy), LIT_UINT(u64, ast::UintTy), @@ -193,6 +194,14 @@ pub fn to_str(t: &Token) -> String { DOLLAR => "$".to_string(), /* Literals */ + LIT_BYTE(b) => { + let mut res = String::from_str("b'"); + (b as char).escape_default(|c| { + res.push_char(c); + }); + res.push_char('\''); + res + } LIT_CHAR(c) => { let mut res = String::from_str("'"); c.escape_default(|c| { @@ -273,6 +282,7 @@ pub fn can_begin_expr(t: &Token) -> bool { IDENT(_, _) => true, UNDERSCORE => true, TILDE => true, + LIT_BYTE(_) => true, LIT_CHAR(_) => true, LIT_INT(_, _) => true, LIT_UINT(_, _) => true, @@ -311,6 +321,7 @@ pub fn close_delimiter_for(t: &Token) -> Option { pub fn is_lit(t: &Token) -> bool { match *t { + LIT_BYTE(_) => true, LIT_CHAR(_) => true, LIT_INT(_, _) => true, LIT_UINT(_, _) => true, diff --git a/src/libsyntax/print/pprust.rs b/src/libsyntax/print/pprust.rs index badfbe7eb15..6ea2eed293e 100644 --- a/src/libsyntax/print/pprust.rs +++ b/src/libsyntax/print/pprust.rs @@ -2305,6 +2305,12 @@ impl<'a> State<'a> { } match lit.node { ast::LitStr(ref st, style) => self.print_string(st.get(), style), + ast::LitByte(byte) => { + let mut res = String::from_str("b'"); + (byte as char).escape_default(|c| res.push_char(c)); + res.push_char('\''); + word(&mut self.s, res.as_slice()) + } ast::LitChar(ch) => { let mut res = String::from_str("'"); ch.escape_default(|c| res.push_char(c)); diff --git a/src/test/compile-fail/byte-literals.rs b/src/test/compile-fail/byte-literals.rs new file 
mode 100644 index 00000000000..436078fa762 --- /dev/null +++ b/src/test/compile-fail/byte-literals.rs @@ -0,0 +1,25 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +// ignore-tidy-tab + +static FOO: u8 = b'\f'; //~ ERROR unknown byte escape + +pub fn main() { + b'\f'; //~ ERROR unknown byte escape + b'\x0Z'; //~ ERROR illegal character in numeric character escape: Z + b' '; //~ ERROR byte constant must be escaped + b'''; //~ ERROR byte constant must be escaped + b'é'; //~ ERROR byte constant must be ASCII + b'a //~ ERROR unterminated byte constant +} + + diff --git a/src/test/compile-fail/concat.rs b/src/test/compile-fail/concat.rs index c34e402c90b..a3dc1174424 100644 --- a/src/test/compile-fail/concat.rs +++ b/src/test/compile-fail/concat.rs @@ -9,6 +9,7 @@ // except according to those terms. fn main() { + concat!(b'f'); //~ ERROR: cannot concatenate a binary literal concat!(foo); //~ ERROR: expected a literal concat!(foo()); //~ ERROR: expected a literal } diff --git a/src/test/run-pass/byte-literals.rs b/src/test/run-pass/byte-literals.rs new file mode 100644 index 00000000000..560b2f0337a --- /dev/null +++ b/src/test/run-pass/byte-literals.rs @@ -0,0 +1,38 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ + +static FOO: u8 = b'\xF0'; + +pub fn main() { + assert_eq!(b'a', 97u8); + assert_eq!(b'\n', 10u8); + assert_eq!(b'\r', 13u8); + assert_eq!(b'\t', 9u8); + assert_eq!(b'\\', 92u8); + assert_eq!(b'\'', 39u8); + assert_eq!(b'\"', 34u8); + assert_eq!(b'\0', 0u8); + assert_eq!(b'\xF0', 240u8); + assert_eq!(FOO, 240u8); + + // FIXME: Do we want this to be valid? + assert_eq!([42, ..b'\t'].as_slice(), &[42, 42, 42, 42, 42, 42, 42, 42, 42]); + + match 42 { + b'*' => {}, + _ => fail!() + } + + match 100 { + b'a' .. b'z' => {}, + _ => fail!() + } +} From d7e01b5809cd600a30bab29da698acb3d1b52409 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 7 Jun 2014 15:32:01 +0100 Subject: [PATCH 2/7] Add a b"xx" byte string literal of type &'static [u8]. --- src/libcore/str.rs | 4 + src/libregex_macros/lib.rs | 2 +- src/librustc/middle/const_eval.rs | 1 + src/librustc/middle/trans/_match.rs | 17 +- src/librustdoc/html/highlight.rs | 3 +- src/libsyntax/parse/lexer/mod.rs | 157 ++++++++++++------ src/libsyntax/parse/parser.rs | 3 +- src/libsyntax/parse/token.rs | 16 +- src/libsyntax/print/pprust.rs | 16 +- src/test/compile-fail/byte-string-literals.rs | 23 +++ src/test/compile-fail/concat.rs | 1 + src/test/run-pass/byte-literals.rs | 12 ++ 12 files changed, 185 insertions(+), 70 deletions(-) create mode 100644 src/test/compile-fail/byte-string-literals.rs diff --git a/src/libcore/str.rs b/src/libcore/str.rs index c01997f1c42..84ffb7fb20e 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -560,6 +560,8 @@ Section: Comparing strings // share the implementation of the lang-item vs. non-lang-item // eq_slice. +/// NOTE: This function is (ab)used in rustc::middle::trans::_match +/// to compare &[u8] byte slices that are not necessarily valid UTF-8. 
#[inline] fn eq_slice_(a: &str, b: &str) -> bool { #[allow(ctypes)] @@ -572,6 +574,8 @@ fn eq_slice_(a: &str, b: &str) -> bool { } /// Bytewise slice equality +/// NOTE: This function is (ab)used in rustc::middle::trans::_match +/// to compare &[u8] byte slices that are not necessarily valid UTF-8. #[cfg(not(test))] #[lang="str_eq"] #[inline] diff --git a/src/libregex_macros/lib.rs b/src/libregex_macros/lib.rs index 8641936cc34..ff5cada05ea 100644 --- a/src/libregex_macros/lib.rs +++ b/src/libregex_macros/lib.rs @@ -182,7 +182,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, #[allow(unused_variable)] fn run(&mut self, start: uint, end: uint) -> Vec> { let mut matched = false; - let prefix_bytes: &[u8] = &$prefix_bytes; + let prefix_bytes: &[u8] = $prefix_bytes; let mut clist = &mut Threads::new(self.which); let mut nlist = &mut Threads::new(self.which); diff --git a/src/librustc/middle/const_eval.rs b/src/librustc/middle/const_eval.rs index 3c5b0664f03..72def2c10da 100644 --- a/src/librustc/middle/const_eval.rs +++ b/src/librustc/middle/const_eval.rs @@ -529,6 +529,7 @@ pub fn compare_const_vals(a: &const_val, b: &const_val) -> Option { (&const_float(a), &const_float(b)) => compare_vals(a, b), (&const_str(ref a), &const_str(ref b)) => compare_vals(a, b), (&const_bool(a), &const_bool(b)) => compare_vals(a, b), + (&const_binary(ref a), &const_binary(ref b)) => compare_vals(a, b), _ => None } } diff --git a/src/librustc/middle/trans/_match.rs b/src/librustc/middle/trans/_match.rs index 9361d64250c..808d894be43 100644 --- a/src/librustc/middle/trans/_match.rs +++ b/src/librustc/middle/trans/_match.rs @@ -1273,13 +1273,24 @@ fn compare_values<'a>( val: bool_to_i1(result.bcx, result.val) } } - _ => cx.sess().bug("only scalars and strings supported in compare_values"), + _ => cx.sess().bug("only strings supported in compare_values"), }, ty::ty_rptr(_, mt) => match ty::get(mt.ty).sty { ty::ty_str => compare_str(cx, lhs, rhs, rhs_t), - _ => 
cx.sess().bug("only scalars and strings supported in compare_values"), + ty::ty_vec(mt, _) => match ty::get(mt.ty).sty { + ty::ty_uint(ast::TyU8) => { + // NOTE: cast &[u8] to &str and abuse the str_eq lang item, + // which calls memcmp(). + let t = ty::mk_str_slice(cx.tcx(), ty::ReStatic, ast::MutImmutable); + let lhs = BitCast(cx, lhs, type_of::type_of(cx.ccx(), t).ptr_to()); + let rhs = BitCast(cx, rhs, type_of::type_of(cx.ccx(), t).ptr_to()); + compare_str(cx, lhs, rhs, rhs_t) + }, + _ => cx.sess().bug("only byte strings supported in compare_values"), + }, + _ => cx.sess().bug("on string and byte strings supported in compare_values"), }, - _ => cx.sess().bug("only scalars and strings supported in compare_values"), + _ => cx.sess().bug("only scalars, byte strings, and strings supported in compare_values"), } } diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 8a63b55afed..172a1be7b4e 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -140,7 +140,8 @@ fn doit(sess: &parse::ParseSess, mut lexer: lexer::StringReader, } // text literals - t::LIT_BYTE(..) | t::LIT_CHAR(..) | t::LIT_STR(..) | t::LIT_STR_RAW(..) => "string", + t::LIT_BYTE(..) | t::LIT_BINARY(..) | + t::LIT_CHAR(..) | t::LIT_STR(..) | t::LIT_STR_RAW(..) => "string", // number literals t::LIT_INT(..) | t::LIT_UINT(..) | t::LIT_INT_UNSUFFIXED(..) | diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 7e4cb195cea..59bcf059fcd 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -654,7 +654,8 @@ impl<'a> StringReader<'a> { // Note: r as in r" or r#" is part of a raw string literal, // b as in b' is part of a byte literal. // They are not identifiers, and are handled further down. 
- ('r', Some('"')) | ('r', Some('#')) | ('b', Some('\'')) => false, + ('r', Some('"')) | ('r', Some('#')) | + ('b', Some('"')) | ('b', Some('\'')) => false, _ => true } { let start = self.last_pos; @@ -859,62 +860,124 @@ impl<'a> StringReader<'a> { } 'b' => { self.bump(); - assert!(self.curr_is('\''), "Should have been a token::IDENT"); - self.bump(); - let start = self.last_pos; + return match self.curr { + Some('\'') => parse_byte(self), + Some('"') => parse_byte_string(self), + _ => unreachable!() // Should have been a token::IDENT above. + }; - // the eof will be picked up by the final `'` check below - let mut c2 = self.curr.unwrap_or('\x00'); - self.bump(); + fn parse_byte(self_: &mut StringReader) -> token::Token { + self_.bump(); + let start = self_.last_pos; - match c2 { - '\\' => { - // '\X' for some X must be a character constant: - let escaped = self.curr; - let escaped_pos = self.last_pos; - self.bump(); - match escaped { - None => {} - Some(e) => { - c2 = match e { - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '"' => '"', - '0' => '\x00', - 'x' => self.scan_numeric_escape(2u, '\''), - c2 => { - self.err_span_char(escaped_pos, self.last_pos, - "unknown byte escape", c2); - c2 + // the eof will be picked up by the final `'` check below + let mut c2 = self_.curr.unwrap_or('\x00'); + self_.bump(); + + match c2 { + '\\' => { + // '\X' for some X must be a character constant: + let escaped = self_.curr; + let escaped_pos = self_.last_pos; + self_.bump(); + match escaped { + None => {} + Some(e) => { + c2 = match e { + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '"' => '"', + '0' => '\x00', + 'x' => self_.scan_numeric_escape(2u, '\''), + c2 => { + self_.err_span_char( + escaped_pos, self_.last_pos, + "unknown byte escape", c2); + c2 + } } } } } + '\t' | '\n' | '\r' | '\'' => { + self_.err_span_char( start, self_.last_pos, + "byte constant must be escaped", c2); + } + _ => if c2 > '\x7F' { + 
self_.err_span_char( start, self_.last_pos, + "byte constant must be ASCII. \ + Use a \\xHH escape for a non-ASCII byte", c2); + } } - '\t' | '\n' | '\r' | '\'' => { - self.err_span_char( start, self.last_pos, - "byte constant must be escaped", c2); + if !self_.curr_is('\'') { + // Byte offsetting here is okay because the + // character before position `start` are an + // ascii single quote and ascii 'b'. + self_.fatal_span_verbose( + start - BytePos(2), self_.last_pos, + "unterminated byte constant".to_string()); } - _ if c2 > '\x7F' => { - self.err_span_char( start, self.last_pos, - "byte constant must be ASCII. \ - Use a \\xHH escape for a non-ASCII byte", c2); - } - _ => {} + self_.bump(); // advance curr past token + return token::LIT_BYTE(c2 as u8); } - if !self.curr_is('\'') { - self.fatal_span_verbose( - // Byte offsetting here is okay because the - // character before position `start` are an - // ascii single quote and ascii 'b'. - start - BytePos(2), self.last_pos, - "unterminated byte constant".to_string()); + + fn parse_byte_string(self_: &mut StringReader) -> token::Token { + self_.bump(); + let start = self_.last_pos; + let mut value = Vec::new(); + while !self_.curr_is('"') { + if self_.is_eof() { + self_.fatal_span(start, self_.last_pos, + "unterminated double quote byte string"); + } + + let ch = self_.curr.unwrap(); + self_.bump(); + match ch { + '\\' => { + if self_.is_eof() { + self_.fatal_span(start, self_.last_pos, + "unterminated double quote byte string"); + } + + let escaped = self_.curr.unwrap(); + let escaped_pos = self_.last_pos; + self_.bump(); + match escaped { + 'n' => value.push('\n' as u8), + 'r' => value.push('\r' as u8), + 't' => value.push('\t' as u8), + '\\' => value.push('\\' as u8), + '\'' => value.push('\'' as u8), + '"' => value.push('"' as u8), + '\n' => self_.consume_whitespace(), + '0' => value.push(0), + 'x' => { + value.push(self_.scan_numeric_escape(2u, '"') as u8); + } + c2 => { + self_.err_span_char(escaped_pos, 
self_.last_pos, + "unknown byte string escape", c2); + } + } + } + _ => { + if ch <= '\x7F' { + value.push(ch as u8) + } else { + self_.err_span_char(self_.last_pos, self_.last_pos, + "byte string must be ASCII. \ + Use a \\xHH escape for a non-ASCII byte", ch); + } + } + } + } + self_.bump(); + return token::LIT_BINARY(Rc::new(value)); } - self.bump(); // advance curr past token - return token::LIT_BYTE(c2 as u8); } '"' => { let mut accum_str = String::new(); diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index 0bd47ede214..826d28ef3ff 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -33,7 +33,7 @@ use ast::{ForeignItem, ForeignItemStatic, ForeignItemFn, ForeignMod}; use ast::{Ident, NormalFn, Inherited, Item, Item_, ItemStatic}; use ast::{ItemEnum, ItemFn, ItemForeignMod, ItemImpl}; use ast::{ItemMac, ItemMod, ItemStruct, ItemTrait, ItemTy, Lit, Lit_}; -use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar, LitByte}; +use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar, LitByte, LitBinary}; use ast::{LitIntUnsuffixed, LitNil, LitStr, LitUint, Local, LocalLet}; use ast::{MutImmutable, MutMutable, Mac_, MacInvocTT, Matcher, MatchNonterminal}; use ast::{MatchSeq, MatchTok, Method, MutTy, BiMul, Mutability}; @@ -1529,6 +1529,7 @@ impl<'a> Parser<'a> { token::LIT_STR_RAW(s, n) => { LitStr(self.id_to_interned_str(s), ast::RawStr(n)) } + token::LIT_BINARY(ref v) => LitBinary(v.clone()), token::LPAREN => { self.expect(&token::RPAREN); LitNil }, _ => { self.unexpected_last(tok); } } diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs index b8f13624a32..b76dcaf0b94 100644 --- a/src/libsyntax/parse/token.rs +++ b/src/libsyntax/parse/token.rs @@ -87,6 +87,7 @@ pub enum Token { LIT_FLOAT_UNSUFFIXED(ast::Ident), LIT_STR(ast::Ident), LIT_STR_RAW(ast::Ident, uint), /* raw str delimited by n hash symbols */ + LIT_BINARY(Rc>), /* Name components */ // an identifier contains an 
"is_mod_name" boolean, @@ -231,17 +232,22 @@ pub fn to_str(t: &Token) -> String { body } LIT_STR(s) => { - (format!("\"{}\"", get_ident(s).get().escape_default())).to_string() + format!("\"{}\"", get_ident(s).get().escape_default()) } LIT_STR_RAW(s, n) => { - (format!("r{delim}\"{string}\"{delim}", - delim="#".repeat(n), string=get_ident(s))).to_string() + format!("r{delim}\"{string}\"{delim}", + delim="#".repeat(n), string=get_ident(s)) + } + LIT_BINARY(ref v) => { + format!( + "b\"{}\"", + v.iter().map(|&b| b as char).collect::().escape_default()) } /* Name components */ IDENT(s, _) => get_ident(s).get().to_string(), LIFETIME(s) => { - (format!("{}", get_ident(s))).to_string() + format!("{}", get_ident(s)) } UNDERSCORE => "_".to_string(), @@ -291,6 +297,7 @@ pub fn can_begin_expr(t: &Token) -> bool { LIT_FLOAT_UNSUFFIXED(_) => true, LIT_STR(_) => true, LIT_STR_RAW(_, _) => true, + LIT_BINARY(_) => true, POUND => true, AT => true, NOT => true, @@ -330,6 +337,7 @@ pub fn is_lit(t: &Token) -> bool { LIT_FLOAT_UNSUFFIXED(_) => true, LIT_STR(_) => true, LIT_STR_RAW(_, _) => true, + LIT_BINARY(_) => true, _ => false } } diff --git a/src/libsyntax/print/pprust.rs b/src/libsyntax/print/pprust.rs index 6ea2eed293e..fafebd3c5dc 100644 --- a/src/libsyntax/print/pprust.rs +++ b/src/libsyntax/print/pprust.rs @@ -2342,19 +2342,9 @@ impl<'a> State<'a> { ast::LitBool(val) => { if val { word(&mut self.s, "true") } else { word(&mut self.s, "false") } } - ast::LitBinary(ref arr) => { - try!(self.ibox(indent_unit)); - try!(word(&mut self.s, "[")); - try!(self.commasep_cmnt(Inconsistent, - arr.as_slice(), - |s, u| { - word(&mut s.s, - format!("{}", - *u).as_slice()) - }, - |_| lit.span)); - try!(word(&mut self.s, "]")); - self.end() + ast::LitBinary(ref v) => { + let escaped: String = v.iter().map(|&b| b as char).collect(); + word(&mut self.s, format!("b\"{}\"", escaped.escape_default()).as_slice()) } } } diff --git a/src/test/compile-fail/byte-string-literals.rs 
b/src/test/compile-fail/byte-string-literals.rs new file mode 100644 index 00000000000..ec67cdd77e1 --- /dev/null +++ b/src/test/compile-fail/byte-string-literals.rs @@ -0,0 +1,23 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +// ignore-tidy-tab + +static FOO: &'static [u8] = b"\f"; //~ ERROR unknown byte escape + +pub fn main() { + b"\f"; //~ ERROR unknown byte escape + b"\x0Z"; //~ ERROR illegal character in numeric character escape: Z + b"é"; //~ ERROR byte constant must be ASCII + b"a //~ ERROR unterminated double quote byte string +} + + diff --git a/src/test/compile-fail/concat.rs b/src/test/compile-fail/concat.rs index a3dc1174424..dc31126e6d6 100644 --- a/src/test/compile-fail/concat.rs +++ b/src/test/compile-fail/concat.rs @@ -10,6 +10,7 @@ fn main() { concat!(b'f'); //~ ERROR: cannot concatenate a binary literal + concat!(b"foo"); //~ ERROR: cannot concatenate a binary literal concat!(foo); //~ ERROR: expected a literal concat!(foo()); //~ ERROR: expected a literal } diff --git a/src/test/run-pass/byte-literals.rs b/src/test/run-pass/byte-literals.rs index 560b2f0337a..58df7dc8efd 100644 --- a/src/test/run-pass/byte-literals.rs +++ b/src/test/run-pass/byte-literals.rs @@ -10,6 +10,7 @@ static FOO: u8 = b'\xF0'; +static BAR: &'static [u8] = b"a\xF0\t"; pub fn main() { assert_eq!(b'a', 97u8); @@ -35,4 +36,15 @@ pub fn main() { b'a' .. 
b'z' => {}, _ => fail!() } + + assert_eq!(b"a\n\r\t\\\'\"\0\xF0", + &[97u8, 10u8, 13u8, 9u8, 92u8, 39u8, 34u8, 0u8, 240u8]); + assert_eq!(b"a\ + b", &[97u8, 98u8]); + assert_eq!(BAR, &[97u8, 240u8, 9u8]); + + match &[97u8, 10u8] { + b"a\n" => {}, + _ => fail!(), + } } From b8a4c1415b154fa1e5bd8bb54e681f0f5e21e2a4 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 13 Jun 2014 18:56:24 +0100 Subject: [PATCH 3/7] Add br##"xx"## raw byte string literals. --- src/librustdoc/html/highlight.rs | 2 +- src/libsyntax/parse/lexer/mod.rs | 56 ++++++++++++++++++- src/libsyntax/parse/parser.rs | 1 + src/libsyntax/parse/token.rs | 7 +++ src/test/compile-fail/raw-byte-string-eof.rs | 16 ++++++ .../compile-fail/raw-byte-string-literals.rs | 17 ++++++ src/test/run-pass/byte-literals.rs | 8 ++- 7 files changed, 102 insertions(+), 5 deletions(-) create mode 100644 src/test/compile-fail/raw-byte-string-eof.rs create mode 100644 src/test/compile-fail/raw-byte-string-literals.rs diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 172a1be7b4e..daa9ee3da84 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -140,7 +140,7 @@ fn doit(sess: &parse::ParseSess, mut lexer: lexer::StringReader, } // text literals - t::LIT_BYTE(..) | t::LIT_BINARY(..) | + t::LIT_BYTE(..) | t::LIT_BINARY(..) | t::LIT_BINARY_RAW(..) | t::LIT_CHAR(..) | t::LIT_STR(..) | t::LIT_STR_RAW(..) 
=> "string", // number literals diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 59bcf059fcd..31f15fd7495 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -650,12 +650,13 @@ impl<'a> StringReader<'a> { /// token, and updates the interner fn next_token_inner(&mut self) -> token::Token { let c = self.curr; - if ident_start(c) && match (c.unwrap(), self.nextch()) { + if ident_start(c) && match (c.unwrap(), self.nextch(), self.nextnextch()) { // Note: r as in r" or r#" is part of a raw string literal, // b as in b' is part of a byte literal. // They are not identifiers, and are handled further down. - ('r', Some('"')) | ('r', Some('#')) | - ('b', Some('"')) | ('b', Some('\'')) => false, + ('r', Some('"'), _) | ('r', Some('#'), _) | + ('b', Some('"'), _) | ('b', Some('\''), _) | + ('b', Some('r'), Some('"')) | ('b', Some('r'), Some('#')) => false, _ => true } { let start = self.last_pos; @@ -863,6 +864,7 @@ impl<'a> StringReader<'a> { return match self.curr { Some('\'') => parse_byte(self), Some('"') => parse_byte_string(self), + Some('r') => parse_raw_byte_string(self), _ => unreachable!() // Should have been a token::IDENT above. 
}; @@ -978,6 +980,54 @@ impl<'a> StringReader<'a> { self_.bump(); return token::LIT_BINARY(Rc::new(value)); } + + fn parse_raw_byte_string(self_: &mut StringReader) -> token::Token { + let start_bpos = self_.last_pos; + self_.bump(); + let mut hash_count = 0u; + while self_.curr_is('#') { + self_.bump(); + hash_count += 1; + } + + if self_.is_eof() { + self_.fatal_span(start_bpos, self_.last_pos, "unterminated raw string"); + } else if !self_.curr_is('"') { + self_.fatal_span_char(start_bpos, self_.last_pos, + "only `#` is allowed in raw string delimitation; \ + found illegal character", + self_.curr.unwrap()); + } + self_.bump(); + let content_start_bpos = self_.last_pos; + let mut content_end_bpos; + 'outer: loop { + match self_.curr { + None => self_.fatal_span(start_bpos, self_.last_pos, + "unterminated raw string"), + Some('"') => { + content_end_bpos = self_.last_pos; + for _ in range(0, hash_count) { + self_.bump(); + if !self_.curr_is('#') { + continue 'outer; + } + } + break; + }, + Some(c) => if c > '\x7F' { + self_.err_span_char(self_.last_pos, self_.last_pos, + "raw byte string must be ASCII", c); + } + } + self_.bump(); + } + self_.bump(); + let bytes = self_.with_str_from_to(content_start_bpos, + content_end_bpos, + |s| s.as_bytes().to_owned()); + return token::LIT_BINARY_RAW(Rc::new(bytes), hash_count); + } } '"' => { let mut accum_str = String::new(); diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index 826d28ef3ff..ae2ec216bee 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -1529,6 +1529,7 @@ impl<'a> Parser<'a> { token::LIT_STR_RAW(s, n) => { LitStr(self.id_to_interned_str(s), ast::RawStr(n)) } + token::LIT_BINARY_RAW(ref v, _) | token::LIT_BINARY(ref v) => LitBinary(v.clone()), token::LPAREN => { self.expect(&token::RPAREN); LitNil }, _ => { self.unexpected_last(tok); } diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs index b76dcaf0b94..a2af417ed79 100644 --- 
a/src/libsyntax/parse/token.rs +++ b/src/libsyntax/parse/token.rs @@ -88,6 +88,7 @@ pub enum Token { LIT_STR(ast::Ident), LIT_STR_RAW(ast::Ident, uint), /* raw str delimited by n hash symbols */ LIT_BINARY(Rc>), + LIT_BINARY_RAW(Rc>, uint), /* raw binary str delimited by n hash symbols */ /* Name components */ // an identifier contains an "is_mod_name" boolean, @@ -243,6 +244,10 @@ pub fn to_str(t: &Token) -> String { "b\"{}\"", v.iter().map(|&b| b as char).collect::().escape_default()) } + LIT_BINARY_RAW(ref s, n) => { + format!("br{delim}\"{string}\"{delim}", + delim="#".repeat(n), string=s.as_slice().to_ascii().as_str_ascii()) + } /* Name components */ IDENT(s, _) => get_ident(s).get().to_string(), @@ -298,6 +303,7 @@ pub fn can_begin_expr(t: &Token) -> bool { LIT_STR(_) => true, LIT_STR_RAW(_, _) => true, LIT_BINARY(_) => true, + LIT_BINARY_RAW(_, _) => true, POUND => true, AT => true, NOT => true, @@ -338,6 +344,7 @@ pub fn is_lit(t: &Token) -> bool { LIT_STR(_) => true, LIT_STR_RAW(_, _) => true, LIT_BINARY(_) => true, + LIT_BINARY_RAW(_, _) => true, _ => false } } diff --git a/src/test/compile-fail/raw-byte-string-eof.rs b/src/test/compile-fail/raw-byte-string-eof.rs new file mode 100644 index 00000000000..83ea9db39b7 --- /dev/null +++ b/src/test/compile-fail/raw-byte-string-eof.rs @@ -0,0 +1,16 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ + +pub fn main() { + br##"a"#; //~ unterminated raw string +} + + diff --git a/src/test/compile-fail/raw-byte-string-literals.rs b/src/test/compile-fail/raw-byte-string-literals.rs new file mode 100644 index 00000000000..7a3d1b2318a --- /dev/null +++ b/src/test/compile-fail/raw-byte-string-literals.rs @@ -0,0 +1,17 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +pub fn main() { + br"é"; //~ raw byte string must be ASCII + br##~"a"~##; //~ only `#` is allowed in raw string delimitation +} + + diff --git a/src/test/run-pass/byte-literals.rs b/src/test/run-pass/byte-literals.rs index 58df7dc8efd..5317fdc391f 100644 --- a/src/test/run-pass/byte-literals.rs +++ b/src/test/run-pass/byte-literals.rs @@ -11,6 +11,7 @@ static FOO: u8 = b'\xF0'; static BAR: &'static [u8] = b"a\xF0\t"; +static BAZ: &'static [u8] = br"a\n"; pub fn main() { assert_eq!(b'a', 97u8); @@ -24,7 +25,6 @@ pub fn main() { assert_eq!(b'\xF0', 240u8); assert_eq!(FOO, 240u8); - // FIXME: Do we want this to be valid? assert_eq!([42, ..b'\t'].as_slice(), &[42, 42, 42, 42, 42, 42, 42, 42, 42]); match 42 { @@ -47,4 +47,10 @@ pub fn main() { b"a\n" => {}, _ => fail!(), } + + assert_eq!(BAZ, &[97u8, 92u8, 110u8]); + assert_eq!(br"a\n", &[97u8, 92u8, 110u8]); + assert_eq!(br"a\n", b"a\\n"); + assert_eq!(br###"a"##b"###, &[97u8, 34u8, 35u8, 35u8, 98u8]); + assert_eq!(br###"a"##b"###, b"a\"##b"); } From 3a52a8a8b8079de795dabdd35985f9f663aa0b5d Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 13 Jun 2014 20:26:37 +0100 Subject: [PATCH 4/7] Document the byte, byte string, and raw byte string literals. 
--- src/doc/rust.md | 71 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/src/doc/rust.md b/src/doc/rust.md index 8f47e81ba5c..97757a53e4c 100644 --- a/src/doc/rust.md +++ b/src/doc/rust.md @@ -234,7 +234,7 @@ rule. A literal is a form of constant expression, so is evaluated (primarily) at compile time. ~~~~ {.ebnf .gram} -literal : string_lit | char_lit | num_lit ; +literal : string_lit | char_lit | byte_string_lit | byte_lit | num_lit ; ~~~~ #### Character and string literals @@ -244,17 +244,17 @@ char_lit : '\x27' char_body '\x27' ; string_lit : '"' string_body * '"' | 'r' raw_string ; char_body : non_single_quote - | '\x5c' [ '\x27' | common_escape ] ; + | '\x5c' [ '\x27' | common_escape | unicode_escape ] ; string_body : non_double_quote - | '\x5c' [ '\x22' | common_escape ] ; + | '\x5c' [ '\x22' | common_escape | unicode_escape ] ; raw_string : '"' raw_string_body '"' | '#' raw_string '#' ; common_escape : '\x5c' | 'n' | 'r' | 't' | '0' | 'x' hex_digit 2 - | 'u' hex_digit 4 - | 'U' hex_digit 8 ; +unicode_escape : 'u' hex_digit 4 + | 'U' hex_digit 8 ; hex_digit : 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' @@ -294,7 +294,7 @@ the following forms: escaped in order to denote *itself*. Raw string literals do not process any escapes. They start with the character -`U+0072` (`r`), followed zero or more of the character `U+0023` (`#`) and a +`U+0072` (`r`), followed by zero or more of the character `U+0023` (`#`) and a `U+0022` (double-quote) character. 
The _raw string body_ is not defined in the EBNF grammar above: it can contain any sequence of Unicode characters and is terminated only by another `U+0022` (double-quote) character, followed by the @@ -319,6 +319,65 @@ r##"foo #"# bar"##; // foo #"# bar "\\x52"; r"\x52"; // \x52 ~~~~ +#### Byte and byte string literals + +~~~~ {.ebnf .gram} +byte_lit : 'b' '\x27' byte_body '\x27' ; +byte_string_lit : 'b' '"' byte_string_body * '"' | 'b' 'r' raw_byte_string ; + +byte_body : ascii_non_single_quote + | '\x5c' [ '\x27' | common_escape ] ; + +byte_string_body : ascii_non_double_quote + | '\x5c' [ '\x22' | common_escape ] ; +raw_byte_string : '"' raw_byte_string_body '"' | '#' raw_byte_string '#' ; + +~~~~ + +A _byte literal_ is a single ASCII character (in the `U+0000` to `U+007F` range) +enclosed within two `U+0027` (single-quote) characters, +with the exception of `U+0027` itself, +which must be _escaped_ by a preceding `U+005C` character (`\`), +or a single _escape_. +It is equivalent to a `u8` unsigned 8-bit integer _number literal_. + +A _byte string literal_ is a sequence of ASCII characters and _escapes_ +enclosed within two `U+0022` (double-quote) characters, +with the exception of `U+0022` itself, +which must be _escaped_ by a preceding `U+005C` character (`\`), +or a _raw byte string literal_. +It is equivalent to a `&'static [u8]` borrowed vector of unsigned 8-bit integers. + +Some additional _escapes_ are available in either byte or non-raw byte string +literals. An escape starts with a `U+005C` (`\`) and continues with one of +the following forms: + + * A _byte escape_ starts with `U+0078` (`x`) and is + followed by exactly two _hex digits_. It denotes the byte + equal to the provided hex value. + * A _whitespace escape_ is one of the characters `U+006E` (`n`), `U+0072` + (`r`), or `U+0074` (`t`), denoting the byte values `0x0A` (ASCII LF), + `0x0D` (ASCII CR) or `0x09` (ASCII HT) respectively.
+ * The _backslash escape_ is the character `U+005C` (`\`) which must be + escaped in order to denote its ASCII encoding `0x5C`. + +Raw byte string literals do not process any escapes. +They start with the character `U+0062` (`b`), +followed by `U+0072` (`r`), +followed by zero or more of the character `U+0023` (`#`), +and a `U+0022` (double-quote) character. +The _raw byte string body_ is not defined in the EBNF grammar above: +it can contain any sequence of ASCII characters and is +terminated only by another `U+0022` (double-quote) character, followed by the +same number of `U+0023` (`#`) characters that preceded the opening `U+0022` +(double-quote) character. +A raw byte string literal cannot contain any non-ASCII byte. + +All characters contained in the raw byte string body represent their ASCII encoding; +the characters `U+0022` (double-quote) (except when followed by at least as +many `U+0023` (`#`) characters as were used to start the raw byte string literal) or +`U+005C` (`\`) do not have any special meaning. + #### Number literals ~~~~ {.ebnf .gram} From 612bbaf7a07fe247e5e2d057cc4f10742918ead0 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 17 Jun 2014 17:58:13 +0200 Subject: [PATCH 5/7] Refactor backslash-escape parsing to share similar code. Move into a new syntax::parse::lexer::StringReader method the code that was almost duplicated for parsing backslash-escapes in byte, byte string, char, and string literals. --- src/libsyntax/parse/lexer/mod.rs | 220 ++++++++++--------------------- 1 file changed, 69 insertions(+), 151 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 31f15fd7495..f67b77d64dd 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -636,6 +636,67 @@ impl<'a> StringReader<'a> { } } + /// Scan for a single (possibly escaped) byte or char + /// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
+ /// `start` is the position of `first_source_char`, which is already consumed. + fn scan_char_or_byte(&mut self, start: BytePos, first_source_char: char, + ascii_only: bool, delim: char) -> Option { + match first_source_char { + '\\' => { + // '\X' for some X must be a character constant: + let escaped = self.curr; + let escaped_pos = self.last_pos; + self.bump(); + match escaped { + None => {}, // EOF here is an error that will be checked later. + Some(e) => { + return Some(match e { + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '"' => '"', + '0' => '\x00', + 'x' => self.scan_numeric_escape(2u, delim), + 'u' if !ascii_only => self.scan_numeric_escape(4u, delim), + 'U' if !ascii_only => self.scan_numeric_escape(8u, delim), + '\n' if delim == '"' => { + self.consume_whitespace(); + return None + }, + c => { + let last_pos = self.last_pos; + self.err_span_char( + escaped_pos, last_pos, + if ascii_only { "unknown byte escape" } + else { "unknown character escape" }, + c); + c + } + }) + } + } + } + '\t' | '\n' | '\r' | '\'' if delim == '\'' => { + let last_pos = self.last_pos; + self.err_span_char( + start, last_pos, + if ascii_only { "byte constant must be escaped" } + else { "character constant must be escaped" }, + first_source_char); + } + _ => if ascii_only && first_source_char > '\x7F' { + let last_pos = self.last_pos; + self.err_span_char( + start, last_pos, + "byte constant must be ASCII. 
\ + Use a \\xHH escape for a non-ASCII byte", first_source_char); + } + } + Some(first_source_char) + } + fn binop(&mut self, op: token::BinOp) -> token::Token { self.bump(); if self.curr_is('=') { @@ -810,43 +871,7 @@ impl<'a> StringReader<'a> { } // Otherwise it is a character constant: - match c2 { - '\\' => { - // '\X' for some X must be a character constant: - let escaped = self.curr; - let escaped_pos = self.last_pos; - self.bump(); - match escaped { - None => {} - Some(e) => { - c2 = match e { - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '"' => '"', - '0' => '\x00', - 'x' => self.scan_numeric_escape(2u, '\''), - 'u' => self.scan_numeric_escape(4u, '\''), - 'U' => self.scan_numeric_escape(8u, '\''), - c2 => { - let last_bpos = self.last_pos; - self.err_span_char(escaped_pos, last_bpos, - "unknown character escape", c2); - c2 - } - } - } - } - } - '\t' | '\n' | '\r' | '\'' => { - let last_bpos = self.last_pos; - self.err_span_char( start, last_bpos, - "character constant must be escaped", c2); - } - _ => {} - } + c2 = self.scan_char_or_byte(start, c2, /* ascii_only = */ false, '\'').unwrap(); if !self.curr_is('\'') { let last_bpos = self.last_pos; self.fatal_span_verbose( @@ -876,44 +901,7 @@ impl<'a> StringReader<'a> { let mut c2 = self_.curr.unwrap_or('\x00'); self_.bump(); - match c2 { - '\\' => { - // '\X' for some X must be a character constant: - let escaped = self_.curr; - let escaped_pos = self_.last_pos; - self_.bump(); - match escaped { - None => {} - Some(e) => { - c2 = match e { - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '"' => '"', - '0' => '\x00', - 'x' => self_.scan_numeric_escape(2u, '\''), - c2 => { - self_.err_span_char( - escaped_pos, self_.last_pos, - "unknown byte escape", c2); - c2 - } - } - } - } - } - '\t' | '\n' | '\r' | '\'' => { - self_.err_span_char( start, self_.last_pos, - "byte constant must be escaped", c2); - } - _ => if c2 > '\x7F' { - self_.err_span_char( 
start, self_.last_pos, - "byte constant must be ASCII. \ - Use a \\xHH escape for a non-ASCII byte", c2); - } - } + c2 = self_.scan_char_or_byte(start, c2, /* ascii_only = */ true, '\'').unwrap(); if !self_.curr_is('\'') { // Byte offsetting here is okay because the // character before position `start` are an @@ -936,46 +924,11 @@ impl<'a> StringReader<'a> { "unterminated double quote byte string"); } + let ch_start = self_.last_pos; let ch = self_.curr.unwrap(); self_.bump(); - match ch { - '\\' => { - if self_.is_eof() { - self_.fatal_span(start, self_.last_pos, - "unterminated double quote byte string"); - } - - let escaped = self_.curr.unwrap(); - let escaped_pos = self_.last_pos; - self_.bump(); - match escaped { - 'n' => value.push('\n' as u8), - 'r' => value.push('\r' as u8), - 't' => value.push('\t' as u8), - '\\' => value.push('\\' as u8), - '\'' => value.push('\'' as u8), - '"' => value.push('"' as u8), - '\n' => self_.consume_whitespace(), - '0' => value.push(0), - 'x' => { - value.push(self_.scan_numeric_escape(2u, '"') as u8); - } - c2 => { - self_.err_span_char(escaped_pos, self_.last_pos, - "unknown byte string escape", c2); - } - } - } - _ => { - if ch <= '\x7F' { - value.push(ch as u8) - } else { - self_.err_span_char(self_.last_pos, self_.last_pos, - "byte string must be ASCII. 
\ - Use a \\xHH escape for a non-ASCII byte", ch); - } - } - } + self_.scan_char_or_byte(ch_start, ch, /* ascii_only = */ true, '"') + .map(|ch| value.push(ch as u8)); } self_.bump(); return token::LIT_BINARY(Rc::new(value)); @@ -1039,46 +992,11 @@ impl<'a> StringReader<'a> { self.fatal_span(start_bpos, last_bpos, "unterminated double quote string"); } + let ch_start = self.last_pos; let ch = self.curr.unwrap(); self.bump(); - match ch { - '\\' => { - if self.is_eof() { - let last_bpos = self.last_pos; - self.fatal_span(start_bpos, last_bpos, - "unterminated double quote string"); - } - - let escaped = self.curr.unwrap(); - let escaped_pos = self.last_pos; - self.bump(); - match escaped { - 'n' => accum_str.push_char('\n'), - 'r' => accum_str.push_char('\r'), - 't' => accum_str.push_char('\t'), - '\\' => accum_str.push_char('\\'), - '\'' => accum_str.push_char('\''), - '"' => accum_str.push_char('"'), - '\n' => self.consume_whitespace(), - '0' => accum_str.push_char('\x00'), - 'x' => { - accum_str.push_char(self.scan_numeric_escape(2u, '"')); - } - 'u' => { - accum_str.push_char(self.scan_numeric_escape(4u, '"')); - } - 'U' => { - accum_str.push_char(self.scan_numeric_escape(8u, '"')); - } - c2 => { - let last_bpos = self.last_pos; - self.err_span_char(escaped_pos, last_bpos, - "unknown string escape", c2); - } - } - } - _ => accum_str.push_char(ch) - } + self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ false, '"') + .map(|ch| accum_str.push_char(ch)); } self.bump(); return token::LIT_STR(str_to_ident(accum_str.as_slice())); From 8de2618182f14bb2245e8e89f171aaf9b2f29690 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 18 Jun 2014 00:06:26 +0200 Subject: [PATCH 6/7] Fix some violations of stronger guarantees for mutable borrows. 
See 159e27aebb940926ccf1bad0b2b12087d36ad903 --- src/libsyntax/parse/lexer/mod.rs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index f67b77d64dd..9039f346edb 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -906,8 +906,9 @@ impl<'a> StringReader<'a> { // Byte offsetting here is okay because the // character before position `start` are an // ascii single quote and ascii 'b'. + let last_pos = self_.last_pos; self_.fatal_span_verbose( - start - BytePos(2), self_.last_pos, + start - BytePos(2), last_pos, "unterminated byte constant".to_string()); } self_.bump(); // advance curr past token @@ -920,7 +921,8 @@ impl<'a> StringReader<'a> { let mut value = Vec::new(); while !self_.curr_is('"') { if self_.is_eof() { - self_.fatal_span(start, self_.last_pos, + let last_pos = self_.last_pos; + self_.fatal_span(start, last_pos, "unterminated double quote byte string"); } @@ -944,20 +946,25 @@ impl<'a> StringReader<'a> { } if self_.is_eof() { - self_.fatal_span(start_bpos, self_.last_pos, "unterminated raw string"); + let last_pos = self_.last_pos; + self_.fatal_span(start_bpos, last_pos, "unterminated raw string"); } else if !self_.curr_is('"') { - self_.fatal_span_char(start_bpos, self_.last_pos, + let last_pos = self_.last_pos; + let ch = self_.curr.unwrap(); + self_.fatal_span_char(start_bpos, last_pos, "only `#` is allowed in raw string delimitation; \ found illegal character", - self_.curr.unwrap()); + ch); } self_.bump(); let content_start_bpos = self_.last_pos; let mut content_end_bpos; 'outer: loop { match self_.curr { - None => self_.fatal_span(start_bpos, self_.last_pos, - "unterminated raw string"), + None => { + let last_pos = self_.last_pos; + self_.fatal_span(start_bpos, last_pos, "unterminated raw string") + }, Some('"') => { content_end_bpos = self_.last_pos; for _ in range(0, hash_count) { @@ -969,8 +976,9 
@@ impl<'a> StringReader<'a> { break; }, Some(c) => if c > '\x7F' { - self_.err_span_char(self_.last_pos, self_.last_pos, - "raw byte string must be ASCII", c); + let last_pos = self_.last_pos; + self_.err_span_char( + last_pos, last_pos, "raw byte string must be ASCII", c); } } self_.bump(); From 3744d828513092d1ed64c4c6f8cd2536f7a5ff0d Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 18 Jun 2014 00:40:57 +0200 Subject: [PATCH 7/7] Fix expected error message in a test. The change is a result of the char/string parsing refactor. --- src/test/compile-fail/lex-unknown-str-escape.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/compile-fail/lex-unknown-str-escape.rs b/src/test/compile-fail/lex-unknown-str-escape.rs index f7809b02b0b..9a59c422711 100644 --- a/src/test/compile-fail/lex-unknown-str-escape.rs +++ b/src/test/compile-fail/lex-unknown-str-escape.rs @@ -9,5 +9,5 @@ // except according to those terms. static s: &'static str = - "\●" //~ ERROR: unknown string escape + "\●" //~ ERROR: unknown character escape ;