//! Lexer analyzes raw input string and produces lexemes (tokens).
//! It is just a bridge to `rustc_lexer`.

use rustc_lexer::{LiteralKind as LK, RawStrError};

use std::convert::TryInto;

use crate::{
    SyntaxError,
    SyntaxKind::{self, *},
    TextRange, TextSize, T,
};

/// A token of Rust source.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Token {
    /// The kind of token.
    pub kind: SyntaxKind,
    /// The length of the token.
    pub len: TextSize,
}

/// Break a string up into its component tokens.
/// Beware that it checks for a shebang first; if one is present, its length
/// contributes to the offsets of the resulting tokens.
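///
/// An illustrative sketch of the intended usage (hedged: the doctest is
/// marked `ignore` because the exact crate re-exports are an assumption):
///
/// ```ignore
/// let (tokens, errors) = tokenize("#!/usr/bin/env rust\nfn f() {}");
/// // The shebang becomes the first token; the offsets of all following
/// // tokens are shifted by its length.
/// assert_eq!(tokens[0].kind, SHEBANG);
/// assert!(errors.is_empty());
/// ```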
pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
    // non-empty string is a precondition of `rustc_lexer::strip_shebang()`.
    if text.is_empty() {
        return Default::default();
    }

    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    let mut offset = match rustc_lexer::strip_shebang(text) {
        Some(shebang_len) => {
            tokens.push(Token { kind: SHEBANG, len: shebang_len.try_into().unwrap() });
            shebang_len
        }
        None => 0,
    };

    let text_without_shebang = &text[offset..];

    for rustc_token in rustc_lexer::tokenize(text_without_shebang) {
        let token_len: TextSize = rustc_token.len.try_into().unwrap();
        let token_range = TextRange::at(offset.try_into().unwrap(), token_len);

        let (syntax_kind, err_message) =
            rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]);

        tokens.push(Token { kind: syntax_kind, len: token_len });

        if let Some(err_message) = err_message {
            errors.push(SyntaxError::new(err_message, token_range));
        }

        offset += rustc_token.len;
    }

    (tokens, errors)
}

/// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token
/// encountered at the beginning of the string.
///
/// Returns `None` if the string contains zero *or two or more* tokens.
/// The token is malformed if the returned error is not `None`.
///
/// Beware that unescape errors are not checked at tokenization time.
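///
/// A hedged illustration (marked `ignore`; treating `STRUCT_KW` as the kind
/// produced by `SyntaxKind::from_keyword("struct")` is an assumption):
///
/// ```ignore
/// // Exactly one well-formed token: its kind is returned with no error.
/// assert!(matches!(lex_single_syntax_kind("struct"), Some((STRUCT_KW, None))));
/// // More than one token in the input: `None`.
/// assert!(lex_single_syntax_kind("a b").is_none());
/// ```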
pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option<SyntaxError>)> {
    lex_first_token(text)
        .filter(|(token, _)| token.len == TextSize::of(text))
        .map(|(token, error)| (token.kind, error))
}

/// The same as `lex_single_syntax_kind()`, but returns only the `SyntaxKind` and
/// returns `None` if any tokenization error occurred.
///
/// Beware that unescape errors are not checked at tokenization time.
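///
/// A hedged illustration (marked `ignore`):
///
/// ```ignore
/// assert!(matches!(lex_single_valid_syntax_kind("92"), Some(INT_NUMBER)));
/// // `0x` lexes as an INT_NUMBER with a missing-digits error, so it is rejected.
/// assert!(lex_single_valid_syntax_kind("0x").is_none());
/// ```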
pub fn lex_single_valid_syntax_kind(text: &str) -> Option<SyntaxKind> {
    lex_first_token(text)
        .filter(|(token, error)| error.is_none() && token.len == TextSize::of(text))
        .map(|(token, _error)| token.kind)
}

/// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token
/// encountered at the beginning of the string.
///
/// Returns `None` if the string contains zero tokens.
/// The token is malformed if the returned error is not `None`.
///
/// Beware that unescape errors are not checked at tokenization time.
fn lex_first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> {
    // non-empty string is a precondition of `rustc_lexer::first_token()`.
    if text.is_empty() {
        return None;
    }

    let rustc_token = rustc_lexer::first_token(text);
    let (syntax_kind, err_message) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);

    let token = Token { kind: syntax_kind, len: rustc_token.len.try_into().unwrap() };
    let optional_error = err_message
        .map(|err_message| SyntaxError::new(err_message, TextRange::up_to(TextSize::of(text))));

    Some((token, optional_error))
}

/// Returns `SyntaxKind` and an optional tokenization error message.
fn rustc_token_kind_to_syntax_kind(
    rustc_token_kind: &rustc_lexer::TokenKind,
    token_text: &str,
) -> (SyntaxKind, Option<&'static str>) {
    // A note on an intended tradeoff:
    // We drop some useful information here (see patterns with double dots `..`).
    // Storing that info in `SyntaxKind` is not possible due to its layout
    // requirement of fitting into a `u16`, which comes from `rowan::SyntaxKind`.

    let syntax_kind = {
        match rustc_token_kind {
            rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,

            rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated: true } => COMMENT,
            rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated: false } => {
                return (
                    COMMENT,
                    Some("Missing trailing `*/` symbols to terminate the block comment"),
                );
            }

            rustc_lexer::TokenKind::Whitespace => WHITESPACE,

            rustc_lexer::TokenKind::Ident => {
                if token_text == "_" {
                    UNDERSCORE
                } else {
                    SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
                }
            }

            rustc_lexer::TokenKind::RawIdent => IDENT,
            rustc_lexer::TokenKind::Literal { kind, .. } => return match_literal_kind(&kind),

            rustc_lexer::TokenKind::Lifetime { starts_with_number: false } => LIFETIME,
            rustc_lexer::TokenKind::Lifetime { starts_with_number: true } => {
                return (LIFETIME, Some("Lifetime name cannot start with a number"))
            }

            rustc_lexer::TokenKind::Semi => T![;],
            rustc_lexer::TokenKind::Comma => T![,],
            rustc_lexer::TokenKind::Dot => T![.],
            rustc_lexer::TokenKind::OpenParen => T!['('],
            rustc_lexer::TokenKind::CloseParen => T![')'],
            rustc_lexer::TokenKind::OpenBrace => T!['{'],
            rustc_lexer::TokenKind::CloseBrace => T!['}'],
            rustc_lexer::TokenKind::OpenBracket => T!['['],
            rustc_lexer::TokenKind::CloseBracket => T![']'],
            rustc_lexer::TokenKind::At => T![@],
            rustc_lexer::TokenKind::Pound => T![#],
            rustc_lexer::TokenKind::Tilde => T![~],
            rustc_lexer::TokenKind::Question => T![?],
            rustc_lexer::TokenKind::Colon => T![:],
            rustc_lexer::TokenKind::Dollar => T![$],
            rustc_lexer::TokenKind::Eq => T![=],
            rustc_lexer::TokenKind::Bang => T![!],
            rustc_lexer::TokenKind::Lt => T![<],
            rustc_lexer::TokenKind::Gt => T![>],
            rustc_lexer::TokenKind::Minus => T![-],
            rustc_lexer::TokenKind::And => T![&],
            rustc_lexer::TokenKind::Or => T![|],
            rustc_lexer::TokenKind::Plus => T![+],
            rustc_lexer::TokenKind::Star => T![*],
            rustc_lexer::TokenKind::Slash => T![/],
            rustc_lexer::TokenKind::Caret => T![^],
            rustc_lexer::TokenKind::Percent => T![%],
            rustc_lexer::TokenKind::Unknown => ERROR,
        }
    };

    return (syntax_kind, None);

    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
        #[rustfmt::skip]
        let syntax_kind = match *kind {
            LK::Int { empty_int: false, .. } => INT_NUMBER,
            LK::Int { empty_int: true, .. } => {
                return (INT_NUMBER, Some("Missing digits after the integer base prefix"))
            }
            LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER,
            LK::Float { empty_exponent: true, .. } => {
                return (FLOAT_NUMBER, Some("Missing digits after the exponent symbol"))
            }
            LK::Char { terminated: true } => CHAR,
            LK::Char { terminated: false } => {
                return (CHAR, Some("Missing trailing `'` symbol to terminate the character literal"))
            }
            LK::Byte { terminated: true } => BYTE,
            LK::Byte { terminated: false } => {
                return (BYTE, Some("Missing trailing `'` symbol to terminate the byte literal"))
            }
            LK::Str { terminated: true } => STRING,
            LK::Str { terminated: false } => {
                return (STRING, Some("Missing trailing `\"` symbol to terminate the string literal"))
            }
            LK::ByteStr { terminated: true } => BYTE_STRING,
            LK::ByteStr { terminated: false } => {
                return (BYTE_STRING, Some("Missing trailing `\"` symbol to terminate the byte string literal"))
            }
            LK::RawStr { err, .. } => match err {
                None => RAW_STRING,
                Some(RawStrError::InvalidStarter { .. }) => return (RAW_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw string literal")),
                Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
                    return (RAW_STRING, Some("Missing trailing `\"` to terminate the raw string literal"))
                } else {
                    return (RAW_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw string literal"))
                },
                Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_STRING, Some("Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols")),
            },
            LK::RawByteStr { err, .. } => match err {
                None => RAW_BYTE_STRING,
                Some(RawStrError::InvalidStarter { .. }) => return (RAW_BYTE_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw byte string literal")),
                Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
                    return (RAW_BYTE_STRING, Some("Missing trailing `\"` to terminate the raw byte string literal"))
                } else {
                    return (RAW_BYTE_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"))
                },
                Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_BYTE_STRING, Some("Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols")),
            },
        };

        (syntax_kind, None)
    }
}
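
// A minimal test sketch (an illustrative addition, not part of the original
// module); it exercises only the public `tokenize` entry point defined above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_input_yields_no_tokens_and_no_errors() {
        let (tokens, errors) = tokenize("");
        assert!(tokens.is_empty());
        assert!(errors.is_empty());
    }

    #[test]
    fn unterminated_block_comment_is_lexed_with_an_error() {
        // `/*` without a matching `*/` still produces a COMMENT token,
        // plus a `SyntaxError` covering the token's range.
        let (tokens, errors) = tokenize("/* open");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, COMMENT);
        assert_eq!(errors.len(), 1);
    }
}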