rust/src/grammar/RustLexer.g4

lexer grammar RustLexer;

// Token types this lexer declares up front. Every name listed here has a
// rule of the same name further down in the grammar, or is assigned through
// a `-> type(...)` command (COMMENT, DOC_COMMENT).
//
// `PLUS` was previously misspelled `PLUT`, which declared a token type no
// rule ever produces and left the PLUS rule as the only operator not
// declared in this list.
tokens {
    EQ, LT, LE, EQEQ, NE, GE, GT, ANDAND, OROR, NOT, TILDE, PLUS,
    MINUS, STAR, SLASH, PERCENT, CARET, AND, OR, SHL, SHR, BINOP,
    BINOPEQ, AT, DOT, DOTDOT, DOTDOTDOT, COMMA, SEMI, COLON,
    MOD_SEP, RARROW, FAT_ARROW, LPAREN, RPAREN, LBRACKET, RBRACKET,
    LBRACE, RBRACE, POUND, DOLLAR, UNDERSCORE, LIT_CHAR,
    LIT_INTEGER, LIT_FLOAT, LIT_STR, LIT_STR_RAW, LIT_BINARY,
    LIT_BINARY_RAW, IDENT, LIFETIME, WHITESPACE, DOC_COMMENT,
    COMMENT
}

// ASCII-only stand-ins for the Unicode XID_Start / XID_Continue classes.
// The original author notes that ANTLR limitations prevent representing the
// real classes properly, so identifiers are restricted to ASCII here.

// First character of an identifier: a letter or underscore.
fragment XID_start
  : [a-zA-Z_]
  ;

// Any later character of an identifier: a start character or a digit.
fragment XID_continue
  : XID_start
  | [0-9]
  ;


/* Expression-operator symbols */

// Assignment and comparison.
EQ : '=' ;
LT : '<' ;
LE : '<=' ;
EQEQ : '==' ;
NE : '!=' ;
GE : '>=' ;
GT : '>' ;

// Short-circuit logic and unary operators.
ANDAND : '&&' ;
OROR : '||' ;
NOT : '!' ;
TILDE : '~' ;

// Arithmetic.
PLUS : '+' ;
MINUS : '-' ;
STAR : '*' ;
SLASH : '/' ;
PERCENT : '%' ;

// Bitwise operators and shifts.
CARET : '^' ;
AND : '&' ;
OR : '|' ;
SHL : '<<' ;
SHR : '>>' ;

// The binary operators that BINOPEQ combines with '='.
BINOP
    : PLUS | SLASH | MINUS | STAR | PERCENT
    | CARET | AND | OR
    | SHL | SHR
    ;

// Compound assignment: a BINOP immediately followed by '=' (e.g. '+=', '<<=').
BINOPEQ
    : BINOP EQ
    ;

/* "Structural symbols" */

// Dots, separators and arrows.
AT : '@' ;
DOT : '.' ;
DOTDOT : '..' ;
DOTDOTDOT : '...' ;
COMMA : ',' ;
SEMI : ';' ;
COLON : ':' ;
MOD_SEP : '::' ;
RARROW : '->' ;
FAT_ARROW : '=>' ;

// Brackets of all three kinds.
LPAREN : '(' ;
RPAREN : ')' ;
LBRACKET : '[' ;
RBRACKET : ']' ;
LBRACE : '{' ;
RBRACE : '}' ;

// Remaining single-character symbols.
POUND : '#' ;
DOLLAR : '$' ;
UNDERSCORE : '_' ;

// Literals

// A single hexadecimal digit, either case.
fragment HEXIT
  : [0-9]
  | [a-f]
  | [A-F]
  ;

// What may follow the backslash in a character or string escape. The
// backslash itself is matched by the rule that references this fragment.
fragment CHAR_ESCAPE
  : [0nrt'"\\]                                           // single-character escapes
  | [xX] HEXIT HEXIT                                     // two hex digits
  | 'u' HEXIT HEXIT HEXIT HEXIT                          // four hex digits
  | 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT  // eight hex digits
  ;

// Literal suffix: any identifier placed directly after a literal.
fragment SUFFIX : IDENT ;

// Character literal: one escape sequence or one plain character between
// single quotes, optionally followed by a suffix. A plain character may not
// be a backslash, a single quote, a newline, a tab or a carriage return.
LIT_CHAR
  : '\''
    ( '\\' CHAR_ESCAPE
    | ~[\\'\n\t\r]
    )
    '\''
    SUFFIX?
  ;

// Byte literal: like LIT_CHAR but prefixed with 'b'. Its escapes are limited
// to the two-hex-digit form and the single-character escapes; the 'u' and 'U'
// forms from CHAR_ESCAPE are not accepted here.
LIT_BYTE
  : 'b\''
    ( '\\' ( [xX] HEXIT HEXIT
           | [nrt\\'"0]
           )
    | ~[\\'\n\t\r]
    )
    '\''
    SUFFIX?
  ;

// Integer literal in decimal, binary ('0b'), octal ('0o') or hexadecimal
// ('0x') form. Underscores may appear after the first digit, and each form
// accepts an optional suffix.
LIT_INTEGER
  : [0-9] [0-9_]* SUFFIX?            // decimal
  | '0b' [01] [01_]* SUFFIX?         // binary
  | '0o' [0-7] [0-7_]* SUFFIX?       // octal
  | '0x' HEXIT (HEXIT | '_')* SUFFIX? // hexadecimal
  ;

// Floating-point literal: a run of decimal digits (underscores allowed after
// the first) followed by either
//   - a bare '.', or
//   - an optional fractional part, an optional exponent ('e'/'E', optional
//     sign, digits) and an optional suffix.
//
// NOTE(review): every piece of the second alternative is optional, so this
// rule can also match a plain digit run, the same text as LIT_INTEGER.
// Presumably the tie is resolved in favour of LIT_INTEGER because it is
// declared first -- confirm against ANTLR's rule-precedence behaviour.
// NOTE(review): the bare-'.' alternative looks like it would take the first
// dot of inputs such as `1..2` -- verify against the intended Rust lexing.
LIT_FLOAT
  : [0-9][0-9_]* ('.' | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? SUFFIX?)
  ;

// Double-quoted string literal with an optional suffix. The body is matched
// non-greedily and is built from, in order of preference:
//   - a backslash followed by a line break (LF or CR LF),
//   - a backslash followed by a CHAR_ESCAPE sequence,
//   - any other single character.
// The escape alternatives are listed before '.' so that an escaped quote is
// taken as part of the body rather than as the closing quote.
LIT_STR
  : '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"' SUFFIX?
  ;

// Byte-string literal: an ordinary string literal prefixed with 'b'.
// LIT_STR already ends with an optional SUFFIX, so none is repeated here.
LIT_BINARY : 'b' LIT_STR ;

// Raw byte-string literal: a raw string literal prefixed with 'b', giving
// the `br"..."` / `br#"..."#` spelling. LIT_STR_RAW itself starts with 'r',
// so the previous prefix 'rb' could only ever match `rbr"..."`. As above,
// LIT_STR_RAW already carries the optional SUFFIX.
LIT_BINARY_RAW : 'b' LIT_STR_RAW ;

/* this is a bit messy */

// Raw-string body without the leading 'r': either a quoted run of arbitrary
// characters (matched non-greedily up to a closing quote), or the same thing
// wrapped in one more layer of '#' via LIT_STR_RAW_INNER2.
fragment LIT_STR_RAW_INNER
  : '"' .*? '"'
  | LIT_STR_RAW_INNER2
  ;

// One layer of '#' delimiters around a raw-string body. Together with
// LIT_STR_RAW_INNER this recursion requires the same number of '#' on each
// side of the quotes.
fragment LIT_STR_RAW_INNER2
  : POUND LIT_STR_RAW_INNER POUND
  ;

// Raw string literal: 'r', then the (possibly '#'-wrapped) quoted body, then
// an optional suffix.
LIT_STR_RAW
  : 'r' LIT_STR_RAW_INNER SUFFIX?
  ;

// Identifier: one start character followed by any number of continue
// characters.
IDENT
  : XID_start XID_continue*
  ;

// Lifetime: a single quote directly followed by an identifier.
LIFETIME
  : '\'' IDENT
  ;

// One or more spaces, tabs, carriage returns or newlines.
WHITESPACE
  : [ \t\r\n]+
  ;

// Line comments. Each rule runs to the end of the line (excluding the line
// break) and is re-typed to COMMENT or DOC_COMMENT. The rules overlap: text
// starting with '////' is matched at the same length by three of them, so
// the order below matters. ANTLR resolves equal-length matches in favour of
// the rule declared first, so the more specific prefixes must stay on top.

// Four or more slashes: an ordinary comment, not documentation.
UNDOC_COMMENT
  : '////' ~[\r\n]* -> type(COMMENT)
  ;

// Exactly three slashes: a doc comment.
YESDOC_COMMENT
  : '///' ~[\r\n]* -> type(DOC_COMMENT)
  ;

// '//!' doc comment.
// NOTE(review): Rust calls the '//!' form an *inner* doc comment; the rule
// name says "outer". The name is kept as-is since it is part of the grammar's
// generated interface.
OUTER_DOC_COMMENT
  : '//!' ~[\r\n]* -> type(DOC_COMMENT)
  ;

// Any other '//' comment.
LINE_COMMENT
  : '//' ~[\r\n]* -> type(COMMENT)
  ;

// Block doc comment: opens with '/**' followed by a character other than
// '*' or '/', or with '/*!', and runs to the matching '*/'. The token is
// re-typed to DOC_COMMENT.
//
// The character after '/**' must not be '/'. Otherwise the closing '/' of
// the empty comment '/**/' is consumed as part of the opener, and the rule
// runs on to some later '*/', swallowing everything in between as a doc
// comment. With '/' excluded, '/**/' falls through to BLOCK_COMMENT.
//
// Nested comments recurse through BLOCK_COMMENT, which matches any
// '/* ... */' whether doc-style or not. Recursing only into
// DOC_BLOCK_COMMENT let a nested plain '/* ... */' end the outer doc comment
// at the inner '*/'. Only the outermost rule's type command determines the
// token type, so the result is still DOC_COMMENT.
DOC_BLOCK_COMMENT
  : ('/**' ~[*/] | '/*!') (BLOCK_COMMENT | .)*? '*/' -> type(DOC_COMMENT)
  ;

// Ordinary block comment, re-typed to COMMENT. The body is matched
// non-greedily; a nested block comment is consumed as a unit through the
// recursive reference, so its '*/' does not close the outer comment.
BLOCK_COMMENT
  : '/*'
    ( BLOCK_COMMENT
    | .
    )*?
    '*/'
    -> type(COMMENT)
  ;
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00			`lexer grammar RustLexer;`

First pass at line comment correctness 2014-07-14 16:13:38 -05:00			`tokens {`
			`EQ, LT, LE, EQEQ, NE, GE, GT, ANDAND, OROR, NOT, TILDE, PLUT,`
			`MINUS, STAR, SLASH, PERCENT, CARET, AND, OR, SHL, SHR, BINOP,`
			`BINOPEQ, AT, DOT, DOTDOT, DOTDOTDOT, COMMA, SEMI, COLON,`
			`MOD_SEP, RARROW, FAT_ARROW, LPAREN, RPAREN, LBRACKET, RBRACKET,`
			`LBRACE, RBRACE, POUND, DOLLAR, UNDERSCORE, LIT_CHAR,`
			`LIT_INTEGER, LIT_FLOAT, LIT_STR, LIT_STR_RAW, LIT_BINARY,`
			`LIT_BINARY_RAW, IDENT, LIFETIME, WHITESPACE, DOC_COMMENT,`
			`COMMENT`
			`}`

Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00			`/* Note: due to antlr limitations, we can't represent XID_start and`
			`* XID_continue properly. ASCII-only substitute. */`

			`fragment XID_start : [_a-zA-Z] ;`
			`fragment XID_continue : [_a-zA-Z0-9] ;`

First pass at line comment correctness 2014-07-14 16:13:38 -05:00
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00			`/* Expression-operator symbols */`

			`EQ : '=' ;`
			`LT : '<' ;`
			`LE : '<=' ;`
			`EQEQ : '==' ;`
			`NE : '!=' ;`
			`GE : '>=' ;`
			`GT : '>' ;`
			`ANDAND : '&&' ;`
			`OROR : '\|\|' ;`
			`NOT : '!' ;`
			`TILDE : '~' ;`
			`PLUS : '+' ;`
			`MINUS : '-' ;`
			`STAR : '*' ;`
			`SLASH : '/' ;`
			`PERCENT : '%' ;`
			`CARET : '^' ;`
			`AND : '&' ;`
			`OR : '\|' ;`
			`SHL : '<<' ;`
			`SHR : '>>' ;`

			`BINOP`
			`: PLUS`
Byte/raw binary literal fixes 2014-07-14 22:45:39 -05:00			`\| SLASH`
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00			`\| MINUS`
			`\| STAR`
			`\| PERCENT`
			`\| CARET`
			`\| AND`
			`\| OR`
			`\| SHL`
			`\| SHR`
			`;`

			`BINOPEQ : BINOP EQ ;`

			`/* "Structural symbols" */`

			`AT : '@' ;`
			`DOT : '.' ;`
			`DOTDOT : '..' ;`
			`DOTDOTDOT : '...' ;`
			`COMMA : ',' ;`
			`SEMI : ';' ;`
			`COLON : ':' ;`
			`MOD_SEP : '::' ;`
			`RARROW : '->' ;`
			`FAT_ARROW : '=>' ;`
			`LPAREN : '(' ;`
			`RPAREN : ')' ;`
			`LBRACKET : '[' ;`
			`RBRACKET : ']' ;`
			`LBRACE : '{' ;`
			`RBRACE : '}' ;`
			`POUND : '#';`
			`DOLLAR : '$' ;`
			`UNDERSCORE : '_' ;`

			`// Literals`

			`fragment HEXIT`
			`: [0-9a-fA-F]`
			`;`

			`fragment CHAR_ESCAPE`
			`: [nrt\\'"0]`
			`\| [xX] HEXIT HEXIT`
			`\| 'u' HEXIT HEXIT HEXIT HEXIT`
			`\| 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT`
			`;`

Adjust Antlr4 lexer to include suffixes. This makes the formal lexical grammar (more closely) reflect the one implemented by the compiler. 2014-11-19 03:25:48 -06:00			`fragment SUFFIX`
			`: IDENT`
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00			`;`

Adjust Antlr4 lexer to include suffixes. This makes the formal lexical grammar (more closely) reflect the one implemented by the compiler. 2014-11-19 03:25:48 -06:00			`LIT_CHAR`
			`: '\'' ( '\\' CHAR_ESCAPE \| ~[\\'\n\t\r] ) '\'' SUFFIX?`
Byte/raw binary literal fixes 2014-07-14 22:45:39 -05:00			`;`

Adjust Antlr4 lexer to include suffixes. This makes the formal lexical grammar (more closely) reflect the one implemented by the compiler. 2014-11-19 03:25:48 -06:00			`LIT_BYTE`
			`: 'b\'' ( '\\' ( [xX] HEXIT HEXIT \| [nrt\\'"0] ) \| ~[\\'\n\t\r] ) '\'' SUFFIX?`
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00			`;`

			`LIT_INTEGER`
Adjust Antlr4 lexer to include suffixes. This makes the formal lexical grammar (more closely) reflect the one implemented by the compiler. 2014-11-19 03:25:48 -06:00			`: [0-9][0-9_]* SUFFIX?`
			`\| '0b' [01][01_]* SUFFIX?`
			`\| '0o' [0-7][0-7_]* SUFFIX?`
			`\| '0x' [0-9a-fA-F][0-9a-fA-F_]* SUFFIX?`
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00			`;`

			`LIT_FLOAT`
Adjust Antlr4 lexer to include suffixes. This makes the formal lexical grammar (more closely) reflect the one implemented by the compiler. 2014-11-19 03:25:48 -06:00			`: [0-9][0-9_]* ('.' \| ('.' [0-9][0-9_])? ([eE] [-+]? [0-9][0-9_])? SUFFIX?)`
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00			`;`

			`LIT_STR`
Adjust Antlr4 lexer to include suffixes. This makes the formal lexical grammar (more closely) reflect the one implemented by the compiler. 2014-11-19 03:25:48 -06:00			`: '"' ('\\\n' \| '\\\r\n' \| '\\' CHAR_ESCAPE \| .)*? '"' SUFFIX?`
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00			`;`

Adjust Antlr4 lexer to include suffixes. This makes the formal lexical grammar (more closely) reflect the one implemented by the compiler. 2014-11-19 03:25:48 -06:00			`LIT_BINARY : 'b' LIT_STR SUFFIX?;`
			`LIT_BINARY_RAW : 'rb' LIT_STR_RAW SUFFIX?;`
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00
			`/* this is a bit messy */`

			`fragment LIT_STR_RAW_INNER`
			`: '"' .*? '"'`
			`\| LIT_STR_RAW_INNER2`
			`;`

			`fragment LIT_STR_RAW_INNER2`
			`: POUND LIT_STR_RAW_INNER POUND`
			`;`

			`LIT_STR_RAW`
Adjust Antlr4 lexer to include suffixes. This makes the formal lexical grammar (more closely) reflect the one implemented by the compiler. 2014-11-19 03:25:48 -06:00			`: 'r' LIT_STR_RAW_INNER SUFFIX?`
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00			`;`

			`IDENT : XID_start XID_continue* ;`

			`LIFETIME : '\'' IDENT ;`

			`WHITESPACE : [ \r\n\t]+ ;`

Refine the tooling, handle comments 2014-07-14 19:27:28 -05:00			`UNDOC_COMMENT : '////' ~[\r\n]* -> type(COMMENT) ;`
			`YESDOC_COMMENT : '///' ~[\r\n]* -> type(DOC_COMMENT) ;`
			`OUTER_DOC_COMMENT : '//!' ~[\r\n]* -> type(DOC_COMMENT) ;`
			`LINE_COMMENT : '//' ~[\r\n]* -> type(COMMENT) ;`
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00
First pass at line comment correctness 2014-07-14 16:13:38 -05:00			`DOC_BLOCK_COMMENT`
Byte/raw binary literal fixes 2014-07-14 22:45:39 -05:00			`: ('/*' ~[] \| '/!') (DOC_BLOCK_COMMENT \| .)? '*/' -> type(DOC_COMMENT)`
Lexer; subtly wrong; no makefile 2014-07-14 03:52:18 -05:00			`;`

First pass at line comment correctness 2014-07-14 16:13:38 -05:00			`BLOCK_COMMENT : '/' (BLOCK_COMMENT \| .)? '*/' -> type(COMMENT) ;`