Auto merge of #75642 - matklad:lexer-comments, r=petrochenkov
Move doc comment parsing to rustc_lexer

Plain comments are trivia, while doc comments are not, so it feels like this belongs to the rustc_lexer.

The specific reason to do this is the desire to use rustc_lexer in rustdoc for syntax highlighting, without duplicating "is this a doc comment?" logic there.

r? @ghost
This commit is contained in:
commit
b51651ae9d
@ -1,4 +1,3 @@
|
||||
use crate::ast::AttrStyle;
|
||||
use rustc_span::source_map::SourceMap;
|
||||
use rustc_span::{BytePos, CharPos, FileName, Pos, Symbol};
|
||||
|
||||
@ -24,45 +23,6 @@ pub struct Comment {
|
||||
pub pos: BytePos,
|
||||
}
|
||||
|
||||
/// For a full line comment string returns its doc comment style if it's a doc comment
|
||||
/// and returns `None` if it's a regular comment.
|
||||
pub fn line_doc_comment_style(line_comment: &str) -> Option<AttrStyle> {
|
||||
let line_comment = line_comment.as_bytes();
|
||||
assert!(line_comment.starts_with(b"//"));
|
||||
match line_comment.get(2) {
|
||||
// `//!` is an inner line doc comment.
|
||||
Some(b'!') => Some(AttrStyle::Inner),
|
||||
Some(b'/') => match line_comment.get(3) {
|
||||
// `////` (more than 3 slashes) is not considered a doc comment.
|
||||
Some(b'/') => None,
|
||||
// Otherwise `///` is an outer line doc comment.
|
||||
_ => Some(AttrStyle::Outer),
|
||||
},
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// For a full block comment string returns its doc comment style if it's a doc comment
|
||||
/// and returns `None` if it's a regular comment.
|
||||
pub fn block_doc_comment_style(block_comment: &str, terminated: bool) -> Option<AttrStyle> {
|
||||
let block_comment = block_comment.as_bytes();
|
||||
assert!(block_comment.starts_with(b"/*"));
|
||||
assert!(!terminated || block_comment.ends_with(b"*/"));
|
||||
match block_comment.get(2) {
|
||||
// `/*!` is an inner block doc comment.
|
||||
Some(b'!') => Some(AttrStyle::Inner),
|
||||
Some(b'*') => match block_comment.get(3) {
|
||||
// `/***` (more than 2 stars) is not considered a doc comment.
|
||||
Some(b'*') => None,
|
||||
// `/**/` is not considered a doc comment.
|
||||
Some(b'/') if block_comment.len() == 4 => None,
|
||||
// Otherwise `/**` is an outer block doc comment.
|
||||
_ => Some(AttrStyle::Outer),
|
||||
},
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Makes a doc string more presentable to users.
|
||||
/// Used by rustdoc and perhaps other tools, but not by rustc.
|
||||
pub fn beautify_doc_string(data: Symbol) -> String {
|
||||
@ -216,8 +176,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
|
||||
}
|
||||
}
|
||||
}
|
||||
rustc_lexer::TokenKind::BlockComment { terminated } => {
|
||||
if block_doc_comment_style(token_text, terminated).is_none() {
|
||||
rustc_lexer::TokenKind::BlockComment { doc_style, .. } => {
|
||||
if doc_style.is_none() {
|
||||
let code_to_the_right = match text[pos + token.len..].chars().next() {
|
||||
Some('\r' | '\n') => false,
|
||||
_ => true,
|
||||
@ -238,8 +198,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
|
||||
comments.push(Comment { style, lines, pos: pos_in_file })
|
||||
}
|
||||
}
|
||||
rustc_lexer::TokenKind::LineComment => {
|
||||
if line_doc_comment_style(token_text).is_none() {
|
||||
rustc_lexer::TokenKind::LineComment { doc_style } => {
|
||||
if doc_style.is_none() {
|
||||
comments.push(Comment {
|
||||
style: if code_to_the_left {
|
||||
CommentStyle::Trailing
|
||||
|
@ -1,13 +1,6 @@
|
||||
use super::*;
|
||||
use rustc_span::with_default_session_globals;
|
||||
|
||||
#[test]
|
||||
fn line_doc_comments() {
|
||||
assert!(line_doc_comment_style("///").is_some());
|
||||
assert!(line_doc_comment_style("/// blah").is_some());
|
||||
assert!(line_doc_comment_style("////").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_doc_comment_1() {
|
||||
with_default_session_globals(|| {
|
||||
|
@ -51,12 +51,12 @@ impl Token {
|
||||
pub enum TokenKind {
|
||||
// Multi-char tokens:
|
||||
/// "// comment"
|
||||
LineComment,
|
||||
LineComment { doc_style: Option<DocStyle> },
|
||||
/// `/* block comment */`
|
||||
///
|
||||
/// Block comments can be recursive, so the sequence like `/* /* */`
|
||||
/// will not be considered terminated and will result in a parsing error.
|
||||
BlockComment { terminated: bool },
|
||||
BlockComment { doc_style: Option<DocStyle>, terminated: bool },
|
||||
/// Any whitespace characters sequence.
|
||||
Whitespace,
|
||||
/// "ident" or "continue"
|
||||
@ -129,6 +129,12 @@ pub enum TokenKind {
|
||||
Unknown,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum DocStyle {
|
||||
Outer,
|
||||
Inner,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum LiteralKind {
|
||||
/// "12_u8", "0o100", "0b120i99"
|
||||
@ -188,7 +194,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
|
||||
// a doc comment (due to `TokenKind::(Line,Block)Comment` ambiguity at lexer level),
|
||||
// then it may be valid Rust code, so consider it Rust code.
|
||||
let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok|
|
||||
!matches!(tok, TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment { .. })
|
||||
!matches!(tok, TokenKind::Whitespace | TokenKind::LineComment { .. } | TokenKind::BlockComment { .. })
|
||||
);
|
||||
if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
|
||||
// No other choice than to consider this a shebang.
|
||||
@ -410,13 +416,32 @@ impl Cursor<'_> {
|
||||
fn line_comment(&mut self) -> TokenKind {
|
||||
debug_assert!(self.prev() == '/' && self.first() == '/');
|
||||
self.bump();
|
||||
|
||||
let doc_style = match self.first() {
|
||||
// `//!` is an inner line doc comment.
|
||||
'!' => Some(DocStyle::Inner),
|
||||
// `////` (more than 3 slashes) is not considered a doc comment.
|
||||
'/' if self.second() != '/' => Some(DocStyle::Outer),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
self.eat_while(|c| c != '\n');
|
||||
LineComment
|
||||
LineComment { doc_style }
|
||||
}
|
||||
|
||||
fn block_comment(&mut self) -> TokenKind {
|
||||
debug_assert!(self.prev() == '/' && self.first() == '*');
|
||||
self.bump();
|
||||
|
||||
let doc_style = match self.first() {
|
||||
// `/*!` is an inner block doc comment.
|
||||
'!' => Some(DocStyle::Inner),
|
||||
// `/***` (more than 2 stars) is not considered a doc comment.
|
||||
// `/**/` is not considered a doc comment.
|
||||
'*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let mut depth = 1usize;
|
||||
while let Some(c) = self.bump() {
|
||||
match c {
|
||||
@ -438,7 +463,7 @@ impl Cursor<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
BlockComment { terminated: depth == 0 }
|
||||
BlockComment { doc_style, terminated: depth == 0 }
|
||||
}
|
||||
|
||||
fn whitespace(&mut self) -> TokenKind {
|
||||
|
@ -1,5 +1,5 @@
|
||||
use rustc_ast::ast::AttrStyle;
|
||||
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
|
||||
use rustc_ast::util::comments;
|
||||
use rustc_data_structures::sync::Lrc;
|
||||
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
|
||||
use rustc_lexer::Base;
|
||||
@ -15,7 +15,7 @@ mod tokentrees;
|
||||
mod unescape_error_reporting;
|
||||
mod unicode_chars;
|
||||
|
||||
use rustc_lexer::unescape::Mode;
|
||||
use rustc_lexer::{unescape::Mode, DocStyle};
|
||||
use unescape_error_reporting::{emit_unescape_error, push_escaped_char};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@ -168,25 +168,23 @@ impl<'a> StringReader<'a> {
|
||||
/// symbols and runs additional validation.
|
||||
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind {
|
||||
match token {
|
||||
rustc_lexer::TokenKind::LineComment => {
|
||||
let string = self.str_from(start);
|
||||
if let Some(attr_style) = comments::line_doc_comment_style(string) {
|
||||
self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
|
||||
// Opening delimiter of the length 3 is not included into the symbol.
|
||||
token::DocComment(CommentKind::Line, attr_style, Symbol::intern(&string[3..]))
|
||||
} else {
|
||||
token::Comment
|
||||
rustc_lexer::TokenKind::LineComment { doc_style } => {
|
||||
match doc_style {
|
||||
Some(doc_style) => {
|
||||
// Opening delimiter of the length 3 is not included into the symbol.
|
||||
let content_start = start + BytePos(3);
|
||||
let content = self.str_from(content_start);
|
||||
|
||||
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
|
||||
}
|
||||
None => token::Comment,
|
||||
}
|
||||
}
|
||||
rustc_lexer::TokenKind::BlockComment { terminated } => {
|
||||
let string = self.str_from(start);
|
||||
let attr_style = comments::block_doc_comment_style(string, terminated);
|
||||
|
||||
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
|
||||
if !terminated {
|
||||
let msg = if attr_style.is_some() {
|
||||
"unterminated block doc-comment"
|
||||
} else {
|
||||
"unterminated block comment"
|
||||
let msg = match doc_style {
|
||||
Some(_) => "unterminated block doc-comment",
|
||||
None => "unterminated block comment",
|
||||
};
|
||||
let last_bpos = self.pos;
|
||||
self.sess
|
||||
@ -199,18 +197,17 @@ impl<'a> StringReader<'a> {
|
||||
.emit();
|
||||
FatalError.raise();
|
||||
}
|
||||
match doc_style {
|
||||
Some(doc_style) => {
|
||||
// Opening delimiter of the length 3 and closing delimiter of the length 2
|
||||
// are not included into the symbol.
|
||||
let content_start = start + BytePos(3);
|
||||
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
|
||||
let content = self.str_from_to(content_start, content_end);
|
||||
|
||||
if let Some(attr_style) = attr_style {
|
||||
self.forbid_bare_cr(start, string, "bare CR not allowed in block doc-comment");
|
||||
// Opening delimiter of the length 3 and closing delimiter of the length 2
|
||||
// are not included into the symbol.
|
||||
token::DocComment(
|
||||
CommentKind::Block,
|
||||
attr_style,
|
||||
Symbol::intern(&string[3..string.len() - if terminated { 2 } else { 0 }]),
|
||||
)
|
||||
} else {
|
||||
token::Comment
|
||||
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
|
||||
}
|
||||
None => token::Comment,
|
||||
}
|
||||
}
|
||||
rustc_lexer::TokenKind::Whitespace => token::Whitespace,
|
||||
@ -319,6 +316,34 @@ impl<'a> StringReader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
fn cook_doc_comment(
|
||||
&self,
|
||||
content_start: BytePos,
|
||||
content: &str,
|
||||
comment_kind: CommentKind,
|
||||
doc_style: DocStyle,
|
||||
) -> TokenKind {
|
||||
if content.contains('\r') {
|
||||
for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
|
||||
self.err_span_(
|
||||
content_start + BytePos(idx as u32),
|
||||
content_start + BytePos(idx as u32 + 1),
|
||||
match comment_kind {
|
||||
CommentKind::Line => "bare CR not allowed in doc-comment",
|
||||
CommentKind::Block => "bare CR not allowed in block doc-comment",
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let attr_style = match doc_style {
|
||||
DocStyle::Outer => AttrStyle::Outer,
|
||||
DocStyle::Inner => AttrStyle::Inner,
|
||||
};
|
||||
|
||||
token::DocComment(comment_kind, attr_style, Symbol::intern(content))
|
||||
}
|
||||
|
||||
fn cook_lexer_literal(
|
||||
&self,
|
||||
start: BytePos,
|
||||
@ -472,17 +497,6 @@ impl<'a> StringReader<'a> {
|
||||
&self.src[self.src_index(start)..self.src_index(end)]
|
||||
}
|
||||
|
||||
fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
|
||||
let mut idx = 0;
|
||||
loop {
|
||||
idx = match s[idx..].find('\r') {
|
||||
None => break,
|
||||
Some(it) => idx + it + 1,
|
||||
};
|
||||
self.err_span_(start + BytePos(idx as u32 - 1), start + BytePos(idx as u32), errmsg);
|
||||
}
|
||||
}
|
||||
|
||||
fn report_raw_str_error(&self, start: BytePos, opt_err: Option<RawStrError>) {
|
||||
match opt_err {
|
||||
Some(RawStrError::InvalidStarter { bad_char }) => {
|
||||
|
Loading…
x
Reference in New Issue
Block a user