Auto merge of #91393 - Julian-Wollersberger:lexer_optimization, r=petrochenkov
Optimize `rustc_lexer`. The `cursor.first()` method in `rustc_lexer` now calls the `chars.next()` method instead of `chars.nth(0)` (via the removed `nth_char(0)` helper). This allows LLVM to optimize the code better. The biggest win is that `eat_while()` is now fully inlined and generates better assembly. This improves the lexer's performance by 35% in a micro-benchmark I made (lexing all 18 MB of code in the compiler directory). But lexing is only a small part of the overall compilation time, so I don't know how significant it is. Big thanks to criterion and `cargo asm`.
This commit is contained in:
commit
2a9e0831d6
@ -2,10 +2,11 @@
|
|||||||
|
|
||||||
/// Peekable iterator over a char sequence.
|
/// Peekable iterator over a char sequence.
|
||||||
///
|
///
|
||||||
/// Next characters can be peeked via `nth_char` method,
|
/// Next characters can be peeked via `first` method,
|
||||||
/// and position can be shifted forward via `bump` method.
|
/// and position can be shifted forward via `bump` method.
|
||||||
pub(crate) struct Cursor<'a> {
|
pub(crate) struct Cursor<'a> {
|
||||||
initial_len: usize,
|
initial_len: usize,
|
||||||
|
/// Iterator over chars. Slightly faster than a &str.
|
||||||
chars: Chars<'a>,
|
chars: Chars<'a>,
|
||||||
#[cfg(debug_assertions)]
|
#[cfg(debug_assertions)]
|
||||||
prev: char,
|
prev: char,
|
||||||
@ -37,22 +38,21 @@ pub(crate) fn prev(&self) -> char {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns nth character relative to the current cursor position.
|
/// Peeks the next symbol from the input stream without consuming it.
|
||||||
/// If requested position doesn't exist, `EOF_CHAR` is returned.
|
/// If requested position doesn't exist, `EOF_CHAR` is returned.
|
||||||
/// However, getting `EOF_CHAR` doesn't always mean actual end of file,
|
/// However, getting `EOF_CHAR` doesn't always mean actual end of file,
|
||||||
/// it should be checked with `is_eof` method.
|
/// it should be checked with `is_eof` method.
|
||||||
fn nth_char(&self, n: usize) -> char {
|
|
||||||
self.chars().nth(n).unwrap_or(EOF_CHAR)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Peeks the next symbol from the input stream without consuming it.
|
|
||||||
pub(crate) fn first(&self) -> char {
|
pub(crate) fn first(&self) -> char {
|
||||||
self.nth_char(0)
|
// `.next()` optimizes better than `.nth(0)`
|
||||||
|
self.chars.clone().next().unwrap_or(EOF_CHAR)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Peeks the second symbol from the input stream without consuming it.
|
/// Peeks the second symbol from the input stream without consuming it.
|
||||||
pub(crate) fn second(&self) -> char {
|
pub(crate) fn second(&self) -> char {
|
||||||
self.nth_char(1)
|
// `.next()` optimizes better than `.nth(1)`
|
||||||
|
let mut iter = self.chars.clone();
|
||||||
|
iter.next();
|
||||||
|
iter.next().unwrap_or(EOF_CHAR)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Checks if there is nothing more to consume.
|
/// Checks if there is nothing more to consume.
|
||||||
@ -65,9 +65,9 @@ pub(crate) fn len_consumed(&self) -> usize {
|
|||||||
self.initial_len - self.chars.as_str().len()
|
self.initial_len - self.chars.as_str().len()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a `Chars` iterator over the remaining characters.
|
/// Resets the number of bytes consumed to 0.
|
||||||
fn chars(&self) -> Chars<'a> {
|
pub(crate) fn reset_len_consumed(&mut self) {
|
||||||
self.chars.clone()
|
self.initial_len = self.chars.as_str().len();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Moves to the next character.
|
/// Moves to the next character.
|
||||||
@ -81,4 +81,13 @@ pub(crate) fn bump(&mut self) -> Option<char> {
|
|||||||
|
|
||||||
Some(c)
|
Some(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Eats symbols while predicate returns true or until the end of file is reached.
|
||||||
|
pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
|
||||||
|
// An optimized version of this was tried for e.g. line comments, but
|
||||||
|
// LLVM can inline all of this and compile it down to fast iteration over bytes.
|
||||||
|
while predicate(self.first()) && !self.is_eof() {
|
||||||
|
self.bump();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -227,14 +227,15 @@ pub fn first_token(input: &str) -> Token {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Creates an iterator that produces tokens from the input string.
|
/// Creates an iterator that produces tokens from the input string.
|
||||||
pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
|
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
|
||||||
|
let mut cursor = Cursor::new(input);
|
||||||
std::iter::from_fn(move || {
|
std::iter::from_fn(move || {
|
||||||
if input.is_empty() {
|
if cursor.is_eof() {
|
||||||
return None;
|
None
|
||||||
|
} else {
|
||||||
|
cursor.reset_len_consumed();
|
||||||
|
Some(cursor.advance_token())
|
||||||
}
|
}
|
||||||
let token = first_token(input);
|
|
||||||
input = &input[token.len..];
|
|
||||||
Some(token)
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -832,11 +833,4 @@ fn eat_identifier(&mut self) {
|
|||||||
|
|
||||||
self.eat_while(is_id_continue);
|
self.eat_while(is_id_continue);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Eats symbols while predicate returns true or until the end of file is reached.
|
|
||||||
fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
|
|
||||||
while predicate(self.first()) && !self.is_eof() {
|
|
||||||
self.bump();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user