From addfd8d9e8518a64f5f01940e60a0a5201a89c9d Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sun, 14 Nov 2021 16:47:13 +0300
Subject: [PATCH 01/11] start SOA parser interface

---
 crates/parser/src/lib.rs    |  1 +
 crates/parser/src/tokens.rs | 58 +++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 crates/parser/src/tokens.rs

diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs
index 51ee9692048..720ecf6fb62 100644
--- a/crates/parser/src/lib.rs
+++ b/crates/parser/src/lib.rs
@@ -20,6 +20,7 @@ mod syntax_kind;
 mod event;
 mod parser;
 mod grammar;
+mod tokens;
 
 pub(crate) use token_set::TokenSet;
 
diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
new file mode 100644
index 00000000000..053d90a1724
--- /dev/null
+++ b/crates/parser/src/tokens.rs
@@ -0,0 +1,58 @@
+use crate::SyntaxKind;
+
+type bits = u64;
+
+pub type IdentKind = u8;
+
+/// Main input to the parser.
+///
+/// A sequence of tokens represented internally as a struct of arrays.
+#[derive(Default)]
+pub struct Tokens {
+    kind: Vec<SyntaxKind>,
+    joint: Vec<bits>,
+    ident_kind: Vec<IdentKind>,
+}
+
+impl Tokens {
+    pub fn push(&mut self, was_joint: bool, kind: SyntaxKind) {
+        self.push_impl(was_joint, kind, 0)
+    }
+    pub fn push_ident(&mut self, ident_kind: IdentKind) {
+        self.push_impl(false, SyntaxKind::IDENT, ident_kind)
+    }
+    fn push_impl(&mut self, was_joint: bool, kind: SyntaxKind, ctx: IdentKind) {
+        let idx = self.len();
+        if idx % (bits::BITS as usize) == 0 {
+            self.joint.push(0);
+        }
+        if was_joint && idx > 0 {
+            self.set_joint(idx - 1);
+        }
+        self.kind.push(kind);
+        self.ident_kind.push(ctx);
+    }
+    fn set_joint(&mut self, n: usize) {
+        let (idx, b_idx) = self.bit_index(n);
+        self.joint[idx] |= 1 << b_idx;
+    }
+    fn get_joint(&self, n: usize) -> bool {
+        let (idx, b_idx) = self.bit_index(n);
+        self.joint[idx] & 1 << b_idx != 0
+    }
+    fn bit_index(&self, n: usize) -> (usize, usize) {
+        let idx = n / (bits::BITS as usize);
+        let b_idx = n % (bits::BITS as usize);
+        (idx, b_idx)
+    }
+
+    pub fn len(&self) -> usize {
+        self.kind.len()
+    }
+    pub(crate) fn get(&self, idx: usize) -> Option<(SyntaxKind, bool, IdentKind)> {
+        let kind = *self.kind.get(idx)?;
+        let joint = self.get_joint(idx);
+        let ident_kind = *self.ident_kind.get(idx)?;
+        Some((kind, joint, ident_kind))
+    }
+}

From d5ad0f3ca03e4ffbdfafb46a5e12d23059bc760e Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sun, 14 Nov 2021 21:37:10 +0300
Subject: [PATCH 02/11] use eof token pattern

---
 crates/parser/src/tokens.rs | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index 053d90a1724..2fd807f9b3c 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -49,10 +49,18 @@ impl Tokens {
     pub fn len(&self) -> usize {
         self.kind.len()
     }
-    pub(crate) fn get(&self, idx: usize) -> Option<(SyntaxKind, bool, IdentKind)> {
-        let kind = *self.kind.get(idx)?;
+    pub(crate) fn get(&self, idx: usize) -> (SyntaxKind, bool, IdentKind) {
+        if idx > self.len() {
+            return self.eof();
+        }
+        let kind = self.kind[idx];
         let joint = self.get_joint(idx);
-        let ident_kind = *self.ident_kind.get(idx)?;
-        Some((kind, joint, ident_kind))
+        let ident_kind = self.ident_kind[idx];
+        (kind, joint, ident_kind)
+    }
+
+    #[cold]
+    fn eof(&self) -> (SyntaxKind, bool, IdentKind) {
+        (SyntaxKind::EOF, false, 0)
     }
 }

From 26bfd6023ffbc7fbd66bc4857e6c74b35e7fc9b4 Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sun, 14 Nov 2021 22:13:44 +0300
Subject: [PATCH 03/11] Switch parser to use tokens

---
 crates/parser/src/grammar/expressions.rs |  5 +---
 crates/parser/src/grammar/items.rs       | 12 ++++-----
 crates/parser/src/grammar/items/adt.rs   |  2 +-
 crates/parser/src/lib.rs                 | 19 +++++++-------
 crates/parser/src/parser.rs              | 33 ++++++++++++------------
 crates/parser/src/tokens.rs              | 29 ++++++++++-----------
 6 files changed, 47 insertions(+), 53 deletions(-)

diff --git a/crates/parser/src/grammar/expressions.rs b/crates/parser/src/grammar/expressions.rs
index 54eb96d84e5..4b9c579a052 100644
--- a/crates/parser/src/grammar/expressions.rs
+++ b/crates/parser/src/grammar/expressions.rs
@@ -296,10 +296,7 @@ fn lhs(p: &mut Parser, r: Restrictions) -> Option<(CompletedMarker, BlockLike)>
         T![&] => {
             m = p.start();
             p.bump(T![&]);
-            if p.at(IDENT)
-                && p.at_contextual_kw("raw")
-                && (p.nth_at(1, T![mut]) || p.nth_at(1, T![const]))
-            {
+            if p.at_contextual_kw(T![raw]) && (p.nth_at(1, T![mut]) || p.nth_at(1, T![const])) {
                 p.bump_remap(T![raw]);
                 p.bump_any();
             } else {
diff --git a/crates/parser/src/grammar/items.rs b/crates/parser/src/grammar/items.rs
index 39be0e1a192..896efaf3757 100644
--- a/crates/parser/src/grammar/items.rs
+++ b/crates/parser/src/grammar/items.rs
@@ -122,14 +122,14 @@ pub(super) fn opt_item(p: &mut Parser, m: Marker) -> Result<(), Marker> {
         has_mods = true;
         abi(p);
     }
-    if p.at(IDENT) && p.at_contextual_kw("auto") && p.nth(1) == T![trait] {
+    if p.at_contextual_kw(T![auto]) && p.nth(1) == T![trait] {
         p.bump_remap(T![auto]);
         has_mods = true;
     }
 
     // test default_item
     // default impl T for Foo {}
-    if p.at(IDENT) && p.at_contextual_kw("default") {
+    if p.at_contextual_kw(T![default]) {
         match p.nth(1) {
             T![fn] | T![type] | T![const] | T![impl] => {
                 p.bump_remap(T![default]);
@@ -176,7 +176,7 @@ pub(super) fn opt_item(p: &mut Parser, m: Marker) -> Result<(), Marker> {
 
     // test existential_type
     // existential type Foo: Fn() -> usize;
-    if p.at(IDENT) && p.at_contextual_kw("existential") && p.nth(1) == T![type] {
+    if p.at_contextual_kw(T![existential]) && p.nth(1) == T![type] {
         p.bump_remap(T![existential]);
         has_mods = true;
     }
@@ -224,10 +224,10 @@ fn opt_item_without_modifiers(p: &mut Parser, m: Marker) -> Result<(), Marker> {
         T![type] => type_alias(p, m),
         T![struct] => adt::strukt(p, m),
         T![enum] => adt::enum_(p, m),
-        IDENT if p.at_contextual_kw("union") && p.nth(1) == IDENT => adt::union(p, m),
+        IDENT if p.at_contextual_kw(T![union]) && p.nth(1) == IDENT => adt::union(p, m),
 
         T![macro] => macro_def(p, m),
-        IDENT if p.at_contextual_kw("macro_rules") && p.nth(1) == BANG => macro_rules(p, m),
+        IDENT if p.at_contextual_kw(T![macro_rules]) && p.nth(1) == BANG => macro_rules(p, m),
 
         T![const] if (la == IDENT || la == T![_] || la == T![mut]) => consts::konst(p, m),
         T![static] => consts::static_(p, m),
@@ -319,7 +319,7 @@ pub(crate) fn extern_item_list(p: &mut Parser) {
 }
 
 fn macro_rules(p: &mut Parser, m: Marker) {
-    assert!(p.at_contextual_kw("macro_rules"));
+    assert!(p.at_contextual_kw(T![macro_rules]));
     p.bump_remap(T![macro_rules]);
 
     p.expect(T![!]);
diff --git a/crates/parser/src/grammar/items/adt.rs b/crates/parser/src/grammar/items/adt.rs
index c5bd5b14bae..83b7ff05786 100644
--- a/crates/parser/src/grammar/items/adt.rs
+++ b/crates/parser/src/grammar/items/adt.rs
@@ -10,7 +10,7 @@ pub(super) fn strukt(p: &mut Parser, m: Marker) {
 // test union_item
 // struct U { i: i32, f: f32 }
 pub(super) fn union(p: &mut Parser, m: Marker) {
-    assert!(p.at_contextual_kw("union"));
+    assert!(p.at_contextual_kw(T![union]));
     p.bump_remap(T![union]);
     struct_or_union(p, m, false);
 }
diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs
index 720ecf6fb62..fd447194bf9 100644
--- a/crates/parser/src/lib.rs
+++ b/crates/parser/src/lib.rs
@@ -26,6 +26,8 @@ mod tokens;
 
 pub(crate) use token_set::TokenSet;
 
 pub use syntax_kind::SyntaxKind;
 
+use crate::tokens::Tokens;
+
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct ParseError(pub Box<String>);
@@ -53,6 +55,7 @@ pub struct Token {
 
     /// Is the current token joined to the next one (`> >` vs `>>`).
     pub is_jointed_to_next: bool,
+    pub contextual_kw: SyntaxKind,
 }
 
 /// `TreeSink` abstracts details of a particular syntax tree implementation.
 pub trait TreeSink {
     /// Adds new token to the current branch.
@@ -93,15 +96,11 @@ pub enum ParserEntryPoint {
 }
 
 /// Parse given tokens into the given sink as a rust file.
-pub fn parse_source_file(token_source: &mut dyn TokenSource, tree_sink: &mut dyn TreeSink) {
-    parse(token_source, tree_sink, ParserEntryPoint::SourceFile);
+pub fn parse_source_file(tokens: &Tokens, tree_sink: &mut dyn TreeSink) {
+    parse(tokens, tree_sink, ParserEntryPoint::SourceFile);
 }
 
-pub fn parse(
-    token_source: &mut dyn TokenSource,
-    tree_sink: &mut dyn TreeSink,
-    entry_point: ParserEntryPoint,
-) {
+pub fn parse(tokens: &Tokens, tree_sink: &mut dyn TreeSink, entry_point: ParserEntryPoint) {
     let entry_point: fn(&'_ mut parser::Parser) = match entry_point {
         ParserEntryPoint::SourceFile => grammar::entry_points::source_file,
         ParserEntryPoint::Path => grammar::entry_points::path,
@@ -119,7 +118,7 @@ pub fn parse(
         ParserEntryPoint::Attr => grammar::entry_points::attr,
     };
 
-    let mut p = parser::Parser::new(token_source);
+    let mut p = parser::Parser::new(tokens);
     entry_point(&mut p);
     let events = p.finish();
     event::process(tree_sink, events);
@@ -142,9 +141,9 @@ impl Reparser {
     ///
     /// Tokens must start with `{`, end with `}` and form a valid brace
     /// sequence.
-    pub fn parse(self, token_source: &mut dyn TokenSource, tree_sink: &mut dyn TreeSink) {
+    pub fn parse(self, tokens: &Tokens, tree_sink: &mut dyn TreeSink) {
         let Reparser(r) = self;
-        let mut p = parser::Parser::new(token_source);
+        let mut p = parser::Parser::new(tokens);
         r(&mut p);
         let events = p.finish();
         event::process(tree_sink, events);
diff --git a/crates/parser/src/parser.rs b/crates/parser/src/parser.rs
index 44c5f8e12f5..759f87f4966 100644
--- a/crates/parser/src/parser.rs
+++ b/crates/parser/src/parser.rs
@@ -7,9 +7,10 @@ use limit::Limit;
 
 use crate::{
     event::Event,
+    tokens::Tokens,
     ParseError,
     SyntaxKind::{self, EOF, ERROR, TOMBSTONE},
-    TokenSet, TokenSource, T,
+    TokenSet, T,
 };
 
/// `Parser` struct provides the low-level API for
@@ -22,7 +23,8 @@ use crate::{
 /// "start expression, consume number literal,
 /// finish expression". See `Event` docs for more.
 pub(crate) struct Parser<'t> {
-    token_source: &'t mut dyn TokenSource,
+    tokens: &'t Tokens,
+    pos: usize,
     events: Vec<Event>,
     steps: Cell<u32>,
 }
 
@@ -30,8 +32,8 @@ pub(crate) struct Parser<'t> {
 static PARSER_STEP_LIMIT: Limit = Limit::new(15_000_000);
 
 impl<'t> Parser<'t> {
-    pub(super) fn new(token_source: &'t mut dyn TokenSource) -> Parser<'t> {
-        Parser { token_source, events: Vec::new(), steps: Cell::new(0) }
+    pub(super) fn new(tokens: &'t Tokens) -> Parser<'t> {
+        Parser { tokens, pos: 0, events: Vec::new(), steps: Cell::new(0) }
     }
 
     pub(crate) fn finish(self) -> Vec<Event> {
@@ -54,7 +56,7 @@ impl<'t> Parser<'t> {
         assert!(PARSER_STEP_LIMIT.check(steps as usize).is_ok(), "the parser seems stuck");
         self.steps.set(steps + 1);
 
-        self.token_source.lookahead_nth(n).kind
+        self.tokens.get(self.pos + n).kind
     }
 
     /// Checks if the current token is `kind`.
@@ -90,7 +92,7 @@ impl<'t> Parser<'t> {
             T![<<=] => self.at_composite3(n, T![<], T![<], T![=]),
             T![>>=] => self.at_composite3(n, T![>], T![>], T![=]),
 
-            _ => self.token_source.lookahead_nth(n).kind == kind,
+            _ => self.tokens.get(self.pos + n).kind == kind,
         }
     }
 
@@ -129,24 +131,24 @@ impl<'t> Parser<'t> {
     }
 
     fn at_composite2(&self, n: usize, k1: SyntaxKind, k2: SyntaxKind) -> bool {
-        let t1 = self.token_source.lookahead_nth(n);
+        let t1 = self.tokens.get(self.pos + n);
         if t1.kind != k1 || !t1.is_jointed_to_next {
             return false;
         }
-        let t2 = self.token_source.lookahead_nth(n + 1);
+        let t2 = self.tokens.get(self.pos + n + 1);
         t2.kind == k2
     }
 
     fn at_composite3(&self, n: usize, k1: SyntaxKind, k2: SyntaxKind, k3: SyntaxKind) -> bool {
-        let t1 = self.token_source.lookahead_nth(n);
+        let t1 = self.tokens.get(self.pos + n);
         if t1.kind != k1 || !t1.is_jointed_to_next {
             return false;
         }
-        let t2 = self.token_source.lookahead_nth(n + 1);
+        let t2 = self.tokens.get(self.pos + n + 1);
         if t2.kind != k2 || !t2.is_jointed_to_next {
             return false;
         }
-        let t3 = self.token_source.lookahead_nth(n + 2);
+        let t3 = self.tokens.get(self.pos + n + 2);
         t3.kind == k3
     }
 
@@ -156,8 +158,8 @@ impl<'t> Parser<'t> {
     }
 
     /// Checks if the current token is contextual keyword with text `t`.
-    pub(crate) fn at_contextual_kw(&self, kw: &str) -> bool {
-        self.token_source.is_keyword(kw)
+    pub(crate) fn at_contextual_kw(&self, kw: SyntaxKind) -> bool {
+        self.tokens.get(self.pos).contextual_kw == kw
     }
 
     /// Starts a new node in the syntax tree. All nodes and tokens
@@ -243,10 +245,7 @@ impl<'t> Parser<'t> {
     }
 
     fn do_bump(&mut self, kind: SyntaxKind, n_raw_tokens: u8) {
-        for _ in 0..n_raw_tokens {
-            self.token_source.bump();
-        }
-
+        self.pos += n_raw_tokens as usize;
         self.push_event(Event::Token { kind, n_raw_tokens });
     }
 
diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index 2fd807f9b3c..495d9713ea9 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -1,9 +1,8 @@
-use crate::SyntaxKind;
+use crate::{SyntaxKind, Token};
 
+#[allow(non_camel_case_types)]
 type bits = u64;
 
-pub type IdentKind = u8;
-
 /// Main input to the parser.
 ///
 /// A sequence of tokens represented internally as a struct of arrays.
@@ -11,17 +10,17 @@ pub type IdentKind = u8;
 pub struct Tokens {
     kind: Vec<SyntaxKind>,
     joint: Vec<bits>,
-    ident_kind: Vec<IdentKind>,
+    contextual_kw: Vec<SyntaxKind>,
 }
 
 impl Tokens {
     pub fn push(&mut self, was_joint: bool, kind: SyntaxKind) {
-        self.push_impl(was_joint, kind, 0)
+        self.push_impl(was_joint, kind, SyntaxKind::EOF)
     }
-    pub fn push_ident(&mut self, ident_kind: IdentKind) {
-        self.push_impl(false, SyntaxKind::IDENT, ident_kind)
+    pub fn push_ident(&mut self, contextual_kw: SyntaxKind) {
+        self.push_impl(false, SyntaxKind::IDENT, contextual_kw)
     }
-    fn push_impl(&mut self, was_joint: bool, kind: SyntaxKind, ctx: IdentKind) {
+    fn push_impl(&mut self, was_joint: bool, kind: SyntaxKind, contextual_kw: SyntaxKind) {
         let idx = self.len();
         if idx % (bits::BITS as usize) == 0 {
             self.joint.push(0);
@@ -30,7 +29,7 @@ impl Tokens {
             self.set_joint(idx - 1);
         }
         self.kind.push(kind);
-        self.ident_kind.push(ctx);
+        self.contextual_kw.push(contextual_kw);
     }
     fn set_joint(&mut self, n: usize) {
         let (idx, b_idx) = self.bit_index(n);
@@ -49,18 +48,18 @@ impl Tokens {
     pub fn len(&self) -> usize {
         self.kind.len()
     }
-    pub(crate) fn get(&self, idx: usize) -> (SyntaxKind, bool, IdentKind) {
+    pub(crate) fn get(&self, idx: usize) -> Token {
         if idx > self.len() {
             return self.eof();
         }
         let kind = self.kind[idx];
-        let joint = self.get_joint(idx);
-        let ident_kind = self.ident_kind[idx];
-        (kind, joint, ident_kind)
+        let is_jointed_to_next = self.get_joint(idx);
+        let contextual_kw = self.contextual_kw[idx];
+        Token { kind, is_jointed_to_next, contextual_kw }
     }
 
     #[cold]
-    fn eof(&self) -> (SyntaxKind, bool, IdentKind) {
-        (SyntaxKind::EOF, false, 0)
+    fn eof(&self) -> Token {
+        Token { kind: SyntaxKind::EOF, is_jointed_to_next: false, contextual_kw: SyntaxKind::EOF }
     }
 }

From 6ce587ba5aa9fca9330edfe369704e7c3c805c75 Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sun, 12 Dec 2021 17:58:45 +0300
Subject: [PATCH 04/11] parser tests work

---
 crates/parser/src/lib.rs                    | 31 +------
 crates/parser/src/syntax_kind/generated.rs  | 12 +++
 crates/parser/src/tokens.rs                 | 26 ++++--
 crates/syntax/src/parsing.rs                | 45 +++++++---
 crates/syntax/src/parsing/reparsing.rs      | 12 +--
 .../syntax/src/parsing/text_token_source.rs | 82 -------------------
 crates/syntax/src/parsing/text_tree_sink.rs | 12 ++-
 crates/syntax/src/tests/sourcegen_ast.rs    | 12 +++
 8 files changed, 92 insertions(+), 140 deletions(-)
 delete mode 100644 crates/syntax/src/parsing/text_token_source.rs

diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs
index fd447194bf9..1e9f59fa530 100644
--- a/crates/parser/src/lib.rs
+++ b/crates/parser/src/lib.rs
@@ -24,40 +24,11 @@ mod tokens;
 
 pub(crate) use token_set::TokenSet;
 
-pub use syntax_kind::SyntaxKind;
-
-use crate::tokens::Tokens;
+pub use crate::{syntax_kind::SyntaxKind, tokens::Tokens};
 
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct ParseError(pub Box<String>);
 
-/// `TokenSource` abstracts the source of the tokens parser operates on.
-///
-/// Hopefully this will allow us to treat text and token trees in the same way!
-pub trait TokenSource {
-    fn current(&self) -> Token;
-
-    /// Lookahead n token
-    fn lookahead_nth(&self, n: usize) -> Token;
-
-    /// bump cursor to next token
-    fn bump(&mut self);
-
-    /// Is the current token a specified keyword?
-    fn is_keyword(&self, kw: &str) -> bool;
-}
-
-/// `Token` abstracts the cursor of `TokenSource` operates on.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-pub struct Token {
-    /// What is the current token?
-    pub kind: SyntaxKind,
-
-    /// Is the current token joined to the next one (`> >` vs `>>`).
-    pub is_jointed_to_next: bool,
-    pub contextual_kw: SyntaxKind,
-}
-
 /// `TreeSink` abstracts details of a particular syntax tree implementation.
 pub trait TreeSink {
     /// Adds new token to the current branch.
diff --git a/crates/parser/src/syntax_kind/generated.rs b/crates/parser/src/syntax_kind/generated.rs
index 99e7651906a..601a5792afd 100644
--- a/crates/parser/src/syntax_kind/generated.rs
+++ b/crates/parser/src/syntax_kind/generated.rs
@@ -334,6 +334,18 @@ impl SyntaxKind {
         };
         Some(kw)
     }
+    pub fn from_contextual_keyword(ident: &str) -> Option<SyntaxKind> {
+        let kw = match ident {
+            "auto" => AUTO_KW,
+            "default" => DEFAULT_KW,
+            "existential" => EXISTENTIAL_KW,
+            "union" => UNION_KW,
+            "raw" => RAW_KW,
+            "macro_rules" => MACRO_RULES_KW,
+            _ => return None,
+        };
+        Some(kw)
+    }
     pub fn from_char(c: char) -> Option<SyntaxKind> {
         let tok = match c {
             ';' => SEMICOLON,
diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index 495d9713ea9..4f10956070f 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -1,8 +1,19 @@
-use crate::{SyntaxKind, Token};
+use crate::SyntaxKind;
 
 #[allow(non_camel_case_types)]
 type bits = u64;
 
+/// `Token` abstracts the cursor that `TokenSource` operates on.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub(crate) struct Token {
+    /// What is the current token?
+    pub(crate) kind: SyntaxKind,
+
+    /// Is the current token joined to the next one (`> >` vs `>>`).
+    pub(crate) is_jointed_to_next: bool,
+    pub(crate) contextual_kw: SyntaxKind,
+}
+
 /// Main input to the parser.
 ///
 /// A sequence of tokens represented internally as a struct of arrays.
@@ -49,13 +60,14 @@ impl Tokens {
         self.kind.len()
     }
     pub(crate) fn get(&self, idx: usize) -> Token {
-        if idx > self.len() {
-            return self.eof();
+        if idx < self.len() {
+            let kind = self.kind[idx];
+            let is_jointed_to_next = self.get_joint(idx);
+            let contextual_kw = self.contextual_kw[idx];
+            Token { kind, is_jointed_to_next, contextual_kw }
+        } else {
+            self.eof()
         }
-        let kind = self.kind[idx];
-        let is_jointed_to_next = self.get_joint(idx);
-        let contextual_kw = self.contextual_kw[idx];
-        Token { kind, is_jointed_to_next, contextual_kw }
     }
 
     #[cold]
diff --git a/crates/syntax/src/parsing.rs b/crates/syntax/src/parsing.rs
index a45f262877e..652668e80b2 100644
--- a/crates/syntax/src/parsing.rs
+++ b/crates/syntax/src/parsing.rs
@@ -2,12 +2,10 @@
 //! incremental reparsing.
 
 pub(crate) mod lexer;
-mod text_token_source;
 mod text_tree_sink;
 mod reparsing;
 
 use parser::SyntaxKind;
-use text_token_source::TextTokenSource;
 use text_tree_sink::TextTreeSink;
 
 use crate::{syntax_node::GreenNode, AstNode, SyntaxError, SyntaxNode};
@@ -15,12 +13,12 @@ use crate::{syntax_node::GreenNode, AstNode, SyntaxError, SyntaxNode};
 pub(crate) use crate::parsing::{lexer::*, reparsing::incremental_reparse};
 
 pub(crate) fn parse_text(text: &str) -> (GreenNode, Vec<SyntaxError>) {
-    let (tokens, lexer_errors) = tokenize(text);
+    let (lexer_tokens, lexer_errors) = tokenize(text);
+    let parser_tokens = to_parser_tokens(text, &lexer_tokens);
 
-    let mut token_source = TextTokenSource::new(text, &tokens);
-    let mut tree_sink = TextTreeSink::new(text, &tokens);
+    let mut tree_sink = TextTreeSink::new(text, &lexer_tokens);
 
-    parser::parse_source_file(&mut token_source, &mut tree_sink);
+    parser::parse_source_file(&parser_tokens, &mut tree_sink);
 
     let (tree, mut parser_errors) = tree_sink.finish();
     parser_errors.extend(lexer_errors);
@@ -33,26 +31,47 @@ pub(crate) fn parse_text_as<T: AstNode>(
     text: &str,
     entry_point: parser::ParserEntryPoint,
 ) -> Result<T, ()> {
-    let (tokens, lexer_errors) = tokenize(text);
+    let (lexer_tokens, lexer_errors) = tokenize(text);
     if !lexer_errors.is_empty() {
         return Err(());
     }
 
-    let mut token_source = TextTokenSource::new(text, &tokens);
-    let mut tree_sink = TextTreeSink::new(text, &tokens);
+    let parser_tokens = to_parser_tokens(text, &lexer_tokens);
+
+    let mut tree_sink = TextTreeSink::new(text, &lexer_tokens);
 
     // TextTreeSink assumes that there's at least some root node to which it can attach errors and
     // tokens. We arbitrarily give it a SourceFile.
     use parser::TreeSink;
     tree_sink.start_node(SyntaxKind::SOURCE_FILE);
-    parser::parse(&mut token_source, &mut tree_sink, entry_point);
+    parser::parse(&parser_tokens, &mut tree_sink, entry_point);
     tree_sink.finish_node();
 
-    let (tree, parser_errors) = tree_sink.finish();
-    use parser::TokenSource;
-    if !parser_errors.is_empty() || token_source.current().kind != SyntaxKind::EOF {
+    let (tree, parser_errors, eof) = tree_sink.finish_eof();
+    if !parser_errors.is_empty() || !eof {
         return Err(());
     }
 
     SyntaxNode::new_root(tree).first_child().and_then(T::cast).ok_or(())
 }
+
+pub(crate) fn to_parser_tokens(text: &str, lexer_tokens: &[lexer::Token]) -> ::parser::Tokens {
+    let mut off = 0;
+    let mut res = parser::Tokens::default();
+    let mut was_joint = true;
+    for t in lexer_tokens {
+        if t.kind.is_trivia() {
+            was_joint = false;
+        } else if t.kind == SyntaxKind::IDENT {
+            let token_text = &text[off..][..usize::from(t.len)];
+            let contextual_kw =
+                SyntaxKind::from_contextual_keyword(token_text).unwrap_or(SyntaxKind::IDENT);
+            res.push_ident(contextual_kw);
+        } else {
+            res.push(was_joint, t.kind);
+            was_joint = true;
+        }
+        off += usize::from(t.len);
+    }
+    res
+}
diff --git a/crates/syntax/src/parsing/reparsing.rs b/crates/syntax/src/parsing/reparsing.rs
index 186cc9e74c8..62f39a93472 100644
--- a/crates/syntax/src/parsing/reparsing.rs
+++ b/crates/syntax/src/parsing/reparsing.rs
@@ -12,8 +12,8 @@ use text_edit::Indel;
 use crate::{
     parsing::{
         lexer::{lex_single_syntax_kind, tokenize, Token},
-        text_token_source::TextTokenSource,
         text_tree_sink::TextTreeSink,
+        to_parser_tokens,
     },
     syntax_node::{GreenNode, GreenToken, NodeOrToken, SyntaxElement, SyntaxNode},
     SyntaxError,
@@ -91,14 +91,14 @@ fn reparse_block(
     let (node, reparser) = find_reparsable_node(root, edit.delete)?;
     let text = get_text_after_edit(node.clone().into(), edit);
 
-    let (tokens, new_lexer_errors) = tokenize(&text);
-    if !is_balanced(&tokens) {
+    let (lexer_tokens, new_lexer_errors) = tokenize(&text);
+    if !is_balanced(&lexer_tokens) {
         return None;
     }
+    let parser_tokens = to_parser_tokens(&text, &lexer_tokens);
 
-    let mut token_source = TextTokenSource::new(&text, &tokens);
-    let mut tree_sink = TextTreeSink::new(&text, &tokens);
-    reparser.parse(&mut token_source, &mut tree_sink);
+    let mut tree_sink = TextTreeSink::new(&text, &lexer_tokens);
+    reparser.parse(&parser_tokens, &mut tree_sink);
 
     let (green, mut new_parser_errors) = tree_sink.finish();
     new_parser_errors.extend(new_lexer_errors);
diff --git a/crates/syntax/src/parsing/text_token_source.rs b/crates/syntax/src/parsing/text_token_source.rs
deleted file mode 100644
index 11dfc63a65b..00000000000
--- a/crates/syntax/src/parsing/text_token_source.rs
+++ /dev/null
@@ -1,82 +0,0 @@
-//! See `TextTokenSource` docs.
-
-use parser::TokenSource;
-
-use crate::{parsing::lexer::Token, SyntaxKind::EOF, TextRange, TextSize};
-
-/// Implementation of `parser::TokenSource` that takes tokens from source code text.
-pub(crate) struct TextTokenSource<'t> {
-    text: &'t str,
-    /// token and its start position (non-whitespace/comment tokens)
-    /// ```non-rust
-    /// struct Foo;
-    /// ^------^--^-
-    /// |      |  \________
-    /// |      \____       \
-    /// |           \       |
-    /// (struct, 0) (Foo, 7) (;, 10)
-    /// ```
-    /// `[(struct, 0), (Foo, 7), (;, 10)]`
-    token_offset_pairs: Vec<(Token, TextSize)>,
-
-    /// Current token and position
-    curr: (parser::Token, usize),
-}
-
-impl<'t> TokenSource for TextTokenSource<'t> {
-    fn current(&self) -> parser::Token {
-        self.curr.0
-    }
-
-    fn lookahead_nth(&self, n: usize) -> parser::Token {
-        mk_token(self.curr.1 + n, &self.token_offset_pairs)
-    }
-
-    fn bump(&mut self) {
-        if self.curr.0.kind == EOF {
-            return;
-        }
-
-        let pos = self.curr.1 + 1;
-        self.curr = (mk_token(pos, &self.token_offset_pairs), pos);
-    }
-
-    fn is_keyword(&self, kw: &str) -> bool {
-        self.token_offset_pairs
-            .get(self.curr.1)
-            .map_or(false, |(token, offset)| &self.text[TextRange::at(*offset, token.len)] == kw)
-    }
-}
-
-fn mk_token(pos: usize, token_offset_pairs: &[(Token, TextSize)]) -> parser::Token {
-    let (kind, is_jointed_to_next) = match token_offset_pairs.get(pos) {
-        Some((token, offset)) => (
-            token.kind,
-            token_offset_pairs
-                .get(pos + 1)
-                .map_or(false, |(_, next_offset)| offset + token.len == *next_offset),
-        ),
-        None => (EOF, false),
-    };
-    parser::Token { kind, is_jointed_to_next }
-}
-
-impl<'t> TextTokenSource<'t> {
-    /// Generate input from tokens(expect comment and whitespace).
-    pub(crate) fn new(text: &'t str, raw_tokens: &'t [Token]) -> TextTokenSource<'t> {
-        let token_offset_pairs: Vec<_> = raw_tokens
-            .iter()
-            .filter_map({
-                let mut len = 0.into();
-                move |token| {
-                    let pair = if token.kind.is_trivia() { None } else { Some((*token, len)) };
-                    len += token.len;
-                    pair
-                }
-            })
-            .collect();
-
-        let first = mk_token(0, &token_offset_pairs);
-        TextTokenSource { text, token_offset_pairs, curr: (first, 0) }
-    }
-}
diff --git a/crates/syntax/src/parsing/text_tree_sink.rs b/crates/syntax/src/parsing/text_tree_sink.rs
index 8c1de92048f..c1792199fdc 100644
--- a/crates/syntax/src/parsing/text_tree_sink.rs
+++ b/crates/syntax/src/parsing/text_tree_sink.rs
@@ -104,7 +104,7 @@ impl<'a> TextTreeSink<'a> {
         }
     }
 
-    pub(super) fn finish(mut self) -> (GreenNode, Vec<SyntaxError>) {
+    pub(super) fn finish_eof(mut self) -> (GreenNode, Vec<SyntaxError>, bool) {
         match mem::replace(&mut self.state, State::Normal) {
             State::PendingFinish => {
                 self.eat_trivias();
@@ -113,7 +113,15 @@ impl<'a> TextTreeSink<'a> {
             State::PendingStart | State::Normal => unreachable!(),
         }
 
-        self.inner.finish_raw()
+        let (node, errors) = self.inner.finish_raw();
+        let is_eof = self.token_pos == self.tokens.len();
+
+        (node, errors, is_eof)
+    }
+
+    pub(super) fn finish(self) -> (GreenNode, Vec<SyntaxError>) {
+        let (node, errors, _eof) = self.finish_eof();
+        (node, errors)
     }
 
     fn eat_trivias(&mut self) {
diff --git a/crates/syntax/src/tests/sourcegen_ast.rs b/crates/syntax/src/tests/sourcegen_ast.rs
index dcd813bbe03..c66edadc3ce 100644
--- a/crates/syntax/src/tests/sourcegen_ast.rs
+++ b/crates/syntax/src/tests/sourcegen_ast.rs
@@ -359,6 +359,10 @@ fn generate_syntax_kinds(grammar: KindsSrc<'_>) -> String {
     let full_keywords =
         full_keywords_values.iter().map(|kw| format_ident!("{}_KW", to_upper_snake_case(kw)));
 
+    let contextual_keywords_values = &grammar.contextual_keywords;
+    let contextual_keywords =
+        contextual_keywords_values.iter().map(|kw| format_ident!("{}_KW", to_upper_snake_case(kw)));
+
     let all_keywords_values =
         grammar.keywords.iter().chain(grammar.contextual_keywords.iter()).collect::<Vec<_>>();
     let all_keywords_idents = all_keywords_values.iter().map(|kw| format_ident!("{}", kw));
@@ -428,6 +432,14 @@ fn generate_syntax_kinds(grammar: KindsSrc<'_>) -> String {
                 Some(kw)
             }
 
+            pub fn from_contextual_keyword(ident: &str) -> Option<SyntaxKind> {
+                let kw = match ident {
+                    #(#contextual_keywords_values => #contextual_keywords,)*
+                    _ => return None,
+                };
+                Some(kw)
+            }
+
             pub fn from_char(c: char) -> Option<SyntaxKind> {
                 let tok = match c {
                     #(#single_byte_tokens_values => #single_byte_tokens,)*

From 965585748e6bdff8b0b83d3b1b2185ea30108221 Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sun, 12 Dec 2021 18:38:49 +0300
Subject: [PATCH 05/11] more orthogonal interface

---
 crates/parser/src/tokens.rs  | 17 ++++++++++-------
 crates/syntax/src/parsing.rs |  3 ++-
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index 4f10956070f..e1aea6acfcf 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -25,20 +25,23 @@ pub struct Tokens {
 }
 
 impl Tokens {
-    pub fn push(&mut self, was_joint: bool, kind: SyntaxKind) {
-        self.push_impl(was_joint, kind, SyntaxKind::EOF)
+    pub fn push(&mut self, kind: SyntaxKind) {
+        self.push_impl(kind, SyntaxKind::EOF)
+    }
+    pub fn was_joint(&mut self, yes: bool) {
+        let idx = self.len();
+        if yes && idx > 0 {
+            self.set_joint(idx - 1);
+        }
     }
     pub fn push_ident(&mut self, contextual_kw: SyntaxKind) {
-        self.push_impl(false, SyntaxKind::IDENT, contextual_kw)
+        self.push_impl(SyntaxKind::IDENT, contextual_kw)
     }
-    fn push_impl(&mut self, was_joint: bool, kind: SyntaxKind, contextual_kw: SyntaxKind) {
+    fn push_impl(&mut self, kind: SyntaxKind, contextual_kw: SyntaxKind) {
         let idx = self.len();
         if idx % (bits::BITS as usize) == 0 {
             self.joint.push(0);
         }
-        if was_joint && idx > 0 {
-            self.set_joint(idx - 1);
-        }
         self.kind.push(kind);
         self.contextual_kw.push(contextual_kw);
     }
diff --git a/crates/syntax/src/parsing.rs b/crates/syntax/src/parsing.rs
index 652668e80b2..5cafe70dd7d 100644
--- a/crates/syntax/src/parsing.rs
+++ b/crates/syntax/src/parsing.rs
@@ -68,7 +68,8 @@ pub(crate) fn to_parser_tokens(text: &str, lexer_tokens: &[lexer::Token]) -> ::p
                 SyntaxKind::from_contextual_keyword(token_text).unwrap_or(SyntaxKind::IDENT);
             res.push_ident(contextual_kw);
         } else {
-            res.push(was_joint, t.kind);
+            res.was_joint(was_joint);
+            res.push(t.kind);
             was_joint = true;
         }
         off += usize::from(t.len);

From 1055a6111adc55e99e5f63884a8f4fbf5e7bb28d Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sun, 12 Dec 2021 19:06:40 +0300
Subject: [PATCH 06/11] port mbe to soa tokens

---
 crates/mbe/src/lib.rs              |   2 +-
 crates/mbe/src/subtree_source.rs   | 174 -----------------------------
 crates/mbe/src/syntax_bridge.rs    |   6 +-
 crates/mbe/src/to_parser_tokens.rs |  97 ++++++++++++++++
 crates/mbe/src/tt_iter.rs          |   6 +-
 crates/parser/src/lib.rs           |   7 +-
 crates/parser/src/tokens.rs        |  21 ++++
 7 files changed, 130 insertions(+), 183 deletions(-)
 delete mode 100644 crates/mbe/src/subtree_source.rs
 create mode 100644 crates/mbe/src/to_parser_tokens.rs

diff --git a/crates/mbe/src/lib.rs b/crates/mbe/src/lib.rs
index b58b86b38d6..1a56878fdb5 100644
--- a/crates/mbe/src/lib.rs
+++ b/crates/mbe/src/lib.rs
@@ -10,7 +10,7 @@ mod parser;
 mod expander;
 mod syntax_bridge;
 mod tt_iter;
-mod subtree_source;
+mod to_parser_tokens;
 
 #[cfg(test)]
 mod benchmark;
diff --git a/crates/mbe/src/subtree_source.rs b/crates/mbe/src/subtree_source.rs
deleted file mode 100644
index 6bdd787e301..00000000000
--- a/crates/mbe/src/subtree_source.rs
+++ /dev/null
@@ -1,174 +0,0 @@
-//! Our parser is generic over the source of tokens it parses.
-//!
-//! This module defines tokens sourced from declarative macros.
-
-use parser::{Token, TokenSource};
-use syntax::{lex_single_syntax_kind, SmolStr, SyntaxKind, SyntaxKind::*, T};
-use tt::buffer::TokenBuffer;
-
-#[derive(Debug, Clone, Eq, PartialEq)]
-struct TtToken {
-    tt: Token,
-    text: SmolStr,
-}
-
-pub(crate) struct SubtreeTokenSource {
-    cached: Vec<TtToken>,
-    curr: (Token, usize),
-}
-
-impl<'a> SubtreeTokenSource {
-    pub(crate) fn new(buffer: &TokenBuffer) -> SubtreeTokenSource {
-        let mut current = buffer.begin();
-        let mut cached = Vec::with_capacity(100);
-
-        while !current.eof() {
-            let cursor = current;
-            let tt = cursor.token_tree();
-
-            // Check if it is lifetime
-            if let Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(punct), _)) = tt {
-                if punct.char == '\'' {
-                    let next = cursor.bump();
-                    if let Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Ident(ident), _)) =
-                        next.token_tree()
-                    {
-                        let text = SmolStr::new("'".to_string() + &ident.text);
-                        cached.push(TtToken {
-                            tt: Token { kind: LIFETIME_IDENT, is_jointed_to_next: false },
-                            text,
-                        });
-                        current = next.bump();
-                        continue;
-                    } else {
-                        panic!("Next token must be ident : {:#?}", next.token_tree());
-                    }
-                }
-            }
-
-            current = match tt {
-                Some(tt::buffer::TokenTreeRef::Leaf(leaf, _)) => {
-                    cached.push(convert_leaf(leaf));
-                    cursor.bump()
-                }
-                Some(tt::buffer::TokenTreeRef::Subtree(subtree, _)) => {
-                    if let Some(d) = subtree.delimiter_kind() {
-                        cached.push(convert_delim(d, false));
-                    }
-                    cursor.subtree().unwrap()
-                }
-                None => match cursor.end() {
-                    Some(subtree) => {
-                        if let Some(d) = subtree.delimiter_kind() {
-                            cached.push(convert_delim(d, true));
-                        }
-                        cursor.bump()
-                    }
-                    None => continue,
-                },
-            };
-        }
-
-        let mut res = SubtreeTokenSource {
-            curr: (Token { kind: EOF, is_jointed_to_next: false }, 0),
-            cached,
-        };
-        res.curr = (res.token(0), 0);
-        res
-    }
-
-    fn token(&self, pos: usize) -> Token {
-        match self.cached.get(pos) {
-            Some(it) => it.tt,
-            None => Token { kind: EOF, is_jointed_to_next: false },
-        }
-    }
-}
-
-impl<'a> TokenSource for SubtreeTokenSource {
-    fn current(&self) -> Token {
-        self.curr.0
-    }
-
-    /// Lookahead n token
-    fn lookahead_nth(&self, n: usize) -> Token {
-        self.token(self.curr.1 + n)
-    }
-
-    /// bump cursor to next token
-    fn bump(&mut self) {
-        if self.current().kind == EOF {
-            return;
-        }
-        self.curr = (self.token(self.curr.1 + 1), self.curr.1 + 1);
-    }
-
-    /// Is the current token a specified keyword?
-    fn is_keyword(&self, kw: &str) -> bool {
-        match self.cached.get(self.curr.1) {
-            Some(t) => t.text == *kw,
-            None => false,
-        }
-    }
-}
-
-fn convert_delim(d: tt::DelimiterKind, closing: bool) -> TtToken {
-    let (kinds, texts) = match d {
-        tt::DelimiterKind::Parenthesis => ([T!['('], T![')']], "()"),
-        tt::DelimiterKind::Brace => ([T!['{'], T!['}']], "{}"),
-        tt::DelimiterKind::Bracket => ([T!['['], T![']']], "[]"),
-    };
-
-    let idx = closing as usize;
-    let kind = kinds[idx];
-    let text = &texts[idx..texts.len() - (1 - idx)];
-    TtToken { tt: Token { kind, is_jointed_to_next: false }, text: SmolStr::new(text) }
-}
-
-fn convert_literal(l: &tt::Literal) -> TtToken {
-    let is_negated = l.text.starts_with('-');
-    let inner_text = &l.text[if is_negated { 1 } else { 0 }..];
-
-    let kind = lex_single_syntax_kind(inner_text)
-        .map(|(kind, _error)| kind)
-        .filter(|kind| {
-            kind.is_literal() && (!is_negated || matches!(kind, FLOAT_NUMBER | INT_NUMBER))
-        })
-        .unwrap_or_else(|| panic!("Fail to convert given literal {:#?}", &l));
-
-    TtToken { tt: Token { kind, is_jointed_to_next: false }, text: l.text.clone() }
-}
-
-fn convert_ident(ident: &tt::Ident) -> TtToken {
-    let kind = match ident.text.as_ref() {
-        "true" => T![true],
-        "false" => T![false],
-        "_" => UNDERSCORE,
-        i if i.starts_with('\'') => LIFETIME_IDENT,
-        _ => SyntaxKind::from_keyword(ident.text.as_str()).unwrap_or(IDENT),
-    };
-
-    TtToken { tt: Token { kind, is_jointed_to_next: false }, text: ident.text.clone() }
-}
-
-fn convert_punct(p: tt::Punct) -> TtToken {
-    let kind = match SyntaxKind::from_char(p.char) {
-        None => panic!("{:#?} is not a valid punct", p),
-        Some(kind) => kind,
-    };
-
-    let text = {
-        let mut buf = [0u8; 4];
-        let s: &str = p.char.encode_utf8(&mut buf);
-        SmolStr::new(s)
-    };
-    TtToken { tt: Token { kind, is_jointed_to_next: p.spacing == tt::Spacing::Joint }, text }
-}
-
-fn convert_leaf(leaf: &tt::Leaf) -> TtToken {
-    match leaf {
-        tt::Leaf::Literal(l) => convert_literal(l),
-        tt::Leaf::Ident(ident) => convert_ident(ident),
-        tt::Leaf::Punct(punct) => convert_punct(*punct),
-    }
-}
diff --git a/crates/mbe/src/syntax_bridge.rs b/crates/mbe/src/syntax_bridge.rs
index 0b65fa171f4..28a23f6be2c 100644
--- a/crates/mbe/src/syntax_bridge.rs
+++ b/crates/mbe/src/syntax_bridge.rs
@@ -12,7 +12,7 @@ use syntax::{
 use tt::buffer::{Cursor, TokenBuffer};
 
 use crate::{
-    subtree_source::SubtreeTokenSource, tt_iter::TtIter, ExpandError, ParserEntryPoint, TokenMap,
+    to_parser_tokens::to_parser_tokens, tt_iter::TtIter, ExpandError, ParserEntryPoint, TokenMap,
 };
 
 /// Convert the syntax node to a `TokenTree` (what macro
@@ -56,9 +56,9 @@ pub fn token_tree_to_syntax_node(
         }
         _ => TokenBuffer::from_subtree(tt),
     };
-    let mut token_source = SubtreeTokenSource::new(&buffer);
+    let parser_tokens = to_parser_tokens(&buffer);
     let mut tree_sink = TtTreeSink::new(buffer.begin());
-    parser::parse(&mut token_source, &mut tree_sink, entry_point);
+    parser::parse(&parser_tokens, &mut tree_sink, entry_point);
     if tree_sink.roots.len() != 1 {
         return Err(ExpandError::ConversionError);
     }
diff --git a/crates/mbe/src/to_parser_tokens.rs b/crates/mbe/src/to_parser_tokens.rs
new file mode 100644
index 00000000000..435226342ec
--- /dev/null
+++ b/crates/mbe/src/to_parser_tokens.rs
@@ -0,0 +1,97 @@
+//! Convert macro-by-example tokens which are specific to macro expansion into a
+//! format that works for our parser.
+
+use syntax::{lex_single_syntax_kind, SyntaxKind, SyntaxKind::*, T};
+use tt::buffer::TokenBuffer;
+
+pub(crate) fn to_parser_tokens(buffer: &TokenBuffer) -> parser::Tokens {
+    let mut res = parser::Tokens::default();
+
+    let mut current = buffer.begin();
+
+    while !current.eof() {
+        let cursor = current;
+        let tt = cursor.token_tree();
+
+        // Check if it is lifetime
+        if let Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(punct), _)) = tt {
+            if punct.char == '\'' {
+                let next = cursor.bump();
+                match next.token_tree() {
+                    Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Ident(_ident), _)) => {
+                        res.push(LIFETIME_IDENT);
+                        current = next.bump();
+                        continue;
+                    }
+                    _ => panic!("Next token must be ident : {:#?}", next.token_tree()),
+                }
+            }
+        }
+
+        current = match tt {
+            Some(tt::buffer::TokenTreeRef::Leaf(leaf, _)) => {
+                match leaf {
+                    tt::Leaf::Literal(lit) => {
+                        let is_negated = lit.text.starts_with('-');
+                        let inner_text = &lit.text[if is_negated { 1 } else { 0 }..];
+
+                        let kind = lex_single_syntax_kind(inner_text)
+                            .map(|(kind, _error)| kind)
+                            .filter(|kind| {
+                                kind.is_literal()
+                                    && (!is_negated || matches!(kind, FLOAT_NUMBER | INT_NUMBER))
+                            })
+                            .unwrap_or_else(|| panic!("Failed to convert given literal {:#?}", &lit));
+
+                        res.push(kind);
+                    }
+                    tt::Leaf::Ident(ident) => match ident.text.as_ref() {
+                        "_" => res.push(T![_]),
+                        i if i.starts_with('\'') => res.push(LIFETIME_IDENT),
+                        _ => match SyntaxKind::from_keyword(&ident.text) {
+                            Some(kind) => res.push(kind),
+                            None => {
+                                let contextual_keyword =
+                                    SyntaxKind::from_contextual_keyword(&ident.text)
+                                        .unwrap_or(SyntaxKind::IDENT);
+                                res.push_ident(contextual_keyword);
+                            }
+                        },
+                    },
+                    tt::Leaf::Punct(punct) => {
+                        let kind = SyntaxKind::from_char(punct.char)
+                            .unwrap_or_else(|| panic!("{:#?} is not a valid punct", punct));
+                        res.push(kind);
+                        res.was_joint(punct.spacing == tt::Spacing::Joint);
+                    }
+                }
+                cursor.bump()
+            }
+            Some(tt::buffer::TokenTreeRef::Subtree(subtree, _)) => {
+                if let Some(d) = subtree.delimiter_kind() {
+                    res.push(match d {
+                        tt::DelimiterKind::Parenthesis => T!['('],
+                        tt::DelimiterKind::Brace => T!['{'],
+                        tt::DelimiterKind::Bracket => T!['['],
+                    });
+                }
+                cursor.subtree().unwrap()
+            }
+            None => match cursor.end() {
+                Some(subtree) => {
+                    if let Some(d) = subtree.delimiter_kind() {
+                        res.push(match d {
+                            tt::DelimiterKind::Parenthesis => T![')'],
+                            tt::DelimiterKind::Brace => T!['}'],
+                            tt::DelimiterKind::Bracket => T![']'],
+                        })
+                    }
+                    cursor.bump()
+                }
+                None => continue,
+            },
+        };
+    }
+
+    res
+}
diff --git a/crates/mbe/src/tt_iter.rs b/crates/mbe/src/tt_iter.rs
index ff0272808b7..d05e84b0f02 100644
--- a/crates/mbe/src/tt_iter.rs
+++ b/crates/mbe/src/tt_iter.rs
@@ -1,7 +1,7 @@
 //! A "Parser" structure for token trees. We use this when parsing a declarative
 //! macro definition into a list of patterns and templates.
 
-use crate::{subtree_source::SubtreeTokenSource, ExpandError, ExpandResult, ParserEntryPoint};
+use crate::{to_parser_tokens::to_parser_tokens, ExpandError, ExpandResult, ParserEntryPoint};
 
 use parser::TreeSink;
 use syntax::SyntaxKind;
@@ -116,10 +116,10 @@ impl<'a> TtIter<'a> {
         }
 
         let buffer = TokenBuffer::from_tokens(self.inner.as_slice());
-        let mut src = SubtreeTokenSource::new(&buffer);
+        let parser_tokens = to_parser_tokens(&buffer);
         let mut sink = OffsetTokenSink { cursor: buffer.begin(), error: false };
 
-        parser::parse(&mut src, &mut sink, entry_point);
+        parser::parse(&parser_tokens, &mut sink, entry_point);
 
         let mut err = if !sink.cursor.is_root() || sink.error {
             Some(err!("expected {:?}", entry_point))
diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs
index 1e9f59fa530..2e2d96d0275 100644
--- a/crates/parser/src/lib.rs
+++ b/crates/parser/src/lib.rs
@@ -1,8 +1,11 @@
 //! The Rust parser.
 //!
+//! NOTE: The crate is undergoing refactors, don't believe everything the docs
+//! say :-)
+//!
 //! The parser doesn't know about concrete representation of tokens and syntax
-//! trees. Abstract [`TokenSource`] and [`TreeSink`] traits are used instead.
-//! As a consequence, this crate does not contain a lexer.
+//! trees. Abstract [`TokenSource`] and [`TreeSink`] traits are used instead. As
+//! a consequence, this crate does not contain a lexer.
 //!
 //! The [`Parser`] struct from the [`parser`] module is a cursor into the
 //! sequence of tokens. Parsing routines use [`Parser`] to inspect current
diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index e1aea6acfcf..dff5e583b1c 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -1,3 +1,8 @@
+//! Input for the parser -- a sequence of tokens.
+//!
+//! As of now, parser doesn't have access to the *text* of the tokens, and makes
+//! decisions based solely on their classification.
+
 use crate::SyntaxKind;
 
 #[allow(non_camel_case_types)]
@@ -28,6 +33,22 @@ impl Tokens {
     pub fn push(&mut self, kind: SyntaxKind) {
         self.push_impl(kind, SyntaxKind::EOF)
     }
+    /// Sets jointness for the last token we've pushed.
+    ///
+    /// This is a separate API rather than an argument to the `push` to make it
+    /// convenient both for textual and mbe tokens. With text, you know whether
+    /// the *previous* token was joint, with mbe, you know whether the *current*
+    /// one is joint. This API allows for both styles of usage:
+    ///
+    /// ```
+    /// // In text:
+    /// tokens.was_joint(prev_joint);
+    /// tokens.push(curr);
+    ///
+    /// // In MBE:
+    /// tokens.push(curr);
+    /// tokens.was_joint(curr_joint)
+    /// ```
     pub fn was_joint(&mut self, yes: bool) {
         let idx = self.len();
         if yes && idx > 0 {

From 18d4737fb9845e09bd860860a6a687bd7edd3bcd Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sun, 12 Dec 2021 19:17:04 +0300
Subject: [PATCH 07/11] add cross-crate inlines

---
 crates/parser/src/tokens.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index dff5e583b1c..de831f0f705 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -30,6 +30,7 @@ pub struct Tokens {
 }
 
 impl Tokens {
+    #[inline]
     pub fn push(&mut self, kind: SyntaxKind) {
         self.push_impl(kind, SyntaxKind::EOF)
     }
@@ -49,15 +50,18 @@ impl Tokens {
     /// tokens.push(curr);
     /// tokens.was_joint(curr_joint)
     /// ```
+    #[inline]
     pub fn was_joint(&mut self, yes: bool) {
         let idx = self.len();
         if yes && idx > 0 {
             self.set_joint(idx - 1);
         }
     }
+    #[inline]
     pub fn push_ident(&mut self, contextual_kw: SyntaxKind) {
         self.push_impl(SyntaxKind::IDENT, contextual_kw)
     }
+    #[inline]
     fn push_impl(&mut self, kind: SyntaxKind, contextual_kw: SyntaxKind) {
         let idx = self.len();
         if idx % (bits::BITS as usize) == 0 {
@@ -80,7 +84,7 @@ impl Tokens {
         (idx, b_idx)
     }
 
-    pub fn len(&self) -> usize {
+    fn len(&self) -> usize {
         self.kind.len()
     }
     pub(crate) fn get(&self, idx: usize) -> Token {

From 57e6ef0bfbfda17276f7f9c62abee81f3f086f91 Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sun, 12 Dec 2021 19:22:37 +0300
Subject: [PATCH 08/11] tighten up invariants

---
 crates/mbe/src/to_parser_tokens.rs |  4 +++-
 crates/parser/src/tokens.rs        |  7 ++-----
 crates/syntax/src/parsing.rs       | 20 ++++++++++++--------
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/crates/mbe/src/to_parser_tokens.rs b/crates/mbe/src/to_parser_tokens.rs
index 435226342ec..644689f432a 100644
--- a/crates/mbe/src/to_parser_tokens.rs
+++ b/crates/mbe/src/to_parser_tokens.rs
@@ -62,7 +62,9 @@ pub(crate) fn to_parser_tokens(buffer: &TokenBuffer) -> parser::Tokens {
                         let kind = SyntaxKind::from_char(punct.char)
                             .unwrap_or_else(|| panic!("{:#?} is not a valid punct", punct));
                         res.push(kind);
-                        res.was_joint(punct.spacing == tt::Spacing::Joint);
+                        if punct.spacing == tt::Spacing::Joint {
+                            res.was_joint();
+                        }
                     }
                 }
                 cursor.bump()
diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index de831f0f705..1128cfe99d6 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -51,11 +51,8 @@ impl Tokens {
     /// tokens.was_joint(curr_joint)
     /// ```
     #[inline]
-    pub fn was_joint(&mut self, yes: bool) {
-        let idx = self.len();
-        if yes && idx > 0 {
-            self.set_joint(idx - 1);
-        }
+    pub fn was_joint(&mut self) {
+        self.set_joint(self.len() - 1);
     }
     #[inline]
     pub fn push_ident(&mut self, contextual_kw: SyntaxKind) {
diff --git a/crates/syntax/src/parsing.rs b/crates/syntax/src/parsing.rs
index 652668e80b2..865e146482c 100644
--- a/crates/syntax/src/parsing.rs
+++ b/crates/syntax/src/parsing.rs
@@ -58,18 +58,22 @@ pub(crate) fn to_parser_tokens(text: &str, lexer_tokens: &[lexer::Token]) -> ::p
     let mut off = 0;
     let mut res = parser::Tokens::default();
-    let mut was_joint = true;
+    let mut was_joint = false;
     for t in lexer_tokens {
         if t.kind.is_trivia() {
             was_joint = false;
-        } else if t.kind == SyntaxKind::IDENT {
-            let token_text = &text[off..][..usize::from(t.len)];
-            let contextual_kw =
-                SyntaxKind::from_contextual_keyword(token_text).unwrap_or(SyntaxKind::IDENT);
-            res.push_ident(contextual_kw);
         } else {
-            res.was_joint(was_joint);
-            res.push(t.kind);
+            if t.kind == SyntaxKind::IDENT {
+                let token_text = &text[off..][..usize::from(t.len)];
+                let contextual_kw =
+                    SyntaxKind::from_contextual_keyword(token_text).unwrap_or(SyntaxKind::IDENT);
+                res.push_ident(contextual_kw);
+            } else {
+                if was_joint {
+                    res.was_joint();
+                }
+                res.push(t.kind);
+            }
             was_joint = true;
         }
         off += usize::from(t.len);

From 6e4bb5701433ab8f0809700827c76035d8be4813 Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sun, 12 Dec 2021 19:31:32 +0300
Subject: [PATCH 09/11] simplify

---
 crates/parser/src/parser.rs | 30 ++++++++++------------------
 crates/parser/src/tokens.rs | 40 +++++++++++--------------------------
 2 files changed, 23 insertions(+), 47 deletions(-)

diff --git a/crates/parser/src/parser.rs b/crates/parser/src/parser.rs
index 759f87f4966..4c891108a60 100644
--- a/crates/parser/src/parser.rs
+++ b/crates/parser/src/parser.rs
@@ -56,7 +56,7 @@ impl<'t> Parser<'t> {
         assert!(PARSER_STEP_LIMIT.check(steps as usize).is_ok(), "the parser seems stuck");
         self.steps.set(steps + 1);
 
-        self.tokens.get(self.pos + n).kind
+        self.tokens.kind(self.pos + n)
     }
 
     /// Checks if the current token is `kind`.
@@ -92,7 +92,7 @@ impl<'t> Parser<'t> {
             T![<<=] => self.at_composite3(n, T![<], T![<], T![=]),
             T![>>=] => self.at_composite3(n, T![>], T![>], T![=]),
 
-            _ => self.tokens.kind(self.pos + n) == kind,
+            _ => self.tokens.kind(self.pos + n) == kind,
         }
     }
 
@@ -131,25 +131,17 @@ impl<'t> Parser<'t> {
     }
 
     fn at_composite2(&self, n: usize, k1: SyntaxKind, k2: SyntaxKind) -> bool {
-        let t1 = self.tokens.get(self.pos + n);
-        if t1.kind != k1 || !t1.is_jointed_to_next {
-            return false;
-        }
-        let t2 = self.tokens.get(self.pos + n + 1);
-        t2.kind == k2
+        self.tokens.kind(self.pos + n) == k1
+            && self.tokens.kind(self.pos + n + 1) == k2
+            && self.tokens.is_joint(self.pos + n)
     }
 
     fn at_composite3(&self, n: usize, k1: SyntaxKind, k2: SyntaxKind, k3: SyntaxKind) -> bool {
-        let t1 = self.tokens.get(self.pos + n);
-        if t1.kind != k1 || !t1.is_jointed_to_next {
-            return false;
-        }
-        let t2 = self.tokens.get(self.pos + n + 1);
-        if t2.kind != k2 || !t2.is_jointed_to_next {
-            return false;
-        }
-        let t3 = self.tokens.get(self.pos + n + 2);
-        t3.kind == k3
+        self.tokens.kind(self.pos + n) == k1
+            && self.tokens.kind(self.pos + n + 1) == k2
+            && self.tokens.kind(self.pos + n + 2) == k3
+            && self.tokens.is_joint(self.pos + n)
+            && self.tokens.is_joint(self.pos + n + 1)
     }
 
     /// Checks if the current token is in `kinds`.
@@ -159,7 +151,7 @@ impl<'t> Parser<'t> {
 
     /// Checks if the current token is contextual keyword with text `t`.
     pub(crate) fn at_contextual_kw(&self, kw: SyntaxKind) -> bool {
-        self.tokens.get(self.pos).contextual_kw == kw
+        self.tokens.contextual_kind(self.pos) == kw
     }
 
     /// Starts a new node in the syntax tree. All nodes and tokens
diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index 1128cfe99d6..74725df6d0d 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -8,17 +8,6 @@ use crate::SyntaxKind;
 #[allow(non_camel_case_types)]
 type bits = u64;
 
-/// `Token` abstracts the cursor that `TokenSource` operates on.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-pub(crate) struct Token {
-    /// What is the current token?
-    pub(crate) kind: SyntaxKind,
-
-    /// Is the current token joined to the next one (`> >` vs `>>`).
-    pub(crate) is_jointed_to_next: bool,
-    pub(crate) contextual_kw: SyntaxKind,
-}
-
 /// Main input to the parser.
 ///
 /// A sequence of tokens represented internally as a struct of arrays.
@@ -60,10 +49,6 @@ impl Tokens {
         let (idx, b_idx) = self.bit_index(n);
         self.joint[idx] |= 1 << b_idx;
     }
-    fn get_joint(&self, n: usize) -> bool {
-        let (idx, b_idx) = self.bit_index(n);
-        self.joint[idx] & 1 << b_idx != 0
-    }
     fn bit_index(&self, n: usize) -> (usize, usize) {
         let idx = n / (bits::BITS as usize);
         let b_idx = n % (bits::BITS as usize);
         (idx, b_idx)
     }
@@ -72,19 +57,18 @@ impl Tokens {
     fn len(&self) -> usize {
         self.kind.len()
     }
-    pub(crate) fn get(&self, idx: usize) -> Token {
-        if idx < self.len() {
-            let kind = self.kind[idx];
-            let is_jointed_to_next = self.get_joint(idx);
-            let contextual_kw = self.contextual_kw[idx];
-            Token { kind, is_jointed_to_next, contextual_kw }
-        } else {
-            self.eof()
-        }
-    }
+}
 
-    #[cold]
-    fn eof(&self) -> Token {
-        Token { kind: SyntaxKind::EOF, is_jointed_to_next: false, contextual_kw: SyntaxKind::EOF }
+/// pub(crate) impl used by the parser.
+impl Tokens {
+    pub(crate) fn kind(&self, idx: usize) -> SyntaxKind {
+        self.kind.get(idx).copied().unwrap_or(SyntaxKind::EOF)
+    }
+    pub(crate) fn contextual_kind(&self, idx: usize) -> SyntaxKind {
+        self.contextual_kw.get(idx).copied().unwrap_or(SyntaxKind::EOF)
+    }
+    pub(crate) fn is_joint(&self, n: usize) -> bool {
+        let (idx, b_idx) = self.bit_index(n);
+        self.joint[idx] & 1 << b_idx != 0
     }
 }

From 980dd56cdc69efda8b4da348cbd062b0fa204108 Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sun, 12 Dec 2021 19:32:04 +0300
Subject: [PATCH 10/11] consistency

---
 crates/parser/src/tokens.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index 74725df6d0d..bf4d6294088 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -15,7 +15,7 @@ type bits = u64;
 pub struct Tokens {
     kind: Vec<SyntaxKind>,
     joint: Vec<bits>,
-    contextual_kw: Vec<SyntaxKind>,
+    contextual_kind: Vec<SyntaxKind>,
 }
 
@@ -44,17 +44,17 @@ impl Tokens {
         self.set_joint(self.len() - 1);
     }
     #[inline]
-    pub fn push_ident(&mut self, contextual_kw: SyntaxKind) {
-        self.push_impl(SyntaxKind::IDENT, contextual_kw)
+    pub fn push_ident(&mut self, contextual_kind: SyntaxKind) {
+        self.push_impl(SyntaxKind::IDENT, contextual_kind)
     }
     #[inline]
-    fn push_impl(&mut self, kind: SyntaxKind, contextual_kw: SyntaxKind) {
+    fn push_impl(&mut self, kind: SyntaxKind, contextual_kind: SyntaxKind) {
         let idx = self.len();
         if idx % (bits::BITS as usize) == 0 {
             self.joint.push(0);
         }
         self.kind.push(kind);
-        self.contextual_kw.push(contextual_kw);
+        self.contextual_kind.push(contextual_kind);
     }
     fn set_joint(&mut self, n: usize) {
         let (idx, b_idx) = self.bit_index(n);
@@ -77,7 +77,7 @@ impl Tokens {
         self.kind.get(idx).copied().unwrap_or(SyntaxKind::EOF)
     }
     pub(crate) fn contextual_kind(&self, idx: usize) -> SyntaxKind {
-        self.contextual_kw.get(idx).copied().unwrap_or(SyntaxKind::EOF)
+        self.contextual_kind.get(idx).copied().unwrap_or(SyntaxKind::EOF)
     }
     pub(crate) fn is_joint(&self, n: usize) -> bool {
         let (idx, b_idx) = self.bit_index(n);

From 3b5b988526b9cec74422f46e20ab1b2f9826d39c Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sun, 12 Dec 2021 19:36:14 +0300
Subject: [PATCH 11/11] prettify

---
 crates/parser/src/tokens.rs | 39 +++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index bf4d6294088..1c0672492da 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -18,11 +18,16 @@ pub struct Tokens {
     contextual_kind: Vec<SyntaxKind>,
 }
 
+/// `pub` impl used by callers to create `Tokens`.
 impl Tokens {
     #[inline]
     pub fn push(&mut self, kind: SyntaxKind) {
         self.push_impl(kind, SyntaxKind::EOF)
     }
+    #[inline]
+    pub fn push_ident(&mut self, contextual_kind: SyntaxKind) {
+        self.push_impl(SyntaxKind::IDENT, contextual_kind)
+    }
     /// Sets jointness for the last token we've pushed.
     ///
     /// This is a separate API rather than an argument to the `push` to make it
@@ -41,11 +46,9 @@ impl Tokens {
     /// ```
     #[inline]
     pub fn was_joint(&mut self) {
-        self.set_joint(self.len() - 1);
-    }
-    #[inline]
-    pub fn push_ident(&mut self, contextual_kind: SyntaxKind) {
-        self.push_impl(SyntaxKind::IDENT, contextual_kind)
+        let n = self.len() - 1;
+        let (idx, b_idx) = self.bit_index(n);
+        self.joint[idx] |= 1 << b_idx;
     }
     #[inline]
     fn push_impl(&mut self, kind: SyntaxKind, contextual_kind: SyntaxKind) {
@@ -56,22 +59,9 @@ impl Tokens {
         self.kind.push(kind);
         self.contextual_kind.push(contextual_kind);
     }
-    fn set_joint(&mut self, n: usize) {
-        let (idx, b_idx) = self.bit_index(n);
-        self.joint[idx] |= 1 << b_idx;
-    }
-    fn bit_index(&self, n: usize) -> (usize, usize) {
-        let idx = n / (bits::BITS as usize);
-        let b_idx = n % (bits::BITS as usize);
-        (idx, b_idx)
-    }
-
-    fn len(&self) -> usize {
-        self.kind.len()
-    }
 }
 
-/// pub(crate) impl used by the parser.
+/// pub(crate) impl used by the parser to consume `Tokens`.
 impl Tokens {
     pub(crate) fn kind(&self, idx: usize) -> SyntaxKind {
         self.kind.get(idx).copied().unwrap_or(SyntaxKind::EOF)
@@ -84,3 +74,14 @@ impl Tokens {
         self.joint[idx] & 1 << b_idx != 0
     }
 }
+
+impl Tokens {
+    fn bit_index(&self, n: usize) -> (usize, usize) {
+        let idx = n / (bits::BITS as usize);
+        let b_idx = n % (bits::BITS as usize);
+        (idx, b_idx)
+    }
+    fn len(&self) -> usize {
+        self.kind.len()
+    }
+}
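
Editor's note: to make the struct-of-arrays design this series converges on easier to experiment with outside the rust-analyzer tree, here is a minimal, self-contained sketch. The `SyntaxKind` enum below is a toy stand-in for the generated one in `crates/parser/src/syntax_kind/generated.rs`; the rest mirrors the post-PATCH-11 `Tokens` API (`push`, `push_ident`, `was_joint`, and the `kind`/`contextual_kind`/`is_joint` accessors), but it is an illustrative reconstruction under those assumptions, not code lifted from the patches.

```rust
// Toy stand-in for the generated SyntaxKind; only what this demo needs.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum SyntaxKind {
    Eof,
    Ident,
    UnionKw, // contextual keyword
    RAngle,  // `>`
}

type Bits = u64;

/// Struct-of-arrays token input: one parallel column per attribute.
#[derive(Default)]
struct Tokens {
    kind: Vec<SyntaxKind>,
    /// Bitset, one bit per token: is token `n` joint with token `n + 1`?
    joint: Vec<Bits>,
    /// For `Ident` tokens, the contextual keyword they may act as; `Eof` otherwise.
    contextual_kind: Vec<SyntaxKind>,
}

impl Tokens {
    fn push(&mut self, kind: SyntaxKind) {
        self.push_impl(kind, SyntaxKind::Eof)
    }
    fn push_ident(&mut self, contextual_kind: SyntaxKind) {
        self.push_impl(SyntaxKind::Ident, contextual_kind)
    }
    /// Marks the most recently pushed token as joint with its successor.
    fn was_joint(&mut self) {
        let n = self.len() - 1;
        let (idx, b_idx) = (n / Bits::BITS as usize, n % Bits::BITS as usize);
        self.joint[idx] |= 1 << b_idx;
    }
    fn push_impl(&mut self, kind: SyntaxKind, contextual_kind: SyntaxKind) {
        // Grow the bitset by one word whenever a word boundary is crossed.
        if self.len() % Bits::BITS as usize == 0 {
            self.joint.push(0);
        }
        self.kind.push(kind);
        self.contextual_kind.push(contextual_kind);
    }
    fn len(&self) -> usize {
        self.kind.len()
    }

    // Query side: out-of-range lookahead degrades to EOF instead of panicking.
    fn kind(&self, idx: usize) -> SyntaxKind {
        self.kind.get(idx).copied().unwrap_or(SyntaxKind::Eof)
    }
    fn contextual_kind(&self, idx: usize) -> SyntaxKind {
        self.contextual_kind.get(idx).copied().unwrap_or(SyntaxKind::Eof)
    }
    fn is_joint(&self, n: usize) -> bool {
        let (idx, b_idx) = (n / Bits::BITS as usize, n % Bits::BITS as usize);
        self.joint[idx] & 1 << b_idx != 0
    }
}

fn main() {
    let mut tokens = Tokens::default();
    // `union` lexed as an identifier, but flagged as a contextual keyword.
    tokens.push_ident(SyntaxKind::UnionKw);
    // Two adjacent `>` that together form `>>`.
    tokens.push(SyntaxKind::RAngle);
    tokens.was_joint();
    tokens.push(SyntaxKind::RAngle);

    // An `at_composite2`-style check: `>` `>` is `>>` only if joint.
    let shr = tokens.kind(1) == SyntaxKind::RAngle
        && tokens.kind(2) == SyntaxKind::RAngle
        && tokens.is_joint(1);
    assert!(shr);
    assert_eq!(tokens.contextual_kind(0), SyntaxKind::UnionKw);
    assert_eq!(tokens.kind(99), SyntaxKind::Eof); // lookahead past the end
    println!("joint `>>` detected; `union` kept as a contextual keyword");
}
```

The bitset stores the joint flag at one bit per token rather than one `bool` byte, the same trade the real `Tokens` makes, and the EOF-on-overrun accessors are what let patch 09 drop the earlier `#[cold] fn eof` special case from the lookup path.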