port mbe to soa tokens

This commit is contained in:
Aleksey Kladov 2021-12-12 19:06:40 +03:00
parent 965585748e
commit 1055a6111a
7 changed files with 130 additions and 183 deletions

View File: crates/mbe/src/lib.rs

@@ -10,7 +10,7 @@ mod parser;
mod expander;
mod syntax_bridge;
mod tt_iter;
-mod subtree_source;
+mod to_parser_tokens;
#[cfg(test)]
mod benchmark;

View File: crates/mbe/src/subtree_source.rs

@@ -1,174 +0,0 @@
//! Our parser is generic over the source of tokens it parses.
//!
//! This module defines tokens sourced from declarative macros.
use parser::{Token, TokenSource};
use syntax::{lex_single_syntax_kind, SmolStr, SyntaxKind, SyntaxKind::*, T};
use tt::buffer::TokenBuffer;
#[derive(Debug, Clone, Eq, PartialEq)]
struct TtToken {
tt: Token,
text: SmolStr,
}
pub(crate) struct SubtreeTokenSource {
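// Tokens, eagerly converted from the `TokenBuffer` in `new`.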
cached: Vec<TtToken>,
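// The current token and its index into `cached`.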
curr: (Token, usize),
}
impl SubtreeTokenSource {
pub(crate) fn new(buffer: &TokenBuffer) -> SubtreeTokenSource {
let mut current = buffer.begin();
let mut cached = Vec::with_capacity(100);
while !current.eof() {
let cursor = current;
let tt = cursor.token_tree();
// Check if it is a lifetime (a `'` punct immediately followed by an ident)
if let Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(punct), _)) = tt {
if punct.char == '\'' {
let next = cursor.bump();
if let Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Ident(ident), _)) =
next.token_tree()
{
let text = SmolStr::new("'".to_string() + &ident.text);
cached.push(TtToken {
tt: Token { kind: LIFETIME_IDENT, is_jointed_to_next: false },
text,
});
current = next.bump();
continue;
} else {
panic!("Next token must be ident : {:#?}", next.token_tree());
}
}
}
current = match tt {
Some(tt::buffer::TokenTreeRef::Leaf(leaf, _)) => {
cached.push(convert_leaf(leaf));
cursor.bump()
}
Some(tt::buffer::TokenTreeRef::Subtree(subtree, _)) => {
if let Some(d) = subtree.delimiter_kind() {
cached.push(convert_delim(d, false));
}
cursor.subtree().unwrap()
}
None => match cursor.end() {
Some(subtree) => {
if let Some(d) = subtree.delimiter_kind() {
cached.push(convert_delim(d, true));
}
cursor.bump()
}
None => continue,
},
};
}
let mut res = SubtreeTokenSource {
curr: (Token { kind: EOF, is_jointed_to_next: false }, 0),
cached,
};
res.curr = (res.token(0), 0);
res
}
fn token(&self, pos: usize) -> Token {
match self.cached.get(pos) {
Some(it) => it.tt,
None => Token { kind: EOF, is_jointed_to_next: false },
}
}
}
impl TokenSource for SubtreeTokenSource {
fn current(&self) -> Token {
self.curr.0
}
/// Looks ahead `n` tokens.
fn lookahead_nth(&self, n: usize) -> Token {
self.token(self.curr.1 + n)
}
/// Bumps the cursor to the next token.
fn bump(&mut self) {
if self.current().kind == EOF {
return;
}
self.curr = (self.token(self.curr.1 + 1), self.curr.1 + 1);
}
/// Is the current token a specified keyword?
fn is_keyword(&self, kw: &str) -> bool {
match self.cached.get(self.curr.1) {
Some(t) => t.text == *kw,
None => false,
}
}
}
fn convert_delim(d: tt::DelimiterKind, closing: bool) -> TtToken {
let (kinds, texts) = match d {
tt::DelimiterKind::Parenthesis => ([T!['('], T![')']], "()"),
tt::DelimiterKind::Brace => ([T!['{'], T!['}']], "{}"),
tt::DelimiterKind::Bracket => ([T!['['], T![']']], "[]"),
};
let idx = closing as usize;
let kind = kinds[idx];
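// `texts` is "()", "{}" or "[]": [0..1] is the opening char, [1..2] the closing.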
let text = &texts[idx..texts.len() - (1 - idx)];
TtToken { tt: Token { kind, is_jointed_to_next: false }, text: SmolStr::new(text) }
}
fn convert_literal(l: &tt::Literal) -> TtToken {
let is_negated = l.text.starts_with('-');
let inner_text = &l.text[if is_negated { 1 } else { 0 }..];
let kind = lex_single_syntax_kind(inner_text)
.map(|(kind, _error)| kind)
.filter(|kind| {
kind.is_literal() && (!is_negated || matches!(kind, FLOAT_NUMBER | INT_NUMBER))
})
.unwrap_or_else(|| panic!("Fail to convert given literal {:#?}", &l));
TtToken { tt: Token { kind, is_jointed_to_next: false }, text: l.text.clone() }
}
fn convert_ident(ident: &tt::Ident) -> TtToken {
let kind = match ident.text.as_ref() {
"true" => T![true],
"false" => T![false],
"_" => UNDERSCORE,
i if i.starts_with('\'') => LIFETIME_IDENT,
_ => SyntaxKind::from_keyword(ident.text.as_str()).unwrap_or(IDENT),
};
TtToken { tt: Token { kind, is_jointed_to_next: false }, text: ident.text.clone() }
}
fn convert_punct(p: tt::Punct) -> TtToken {
let kind = match SyntaxKind::from_char(p.char) {
None => panic!("{:#?} is not a valid punct", p),
Some(kind) => kind,
};
let text = {
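// Stack-allocate the encoding: any `char` is at most 4 bytes in UTF-8.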
let mut buf = [0u8; 4];
let s: &str = p.char.encode_utf8(&mut buf);
SmolStr::new(s)
};
TtToken { tt: Token { kind, is_jointed_to_next: p.spacing == tt::Spacing::Joint }, text }
}
fn convert_leaf(leaf: &tt::Leaf) -> TtToken {
match leaf {
tt::Leaf::Literal(l) => convert_literal(l),
tt::Leaf::Ident(ident) => convert_ident(ident),
tt::Leaf::Punct(punct) => convert_punct(*punct),
}
}
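// For reference, a sketch of the `parser::TokenSource` trait that this file
// implemented, reconstructed from the impl above (the actual definition lives
// in the parser crate and may differ in detail):
//
//     pub trait TokenSource {
//         fn current(&self) -> Token;
//         fn lookahead_nth(&self, n: usize) -> Token;
//         fn bump(&mut self);
//         fn is_keyword(&self, kw: &str) -> bool;
//     }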

View File: crates/mbe/src/syntax_bridge.rs

@@ -12,7 +12,7 @@ use syntax::{
use tt::buffer::{Cursor, TokenBuffer};
use crate::{
-subtree_source::SubtreeTokenSource, tt_iter::TtIter, ExpandError, ParserEntryPoint, TokenMap,
+to_parser_tokens::to_parser_tokens, tt_iter::TtIter, ExpandError, ParserEntryPoint, TokenMap,
};
/// Convert the syntax node to a `TokenTree` (what macro
/// will consume).
@@ -56,9 +56,9 @@ pub fn token_tree_to_syntax_node(
}
_ => TokenBuffer::from_subtree(tt),
};
-let mut token_source = SubtreeTokenSource::new(&buffer);
+let parser_tokens = to_parser_tokens(&buffer);
let mut tree_sink = TtTreeSink::new(buffer.begin());
-parser::parse(&mut token_source, &mut tree_sink, entry_point);
+parser::parse(&parser_tokens, &mut tree_sink, entry_point);
if tree_sink.roots.len() != 1 {
return Err(ExpandError::ConversionError);
}

View File: crates/mbe/src/to_parser_tokens.rs

@@ -0,0 +1,97 @@
//! Convert macro-by-example tokens, which are specific to macro expansion,
//! into a format that works for our parser.
use syntax::{lex_single_syntax_kind, SyntaxKind, SyntaxKind::*, T};
use tt::buffer::TokenBuffer;
pub(crate) fn to_parser_tokens(buffer: &TokenBuffer) -> parser::Tokens {
let mut res = parser::Tokens::default();
let mut current = buffer.begin();
while !current.eof() {
let cursor = current;
let tt = cursor.token_tree();
// Check if it is a lifetime (a `'` punct immediately followed by an ident)
if let Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(punct), _)) = tt {
if punct.char == '\'' {
let next = cursor.bump();
match next.token_tree() {
Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Ident(_ident), _)) => {
res.push(LIFETIME_IDENT);
current = next.bump();
continue;
}
_ => panic!("Next token must be ident : {:#?}", next.token_tree()),
}
}
}
current = match tt {
Some(tt::buffer::TokenTreeRef::Leaf(leaf, _)) => {
match leaf {
tt::Leaf::Literal(lit) => {
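// Macro expansion can produce a negative literal as a single token
// (e.g. "-42"), while the lexer only knows unsigned literals: strip
// the sign, lex the rest, and accept only numeric kinds when negated.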
let is_negated = lit.text.starts_with('-');
let inner_text = &lit.text[if is_negated { 1 } else { 0 }..];
let kind = lex_single_syntax_kind(inner_text)
.map(|(kind, _error)| kind)
.filter(|kind| {
kind.is_literal()
&& (!is_negated || matches!(kind, FLOAT_NUMBER | INT_NUMBER))
})
.unwrap_or_else(|| panic!("Fail to convert given literal {:#?}", &lit));
res.push(kind);
}
tt::Leaf::Ident(ident) => match ident.text.as_ref() {
"_" => res.push(T![_]),
i if i.starts_with('\'') => res.push(LIFETIME_IDENT),
_ => match SyntaxKind::from_keyword(&ident.text) {
Some(kind) => res.push(kind),
None => {
let contextual_keyword =
SyntaxKind::from_contextual_keyword(&ident.text)
.unwrap_or(SyntaxKind::IDENT);
res.push_ident(contextual_keyword);
}
},
},
tt::Leaf::Punct(punct) => {
let kind = SyntaxKind::from_char(punct.char)
.unwrap_or_else(|| panic!("{:#?} is not a valid punct", punct));
res.push(kind);
res.was_joint(punct.spacing == tt::Spacing::Joint);
}
}
cursor.bump()
}
Some(tt::buffer::TokenTreeRef::Subtree(subtree, _)) => {
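// Entering a subtree: emit its opening delimiter, then descend into it.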
if let Some(d) = subtree.delimiter_kind() {
res.push(match d {
tt::DelimiterKind::Parenthesis => T!['('],
tt::DelimiterKind::Brace => T!['{'],
tt::DelimiterKind::Bracket => T!['['],
});
}
cursor.subtree().unwrap()
}
None => match cursor.end() {
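// Reached the end of a subtree: emit its closing delimiter and step out.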
Some(subtree) => {
if let Some(d) = subtree.delimiter_kind() {
res.push(match d {
tt::DelimiterKind::Parenthesis => T![')'],
tt::DelimiterKind::Brace => T!['}'],
tt::DelimiterKind::Bracket => T![']'],
})
}
cursor.bump()
}
None => continue,
},
};
}
res
}
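// A hedged usage sketch, mirroring `token_tree_to_syntax_node` in
// syntax_bridge.rs (assumes a subtree `tt` and an `entry_point`, both as in
// this commit):
//
//     let buffer = TokenBuffer::from_subtree(tt);
//     let parser_tokens = to_parser_tokens(&buffer);
//     let mut tree_sink = TtTreeSink::new(buffer.begin());
//     parser::parse(&parser_tokens, &mut tree_sink, entry_point);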

View File: crates/mbe/src/tt_iter.rs

@@ -1,7 +1,7 @@
//! A "Parser" structure for token trees. We use this when parsing a declarative
//! macro definition into a list of patterns and templates.
-use crate::{subtree_source::SubtreeTokenSource, ExpandError, ExpandResult, ParserEntryPoint};
+use crate::{to_parser_tokens::to_parser_tokens, ExpandError, ExpandResult, ParserEntryPoint};
use parser::TreeSink;
use syntax::SyntaxKind;
@@ -116,10 +116,10 @@ impl<'a> TtIter<'a> {
}
let buffer = TokenBuffer::from_tokens(self.inner.as_slice());
-let mut src = SubtreeTokenSource::new(&buffer);
+let parser_tokens = to_parser_tokens(&buffer);
let mut sink = OffsetTokenSink { cursor: buffer.begin(), error: false };
-parser::parse(&mut src, &mut sink, entry_point);
+parser::parse(&parser_tokens, &mut sink, entry_point);
let mut err = if !sink.cursor.is_root() || sink.error {
Some(err!("expected {:?}", entry_point))

View File: crates/parser/src/lib.rs

@@ -1,8 +1,11 @@
//! The Rust parser.
//!
+//! NOTE: The crate is undergoing refactors, don't believe everything the docs
+//! say :-)
+//!
//! The parser doesn't know about concrete representation of tokens and syntax
-//! trees. Abstract [`TokenSource`] and [`TreeSink`] traits are used instead.
-//! As a consequence, this crate does not contain a lexer.
+//! trees. Abstract [`TokenSource`] and [`TreeSink`] traits are used instead. As
+//! a consequence, this crate does not contain a lexer.
//!
//! The [`Parser`] struct from the [`parser`] module is a cursor into the
//! sequence of tokens. Parsing routines use [`Parser`] to inspect current

View File: crates/parser/src/tokens.rs

@@ -1,3 +1,8 @@
//! Input for the parser -- a sequence of tokens.
+//!
+//! As of now, the parser doesn't have access to the *text* of the tokens, and
+//! makes decisions based solely on their classification.
use crate::SyntaxKind;
#[allow(non_camel_case_types)]
@@ -28,6 +33,22 @@ impl Tokens {
pub fn push(&mut self, kind: SyntaxKind) {
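// `EOF` here is, presumably, a "no contextual keyword" sentinel for
// `push_impl`; compare `push_ident`, which mbe uses to record one.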
self.push_impl(kind, SyntaxKind::EOF)
}
+/// Sets jointness for the last token we've pushed.
+///
+/// This is a separate API rather than an argument to `push`, to make it
+/// convenient both for textual and mbe tokens. With text, you know whether
+/// the *previous* token was joint; with mbe, you know whether the *current*
+/// one is joint. This API allows for both styles of usage:
+///
+/// ```
+/// // In text:
+/// tokens.was_joint(prev_joint);
+/// tokens.push(curr);
+///
+/// // In MBE:
+/// tokens.push(curr);
+/// tokens.was_joint(curr_joint);
+/// ```
+pub fn was_joint(&mut self, yes: bool) {
+let idx = self.len();
+if yes && idx > 0 {
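// For orientation: a minimal sketch of the struct-of-arrays layout that
// `push`/`was_joint` suggest (field names hypothetical; the real `Tokens` may
// well pack jointness into a bitset instead of a `Vec<bool>`):
//
//     #[derive(Default)]
//     pub struct Tokens {
//         kind: Vec<SyntaxKind>,
//         joint: Vec<bool>,
//         contextual_kind: Vec<SyntaxKind>,
//     }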