rust/crates/parser/src/lib.rs

//! The Rust parser.
//!
//! NOTE: The crate is undergoing refactors, don't believe everything the docs
//! say :-)
//!
//! The parser doesn't know about concrete representation of tokens and syntax
//! trees. It reads tokens from an abstract [`Input`] and reports the tree it
//! built as an [`Output`], a flat sequence of [`Step`]s. Lexing is exposed
//! separately through [`LexedStr`].
//!
//! The [`Parser`] struct from the [`parser`] module is a cursor into the
//! sequence of tokens.  Parsing routines use [`Parser`] to inspect current
//! state and advance the parsing.
//!
//! The actual parsing happens in the [`grammar`] module.
//!
//! Tests for this crate live in the `syntax` crate.
//!
//! [`Parser`]: crate::parser::Parser

#![warn(rust_2018_idioms, unused_lifetimes, semicolon_in_expressions_from_macros)]
#![allow(rustdoc::private_intra_doc_links)]

mod lexed_str;
mod token_set;
mod syntax_kind;
mod event;
mod parser;
mod grammar;
mod input;
mod output;
mod shortcuts;

#[cfg(test)]
mod tests;

pub(crate) use token_set::TokenSet;

pub use crate::{
    input::Input,
    lexed_str::LexedStr,
    output::{Output, Step},
    shortcuts::StrStep,
    syntax_kind::SyntaxKind,
};

/// Parse the whole of the input as a given syntactic construct.
///
/// This covers two main use-cases:
///
///   * Parsing a Rust file.
///   * Parsing a result of macro expansion.
///
/// That is, for something like
///
/// ```ignore
/// quick_check! {
///    fn prop() {}
/// }
/// ```
///
/// the input to the macro will be parsed with [`PrefixEntryPoint::Item`], and
/// the result of the expansion will be parsed with
/// [`TopEntryPoint::MacroItems`].
///
/// [`TopEntryPoint::parse`] makes a guarantee that
///   * all input is consumed
///   * the result is a valid tree (there's one root node)
#[derive(Debug)]
pub enum TopEntryPoint {
    /// A Rust file; handled by `grammar::entry::top::source_file`.
    SourceFile,
    /// Handled by `grammar::entry::top::macro_stmts`.
    MacroStmts,
    /// Handled by `grammar::entry::top::macro_items`.
    MacroItems,
    /// Handled by `grammar::entry::top::pattern`.
    Pattern,
    /// Handled by `grammar::entry::top::type_`.
    Type,
    /// Handled by `grammar::entry::top::expr`.
    Expr,
    /// Edge case -- macros generally don't expand to attributes, with the
    /// exception of `cfg_attr` which does!
    ///
    /// Handled by `grammar::entry::top::meta_item`.
    MetaItem,
}

impl TopEntryPoint {
    pub fn parse(&self, input: &Input) -> Output {
        let entry_point: fn(&'_ mut parser::Parser<'_>) = match self {
            TopEntryPoint::SourceFile => grammar::entry::top::source_file,
            TopEntryPoint::MacroStmts => grammar::entry::top::macro_stmts,
            TopEntryPoint::MacroItems => grammar::entry::top::macro_items,
            TopEntryPoint::Pattern => grammar::entry::top::pattern,
            TopEntryPoint::Type => grammar::entry::top::type_,
            TopEntryPoint::Expr => grammar::entry::top::expr,
            TopEntryPoint::MetaItem => grammar::entry::top::meta_item,
        };
        let mut p = parser::Parser::new(input);
        entry_point(&mut p);
        let events = p.finish();
        let res = event::process(events);

        if cfg!(debug_assertions) {
            let mut depth = 0;
            let mut first = true;
            for step in res.iter() {
                assert!(depth > 0 || first);
                first = false;
                match step {
                    Step::Enter { .. } => depth += 1,
                    Step::Exit => depth -= 1,
                    Step::FloatSplit { ends_in_dot: has_pseudo_dot } => {
                        depth -= 1 + !has_pseudo_dot as usize
                    }
                    Step::Token { .. } | Step::Error { .. } => (),
                }
            }
            assert!(!first, "no tree at all");
            assert_eq!(depth, 0, "unbalanced tree");
        }

        res
    }
}

/// Parse a prefix of the input as a given syntactic construct.
///
/// This is used by macro-by-example parser to implement things like `$i:item`
/// and the naming of variants follows the naming of macro fragments.
///
/// Note that this is generally non-optional -- the result is intentionally not
/// `Option<Output>`. The way MBE works, by the time we *try* to parse `$e:expr`
/// we already commit to expression. In other words, this API by design can't be
/// used to implement "rollback and try another alternative" logic.
#[derive(Debug)]
pub enum PrefixEntryPoint {
    /// Handled by `grammar::entry::prefix::vis`.
    Vis,
    /// Handled by `grammar::entry::prefix::block`.
    Block,
    /// Handled by `grammar::entry::prefix::stmt`.
    Stmt,
    /// Handled by `grammar::entry::prefix::pat`.
    Pat,
    /// Handled by `grammar::entry::prefix::pat_top`.
    ///
    /// NOTE(review): presumably a pattern that may contain top-level
    /// alternatives, as opposed to [`PrefixEntryPoint::Pat`] -- confirm
    /// against the grammar.
    PatTop,
    /// Handled by `grammar::entry::prefix::ty`.
    Ty,
    /// Handled by `grammar::entry::prefix::expr`.
    Expr,
    /// Handled by `grammar::entry::prefix::path`.
    Path,
    /// Handled by `grammar::entry::prefix::item`.
    Item,
    /// Handled by `grammar::entry::prefix::meta_item`.
    MetaItem,
}

impl PrefixEntryPoint {
    /// Runs the grammar rule selected by `self` over `input` and returns the
    /// processed [`Output`].
    ///
    /// Unlike [`TopEntryPoint::parse`], no tree-shape checks are performed on
    /// the result here.
    pub fn parse(&self, input: &Input) -> Output {
        // The explicit annotation coerces the distinct fn items of the match
        // arms to a single fn-pointer type.
        let entry_point: fn(&'_ mut parser::Parser<'_>) = match self {
            Self::Vis => grammar::entry::prefix::vis,
            Self::Block => grammar::entry::prefix::block,
            Self::Stmt => grammar::entry::prefix::stmt,
            Self::Pat => grammar::entry::prefix::pat,
            Self::PatTop => grammar::entry::prefix::pat_top,
            Self::Ty => grammar::entry::prefix::ty,
            Self::Expr => grammar::entry::prefix::expr,
            Self::Path => grammar::entry::prefix::path,
            Self::Item => grammar::entry::prefix::item,
            Self::MetaItem => grammar::entry::prefix::meta_item,
        };

        let mut p = parser::Parser::new(input);
        entry_point(&mut p);
        event::process(p.finish())
    }
}

/// A parsing function for a specific braced-block.
///
/// Wraps a function pointer that drives a [`parser::Parser`]. Values are
/// obtained from [`Reparser::for_node`] and consumed by [`Reparser::parse`].
pub struct Reparser(fn(&mut parser::Parser<'_>));

impl Reparser {
    /// If the node is a braced block, return the corresponding `Reparser`.
    pub fn for_node(
        node: SyntaxKind,
        first_child: Option<SyntaxKind>,
        parent: Option<SyntaxKind>,
    ) -> Option<Reparser> {
        grammar::reparser(node, first_child, parent).map(Reparser)
    }

    /// Re-parse given tokens using this `Reparser`.
    ///
    /// Tokens must start with `{`, end with `}` and form a valid brace
    /// sequence.
    pub fn parse(self, tokens: &Input) -> Output {
        let Reparser(r) = self;
        let mut p = parser::Parser::new(tokens);
        r(&mut p);
        let events = p.finish();
        event::process(events)
    }
}
docs 2019-02-21 06:24:42 -06:00			`//! The Rust parser.`
			`//!`
port mbe to soa tokens 2021-12-12 10:06:40 -06:00			`//! NOTE: The crate is undergoing refactors, don't believe everything the docs`
			`//! say :-)`
			`//!`
docs 2019-02-21 06:24:42 -06:00			`//! The parser doesn't know about concrete representation of tokens and syntax`
port mbe to soa tokens 2021-12-12 10:06:40 -06:00			//! trees. Abstract [`TokenSource`] and [`TreeSink`] traits are used instead. As
			`//! a consequence, this crate does not contain a lexer.`
docs 2019-02-21 06:24:42 -06:00			`//!`
tree-wide: fix rustdoc warnings, add some links 2021-08-03 22:57:31 -05:00			//! The [`Parser`] struct from the [`parser`] module is a cursor into the
			//! sequence of tokens. Parsing routines use [`Parser`] to inspect current
			`//! state and advance the parsing.`
docs 2019-02-21 06:24:42 -06:00			`//!`
tree-wide: fix rustdoc warnings, add some links 2021-08-03 22:57:31 -05:00			//! The actual parsing happens in the [`grammar`] module.
docs 2019-02-21 06:24:42 -06:00			`//!`
tree-wide: fix rustdoc warnings, add some links 2021-08-03 22:57:31 -05:00			//! Tests for this crate live in the `syntax` crate.
			`//!`
			//! [`Parser`]: crate::parser::Parser
Enable extra warnings required by rust-lang/rust 2022-07-20 07:59:42 -05:00
			`#![warn(rust_2018_idioms, unused_lifetimes, semicolon_in_expressions_from_macros)]`
tree-wide: fix rustdoc warnings, add some links 2021-08-03 22:57:31 -05:00			`#![allow(rustdoc::private_intra_doc_links)]`
minor: modernize 2021-09-06 10:42:07 -05:00
soa all the things 2021-12-18 06:31:50 -06:00			`mod lexed_str;`
move parser to a separate crate 2019-02-21 04:27:45 -06:00			`mod token_set;`
			`mod syntax_kind;`
			`mod event;`
			`mod parser;`
			`mod grammar;`
internal: rename 2021-12-25 12:59:02 -06:00			`mod input;`
			`mod output;`
internal: move ws attachment logic to the parser crate This has to re-introduce the `sink` pattern, because doing this purely with iterators is awkward :( Maaaybe the event vector was a false start? But, anyway, I like the current factoring more -- it sort-of obvious that we do want to keep ws-attachment business in the parser, and that we also don't want that to depend on the particular tree structure. I think `shortcuts` module achieves that. 2021-12-26 07:47:10 -06:00			`mod shortcuts;`
move parser to a separate crate 2019-02-21 04:27:45 -06:00
move lexing to the parser crate 2021-12-12 12:32:58 -06:00			`#[cfg(test)]`
			`mod tests;`

move parser to a separate crate 2019-02-21 04:27:45 -06:00			`pub(crate) use token_set::TokenSet;`

internal: replace TreeSink with a data structure The general theme of this is to make parser a better independent library. The specific thing we do here is replacing callback based TreeSink with a data structure. That is, rather than calling user-provided tree construction methods, the parser now spits out a very bare-bones tree, effectively a log of a DFS traversal. This makes the parser usable without any specifc tree sink, and allows us to, eg, move tests into this crate. Now, it's also true that this is a distinction without a difference, as the old and the new interface are equivalent in expressiveness. Still, this new thing seems somewhat simpler. But yeah, I admit I don't have a suuper strong motivation here, just a hunch that this is better. 2021-12-19 08:36:23 -06:00			`pub use crate::{`
internal: rename 2021-12-25 12:59:02 -06:00			`input::Input,`
internal: replace TreeSink with a data structure The general theme of this is to make parser a better independent library. The specific thing we do here is replacing callback based TreeSink with a data structure. That is, rather than calling user-provided tree construction methods, the parser now spits out a very bare-bones tree, effectively a log of a DFS traversal. This makes the parser usable without any specifc tree sink, and allows us to, eg, move tests into this crate. Now, it's also true that this is a distinction without a difference, as the old and the new interface are equivalent in expressiveness. Still, this new thing seems somewhat simpler. But yeah, I admit I don't have a suuper strong motivation here, just a hunch that this is better. 2021-12-19 08:36:23 -06:00			`lexed_str::LexedStr,`
internal: rename 2021-12-25 12:59:02 -06:00			`output::{Output, Step},`
internal: move ws attachment logic to the parser crate This has to re-introduce the `sink` pattern, because doing this purely with iterators is awkward :( Maaaybe the event vector was a false start? But, anyway, I like the current factoring more -- it sort-of obvious that we do want to keep ws-attachment business in the parser, and that we also don't want that to depend on the particular tree structure. I think `shortcuts` module achieves that. 2021-12-26 07:47:10 -06:00			`shortcuts::StrStep,`
internal: replace TreeSink with a data structure The general theme of this is to make parser a better independent library. The specific thing we do here is replacing callback based TreeSink with a data structure. That is, rather than calling user-provided tree construction methods, the parser now spits out a very bare-bones tree, effectively a log of a DFS traversal. This makes the parser usable without any specifc tree sink, and allows us to, eg, move tests into this crate. Now, it's also true that this is a distinction without a difference, as the old and the new interface are equivalent in expressiveness. Still, this new thing seems somewhat simpler. But yeah, I admit I don't have a suuper strong motivation here, just a hunch that this is better. 2021-12-19 08:36:23 -06:00			`syntax_kind::SyntaxKind,`
			`};`
move parser to a separate crate 2019-02-21 04:27:45 -06:00
add TopEntryPoint 2021-12-27 08:54:51 -06:00			`/// Parse the whole of the input as a given syntactic construct.`
			`///`
			`/// This covers two main use-cases:`
			`///`
			`/// * Parsing a Rust file.`
			`/// * Parsing a result of macro expansion.`
			`///`
			`/// That is, for something like`
			`///`
			/// ```
			`/// quick_check! {`
			`/// fn prop() {}`
			`/// }`
			/// ```
			`///`
			/// the input to the macro will be parsed with [`PrefixEntryPoint::Item`], and
check top level entry point invariants 2022-01-02 09:41:32 -06:00			/// the result will be [`TopEntryPoint::MacroItems`].
add TopEntryPoint 2021-12-27 08:54:51 -06:00			`///`
check top level entry point invariants 2022-01-02 09:41:32 -06:00			/// [`TopEntryPoint::parse`] makes a guarantee that
			`/// * all input is consumed`
			`/// * the result is a valid tree (there's one root node)`
add TopEntryPoint 2021-12-27 08:54:51 -06:00			`#[derive(Debug)]`
			`pub enum TopEntryPoint {`
			`SourceFile,`
			`MacroStmts,`
			`MacroItems,`
			`Pattern,`
			`Type,`
			`Expr,`
add top level tests for types 2022-01-02 08:45:18 -06:00			`/// Edge case -- macros generally don't expand to attributes, with the`
			/// exception of `cfg_attr` which does!
add TopEntryPoint 2021-12-27 08:54:51 -06:00			`MetaItem,`
			`}`

			`impl TopEntryPoint {`
			`pub fn parse(&self, input: &Input) -> Output {`
Run cargo fix --edition-idioms 2022-07-20 08:02:08 -05:00			`let entry_point: fn(&'_ mut parser::Parser<'_>) = match self {`
add TopEntryPoint 2021-12-27 08:54:51 -06:00			`TopEntryPoint::SourceFile => grammar::entry::top::source_file,`
			`TopEntryPoint::MacroStmts => grammar::entry::top::macro_stmts,`
			`TopEntryPoint::MacroItems => grammar::entry::top::macro_items,`
enforce parsing invariant for patterns 2022-01-02 08:32:15 -06:00			`TopEntryPoint::Pattern => grammar::entry::top::pattern,`
add top level tests for types 2022-01-02 08:45:18 -06:00			`TopEntryPoint::Type => grammar::entry::top::type_,`
add top-level tests for expressions 2022-01-02 08:52:05 -06:00			`TopEntryPoint::Expr => grammar::entry::top::expr,`
check top level entry point invariants 2022-01-02 09:41:32 -06:00			`TopEntryPoint::MetaItem => grammar::entry::top::meta_item,`
add TopEntryPoint 2021-12-27 08:54:51 -06:00			`};`
			`let mut p = parser::Parser::new(input);`
			`entry_point(&mut p);`
			`let events = p.finish();`
check top level entry point invariants 2022-01-02 09:41:32 -06:00			`let res = event::process(events);`

			`if cfg!(debug_assertions) {`
			`let mut depth = 0;`
			`let mut first = true;`
			`for step in res.iter() {`
			`assert!(depth > 0 \|\| first);`
			`first = false;`
			`match step {`
			`Step::Enter { .. } => depth += 1,`
			`Step::Exit => depth -= 1,`
Fixup comments 2023-02-07 11:08:05 -06:00			`Step::FloatSplit { ends_in_dot: has_pseudo_dot } => {`
			`depth -= 1 + !has_pseudo_dot as usize`
			`}`
Fix up token_tree_to_syntax_node float split handling 2023-02-07 08:21:37 -06:00			`Step::Token { .. } \| Step::Error { .. } => (),`
check top level entry point invariants 2022-01-02 09:41:32 -06:00			`}`
			`}`
			`assert!(!first, "no tree at all");`
Fix up token_tree_to_syntax_node float split handling 2023-02-07 08:21:37 -06:00			`assert_eq!(depth, 0, "unbalanced tree");`
check top level entry point invariants 2022-01-02 09:41:32 -06:00			`}`

			`res`
add TopEntryPoint 2021-12-27 08:54:51 -06:00			`}`
			`}`

more intuitive order 2022-01-02 09:46:01 -06:00			`/// Parse a prefix of the input as a given syntactic construct.`
			`///`
			/// This is used by macro-by-example parser to implement things like `$i:item`
			`/// and the naming of variants follows the naming of macro fragments.`
			`///`
			`/// Note that this is generally non-optional -- the result is intentionally not`
			/// `Option<Output>`. The way MBE work, by the time we try to parse `$e:expr`
			`/// we already commit to expression. In other words, this API by design can't be`
			`/// used to implement "rollback and try another alternative" logic.`
			`#[derive(Debug)]`
			`pub enum PrefixEntryPoint {`
			`Vis,`
			`Block,`
			`Stmt,`
			`Pat,`
fix: Fix pat fragment handling in 2021 edition 2023-04-24 15:21:37 -05:00			`PatTop,`
more intuitive order 2022-01-02 09:46:01 -06:00			`Ty,`
			`Expr,`
			`Path,`
			`Item,`
			`MetaItem,`
			`}`

			`impl PrefixEntryPoint {`
			`pub fn parse(&self, input: &Input) -> Output {`
Run cargo fix --edition-idioms 2022-07-20 08:02:08 -05:00			`let entry_point: fn(&'_ mut parser::Parser<'_>) = match self {`
more intuitive order 2022-01-02 09:46:01 -06:00			`PrefixEntryPoint::Vis => grammar::entry::prefix::vis,`
			`PrefixEntryPoint::Block => grammar::entry::prefix::block,`
			`PrefixEntryPoint::Stmt => grammar::entry::prefix::stmt,`
			`PrefixEntryPoint::Pat => grammar::entry::prefix::pat,`
fix: Fix pat fragment handling in 2021 edition 2023-04-24 15:21:37 -05:00			`PrefixEntryPoint::PatTop => grammar::entry::prefix::pat_top,`
more intuitive order 2022-01-02 09:46:01 -06:00			`PrefixEntryPoint::Ty => grammar::entry::prefix::ty,`
			`PrefixEntryPoint::Expr => grammar::entry::prefix::expr,`
			`PrefixEntryPoint::Path => grammar::entry::prefix::path,`
			`PrefixEntryPoint::Item => grammar::entry::prefix::item,`
			`PrefixEntryPoint::MetaItem => grammar::entry::prefix::meta_item,`
			`};`
			`let mut p = parser::Parser::new(input);`
			`entry_point(&mut p);`
			`let events = p.finish();`
			`event::process(events)`
			`}`
			`}`

docs 2019-02-21 06:24:42 -06:00			`/// A parsing function for a specific braced-block.`
Run cargo fix --edition-idioms 2022-07-20 08:02:08 -05:00			`pub struct Reparser(fn(&mut parser::Parser<'_>));`
move parser to a separate crate 2019-02-21 04:27:45 -06:00
			`impl Reparser {`
docs 2019-02-21 06:24:42 -06:00			/// If the node is a braced block, return the corresponding `Reparser`.
move parser to a separate crate 2019-02-21 04:27:45 -06:00			`pub fn for_node(`
			`node: SyntaxKind,`
			`first_child: Option<SyntaxKind>,`
			`parent: Option<SyntaxKind>,`
			`) -> Option<Reparser> {`
			`grammar::reparser(node, first_child, parent).map(Reparser)`
			`}`

docs 2019-02-21 06:24:42 -06:00			/// Re-parse given tokens using this `Reparser`.
			`///`
			/// Tokens must start with `{`, end with `}` and form a valid brace
			`/// sequence.`
internal: rename 2021-12-25 12:59:02 -06:00			`pub fn parse(self, tokens: &Input) -> Output {`
fix compilation 2019-02-21 04:37:32 -06:00			`let Reparser(r) = self;`
Switch parser to use tokens 2021-11-14 13:13:44 -06:00			`let mut p = parser::Parser::new(tokens);`
fix compilation 2019-02-21 04:37:32 -06:00			`r(&mut p);`
			`let events = p.finish();`
internal: replace TreeSink with a data structure The general theme of this is to make parser a better independent library. The specific thing we do here is replacing callback based TreeSink with a data structure. That is, rather than calling user-provided tree construction methods, the parser now spits out a very bare-bones tree, effectively a log of a DFS traversal. This makes the parser usable without any specifc tree sink, and allows us to, eg, move tests into this crate. Now, it's also true that this is a distinction without a difference, as the old and the new interface are equivalent in expressiveness. Still, this new thing seems somewhat simpler. But yeah, I admit I don't have a suuper strong motivation here, just a hunch that this is better. 2021-12-19 08:36:23 -06:00			`event::process(events)`
fix compilation 2019-02-21 04:37:32 -06:00			`}`
move parser to a separate crate 2019-02-21 04:27:45 -06:00			`}`