581 lines
20 KiB
Rust
581 lines
20 KiB
Rust
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
|
// file at the top-level directory of this distribution and at
|
|
// http://rust-lang.org/COPYRIGHT.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
|
// option. This file may not be copied, modified, or distributed
|
|
// except according to those terms.
|
|
|
|
// FIXME: Currently, the VM simulates an NFA. It would be nice to have another
|
|
// VM that simulates a DFA.
|
|
//
|
|
// According to Russ Cox[1], a DFA performs better than an NFA, principally
|
|
// because it reuses states previously computed by the machine *and* doesn't
|
|
// keep track of capture groups. The drawback of a DFA (aside from its
|
|
// complexity) is that it can't accurately return the locations of submatches.
|
|
// The NFA *can* do that. (This is my understanding anyway.)
|
|
//
|
|
// Cox suggests that a DFA ought to be used to answer "does this match" and
|
|
// "where does it match" questions. (In the latter, the starting position of
|
|
// the match is computed by executing the regex backwards.) Cox also suggests
|
|
// that a DFA should be run when asking "where are the submatches", which can
|
|
// 1) quickly answer "no" is there's no match and 2) discover the substring
|
|
// that matches, which means running the NFA on smaller input.
|
|
//
|
|
// Currently, the NFA simulation implemented below does some dirty tricks to
|
|
// avoid tracking capture groups when they aren't needed (which only works
|
|
// for 'is_match', not 'find'). This is a half-measure, but does provide some
|
|
// perf improvement.
|
|
//
|
|
// AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go.
|
|
//
|
|
// [1] - http://swtch.com/~rsc/regex/regex3.html
|
|
|
|
pub use self::MatchKind::*;
|
|
pub use self::StepState::*;
|
|
|
|
use std::cmp;
|
|
use std::mem;
|
|
use std::slice::SliceExt;
|
|
use compile::{
|
|
Program,
|
|
Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary,
|
|
Save, Jump, Split,
|
|
};
|
|
use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED};
|
|
use unicode::regex::PERLW;
|
|
|
|
pub type CaptureLocs = Vec<Option<uint>>;
|
|
|
|
/// Indicates the type of match to be performed by the VM.
|
|
#[deriving(Copy)]
|
|
pub enum MatchKind {
|
|
/// Only checks if a match exists or not. Does not return location.
|
|
Exists,
|
|
/// Returns the start and end indices of the entire match in the input
|
|
/// given.
|
|
Location,
|
|
/// Returns the start and end indices of each submatch in the input given.
|
|
Submatches,
|
|
}
|
|
|
|
/// Runs an NFA simulation on the compiled expression given on the search text
|
|
/// `input`. The search begins at byte index `start` and ends at byte index
|
|
/// `end`. (The range is specified here so that zero-width assertions will work
|
|
/// correctly when searching for successive non-overlapping matches.)
|
|
///
|
|
/// The `which` parameter indicates what kind of capture information the caller
|
|
/// wants. There are three choices: match existence only, the location of the
|
|
/// entire match or the locations of the entire match in addition to the
|
|
/// locations of each submatch.
|
|
pub fn run<'r, 't>(which: MatchKind, prog: &'r Program, input: &'t str,
|
|
start: uint, end: uint) -> CaptureLocs {
|
|
Nfa {
|
|
which: which,
|
|
prog: prog,
|
|
input: input,
|
|
start: start,
|
|
end: end,
|
|
ic: 0,
|
|
chars: CharReader::new(input),
|
|
}.run()
|
|
}
|
|
|
|
struct Nfa<'r, 't> {
|
|
which: MatchKind,
|
|
prog: &'r Program,
|
|
input: &'t str,
|
|
start: uint,
|
|
end: uint,
|
|
ic: uint,
|
|
chars: CharReader<'t>,
|
|
}
|
|
|
|
/// Indicates the next action to take after a single non-empty instruction
|
|
/// is processed.
|
|
#[deriving(Copy)]
|
|
pub enum StepState {
|
|
/// This is returned if and only if a Match instruction is reached and
|
|
/// we only care about the existence of a match. It instructs the VM to
|
|
/// quit early.
|
|
StepMatchEarlyReturn,
|
|
/// Indicates that a match was found. Thus, the rest of the states in the
|
|
/// *current* queue should be dropped (i.e., leftmost-first semantics).
|
|
/// States in the "next" queue can still be processed.
|
|
StepMatch,
|
|
/// No match was found. Continue with the next state in the queue.
|
|
StepContinue,
|
|
}
|
|
|
|
impl<'r, 't> Nfa<'r, 't> {
|
|
fn run(&mut self) -> CaptureLocs {
|
|
let ncaps = match self.which {
|
|
Exists => 0,
|
|
Location => 1,
|
|
Submatches => self.prog.num_captures(),
|
|
};
|
|
let mut matched = false;
|
|
let ninsts = self.prog.insts.len();
|
|
let mut clist = &mut Threads::new(self.which, ninsts, ncaps);
|
|
let mut nlist = &mut Threads::new(self.which, ninsts, ncaps);
|
|
|
|
let mut groups = Vec::from_elem(ncaps * 2, None);
|
|
|
|
// Determine if the expression starts with a '^' so we can avoid
|
|
// simulating .*?
|
|
// Make sure multi-line mode isn't enabled for it, otherwise we can't
|
|
// drop the initial .*?
|
|
let prefix_anchor =
|
|
match self.prog.insts[1] {
|
|
EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
|
|
_ => false,
|
|
};
|
|
|
|
self.ic = self.start;
|
|
let mut next_ic = self.chars.set(self.start);
|
|
while self.ic <= self.end {
|
|
if clist.size == 0 {
|
|
// We have a match and we're done exploring alternatives.
|
|
// Time to quit.
|
|
if matched {
|
|
break
|
|
}
|
|
|
|
// If there are no threads to try, then we'll have to start
|
|
// over at the beginning of the regex.
|
|
// BUT, if there's a literal prefix for the program, try to
|
|
// jump ahead quickly. If it can't be found, then we can bail
|
|
// out early.
|
|
if self.prog.prefix.len() > 0 && clist.size == 0 {
|
|
let needle = self.prog.prefix.as_bytes();
|
|
let haystack = self.input.as_bytes()[self.ic..];
|
|
match find_prefix(needle, haystack) {
|
|
None => break,
|
|
Some(i) => {
|
|
self.ic += i;
|
|
next_ic = self.chars.set(self.ic);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// This simulates a preceding '.*?' for every regex by adding
|
|
// a state starting at the current position in the input for the
|
|
// beginning of the program only if we don't already have a match.
|
|
if clist.size == 0 || (!prefix_anchor && !matched) {
|
|
self.add(clist, 0, groups.as_mut_slice())
|
|
}
|
|
|
|
// Now we try to read the next character.
|
|
// As a result, the 'step' method will look at the previous
|
|
// character.
|
|
self.ic = next_ic;
|
|
next_ic = self.chars.advance();
|
|
|
|
for i in range(0, clist.size) {
|
|
let pc = clist.pc(i);
|
|
let step_state = self.step(groups.as_mut_slice(), nlist,
|
|
clist.groups(i), pc);
|
|
match step_state {
|
|
StepMatchEarlyReturn => return vec![Some(0), Some(0)],
|
|
StepMatch => { matched = true; break },
|
|
StepContinue => {},
|
|
}
|
|
}
|
|
mem::swap(&mut clist, &mut nlist);
|
|
nlist.empty();
|
|
}
|
|
match self.which {
|
|
Exists if matched => vec![Some(0), Some(0)],
|
|
Exists => vec![None, None],
|
|
Location | Submatches => groups,
|
|
}
|
|
}
|
|
|
|
fn step(&self, groups: &mut [Option<uint>], nlist: &mut Threads,
|
|
caps: &mut [Option<uint>], pc: uint)
|
|
-> StepState {
|
|
match self.prog.insts[pc] {
|
|
Match => {
|
|
match self.which {
|
|
Exists => {
|
|
return StepMatchEarlyReturn
|
|
}
|
|
Location => {
|
|
groups[0] = caps[0];
|
|
groups[1] = caps[1];
|
|
return StepMatch
|
|
}
|
|
Submatches => {
|
|
for (slot, val) in groups.iter_mut().zip(caps.iter()) {
|
|
*slot = *val;
|
|
}
|
|
return StepMatch
|
|
}
|
|
}
|
|
}
|
|
OneChar(c, flags) => {
|
|
if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) {
|
|
self.add(nlist, pc+1, caps);
|
|
}
|
|
}
|
|
CharClass(ref ranges, flags) => {
|
|
if self.chars.prev.is_some() {
|
|
let c = self.chars.prev.unwrap();
|
|
let negate = flags & FLAG_NEGATED > 0;
|
|
let casei = flags & FLAG_NOCASE > 0;
|
|
let found = ranges.as_slice();
|
|
let found = found.binary_search(|&rc| class_cmp(casei, c, rc))
|
|
.found().is_some();
|
|
if found ^ negate {
|
|
self.add(nlist, pc+1, caps);
|
|
}
|
|
}
|
|
}
|
|
Any(flags) => {
|
|
if flags & FLAG_DOTNL > 0
|
|
|| !self.char_eq(false, self.chars.prev, '\n') {
|
|
self.add(nlist, pc+1, caps)
|
|
}
|
|
}
|
|
EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_)
|
|
| Save(_) | Jump(_) | Split(_, _) => {},
|
|
}
|
|
StepContinue
|
|
}
|
|
|
|
fn add(&self, nlist: &mut Threads, pc: uint, groups: &mut [Option<uint>]) {
|
|
if nlist.contains(pc) {
|
|
return
|
|
}
|
|
// We have to add states to the threads list even if their empty.
|
|
// TL;DR - It prevents cycles.
|
|
// If we didn't care about cycles, we'd *only* add threads that
|
|
// correspond to non-jumping instructions (OneChar, Any, Match, etc.).
|
|
// But, it's possible for valid regexs (like '(a*)*') to result in
|
|
// a cycle in the instruction list. e.g., We'll keep chasing the Split
|
|
// instructions forever.
|
|
// So we add these instructions to our thread queue, but in the main
|
|
// VM loop, we look for them but simply ignore them.
|
|
// Adding them to the queue prevents them from being revisited so we
|
|
// can avoid cycles (and the inevitable stack overflow).
|
|
//
|
|
// We make a minor optimization by indicating that the state is "empty"
|
|
// so that its capture groups are not filled in.
|
|
match self.prog.insts[pc] {
|
|
EmptyBegin(flags) => {
|
|
let multi = flags & FLAG_MULTI > 0;
|
|
nlist.add(pc, groups, true);
|
|
if self.chars.is_begin()
|
|
|| (multi && self.char_is(self.chars.prev, '\n')) {
|
|
self.add(nlist, pc + 1, groups)
|
|
}
|
|
}
|
|
EmptyEnd(flags) => {
|
|
let multi = flags & FLAG_MULTI > 0;
|
|
nlist.add(pc, groups, true);
|
|
if self.chars.is_end()
|
|
|| (multi && self.char_is(self.chars.cur, '\n')) {
|
|
self.add(nlist, pc + 1, groups)
|
|
}
|
|
}
|
|
EmptyWordBoundary(flags) => {
|
|
nlist.add(pc, groups, true);
|
|
if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) {
|
|
self.add(nlist, pc + 1, groups)
|
|
}
|
|
}
|
|
Save(slot) => {
|
|
nlist.add(pc, groups, true);
|
|
match self.which {
|
|
Location if slot <= 1 => {
|
|
let old = groups[slot];
|
|
groups[slot] = Some(self.ic);
|
|
self.add(nlist, pc + 1, groups);
|
|
groups[slot] = old;
|
|
}
|
|
Submatches => {
|
|
let old = groups[slot];
|
|
groups[slot] = Some(self.ic);
|
|
self.add(nlist, pc + 1, groups);
|
|
groups[slot] = old;
|
|
}
|
|
Exists | Location => self.add(nlist, pc + 1, groups),
|
|
}
|
|
}
|
|
Jump(to) => {
|
|
nlist.add(pc, groups, true);
|
|
self.add(nlist, to, groups)
|
|
}
|
|
Split(x, y) => {
|
|
nlist.add(pc, groups, true);
|
|
self.add(nlist, x, groups);
|
|
self.add(nlist, y, groups);
|
|
}
|
|
Match | OneChar(_, _) | CharClass(_, _) | Any(_) => {
|
|
nlist.add(pc, groups, false);
|
|
}
|
|
}
|
|
}
|
|
|
|
// FIXME: For case insensitive comparisons, it uses the uppercase
|
|
// character and tests for equality. IIUC, this does not generalize to
|
|
// all of Unicode. I believe we need to check the entire fold for each
|
|
// character. This will be easy to add if and when it gets added to Rust's
|
|
// standard library.
|
|
#[inline]
|
|
fn char_eq(&self, casei: bool, textc: Option<char>, regc: char) -> bool {
|
|
match textc {
|
|
None => false,
|
|
Some(textc) => {
|
|
regc == textc
|
|
|| (casei && regc.to_uppercase() == textc.to_uppercase())
|
|
}
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
fn char_is(&self, textc: Option<char>, regc: char) -> bool {
|
|
textc == Some(regc)
|
|
}
|
|
}
|
|
|
|
/// CharReader is responsible for maintaining a "previous" and a "current"
|
|
/// character. This one-character lookahead is necessary for assertions that
|
|
/// look one character before or after the current position.
|
|
pub struct CharReader<'t> {
|
|
/// The previous character read. It is None only when processing the first
|
|
/// character of the input.
|
|
pub prev: Option<char>,
|
|
/// The current character.
|
|
pub cur: Option<char>,
|
|
input: &'t str,
|
|
next: uint,
|
|
}
|
|
|
|
impl<'t> CharReader<'t> {
|
|
/// Returns a new CharReader that advances through the input given.
|
|
/// Note that a CharReader has no knowledge of the range in which to search
|
|
/// the input.
|
|
pub fn new(input: &'t str) -> CharReader<'t> {
|
|
CharReader {
|
|
prev: None,
|
|
cur: None,
|
|
input: input,
|
|
next: 0,
|
|
}
|
|
}
|
|
|
|
/// Sets the previous and current character given any arbitrary byte
|
|
/// index (at a Unicode codepoint boundary).
|
|
#[inline]
|
|
pub fn set(&mut self, ic: uint) -> uint {
|
|
self.prev = None;
|
|
self.cur = None;
|
|
self.next = 0;
|
|
|
|
if self.input.len() == 0 {
|
|
return 1
|
|
}
|
|
if ic > 0 {
|
|
let i = cmp::min(ic, self.input.len());
|
|
let prev = self.input.char_range_at_reverse(i);
|
|
self.prev = Some(prev.ch);
|
|
}
|
|
if ic < self.input.len() {
|
|
let cur = self.input.char_range_at(ic);
|
|
self.cur = Some(cur.ch);
|
|
self.next = cur.next;
|
|
self.next
|
|
} else {
|
|
self.input.len() + 1
|
|
}
|
|
}
|
|
|
|
/// Does the same as `set`, except it always advances to the next
|
|
/// character in the input (and therefore does half as many UTF8 decodings).
|
|
#[inline]
|
|
pub fn advance(&mut self) -> uint {
|
|
self.prev = self.cur;
|
|
if self.next < self.input.len() {
|
|
let cur = self.input.char_range_at(self.next);
|
|
self.cur = Some(cur.ch);
|
|
self.next = cur.next;
|
|
} else {
|
|
self.cur = None;
|
|
self.next = self.input.len() + 1;
|
|
}
|
|
self.next
|
|
}
|
|
|
|
/// Returns true if and only if this is the beginning of the input
|
|
/// (ignoring the range of the input to search).
|
|
#[inline]
|
|
pub fn is_begin(&self) -> bool { self.prev.is_none() }
|
|
|
|
/// Returns true if and only if this is the end of the input
|
|
/// (ignoring the range of the input to search).
|
|
#[inline]
|
|
pub fn is_end(&self) -> bool { self.cur.is_none() }
|
|
|
|
/// Returns true if and only if the current position is a word boundary.
|
|
/// (Ignoring the range of the input to search.)
|
|
pub fn is_word_boundary(&self) -> bool {
|
|
if self.is_begin() {
|
|
return is_word(self.cur)
|
|
}
|
|
if self.is_end() {
|
|
return is_word(self.prev)
|
|
}
|
|
(is_word(self.cur) && !is_word(self.prev))
|
|
|| (is_word(self.prev) && !is_word(self.cur))
|
|
}
|
|
}
|
|
|
|
struct Thread {
|
|
pc: uint,
|
|
groups: Vec<Option<uint>>,
|
|
}
|
|
|
|
struct Threads {
|
|
which: MatchKind,
|
|
queue: Vec<Thread>,
|
|
sparse: Vec<uint>,
|
|
size: uint,
|
|
}
|
|
|
|
impl Threads {
|
|
// This is using a wicked neat trick to provide constant time lookup
|
|
// for threads in the queue using a sparse set. A queue of threads is
|
|
// allocated once with maximal size when the VM initializes and is reused
|
|
// throughout execution. That is, there should be zero allocation during
|
|
// the execution of a VM.
|
|
//
|
|
// See http://research.swtch.com/sparse for the deets.
|
|
fn new(which: MatchKind, num_insts: uint, ncaps: uint) -> Threads {
|
|
Threads {
|
|
which: which,
|
|
queue: Vec::from_fn(num_insts, |_| {
|
|
Thread { pc: 0, groups: Vec::from_elem(ncaps * 2, None) }
|
|
}),
|
|
sparse: Vec::from_elem(num_insts, 0u),
|
|
size: 0,
|
|
}
|
|
}
|
|
|
|
fn add(&mut self, pc: uint, groups: &[Option<uint>], empty: bool) {
|
|
let t = &mut self.queue[self.size];
|
|
t.pc = pc;
|
|
match (empty, self.which) {
|
|
(_, Exists) | (true, _) => {},
|
|
(false, Location) => {
|
|
t.groups[0] = groups[0];
|
|
t.groups[1] = groups[1];
|
|
}
|
|
(false, Submatches) => {
|
|
for (slot, val) in t.groups.iter_mut().zip(groups.iter()) {
|
|
*slot = *val;
|
|
}
|
|
}
|
|
}
|
|
self.sparse[pc] = self.size;
|
|
self.size += 1;
|
|
}
|
|
|
|
#[inline]
|
|
fn contains(&self, pc: uint) -> bool {
|
|
let s = self.sparse[pc];
|
|
s < self.size && self.queue[s].pc == pc
|
|
}
|
|
|
|
#[inline]
|
|
fn empty(&mut self) {
|
|
self.size = 0;
|
|
}
|
|
|
|
#[inline]
|
|
fn pc(&self, i: uint) -> uint {
|
|
self.queue[i].pc
|
|
}
|
|
|
|
#[inline]
|
|
fn groups<'r>(&'r mut self, i: uint) -> &'r mut [Option<uint>] {
|
|
self.queue[i].groups.as_mut_slice()
|
|
}
|
|
}
|
|
|
|
/// Returns true if the character is a word character, according to the
|
|
/// (Unicode friendly) Perl character class '\w'.
|
|
/// Note that this is only use for testing word boundaries. The actual '\w'
|
|
/// is encoded as a CharClass instruction.
|
|
pub fn is_word(c: Option<char>) -> bool {
|
|
let c = match c {
|
|
None => return false,
|
|
Some(c) => c,
|
|
};
|
|
// Try the common ASCII case before invoking binary search.
|
|
match c {
|
|
'_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z' => true,
|
|
_ => PERLW.binary_search(|&(start, end)| {
|
|
if c >= start && c <= end {
|
|
Equal
|
|
} else if start > c {
|
|
Greater
|
|
} else {
|
|
Less
|
|
}
|
|
}).found().is_some()
|
|
}
|
|
}
|
|
|
|
/// Given a character and a single character class range, return an ordering
|
|
/// indicating whether the character is less than the start of the range,
|
|
/// in the range (inclusive) or greater than the end of the range.
|
|
///
|
|
/// If `casei` is `true`, then this ordering is computed case insensitively.
|
|
///
|
|
/// This function is meant to be used with a binary search.
|
|
#[inline]
|
|
fn class_cmp(casei: bool, mut textc: char,
|
|
(mut start, mut end): (char, char)) -> Ordering {
|
|
if casei {
|
|
// FIXME: This is pretty ridiculous. All of this case conversion
|
|
// can be moved outside this function:
|
|
// 1) textc should be uppercased outside the bsearch.
|
|
// 2) the character class itself should be uppercased either in the
|
|
// parser or the compiler.
|
|
// FIXME: This is too simplistic for correct Unicode support.
|
|
// See also: char_eq
|
|
textc = textc.to_uppercase();
|
|
start = start.to_uppercase();
|
|
end = end.to_uppercase();
|
|
}
|
|
if textc >= start && textc <= end {
|
|
Equal
|
|
} else if start > textc {
|
|
Greater
|
|
} else {
|
|
Less
|
|
}
|
|
}
|
|
|
|
/// Returns the starting location of `needle` in `haystack`.
|
|
/// If `needle` is not in `haystack`, then `None` is returned.
|
|
///
|
|
/// Note that this is using a naive substring algorithm.
|
|
#[inline]
|
|
pub fn find_prefix(needle: &[u8], haystack: &[u8]) -> Option<uint> {
|
|
let (hlen, nlen) = (haystack.len(), needle.len());
|
|
if nlen > hlen || nlen == 0 {
|
|
return None
|
|
}
|
|
for (offset, window) in haystack.windows(nlen).enumerate() {
|
|
if window == needle {
|
|
return Some(offset)
|
|
}
|
|
}
|
|
None
|
|
}
|