rust/src/libregex/parse.rs
Andrew Gallant b8b7484703 Add a regex crate to the Rust distribution.
Also adds a regex_macros crate, which provides natively compiled
regular expressions with a syntax extension.

Closes #3591.

RFC: 0007-regexps
2014-04-25 00:27:24 -04:00

1029 lines
36 KiB
Rust

// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::char;
use std::cmp;
use std::fmt;
use std::iter;
use std::num;
use std::str;
/// Static data containing Unicode ranges for general categories and scripts.
use self::unicode::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
#[allow(visible_private_types)]
pub mod unicode;
/// The maximum number of repetitions allowed with the `{n,m}` syntax.
static MAX_REPEAT: uint = 1000;
/// Error corresponds to something that can go wrong while parsing
/// a regular expression.
///
/// (Once an expression is compiled, it is not possible to produce an error
/// via searching, splitting or replacing.)
pub struct Error {
/// The *approximate* character index of where the error occurred.
pub pos: uint,
/// A message describing the error.
pub msg: ~str,
}
impl fmt::Show for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f.buf, "Regex syntax error near position {}: {}",
self.pos, self.msg)
}
}
/// Represents the abstract syntax of a regular expression.
/// It is showable so that error messages resulting from a bug can provide
/// useful information.
/// It is cloneable so that expressions can be repeated for the counted
/// repetition feature. (No other copying is done.)
///
/// Note that this representation prevents one from reproducing the regex as
/// it was typed. (But it could be used to reproduce an equivalent regex.)
#[deriving(Show, Clone)]
pub enum Ast {
Nothing,
Literal(char, Flags),
Dot(Flags),
Class(Vec<(char, char)>, Flags),
Begin(Flags),
End(Flags),
WordBoundary(Flags),
Capture(uint, Option<~str>, ~Ast),
// Represent concatenation as a flat vector to avoid blowing the
// stack in the compiler.
Cat(Vec<~Ast>),
Alt(~Ast, ~Ast),
Rep(~Ast, Repeater, Greed),
}
#[deriving(Show, Eq, Clone)]
pub enum Repeater {
ZeroOne,
ZeroMore,
OneMore,
}
#[deriving(Show, Clone)]
pub enum Greed {
Greedy,
Ungreedy,
}
impl Greed {
pub fn is_greedy(&self) -> bool {
match *self {
Greedy => true,
_ => false,
}
}
fn swap(self, swapped: bool) -> Greed {
if !swapped { return self }
match self {
Greedy => Ungreedy,
Ungreedy => Greedy,
}
}
}
/// BuildAst is a regrettable type that represents intermediate state for
/// constructing an abstract syntax tree. Its central purpose is to facilitate
/// parsing groups and alternations while also maintaining a stack of flag
/// state.
#[deriving(Show)]
enum BuildAst {
Ast(~Ast),
Paren(Flags, uint, ~str), // '('
Bar, // '|'
}
impl BuildAst {
fn paren(&self) -> bool {
match *self {
Paren(_, _, _) => true,
_ => false,
}
}
fn flags(&self) -> Flags {
match *self {
Paren(flags, _, _) => flags,
_ => fail!("Cannot get flags from {}", self),
}
}
fn capture(&self) -> Option<uint> {
match *self {
Paren(_, 0, _) => None,
Paren(_, c, _) => Some(c),
_ => fail!("Cannot get capture group from {}", self),
}
}
fn capture_name(&self) -> Option<~str> {
match *self {
Paren(_, 0, _) => None,
Paren(_, _, ref name) => {
if name.len() == 0 {
None
} else {
Some(name.clone())
}
}
_ => fail!("Cannot get capture name from {}", self),
}
}
fn bar(&self) -> bool {
match *self {
Bar => true,
_ => false,
}
}
fn unwrap(self) -> Result<~Ast, Error> {
match self {
Ast(x) => Ok(x),
_ => fail!("Tried to unwrap non-AST item: {}", self),
}
}
}
/// Flags represents all options that can be twiddled by a user in an
/// expression.
pub type Flags = u8;
pub static FLAG_EMPTY: u8 = 0;
pub static FLAG_NOCASE: u8 = 1 << 0; // i
pub static FLAG_MULTI: u8 = 1 << 1; // m
pub static FLAG_DOTNL: u8 = 1 << 2; // s
pub static FLAG_SWAP_GREED: u8 = 1 << 3; // U
pub static FLAG_NEGATED: u8 = 1 << 4; // char class or not word boundary
struct Parser<'a> {
// The input, parsed only as a sequence of UTF8 code points.
chars: Vec<char>,
// The index of the current character in the input.
chari: uint,
// The intermediate state representing the AST.
stack: Vec<BuildAst>,
// The current set of flags.
flags: Flags,
// The total number of capture groups.
// Incremented each time an opening left paren is seen (assuming it is
// opening a capture group).
caps: uint,
// A set of all capture group names used only to detect duplicates.
names: Vec<~str>,
}
pub fn parse(s: &str) -> Result<~Ast, Error> {
Parser {
chars: s.chars().collect(),
chari: 0,
stack: vec!(),
flags: FLAG_EMPTY,
caps: 0,
names: vec!(),
}.parse()
}
impl<'a> Parser<'a> {
fn parse(&mut self) -> Result<~Ast, Error> {
loop {
let c = self.cur();
match c {
'?' | '*' | '+' => try!(self.push_repeater(c)),
'\\' => {
let ast = try!(self.parse_escape());
self.push(ast)
}
'{' => try!(self.parse_counted()),
'[' => match self.try_parse_ascii() {
None => try!(self.parse_class()),
Some(class) => self.push(class),
},
'(' => {
if self.peek_is(1, '?') {
try!(self.expect('?'))
try!(self.parse_group_opts())
} else {
self.caps += 1;
self.stack.push(Paren(self.flags, self.caps, ~""))
}
}
')' => {
let catfrom = try!(
self.pos_last(false, |x| x.paren() || x.bar()));
try!(self.concat(catfrom));
let altfrom = try!(self.pos_last(false, |x| x.paren()));
// Before we smush the alternates together and pop off the
// left paren, let's grab the old flags and see if we
// need a capture.
let (cap, cap_name, oldflags) = {
let paren = self.stack.get(altfrom-1);
(paren.capture(), paren.capture_name(), paren.flags())
};
try!(self.alternate(altfrom));
self.flags = oldflags;
// If this was a capture, pop what we just pushed in
// alternate and make it a capture.
if cap.is_some() {
let ast = try!(self.pop_ast());
self.push(~Capture(cap.unwrap(), cap_name, ast));
}
}
'|' => {
let catfrom = try!(
self.pos_last(true, |x| x.paren() || x.bar()));
try!(self.concat(catfrom));
self.stack.push(Bar);
}
_ => try!(self.push_literal(c)),
}
if !self.next_char() {
break
}
}
// Try to improve error handling. At this point, there should be
// no remaining open parens.
if self.stack.iter().any(|x| x.paren()) {
return self.err("Unclosed parenthesis.")
}
let catfrom = try!(self.pos_last(true, |x| x.bar()));
try!(self.concat(catfrom));
try!(self.alternate(0));
assert!(self.stack.len() == 1);
self.pop_ast()
}
fn noteof(&mut self, expected: &str) -> Result<(), Error> {
match self.next_char() {
true => Ok(()),
false => self.err(format!("Expected {} but got EOF.", expected)),
}
}
fn expect(&mut self, expected: char) -> Result<(), Error> {
match self.next_char() {
true if self.cur() == expected => Ok(()),
true => self.err(format!("Expected '{}' but got '{}'.",
expected, self.cur())),
false => self.err(format!("Expected '{}' but got EOF.", expected)),
}
}
fn next_char(&mut self) -> bool {
self.chari += 1;
self.chari < self.chars.len()
}
fn pop_ast(&mut self) -> Result<~Ast, Error> {
match self.stack.pop().unwrap().unwrap() {
Err(e) => Err(e),
Ok(ast) => Ok(ast),
}
}
fn push(&mut self, ast: ~Ast) {
self.stack.push(Ast(ast))
}
fn push_repeater(&mut self, c: char) -> Result<(), Error> {
if self.stack.len() == 0 {
return self.err(
"A repeat operator must be preceded by a valid expression.")
}
let rep: Repeater = match c {
'?' => ZeroOne, '*' => ZeroMore, '+' => OneMore,
_ => fail!("Not a valid repeater operator."),
};
match self.peek(1) {
Some('*') | Some('+') =>
return self.err(
"Double repeat operators are not supported."),
_ => {},
}
let ast = try!(self.pop_ast());
match ast {
~Begin(_) | ~End(_) | ~WordBoundary(_) =>
return self.err(
"Repeat arguments cannot be empty width assertions."),
_ => {}
}
let greed = try!(self.get_next_greedy());
self.push(~Rep(ast, rep, greed));
Ok(())
}
fn push_literal(&mut self, c: char) -> Result<(), Error> {
match c {
'.' => {
self.push(~Dot(self.flags))
}
'^' => {
self.push(~Begin(self.flags))
}
'$' => {
self.push(~End(self.flags))
}
_ => {
self.push(~Literal(c, self.flags))
}
}
Ok(())
}
// Parses all forms of character classes.
// Assumes that '[' is the current character.
fn parse_class(&mut self) -> Result<(), Error> {
let negated =
if self.peek_is(1, '^') {
try!(self.expect('^'))
FLAG_NEGATED
} else {
FLAG_EMPTY
};
let mut ranges: Vec<(char, char)> = vec!();
let mut alts: Vec<~Ast> = vec!();
if self.peek_is(1, ']') {
try!(self.expect(']'))
ranges.push((']', ']'))
}
while self.peek_is(1, '-') {
try!(self.expect('-'))
ranges.push(('-', '-'))
}
loop {
try!(self.noteof("a closing ']' or a non-empty character class)"))
let mut c = self.cur();
match c {
'[' =>
match self.try_parse_ascii() {
Some(~Class(asciis, flags)) => {
alts.push(~Class(asciis, flags ^ negated));
continue
}
Some(ast) =>
fail!("Expected Class AST but got '{}'", ast),
// Just drop down and try to add as a regular character.
None => {},
},
'\\' => {
match try!(self.parse_escape()) {
~Class(asciis, flags) => {
alts.push(~Class(asciis, flags ^ negated));
continue
}
~Literal(c2, _) => c = c2, // process below
~Begin(_) | ~End(_) | ~WordBoundary(_) =>
return self.err(
"\\A, \\z, \\b and \\B are not valid escape \
sequences inside a character class."),
ast => fail!("Unexpected AST item '{}'", ast),
}
}
_ => {},
}
match c {
']' => {
if ranges.len() > 0 {
let flags = negated | (self.flags & FLAG_NOCASE);
let mut ast = ~Class(combine_ranges(ranges), flags);
for alt in alts.move_iter() {
ast = ~Alt(alt, ast)
}
self.push(ast);
} else if alts.len() > 0 {
let mut ast = alts.pop().unwrap();
for alt in alts.move_iter() {
ast = ~Alt(alt, ast)
}
self.push(ast);
}
return Ok(())
}
c => {
if self.peek_is(1, '-') && !self.peek_is(2, ']') {
try!(self.expect('-'))
try!(self.noteof("not a ']'"))
let c2 = self.cur();
if c2 < c {
return self.err(format!(
"Invalid character class range '{}-{}'", c, c2))
}
ranges.push((c, self.cur()))
} else {
ranges.push((c, c))
}
}
}
}
}
// Tries to parse an ASCII character class of the form [:name:].
// If successful, returns an AST character class corresponding to name
// and moves the parser to the final ']' character.
// If unsuccessful, no state is changed and None is returned.
// Assumes that '[' is the current character.
fn try_parse_ascii(&mut self) -> Option<~Ast> {
if !self.peek_is(1, ':') {
return None
}
let closer =
match self.pos(']') {
Some(i) => i,
None => return None,
};
if *self.chars.get(closer-1) != ':' {
return None
}
if closer - self.chari <= 3 {
return None
}
let mut name_start = self.chari + 2;
let negated =
if self.peek_is(2, '^') {
name_start += 1;
FLAG_NEGATED
} else {
FLAG_EMPTY
};
let name = self.slice(name_start, closer - 1);
match find_class(ASCII_CLASSES, name) {
None => None,
Some(ranges) => {
self.chari = closer;
let flags = negated | (self.flags & FLAG_NOCASE);
Some(~Class(combine_ranges(ranges), flags))
}
}
}
// Parses counted repetition. Supports:
// {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}?
// Assumes that '{' is the current character.
// Returns either an error or moves the parser to the final '}' character.
// (Or the '?' character if not greedy.)
fn parse_counted(&mut self) -> Result<(), Error> {
// Scan until the closing '}' and grab the stuff in {}.
let start = self.chari;
let closer =
match self.pos('}') {
Some(i) => i,
None => return self.err(format!(
"No closing brace for counted repetition starting at \
position {}.", start)),
};
self.chari = closer;
let greed = try!(self.get_next_greedy());
let inner = str::from_chars(
self.chars.as_slice().slice(start + 1, closer));
// Parse the min and max values from the regex.
let (mut min, mut max): (uint, Option<uint>);
if !inner.contains(",") {
min = try!(self.parse_uint(inner));
max = Some(min);
} else {
let pieces: Vec<&str> = inner.splitn(',', 1).collect();
let (smin, smax) = (*pieces.get(0), *pieces.get(1));
if smin.len() == 0 {
return self.err("Max repetitions cannot be specified \
without min repetitions.")
}
min = try!(self.parse_uint(smin));
max =
if smax.len() == 0 {
None
} else {
Some(try!(self.parse_uint(smax)))
};
}
// Do some bounds checking and make sure max >= min.
if min > MAX_REPEAT {
return self.err(format!(
"{} exceeds maximum allowed repetitions ({})",
min, MAX_REPEAT));
}
if max.is_some() {
let m = max.unwrap();
if m > MAX_REPEAT {
return self.err(format!(
"{} exceeds maximum allowed repetitions ({})",
m, MAX_REPEAT));
}
if m < min {
return self.err(format!(
"Max repetitions ({}) cannot be smaller than min \
repetitions ({}).", m, min));
}
}
// Now manipulate the AST be repeating elements.
if max.is_none() {
// Require N copies of what's on the stack and then repeat it.
let ast = try!(self.pop_ast());
for _ in iter::range(0, min) {
self.push(ast.clone())
}
self.push(~Rep(ast, ZeroMore, greed));
} else {
// Require N copies of what's on the stack and then repeat it
// up to M times optionally.
let ast = try!(self.pop_ast());
for _ in iter::range(0, min) {
self.push(ast.clone())
}
if max.is_some() {
for _ in iter::range(min, max.unwrap()) {
self.push(~Rep(ast.clone(), ZeroOne, greed))
}
}
// It's possible that we popped something off the stack but
// never put anything back on it. To keep things simple, add
// a no-op expression.
if min == 0 && (max.is_none() || max == Some(0)) {
self.push(~Nothing)
}
}
Ok(())
}
// Parses all escape sequences.
// Assumes that '\' is the current character.
fn parse_escape(&mut self) -> Result<~Ast, Error> {
try!(self.noteof("an escape sequence following a '\\'"))
let c = self.cur();
if is_punct(c) {
return Ok(~Literal(c, FLAG_EMPTY))
}
match c {
'a' => Ok(~Literal('\x07', FLAG_EMPTY)),
'f' => Ok(~Literal('\x0C', FLAG_EMPTY)),
't' => Ok(~Literal('\t', FLAG_EMPTY)),
'n' => Ok(~Literal('\n', FLAG_EMPTY)),
'r' => Ok(~Literal('\r', FLAG_EMPTY)),
'v' => Ok(~Literal('\x0B', FLAG_EMPTY)),
'A' => Ok(~Begin(FLAG_EMPTY)),
'z' => Ok(~End(FLAG_EMPTY)),
'b' => Ok(~WordBoundary(FLAG_EMPTY)),
'B' => Ok(~WordBoundary(FLAG_NEGATED)),
'0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())),
'x' => Ok(try!(self.parse_hex())),
'p' | 'P' => Ok(try!(self.parse_unicode_name())),
'd' | 'D' | 's' | 'S' | 'w' | 'W' => {
let ranges = perl_unicode_class(c);
let mut flags = self.flags & FLAG_NOCASE;
if c.is_uppercase() { flags |= FLAG_NEGATED }
Ok(~Class(ranges, flags))
}
_ => self.err(format!("Invalid escape sequence '\\\\{}'", c)),
}
}
// Parses a unicode character class name, either of the form \pF where
// F is a one letter unicode class name or of the form \p{name} where
// name is the unicode class name.
// Assumes that \p or \P has been read (and 'p' or 'P' is the current
// character).
fn parse_unicode_name(&mut self) -> Result<~Ast, Error> {
let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY };
let mut name: ~str;
if self.peek_is(1, '{') {
try!(self.expect('{'))
let closer =
match self.pos('}') {
Some(i) => i,
None => return self.err(format!(
"Missing '\\}' for unclosed '\\{' at position {}",
self.chari)),
};
if closer - self.chari + 1 == 0 {
return self.err("No Unicode class name found.")
}
name = self.slice(self.chari + 1, closer);
self.chari = closer;
} else {
if self.chari + 1 >= self.chars.len() {
return self.err("No single letter Unicode class name found.")
}
name = self.slice(self.chari + 1, self.chari + 2);
self.chari += 1;
}
match find_class(UNICODE_CLASSES, name) {
None => return self.err(format!(
"Could not find Unicode class '{}'", name)),
Some(ranges) => {
Ok(~Class(ranges, negated | (self.flags & FLAG_NOCASE)))
}
}
}
// Parses an octal number, up to 3 digits.
// Assumes that \n has been read, where n is the first digit.
fn parse_octal(&mut self) -> Result<~Ast, Error> {
let start = self.chari;
let mut end = start + 1;
let (d2, d3) = (self.peek(1), self.peek(2));
if d2 >= Some('0') && d2 <= Some('7') {
try!(self.noteof("expected octal character in [0-7]"))
end += 1;
if d3 >= Some('0') && d3 <= Some('7') {
try!(self.noteof("expected octal character in [0-7]"))
end += 1;
}
}
let s = self.slice(start, end);
match num::from_str_radix::<u32>(s, 8) {
Some(n) => Ok(~Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
None => self.err(format!(
"Could not parse '{}' as octal number.", s)),
}
}
// Parse a hex number. Either exactly two digits or anything in {}.
// Assumes that \x has been read.
fn parse_hex(&mut self) -> Result<~Ast, Error> {
if !self.peek_is(1, '{') {
try!(self.expect('{'))
return self.parse_hex_two()
}
let start = self.chari + 2;
let closer =
match self.pos('}') {
None => return self.err(format!(
"Missing '\\}' for unclosed '\\{' at position {}", start)),
Some(i) => i,
};
self.chari = closer;
self.parse_hex_digits(self.slice(start, closer))
}
// Parses a two-digit hex number.
// Assumes that \xn has been read, where n is the first digit and is the
// current character.
// After return, parser will point at the second digit.
fn parse_hex_two(&mut self) -> Result<~Ast, Error> {
let (start, end) = (self.chari, self.chari + 2);
let bad = self.slice(start - 2, self.chars.len());
try!(self.noteof(format!("Invalid hex escape sequence '{}'", bad)))
self.parse_hex_digits(self.slice(start, end))
}
// Parses `s` as a hexadecimal number.
fn parse_hex_digits(&self, s: &str) -> Result<~Ast, Error> {
match num::from_str_radix::<u32>(s, 16) {
Some(n) => Ok(~Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
None => self.err(format!(
"Could not parse '{}' as hex number.", s)),
}
}
// Parses a named capture.
// Assumes that '(?P<' has been consumed and that the current character
// is '<'.
// When done, parser will be at the closing '>' character.
fn parse_named_capture(&mut self) -> Result<(), Error> {
try!(self.noteof("a capture name"))
let closer =
match self.pos('>') {
Some(i) => i,
None => return self.err("Capture name must end with '>'."),
};
if closer - self.chari == 0 {
return self.err("Capture names must have at least 1 character.")
}
let name = self.slice(self.chari, closer);
if !name.chars().all(is_valid_cap) {
return self.err(
"Capture names can only have underscores, letters and digits.")
}
if self.names.contains(&name) {
return self.err(format!("Duplicate capture group name '{}'.", name))
}
self.names.push(name.clone());
self.chari = closer;
self.caps += 1;
self.stack.push(Paren(self.flags, self.caps, name));
Ok(())
}
// Parses non-capture groups and options.
// Assumes that '(?' has already been consumed and '?' is the current
// character.
fn parse_group_opts(&mut self) -> Result<(), Error> {
if self.peek_is(1, 'P') && self.peek_is(2, '<') {
try!(self.expect('P')) try!(self.expect('<'))
return self.parse_named_capture()
}
let start = self.chari;
let mut flags = self.flags;
let mut sign = 1;
let mut saw_flag = false;
loop {
try!(self.noteof("expected non-empty set of flags or closing ')'"))
match self.cur() {
'i' => { flags = flags | FLAG_NOCASE; saw_flag = true},
'm' => { flags = flags | FLAG_MULTI; saw_flag = true},
's' => { flags = flags | FLAG_DOTNL; saw_flag = true},
'U' => { flags = flags | FLAG_SWAP_GREED; saw_flag = true},
'-' => {
if sign < 0 {
return self.err(format!(
"Cannot negate flags twice in '{}'.",
self.slice(start, self.chari + 1)))
}
sign = -1;
saw_flag = false;
flags = flags ^ flags;
}
':' | ')' => {
if sign < 0 {
if !saw_flag {
return self.err(format!(
"A valid flag does not follow negation in '{}'",
self.slice(start, self.chari + 1)))
}
flags = flags ^ flags;
}
if self.cur() == ':' {
// Save the old flags with the opening paren.
self.stack.push(Paren(self.flags, 0, ~""));
}
self.flags = flags;
return Ok(())
}
_ => return self.err(format!(
"Unrecognized flag '{}'.", self.cur())),
}
}
}
// Peeks at the next character and returns whether it's ungreedy or not.
// If it is, then the next character is consumed.
fn get_next_greedy(&mut self) -> Result<Greed, Error> {
Ok(if self.peek_is(1, '?') {
try!(self.expect('?'))
Ungreedy
} else {
Greedy
}.swap(self.flags & FLAG_SWAP_GREED > 0))
}
// Searches the stack (starting at the top) until it finds an expression
// for which `pred` returns true. The index of that expression in the
// stack is returned.
// If there's no match, then one of two things happens depending on the
// values of `allow_start`. When it's true, then `0` will be returned.
// Otherwise, an error will be returned.
// Generally, `allow_start` is only true when you're *not* expecting an
// opening parenthesis.
fn pos_last(&self, allow_start: bool, pred: |&BuildAst| -> bool)
-> Result<uint, Error> {
let from = match self.stack.iter().rev().position(pred) {
Some(i) => i,
None => {
if allow_start {
self.stack.len()
} else {
return self.err("No matching opening parenthesis.")
}
}
};
// Adjust index since 'from' is for the reversed stack.
// Also, don't include the '(' or '|'.
Ok(self.stack.len() - from)
}
// concat starts at `from` in the parser's stack and concatenates all
// expressions up to the top of the stack. The resulting concatenation is
// then pushed on to the stack.
// Usually `from` corresponds to the position of an opening parenthesis,
// a '|' (alternation) or the start of the entire expression.
fn concat(&mut self, from: uint) -> Result<(), Error> {
let ast = try!(self.build_from(from, concat_flatten));
self.push(ast);
Ok(())
}
// concat starts at `from` in the parser's stack and alternates all
// expressions up to the top of the stack. The resulting alternation is
// then pushed on to the stack.
// Usually `from` corresponds to the position of an opening parenthesis
// or the start of the entire expression.
// This will also drop any opening parens or alternation bars found in
// the intermediate AST.
fn alternate(&mut self, mut from: uint) -> Result<(), Error> {
// Unlike in the concatenation case, we want 'build_from' to continue
// all the way to the opening left paren (so it will be popped off and
// thrown away). But be careful with overflow---we can't count on the
// open paren to be there.
if from > 0 { from = from - 1}
let ast = try!(self.build_from(from, Alt));
self.push(ast);
Ok(())
}
// build_from combines all AST elements starting at 'from' in the
// parser's stack using 'mk' to combine them. If any such element is not an
// AST then it is popped off the stack and ignored.
fn build_from(&mut self, from: uint, mk: |~Ast, ~Ast| -> Ast)
-> Result<~Ast, Error> {
if from >= self.stack.len() {
return self.err("Empty group or alternate not allowed.")
}
let mut combined = try!(self.pop_ast());
let mut i = self.stack.len();
while i > from {
i = i - 1;
match self.stack.pop().unwrap() {
Ast(x) => combined = ~mk(x, combined),
_ => {},
}
}
Ok(combined)
}
fn parse_uint(&self, s: &str) -> Result<uint, Error> {
match from_str::<uint>(s) {
Some(i) => Ok(i),
None => self.err(format!(
"Expected an unsigned integer but got '{}'.", s)),
}
}
fn char_from_u32(&self, n: u32) -> Result<char, Error> {
match char::from_u32(n) {
Some(c) => Ok(c),
None => self.err(format!(
"Could not decode '{}' to unicode character.", n)),
}
}
fn pos(&self, c: char) -> Option<uint> {
self.chars.iter()
.skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i)
}
fn err<T>(&self, msg: &str) -> Result<T, Error> {
Err(Error {
pos: self.chari,
msg: msg.to_owned(),
})
}
fn peek(&self, offset: uint) -> Option<char> {
if self.chari + offset >= self.chars.len() {
return None
}
Some(*self.chars.get(self.chari + offset))
}
fn peek_is(&self, offset: uint, is: char) -> bool {
self.peek(offset) == Some(is)
}
fn cur(&self) -> char {
*self.chars.get(self.chari)
}
fn slice(&self, start: uint, end: uint) -> ~str {
str::from_chars(self.chars.as_slice().slice(start, end))
}
}
// Given an unordered collection of character ranges, combine_ranges returns
// an ordered sequence of character ranges where no two ranges overlap. They
// are ordered from least to greatest (using start position).
fn combine_ranges(unordered: Vec<(char, char)>) -> Vec<(char, char)> {
// Returns true iff the two character classes overlap or share a boundary.
// e.g., ('a', 'g') and ('h', 'm') would return true.
fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool {
cmp::max(a, x) as u32 <= cmp::min(b, y) as u32 + 1
}
// This is currently O(n^2), but I think with sufficient cleverness,
// it can be reduced to O(n) **if necessary**.
let mut ordered: Vec<(char, char)> = Vec::with_capacity(unordered.len());
for (us, ue) in unordered.move_iter() {
let (mut us, mut ue) = (us, ue);
assert!(us <= ue);
let mut which: Option<uint> = None;
for (i, &(os, oe)) in ordered.iter().enumerate() {
if should_merge((us, ue), (os, oe)) {
us = cmp::min(us, os);
ue = cmp::max(ue, oe);
which = Some(i);
break
}
}
match which {
None => ordered.push((us, ue)),
Some(i) => *ordered.get_mut(i) = (us, ue),
}
}
ordered.sort();
ordered
}
// Constructs a Unicode friendly Perl character class from \d, \s or \w
// (or any of their negated forms). Note that this does not handle negation.
fn perl_unicode_class(which: char) -> Vec<(char, char)> {
match which.to_lowercase() {
'd' => Vec::from_slice(PERLD),
's' => Vec::from_slice(PERLS),
'w' => Vec::from_slice(PERLW),
_ => unreachable!(),
}
}
// Returns a concatenation of two expressions. This also guarantees that a
// `Cat` expression will never be a direct child of another `Cat` expression.
fn concat_flatten(x: ~Ast, y: ~Ast) -> Ast {
match (x, y) {
(~Cat(mut xs), ~Cat(ys)) => { xs.push_all_move(ys); Cat(xs) }
(~Cat(mut xs), ast) => { xs.push(ast); Cat(xs) }
(ast, ~Cat(mut xs)) => { xs.unshift(ast); Cat(xs) }
(ast1, ast2) => Cat(vec!(ast1, ast2)),
}
}
pub fn is_punct(c: char) -> bool {
match c {
'\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' |
'[' | ']' | '{' | '}' | '^' | '$' => true,
_ => false,
}
}
fn is_valid_cap(c: char) -> bool {
c == '_' || (c >= '0' && c <= '9')
|| (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
}
fn find_class(classes: NamedClasses, name: &str) -> Option<Vec<(char, char)>> {
match classes.bsearch(|&(s, _)| s.cmp(&name)) {
Some(i) => Some(Vec::from_slice(classes[i].val1())),
None => None,
}
}
type Class = &'static [(char, char)];
type NamedClasses = &'static [(&'static str, Class)];
static ASCII_CLASSES: NamedClasses = &[
// Classes must be in alphabetical order so that bsearch works.
// [:alnum:] alphanumeric (== [0-9A-Za-z])
// [:alpha:] alphabetic (== [A-Za-z])
// [:ascii:] ASCII (== [\x00-\x7F])
// [:blank:] blank (== [\t ])
// [:cntrl:] control (== [\x00-\x1F\x7F])
// [:digit:] digits (== [0-9])
// [:graph:] graphical (== [!-~])
// [:lower:] lower case (== [a-z])
// [:print:] printable (== [ -~] == [ [:graph:]])
// [:punct:] punctuation (== [!-/:-@[-`{-~])
// [:space:] whitespace (== [\t\n\v\f\r ])
// [:upper:] upper case (== [A-Z])
// [:word:] word characters (== [0-9A-Za-z_])
// [:xdigit:] hex digit (== [0-9A-Fa-f])
// Taken from: http://golang.org/pkg/regex/syntax/
("alnum", &[('0', '9'), ('A', 'Z'), ('a', 'z')]),
("alpha", &[('A', 'Z'), ('a', 'z')]),
("ascii", &[('\x00', '\x7F')]),
("blank", &[(' ', ' '), ('\t', '\t')]),
("cntrl", &[('\x00', '\x1F'), ('\x7F', '\x7F')]),
("digit", &[('0', '9')]),
("graph", &[('!', '~')]),
("lower", &[('a', 'z')]),
("print", &[(' ', '~')]),
("punct", &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]),
("space", &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), ('\x0C', '\x0C'),
('\r', '\r'), (' ', ' ')]),
("upper", &[('A', 'Z')]),
("word", &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]),
("xdigit", &[('0', '9'), ('A', 'F'), ('a', 'f')]),
];