diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs index d355421039e..e86cf462cab 100644 --- a/src/libcollections/str.rs +++ b/src/libcollections/str.rs @@ -2893,22 +2893,6 @@ mod bench { b.iter(|| assert_eq!(s.split('V').count(), 3)); } - #[bench] - fn split_unicode_not_ascii(b: &mut Bencher) { - struct NotAscii(char); - impl CharEq for NotAscii { - fn matches(&mut self, c: char) -> bool { - let NotAscii(cc) = *self; - cc == c - } - fn only_ascii(&self) -> bool { false } - } - let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam"; - - b.iter(|| assert_eq!(s.split(NotAscii('V')).count(), 3)); - } - - #[bench] fn split_ascii(b: &mut Bencher) { let s = "Mary had a little lamb, Little lamb, little-lamb."; @@ -2917,23 +2901,6 @@ mod bench { b.iter(|| assert_eq!(s.split(' ').count(), len)); } - #[bench] - fn split_not_ascii(b: &mut Bencher) { - struct NotAscii(char); - impl CharEq for NotAscii { - #[inline] - fn matches(&mut self, c: char) -> bool { - let NotAscii(cc) = *self; - cc == c - } - fn only_ascii(&self) -> bool { false } - } - let s = "Mary had a little lamb, Little lamb, little-lamb."; - let len = s.split(' ').count(); - - b.iter(|| assert_eq!(s.split(NotAscii(' ')).count(), len)); - } - #[bench] fn split_extern_fn(b: &mut Bencher) { let s = "Mary had a little lamb, Little lamb, little-lamb."; diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index a9308302033..820ad4d8586 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -156,7 +156,6 @@ impl FromStr for bool { /// An error returned when parsing a `bool` from a string fails. 
#[derive(Debug, Clone, PartialEq)] -#[allow(missing_copy_implementations)] #[stable(feature = "rust1", since = "1.0.0")] pub struct ParseBoolError { _priv: () } @@ -235,7 +234,7 @@ pub unsafe fn from_utf8_unchecked<'a>(v: &'a [u8]) -> &'a str { pub unsafe fn from_c_str(s: *const i8) -> &'static str { let s = s as *const u8; let mut len = 0; - while *s.offset(len as int) != 0 { + while *s.offset(len as isize) != 0 { len += 1; } let v: &'static [u8] = ::mem::transmute(Slice { data: s, len: len }); @@ -258,7 +257,7 @@ impl CharEq for char { fn matches(&mut self, c: char) -> bool { *self == c } #[inline] - fn only_ascii(&self) -> bool { (*self as usize) < 128 } + fn only_ascii(&self) -> bool { (*self as u32) < 128 } } impl<F> CharEq for F where F: FnMut(char) -> bool { @@ -764,7 +763,8 @@ impl TwoWaySearcher { // How far we can jump when we encounter a mismatch is all based on the fact // that (u, v) is a critical factorization for the needle. 
#[inline] fn eq_slice_(a: &str, b: &str) -> bool { + // NOTE: In theory n should be libc::size_t and not usize, but libc is not available here #[allow(improper_ctypes)] extern { fn memcmp(s1: *const i8, s2: *const i8, n: usize) -> i32; } a.len() == b.len() && unsafe { @@ -1489,7 +1490,7 @@ impl StrExt for str { fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str where P::Searcher: DoubleEndedSearcher<'a> { let mut i = 0; - let mut j = self.len(); + let mut j = 0; let mut matcher = pat.into_searcher(self); if let Some((a, b)) = matcher.next_reject() { i = a; @@ -1507,7 +1508,7 @@ impl StrExt for str { #[inline] fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { - let mut i = 0; + let mut i = self.len(); let mut matcher = pat.into_searcher(self); if let Some((a, _)) = matcher.next_reject() { i = a; @@ -1521,7 +1522,7 @@ impl StrExt for str { #[inline] fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str where P::Searcher: ReverseSearcher<'a> { - let mut j = self.len(); + let mut j = 0; let mut matcher = pat.into_searcher(self); if let Some((_, b)) = matcher.next_reject_back() { j = b; diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 501fc27b376..9cd5510db37 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -58,6 +58,7 @@ pub trait Pattern<'a>: Sized { // Searcher +#[derive(Copy, Clone, Eq, PartialEq, Debug)] pub enum SearchStep { Match(usize, usize), Reject(usize, usize), @@ -190,7 +191,7 @@ impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {} // Impl for &str -// TODO: Optimize the naive implementation here +// Todo: Optimize the naive implementation here #[derive(Clone)] pub struct StrSearcher<'a, 'b> { @@ -235,13 +236,16 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { }, |m: &mut StrSearcher| { // Forward step for nonempty needle - let possible_match = &m.haystack[m.start .. 
m.start + m.needle.len()]; + // Compare if bytes are equal + let possible_match = &m.haystack.as_bytes()[m.start .. m.start + m.needle.len()]; let current_start = m.start; - if possible_match == m.needle { + if possible_match == m.needle.as_bytes() { m.start += m.needle.len(); SearchStep::Match(current_start, m.start) } else { - m.start += possible_match.chars().next().unwrap().len_utf8(); + // Skip a char + let haystack_suffix = &m.haystack[m.start..]; + m.start += haystack_suffix.chars().next().unwrap().len_utf8(); SearchStep::Reject(current_start, m.start) } }) @@ -262,13 +266,16 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { }, |m: &mut StrSearcher| { // Backward step for nonempty needle - let possible_match = &m.haystack[m.end - m.needle.len() .. m.end]; + // Compare if bytes are equal + let possible_match = &m.haystack.as_bytes()[m.end - m.needle.len() .. m.end]; let current_end = m.end; - if possible_match == m.needle { + if possible_match == m.needle.as_bytes() { m.end -= m.needle.len(); SearchStep::Match(m.end, current_end) } else { - m.end -= possible_match.chars().rev().next().unwrap().len_utf8(); + // Skip a char + let haystack_prefix = &m.haystack[..m.end]; + m.end -= haystack_prefix.chars().rev().next().unwrap().len_utf8(); SearchStep::Reject(m.end, current_end) } }) @@ -290,6 +297,9 @@ where F: FnOnce(&mut StrSearcher) -> SearchStep, } else if m.start + m.needle.len() <= m.end { // Case for needle != "" g(&mut m) + } else if m.start < m.end { + m.done = true; + SearchStep::Reject(m.start, m.end) } else { m.done = true; SearchStep::Done @@ -352,7 +362,8 @@ impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { use ops::Deref; -impl<'a, 'b, P: 'b + ?Sized, T: Deref<Target = P> + ?Sized> Pattern<'a> for &'b T where &'b P: Pattern<'a> { +impl<'a, 'b, P: 'b + ?Sized, T: Deref<Target = P> + ?Sized> Pattern<'a> for &'b T +where &'b P: Pattern<'a> { type Searcher = <&'b P as Pattern<'a>>::Searcher; associated_items!(<&'b P as Pattern<'a>>::Searcher, s, 
(&**s)); diff --git a/src/libcoretest/str.rs b/src/libcoretest/str.rs index acd8cc42c72..beb746d25b6 100644 --- a/src/libcoretest/str.rs +++ b/src/libcoretest/str.rs @@ -1,4 +1,4 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -139,8 +139,150 @@ fn test_utf16_code_units() { vec![0xE9, 0xD83D, 0xDCA9]) } +#[test] +fn starts_with_in_unicode() { + assert!(!"├── Cargo.toml".starts_with("# ")); +} -// rm x86_64-unknown-linux-gnu/stage1/test/coretesttest-x86_64-unknown-linux-gnu; env PLEASE_BENCH=1 make check-stage1-coretest TESTNAME=str::bench +#[test] +fn starts_short_long() { + assert!(!"".starts_with("##")); + assert!(!"##".starts_with("####")); + assert!("####".starts_with("##")); + assert!(!"##ä".starts_with("####")); + assert!("####ä".starts_with("##")); + assert!(!"##".starts_with("####ä")); + assert!("##ä##".starts_with("##ä")); + + assert!("".starts_with("")); + assert!("ä".starts_with("")); + assert!("#ä".starts_with("")); + assert!("##ä".starts_with("")); + assert!("ä###".starts_with("")); + assert!("#ä##".starts_with("")); + assert!("##ä#".starts_with("")); +} + +#[test] +fn contains_weird_cases() { + assert!("* \t".contains_char(' ')); + assert!(!"* \t".contains_char('?')); + assert!(!"* \t".contains_char('\u{1F4A9}')); +} + +#[test] +fn trim_ws() { + assert_eq!(" \t a \t ".trim_left_matches(|c: char| c.is_whitespace()), + "a \t "); + assert_eq!(" \t a \t ".trim_right_matches(|c: char| c.is_whitespace()), + " \t a"); + assert_eq!(" \t a \t ".trim_matches(|c: char| c.is_whitespace()), + "a"); + assert_eq!(" \t \t ".trim_left_matches(|c: char| c.is_whitespace()), + ""); + assert_eq!(" \t \t ".trim_right_matches(|c: char| c.is_whitespace()), + ""); + assert_eq!(" \t \t ".trim_matches(|c: char| c.is_whitespace()), + ""); +} + +mod pattern { + use std::str::Pattern; + 
use std::str::{Searcher, ReverseSearcher, DoubleEndedSearcher}; + use std::str::SearchStep::{self, Match, Reject, Done}; + + macro_rules! make_test { + ($name:ident, $p:expr, $h:expr, [$($e:expr,)*]) => { + mod $name { + use std::str::Pattern; + use std::str::{Searcher, ReverseSearcher, DoubleEndedSearcher}; + use std::str::SearchStep::{self, Match, Reject, Done}; + use super::{cmp_search_to_vec}; + #[test] + fn fwd() { + cmp_search_to_vec(false, $p, $h, vec![$($e),*]); + } + #[test] + fn bwd() { + cmp_search_to_vec(true, $p, $h, vec![$($e),*]); + } + } + } + } + + fn cmp_search_to_vec<'a, P: Pattern<'a>>(rev: bool, pat: P, haystack: &'a str, + right: Vec<SearchStep>) + where P::Searcher: ReverseSearcher<'a> + { + let mut searcher = pat.into_searcher(haystack); + let mut v = vec![]; + loop { + match if !rev {searcher.next()} else {searcher.next_back()} { + Match(a, b) => v.push(Match(a, b)), + Reject(a, b) => v.push(Reject(a, b)), + Done => break, + } + } + if rev { + v.reverse(); + } + assert_eq!(v, right); + } + + make_test!(str_searcher_ascii_haystack, "bb", "abbcbbd", [ + Reject(0, 1), + Match (1, 3), + Reject(3, 4), + Match (4, 6), + Reject(6, 7), + ]); + make_test!(str_searcher_empty_needle_ascii_haystack, "", "abbcbbd", [ + Match(0, 0), + Match(1, 1), + Match(2, 2), + Match(3, 3), + Match(4, 4), + Match(5, 5), + Match(6, 6), + Match(7, 7), + ]); + make_test!(str_searcher_mulibyte_haystack, " ", "├──", [ + Reject(0, 3), + Reject(3, 6), + Reject(6, 9), + ]); + make_test!(str_searcher_empty_needle_mulibyte_haystack, "", "├──", [ + Match(0, 0), + Match(3, 3), + Match(6, 6), + Match(9, 9), + ]); + make_test!(str_searcher_empty_needle_empty_haystack, "", "", [ + Match(0, 0), + ]); + make_test!(str_searcher_nonempty_needle_empty_haystack, "├", "", [ + ]); + make_test!(char_searcher_ascii_haystack, 'b', "abbcbbd", [ + Reject(0, 1), + Match (1, 2), + Match (2, 3), + Reject(3, 4), + Match (4, 5), + Match (5, 6), + Reject(6, 7), + ]); + make_test!(char_searcher_mulibyte_haystack, 
' ', "├──", [ + Reject(0, 3), + Reject(3, 6), + Reject(6, 9), + ]); + make_test!(char_searcher_short_haystack, '\u{1F4A9}', "* \t", [ + Reject(0, 1), + Reject(1, 2), + Reject(2, 3), + ]); + +} mod bench { macro_rules! make_test_inner {