auto merge of #8737 : blake2-ppc/rust/std-str-rsplit, r=huonw

Make CharSplitIterator double-ended which is simple given that the operation is symmetric, once the split-N feature is factored out into its own adaptor. `.rsplitn_iter()` allows splitting `N` times from the back of a string, so it is a completely new feature. With the double-ended impl, `.split_iter()`, `.line_iter()`, `.word_iter()` all allow picking off elements from either end. `split_options_iter` is removed with the factoring of the split- and split-N- iterators, instead there is `split_terminator_iter`. --- Add benchmarks using `#[bench]` and tune CharSplitIterator a bit after Huon Wilson's suggestions Benchmarks 1-5 do the same split using different implementations of `CharEq`, all splitting an ascii string on ascii space. Benchmarks 6-7 split a unicode string on an ascii char. Before this PR test str::bench::split_iter_ascii ... bench: 166 ns/iter (+/- 2) test str::bench::split_iter_closure ... bench: 113 ns/iter (+/- 1) test str::bench::split_iter_extern_fn ... bench: 286 ns/iter (+/- 7) test str::bench::split_iter_not_ascii ... bench: 114 ns/iter (+/- 4) test str::bench::split_iter_slice ... bench: 220 ns/iter (+/- 12) test str::bench::split_iter_unicode_ascii ... bench: 217 ns/iter (+/- 3) test str::bench::split_iter_unicode_not_ascii ... bench: 248 ns/iter (+/- 3) PR, first commit test str::bench::split_iter_ascii ... bench: 331 ns/iter (+/- 9) test str::bench::split_iter_closure ... bench: 114 ns/iter (+/- 2) test str::bench::split_iter_extern_fn ... bench: 314 ns/iter (+/- 6) test str::bench::split_iter_not_ascii ... bench: 132 ns/iter (+/- 1) test str::bench::split_iter_slice ... bench: 157 ns/iter (+/- 3) test str::bench::split_iter_unicode_ascii ... bench: 502 ns/iter (+/- 64) test str::bench::split_iter_unicode_not_ascii ... bench: 250 ns/iter (+/- 3) PR, final version test str::bench::split_iter_ascii ... bench: 106 ns/iter (+/- 4) test str::bench::split_iter_closure ... bench: 107 ns/iter (+/- 1) test str::bench::split_iter_extern_fn ... bench: 267 ns/iter (+/- 6) test str::bench::split_iter_not_ascii ... bench: 108 ns/iter (+/- 1) test str::bench::split_iter_slice ... bench: 170 ns/iter (+/- 8) test str::bench::split_iter_unicode_ascii ... bench: 128 ns/iter (+/- 5) test str::bench::split_iter_unicode_not_ascii ... bench: 252 ns/iter (+/- 3) --- There are several ways to deal with `CharEq::only_ascii`. It is a performance optimization, so with that in mind, we allow passing bogus char (outside ascii) as long as they don't match. We use a byte value check to make sure we don't split on these (would split substrings in the middle of encoded char). (A more principled way would be to only pass the ascii codepoints to the CharEq when it indicates only_ascii, but that undoes some of the performance optimization.)
2013-08-26 05:06:16 -07:00 · 2013-08-26 05:06:16 -07:00 · 540d98e7fc
commit 540d98e7fc
parent 501eea0842 4de9bca4d8
1 changed files with 307 additions and 75 deletions
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@ -23,7 +23,7 @@ use clone::{Clone, DeepClone};
 use container::{Container, Mutable};
 use iter::Times;
 use iterator::{Iterator, FromIterator, Extendable};
-use iterator::{Filter, AdditiveIterator, Map, Enumerate};
+use iterator::{Filter, AdditiveIterator, Map};
 use iterator::{Invert, DoubleEndedIterator};
 use libc;
 use num::{Saturating, Zero};
@ -388,28 +388,32 @@ pub type ByteIterator<'self> =
 /// Use with the `std::iterator` module.
 pub type ByteRevIterator<'self> = Invert<ByteIterator<'self>>;

-/// An iterator over byte index and either &u8 or char
-#[deriving(Clone)]
-enum OffsetIterator<'self> {
-    // use ByteIterator here when it can be cloned
-    ByteOffset(Enumerate<vec::VecIterator<'self, u8>>),
-    CharOffset(CharOffsetIterator<'self>),
-}
-
 /// An iterator over the substrings of a string, separated by `sep`.
 #[deriving(Clone)]
-pub struct CharSplitIterator<'self,Sep> {
-    priv iter: OffsetIterator<'self>,
+pub struct CharSplitIterator<'self, Sep> {
+    /// The slice remaining to be iterated
    priv string: &'self str,
-    priv position: uint,
    priv sep: Sep,
-    /// The number of splits remaining
-    priv count: uint,
    /// Whether an empty string at the end is allowed
    priv allow_trailing_empty: bool,
+    priv only_ascii: bool,
    priv finished: bool,
 }

+/// An iterator over the substrings of a string, separated by `sep`,
+/// starting from the back of the string.
+pub type CharRSplitIterator<'self, Sep> = Invert<CharSplitIterator<'self, Sep>>;
+
+/// An iterator over the substrings of a string, separated by `sep`,
+/// splitting at most `count` times.
+#[deriving(Clone)]
+pub struct CharSplitNIterator<'self, Sep> {
+    priv iter: CharSplitIterator<'self, Sep>,
+    /// The number of splits remaining
+    priv count: uint,
+    priv invert: bool,
+}
+
 /// An iterator over the words of a string, separated by an sequence of whitespace
 pub type WordIterator<'self> =
    Filter<'self, &'self str, CharSplitIterator<'self, extern "Rust" fn(char) -> bool>>;
@ -418,46 +422,101 @@ pub type WordIterator<'self> =
 pub type AnyLineIterator<'self> =
    Map<'self, &'self str, &'self str, CharSplitIterator<'self, char>>;

+impl<'self, Sep> CharSplitIterator<'self, Sep> {
+    #[inline]
+    fn get_end(&mut self) -> Option<&'self str> {
+        if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
+            self.finished = true;
+            Some(self.string)
+        } else {
+            None
+        }
+    }
+}
+
 impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitIterator<'self, Sep> {
    #[inline]
    fn next(&mut self) -> Option<&'self str> {
        if self.finished { return None }

-        let start = self.position;
-        let len = self.string.len();
-
-        if self.count > 0 {
-            match self.iter {
-                // this gives a *huge* speed up for splitting on ASCII
-                // characters (e.g. '\n' or ' ')
-                ByteOffset(ref mut iter) =>
-                    for (idx, &byte) in *iter {
-                        if self.sep.matches(byte as char) {
-                            self.position = idx + 1;
-                            self.count -= 1;
-                            return Some(unsafe {
-                                raw::slice_bytes(self.string, start, idx)
-                            })
-                        }
-                    },
-                CharOffset(ref mut iter) =>
-                    for (idx, ch) in *iter {
-                        if self.sep.matches(ch) {
-                            // skip over the separator
-                            self.position = self.string.char_range_at(idx).next;
-                            self.count -= 1;
-                            return Some(unsafe {
-                                raw::slice_bytes(self.string, start, idx)
-                            })
-                        }
-                    },
+        let mut next_split = None;
+        if self.only_ascii {
+            for (idx, byte) in self.string.byte_iter().enumerate() {
+                if self.sep.matches(byte as char) && byte < 128u8 {
+                    next_split = Some((idx, idx + 1));
+                    break;
+                }
+            }
+        } else {
+            for (idx, ch) in self.string.char_offset_iter() {
+                if self.sep.matches(ch) {
+                    next_split = Some((idx, self.string.char_range_at(idx).next));
+                    break;
+                }
            }
        }
-        self.finished = true;
-        if self.allow_trailing_empty || start < len {
-            Some(unsafe { raw::slice_bytes(self.string, start, len) })
+        match next_split {
+            Some((a, b)) => unsafe {
+                let elt = raw::slice_unchecked(self.string, 0, a);
+                self.string = raw::slice_unchecked(self.string, b, self.string.len());
+                Some(elt)
+            },
+            None => self.get_end(),
+        }
+    }
+}
+
+impl<'self, Sep: CharEq> DoubleEndedIterator<&'self str>
+for CharSplitIterator<'self, Sep> {
+    #[inline]
+    fn next_back(&mut self) -> Option<&'self str> {
+        if self.finished { return None }
+
+        if !self.allow_trailing_empty {
+            self.allow_trailing_empty = true;
+            match self.next_back() {
+                Some(elt) if !elt.is_empty() => return Some(elt),
+                _ => if self.finished { return None }
+            }
+        }
+        let len = self.string.len();
+        let mut next_split = None;
+
+        if self.only_ascii {
+            for (j, byte) in self.string.byte_rev_iter().enumerate() {
+                if self.sep.matches(byte as char) && byte < 128u8 {
+                    let idx = len - j - 1;
+                    next_split = Some((idx, idx + 1));
+                    break;
+                }
+            }
        } else {
-            None
+            for (idx, ch) in self.string.char_offset_rev_iter() {
+                if self.sep.matches(ch) {
+                    next_split = Some((idx, self.string.char_range_at(idx).next));
+                    break;
+                }
+            }
+        }
+        match next_split {
+            Some((a, b)) => unsafe {
+                let elt = raw::slice_unchecked(self.string, b, len);
+                self.string = raw::slice_unchecked(self.string, 0, a);
+                Some(elt)
+            },
+            None => { self.finished = true; Some(self.string) }
+        }
+    }
+}
+
+impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitNIterator<'self, Sep> {
+    #[inline]
+    fn next(&mut self) -> Option<&'self str> {
+        if self.count != 0 {
+            self.count -= 1;
+            if self.invert { self.iter.next_back() } else { self.iter.next() }
+        } else {
+            self.iter.get_end()
        }
    }
 }
@ -1300,9 +1359,10 @@ pub trait StrSlice<'self> {
    fn char_offset_iter(&self) -> CharOffsetIterator<'self>;
    fn char_offset_rev_iter(&self) -> CharOffsetRevIterator<'self>;
    fn split_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep>;
-    fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitIterator<'self, Sep>;
-    fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
-        -> CharSplitIterator<'self, Sep>;
+    fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitNIterator<'self, Sep>;
+    fn split_terminator_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep>;
+    fn rsplit_iter<Sep: CharEq>(&self, sep: Sep) -> CharRSplitIterator<'self, Sep>;
+    fn rsplitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitNIterator<'self, Sep>;
    fn matches_index_iter(&self, sep: &'self str) -> MatchesIndexIterator<'self>;
    fn split_str_iter(&self, &'self str) -> StrSplitIterator<'self>;
    fn line_iter(&self) -> CharSplitIterator<'self, char>;
@ -1439,40 +1499,78 @@ impl<'self> StrSlice<'self> for &'self str {
    /// ~~~
    #[inline]
    fn split_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep> {
-        self.split_options_iter(sep, self.len(), true)
+        CharSplitIterator {
+            string: *self,
+            only_ascii: sep.only_ascii(),
+            sep: sep,
+            allow_trailing_empty: true,
+            finished: false,
+        }
    }

    /// An iterator over substrings of `self`, separated by characters
    /// matched by `sep`, restricted to splitting at most `count`
    /// times.
    #[inline]
-    fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitIterator<'self, Sep> {
-        self.split_options_iter(sep, count, true)
+    fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint)
+        -> CharSplitNIterator<'self, Sep> {
+        CharSplitNIterator {
+            iter: self.split_iter(sep),
+            count: count,
+            invert: false,
+        }
    }

    /// An iterator over substrings of `self`, separated by characters
-    /// matched by `sep`, splitting at most `count` times, and
-    /// possibly not including the trailing empty substring, if it
-    /// exists.
+    /// matched by `sep`.
+    ///
+    /// Equivalent to `split_iter`, except that the trailing substring
+    /// is skipped if empty (terminator semantics).
+    ///
+    /// # Example
+    ///
+    /// ~~~ {.rust}
+    /// let v: ~[&str] = "A.B.".split_terminator_iter('.').collect();
+    /// assert_eq!(v, ~["A", "B"]);
+    /// ~~~
    #[inline]
-    fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
+    fn split_terminator_iter<Sep: CharEq>(&self, sep: Sep)
        -> CharSplitIterator<'self, Sep> {
-        let iter = if sep.only_ascii() {
-            ByteOffset(self.as_bytes().iter().enumerate())
-        } else {
-            CharOffset(self.char_offset_iter())
-        };
        CharSplitIterator {
-            iter: iter,
-            string: *self,
-            position: 0,
-            sep: sep,
-            count: count,
-            allow_trailing_empty: allow_trailing_empty,
-            finished: false,
+            allow_trailing_empty: false,
+            ..self.split_iter(sep)
        }
    }

+    /// An iterator over substrings of `self`, separated by characters
+    /// matched by `sep`, in reverse order
+    ///
+    /// # Example
+    ///
+    /// ~~~ {.rust}
+    /// let v: ~[&str] = "Mary had a little lamb".rsplit_iter(' ').collect();
+    /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
+    /// ~~~
+    #[inline]
+    fn rsplit_iter<Sep: CharEq>(&self, sep: Sep) -> CharRSplitIterator<'self, Sep> {
+        self.split_iter(sep).invert()
+    }
+
+    /// An iterator over substrings of `self`, separated by characters
+    /// matched by `sep`, starting from the end of the string.
+    /// Restricted to splitting at most `count` times.
+    #[inline]
+    fn rsplitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint)
+        -> CharSplitNIterator<'self, Sep> {
+        CharSplitNIterator {
+            iter: self.split_iter(sep),
+            count: count,
+            invert: true,
+        }
+    }
+
+
+
    /// An iterator over the start and end indices of each match of
    /// `sep` within `self`.
    #[inline]
@ -1506,7 +1604,7 @@ impl<'self> StrSlice<'self> for &'self str {
    /// by `\n`).
    #[inline]
    fn line_iter(&self) -> CharSplitIterator<'self, char> {
-        self.split_options_iter('\n', self.len(), false)
+        self.split_terminator_iter('\n')
    }

    /// An iterator over the lines of a string, separated by either
@ -3400,17 +3498,33 @@ mod tests {
        let data = "\nMäry häd ä little lämb\nLittle lämb\n";

        let split: ~[&str] = data.split_iter(' ').collect();
-        assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
+        assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
+
+        let mut rsplit: ~[&str] = data.rsplit_iter(' ').collect();
+        rsplit.reverse();
+        assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);

        let split: ~[&str] = data.split_iter(|c: char| c == ' ').collect();
-        assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
+        assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
+
+        let mut rsplit: ~[&str] = data.rsplit_iter(|c: char| c == ' ').collect();
+        rsplit.reverse();
+        assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);

        // Unicode
        let split: ~[&str] = data.split_iter('ä').collect();
-        assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
+        assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
+
+        let mut rsplit: ~[&str] = data.rsplit_iter('ä').collect();
+        rsplit.reverse();
+        assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);

        let split: ~[&str] = data.split_iter(|c: char| c == 'ä').collect();
-        assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
+        assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
+
+        let mut rsplit: ~[&str] = data.rsplit_iter(|c: char| c == 'ä').collect();
+        rsplit.reverse();
+        assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
    }

    #[test]
@ -3431,14 +3545,49 @@ mod tests {
        assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
    }

+    #[test]
+    fn test_rsplitn_char_iterator() {
+        let data = "\nMäry häd ä little lämb\nLittle lämb\n";
+
+        let mut split: ~[&str] = data.rsplitn_iter(' ', 3).collect();
+        split.reverse();
+        assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
+
+        let mut split: ~[&str] = data.rsplitn_iter(|c: char| c == ' ', 3).collect();
+        split.reverse();
+        assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
+
+        // Unicode
+        let mut split: ~[&str] = data.rsplitn_iter('ä', 3).collect();
+        split.reverse();
+        assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
+
+        let mut split: ~[&str] = data.rsplitn_iter(|c: char| c == 'ä', 3).collect();
+        split.reverse();
+        assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
+    }
+
    #[test]
    fn test_split_char_iterator_no_trailing() {
        let data = "\nMäry häd ä little lämb\nLittle lämb\n";

-        let split: ~[&str] = data.split_options_iter('\n', 1000, true).collect();
+        let split: ~[&str] = data.split_iter('\n').collect();
        assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);

-        let split: ~[&str] = data.split_options_iter('\n', 1000, false).collect();
+        let split: ~[&str] = data.split_terminator_iter('\n').collect();
+        assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
+    }
+
+    #[test]
+    fn test_rev_split_char_iterator_no_trailing() {
+        let data = "\nMäry häd ä little lämb\nLittle lämb\n";
+
+        let mut split: ~[&str] = data.split_iter('\n').invert().collect();
+        split.reverse();
+        assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
+
+        let mut split: ~[&str] = data.split_terminator_iter('\n').invert().collect();
+        split.reverse();
        assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
    }

@ -3701,6 +3850,89 @@ mod bench {
        }
    }

+    #[bench]
+    fn split_iter_unicode_ascii(bh: &mut BenchHarness) {
+        let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
+
+        do bh.iter {
+            assert_eq!(s.split_iter('V').len(), 3);
+        }
+    }
+
+    #[bench]
+    fn split_iter_unicode_not_ascii(bh: &mut BenchHarness) {
+        struct NotAscii(char);
+        impl CharEq for NotAscii {
+            fn matches(&self, c: char) -> bool {
+                **self == c
+            }
+            fn only_ascii(&self) -> bool { false }
+        }
+        let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
+
+        do bh.iter {
+            assert_eq!(s.split_iter(NotAscii('V')).len(), 3);
+        }
+    }
+
+
+    #[bench]
+    fn split_iter_ascii(bh: &mut BenchHarness) {
+        let s = "Mary had a little lamb, Little lamb, little-lamb.";
+        let len = s.split_iter(' ').len();
+
+        do bh.iter {
+            assert_eq!(s.split_iter(' ').len(), len);
+        }
+    }
+
+    #[bench]
+    fn split_iter_not_ascii(bh: &mut BenchHarness) {
+        struct NotAscii(char);
+        impl CharEq for NotAscii {
+            #[inline]
+            fn matches(&self, c: char) -> bool { **self == c }
+            fn only_ascii(&self) -> bool { false }
+        }
+        let s = "Mary had a little lamb, Little lamb, little-lamb.";
+        let len = s.split_iter(' ').len();
+
+        do bh.iter {
+            assert_eq!(s.split_iter(NotAscii(' ')).len(), len);
+        }
+    }
+
+    #[bench]
+    fn split_iter_extern_fn(bh: &mut BenchHarness) {
+        let s = "Mary had a little lamb, Little lamb, little-lamb.";
+        let len = s.split_iter(' ').len();
+        fn pred(c: char) -> bool { c == ' ' }
+
+        do bh.iter {
+            assert_eq!(s.split_iter(pred).len(), len);
+        }
+    }
+
+    #[bench]
+    fn split_iter_closure(bh: &mut BenchHarness) {
+        let s = "Mary had a little lamb, Little lamb, little-lamb.";
+        let len = s.split_iter(' ').len();
+
+        do bh.iter {
+            assert_eq!(s.split_iter(|c: char| c == ' ').len(), len);
+        }
+    }
+
+    #[bench]
+    fn split_iter_slice(bh: &mut BenchHarness) {
+        let s = "Mary had a little lamb, Little lamb, little-lamb.";
+        let len = s.split_iter(' ').len();
+
+        do bh.iter {
+            assert_eq!(s.split_iter(&[' ']).len(), len);
+        }
+    }
+
    #[bench]
    fn is_utf8_100_ascii(bh: &mut BenchHarness) {