Tweak from_utf8_lossy to return a new MaybeOwned enum

MaybeOwned allows from_utf8_lossy to avoid allocation if there are no invalid bytes in the input.
2014-02-07 14:58:37 -08:00 · 2014-02-07 14:58:37 -08:00 · 28467f5d19
commit 28467f5d19
parent dde2e0b386
2 changed files with 96 additions and 29 deletions
--- a/src/libstd/path/mod.rs
+++ b/src/libstd/path/mod.rs
@ -508,10 +508,10 @@ impl<'a, P: GenericPath> ToStr for Display<'a, P> {
        if self.filename {
            match self.path.filename() {
                None => ~"",
-                Some(v) => str::from_utf8_lossy(v)
+                Some(v) => str::from_utf8_lossy(v).into_owned()
            }
        } else {
-            str::from_utf8_lossy(self.path.as_vec())
+            str::from_utf8_lossy(self.path.as_vec()).into_owned()
        }
    }
 }
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@ -729,6 +729,11 @@ Section: Misc

 /// Determines if a vector of bytes contains valid UTF-8
 pub fn is_utf8(v: &[u8]) -> bool {
+    first_non_utf8_index(v).is_none()
+}
+
+#[inline(always)]
+fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
    let mut i = 0u;
    let total = v.len();
    fn unsafe_get(xs: &[u8], i: uint) -> u8 {
@ -740,10 +745,10 @@ pub fn is_utf8(v: &[u8]) -> bool {
            i += 1u;
        } else {
            let w = utf8_char_width(v_i);
-            if w == 0u { return false; }
+            if w == 0u { return Some(i); }

            let nexti = i + w;
-            if nexti > total { return false; }
+            if nexti > total { return Some(i); }

            // 2-byte encoding is for codepoints  \u0080 to  \u07ff
            //        first  C2 80        last DF BF
@ -766,7 +771,7 @@ pub fn is_utf8(v: &[u8]) -> bool {
            // UTF8-tail   = %x80-BF
            match w {
                2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
-                    return false
+                    return Some(i)
                },
                3 => match (v_i,
                            unsafe_get(v, i + 1),
@ -775,7 +780,7 @@ pub fn is_utf8(v: &[u8]) -> bool {
                    (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
                    (0xED        , 0x80 .. 0x9F, TAG_CONT_U8) => (),
                    (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
-                    _ => return false,
+                    _ => return Some(i),
                },
                _ => match (v_i,
                            unsafe_get(v, i + 1),
@ -784,14 +789,14 @@ pub fn is_utf8(v: &[u8]) -> bool {
                    (0xF0        , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
                    (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
                    (0xF4        , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
-                    _ => return false,
+                    _ => return Some(i)
                },
            }

            i = nexti;
        }
    }
-    true
+    None
 }

 /// Determines if a vector of `u16` contains valid UTF-16
@ -910,6 +915,53 @@ macro_rules! utf8_acc_cont_byte(

 static TAG_CONT_U8: u8 = 128u8;

+/// Enum that represents either a borrowed or an owned string.
+#[deriving(Eq,Clone)]
+pub enum MaybeOwned<'a> {
+    /// A borrowed string
+    Slice(&'a str),
+    /// An owned string
+    Owned(~str)
+}
+
+impl<'a> Str for MaybeOwned<'a> {
+    #[inline]
+    fn as_slice<'b>(&'b self) -> &'b str {
+        match *self {
+            Slice(s) => s,
+            Owned(ref s) => s.as_slice()
+        }
+    }
+
+    #[inline]
+    fn into_owned(self) -> ~str {
+        match self {
+            Slice(s) => s.to_owned(),
+            Owned(s) => s
+        }
+    }
+}
+
+impl<'a> ToStr for MaybeOwned<'a> {
+    #[inline]
+    fn to_str(&self) -> ~str {
+        match *self {
+            Slice(s) => s.to_str(),
+            Owned(ref s) => s.clone()
+        }
+    }
+}
+
+impl<'a> ::fmt::Show for MaybeOwned<'a> {
+    #[inline]
+    fn fmt(mo: &MaybeOwned, f: &mut ::fmt::Formatter) -> ::fmt::Result {
+        match *mo {
+            Slice(ref s) => ::fmt::Show::fmt(s, f),
+            Owned(ref s) => ::fmt::Show::fmt(&s.as_slice(), f)
+        }
+    }
+}
+
 /// Converts a vector of bytes to a utf-8 string, allocating only if the input is not already valid.
 /// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
 ///
@ -918,12 +970,16 @@ static TAG_CONT_U8: u8 = 128u8;
 /// ```rust
 /// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
 /// let output = std::str::from_utf8_lossy(input);
-/// assert_eq!(output, ~"Hello \uFFFDWorld");
+/// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
 /// ```
-pub fn from_utf8_lossy(v: &[u8]) -> ~str {
+pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
+    let firstbad = match first_non_utf8_index(v) {
+        None => return Slice(unsafe { cast::transmute(v) }),
+        Some(i) => i
+    };
+
    static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
-    let mut i = 0u;
-    let mut lastgood = 0u;
+    let mut i = firstbad;
    let total = v.len();
    fn unsafe_get(xs: &[u8], i: uint) -> u8 {
        unsafe { *xs.unsafe_ref(i) }
@ -937,23 +993,32 @@ pub fn from_utf8_lossy(v: &[u8]) -> ~str {
    }
    let mut res = with_capacity(total);

+    if i > 0 {
+        unsafe { raw::push_bytes(&mut res, v.slice_to(i)) };
+    }
+
+    // subseqidx is the index of the first input byte that has not yet been flushed to res.
+    // The good bytes from there on are copied with a single push once the next invalid
+    // sequence (or the end of the input) is reached, instead of one codepoint at a time.
+    let mut subseqidx = firstbad;
+
    while i < total {
        let i_ = i;
        let byte = unsafe_get(v, i);
        i += 1;

-        macro_rules! error(() => {
+        macro_rules! error(() => ({
            unsafe {
-                if lastgood != i_ {
-                    raw::push_bytes(&mut res, v.slice(lastgood, i_));
+                if subseqidx != i_ {
+                    raw::push_bytes(&mut res, v.slice(subseqidx, i_));
                }
-                lastgood = i;
+                subseqidx = i;
                raw::push_bytes(&mut res, REPLACEMENT);
            }
-        })
+        }))

        if byte < 128u8 {
-            // lastgood handles this
+            // subseqidx handles this
        } else {
            let w = utf8_char_width(byte);

@ -1012,8 +1077,10 @@ pub fn from_utf8_lossy(v: &[u8]) -> ~str {
            }
        }
    }
-    unsafe { raw::push_bytes(&mut res, v.slice(lastgood, total)) };
-    res
+    if subseqidx < total {
+        unsafe { raw::push_bytes(&mut res, v.slice(subseqidx, total)) };
+    }
+    Owned(res)
 }

 /// Unsafe operations
@ -3943,32 +4010,32 @@ mod tests {
    #[test]
    fn test_str_from_utf8_lossy() {
        let xs = bytes!("hello");
-        assert_eq!(from_utf8_lossy(xs), ~"hello");
+        assert_eq!(from_utf8_lossy(xs), Slice("hello"));

        let xs = bytes!("ศไทย中华Việt Nam");
-        assert_eq!(from_utf8_lossy(xs), ~"ศไทย中华Việt Nam");
+        assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));

        let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
-        assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD There\uFFFD Goodbye");
+        assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD There\uFFFD Goodbye"));

        let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
-        assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD\uFFFD There\uFFFD Goodbye");
+        assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD\uFFFD There\uFFFD Goodbye"));

        let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
-        assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFD\uFFFDbar");
+        assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFD\uFFFDbar"));

        let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
-        assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFDbaz");
+        assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFDbaz"));

        let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
-        assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz");
+        assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz"));

        let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
-        assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar");
+        assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"));

        // surrogates
        let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
-        assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar");
+        assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar"));
    }

    #[test]