From 5aee5a11e3d4807c6df190e33cc6c4dc81ef7ea3 Mon Sep 17 00:00:00 2001
From: Gary Linscott <glinscott@gmail.com>
Date: Wed, 10 Jul 2013 17:06:16 -0400
Subject: [PATCH] Optimize is_utf8

Manually unroll the continuation-byte checks for multibyte sequences,
and add a fast path for single-byte (ASCII) chars.
---
 src/libstd/str.rs               | 24 ++++++++++++++++--------
 src/test/run-pass/utf8_chars.rs | 11 +++++++++++
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index bc59164637e..1d8a2d404a7 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -596,17 +596,25 @@ pub fn is_utf8(v: &[u8]) -> bool {
     let mut i = 0u;
     let total = v.len();
     while i < total {
-        let mut chsize = utf8_char_width(v[i]);
-        if chsize == 0u { return false; }
-        if i + chsize > total { return false; }
-        i += 1u;
-        while chsize > 1u {
-            if v[i] & 192u8 != TAG_CONT_U8 { return false; }
+        if v[i] < 128u8 {
             i += 1u;
-            chsize -= 1u;
+        } else {
+            let w = utf8_char_width(v[i]);
+            if w == 0u { return false; }
+
+            let nexti = i + w;
+            if nexti > total { return false; }
+
+            if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
+            if w > 2 {
+                if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
+                if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; }
+            }
+
+            i = nexti;
         }
     }
-    return true;
+    true
 }
 
 /// Determines if a vector of `u16` contains valid UTF-16
diff --git a/src/test/run-pass/utf8_chars.rs b/src/test/run-pass/utf8_chars.rs
index 4364bcc1274..556d7dd521c 100644
--- a/src/test/run-pass/utf8_chars.rs
+++ b/src/test/run-pass/utf8_chars.rs
@@ -27,9 +27,20 @@ pub fn main() {
     assert!(s.char_at(1u) == 'é');
 
     assert!((str::is_utf8(s.as_bytes())));
+    // invalid prefix
     assert!((!str::is_utf8(~[0x80_u8])));
+    // invalid 2 byte prefix
     assert!((!str::is_utf8(~[0xc0_u8])));
     assert!((!str::is_utf8(~[0xc0_u8, 0x10_u8])));
+    // invalid 3 byte prefix
+    assert!((!str::is_utf8(~[0xe0_u8])));
+    assert!((!str::is_utf8(~[0xe0_u8, 0x10_u8])));
+    assert!((!str::is_utf8(~[0xe0_u8, 0xff_u8, 0x10_u8])));
+    // invalid 4 byte prefix
+    assert!((!str::is_utf8(~[0xf0_u8])));
+    assert!((!str::is_utf8(~[0xf0_u8, 0x10_u8])));
+    assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0x10_u8])));
+    assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0xff_u8, 0x10_u8])));
 
     let mut stack = ~"a×c€";
     assert_eq!(stack.pop_char(), '€');