From 5aee5a11e3d4807c6df190e33cc6c4dc81ef7ea3 Mon Sep 17 00:00:00 2001
From: Gary Linscott
Date: Wed, 10 Jul 2013 17:06:16 -0400
Subject: [PATCH] Optimize is_utf8

Manually unroll the multibyte loops, and optimize for the single byte chars.
---
 src/libstd/str.rs               | 24 ++++++++++++++++--------
 src/test/run-pass/utf8_chars.rs | 11 +++++++++++
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index bc59164637e..1d8a2d404a7 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -596,17 +596,25 @@ pub fn is_utf8(v: &[u8]) -> bool {
     let mut i = 0u;
     let total = v.len();
     while i < total {
-        let mut chsize = utf8_char_width(v[i]);
-        if chsize == 0u { return false; }
-        if i + chsize > total { return false; }
-        i += 1u;
-        while chsize > 1u {
-            if v[i] & 192u8 != TAG_CONT_U8 { return false; }
+        if v[i] < 128u8 {
             i += 1u;
-            chsize -= 1u;
+        } else {
+            let w = utf8_char_width(v[i]);
+            if w == 0u { return false; }
+
+            let nexti = i + w;
+            if nexti > total { return false; }
+
+            if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
+            if w > 2 {
+                if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
+                if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; }
+            }
+
+            i = nexti;
         }
     }
-    return true;
+    true
 }
 
 /// Determines if a vector of `u16` contains valid UTF-16
diff --git a/src/test/run-pass/utf8_chars.rs b/src/test/run-pass/utf8_chars.rs
index 4364bcc1274..556d7dd521c 100644
--- a/src/test/run-pass/utf8_chars.rs
+++ b/src/test/run-pass/utf8_chars.rs
@@ -27,9 +27,20 @@ pub fn main() {
     assert!(s.char_at(1u) == 'é');
 
     assert!((str::is_utf8(s.as_bytes())));
+    // invalid prefix
     assert!((!str::is_utf8(~[0x80_u8])));
+    // invalid 2 byte prefix
     assert!((!str::is_utf8(~[0xc0_u8])));
     assert!((!str::is_utf8(~[0xc0_u8, 0x10_u8])));
+    // invalid 3 byte prefix
+    assert!((!str::is_utf8(~[0xe0_u8])));
+    assert!((!str::is_utf8(~[0xe0_u8, 0x10_u8])));
+    assert!((!str::is_utf8(~[0xe0_u8, 0xff_u8, 0x10_u8])));
+    // invalid 4 byte prefix
+    assert!((!str::is_utf8(~[0xf0_u8])));
+    assert!((!str::is_utf8(~[0xf0_u8, 0x10_u8])));
+    assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0x10_u8])));
+    assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0xff_u8, 0x10_u8])));
 
     let mut stack = ~"a×c€";
     assert_eq!(stack.pop_char(), '€');