diff --git a/src/libstd/str.rs b/src/libstd/str.rs index bc59164637e..1d8a2d404a7 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -596,17 +596,25 @@ pub fn is_utf8(v: &[u8]) -> bool { let mut i = 0u; let total = v.len(); while i < total { - let mut chsize = utf8_char_width(v[i]); - if chsize == 0u { return false; } - if i + chsize > total { return false; } - i += 1u; - while chsize > 1u { - if v[i] & 192u8 != TAG_CONT_U8 { return false; } + if v[i] < 128u8 { i += 1u; - chsize -= 1u; + } else { + let w = utf8_char_width(v[i]); + if w == 0u { return false; } + + let nexti = i + w; + if nexti > total { return false; } + + if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; } + if w > 2 { + if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; } + if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; } + } + + i = nexti; } } - return true; + true } /// Determines if a vector of `u16` contains valid UTF-16 diff --git a/src/test/run-pass/utf8_chars.rs b/src/test/run-pass/utf8_chars.rs index 4364bcc1274..556d7dd521c 100644 --- a/src/test/run-pass/utf8_chars.rs +++ b/src/test/run-pass/utf8_chars.rs @@ -27,9 +27,20 @@ pub fn main() { assert!(s.char_at(1u) == 'é'); assert!((str::is_utf8(s.as_bytes()))); + // invalid prefix assert!((!str::is_utf8(~[0x80_u8]))); + // invalid 2 byte prefix assert!((!str::is_utf8(~[0xc0_u8]))); assert!((!str::is_utf8(~[0xc0_u8, 0x10_u8]))); + // invalid 3 byte prefix + assert!((!str::is_utf8(~[0xe0_u8]))); + assert!((!str::is_utf8(~[0xe0_u8, 0x10_u8]))); + assert!((!str::is_utf8(~[0xe0_u8, 0xff_u8, 0x10_u8]))); + // invalid 4 byte prefix + assert!((!str::is_utf8(~[0xf0_u8]))); + assert!((!str::is_utf8(~[0xf0_u8, 0x10_u8]))); + assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0x10_u8]))); + assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0xff_u8, 0x10_u8]))); let mut stack = ~"a×c€"; assert_eq!(stack.pop_char(), '€');