From 4c4a5f734d300a8e2dffe43c0c391e7317c47bc6 Mon Sep 17 00:00:00 2001
From: Kevin Cantu <me@kevincantu.org>
Date: Tue, 24 Jan 2012 01:29:45 -0800
Subject: [PATCH] Reorganizing str.rs to group and document strings better (no
 functional changes, though FIXMEs added)

---
 src/libcore/str.rs | 1931 ++++++++++++++++++++++++--------------------
 1 file changed, 1046 insertions(+), 885 deletions(-)

diff --git a/src/libcore/str.rs b/src/libcore/str.rs
index fa1aca29efa..823f7cc78c6 100644
--- a/src/libcore/str.rs
+++ b/src/libcore/str.rs
@@ -1,196 +1,118 @@
 /*
 Module: str
 
-String manipulation.
+String manipulation
+
+Strings are a packed UTF-8 representation of text, stored as null-terminated
+buffers of u8 bytes.  For correctness, strings should be processed character
+by character, but some functions that are not UTF-8 safe are also provided.
+For some heavy-duty uses, we recommend trying std::rope.
 */
 
-export eq, lteq, hash, is_empty, is_not_empty, is_whitespace, byte_len,
-       byte_len_range, index,
-       rindex, find, starts_with, ends_with, substr, slice, split, splitn,
-       split_str, split_func, split_char, lines, lines_any, words,
-       concat, connect, to_lower, to_upper, replace, char_slice,
-       trim_left, trim_right, trim, unshift_char, shift_char, pop_char,
-       push_char, is_utf8, from_chars, to_chars, char_len, char_len_range,
-       char_at, bytes, is_ascii, shift_byte, pop_byte,
-       unsafe_from_byte, unsafe_from_bytes, from_char, char_range_at,
-       from_bytes,
-       from_cstr, sbuf, as_buf, push_byte, utf8_char_width, safe_slice,
-       contains, iter_chars, chars_iter, bytes_iter, words_iter, lines_iter,
-       loop_chars, loop_chars_sub, escape, any, all, map, windowed;
+export
+   // Creating a string
+   from_bytes,
+   unsafe_from_bytes,
+   unsafe_from_byte,
+   //push_utf8_bytes,
+   from_char,
+   from_chars,
+   from_cstr,
+   concat,
+   connect,
+
+   // Adding things to and removing things from a string
+   push_char,
+   pop_char,
+   shift_char,
+   unshift_char,
+   push_byte,
+   //push_bytes,
+   pop_byte,
+   shift_byte,
+   trim_left,
+   trim_right,
+   trim,
+
+   // Transforming strings
+   bytes,
+   to_chars,
+   substr,
+   char_slice,
+   slice,
+   safe_slice,
+   split,
+   splitn,
+   split_str,
+   split_func,
+   split_char,
+   lines,
+   lines_any,
+   words,
+   windowed,
+   to_lower,
+   to_upper,
+   replace,
+   escape,
+
+   // Comparing strings
+   eq,
+   lteq,
+   hash,
+
+   // Iterating through strings
+   loop_chars,
+   all,
+   any,
+   map,
+   bytes_iter,
+   iter_chars,
+   chars_iter,
+   words_iter,
+   lines_iter,
+
+   // Searching
+   index,
+   rindex,
+   find,
+   contains,
+   starts_with,
+   ends_with,
+
+   // String properties
+   is_ascii,
+   is_empty,
+   is_not_empty,
+   is_whitespace,
+   byte_len,
+   char_len,
+
+   // Misc
+   // FIXME: perhaps some more of this section shouldn't be exported?
+   is_utf8,
+   char_len_range,
+   byte_len_range,
+   utf8_char_width,
+   char_range_at,
+   char_at,
+   loop_chars_sub,
+   escape_char,
+   as_buf,
+   //buf,
+   sbuf;
+
+
 
 #[abi = "cdecl"]
 native mod rustrt {
     fn rust_str_push(&s: str, ch: u8);
 }
 
-/*
-Function: eq
-
-Bytewise string equality
-*/
-pure fn eq(&&a: str, &&b: str) -> bool { a == b }
+// FIXME: add pure to a lot of functions
 
 /*
-Function: lteq
-
-Bytewise less than or equal
+Section: Creating a string
 */
-pure fn lteq(&&a: str, &&b: str) -> bool { a <= b }
-
-/*
-Function: hash
-
-String hash function
-*/
-fn hash(&&s: str) -> uint {
-    // djb hash.
-    // FIXME: replace with murmur.
-
-    let u: uint = 5381u;
-    for c: u8 in s { u *= 33u; u += c as uint; }
-    ret u;
-}
-
-// UTF-8 tags and ranges
-const tag_cont_u8: u8 = 128u8;
-const tag_cont: uint = 128u;
-const max_one_b: uint = 128u;
-const tag_two_b: uint = 192u;
-const max_two_b: uint = 2048u;
-const tag_three_b: uint = 224u;
-const max_three_b: uint = 65536u;
-const tag_four_b: uint = 240u;
-const max_four_b: uint = 2097152u;
-const tag_five_b: uint = 248u;
-const max_five_b: uint = 67108864u;
-const tag_six_b: uint = 252u;
-
-/*
-Function: is_utf8
-
-Determines if a vector uf bytes contains valid UTF-8
-*/
-fn is_utf8(v: [u8]) -> bool {
-    let i = 0u;
-    let total = vec::len::<u8>(v);
-    while i < total {
-        let chsize = utf8_char_width(v[i]);
-        if chsize == 0u { ret false; }
-        if i + chsize > total { ret false; }
-        i += 1u;
-        while chsize > 1u {
-            if v[i] & 192u8 != tag_cont_u8 { ret false; }
-            i += 1u;
-            chsize -= 1u;
-        }
-    }
-    ret true;
-}
-
-/*
-Function: is_ascii
-
-Determines if a string contains only ASCII characters
-*/
-fn is_ascii(s: str) -> bool {
-    let i: uint = byte_len(s);
-    while i > 0u { i -= 1u; if s[i] & 128u8 != 0u8 { ret false; } }
-    ret true;
-}
-
-/*
-Predicate: is_empty
-
-Returns true if the string has length 0
-*/
-pure fn is_empty(s: str) -> bool { for c: u8 in s { ret false; } ret true; }
-
-/*
-Predicate: is_not_empty
-
-Returns true if the string has length greater than 0
-*/
-pure fn is_not_empty(s: str) -> bool { !is_empty(s) }
-
-/*
-Function: is_whitespace
-
-Returns true if the string contains only whitespace
-*/
-fn is_whitespace(s: str) -> bool {
-    ret loop_chars(s, char::is_whitespace);
-}
-
-/*
-Function: byte_len
-
-Returns the length in bytes of a string
-*/
-pure fn byte_len(s: str) -> uint unsafe {
-    let v: [u8] = unsafe::reinterpret_cast(s);
-    let vlen = vec::len(v);
-    unsafe::leak(v);
-    // There should always be a null terminator
-    assert (vlen > 0u);
-    ret vlen - 1u;
-}
-
-/*
-Function: byte_len_range
-
-As byte_len but for a substring
-
-Parameters:
-s - A string
-byte_offset - The byte offset at which to start in the string
-char_len    - The number of chars (not bytes!) in the range
-
-Returns:
-The number of bytes in the substring starting at `byte_offset` and
-containing `char_len` chars.
-
-Safety note:
-
-This function fails if `byte_offset` or `char_len` do not represent
-valid positions in `s`
-*/
-fn byte_len_range(s: str, byte_offset: uint, char_len: uint) -> uint {
-    let i = byte_offset;
-    let chars = 0u;
-    while chars < char_len {
-        let chsize = utf8_char_width(s[i]);
-        assert (chsize > 0u);
-        i += chsize;
-        chars += 1u;
-    }
-    ret i - byte_offset;
-}
-
-/*
-Function: bytes
-
-Converts a string to a vector of bytes. The result vector is not
-null-terminated.
-*/
-fn bytes(s: str) -> [u8] unsafe {
-    let v = unsafe::reinterpret_cast(s);
-    let vcopy = vec::slice(v, 0u, vec::len(v) - 1u);
-    unsafe::leak(v);
-    ret vcopy;
-}
-
-/*
-Function: unsafe_from_bytes
-
-Converts a vector of bytes to a string. Does not verify that the
-vector contains valid UTF-8.
-*/
-fn unsafe_from_bytes(v: [const u8]) -> str unsafe {
-    let vcopy: [u8] = v + [0u8];
-    let scopy: str = unsafe::reinterpret_cast(vcopy);
-    unsafe::leak(vcopy);
-    ret scopy;
-}
 
 /*
 Function: from_bytes
@@ -205,11 +127,28 @@ fn from_bytes(vv: [u8]) -> result::t<str, str> {
    }
 }
 
+/*
+Function: unsafe_from_bytes
+
+Converts a vector of bytes to a string. Does not verify that the
+vector contains valid UTF-8.
+
+FIXME: remove?
+*/
+fn unsafe_from_bytes(v: [const u8]) -> str unsafe {
+    let vcopy: [u8] = v + [0u8];
+    let scopy: str = unsafe::reinterpret_cast(vcopy);
+    unsafe::leak(vcopy);
+    ret scopy;
+}
+
 /*
 Function: unsafe_from_byte
 
 Converts a byte to a string. Does not verify that the byte is
 valid UTF-8.
+
+FIXME: rename to 'from_byte'
 */
 fn unsafe_from_byte(u: u8) -> str { unsafe_from_bytes([u]) }
 
@@ -265,250 +204,53 @@ fn from_chars(chs: [char]) -> str {
 }
 
 /*
-Function: utf8_char_width
+Function: from_cstr
 
-Given a first byte, determine how many bytes are in this UTF-8 character
+Create a Rust string from a null-terminated C string
 */
-pure fn utf8_char_width(b: u8) -> uint {
-    let byte: uint = b as uint;
-    if byte < 128u { ret 1u; }
-    if byte < 192u {
-        ret 0u; // Not a valid start byte
-
-    }
-    if byte < 224u { ret 2u; }
-    if byte < 240u { ret 3u; }
-    if byte < 248u { ret 4u; }
-    if byte < 252u { ret 5u; }
-    ret 6u;
-}
-
-/*
-Function: char_range_at
-
-Pluck a character out of a string and return the index of the next character.
-This function can be used to iterate over the unicode characters of a string.
-
-Example:
-> let s = "中华Việt Nam";
-> let i = 0u;
-> while i < str::byte_len(s) {
->    let {ch, next} = str::char_range_at(s, i);
->    std::io::println(#fmt("%u: %c",i,ch));
->    i = next;
-> }
-
-Example output:
-
-      0: 中
-      3: 华
-      6: V
-      7: i
-      8: ệ
-      11: t
-      12:
-      13: N
-      14: a
-      15: m
-
-Parameters:
-
-s - The string
-i - The byte offset of the char to extract
-
-Returns:
-
-A record {ch: char, next: uint} containing the char value and the byte
-index of the next unicode character.
-
-Failure:
-
-If `i` is greater than or equal to the length of the string.
-If `i` is not the index of the beginning of a valid UTF-8 character.
-*/
-fn char_range_at(s: str, i: uint) -> {ch: char, next: uint} {
-    let b0 = s[i];
-    let w = utf8_char_width(b0);
-    assert (w != 0u);
-    if w == 1u { ret {ch: b0 as char, next: i + 1u}; }
-    let val = 0u;
-    let end = i + w;
-    let i = i + 1u;
-    while i < end {
-        let byte = s[i];
-        assert (byte & 192u8 == tag_cont_u8);
-        val <<= 6u;
-        val += byte & 63u8 as uint;
-        i += 1u;
-    }
-    // Clunky way to get the right bits from the first byte. Uses two shifts,
-    // the first to clip off the marker bits at the left of the byte, and then
-    // a second (as uint) to get it to the right position.
-    val += (b0 << (w + 1u as u8) as uint) << ((w - 1u) * 6u - w - 1u);
-    ret {ch: val as char, next: i};
-}
-
-/*
-Function: char_at
-
-Pluck a character out of a string
-*/
-fn char_at(s: str, i: uint) -> char { ret char_range_at(s, i).ch; }
-
-/*
-Function: iter_chars
-
-Iterate over the characters in a string
-*/
-fn iter_chars(s: str, it: fn(char)) {
-    let pos = 0u, len = byte_len(s);
-    while (pos < len) {
-        let {ch, next} = char_range_at(s, pos);
-        pos = next;
-        it(ch);
-    }
-}
-
-/*
-Function: chars_iter
-
-Iterate over the characters in a string
-
-FIXME: A synonym to iter_chars
-*/
-fn chars_iter(ss: str, it: fn(char)) {
-    iter_chars(ss, it)
-}
-
-/*
-Function: bytes_iter
-
-Iterate over the bytes in a string
-
-FIXME: Should it really include the last byte '\0'?
-*/
-fn bytes_iter(ss: str, it: fn(u8)) {
-    let pos = 0u;
-    let len = byte_len(ss);
-
-    while (pos < len) {
-        it(ss[pos]);
-        pos += 1u;
-    }
-}
-
-/*
-Function: loop_chars
-
-Loop through a string, char by char
-
-Parameters:
-s  - A string to traverse. It may be empty.
-it - A block to execute with each consecutive character of `s`.
-Return `true` to continue, `false` to stop.
-
-Returns:
-
-`true` If execution proceeded correctly, `false` if it was interrupted,
-that is if `it` returned `false` at any point.
- */
-fn loop_chars(s: str, it: fn(char) -> bool) -> bool{
-    ret loop_chars_sub(s, 0u, byte_len(s), it);
-}
-
-/*
-Function: loop_chars_sub
-
-Loop through a substring, char by char
-
-Parameters:
-s           - A string to traverse. It may be empty.
-byte_offset - The byte offset at which to start in the string.
-byte_len    - The number of bytes to traverse in the string
-it          - A block to execute with each consecutive character of `s`.
-Return `true` to continue, `false` to stop.
-
-Returns:
-
-`true` If execution proceeded correctly, `false` if it was interrupted,
-that is if `it` returned `false` at any point.
-
-Safety note:
-- This function does not check whether the substring is valid.
-- This function fails if `byte_offset` or `byte_len` do not
- represent valid positions inside `s`
- */
-fn loop_chars_sub(s: str, byte_offset: uint, byte_len: uint,
-              it: fn(char) -> bool) -> bool {
-   let i = byte_offset;
-   let result = true;
-   while i < byte_len {
-      let {ch, next} = char_range_at(s, i);
-      if !it(ch) {result = false; break;}
-      i = next;
-   }
-   ret result;
-}
-
-
-/*
-Function: char_len
-
-Count the number of unicode characters in a string
-*/
-fn char_len(s: str) -> uint {
-    ret char_len_range(s, 0u, byte_len(s));
-}
-
-/*
-Function: char_len_range
-
-As char_len but for a slice of a string
-
-Parameters:
- s           - A valid string
- byte_start  - The position inside `s` where to start counting in bytes.
- byte_len    - The number of bytes of `s` to take into account.
-
-Returns:
- The number of Unicode characters in `s` in
-segment [byte_start, byte_start+len( .
-
-Safety note:
-- This function does not check whether the substring is valid.
-- This function fails if `byte_offset` or `byte_len` do not
- represent valid positions inside `s`
-*/
-fn char_len_range(s: str, byte_start: uint, byte_len: uint) -> uint {
-    let i     = byte_start;
-    let len   = 0u;
-    while i < byte_len {
-        let chsize = utf8_char_width(s[i]);
-        assert (chsize > 0u);
-        len += 1u;
-        i += chsize;
-    }
-    assert (i == byte_len);
-    ret len;
-}
-
-/*
-Function: to_chars
-
-Convert a string to a vector of characters
-*/
-fn to_chars(s: str) -> [char] {
-    let buf: [char] = [];
+unsafe fn from_cstr(cstr: sbuf) -> str {
+    let res = "";
+    let start = cstr;
+    let curr = start;
     let i = 0u;
-    let len = byte_len(s);
-    while i < len {
-        let cur = char_range_at(s, i);
-        buf += [cur.ch];
-        i = cur.next;
+    while *curr != 0u8 {
+        push_byte(res, *curr);
+        i += 1u;
+        curr = ptr::offset(start, i);
     }
-    ret buf;
+    ret res;
 }
 
+/*
+Function: concat
+
+Concatenate a vector of strings
+*/
+fn concat(v: [str]) -> str {
+    let s: str = "";
+    for ss: str in v { s += ss; }
+    ret s;
+}
+
+/*
+Function: connect
+
+Concatenate a vector of strings, placing a given separator between each
+*/
+fn connect(v: [str], sep: str) -> str {
+    let s: str = "";
+    let first: bool = true;
+    for ss: str in v {
+        if first { first = false; } else { s += sep; }
+        s += ss;
+    }
+    ret s;
+}
+
+/*
+Section: Adding to and removing from a string
+*/
+
 /*
 Function: push_char
 
@@ -556,6 +298,636 @@ Prepend a char to a string
 */
 fn unshift_char(&s: str, ch: char) { s = from_char(ch) + s; }
 
+/*
+Function: push_byte
+
+Appends a byte to a string.
+
+This function is not unicode-safe.
+*/
+fn push_byte(&s: str, b: u8) { rustrt::rust_str_push(s, b); }
+
+/*
+Function: push_bytes
+
+Appends a vector of bytes to a string.
+
+This function is not unicode-safe.
+*/
+fn push_bytes(&s: str, bytes: [u8]) {
+    for byte in bytes { rustrt::rust_str_push(s, byte); }
+}
+
+/*
+Function: pop_byte
+
+Removes the last byte from a string and returns it.
+
+This function is not unicode-safe.
+*/
+fn pop_byte(&s: str) -> u8 {
+    let len = byte_len(s);
+    assert (len > 0u);
+    let b = s[len - 1u];
+    s = substr(s, 0u, len - 1u);
+    ret b;
+}
+
+/*
+Function: shift_byte
+
+Removes the first byte from a string and returns it.
+
+This function is not unicode-safe.
+*/
+fn shift_byte(&s: str) -> u8 {
+    let len = byte_len(s);
+    assert (len > 0u);
+    let b = s[0];
+    s = substr(s, 1u, len - 1u);
+    ret b;
+}
+
+/*
+Function: trim_left
+
+Returns a string with leading whitespace removed.
+*/
+fn trim_left(s: str) -> str {
+    fn count_whities(s: [char]) -> uint {
+        let i = 0u;
+        while i < vec::len(s) {
+            if !char::is_whitespace(s[i]) { break; }
+            i += 1u;
+        }
+        ret i;
+    }
+    let chars = to_chars(s);
+    let whities = count_whities(chars);
+    ret from_chars(vec::slice(chars, whities, vec::len(chars)));
+}
+
+/*
+Function: trim_right
+
+Returns a string with trailing whitespace removed.
+*/
+fn trim_right(s: str) -> str {
+    fn count_whities(s: [char]) -> uint {
+        let i = vec::len(s);
+        while 0u < i {
+            if !char::is_whitespace(s[i - 1u]) { break; }
+            i -= 1u;
+        }
+        ret i;
+    }
+    let chars = to_chars(s);
+    let whities = count_whities(chars);
+    ret from_chars(vec::slice(chars, 0u, whities));
+}
+
+/*
+Function: trim
+
+Returns a string with leading and trailing whitespace removed
+*/
+fn trim(s: str) -> str { trim_left(trim_right(s)) }
+
+
+/*
+Section: Transforming strings
+*/
+
+/*
+Function: bytes
+
+Converts a string to a vector of bytes. The result vector is not
+null-terminated.
+*/
+fn bytes(s: str) -> [u8] unsafe {
+    let v = unsafe::reinterpret_cast(s);
+    let vcopy = vec::slice(v, 0u, vec::len(v) - 1u);
+    unsafe::leak(v);
+    ret vcopy;
+}
+
+/*
+Function: to_chars
+
+Convert a string to a vector of characters
+
+FIXME: rename to 'chars'
+*/
+fn to_chars(s: str) -> [char] {
+    let buf: [char] = [];
+    let i = 0u;
+    let len = byte_len(s);
+    while i < len {
+        let cur = char_range_at(s, i);
+        buf += [cur.ch];
+        i = cur.next;
+    }
+    ret buf;
+}
+
+/*
+Function: substr
+
+Take a substring of another. Returns a string containing `len` bytes
+starting at byte offset `begin`.
+
+FIXME: This function is not unicode-safe.
+
+Failure:
+
+If `begin` + `len` is greater than the byte length of the string
+*/
+fn substr(s: str, begin: uint, len: uint) -> str {
+    ret slice(s, begin, begin + len);
+}
+
+/*
+Function: char_slice
+
+Unicode-safe slice. Returns a slice of the given string containing
+the characters in the range [`begin`..`end`). `begin` and `end` are
+character indexes, not byte indexes.
+
+Failure:
+
+- If begin is greater than end
+- If end is greater than the character length of the string
+
+FIXME: rename to slice(), make faster by avoiding char conversion
+*/
+fn char_slice(s: str, begin: uint, end: uint) -> str {
+    from_chars(vec::slice(to_chars(s), begin, end))
+}
+
+/*
+Function: slice
+
+Takes a bytewise slice from a string. Returns the substring from
+[`begin`..`end`).
+
+This function is not unicode-safe.
+
+Failure:
+
+- If begin is greater than end.
+- If end is greater than the length of the string.
+
+FIXME: rename to slice_byte or slice_byte_unsafe
+*/
+fn slice(s: str, begin: uint, end: uint) -> str unsafe {
+    // FIXME: Typestate precondition
+    assert (begin <= end);
+    assert (end <= byte_len(s));
+
+    let v: [u8] = unsafe::reinterpret_cast(s);
+    let v2 = vec::slice(v, begin, end);
+    unsafe::leak(v);
+    v2 += [0u8];
+    let s2: str = unsafe::reinterpret_cast(v2);
+    unsafe::leak(v2);
+    ret s2;
+}
+
+/*
+Function: safe_slice
+
+FIXME: make sure char_slice / slice / byte_slice
+       have these preconditions and assertions
+FIXME: this shouldn't be mistaken for a UTF-8 safe slice
+*/
+fn safe_slice(s: str, begin: uint, end: uint) : uint::le(begin, end) -> str {
+    // would need some magic to make this a precondition
+    assert (end <= byte_len(s));
+    ret slice(s, begin, end);
+}
+
+/*
+Function: split
+
+Split a string at each occurrence of a given separator
+
+Returns:
+
+A vector containing all the strings between each occurrence of the separator
+
+FIXME: should be renamed to split_byte
+*/
+fn split(s: str, sep: u8) -> [str] {
+    let v: [str] = [];
+    let accum: str = "";
+    let ends_with_sep: bool = false;
+    for c: u8 in s {
+        if c == sep {
+            v += [accum];
+            accum = "";
+            ends_with_sep = true;
+        } else { accum += unsafe_from_byte(c); ends_with_sep = false; }
+    }
+    if byte_len(accum) != 0u || ends_with_sep { v += [accum]; }
+    ret v;
+}
+
+/*
+Function: splitn
+
+Split a string at each occurrence of a given separator up to count times.
+
+Returns:
+
+A vector containing all the strings between each occurrence of the separator
+
+FIXME: rename to 'splitn_byte' (the separator is a u8, as in 'split')
+*/
+fn splitn(s: str, sep: u8, count: uint) -> [str] {
+    let v = [];
+    let accum = "";
+    let n = count;
+    let ends_with_sep: bool = false;
+    for c in s {
+        if n > 0u && c == sep {
+            n -= 1u;
+            v += [accum];
+            accum = "";
+            ends_with_sep = true;
+        } else { accum += unsafe_from_byte(c); ends_with_sep = false; }
+    }
+    if byte_len(accum) != 0u || ends_with_sep { v += [accum]; }
+    ret v;
+}
+
+/*
+Function: split_str
+
+Splits a string at each occurrence of the given separator string. Empty
+leading fields are suppressed, and empty trailing fields are preserved.
+
+Returns:
+
+A vector containing all the strings between each occurrence of the separator.
+
+FIXME: should behave like split and split_char:
+         assert ["", "XXX", "YYY", ""] == split_str(".XXX.YYY.", ".");
+*/
+fn split_str(s: str, sep: str) -> [str] {
+    assert byte_len(sep) > 0u;
+    let v: [str] = [], accum = "", sep_match = 0u, leading = true;
+    for c: u8 in s {
+        // Did we match the entire separator?
+        if sep_match == byte_len(sep) {
+            if !leading { v += [accum]; }
+            accum = "";
+            sep_match = 0u;
+        }
+
+        if c == sep[sep_match] {
+            sep_match += 1u;
+        } else {
+            sep_match = 0u;
+            accum += unsafe_from_byte(c);
+            leading = false;
+        }
+    }
+
+    if byte_len(accum) > 0u { v += [accum]; }
+    if sep_match == byte_len(sep) { v += [""]; }
+
+    ret v;
+}
+
+/*
+Function: split_func
+
+Splits a string into substrings using a function
+(unicode safe)
+
+FIXME: rename to 'split'; `char_len(accum) >= 0u` below is always true
+*/
+fn split_func(ss: str, sepfn: fn(cc: char)->bool) -> [str] {
+    let vv: [str] = [];
+    let accum: str = "";
+    let ends_with_sep: bool = false;
+
+    str::iter_chars(ss, {|cc| if sepfn(cc) {
+            vv += [accum];
+            accum = "";
+            ends_with_sep = true;
+        } else {
+            str::push_char(accum, cc);
+            ends_with_sep = false;
+        }
+    });
+
+    if char_len(accum) >= 0u || ends_with_sep {
+        vv += [accum];
+    }
+
+    ret vv;
+}
+
+/*
+Function: split_char
+
+Splits a string into a vector of the substrings separated by a given character
+*/
+fn split_char(ss: str, cc: char) -> [str] {
+   split_func(ss, {|kk| kk == cc})
+}
+
+/*
+Function: lines
+
+Splits a string into a vector of the substrings
+separated by LF ('\n')
+*/
+fn lines(ss: str) -> [str] {
+    split_func(ss, {|cc| cc == '\n'})
+}
+
+/*
+Function: lines_any
+
+Splits a string into a vector of the substrings
+separated by LF ('\n') and/or CR LF ('\r\n')
+*/
+fn lines_any(ss: str) -> [str] {
+    vec::map(lines(ss), {|s| trim_right(s)})
+}
+
+/*
+Function: words
+
+Splits a string into a vector of the substrings
+separated by whitespace
+*/
+fn words(ss: str) -> [str] {
+    ret vec::filter( split_func(ss, {|cc| char::is_whitespace(cc)}),
+                     {|w| 0u < str::char_len(w)});
+}
+
+/*
+Function: windowed
+
+Create a vector of substrings of size `nn`
+*/
+fn windowed(nn: uint, ss: str) -> [str] {
+    let ww = [];
+    let len = str::char_len(ss);
+
+    assert 1u <= nn;
+
+    let ii = 0u;
+    while ii+nn <= len {
+        let w = char_slice( ss, ii, ii+nn );
+        vec::push(ww,w);
+        ii += 1u;
+    }
+
+    ret ww;
+}
+
+/*
+Function: to_lower
+
+Convert a string to lowercase
+
+FIXME: rewrite with map
+*/
+fn to_lower(s: str) -> str {
+    let outstr = "";
+    iter_chars(s) { |c|
+        push_char(outstr, char::to_lower(c));
+    }
+    ret outstr;
+}
+
+/*
+Function: to_upper
+
+Convert a string to uppercase
+
+FIXME: rewrite with map
+*/
+fn to_upper(s: str) -> str {
+    let outstr = "";
+    iter_chars(s) { |c|
+        push_char(outstr, char::to_upper(c));
+    }
+    ret outstr;
+}
+
+// FIXME: This is super-inefficient
+/*
+Function: replace
+
+Replace all occurrences of one string with another
+
+Parameters:
+
+s - The string containing substrings to replace
+from - The string to replace
+to - The replacement string
+
+Returns:
+
+The original string with all occurrences of `from` replaced with `to`
+*/
+fn replace(s: str, from: str, to: str) : is_not_empty(from) -> str {
+    // FIXME (694): Shouldn't have to check this
+    check (is_not_empty(from));
+    if byte_len(s) == 0u {
+        ret "";
+    } else if starts_with(s, from) {
+        ret to + replace(slice(s, byte_len(from), byte_len(s)), from, to);
+    } else {
+        let idx = find(s, from);
+        if idx == -1 {
+            ret s;
+        }
+        ret char_slice(s, 0u, idx as uint) + to +
+            replace(char_slice(s, idx as uint + char_len(from), char_len(s)),
+                    from, to);
+    }
+}
+
+/*
+Function: escape
+
+Escapes special characters inside the string, making it safe for transfer.
+*/
+fn escape(s: str) -> str {
+    let r = "";
+    loop_chars(s, { |c| r += escape_char(c); true });
+    r
+}
+
+/*
+Section: Comparing strings
+*/
+
+/*
+Function: eq
+
+Bytewise string equality
+*/
+pure fn eq(&&a: str, &&b: str) -> bool { a == b }
+
+/*
+Function: lteq
+
+Bytewise less than or equal
+*/
+pure fn lteq(&&a: str, &&b: str) -> bool { a <= b }
+
+/*
+Function: hash
+
+String hash function
+*/
+fn hash(&&s: str) -> uint {
+    // djb hash.
+    // FIXME: replace with murmur.
+
+    let u: uint = 5381u;
+    for c: u8 in s { u *= 33u; u += c as uint; }
+    ret u;
+}
+
+/*
+Section: Iterating through strings
+*/
+
+/*
+Function: loop_chars
+
+Loop through a string, char by char
+
+Parameters:
+s  - A string to traverse. It may be empty.
+it - A block to execute with each consecutive character of `s`.
+Return `true` to continue, `false` to stop.
+
+Returns:
+
+`true` If execution proceeded correctly, `false` if it was interrupted,
+that is if `it` returned `false` at any point.
+
+FIXME: rename to 'chars_loop' (change? currently a synonym to 'all')
+ */
+fn loop_chars(s: str, it: fn(char) -> bool) -> bool{
+    ret loop_chars_sub(s, 0u, byte_len(s), it);
+}
+
+/*
+Function: all
+
+Return true if a predicate matches all characters or
+if the string contains no characters
+
+FIXME: a synonym to loop_chars
+*/
+fn all(ss: str, ff: fn(char) -> bool) -> bool {
+    str::loop_chars(ss, ff)
+}
+
+/*
+Function: any
+
+Return true if a predicate matches any character
+(and false if it matches none or there are no characters)
+*/
+fn any(ss: str, pred: fn(char) -> bool) -> bool {
+   !all(ss, {|cc| !pred(cc)})
+}
+
+/*
+Function: map
+
+Apply a function to each character
+*/
+fn map(ss: str, ff: fn(char) -> char) -> str {
+    let result = "";
+
+    str::iter_chars(ss, {|cc|
+        str::push_char(result, ff(cc));
+    });
+
+    ret result;
+}
+
+/*
+Function: bytes_iter
+
+Iterate over the bytes in a string
+
+FIXME: should the trailing '\0' be included? (the loop stops at byte_len)
+*/
+fn bytes_iter(ss: str, it: fn(u8)) {
+    let pos = 0u;
+    let len = byte_len(ss);
+
+    while (pos < len) {
+        it(ss[pos]);
+        pos += 1u;
+    }
+}
+
+/*
+Function: iter_chars
+
+Iterate over the characters in a string
+
+FIXME: rename to 'chars_iter'
+*/
+fn iter_chars(s: str, it: fn(char)) {
+    let pos = 0u, len = byte_len(s);
+    while (pos < len) {
+        let {ch, next} = char_range_at(s, pos);
+        pos = next;
+        it(ch);
+    }
+}
+
+/*
+Function: chars_iter
+
+Iterate over the characters in a string
+
+FIXME: A synonym to iter_chars
+*/
+fn chars_iter(ss: str, it: fn(char)) {
+    iter_chars(ss, it)
+}
+
+/*
+Function: words_iter
+
+Apply a function to each word
+*/
+fn words_iter(ss: str, ff: fn(&&str)) {
+    vec::iter(words(ss), ff)
+}
+
+/*
+Function: lines_iter
+
+Apply a function to each line (split on '\n')
+*/
+fn lines_iter(ss: str, ff: fn(&&str)) {
+    vec::iter(lines(ss), ff)
+}
+
+// FIXME: ADD split_char_iter
+// FIXME: ADD splitn_char_iter
+
+/*
+Section: Searching
+*/
+
 /*
 Function: index
 
@@ -666,439 +1038,326 @@ fn ends_with(haystack: str, needle: str) -> bool {
 }
 
 /*
-Function: substr
-
-Take a substring of another. Returns a string containing `len` bytes
-starting at byte offset `begin`.
-
-This function is not unicode-safe.
-
-Failure:
-
-If `begin` + `len` is is greater than the byte length of the string
+Section: String properties
 */
-fn substr(s: str, begin: uint, len: uint) -> str {
-    ret slice(s, begin, begin + len);
+
+/*
+Function: is_ascii
+
+Determines if a string contains only ASCII characters (no byte has bit 128 set)
+
+FIXME: possibly implement using char::is_ascii when it exists
+*/
+fn is_ascii(s: str) -> bool {
+    let i: uint = byte_len(s);
+    while i > 0u { i -= 1u; if s[i] & 128u8 != 0u8 { ret false; } }
+    ret true;
 }
 
 /*
-Function: slice
+Predicate: is_empty
 
-Takes a bytewise slice from a string. Returns the substring from
-[`begin`..`end`).
-
-This function is not unicode-safe.
-
-Failure:
-
-- If begin is greater than end.
-- If end is greater than the length of the string.
+Returns true if the string has length 0
 */
-fn slice(s: str, begin: uint, end: uint) -> str unsafe {
-    // FIXME: Typestate precondition
-    assert (begin <= end);
-    assert (end <= byte_len(s));
+pure fn is_empty(s: str) -> bool { for c: u8 in s { ret false; } ret true; }
 
+/*
+Predicate: is_not_empty
+
+Returns true if the string has length greater than 0 (negation of is_empty)
+*/
+pure fn is_not_empty(s: str) -> bool { !is_empty(s) }
+
+/*
+Function: is_whitespace
+
+Returns true if every character of `s` is whitespace (true for an empty string)
+*/
+fn is_whitespace(s: str) -> bool {
+    ret loop_chars(s, char::is_whitespace);
+}
+
+/*
+Function: byte_len
+
+Returns the length in bytes of a string, not counting the null terminator
+
+FIXME: rename to 'len_bytes'?
+*/
+pure fn byte_len(s: str) -> uint unsafe {
     let v: [u8] = unsafe::reinterpret_cast(s);
-    let v2 = vec::slice(v, begin, end);
+    let vlen = vec::len(v);
     unsafe::leak(v);
-    v2 += [0u8];
-    let s2: str = unsafe::reinterpret_cast(v2);
-    unsafe::leak(v2);
-    ret s2;
+    // There should always be a null terminator
+    assert (vlen > 0u);
+    ret vlen - 1u;
 }
 
 /*
-Function: safe_slice
+Function: char_len
+
+Count the number of unicode characters in a string
+
+FIXME: rename to 'len_chars'?
 */
-fn safe_slice(s: str, begin: uint, end: uint) : uint::le(begin, end) -> str {
-    // would need some magic to make this a precondition
-    assert (end <= byte_len(s));
-    ret slice(s, begin, end);
+fn char_len(s: str) -> uint {
+    ret char_len_range(s, 0u, byte_len(s));
 }
 
 /*
-Function: shift_byte
-
-Removes the first byte from a string and returns it.
-
-This function is not unicode-safe.
+Section: Misc
 */
-fn shift_byte(&s: str) -> u8 {
-    let len = byte_len(s);
-    assert (len > 0u);
-    let b = s[0];
-    s = substr(s, 1u, len - 1u);
-    ret b;
-}
 
 /*
-Function: pop_byte
+Function: is_utf8
 
-Removes the last byte from a string and returns it.
-
-This function is not unicode-safe.
+Determines if a vector of bytes contains valid UTF-8
 */
-fn pop_byte(&s: str) -> u8 {
-    let len = byte_len(s);
-    assert (len > 0u);
-    let b = s[len - 1u];
-    s = substr(s, 0u, len - 1u);
-    ret b;
-}
-
-/*
-Function: push_byte
-
-Appends a byte to a string.
-
-This function is not unicode-safe.
-*/
-fn push_byte(&s: str, b: u8) { rustrt::rust_str_push(s, b); }
-
-/*
-Function: push_bytes
-
-Appends a vector of bytes to a string.
-
-This function is not unicode-safe.
-*/
-fn push_bytes(&s: str, bytes: [u8]) {
-    for byte in bytes { rustrt::rust_str_push(s, byte); }
-}
-
-/*
-Function: split
-
-Split a string at each occurance of a given separator
-
-Returns:
-
-A vector containing all the strings between each occurance of the separator
-
-FIXME: should be renamed to split_byte
-*/
-fn split(s: str, sep: u8) -> [str] {
-    let v: [str] = [];
-    let accum: str = "";
-    let ends_with_sep: bool = false;
-    for c: u8 in s {
-        if c == sep {
-            v += [accum];
-            accum = "";
-            ends_with_sep = true;
-        } else { accum += unsafe_from_byte(c); ends_with_sep = false; }
-    }
-    if byte_len(accum) != 0u || ends_with_sep { v += [accum]; }
-    ret v;
-}
-
-/*
-Function: splitn
-
-Split a string at each occurance of a given separator up to count times.
-
-Returns:
-
-A vector containing all the strings between each occurance of the separator
-*/
-fn splitn(s: str, sep: u8, count: uint) -> [str] {
-    let v = [];
-    let accum = "";
-    let n = count;
-    let ends_with_sep: bool = false;
-    for c in s {
-        if n > 0u && c == sep {
-            n -= 1u;
-            v += [accum];
-            accum = "";
-            ends_with_sep = true;
-        } else { accum += unsafe_from_byte(c); ends_with_sep = false; }
-    }
-    if byte_len(accum) != 0u || ends_with_sep { v += [accum]; }
-    ret v;
-}
-
-/*
-Function: split_str
-
-Splits a string at each occurrence of the given separator string. Empty
-leading fields are suppressed, and empty trailing fields are preserved.
-
-Returns:
-
-A vector containing all the strings between each occurrence of the separator.
-
-FIXME: should behave like split and split_char:
-         assert ["", "XXX", "YYY", ""] == split_str(".XXX.YYY.", ".");
-*/
-fn split_str(s: str, sep: str) -> [str] {
-    assert byte_len(sep) > 0u;
-    let v: [str] = [], accum = "", sep_match = 0u, leading = true;
-    for c: u8 in s {
-        // Did we match the entire separator?
-        if sep_match == byte_len(sep) {
-            if !leading { v += [accum]; }
-            accum = "";
-            sep_match = 0u;
-        }
-
-        if c == sep[sep_match] {
-            sep_match += 1u;
-        } else {
-            sep_match = 0u;
-            accum += unsafe_from_byte(c);
-            leading = false;
+fn is_utf8(v: [u8]) -> bool {
+    let i = 0u;
+    let total = vec::len::<u8>(v);
+    while i < total {
+        let chsize = utf8_char_width(v[i]);
+        if chsize == 0u { ret false; }
+        if i + chsize > total { ret false; }
+        i += 1u;
+        while chsize > 1u {
+            if v[i] & 192u8 != tag_cont_u8 { ret false; }
+            i += 1u;
+            chsize -= 1u;
         }
     }
-
-    if byte_len(accum) > 0u { v += [accum]; }
-    if sep_match == byte_len(sep) { v += [""]; }
-
-    ret v;
+    ret true;
 }
 
 /*
-Function: split_func
+Function: char_len_range
 
-Splits a string into substrings using a function
-(unicode safe)
+As char_len but for a slice of a string
 
-FIXME: will be renamed to split.
+Parameters:
+ s           - A valid string
+ byte_start  - The position inside `s` where to start counting in bytes.
+ byte_len    - The number of bytes of `s` to take into account.
+
+Returns:
+ The number of Unicode characters of `s` in the segment
+ [byte_start, byte_start + byte_len).
+
+Safety note:
+- This function does not check whether the substring is valid.
+- This function fails if `byte_start` or `byte_len` do not
+ represent valid positions inside `s`
+- NOTE(review): the loop stops at `byte_len`, not byte_start + byte_len.
+FIXME: rename to 'substr_len_chars'
 */
-fn split_func(ss: str, sepfn: fn(cc: char)->bool) -> [str] {
-    let vv: [str] = [];
-    let accum: str = "";
-    let ends_with_sep: bool = false;
-
-    str::iter_chars(ss, {|cc| if sepfn(cc) {
-            vv += [accum];
-            accum = "";
-            ends_with_sep = true;
-        } else {
-            str::push_char(accum, cc);
-            ends_with_sep = false;
-        }
-    });
-
-    if char_len(accum) >= 0u || ends_with_sep {
-        vv += [accum];
+fn char_len_range(s: str, byte_start: uint, byte_len: uint) -> uint {
+    let i     = byte_start;
+    let len   = 0u;
+    while i < byte_len {
+        let chsize = utf8_char_width(s[i]);
+        assert (chsize > 0u);
+        len += 1u;
+        i += chsize;
     }
-
-    ret vv;
+    assert (i == byte_len);
+    ret len;
 }
 
 /*
-Function: split_char
+Function: byte_len_range
 
-Splits a string into a vector of the substrings separated by a given character
+As byte_len but for a substring
+
+Parameters:
+s - A string
+byte_offset - The byte offset at which to start in the string
+char_len    - The number of chars (not bytes!) in the range
+
+Returns:
+The number of bytes in the substring starting at `byte_offset` and
+containing `char_len` chars.
+
+Safety note:
+
+This function fails if `byte_offset` or `char_len` do not represent
+valid positions in `s`
+
+FIXME: rename to 'substr_len_bytes'
 */
-fn split_char(ss: str, cc: char) -> [str] {
-   split_func(ss, {|kk| kk == cc})
-}
-
-/*
-Function: lines
-
-Splits a string into a vector of the substrings
-separated by LF ('\n')
-*/
-fn lines(ss: str) -> [str] {
-    split_func(ss, {|cc| cc == '\n'})
-}
-
-/*
-Function: lines_any
-
-Splits a string into a vector of the substrings
-separated by LF ('\n') and/or CR LF ('\r\n')
-*/
-fn lines_any(ss: str) -> [str] {
-    vec::map(lines(ss), {|s| trim_right(s)})
-}
-
-/*
-Function: words
-
-Splits a string into a vector of the substrings
-separated by whitespace
-*/
-fn words(ss: str) -> [str] {
-    ret vec::filter( split_func(ss, {|cc| char::is_whitespace(cc)}),
-                     {|w| 0u < str::char_len(w)});
-}
-
-/*
-Function: words_iter
-
-Apply a function to each word
-*/
-fn words_iter(ss: str, ff: fn(&&str)) {
-    vec::iter(words(ss), ff)
-}
-
-/*
-Function: lines_iter
-
-Apply a function to each lines (by '\n')
-*/
-fn lines_iter(ss: str, ff: fn(&&str)) {
-    vec::iter(lines(ss), ff)
-}
-
-/*
-Function: concat
-
-Concatenate a vector of strings
-*/
-fn concat(v: [str]) -> str {
-    let s: str = "";
-    for ss: str in v { s += ss; }
-    ret s;
-}
-
-/*
-Function: connect
-
-Concatenate a vector of strings, placing a given separator between each
-*/
-fn connect(v: [str], sep: str) -> str {
-    let s: str = "";
-    let first: bool = true;
-    for ss: str in v {
-        if first { first = false; } else { s += sep; }
-        s += ss;
+fn byte_len_range(s: str, byte_offset: uint, char_len: uint) -> uint {
+    let i = byte_offset;
+    let chars = 0u;
+    while chars < char_len {
+        let chsize = utf8_char_width(s[i]);
+        assert (chsize > 0u);
+        i += chsize;
+        chars += 1u;
     }
-    ret s;
+    ret i - byte_offset;
 }
 
 /*
-Function: to_lower
+Function: utf8_char_width
+
+Given a first byte, return the UTF-8 character's byte count (0: not a start byte)
 
-Convert a string to lowercase
 */
-fn to_lower(s: str) -> str {
-    let outstr = "";
-    iter_chars(s) { |c|
-        push_char(outstr, char::to_lower(c));
-    }
-    ret outstr;
-}
-/*
-Function: to_upper
+pure fn utf8_char_width(b: u8) -> uint {
+    let byte: uint = b as uint;
+    if byte < 128u { ret 1u; }
+    if byte < 192u {
+        ret 0u; // Not a valid start byte
 
-Convert a string to uppercase
-*/
-fn to_upper(s: str) -> str {
-    let outstr = "";
-    iter_chars(s) { |c|
-        push_char(outstr, char::to_upper(c));
     }
-    ret outstr;
+    if byte < 224u { ret 2u; }
+    if byte < 240u { ret 3u; }
+    if byte < 248u { ret 4u; }
+    if byte < 252u { ret 5u; }
+    ret 6u;
 }
 
-// FIXME: This is super-inefficient
 /*
-Function: replace
+Function: char_range_at
 
-Replace all occurances of one string with another
+Pluck a character out of a string and return the index of the next character.
+This function can be used to iterate over the unicode characters of a string.
+
+Example:
+> let s = "中华Việt Nam";
+> let i = 0u;
+> while i < str::byte_len(s) {
+>    let {ch, next} = str::char_range_at(s, i);
+>    std::io::println(#fmt("%u: %c",i,ch));
+>    i = next;
+> }
+
+Example output:
+
+      0: 中
+      3: 华
+      6: V
+      7: i
+      8: ệ
+      11: t
+      12:
+      13: N
+      14: a
+      15: m
 
 Parameters:
 
-s - The string containing substrings to replace
-from - The string to replace
-to - The replacement string
+s - The string
+i - The byte offset of the char to extract
 
 Returns:
 
-The original string with all occurances of `from` replaced with `to`
-*/
-fn replace(s: str, from: str, to: str) : is_not_empty(from) -> str {
-    // FIXME (694): Shouldn't have to check this
-    check (is_not_empty(from));
-    if byte_len(s) == 0u {
-        ret "";
-    } else if starts_with(s, from) {
-        ret to + replace(slice(s, byte_len(from), byte_len(s)), from, to);
-    } else {
-        let idx = find(s, from);
-        if idx == -1 {
-            ret s;
-        }
-        ret char_slice(s, 0u, idx as uint) + to +
-            replace(char_slice(s, idx as uint + char_len(from), char_len(s)),
-                    from, to);
-    }
-}
-
-// FIXME: Also not efficient
-/*
-Function: char_slice
-
-Unicode-safe slice. Returns a slice of the given string containing
-the characters in the range [`begin`..`end`). `begin` and `end` are
-character indexes, not byte indexes.
+A record {ch: char, next: uint} containing the char value and the byte
+index of the next unicode character.
 
 Failure:
 
-- If begin is greater than end
-- If end is greater than the character length of the string
+If `i` is greater than or equal to the length of the string.
+If `i` is not the index of the beginning of a valid UTF-8 character.
 */
-fn char_slice(s: str, begin: uint, end: uint) -> str {
-    from_chars(vec::slice(to_chars(s), begin, end))
-}
-
-/*
-Function: trim_left
-
-Returns a string with leading whitespace removed.
-*/
-fn trim_left(s: str) -> str {
-    fn count_whities(s: [char]) -> uint {
-        let i = 0u;
-        while i < vec::len(s) {
-            if !char::is_whitespace(s[i]) { break; }
-            i += 1u;
-        }
-        ret i;
+fn char_range_at(s: str, i: uint) -> {ch: char, next: uint} {
+    let b0 = s[i];
+    let w = utf8_char_width(b0);
+    assert (w != 0u);
+    if w == 1u { ret {ch: b0 as char, next: i + 1u}; }
+    let val = 0u;
+    let end = i + w;
+    let i = i + 1u;
+    while i < end {
+        let byte = s[i];
+        assert (byte & 192u8 == tag_cont_u8);
+        val <<= 6u;
+        val += byte & 63u8 as uint;
+        i += 1u;
     }
-    let chars = to_chars(s);
-    let whities = count_whities(chars);
-    ret from_chars(vec::slice(chars, whities, vec::len(chars)));
+    // Clunky way to get the right bits from the first byte. Uses two shifts,
+    // the first to clip off the marker bits at the left of the byte, and then
+    // a second (as uint) to get it to the right position.
+    val += (b0 << (w + 1u as u8) as uint) << ((w - 1u) * 6u - w - 1u);
+    ret {ch: val as char, next: i};
 }
 
 /*
-Function: trim_right
+Function: char_at
 
-Returns a string with trailing whitespace removed.
+Pluck a character out of a string
 */
-fn trim_right(s: str) -> str {
-    fn count_whities(s: [char]) -> uint {
-        let i = vec::len(s);
-        while 0u < i {
-            if !char::is_whitespace(s[i - 1u]) { break; }
-            i -= 1u;
-        }
-        ret i;
+fn char_at(s: str, i: uint) -> char { ret char_range_at(s, i).ch; }
+
+/*
+Function: loop_chars_sub
+
+Loop through a substring, char by char
+
+Parameters:
+s           - A string to traverse. It may be empty.
+byte_offset - The byte offset at which to start in the string.
+byte_len    - The number of bytes to traverse in the string (but see NOTE)
+it          - A block to execute with each consecutive character of `s`.
+Return `true` to continue, `false` to stop.
+
+Returns:
+
+`true` if `it` returned `true` for every character visited, `false` if it
+returned `false` at any point (traversal stops at that character).
+
+Safety note:
+- This function does not check whether the substring is valid.
+- This function fails if `byte_offset` or `byte_len` do not
+ represent valid positions inside `s`
+- NOTE(review): the loop runs while i < `byte_len`, i.e. to an end offset.
+FIXME: rename to 'substr_all'
+ */
+fn loop_chars_sub(s: str, byte_offset: uint, byte_len: uint,
+              it: fn(char) -> bool) -> bool {
+   let i = byte_offset;
+   let result = true;
+   while i < byte_len {
+      let {ch, next} = char_range_at(s, i);
+      if !it(ch) {result = false; break;}
+      i = next;
+   }
+   ret result;
+}
+
+
+/*
+Function: escape_char
+
+Escapes one character ('"', '\\', '\n', '\t', '\r'); others pass unchanged.
+*/
+fn escape_char(c: char) -> str {
+    alt c {
+      '"' { "\\\"" }
+      '\\' { "\\\\" }
+      '\n' { "\\n" }
+      '\t' { "\\t" }
+      '\r' { "\\r" }
+      // FIXME: uncomment this when extfmt is moved to core
+      // in a snapshot.
+      // '\x00' to '\x1f' { #fmt["\\x%02x", c as uint] }
+      v { from_char(c) }
     }
-    let chars = to_chars(s);
-    let whities = count_whities(chars);
-    ret from_chars(vec::slice(chars, 0u, whities));
 }
 
-/*
-Function: trim
-
-Returns a string with leading and trailing whitespace removed
-*/
-fn trim(s: str) -> str { trim_left(trim_right(s)) }
-
-/*
-Type: sbuf
-
-An unsafe buffer of bytes. Corresponds to a C char pointer.
-*/
-type sbuf = *u8;
+// UTF-8 tags and ranges (bit patterns / powers of two noted per value)
+const tag_cont_u8: u8 = 128u8;        // 0b1000_0000
+const tag_cont: uint = 128u;          // 0b1000_0000
+const max_one_b: uint = 128u;         // 1 << 7
+const tag_two_b: uint = 192u;         // 0b1100_0000
+const max_two_b: uint = 2048u;        // 1 << 11
+const tag_three_b: uint = 224u;       // 0b1110_0000
+const max_three_b: uint = 65536u;     // 1 << 16
+const tag_four_b: uint = 240u;        // 0b1111_0000
+const max_four_b: uint = 2097152u;    // 1 << 21
+const tag_five_b: uint = 248u;        // 0b1111_1000
+const max_five_b: uint = 67108864u;   // 1 << 26
+const tag_six_b: uint = 252u;         // 0b1111_1100
 
 // NB: This is intentionally unexported because it's easy to misuse (there's
 // no guarantee that the string is rooted). Instead, use as_buf below.
@@ -1125,110 +1384,12 @@ fn as_buf<T>(s: str, f: fn(sbuf) -> T) -> T unsafe {
 }
 
 /*
-Function: from_cstr
+Type: sbuf
 
-Create a Rust string from a null-terminated C string
+An unsafe buffer of bytes. Corresponds to a C char pointer.
 */
-unsafe fn from_cstr(cstr: sbuf) -> str {
-    let res = "";
-    let start = cstr;
-    let curr = start;
-    let i = 0u;
-    while *curr != 0u8 {
-        push_byte(res, *curr);
-        i += 1u;
-        curr = ptr::offset(start, i);
-    }
-    ret res;
-}
+type sbuf = *u8;
 
-/*
-Function: escape_char
-
-Escapes a single character.
-*/
-fn escape_char(c: char) -> str {
-    alt c {
-      '"' { "\\\"" }
-      '\\' { "\\\\" }
-      '\n' { "\\n" }
-      '\t' { "\\t" }
-      '\r' { "\\r" }
-      // FIXME: uncomment this when extfmt is moved to core
-      // in a snapshot.
-      // '\x00' to '\x1f' { #fmt["\\x%02x", c as uint] }
-      v { from_char(c) }
-    }
-}
-
-/*
-Function: escape
-
-Escapes special characters inside the string, making it safe for transfer.
-*/
-fn escape(s: str) -> str {
-    let r = "";
-    loop_chars(s, { |c| r += escape_char(c); true });
-    r
-}
-
-/*
-Function: all
-
-Return true if a predicate matches all characters or
-if the string contains no characters
-
-// FIXME: a synonym to loop_chars
-*/
-fn all(ss: str, ff: fn(char) -> bool) -> bool {
-    str::loop_chars(ss, ff)
-}
-
-/*
-Function: any
-
-Return true if a predicate matches any character
-(and false if it matches none or there are no characters)
-*/
-fn any(ss: str, pred: fn(char) -> bool) -> bool {
-   !all(ss, {|cc| !pred(cc)})
-}
-
-/*
-Function: map
-
-Apply a function to each character
-*/
-fn map(ss: str, ff: fn(char) -> char) -> str {
-    let result = "";
-
-    str::iter_chars(ss, {|cc|
-        str::push_char(result, ff(cc));
-    });
-
-    ret result;
-}
-
-/*
-Function: windowed
-
-Create a vector of substrings of size `nn`
-*/
-fn windowed(nn: uint, ss: str) -> [str] {
-    let ww = [];
-    let len = str::char_len(ss);
-
-    assert 1u <= nn;
-
-    let ii = 0u;
-    while ii+nn <= len {
-        let w = char_slice( ss, ii, ii+nn );
-        vec::push(ww,w);
-        ii += 1u;
-    }
-
-    ret ww;
-}
 
 #[cfg(test)]
 mod tests {