From 2ba44e24d5fccfc6fe7c28a742d18e03b6acafae Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Sun, 12 Feb 2012 01:32:09 -0800 Subject: [PATCH] (core::str) rename byte_len_range -> substr_len_bytes and rename char_len_range -> substr_len_chars --- src/comp/syntax/codemap.rs | 2 +- src/libcore/str.rs | 92 +++++++++++++++++++++----------------- src/libstd/rope.rs | 10 ++--- 3 files changed, 56 insertions(+), 48 deletions(-) diff --git a/src/comp/syntax/codemap.rs b/src/comp/syntax/codemap.rs index cb2590e1951..d1b60916e19 100644 --- a/src/comp/syntax/codemap.rs +++ b/src/comp/syntax/codemap.rs @@ -135,7 +135,7 @@ fn lookup_byte_offset(cm: codemap::codemap, chpos: uint) let {fm,line} = lookup_line(cm,chpos,lookup); let line_offset = fm.lines[line].byte - fm.start_pos.byte; let col = chpos - fm.lines[line].ch; - let col_offset = str::byte_len_range(*fm.src, line_offset, col); + let col_offset = str::substr_len_bytes(*fm.src, line_offset, col); ret {fm: fm, pos: line_offset + col_offset}; } diff --git a/src/libcore/str.rs b/src/libcore/str.rs index c9f42914032..522ae89d395 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -85,8 +85,8 @@ export // Misc // FIXME: perhaps some more of this section shouldn't be exported? is_utf8, - char_len_range, - byte_len_range, + substr_len_bytes, + substr_len_chars, utf8_char_width, char_range_at, char_at, @@ -271,7 +271,7 @@ Failure: If the string does not contain any characters. */ fn pop_char(&s: str) -> char unsafe { - let end = byte_len(s); + let end = len_bytes(s); let {ch:ch, prev:end} = char_range_at_reverse(s, end); s = unsafe::slice_bytes(s, 0u, end); ret ch; @@ -288,7 +288,7 @@ If the string does not contain any characters. */ fn shift_char(&s: str) -> char unsafe { let r = char_range_at(s, 0u); - s = unsafe::slice_bytes(s, r.next, byte_len(s)); + s = unsafe::slice_bytes(s, r.next, len_bytes(s)); ret r.ch; } @@ -367,7 +367,7 @@ Convert a string to a vector of characters fn chars(s: str) -> [char] { let buf: [char] = []; let i = 0u; - let len = byte_len(s); + let len = len_bytes(s); while i < len { let cur = char_range_at(s, i); buf += [cur.ch]; @@ -443,7 +443,7 @@ fn splitn_byte(ss: str, sep: u8, count: uint) -> [str] unsafe { assert u8::is_ascii(sep); let vv = []; - let start = 0u, current = 0u, len = byte_len(ss); + let start = 0u, current = 0u, len = len_bytes(ss); let splits_done = 0u; while splits_done < count && current < len { @@ -550,7 +550,7 @@ up to `count` times fn splitn_char(ss: str, sep: char, count: uint) -> [str] unsafe { let vv = []; - let start = 0u, current = 0u, len = byte_len(ss); + let start = 0u, current = 0u, len = len_bytes(ss); let splits_done = 0u; while splits_done < count && current < len { @@ -656,10 +656,11 @@ Returns: The original string with all occurances of `from` replaced with `to` */ fn replace(s: str, from: str, to: str) : is_not_empty(from) -> str unsafe { - if byte_len(s) == 0u { + if len_bytes(s) == 0u { ret ""; } else if starts_with(s, from) { - ret to + replace(unsafe::slice_bytes(s, byte_len(from), byte_len(s)), + ret to + replace( + unsafe::slice_bytes(s, len_bytes(from), len_bytes(s)), from, to); } else { let idx = find(s, from); @@ -726,7 +727,7 @@ Return true if a predicate matches all characters or if the string contains no characters */ fn all(s: str, it: fn(char) -> bool) -> bool{ - ret substr_all(s, 0u, byte_len(s), it); + ret substr_all(s, 0u, len_bytes(s), it); } /* @@ -746,7 +747,7 @@ Apply a function to each character */ fn map(ss: str, ff: fn(char) -> char) -> str { let result = ""; - reserve(result, byte_len(ss)); + reserve(result, len_bytes(ss)); chars_iter(ss, {|cc| str::push_char(result, ff(cc)); @@ -762,7 +763,7 @@ Iterate over the bytes in a string */ fn bytes_iter(ss: str, it: fn(u8)) { let pos = 0u; - let len = byte_len(ss); + let len = len_bytes(ss); while (pos < len) { it(ss[pos]); @@ -776,7 +777,7 @@ Function: chars_iter Iterate over the characters in a string */ fn chars_iter(s: str, it: fn(char)) { - let pos = 0u, len = byte_len(s); + let pos = 0u, len = len_bytes(s); while (pos < len) { let {ch, next} = char_range_at(s, pos); pos = next; @@ -833,7 +834,7 @@ Section: Searching fn index(ss: str, cc: char) -> option { let bii = 0u; let cii = 0u; - let len = byte_len(ss); + let len = len_bytes(ss); while bii < len { let {ch, next} = char_range_at(ss, bii); @@ -855,7 +856,7 @@ fn index(ss: str, cc: char) -> option { // Returns the index of the first matching char // (as option some/none) fn rindex(ss: str, cc: char) -> option { - let bii = byte_len(ss); + let bii = len_bytes(ss); let cii = len(ss); while bii > 0u { let {ch, prev} = char_range_at_reverse(ss, bii); @@ -890,8 +891,8 @@ The index of the first occurance of `needle`, or -1 if not found. FIXME: UTF-8 */ fn find(haystack: str, needle: str) -> int { - let haystack_len: int = byte_len(haystack) as int; - let needle_len: int = byte_len(needle) as int; + let haystack_len: int = len_bytes(haystack) as int; + let needle_len: int = len_bytes(needle) as int; if needle_len == 0 { ret 0; } fn match_at(haystack: str, needle: str, i: int) -> bool { let j: int = i; @@ -931,8 +932,8 @@ haystack - The string to look in needle - The string to look for */ fn starts_with(haystack: str, needle: str) -> bool { - let haystack_len: uint = byte_len(haystack); - let needle_len: uint = byte_len(needle); + let haystack_len: uint = len_bytes(haystack); + let needle_len: uint = len_bytes(needle); if needle_len == 0u { ret true; } if needle_len > haystack_len { ret false; } ret eq(substr(haystack, 0u, needle_len), needle); @@ -969,7 +970,7 @@ Function: is_ascii Determines if a string contains only ASCII characters */ fn is_ascii(s: str) -> bool { - let i: uint = byte_len(s); + let i: uint = len_bytes(s); while i > 0u { i -= 1u; if !u8::is_ascii(s[i]) { ret false; } } ret true; } @@ -1019,7 +1020,7 @@ pure fn byte_len(s: str) -> uint unsafe { len_bytes(s) } // String length or size in characters. // (Synonyms: len_chars, char_len) fn len(s: str) -> uint { - char_len_range(s, 0u, byte_len(s)) + substr_len_chars(s, 0u, len_bytes(s)) } fn len_chars(s: str) -> uint { len(s) } @@ -1054,7 +1055,7 @@ fn is_utf8(v: [u8]) -> bool { } /* -Function: char_len_range +Function: substr_len_chars As char_len but for a slice of a string @@ -1071,10 +1072,8 @@ Safety note: - This function does not check whether the substring is valid. - This function fails if `byte_offset` or `byte_len` do not represent valid positions inside `s` - -FIXME: rename to 'substr_len_chars' */ -fn char_len_range(s: str, byte_start: uint, byte_len: uint) -> uint { +fn substr_len_chars(s: str, byte_start: uint, byte_len: uint) -> uint { let i = byte_start; let byte_stop = i + byte_len; let len = 0u; @@ -1088,7 +1087,7 @@ fn char_len_range(s: str, byte_start: uint, byte_len: uint) -> uint { } /* -Function: byte_len_range +Function: substr_len_bytes As byte_len but for a substring @@ -1108,7 +1107,7 @@ valid positions in `s` FIXME: rename to 'substr_len_bytes' */ -fn byte_len_range(s: str, byte_offset: uint, char_len: uint) -> uint { +fn substr_len_bytes(s: str, byte_offset: uint, char_len: uint) -> uint { let i = byte_offset; let chars = 0u; while chars < char_len { @@ -1149,7 +1148,7 @@ This function can be used to iterate over the unicode characters of a string. Example: > let s = "中华Việt Nam"; > let i = 0u; -> while i < str::byte_len(s) { +> while i < str::len_bytes(s) { > let {ch, next} = str::char_range_at(s, i); > std::io::println(#fmt("%u: %c",i,ch)); > i = next; @@ -1391,7 +1390,7 @@ mod unsafe { unsafe fn slice_bytes(s: str, begin: uint, end: uint) -> str unsafe { // FIXME: Typestate precondition assert (begin <= end); - assert (end <= byte_len(s)); + assert (end <= len_bytes(s)); let v = as_bytes(s) { |v| vec::slice(v, begin, end) }; v += [0u8]; @@ -1408,7 +1407,7 @@ mod unsafe { unsafe fn slice_bytes_safe_range(s: str, begin: uint, end: uint) : uint::le(begin, end) -> str { // would need some magic to make this a precondition - assert (end <= byte_len(s)); + assert (end <= len_bytes(s)); ret slice_bytes(s, begin, end); } @@ -1430,7 +1429,7 @@ mod unsafe { // // Removes the last byte from a string and returns it. (Not UTF-8 safe). unsafe fn pop_byte(&s: str) -> u8 unsafe { - let len = byte_len(s); + let len = len_bytes(s); assert (len > 0u); let b = s[len - 1u]; s = unsafe::slice_bytes(s, 0u, len - 1u); @@ -1441,7 +1440,7 @@ mod unsafe { // // Removes the first byte from a string and returns it. (Not UTF-8 safe). unsafe fn shift_byte(&s: str) -> u8 unsafe { - let len = byte_len(s); + let len = len_bytes(s); assert (len > 0u); let b = s[0]; s = unsafe::slice_bytes(s, 1u, len); @@ -1470,14 +1469,23 @@ mod tests { } #[test] - fn test_bytes_len() { - assert (byte_len("") == 0u); - assert (byte_len("hello world") == 11u); - assert (byte_len("\x63") == 1u); - assert (byte_len("\xa2") == 2u); - assert (byte_len("\u03c0") == 2u); - assert (byte_len("\u2620") == 3u); - assert (byte_len("\U0001d11e") == 4u); + fn test_len() { + assert (len_bytes("") == 0u); + assert (len_bytes("hello world") == 11u); + assert (len_bytes("\x63") == 1u); + assert (len_bytes("\xa2") == 2u); + assert (len_bytes("\u03c0") == 2u); + assert (len_bytes("\u2620") == 3u); + assert (len_bytes("\U0001d11e") == 4u); + + assert (len("") == 0u); + assert (len("hello world") == 11u); + assert (len("\x63") == 1u); + assert (len("\xa2") == 1u); + assert (len("\u03c0") == 1u); + assert (len("\u2620") == 1u); + assert (len("\U0001d11e") == 1u); + assert (len("ประเทศไทย中华Việt Nam") == 19u); } #[test] @@ -1712,7 +1720,7 @@ mod tests { #[test] fn test_substr() { fn t(a: str, b: str, start: int) { - assert (eq(substr(a, start as uint, byte_len(b)), b)); + assert (eq(substr(a, start as uint, len_bytes(b)), b)); } t("hello", "llo", 2); t("hello", "el", 1); @@ -2002,7 +2010,7 @@ mod tests { let v: [u8] = bytes(s1); let s2: str = from_bytes(v); let i: uint = 0u; - let n1: uint = byte_len(s1); + let n1: uint = len_bytes(s1); let n2: uint = vec::len::(v); assert (n1 == n2); while i < n1 { diff --git a/src/libstd/rope.rs b/src/libstd/rope.rs index c329cdd3494..07ee837a9b9 100644 --- a/src/libstd/rope.rs +++ b/src/libstd/rope.rs @@ -743,7 +743,7 @@ mod node { */ fn of_substr(str: @str, byte_start: uint, byte_len: uint) -> @node { ret of_substr_unsafer(str, byte_start, byte_len, - str::char_len_range(*str, byte_start, byte_len)); + str::substr_len_chars(*str, byte_start, byte_len)); } /* @@ -795,7 +795,7 @@ mod node { if i == 0u { first_leaf_char_len } else { hint_max_leaf_char_len }; let chunk_byte_len = - str::byte_len_range(*str, offset, chunk_char_len); + str::substr_len_bytes(*str, offset, chunk_char_len); nodes[i] = @leaf({ byte_offset: offset, byte_len: chunk_byte_len, @@ -998,7 +998,7 @@ mod node { alt(*node) { node::leaf(x) { let char_len = - str::char_len_range(*x.content, byte_offset, byte_len); + str::substr_len_chars(*x.content, byte_offset, byte_len); ret @leaf({byte_offset: byte_offset, byte_len: byte_len, char_len: char_len, @@ -1059,9 +1059,9 @@ mod node { ret node; } let byte_offset = - str::byte_len_range(*x.content, 0u, char_offset); + str::substr_len_bytes(*x.content, 0u, char_offset); let byte_len = - str::byte_len_range(*x.content, byte_offset, char_len); + str::substr_len_bytes(*x.content, byte_offset, char_len); ret @leaf({byte_offset: byte_offset, byte_len: byte_len, char_len: char_len,