Added string functions: split_func, split_char, lines, lines_any, words,
and more tests
This commit is contained in:
parent
3466c9b4be
commit
d8b0a1910a
@ -7,7 +7,8 @@
|
||||
export eq, lteq, hash, is_empty, is_not_empty, is_whitespace, byte_len,
|
||||
byte_len_range, index,
|
||||
rindex, find, starts_with, ends_with, substr, slice, split, splitn,
|
||||
split_str, concat, connect, to_lower, to_upper, replace, char_slice,
|
||||
split_str, split_func, split_char, lines, lines_any, words,
|
||||
concat, connect, to_lower, to_upper, replace, char_slice,
|
||||
trim_left, trim_right, trim, unshift_char, shift_char, pop_char,
|
||||
push_char, is_utf8, from_chars, to_chars, char_len, char_len_range,
|
||||
char_at, bytes, is_ascii, shift_byte, pop_byte,
|
||||
@ -252,7 +253,7 @@ fn from_chars(chs: [char]) -> str {
|
||||
/*
|
||||
Function: utf8_char_width
|
||||
|
||||
FIXME: What does this function do?
|
||||
Given a first byte, determine how many bytes are in this UTF-8 character
|
||||
*/
|
||||
pure fn utf8_char_width(b: u8) -> uint {
|
||||
let byte: uint = b as uint;
|
||||
@ -275,15 +276,27 @@ fn from_chars(chs: [char]) -> str {
|
||||
This function can be used to iterate over the unicode characters of a string.
|
||||
|
||||
Example:
|
||||
|
||||
> let s = "Clam chowder, hot sauce, pork rinds";
|
||||
> let i = 0;
|
||||
> while i < len(s) {
|
||||
> let {ch, next} = char_range_at(s, i);
|
||||
> log(debug, ch);
|
||||
> i = next;
|
||||
> let s = "中华Việt Nam";
|
||||
> let i = 0u;
|
||||
> while i < str::byte_len(s) {
|
||||
> let {ch, next} = str::char_range_at(s, i);
|
||||
> std::io::println(#fmt("%u: %c",i,ch));
|
||||
> i = next;
|
||||
> }
|
||||
|
||||
Example output:
|
||||
|
||||
0: 中
|
||||
3: 华
|
||||
6: V
|
||||
7: i
|
||||
8: ệ
|
||||
11: t
|
||||
12:
|
||||
13: N
|
||||
14: a
|
||||
15: m
|
||||
|
||||
Parameters:
|
||||
|
||||
s - The string
|
||||
@ -721,6 +734,8 @@ fn push_bytes(&s: str, bytes: [u8]) {
|
||||
Returns:
|
||||
|
||||
A vector containing all the strings between each occurrence of the separator
|
||||
|
||||
FIXME: should be renamed to split_byte
|
||||
*/
|
||||
fn split(s: str, sep: u8) -> [str] {
|
||||
let v: [str] = [];
|
||||
@ -772,6 +787,9 @@ fn splitn(s: str, sep: u8, count: uint) -> [str] {
|
||||
Returns:
|
||||
|
||||
A vector containing all the strings between each occurrence of the separator.
|
||||
|
||||
FIXME: should behave like split and split_char:
|
||||
assert ["", "XXX", "YYY", ""] == split_str(".XXX.YYY.", ".");
|
||||
*/
|
||||
fn split_str(s: str, sep: str) -> [str] {
|
||||
assert byte_len(sep) > 0u;
|
||||
@ -799,6 +817,76 @@ fn split_str(s: str, sep: str) -> [str] {
|
||||
ret v;
|
||||
}
|
||||
|
||||
/*
|
||||
Function: split_func
|
||||
|
||||
Splits a string into substrings using a function
|
||||
(unicode safe)
|
||||
|
||||
FIXME: will be renamed to split.
|
||||
*/
|
||||
fn split_func(ss: str, sepfn: fn&(cc: char)->bool) -> [str] {
|
||||
let vv: [str] = [];
|
||||
let accum: str = "";
|
||||
let ends_with_sep: bool = false;
|
||||
|
||||
str::iter_chars(ss, {|cc| if sepfn(cc) {
|
||||
vv += [accum];
|
||||
accum = "";
|
||||
ends_with_sep = true;
|
||||
} else {
|
||||
str::push_char(accum, cc);
|
||||
ends_with_sep = false;
|
||||
}
|
||||
});
|
||||
|
||||
if char_len(accum) >= 0u || ends_with_sep {
|
||||
vv += [accum];
|
||||
}
|
||||
|
||||
ret vv;
|
||||
}
|
||||
|
||||
/*
|
||||
Function: split_char
|
||||
|
||||
Splits a string into a vector of the substrings separated by a given character
|
||||
*/
|
||||
fn split_char(ss: str, cc: char) -> [str] {
|
||||
split_func(ss, {|kk| kk == cc})
|
||||
}
|
||||
|
||||
/*
|
||||
Function: lines
|
||||
|
||||
Splits a string into a vector of the substrings
|
||||
separated by LF ('\n')
|
||||
*/
|
||||
fn lines(ss: str) -> [str] {
|
||||
split_func(ss, {|cc| cc == '\n'})
|
||||
}
|
||||
|
||||
/*
|
||||
Function: lines_any
|
||||
|
||||
Splits a string into a vector of the substrings
|
||||
separated by LF ('\n') and/or CR LF ('\r\n')
|
||||
*/
|
||||
fn lines_any(ss: str) -> [str] {
|
||||
vec::map(lines(ss), {|s| trim_right(s)})
|
||||
}
|
||||
|
||||
/*
|
||||
Function: words
|
||||
|
||||
Splits a string into a vector of the substrings
|
||||
separated by whitespace
|
||||
*/
|
||||
fn words(ss: str) -> [str] {
|
||||
ret vec::filter( split_func(ss, {|cc| char::is_whitespace(cc)}),
|
||||
{|w| 0u < str::char_len(w)});
|
||||
}
|
||||
|
||||
/*
|
||||
Function: concat
|
||||
|
||||
|
@ -80,12 +80,83 @@ fn t(s: str, sep: str, i: int, k: str) {
|
||||
let v = str::split_str(s, sep);
|
||||
assert str::eq(v[i], k);
|
||||
}
|
||||
|
||||
//FIXME: should behave like split and split_char:
|
||||
//assert ["", "XXX", "YYY", ""] == str::split_str(".XXX.YYY.", ".");
|
||||
|
||||
t("abc::hello::there", "::", 0, "abc");
|
||||
t("abc::hello::there", "::", 1, "hello");
|
||||
t("abc::hello::there", "::", 2, "there");
|
||||
t("::hello::there", "::", 0, "hello");
|
||||
t("hello::there::", "::", 2, "");
|
||||
t("::hello::there::", "::", 2, "");
|
||||
t("ประเทศไทย中华Việt Nam", "中华", 0, "ประเทศไทย");
|
||||
t("ประเทศไทย中华Việt Nam", "中华", 1, "Việt Nam");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_func () {
|
||||
let data = "ประเทศไทย中华Việt Nam";
|
||||
assert ["ประเทศไทย中", "Việt Nam"]
|
||||
== str::split_func (data, {|cc| cc == '华'});
|
||||
|
||||
assert ["", "", "XXX", "YYY", ""]
|
||||
== str::split_func("zzXXXzYYYz", char::is_lowercase);
|
||||
|
||||
assert ["zz", "", "", "z", "", "", "z"]
|
||||
== str::split_func("zzXXXzYYYz", char::is_uppercase);
|
||||
|
||||
assert ["",""] == str::split_func("z", {|cc| cc == 'z'});
|
||||
assert [""] == str::split_func("", {|cc| cc == 'z'});
|
||||
assert ["ok"] == str::split_func("ok", {|cc| cc == 'z'});
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_char () {
|
||||
let data = "ประเทศไทย中华Việt Nam";
|
||||
assert ["ประเทศไทย中", "Việt Nam"]
|
||||
== str::split_char(data, '华');
|
||||
|
||||
assert ["", "", "XXX", "YYY", ""]
|
||||
== str::split_char("zzXXXzYYYz", 'z');
|
||||
assert ["",""] == str::split_char("z", 'z');
|
||||
assert [""] == str::split_char("", 'z');
|
||||
assert ["ok"] == str::split_char("ok", 'z');
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lines () {
|
||||
let lf = "\nMary had a little lamb\nLittle lamb\n";
|
||||
let crlf = "\r\nMary had a little lamb\r\nLittle lamb\r\n";
|
||||
|
||||
assert ["", "Mary had a little lamb", "Little lamb", ""]
|
||||
== str::lines(lf);
|
||||
|
||||
assert ["", "Mary had a little lamb", "Little lamb", ""]
|
||||
== str::lines_any(lf);
|
||||
|
||||
assert ["\r", "Mary had a little lamb\r", "Little lamb\r", ""]
|
||||
== str::lines(crlf);
|
||||
|
||||
assert ["", "Mary had a little lamb", "Little lamb", ""]
|
||||
== str::lines_any(crlf);
|
||||
|
||||
assert [""] == str::lines ("");
|
||||
assert [""] == str::lines_any("");
|
||||
assert ["",""] == str::lines ("\n");
|
||||
assert ["",""] == str::lines_any("\n");
|
||||
assert ["banana"] == str::lines ("banana");
|
||||
assert ["banana"] == str::lines_any("banana");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_words () {
|
||||
let data = "\nMary had a little lamb\nLittle lamb\n";
|
||||
assert ["Mary","had","a","little","lamb","Little","lamb"]
|
||||
== str::words(data);
|
||||
|
||||
assert ["ok"] == str::words("ok");
|
||||
assert [] == str::words("");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -215,6 +286,27 @@ fn test_char_slice() {
|
||||
assert (str::eq("bc", str::char_slice("abc", 1u, 3u)));
|
||||
assert (str::eq("", str::char_slice("abc", 1u, 1u)));
|
||||
assert (str::eq("\u65e5", str::char_slice("\u65e5\u672c", 0u, 1u)));
|
||||
|
||||
let data = "ประเทศไทย中华";
|
||||
assert (str::eq("ป", str::char_slice(data, 0u, 1u)));
|
||||
assert (str::eq("ร", str::char_slice(data, 1u, 2u)));
|
||||
assert (str::eq("华", str::char_slice(data, 10u, 11u)));
|
||||
assert (str::eq("", str::char_slice(data, 1u, 1u)));
|
||||
|
||||
fn a_million_letter_X() -> str {
|
||||
let i = 0;
|
||||
let rs = "";
|
||||
while i < 100000 { rs += "华华华华华华华华华华"; i += 1; }
|
||||
ret rs;
|
||||
}
|
||||
fn half_a_million_letter_X() -> str {
|
||||
let i = 0;
|
||||
let rs = "";
|
||||
while i < 100000 { rs += "华华华华华"; i += 1; }
|
||||
ret rs;
|
||||
}
|
||||
assert (str::eq(half_a_million_letter_X(),
|
||||
str::char_slice(a_million_letter_X(), 0u, 500000u)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
Loading…
Reference in New Issue
Block a user