Added string functions: split_func, split_char, lines, lines_any, words,

and more tests
This commit is contained in:
Kevin Cantu 2012-01-15 20:20:06 -08:00
parent 3466c9b4be
commit d8b0a1910a
2 changed files with 189 additions and 9 deletions

View File

@ -7,7 +7,8 @@
export eq, lteq, hash, is_empty, is_not_empty, is_whitespace, byte_len,
byte_len_range, index,
rindex, find, starts_with, ends_with, substr, slice, split, splitn,
split_str, concat, connect, to_lower, to_upper, replace, char_slice,
split_str, split_func, split_char, lines, lines_any, words,
concat, connect, to_lower, to_upper, replace, char_slice,
trim_left, trim_right, trim, unshift_char, shift_char, pop_char,
push_char, is_utf8, from_chars, to_chars, char_len, char_len_range,
char_at, bytes, is_ascii, shift_byte, pop_byte,
@ -252,7 +253,7 @@ fn from_chars(chs: [char]) -> str {
/*
Function: utf8_char_width
FIXME: What does this function do?
Given a first byte, determine how many bytes are in this UTF-8 character
*/
pure fn utf8_char_width(b: u8) -> uint {
let byte: uint = b as uint;
@ -275,15 +276,27 @@ fn from_chars(chs: [char]) -> str {
This function can be used to iterate over the unicode characters of a string.
Example:
> let s = "Clam chowder, hot sauce, pork rinds";
> let i = 0;
> while i < len(s) {
> let {ch, next} = char_range_at(s, i);
> log(debug, ch);
> i = next;
> let s = "中华Việt Nam";
> let i = 0u;
> while i < str::byte_len(s) {
> let {ch, next} = str::char_range_at(s, i);
> std::io::println(#fmt("%u: %c",i,ch));
> i = next;
> }
Example output:
0: 中
3: 华
6: V
7: i
8: ệ
11: t
12:
13: N
14: a
15: m
Parameters:
s - The string
@ -721,6 +734,8 @@ fn push_bytes(&s: str, bytes: [u8]) {
Returns:
A vector containing all the strings between each occurrence of the separator
FIXME: should be renamed to split_byte
*/
fn split(s: str, sep: u8) -> [str] {
let v: [str] = [];
@ -772,6 +787,9 @@ fn splitn(s: str, sep: u8, count: uint) -> [str] {
Returns:
A vector containing all the strings between each occurrence of the separator.
FIXME: should behave like split and split_char:
assert ["", "XXX", "YYY", ""] == split_str(".XXX.YYY.", ".");
*/
fn split_str(s: str, sep: str) -> [str] {
assert byte_len(sep) > 0u;
@ -799,6 +817,76 @@ fn split_str(s: str, sep: str) -> [str] {
ret v;
}
/*
Function: split_func

Splits a string into substrings using a function
(unicode safe)

Parameters:

ss - The string to split
sepfn - Called once per character of `ss`; every character for which it
        returns true is treated as a separator and is left out of the output

Returns:

A vector of the pieces found between separators. The piece following the
last separator is always included, even when it is empty, so the result
has at least one element (splitting "" yields [""]) and adjacent
separators produce empty strings.

FIXME: will be renamed to split.
*/
fn split_func(ss: str, sepfn: fn&(cc: char)->bool) -> [str] {
    let vv: [str] = [];
    let accum: str = "";

    str::iter_chars(ss, {|cc| if sepfn(cc) {
            // A separator closes the piece collected so far.
            vv += [accum];
            accum = "";
        } else {
            str::push_char(accum, cc);
        }
    });

    // The trailing piece is emitted unconditionally. The previous guard,
    // `char_len(accum) >= 0u || ends_with_sep`, compared an unsigned value
    // against zero and was therefore always true.
    vv += [accum];
    ret vv;
}
/*
Function: split_char
Splits a string into a vector of the substrings separated by a given character
*/
fn split_char(ss: str, cc: char) -> [str] {
split_func(ss, {|kk| kk == cc})
}
/*
Function: lines
Splits a string into a vector of the substrings
separated by LF ('\n')
*/
fn lines(ss: str) -> [str] {
split_func(ss, {|cc| cc == '\n'})
}
/*
Function: lines_any
Splits a string into a vector of the substrings
separated by LF ('\n') and/or CR LF ('\r\n')
*/
fn lines_any(ss: str) -> [str] {
vec::map(lines(ss), {|s| trim_right(s)})
}
/*
Function: words
Splits a string into a vector of the substrings
separated by whitespace
*/
fn words(ss: str) -> [str] {
ret vec::filter( split_func(ss, {|cc| char::is_whitespace(cc)}),
{|w| 0u < str::char_len(w)});
}
/*
Function: concat

View File

@ -80,12 +80,83 @@ fn t(s: str, sep: str, i: int, k: str) {
let v = str::split_str(s, sep);
assert str::eq(v[i], k);
}
//FIXME: should behave like split and split_char:
//assert ["", "XXX", "YYY", ""] == str::split_str(".XXX.YYY.", ".");
t("abc::hello::there", "::", 0, "abc");
t("abc::hello::there", "::", 1, "hello");
t("abc::hello::there", "::", 2, "there");
t("::hello::there", "::", 0, "hello");
t("hello::there::", "::", 2, "");
t("::hello::there::", "::", 2, "");
t("ประเทศไทย中华Việt Nam", "中华", 0, "ประเทศไทย");
t("ประเทศไทย中华Việt Nam", "中华", 1, "Việt Nam");
}
// Exercises str::split_func with closure and named-function predicates.
#[test]
fn test_split_func () {
let data = "ประเทศไทย中华Việt Nam";
// A multi-byte separator character is removed; the text on both sides
// is returned intact.
assert ["ประเทศไทย中", "Việt Nam"]
== str::split_func (data, {|cc| cc == '华'});
// Leading, adjacent and trailing separators each produce an empty string.
assert ["", "", "XXX", "YYY", ""]
== str::split_func("zzXXXzYYYz", char::is_lowercase);
assert ["zz", "", "", "z", "", "", "z"]
== str::split_func("zzXXXzYYYz", char::is_uppercase);
// Edge cases: input that is only a separator, empty input, and input
// with no separator at all.
assert ["",""] == str::split_func("z", {|cc| cc == 'z'});
assert [""] == str::split_func("", {|cc| cc == 'z'});
assert ["ok"] == str::split_func("ok", {|cc| cc == 'z'});
}
// Exercises str::split_char with the same inputs used for split_func.
#[test]
fn test_split_char () {
let data = "ประเทศไทย中华Việt Nam";
// The separator may be a multi-byte character.
assert ["ประเทศไทย中", "Việt Nam"]
== str::split_char(data, '华');
// Leading, adjacent and trailing separators each produce an empty string.
assert ["", "", "XXX", "YYY", ""]
== str::split_char("zzXXXzYYYz", 'z');
// Edge cases: separator-only input, empty input, no separator present.
assert ["",""] == str::split_char("z", 'z');
assert [""] == str::split_char("", 'z');
assert ["ok"] == str::split_char("ok", 'z');
}
// Compares str::lines and str::lines_any on LF and CR LF terminated text.
#[test]
fn test_lines () {
let lf = "\nMary had a little lamb\nLittle lamb\n";
let crlf = "\r\nMary had a little lamb\r\nLittle lamb\r\n";
// On LF-only input both functions give the same result; the leading and
// trailing LF each contribute an empty line.
assert ["", "Mary had a little lamb", "Little lamb", ""]
== str::lines(lf);
assert ["", "Mary had a little lamb", "Little lamb", ""]
== str::lines_any(lf);
// On CR LF input, lines keeps the CR at the end of each line while
// lines_any removes it.
assert ["\r", "Mary had a little lamb\r", "Little lamb\r", ""]
== str::lines(crlf);
assert ["", "Mary had a little lamb", "Little lamb", ""]
== str::lines_any(crlf);
// Edge cases: empty input, a lone LF, and input without any terminator.
assert [""] == str::lines ("");
assert [""] == str::lines_any("");
assert ["",""] == str::lines ("\n");
assert ["",""] == str::lines_any("\n");
assert ["banana"] == str::lines ("banana");
assert ["banana"] == str::lines_any("banana");
}
// Exercises str::words: whitespace (spaces and newlines) separates words
// and no empty strings appear in the result.
#[test]
fn test_words () {
let data = "\nMary had a little lamb\nLittle lamb\n";
assert ["Mary","had","a","little","lamb","Little","lamb"]
== str::words(data);
assert ["ok"] == str::words("ok");
// Empty input yields an empty vector, not [""].
assert [] == str::words("");
}
#[test]
@ -215,6 +286,27 @@ fn test_char_slice() {
assert (str::eq("bc", str::char_slice("abc", 1u, 3u)));
assert (str::eq("", str::char_slice("abc", 1u, 1u)));
assert (str::eq("\u65e5", str::char_slice("\u65e5\u672c", 0u, 1u)));
// char_slice indexes by character, not byte: `data` holds nine Thai
// characters (indices 0-8) followed by 中 (9) and 华 (10).
let data = "ประเทศไทย中华";
assert (str::eq("ป", str::char_slice(data, 0u, 1u)));
assert (str::eq("ร", str::char_slice(data, 1u, 2u)));
assert (str::eq("华", str::char_slice(data, 10u, 11u)));
// An empty range yields the empty string.
assert (str::eq("", str::char_slice(data, 1u, 1u)));
// Builds a string of 1,000,000 characters by appending a ten-character
// literal 100,000 times. Despite the name, the character used is '华'
// (a multi-byte character), not 'X'.
fn a_million_letter_X() -> str {
let i = 0;
let rs = "";
while i < 100000 { rs += "华华华华华华华华华华"; i += 1; }
ret rs;
}
// Builds a string of 500,000 characters by appending a five-character
// literal 100,000 times; the character is '华', as in a_million_letter_X.
fn half_a_million_letter_X() -> str {
let i = 0;
let rs = "";
while i < 100000 { rs += "华华华华华"; i += 1; }
ret rs;
}
assert (str::eq(half_a_million_letter_X(),
str::char_slice(a_million_letter_X(), 0u, 500000u)));
}
#[test]