libcore: handle trailing newlines more like other languages.
Specifically, `lines` and `each_line` will not emit a trailing empty string when given "...\n". Also, add `read_lines`, which just collects all of `each_line` into a vector, and `split_*_no_trailing`, which is the generalised version of `lines`.
This commit is contained in:
parent
9584c60871
commit
429b8a9b9e
@ -99,8 +99,8 @@ pub trait ReaderUtil {
|
||||
/// Read len bytes into a new vec.
|
||||
fn read_bytes(&self, len: uint) -> ~[u8];
|
||||
|
||||
/// Read up until a specified character (which is not returned) or EOF.
|
||||
fn read_until(&self, c: char) -> ~str;
|
||||
/// Read up until a specified character (which is optionally included) or EOF.
|
||||
fn read_until(&self, c: char, include: bool) -> ~str;
|
||||
|
||||
/// Read up until the first '\n' char (which is not returned), or EOF.
|
||||
fn read_line(&self) -> ~str;
|
||||
@ -126,6 +126,9 @@ pub trait ReaderUtil {
|
||||
/// Iterate over every line until the iterator breaks or EOF.
|
||||
fn each_line(&self, it: &fn(&str) -> bool);
|
||||
|
||||
/// Read all the lines of the file into a vector.
|
||||
fn read_lines(&self) -> ~[~str];
|
||||
|
||||
/// Read n (between 1 and 8) little-endian unsigned integer bytes.
|
||||
fn read_le_uint_n(&self, nbytes: uint) -> u64;
|
||||
|
||||
@ -219,11 +222,14 @@ impl<T:Reader> ReaderUtil for T {
|
||||
bytes
|
||||
}
|
||||
|
||||
fn read_until(&self, c: char) -> ~str {
|
||||
fn read_until(&self, c: char, include: bool) -> ~str {
|
||||
let mut bytes = ~[];
|
||||
loop {
|
||||
let ch = self.read_byte();
|
||||
if ch == -1 || ch == c as int {
|
||||
if include && ch == c as int {
|
||||
bytes.push(ch as u8);
|
||||
}
|
||||
break;
|
||||
}
|
||||
bytes.push(ch as u8);
|
||||
@ -232,7 +238,7 @@ impl<T:Reader> ReaderUtil for T {
|
||||
}
|
||||
|
||||
fn read_line(&self) -> ~str {
|
||||
self.read_until('\n')
|
||||
self.read_until('\n', false)
|
||||
}
|
||||
|
||||
fn read_chars(&self, n: uint) -> ~[char] {
|
||||
@ -306,7 +312,7 @@ impl<T:Reader> ReaderUtil for T {
|
||||
}
|
||||
|
||||
fn read_c_str(&self) -> ~str {
|
||||
self.read_until(0 as char)
|
||||
self.read_until(0 as char, false)
|
||||
}
|
||||
|
||||
fn read_whole_stream(&self) -> ~[u8] {
|
||||
@ -329,7 +335,29 @@ impl<T:Reader> ReaderUtil for T {
|
||||
|
||||
fn each_line(&self, it: &fn(s: &str) -> bool) {
|
||||
while !self.eof() {
|
||||
if !it(self.read_line()) { break; }
|
||||
// include the \n, so that we can distinguish an entirely empty
|
||||
// line read after "...\n", and the trailing empty line in
|
||||
// "...\n\n".
|
||||
let mut line = self.read_until('\n', true);
|
||||
|
||||
// blank line at the end of the reader is ignored
|
||||
if self.eof() && line.is_empty() { break; }
|
||||
|
||||
// trim the \n, so that each_line is consistent with read_line
|
||||
let n = str::len(line);
|
||||
if line[n-1] == '\n' as u8 {
|
||||
unsafe { str::raw::set_len(&mut line, n-1); }
|
||||
}
|
||||
|
||||
if !it(line) { break; }
|
||||
}
|
||||
}
|
||||
|
||||
// Gathers every line produced by `each_line` into a new vector.
// `each_line` hands out borrowed `&str` slices, so each one is converted
// with `str::from_slice` before being pushed, giving the `~[~str]` result.
// Because this is built on `each_line`, a blank line at the very end of the
// reader is not included.
fn read_lines(&self) -> ~[~str] {
|
||||
do vec::build |push| {
|
||||
for self.each_line |line| {
|
||||
push(str::from_slice(line));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1335,6 +1363,21 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
// Exercises `read_lines` on input with a trailing newline, without one, and
// on empty input.
#[test]
|
||||
fn test_read_lines() {
|
||||
// Input ends with '\n': no extra empty line is expected after "c".
do io::with_str_reader(~"a\nb\nc\n") |inp| {
|
||||
fail_unless!(inp.read_lines() == ~[~"a", ~"b", ~"c"]);
|
||||
}
|
||||
|
||||
// Input without a final '\n': the last, unterminated line is still expected.
do io::with_str_reader(~"a\nb\nc") |inp| {
|
||||
fail_unless!(inp.read_lines() == ~[~"a", ~"b", ~"c"]);
|
||||
}
|
||||
|
||||
// Empty input: no lines at all are expected.
do io::with_str_reader(~"") |inp| {
|
||||
fail_unless!(inp.read_lines().is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_readchars_wide() {
|
||||
let wide_test = ~"生锈的汤匙切肉汤hello生锈的汤匙切肉汤";
|
||||
|
@ -437,28 +437,37 @@ pub pure fn slice(s: &'a str, begin: uint, end: uint) -> &'a str {
|
||||
unsafe { raw::slice_bytes(s, begin, end) }
|
||||
}
|
||||
|
||||
/// Splits a string into substrings at each occurrence of a given character
|
||||
/// Splits a string into substrings at each occurrence of a given
|
||||
/// character.
|
||||
pub pure fn split_char(s: &str, sep: char) -> ~[~str] {
|
||||
split_char_inner(s, sep, len(s), true)
|
||||
split_char_inner(s, sep, len(s), true, true)
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits a string into substrings at each occurrence of a given
|
||||
* character up to 'count' times
|
||||
* character up to 'count' times.
|
||||
*
|
||||
* The byte must be a valid UTF-8/ASCII byte
|
||||
*/
|
||||
pub pure fn splitn_char(s: &str, sep: char, count: uint) -> ~[~str] {
|
||||
split_char_inner(s, sep, count, true)
|
||||
split_char_inner(s, sep, count, true, true)
|
||||
}
|
||||
|
||||
/// Like `split_char`, but omits empty strings from the returned vector
|
||||
pub pure fn split_char_nonempty(s: &str, sep: char) -> ~[~str] {
|
||||
split_char_inner(s, sep, len(s), false)
|
||||
split_char_inner(s, sep, len(s), false, false)
|
||||
}
|
||||
|
||||
pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool)
|
||||
-> ~[~str] {
|
||||
/**
|
||||
* Like `split_char`, but a trailing empty string is omitted
|
||||
* (e.g. `split_char_no_trailing("A B ",' ') == ~[~"A",~"B"]`)
|
||||
*/
|
||||
pub pure fn split_char_no_trailing(s: &str, sep: char) -> ~[~str] {
|
||||
split_char_inner(s, sep, len(s), true, false)
|
||||
}
|
||||
|
||||
pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool,
|
||||
allow_trailing_empty: bool) -> ~[~str] {
|
||||
if sep < 128u as char {
|
||||
let b = sep as u8, l = len(s);
|
||||
let mut result = ~[], done = 0u;
|
||||
@ -475,19 +484,20 @@ pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool)
|
||||
}
|
||||
i += 1u;
|
||||
}
|
||||
if allow_empty || start < l {
|
||||
// push the trailing substring unless it is empty and trailing empties are disallowed
|
||||
if allow_trailing_empty || start < l {
|
||||
unsafe { result.push(raw::slice_bytes_unique(s, start, l) ) };
|
||||
}
|
||||
result
|
||||
} else {
|
||||
splitn(s, |cur| cur == sep, count)
|
||||
split_inner(s, |cur| cur == sep, count, allow_empty, allow_trailing_empty)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Splits a string into substrings using a character function
|
||||
pub pure fn split(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] {
|
||||
split_inner(s, sepfn, len(s), true)
|
||||
split_inner(s, sepfn, len(s), true, true)
|
||||
}
|
||||
|
||||
/**
|
||||
@ -498,16 +508,25 @@ pub pure fn splitn(s: &str,
|
||||
sepfn: &fn(char) -> bool,
|
||||
count: uint)
|
||||
-> ~[~str] {
|
||||
split_inner(s, sepfn, count, true)
|
||||
split_inner(s, sepfn, count, true, true)
|
||||
}
|
||||
|
||||
/// Like `split`, but omits empty strings from the returned vector
|
||||
pub pure fn split_nonempty(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] {
|
||||
split_inner(s, sepfn, len(s), false)
|
||||
split_inner(s, sepfn, len(s), false, false)
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Like `split`, but a trailing empty string is omitted
|
||||
* (e.g. `split_no_trailing("A B ", |c| c == ' ') == ~[~"A",~"B"]`)
|
||||
*/
|
||||
pub pure fn split_no_trailing(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] {
|
||||
split_inner(s, sepfn, len(s), true, false)
|
||||
}
|
||||
|
||||
pure fn split_inner(s: &str, sepfn: &fn(cc: char) -> bool, count: uint,
|
||||
allow_empty: bool) -> ~[~str] {
|
||||
allow_empty: bool, allow_trailing_empty: bool) -> ~[~str] {
|
||||
let l = len(s);
|
||||
let mut result = ~[], i = 0u, start = 0u, done = 0u;
|
||||
while i < l && done < count {
|
||||
@ -523,7 +542,7 @@ pure fn split_inner(s: &str, sepfn: &fn(cc: char) -> bool, count: uint,
|
||||
}
|
||||
i = next;
|
||||
}
|
||||
if allow_empty || start < l {
|
||||
if allow_trailing_empty || start < l {
|
||||
unsafe {
|
||||
result.push(raw::slice_bytes_unique(s, start, l));
|
||||
}
|
||||
@ -630,9 +649,11 @@ pub fn levdistance(s: &str, t: &str) -> uint {
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits a string into a vector of the substrings separated by LF ('\n')
|
||||
* Splits a string into a vector of the substrings separated by LF ('\n').
|
||||
*/
|
||||
pub pure fn lines(s: &str) -> ~[~str] { split_char(s, '\n') }
|
||||
pub pure fn lines(s: &str) -> ~[~str] {
|
||||
split_char_no_trailing(s, '\n')
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits a string into a vector of the substrings separated by LF ('\n')
|
||||
@ -651,7 +672,7 @@ pub pure fn lines_any(s: &str) -> ~[~str] {
|
||||
|
||||
/// Splits a string into a vector of the substrings separated by whitespace
|
||||
pub pure fn words(s: &str) -> ~[~str] {
|
||||
split_nonempty(s, |c| char::is_whitespace(c))
|
||||
split_nonempty(s, char::is_whitespace)
|
||||
}
|
||||
|
||||
/** Split a string into a vector of substrings,
|
||||
@ -2669,6 +2690,35 @@ mod tests {
|
||||
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_char_no_trailing() {
|
||||
fn t(s: &str, c: char, u: &[~str]) {
|
||||
debug!(~"split_byte: " + s);
|
||||
let v = split_char_no_trailing(s, c);
|
||||
debug!("split_byte to: %?", v);
|
||||
fail_unless!(vec::all2(v, u, |a,b| a == b));
|
||||
}
|
||||
t(~"abc.hello.there", '.', ~[~"abc", ~"hello", ~"there"]);
|
||||
t(~".hello.there", '.', ~[~"", ~"hello", ~"there"]);
|
||||
t(~"...hello.there.", '.', ~[~"", ~"", ~"", ~"hello", ~"there"]);
|
||||
|
||||
fail_unless!(~[~"", ~"", ~"", ~"hello", ~"there"]
|
||||
== split_char_no_trailing(~"...hello.there.", '.'));
|
||||
|
||||
fail_unless!(~[] == split_char_no_trailing(~"", 'z'));
|
||||
fail_unless!(~[~""] == split_char_no_trailing(~"z", 'z'));
|
||||
fail_unless!(~[~"ok"] == split_char_no_trailing(~"ok", 'z'));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_char_no_trailing_2() {
|
||||
let data = ~"ประเทศไทย中华Việt Nam";
|
||||
fail_unless!(~[~"ประเทศไทย中华", ~"iệt Nam"]
|
||||
== split_char_no_trailing(data, 'V'));
|
||||
fail_unless!(~[~"ประเ", ~"ศไ", ~"ย中华Việt Nam"]
|
||||
== split_char_no_trailing(data, 'ท'));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_str() {
|
||||
fn t(s: &str, sep: &'a str, i: int, k: &str) {
|
||||
@ -2722,28 +2772,45 @@ mod tests {
|
||||
fail_unless!(~[~"ok"] == split(~"ok", |cc| cc == 'z'));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_no_trailing() {
|
||||
let data = ~"ประเทศไทย中华Việt Nam";
|
||||
fail_unless!(~[~"ประเทศไทย中", ~"Việt Nam"]
|
||||
== split_no_trailing (data, |cc| cc == '华'));
|
||||
|
||||
fail_unless!(~[~"", ~"", ~"XXX", ~"YYY"]
|
||||
== split_no_trailing(~"zzXXXzYYYz", char::is_lowercase));
|
||||
|
||||
fail_unless!(~[~"zz", ~"", ~"", ~"z", ~"", ~"", ~"z"]
|
||||
== split_no_trailing(~"zzXXXzYYYz", char::is_uppercase));
|
||||
|
||||
fail_unless!(~[~""] == split_no_trailing(~"z", |cc| cc == 'z'));
|
||||
fail_unless!(~[] == split_no_trailing(~"", |cc| cc == 'z'));
|
||||
fail_unless!(~[~"ok"] == split_no_trailing(~"ok", |cc| cc == 'z'));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lines() {
|
||||
let lf = ~"\nMary had a little lamb\nLittle lamb\n";
|
||||
let crlf = ~"\r\nMary had a little lamb\r\nLittle lamb\r\n";
|
||||
|
||||
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""]
|
||||
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"]
|
||||
== lines(lf));
|
||||
|
||||
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""]
|
||||
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"]
|
||||
== lines_any(lf));
|
||||
|
||||
fail_unless!(~[~"\r", ~"Mary had a little lamb\r",
|
||||
~"Little lamb\r", ~""]
|
||||
~"Little lamb\r"]
|
||||
== lines(crlf));
|
||||
|
||||
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""]
|
||||
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"]
|
||||
== lines_any(crlf));
|
||||
|
||||
fail_unless!(~[~""] == lines (~""));
|
||||
fail_unless!(~[~""] == lines_any(~""));
|
||||
fail_unless!(~[~"",~""] == lines (~"\n"));
|
||||
fail_unless!(~[~"",~""] == lines_any(~"\n"));
|
||||
fail_unless!(~[] == lines (~""));
|
||||
fail_unless!(~[] == lines_any(~""));
|
||||
fail_unless!(~[~""] == lines (~"\n"));
|
||||
fail_unless!(~[~""] == lines_any(~"\n"));
|
||||
fail_unless!(~[~"banana"] == lines (~"banana"));
|
||||
fail_unless!(~[~"banana"] == lines_any(~"banana"));
|
||||
}
|
||||
@ -3359,7 +3426,6 @@ mod tests {
|
||||
0 => fail_unless!("" == x),
|
||||
1 => fail_unless!("Mary had a little lamb" == x),
|
||||
2 => fail_unless!("Little lamb" == x),
|
||||
3 => fail_unless!("" == x),
|
||||
_ => ()
|
||||
}
|
||||
ii += 1;
|
||||
|
Loading…
x
Reference in New Issue
Block a user