diff --git a/src/libcore/io.rs b/src/libcore/io.rs index b160da359f8..c5bfa942458 100644 --- a/src/libcore/io.rs +++ b/src/libcore/io.rs @@ -99,8 +99,8 @@ pub trait ReaderUtil { /// Read len bytes into a new vec. fn read_bytes(&self, len: uint) -> ~[u8]; - /// Read up until a specified character (which is not returned) or EOF. - fn read_until(&self, c: char) -> ~str; + /// Read up until a specified character (which is optionally included) or EOF. + fn read_until(&self, c: char, include: bool) -> ~str; /// Read up until the first '\n' char (which is not returned), or EOF. fn read_line(&self) -> ~str; @@ -126,6 +126,9 @@ pub trait ReaderUtil { /// Iterate over every line until the iterator breaks or EOF. fn each_line(&self, it: &fn(&str) -> bool); + /// Read all the lines of the file into a vector. + fn read_lines(&self) -> ~[~str]; + /// Read n (between 1 and 8) little-endian unsigned integer bytes. fn read_le_uint_n(&self, nbytes: uint) -> u64; @@ -219,11 +222,14 @@ impl ReaderUtil for T { bytes } - fn read_until(&self, c: char) -> ~str { + fn read_until(&self, c: char, include: bool) -> ~str { let mut bytes = ~[]; loop { let ch = self.read_byte(); if ch == -1 || ch == c as int { + if include && ch == c as int { + bytes.push(ch as u8); + } break; } bytes.push(ch as u8); @@ -232,7 +238,7 @@ impl ReaderUtil for T { } fn read_line(&self) -> ~str { - self.read_until('\n') + self.read_until('\n', false) } fn read_chars(&self, n: uint) -> ~[char] { @@ -306,7 +312,7 @@ impl ReaderUtil for T { } fn read_c_str(&self) -> ~str { - self.read_until(0 as char) + self.read_until(0 as char, false) } fn read_whole_stream(&self) -> ~[u8] { @@ -329,7 +335,29 @@ impl ReaderUtil for T { fn each_line(&self, it: &fn(s: &str) -> bool) { while !self.eof() { - if !it(self.read_line()) { break; } + // include the \n, so that we can distinguish an entirely empty + // line read after "...\n", and the trailing empty line in + // "...\n\n". 
+ let mut line = self.read_until('\n', true); + + // blank line at the end of the reader is ignored + if self.eof() && line.is_empty() { break; } + + // trim the \n, so that each_line is consistent with read_line + let n = str::len(line); + if line[n-1] == '\n' as u8 { + unsafe { str::raw::set_len(&mut line, n-1); } + } + + if !it(line) { break; } + } + } + + fn read_lines(&self) -> ~[~str] { + do vec::build |push| { + for self.each_line |line| { + push(str::from_slice(line)); + } } } @@ -1335,6 +1363,21 @@ mod tests { } } + #[test] + fn test_read_lines() { + do io::with_str_reader(~"a\nb\nc\n") |inp| { + fail_unless!(inp.read_lines() == ~[~"a", ~"b", ~"c"]); + } + + do io::with_str_reader(~"a\nb\nc") |inp| { + fail_unless!(inp.read_lines() == ~[~"a", ~"b", ~"c"]); + } + + do io::with_str_reader(~"") |inp| { + fail_unless!(inp.read_lines().is_empty()); + } + } + #[test] fn test_readchars_wide() { let wide_test = ~"生锈的汤匙切肉汤hello生锈的汤匙切肉汤"; diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 980e984f75b..bbb5ce3d8a1 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -437,28 +437,37 @@ pub pure fn slice(s: &'a str, begin: uint, end: uint) -> &'a str { unsafe { raw::slice_bytes(s, begin, end) } } -/// Splits a string into substrings at each occurrence of a given character +/// Splits a string into substrings at each occurrence of a given +/// character. pub pure fn split_char(s: &str, sep: char) -> ~[~str] { - split_char_inner(s, sep, len(s), true) + split_char_inner(s, sep, len(s), true, true) } /** * Splits a string into substrings at each occurrence of a given - * character up to 'count' times + * character up to 'count' times. 
* * The byte must be a valid UTF-8/ASCII byte */ pub pure fn splitn_char(s: &str, sep: char, count: uint) -> ~[~str] { - split_char_inner(s, sep, count, true) + split_char_inner(s, sep, count, true, true) } /// Like `split_char`, but omits empty strings from the returned vector pub pure fn split_char_nonempty(s: &str, sep: char) -> ~[~str] { - split_char_inner(s, sep, len(s), false) + split_char_inner(s, sep, len(s), false, false) } -pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool) - -> ~[~str] { +/** + * Like `split_char`, but a trailing empty string is omitted + * (e.g. `split_char_no_trailing("A B ",' ') == ~[~"A",~"B"]`) + */ +pub pure fn split_char_no_trailing(s: &str, sep: char) -> ~[~str] { + split_char_inner(s, sep, len(s), true, false) +} + +pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool, + allow_trailing_empty: bool) -> ~[~str] { if sep < 128u as char { let b = sep as u8, l = len(s); let mut result = ~[], done = 0u; @@ -475,19 +484,20 @@ pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool) } i += 1u; } - if allow_empty || start < l { + // push the trailing substring unless it is empty and trailing empties are disallowed + if allow_trailing_empty || start < l { unsafe { result.push(raw::slice_bytes_unique(s, start, l) ) }; } result } else { - splitn(s, |cur| cur == sep, count) + split_inner(s, |cur| cur == sep, count, allow_empty, allow_trailing_empty) } } /// Splits a string into substrings using a character function pub pure fn split(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] { - split_inner(s, sepfn, len(s), true) + split_inner(s, sepfn, len(s), true, true) } /** @@ -498,16 +508,25 @@ pub pure fn splitn(s: &str, sepfn: &fn(char) -> bool, count: uint) -> ~[~str] { - split_inner(s, sepfn, count, true) + split_inner(s, sepfn, count, true, true) } /// Like `split`, but omits empty strings from the returned vector pub pure fn split_nonempty(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] { - split_inner(s, sepfn, 
len(s), false) + split_inner(s, sepfn, len(s), false, false) +} + + +/** + * Like `split`, but a trailing empty string is omitted + * (e.g. `split_no_trailing("A B ", char::is_whitespace) == ~[~"A",~"B"]`) + */ +pub pure fn split_no_trailing(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] { + split_inner(s, sepfn, len(s), true, false) } pure fn split_inner(s: &str, sepfn: &fn(cc: char) -> bool, count: uint, - allow_empty: bool) -> ~[~str] { + allow_empty: bool, allow_trailing_empty: bool) -> ~[~str] { let l = len(s); let mut result = ~[], i = 0u, start = 0u, done = 0u; while i < l && done < count { @@ -523,7 +542,7 @@ pure fn split_inner(s: &str, sepfn: &fn(cc: char) -> bool, count: uint, } i = next; } - if allow_empty || start < l { + if allow_trailing_empty || start < l { unsafe { result.push(raw::slice_bytes_unique(s, start, l)); } @@ -630,9 +649,11 @@ pub fn levdistance(s: &str, t: &str) -> uint { } /** - * Splits a string into a vector of the substrings separated by LF ('\n') + * Splits a string into a vector of the substrings separated by LF ('\n'). 
*/ -pub pure fn lines(s: &str) -> ~[~str] { split_char(s, '\n') } +pub pure fn lines(s: &str) -> ~[~str] { + split_char_no_trailing(s, '\n') +} /** * Splits a string into a vector of the substrings separated by LF ('\n') @@ -651,7 +672,7 @@ pub pure fn lines_any(s: &str) -> ~[~str] { /// Splits a string into a vector of the substrings separated by whitespace pub pure fn words(s: &str) -> ~[~str] { - split_nonempty(s, |c| char::is_whitespace(c)) + split_nonempty(s, char::is_whitespace) } /** Split a string into a vector of substrings, @@ -2669,6 +2690,35 @@ mod tests { } + #[test] + fn test_split_char_no_trailing() { + fn t(s: &str, c: char, u: &[~str]) { + debug!(~"split_byte: " + s); + let v = split_char_no_trailing(s, c); + debug!("split_byte to: %?", v); + fail_unless!(vec::all2(v, u, |a,b| a == b)); + } + t(~"abc.hello.there", '.', ~[~"abc", ~"hello", ~"there"]); + t(~".hello.there", '.', ~[~"", ~"hello", ~"there"]); + t(~"...hello.there.", '.', ~[~"", ~"", ~"", ~"hello", ~"there"]); + + fail_unless!(~[~"", ~"", ~"", ~"hello", ~"there"] + == split_char_no_trailing(~"...hello.there.", '.')); + + fail_unless!(~[] == split_char_no_trailing(~"", 'z')); + fail_unless!(~[~""] == split_char_no_trailing(~"z", 'z')); + fail_unless!(~[~"ok"] == split_char_no_trailing(~"ok", 'z')); + } + + #[test] + fn test_split_char_no_trailing_2() { + let data = ~"ประเทศไทย中华Việt Nam"; + fail_unless!(~[~"ประเทศไทย中华", ~"iệt Nam"] + == split_char_no_trailing(data, 'V')); + fail_unless!(~[~"ประเ", ~"ศไ", ~"ย中华Việt Nam"] + == split_char_no_trailing(data, 'ท')); + } + #[test] fn test_split_str() { fn t(s: &str, sep: &'a str, i: int, k: &str) { @@ -2722,28 +2772,45 @@ mod tests { fail_unless!(~[~"ok"] == split(~"ok", |cc| cc == 'z')); } + #[test] + fn test_split_no_trailing() { + let data = ~"ประเทศไทย中华Việt Nam"; + fail_unless!(~[~"ประเทศไทย中", ~"Việt Nam"] + == split_no_trailing (data, |cc| cc == '华')); + + fail_unless!(~[~"", ~"", ~"XXX", ~"YYY"] + == split_no_trailing(~"zzXXXzYYYz", 
char::is_lowercase)); + + fail_unless!(~[~"zz", ~"", ~"", ~"z", ~"", ~"", ~"z"] + == split_no_trailing(~"zzXXXzYYYz", char::is_uppercase)); + + fail_unless!(~[~""] == split_no_trailing(~"z", |cc| cc == 'z')); + fail_unless!(~[] == split_no_trailing(~"", |cc| cc == 'z')); + fail_unless!(~[~"ok"] == split_no_trailing(~"ok", |cc| cc == 'z')); + } + #[test] fn test_lines() { let lf = ~"\nMary had a little lamb\nLittle lamb\n"; let crlf = ~"\r\nMary had a little lamb\r\nLittle lamb\r\n"; - fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""] + fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"] == lines(lf)); - fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""] + fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"] == lines_any(lf)); fail_unless!(~[~"\r", ~"Mary had a little lamb\r", - ~"Little lamb\r", ~""] + ~"Little lamb\r"] == lines(crlf)); - fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""] + fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"] == lines_any(crlf)); - fail_unless!(~[~""] == lines (~"")); - fail_unless!(~[~""] == lines_any(~"")); - fail_unless!(~[~"",~""] == lines (~"\n")); - fail_unless!(~[~"",~""] == lines_any(~"\n")); + fail_unless!(~[] == lines (~"")); + fail_unless!(~[] == lines_any(~"")); + fail_unless!(~[~""] == lines (~"\n")); + fail_unless!(~[~""] == lines_any(~"\n")); fail_unless!(~[~"banana"] == lines (~"banana")); fail_unless!(~[~"banana"] == lines_any(~"banana")); } @@ -3359,7 +3426,6 @@ mod tests { 0 => fail_unless!("" == x), 1 => fail_unless!("Mary had a little lamb" == x), 2 => fail_unless!("Little lamb" == x), - 3 => fail_unless!("" == x), _ => () } ii += 1;