From ec5a028adac360537c8f37a669eda522bd8c9b6b Mon Sep 17 00:00:00 2001 From: Huon Wilson <dbau.pp+github@gmail.com> Date: Mon, 10 Jun 2013 21:46:36 +1000 Subject: [PATCH] std: convert str::char_at* to methods. --- src/compiletest/runtest.rs | 6 +- src/libextra/getopts.rs | 8 +- src/libextra/rope.rs | 7 +- src/libextra/time.rs | 10 +- src/librustc/middle/lint.rs | 2 +- src/libstd/io.rs | 2 +- src/libstd/str.rs | 257 ++++++++++++++++---------------- src/libsyntax/parse/lexer.rs | 4 +- src/test/run-pass/utf8_chars.rs | 4 +- 9 files changed, 147 insertions(+), 153 deletions(-) diff --git a/src/compiletest/runtest.rs b/src/compiletest/runtest.rs index d87101ffb85..c9e44a79160 100644 --- a/src/compiletest/runtest.rs +++ b/src/compiletest/runtest.rs @@ -429,7 +429,7 @@ fn scan_char(haystack: &str, needle: char, idx: &mut uint) -> bool { if *idx >= haystack.len() { return false; } - let range = str::char_range_at(haystack, *idx); + let range = haystack.char_range_at(*idx); if range.ch != needle { return false; } @@ -440,7 +440,7 @@ fn scan_char(haystack: &str, needle: char, idx: &mut uint) -> bool { fn scan_integer(haystack: &str, idx: &mut uint) -> bool { let mut i = *idx; while i < haystack.len() { - let range = str::char_range_at(haystack, i); + let range = haystack.char_range_at(i); if range.ch < '0' || '9' < range.ch { break; } @@ -460,7 +460,7 @@ fn scan_string(haystack: &str, needle: &str, idx: &mut uint) -> bool { if haystack_i >= haystack.len() { return false; } - let range = str::char_range_at(haystack, haystack_i); + let range = haystack.char_range_at(haystack_i); haystack_i = range.next; if !scan_char(needle, range.ch, &mut needle_i) { return false; diff --git a/src/libextra/getopts.rs b/src/libextra/getopts.rs index 111de53052c..76e921f02f9 100644 --- a/src/libextra/getopts.rs +++ b/src/libextra/getopts.rs @@ -112,7 +112,7 @@ pub struct Opt { fn mkname(nm: &str) -> Name { if nm.len() == 1u { - Short(str::char_at(nm, 0u)) + Short(nm.char_at(0u)) } else { Long(nm.to_owned()) } @@ -261,7 +261,7 @@ pub fn getopts(args: &[~str], opts: &[Opt]) -> Result { let mut last_valid_opt_id = None; names = ~[]; while j < curlen { - let range = str::char_range_at(cur, j); + let range = cur.char_range_at(j); let opt = Short(range.ch); /* In a series of potential options (eg. -aheJ), if we @@ -565,11 +565,11 @@ pub mod groups { hasarg: hasarg, occur: occur}], - (1,0) => ~[Opt {name: Short(str::char_at(short_name, 0)), + (1,0) => ~[Opt {name: Short(short_name.char_at(0)), hasarg: hasarg, occur: occur}], - (1,_) => ~[Opt {name: Short(str::char_at(short_name, 0)), + (1,_) => ~[Opt {name: Short(short_name.char_at(0)), hasarg: hasarg, occur: occur}, Opt {name: Long((long_name)), diff --git a/src/libextra/rope.rs b/src/libextra/rope.rs index de78e0a6eeb..80d80fa0ade 100644 --- a/src/libextra/rope.rs +++ b/src/libextra/rope.rs @@ -1132,7 +1132,7 @@ pub mod node { pub fn char_at(mut node: @Node, mut pos: uint) -> char { loop { match *node { - Leaf(x) => return str::char_at(*x.content, pos), + Leaf(x) => return x.content.char_at(pos), Concat(Concat {left, right, _}) => { let left_len = char_len(left); node = if left_len > pos { left } @@ -1257,8 +1257,7 @@ pub mod node { return None } else { let range = - str::char_range_at(*aleaf.content, - (*it).leaf_byte_pos + aleaf.byte_offset); + aleaf.content.char_range_at((*it).leaf_byte_pos + aleaf.byte_offset); let ch = range.ch; let next = range.next; (*it).leaf_byte_pos = next - aleaf.byte_offset; @@ -1345,7 +1344,7 @@ mod tests { equal = false; } break; } Some(c) => { - let range = str::char_range_at(*sample, string_iter); + let range = sample.char_range_at(string_iter); string_iter = range.next; if range.ch != c { equal = false; break; } } diff --git a/src/libextra/time.rs b/src/libextra/time.rs index fea5cb560ac..caaa2994405 100644 --- a/src/libextra/time.rs +++ b/src/libextra/time.rs @@ -296,7 +296,7 @@ priv fn do_strptime(s: &str, format: &str) -> Result<Tm, ~str> { let mut i = 0u; while i < digits { - let range = str::char_range_at(ss, pos); + let range = ss.char_range_at(pos); pos = range.next; match range.ch { @@ -323,7 +323,7 @@ priv fn do_strptime(s: &str, format: &str) -> Result<Tm, ~str> { } fn parse_char(s: &str, pos: uint, c: char) -> Result<uint, ~str> { - let range = str::char_range_at(s, pos); + let range = s.char_range_at(pos); if c == range.ch { Ok(range.next) @@ -600,7 +600,7 @@ priv fn do_strptime(s: &str, format: &str) -> Result<Tm, ~str> { let mut pos = pos; let len = s.len(); while pos < len { - let range = str::char_range_at(s, pos); + let range = s.char_range_at(pos); pos = range.next; if range.ch == ' ' { break; } } @@ -609,7 +609,7 @@ priv fn do_strptime(s: &str, format: &str) -> Result<Tm, ~str> { } } 'z' => { - let range = str::char_range_at(s, pos); + let range = s.char_range_at(pos); if range.ch == '+' || range.ch == '-' { match match_digits(s, range.next, 4u, false) { @@ -655,7 +655,7 @@ priv fn do_strptime(s: &str, format: &str) -> Result<Tm, ~str> { let mut result = Err(~"Invalid time"); while !rdr.eof() && pos < len { - let range = str::char_range_at(s, pos); + let range = s.char_range_at(pos); let ch = range.ch; let next = range.next; diff --git a/src/librustc/middle/lint.rs b/src/librustc/middle/lint.rs index 7462067162d..92147bf4e0f 100644 --- a/src/librustc/middle/lint.rs +++ b/src/librustc/middle/lint.rs @@ -842,7 +842,7 @@ fn check_item_non_camel_case_types(cx: &Context, it: @ast::item) { let ident = cx.sess.str_of(ident); assert!(!ident.is_empty()); let ident = ident.trim_chars(&['_']); - char::is_uppercase(str::char_at(ident, 0)) && + char::is_uppercase(ident.char_at(0)) && !ident.contains_char('_') } diff --git a/src/libstd/io.rs b/src/libstd/io.rs index 8ec3a4cdd81..7f3af79e27c 100644 --- a/src/libstd/io.rs +++ b/src/libstd/io.rs @@ -672,7 +672,7 @@ impl<T:Reader> ReaderUtil for T { val <<= 6; val += (next & 63) as uint; } - // See str::char_at + // See str::StrSlice::char_at val += ((b0 << ((w + 1) as u8)) as uint) << (w - 1) * 6 - w - 1u; chars.push(val as char); diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 605a11032a1..25d9a63b479 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -370,7 +370,7 @@ Section: Adding to and removing from a string pub fn pop_char(s: &mut ~str) -> char { let end = s.len(); assert!(end > 0u); - let CharRange {ch, next} = char_range_at_reverse(*s, end); + let CharRange {ch, next} = s.char_range_at_reverse(end); unsafe { raw::set_len(s, next); } return ch; } @@ -383,7 +383,7 @@ pub fn pop_char(s: &mut ~str) -> char { * If the string does not contain any characters */ pub fn shift_char(s: &mut ~str) -> char { - let CharRange {ch, next} = char_range_at(*s, 0u); + let CharRange {ch, next} = s.char_range_at(0u); *s = unsafe { raw::slice_bytes_owned(*s, next, s.len()) }; return ch; } @@ -399,7 +399,7 @@ pub fn shift_char(s: &mut ~str) -> char { */ #[inline] pub fn slice_shift_char<'a>(s: &'a str) -> (char, &'a str) { - let CharRange {ch, next} = char_range_at(s, 0u); + let CharRange {ch, next} = s.char_range_at(0u); let next_s = unsafe { raw::slice_bytes(s, next, s.len()) }; return (ch, next_s); } @@ -532,7 +532,7 @@ impl<'self, Sep: CharEq> Iterator<&'self str> for StrCharSplitIterator<'self, Se } } else { while self.position < l && self.count > 0 { - let CharRange {ch, next} = char_range_at(self.string, self.position); + let CharRange {ch, next} = self.string.char_range_at(self.position); if self.sep.matches(ch) { let slice = unsafe { raw::slice_bytes(self.string, start, self.position) }; @@ -1198,7 +1198,7 @@ pub fn count_chars(s: &str, start: uint, end: uint) -> uint { assert!(is_char_boundary(s, end)); let mut (i, len) = (start, 0u); while i < end { - let next = char_range_at(s, i).next; + let next = s.char_range_at(i).next; len += 1u; i = next; } @@ -1213,7 +1213,7 @@ pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint { let l = s.len(); while cnt > 0u { assert!(end < l); - let next = char_range_at(s, end).next; + let next = s.char_range_at(end).next; cnt -= 1u; end = next; } @@ -1233,130 +1233,12 @@ pub fn utf8_char_width(b: u8) -> uint { return 6u; } -/** - * Returns false if the index points into the middle of a multi-byte - * character sequence. - */ -pub fn is_char_boundary(s: &str, index: uint) -> bool { - if index == s.len() { return true; } - let b = s[index]; - return b < 128u8 || b >= 192u8; -} - -/** - * Pluck a character out of a string and return the index of the next - * character. - * - * This function can be used to iterate over the unicode characters of a - * string. - * - * # Example - * - * ~~~ {.rust} - * let s = "中华Việt Nam"; - * let i = 0u; - * while i < s.len() { - * let CharRange {ch, next} = str::char_range_at(s, i); - * std::io::println(fmt!("%u: %c",i,ch)); - * i = next; - * } - * ~~~ - * - * # Example output - * - * ~~~ - * 0: 中 - * 3: 华 - * 6: V - * 7: i - * 8: ệ - * 11: t - * 12: - * 13: N - * 14: a - * 15: m - * ~~~ - * - * # Arguments - * - * * s - The string - * * i - The byte offset of the char to extract - * - * # Return value - * - * A record {ch: char, next: uint} containing the char value and the byte - * index of the next unicode character. - * - * # Failure - * - * If `i` is greater than or equal to the length of the string. - * If `i` is not the index of the beginning of a valid UTF-8 character. - */ -pub fn char_range_at(s: &str, i: uint) -> CharRange { - let b0 = s[i]; - let w = utf8_char_width(b0); - assert!((w != 0u)); - if w == 1u { return CharRange {ch: b0 as char, next: i + 1u}; } - let mut val = 0u; - let end = i + w; - let mut i = i + 1u; - while i < end { - let byte = s[i]; - assert_eq!(byte & 192u8, tag_cont_u8); - val <<= 6u; - val += (byte & 63u8) as uint; - i += 1u; - } - // Clunky way to get the right bits from the first byte. Uses two shifts, - // the first to clip off the marker bits at the left of the byte, and then - // a second (as uint) to get it to the right position. - val += ((b0 << ((w + 1u) as u8)) as uint) << ((w - 1u) * 6u - w - 1u); - return CharRange {ch: val as char, next: i}; -} - -/// Plucks the character starting at the `i`th byte of a string -pub fn char_at(s: &str, i: uint) -> char { - return char_range_at(s, i).ch; -} - #[allow(missing_doc)] pub struct CharRange { ch: char, next: uint } -/** - * Given a byte position and a str, return the previous char and its position. - * - * This function can be used to iterate over a unicode string in reverse. - * - * Returns 0 for next index if called on start index 0. - */ -pub fn char_range_at_reverse(ss: &str, start: uint) -> CharRange { - let mut prev = start; - - // while there is a previous byte == 10...... - while prev > 0u && ss[prev - 1u] & 192u8 == tag_cont_u8 { - prev -= 1u; - } - - // now refer to the initial byte of previous char - if prev > 0u { - prev -= 1u; - } else { - prev = 0u; - } - - - let ch = char_at(ss, prev); - return CharRange {ch:ch, next:prev}; -} - -/// Plucks the character ending at the `i`th byte of a string -pub fn char_at_reverse(s: &str, i: uint) -> char { - char_range_at_reverse(s, i).ch -} - // UTF-8 tags and ranges static tag_cont_u8: u8 = 128u8; static tag_cont: uint = 128u; @@ -1776,7 +1658,10 @@ pub trait StrSlice<'self> { fn trim_right_chars(&self, chars_to_trim: &[char]) -> &'self str; fn to_owned(&self) -> ~str; fn to_managed(&self) -> @str; + fn is_char_boundary(s: &str, index: uint) -> bool; + fn char_range_at(&self, start: uint) -> CharRange; fn char_at(&self, i: uint) -> char; + fn char_range_at_reverse(&self, start: uint) -> CharRange; fn char_at_reverse(&self, i: uint) -> char; fn to_bytes(&self) -> ~[u8]; @@ -1967,7 +1852,7 @@ impl<'self> StrSlice<'self> for &'self str { match self.rfind(|c| !char::is_whitespace(c)) { None => "", Some(last) => { - let next = char_range_at(*self, last).next; + let next = self.char_range_at(last).next; unsafe { raw::slice_bytes(*self, 0u, next) } } } @@ -2019,8 +1904,8 @@ impl<'self> StrSlice<'self> for &'self str { match self.rfind(|c| !chars_to_trim.contains(&c)) { None => "", Some(last) => { - let next = char_range_at(self, last).next; - unsafe { raw::slice_bytes(self, 0u, next) } + let next = self.char_range_at(last).next; + unsafe { raw::slice_bytes(*self, 0u, next) } } } } @@ -2037,12 +1922,122 @@ impl<'self> StrSlice<'self> for &'self str { unsafe { ::cast::transmute(v) } } - #[inline] - fn char_at(&self, i: uint) -> char { char_at(*self, i) } + /** + * Returns false if the index points into the middle of a multi-byte + * character sequence. + */ + fn is_char_boundary(&self, index: uint) -> bool { + if index == self.len() { return true; } + let b = self[index]; + return b < 128u8 || b >= 192u8; + } + /** + * Pluck a character out of a string and return the index of the next + * character. + * + * This function can be used to iterate over the unicode characters of a + * string. + * + * # Example + * + * ~~~ {.rust} + * let s = "中华Việt Nam"; + * let i = 0u; + * while i < s.len() { + * let CharRange {ch, next} = s.char_range_at(i); + * std::io::println(fmt!("%u: %c",i,ch)); + * i = next; + * } + * ~~~ + * + * # Example output + * + * ~~~ + * 0: 中 + * 3: 华 + * 6: V + * 7: i + * 8: ệ + * 11: t + * 12: + * 13: N + * 14: a + * 15: m + * ~~~ + * + * # Arguments + * + * * s - The string + * * i - The byte offset of the char to extract + * + * # Return value + * + * A record {ch: char, next: uint} containing the char value and the byte + * index of the next unicode character. + * + * # Failure + * + * If `i` is greater than or equal to the length of the string. + * If `i` is not the index of the beginning of a valid UTF-8 character. + */ + fn char_range_at(&self, i: uint) -> CharRange { + let b0 = self[i]; + let w = utf8_char_width(b0); + assert!((w != 0u)); + if w == 1u { return CharRange {ch: b0 as char, next: i + 1u}; } + let mut val = 0u; + let end = i + w; + let mut i = i + 1u; + while i < end { + let byte = self[i]; + assert_eq!(byte & 192u8, tag_cont_u8); + val <<= 6u; + val += (byte & 63u8) as uint; + i += 1u; + } + // Clunky way to get the right bits from the first byte. Uses two shifts, + // the first to clip off the marker bits at the left of the byte, and then + // a second (as uint) to get it to the right position. + val += ((b0 << ((w + 1u) as u8)) as uint) << ((w - 1u) * 6u - w - 1u); + return CharRange {ch: val as char, next: i}; + } + + /// Plucks the character starting at the `i`th byte of a string + #[inline] + fn char_at(&self, i: uint) -> char { self.char_range_at(i).ch } + + /** + * Given a byte position and a str, return the previous char and its position. + * + * This function can be used to iterate over a unicode string in reverse. + * + * Returns 0 for next index if called on start index 0. + */ + fn char_range_at_reverse(&self, start: uint) -> CharRange { + let mut prev = start; + + // while there is a previous byte == 10...... + while prev > 0u && self[prev - 1u] & 192u8 == tag_cont_u8 { + prev -= 1u; + } + + // now refer to the initial byte of previous char + if prev > 0u { + prev -= 1u; + } else { + prev = 0u; + } + + + let ch = self.char_at(prev); + return CharRange {ch:ch, next:prev}; + } + + /// Plucks the character ending at the `i`th byte of a string #[inline] fn char_at_reverse(&self, i: uint) -> char { - char_at_reverse(*self, i) + self.char_range_at_reverse(i).ch } fn to_bytes(&self) -> ~[u8] { to_bytes(*self) } @@ -3182,7 +3177,7 @@ mod tests { #[test] fn test_char_range_at_reverse_underflow() { - assert_eq!(char_range_at_reverse("abc", 0).next, 0); + assert_eq!("abc".char_range_at_reverse(0).next, 0); } #[test] diff --git a/src/libsyntax/parse/lexer.rs b/src/libsyntax/parse/lexer.rs index 809a222352f..5f9bc4ca6f6 100644 --- a/src/libsyntax/parse/lexer.rs +++ b/src/libsyntax/parse/lexer.rs @@ -180,7 +180,7 @@ pub fn bump(rdr: &mut StringReader) { if current_byte_offset < (*rdr.src).len() { assert!(rdr.curr != -1 as char); let last_char = rdr.curr; - let next = str::char_range_at(*rdr.src, current_byte_offset); + let next = rdr.src.char_range_at(current_byte_offset); let byte_offset_diff = next.next - current_byte_offset; rdr.pos = rdr.pos + BytePos(byte_offset_diff); rdr.curr = next.ch; @@ -204,7 +204,7 @@ pub fn is_eof(rdr: @mut StringReader) -> bool { pub fn nextch(rdr: @mut StringReader) -> char { let offset = byte_offset(rdr, rdr.pos).to_uint(); if offset < (*rdr.src).len() { - return str::char_at(*rdr.src, offset); + return rdr.src.char_at(offset); } else { return -1 as char; } } diff --git a/src/test/run-pass/utf8_chars.rs b/src/test/run-pass/utf8_chars.rs index 94990d649d8..b7ce617fe50 100644 --- a/src/test/run-pass/utf8_chars.rs +++ b/src/test/run-pass/utf8_chars.rs @@ -22,8 +22,8 @@ pub fn main() { assert!(str::char_len(s) == 4u); assert!(str::to_chars(s).len() == 4u); assert!(str::from_chars(str::to_chars(s)) == s); - assert!(str::char_at(s, 0u) == 'e'); - assert!(str::char_at(s, 1u) == 'é'); + assert!(s.char_at(0u) == 'e'); + assert!(s.char_at(1u) == 'é'); assert!((str::is_utf8(str::to_bytes(s)))); assert!((!str::is_utf8(~[0x80_u8])));