rust/src/libstd/str.rs

4011 lines
123 KiB
Rust
Raw Normal View History

// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
2012-12-10 17:44:02 -06:00
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/*!
String manipulation
# Basic Usage
Rust's string type is one of the core primitive types of the language. While
represented by the name `str`, the name `str` is not actually a valid type in
Rust. Each string must also be decorated with its ownership. This means that
there are three common kinds of strings in rust:
* `~str` - This is an owned string. This type obeys all of the normal semantics
of the `~T` types, meaning that it has one, and only one, owner. This
type cannot be implicitly copied, and is moved out of when passed to
other functions.
* `@str` - This is a managed string. Similarly to `@T`, this type can be
implicitly copied, and each implicit copy will increment the
reference count to the string. This means that there is no "true
owner" of the string, and the string will be deallocated when the
reference count reaches 0.
* `&str` - Finally, this is the borrowed string type. This type of string can
only be created from one of the other two kinds of strings. As the
name "borrowed" implies, this type of string is owned elsewhere, and
this string cannot be moved out of.
As an example, here are a few different kinds of strings.
~~~{.rust}
let owned_string = ~"I am an owned string";
let managed_string = @"This string is garbage-collected";
let borrowed_string1 = "This string is borrowed with the 'static lifetime";
let borrowed_string2: &str = owned_string; // owned strings can be borrowed
let borrowed_string3: &str = managed_string; // managed strings can also be borrowed
~~~
From the example above, you can see that rust has 3 different kinds of string
literals. The owned/managed literals correspond to the owned/managed string
types, but the "borrowed literal" is actually more akin to C's concept of a
static string.
When a string is declared without a `~` or `@` sigil, then the string is
allocated statically in the rodata of the executable/library. The string then
has the type `&'static str` meaning that the string is valid for the `'static`
lifetime, otherwise known as the lifetime of the entire program. As can be
inferred from the type, these static strings are not mutable.
# Mutability
Many languages have immutable strings by default, and rust has a particular
flavor on this idea. As with the rest of Rust types, strings are immutable by
default. If a string is declared as `mut`, however, it may be mutated. This
works the same way as the rest of Rust's type system in the sense that if
there's a mutable reference to a string, there may only be one mutable reference
to that string. With these guarantees, strings can easily transition between
being mutable/immutable with the same benefits of having mutable strings in
other languages.
~~~{.rust}
let mut buf = ~"testing";
buf.push_char(' ');
buf.push_str("123");
assert_eq!(buf, ~"testing 123");
~~~
# Representation
Rust's string type, `str`, is a sequence of unicode codepoints encoded as a
stream of UTF-8 bytes. All safely-created strings are guaranteed to be validly
encoded UTF-8 sequences. Additionally, strings are not null-terminated
and can contain null codepoints.
The actual representations of strings map directly to vectors:
* `~str` is the same as `~[u8]`
* `&str` is the same as `&[u8]`
* `@str` is the same as `@[u8]`
*/
use at_vec;
use cast;
use cast::transmute;
use char;
use char::Char;
2013-08-16 05:17:02 -05:00
use clone::{Clone, DeepClone};
use container::{Container, Mutable};
use iter::{Iterator, FromIterator, Extendable, range};
use iter::{Filter, AdditiveIterator, Map};
use iter::{Invert, DoubleEndedIterator, ExactSize};
use libc;
2013-08-10 08:38:00 -05:00
use num::{Saturating};
use option::{None, Option, Some};
use ptr;
use ptr::RawPtr;
use to_str::ToStr;
use uint;
use vec;
use vec::{OwnedVector, OwnedCopyableVector, ImmutableVector, MutableVector};
2013-08-10 08:38:00 -05:00
use default::Default;
use send_str::{SendStr, SendStrOwned};
/*
Section: Conditions
*/
// Declares the `not_utf8` condition. It is raised with a `~str` message and
// the `~str` produced by the handler becomes the result at the raise site.
condition! {
pub not_utf8: (~str) -> ~str;
}
/*
Section: Creating a string
*/
/// Copy a vector of bytes into a newly allocated UTF-8 string
///
/// # Failure
///
/// Raises the `not_utf8` condition if the bytes are not valid UTF-8. The
/// message names the first byte that is not valid UTF-8 when taken on its own.
pub fn from_utf8(vv: &[u8]) -> ~str {
    use str::not_utf8::cond;

    match from_utf8_opt(vv) {
        Some(valid) => valid,
        None => {
            // Search for the first byte that fails validation in isolation.
            let offender = vv.iter().find(|&b| !is_utf8([*b])).unwrap();
            let first_bad_byte = *offender;
            cond.raise(fmt!("from_utf8: input is not UTF-8; first bad byte is %u",
                            first_bad_byte as uint))
        }
    }
}
/// Copy a vector of bytes into a new UTF-8 string, if possible.
/// Yields `None` when the vector holds invalid UTF-8.
pub fn from_utf8_opt(vv: &[u8]) -> Option<~str> {
    if !is_utf8(vv) {
        return None;
    }
    // Validated above, so the unchecked conversion is sound.
    Some(unsafe { raw::from_utf8(vv) })
}
/// Take ownership of a vector of bytes and reuse it as a utf-8 string
///
/// # Failure
///
/// Raises the `not_utf8` condition if the bytes are not valid UTF-8. The
/// message names the first byte that is not valid UTF-8 when taken on its own.
pub fn from_utf8_owned(vv: ~[u8]) -> ~str {
    use str::not_utf8::cond;

    if is_utf8(vv) {
        // Validated above, so the vector can be reinterpreted directly.
        unsafe { raw::from_utf8_owned(vv) }
    } else {
        let offender = vv.iter().find(|&b| !is_utf8([*b])).unwrap();
        let first_bad_byte = *offender;
        cond.raise(fmt!("from_utf8: input is not UTF-8; first bad byte is %u",
                        first_bad_byte as uint))
    }
}
/// Take ownership of a vector of bytes and reuse it as a utf-8 string.
/// Yields `None` when the vector holds invalid UTF-8.
pub fn from_utf8_owned_opt(vv: ~[u8]) -> Option<~str> {
    if !is_utf8(vv) {
        None
    } else {
        // Validated above, so the vector can be reinterpreted directly.
        Some(unsafe { raw::from_utf8_owned(vv) })
    }
}
/// Views a byte vector as a string slice without allocating.
///
/// The bytes are validated as utf-8 first; on success the very same memory
/// is handed back typed as '&str' rather than '&[u8]'.
///
/// # Failure
///
/// Fails if invalid UTF-8
pub fn from_utf8_slice<'a>(v: &'a [u8]) -> &'a str {
    let checked = from_utf8_slice_opt(v);
    checked.expect("from_utf8_slice: not utf-8")
}
/// Converts a vector to a string slice without performing any allocations.
///
/// Returns None if the slice is not utf-8.
pub fn from_utf8_slice_opt<'a>(v: &'a [u8]) -> Option<&'a str> {
if is_utf8(v) {
Some(unsafe { cast::transmute(v) })
} else { None }
2013-08-04 15:22:56 -05:00
}
// `to_str` on an owned string yields the result of `to_owned`.
impl ToStr for ~str {
#[inline]
fn to_str(&self) -> ~str { self.to_owned() }
}
// `to_str` on a borrowed string yields the result of `to_owned`.
impl<'self> ToStr for &'self str {
#[inline]
fn to_str(&self) -> ~str { self.to_owned() }
}
// `to_str` on a managed string yields the result of `to_owned`.
impl ToStr for @str {
#[inline]
fn to_str(&self) -> ~str { self.to_owned() }
}
/// Convert a byte to a UTF-8 string
///
/// # Failure
///
/// Fails if invalid UTF-8
2013-08-04 15:22:56 -05:00
pub fn from_byte(b: u8) -> ~str {
assert!(b < 128u8);
unsafe { ::cast::transmute(~[b]) }
}
/// Build a string holding exactly one character
pub fn from_char(ch: char) -> ~str {
    let mut out = ~"";
    out.push_char(ch);
    out
}
/// Build a string from the characters of a vector, in order
pub fn from_chars(chs: &[char]) -> ~str {
    let mut out = ~"";
    // Reserve up front using the number of characters.
    out.reserve(chs.len());
    for c in chs.iter() {
        out.push_char(*c)
    }
    out
}
// Free-function form of `push_str`; forwards directly to the method on `lhs`.
#[doc(hidden)]
pub fn push_str(lhs: &mut ~str, rhs: &str) {
lhs.push_str(rhs)
}
/// Operations for joining a vector of strings into one owned string
#[allow(missing_doc)]
pub trait StrVector {
/// Join the strings with nothing between them
fn concat(&self) -> ~str;
/// Join the strings, placing `sep` between neighbouring elements
fn connect(&self, sep: &str) -> ~str;
}
impl<'self, S: Str> StrVector for &'self [S] {
    /// Concatenate a vector of strings.
    fn concat(&self) -> ~str {
        if self.is_empty() { return ~""; }

        // The capacity sum may overflow, but `push_str` will still check
        // boundaries while appending.
        let capacity = self.iter().map(|piece| piece.as_slice().len()).sum();
        let mut joined = with_capacity(capacity);
        for piece in self.iter() {
            joined.push_str(piece.as_slice())
        }
        joined
    }

    /// Concatenate a vector of strings, placing a given separator between each.
    fn connect(&self, sep: &str) -> ~str {
        if self.is_empty() { return ~""; }

        // concat is faster
        if sep.is_empty() { return self.concat(); }

        // `self.len() - 1` is only right because `self` is known to be
        // non-empty here. The capacity sum may overflow, but `push_str`
        // will still check boundaries while appending.
        let pieces_len = self.iter().map(|piece| piece.as_slice().len()).sum();
        let capacity = sep.len() * (self.len() - 1) + pieces_len;
        let mut joined = with_capacity(capacity);

        for (idx, piece) in self.iter().enumerate() {
            // Every element except the first is preceded by the separator.
            if idx > 0 {
                joined.push_str(sep);
            }
            joined.push_str(piece.as_slice());
        }
        joined
    }
}
/// Something that can be used to compare against a character
pub trait CharEq {
/// Determine if the splitter should split at the given character
fn matches(&self, char) -> bool;
/// Indicate if this is only concerned about ASCII characters,
/// which can allow for a faster implementation (the splitting
/// iterators in this file scan bytes instead of decoding characters
/// when this is true).
fn only_ascii(&self) -> bool;
}
// A `char` matches exactly itself.
impl CharEq for char {
#[inline]
fn matches(&self, c: char) -> bool { *self == c }
// ASCII-only exactly when the code point is below 128.
fn only_ascii(&self) -> bool { (*self as uint) < 128 }
}
// A borrowed closure matches whatever it returns true for.
impl<'self> CharEq for &'self fn(char) -> bool {
#[inline]
fn matches(&self, c: char) -> bool { (*self)(c) }
// Always false: nothing is known about which characters the closure accepts.
fn only_ascii(&self) -> bool { false }
}
// A plain Rust function pointer matches whatever it returns true for.
impl CharEq for extern "Rust" fn(char) -> bool {
#[inline]
fn matches(&self, c: char) -> bool { (*self)(c) }
// Always false: nothing is known about which characters the function accepts.
fn only_ascii(&self) -> bool { false }
}
// A slice of matchers matches when any one of its elements does.
impl<'self, C: CharEq> CharEq for &'self [C] {
    #[inline]
    fn matches(&self, c: char) -> bool {
        for candidate in self.iter() {
            if candidate.matches(c) { return true; }
        }
        false
    }

    // ASCII-only when every element is ASCII-only.
    fn only_ascii(&self) -> bool {
        for candidate in self.iter() {
            if !candidate.only_ascii() { return false; }
        }
        true
    }
}
/*
Section: Iterators
*/
/// External iterator for a string's characters.
/// Use with the `std::iterator` module.
#[deriving(Clone)]
pub struct CharIterator<'self> {
/// The slice remaining to be iterated; it shrinks from the front on
/// `next` and from the back on `next_back`
priv string: &'self str,
}
impl<'self> Iterator<char> for CharIterator<'self> {
    #[inline]
    fn next(&mut self) -> Option<char> {
        if self.string.len() == 0 {
            return None;
        }
        // Decode the leading codepoint, then keep only what follows it.
        let CharRange {ch, next} = self.string.char_range_at(0);
        unsafe {
            self.string = raw::slice_unchecked(self.string, next, self.string.len());
        }
        Some(ch)
    }

    #[inline]
    fn size_hint(&self) -> (uint, Option<uint>) {
        // At least a quarter of the remaining bytes (rounded up), and at
        // most one character per remaining byte.
        let bytes = self.string.len();
        (bytes.saturating_add(3)/4, Some(bytes))
    }
}
impl<'self> DoubleEndedIterator<char> for CharIterator<'self> {
    #[inline]
    fn next_back(&mut self) -> Option<char> {
        let remaining = self.string.len();
        if remaining == 0 {
            return None;
        }
        // Decode the trailing codepoint, then keep only what precedes it.
        let CharRange {ch, next} = self.string.char_range_at_reverse(remaining);
        unsafe {
            self.string = raw::slice_unchecked(self.string, 0, next);
        }
        Some(ch)
    }
}
/// External iterator for a string's characters and their byte offsets.
/// Use with the `std::iterator` module.
#[deriving(Clone)]
pub struct CharOffsetIterator<'self> {
/// The original string to be iterated; offsets are measured from its start
priv string: &'self str,
/// The character iterator whose remaining slice is compared against
/// `string` to derive each offset
priv iter: CharIterator<'self>,
}
// Yields each character together with the byte offset at which it starts.
impl<'self> Iterator<(uint, char)> for CharOffsetIterator<'self> {
#[inline]
fn next(&mut self) -> Option<(uint, char)> {
// Compute the byte offset by using the pointer offset between
// the original string slice and the iterator's remaining part.
// This is taken before advancing, so it is the offset of the
// character about to be returned.
let offset = do self.string.as_imm_buf |a, _| {
do self.iter.string.as_imm_buf |b, _| {
b as uint - a as uint
}
};
self.iter.next().map_move(|ch| (offset, ch))
}
// The offsets do not change the number of items, so defer to the inner iterator.
#[inline]
fn size_hint(&self) -> (uint, Option<uint>) {
self.iter.size_hint()
}
}
// Yields characters from the back together with their starting byte offsets.
impl<'self> DoubleEndedIterator<(uint, char)> for CharOffsetIterator<'self> {
#[inline]
fn next_back(&mut self) -> Option<(uint, char)> {
self.iter.next_back().map_move(|ch| {
// The inner iterator has already dropped `ch` from its slice, so the
// offset of `ch` is where that shortened slice now ends: its start
// relative to the original string plus its remaining length.
let offset = do self.string.as_imm_buf |a, _| {
do self.iter.string.as_imm_buf |b, len| {
b as uint - a as uint + len
}
};
(offset, ch)
})
}
}
/// External iterator for a string's characters in reverse order.
/// Use with the `std::iterator` module.
/// This is `CharIterator` wrapped in `Invert`.
pub type CharRevIterator<'self> = Invert<CharIterator<'self>>;
/// External iterator for a string's characters and their byte offsets in reverse order.
/// Use with the `std::iterator` module.
/// This is `CharOffsetIterator` wrapped in `Invert`.
pub type CharOffsetRevIterator<'self> = Invert<CharOffsetIterator<'self>>;
/// External iterator for a string's bytes.
/// Use with the `std::iterator` module.
/// This maps the `&u8` items of a vector iterator to `u8` values.
pub type ByteIterator<'self> =
Map<'self, &'self u8, u8, vec::VecIterator<'self, u8>>;
/// External iterator for a string's bytes in reverse order.
/// Use with the `std::iterator` module.
/// This is `ByteIterator` wrapped in `Invert`.
pub type ByteRevIterator<'self> = Invert<ByteIterator<'self>>;
/// An iterator over the substrings of a string, separated by `sep`.
#[deriving(Clone)]
pub struct CharSplitIterator<'self, Sep> {
/// The slice remaining to be iterated
priv string: &'self str,
/// The separator matcher applied to each byte or character
priv sep: Sep,
/// Whether an empty string at the end is allowed
priv allow_trailing_empty: bool,
/// When set, `string` is scanned byte by byte instead of being decoded
/// into characters
priv only_ascii: bool,
/// Set once the last piece has been handed out; nothing is yielded after
priv finished: bool,
}
/// An iterator over the substrings of a string, separated by `sep`,
/// starting from the back of the string.
/// This is `CharSplitIterator` wrapped in `Invert`.
pub type CharRSplitIterator<'self, Sep> = Invert<CharSplitIterator<'self, Sep>>;
/// An iterator over the substrings of a string, separated by `sep`,
/// splitting at most `count` times.
#[deriving(Clone)]
pub struct CharSplitNIterator<'self, Sep> {
/// The underlying unbounded splitter
priv iter: CharSplitIterator<'self, Sep>,
/// The number of splits remaining
priv count: uint,
/// When set, pieces are taken from the back of the string rather than
/// from the front
priv invert: bool,
}
/// An iterator over the words of a string, separated by a sequence of whitespace
pub type WordIterator<'self> =
Filter<'self, &'self str, CharSplitIterator<'self, extern "Rust" fn(char) -> bool>>;
/// An iterator over the lines of a string, separated by either `\n` or (`\r\n`).
pub type AnyLineIterator<'self> =
Map<'self, &'self str, &'self str, CharSplitIterator<'self, char>>;
impl<'self, Sep> CharSplitIterator<'self, Sep> {
    // Hands out whatever is left as the final piece and marks the iterator
    // as finished. Yields nothing if already finished, or if the remainder
    // is empty and trailing empty pieces are not allowed.
    #[inline]
    fn get_end(&mut self) -> Option<&'self str> {
        if self.finished {
            return None;
        }
        if !self.allow_trailing_empty && self.string.len() == 0 {
            return None;
        }
        self.finished = true;
        Some(self.string)
    }
}
impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitIterator<'self, Sep> {
    #[inline]
    fn next(&mut self) -> Option<&'self str> {
        if self.finished { return None }

        // (byte index where the separator starts, byte index just past it)
        let mut boundary = None;
        if self.only_ascii {
            // Byte-wise scan: only bytes below 128 can be a separator.
            for (at, b) in self.string.byte_iter().enumerate() {
                if self.sep.matches(b as char) && b < 128u8 {
                    boundary = Some((at, at + 1));
                    break;
                }
            }
        } else {
            // Character-wise scan: the separator may span several bytes.
            for (at, c) in self.string.char_offset_iter() {
                if self.sep.matches(c) {
                    boundary = Some((at, self.string.char_range_at(at).next));
                    break;
                }
            }
        }

        match boundary {
            None => self.get_end(),
            Some((sep_start, rest_start)) => unsafe {
                let piece = raw::slice_unchecked(self.string, 0, sep_start);
                self.string = raw::slice_unchecked(self.string, rest_start, self.string.len());
                Some(piece)
            },
        }
    }
}
// Back-to-front splitting: finds the last separator in the remaining slice
// and yields the piece that follows it.
impl<'self, Sep: CharEq> DoubleEndedIterator<&'self str>
for CharSplitIterator<'self, Sep> {
#[inline]
fn next_back(&mut self) -> Option<&'self str> {
if self.finished { return None }
// When trailing empty pieces are not allowed, flip the flag and take
// one piece via a recursive call: a non-empty piece is returned as-is,
// while an empty one is discarded and the search continues below
// (unless the recursive call finished the iterator).
if !self.allow_trailing_empty {
self.allow_trailing_empty = true;
match self.next_back() {
Some(elt) if !elt.is_empty() => return Some(elt),
_ => if self.finished { return None }
}
}
let len = self.string.len();
// (byte index where the separator starts, byte index just past it)
let mut next_split = None;
if self.only_ascii {
// Byte-wise scan from the end: only bytes below 128 can be a separator.
for (idx, byte) in self.string.byte_iter().enumerate().invert() {
if self.sep.matches(byte as char) && byte < 128u8 {
next_split = Some((idx, idx + 1));
break;
}
}
} else {
// Character-wise scan from the end.
for (idx, ch) in self.string.char_offset_rev_iter() {
if self.sep.matches(ch) {
next_split = Some((idx, self.string.char_range_at(idx).next));
break;
}
}
}
match next_split {
// Yield what follows the separator; keep what precedes it.
Some((a, b)) => unsafe {
let elt = raw::slice_unchecked(self.string, b, len);
self.string = raw::slice_unchecked(self.string, 0, a);
Some(elt)
},
// No separator left: the whole remainder is the last piece.
None => { self.finished = true; Some(self.string) }
}
}
}
impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitNIterator<'self, Sep> {
    #[inline]
    fn next(&mut self) -> Option<&'self str> {
        // Out of splits: whatever remains is handed out as one final piece.
        if self.count == 0 {
            return self.iter.get_end();
        }
        self.count -= 1;
        if self.invert { self.iter.next_back() } else { self.iter.next() }
    }
}
/// An iterator over the start and end indices of the matches of a
/// substring within a larger string
#[deriving(Clone)]
pub struct MatchesIndexIterator<'self> {
/// The string being searched
priv haystack: &'self str,
/// The substring being searched for
priv needle: &'self str,
/// Byte index into `haystack` at which the next search resumes
priv position: uint,
}
2013-05-02 17:33:18 -05:00
/// An iterator over the substrings of a string separated by a given
/// search string
#[deriving(Clone)]
pub struct StrSplitIterator<'self> {
/// Finds the successive separator matches
priv it: MatchesIndexIterator<'self>,
/// Byte index just past the previous separator match (initial value is
/// set by the constructor, which is outside this view)
priv last_end: uint,
/// Set once the piece after the last separator has been handed out
priv finished: bool
}
impl<'self> Iterator<(uint, uint)> for MatchesIndexIterator<'self> {
    /// Finds the next non-overlapping occurrence of `needle`, yielding its
    /// `(start, end)` byte indices within `haystack`. An empty needle never
    /// matches.
    #[inline]
    fn next(&mut self) -> Option<(uint, uint)> {
        // See Issue #1932 for why this is a naive search
        let (h_len, n_len) = (self.haystack.len(), self.needle.len());

        // An empty needle has no first byte to compare against; indexing
        // `self.needle[0]` below would fail, so report no matches instead.
        if n_len == 0 { return None; }

        let mut match_start = 0;
        let mut match_i = 0;

        while self.position < h_len {
            if self.haystack[self.position] == self.needle[match_i] {
                if match_i == 0 { match_start = self.position; }
                match_i += 1;
                self.position += 1;

                if match_i == n_len {
                    // found a match!
                    return Some((match_start, self.position));
                }
            } else {
                // failed match, backtrack to just after where the partial
                // match began
                if match_i > 0 {
                    match_i = 0;
                    self.position = match_start;
                }
                self.position += 1;
            }
        }
        None
    }
}
2012-02-23 07:59:27 -06:00
impl<'self> Iterator<&'self str> for StrSplitIterator<'self> {
    #[inline]
    fn next(&mut self) -> Option<&'self str> {
        if self.finished { return None; }

        let haystack = self.it.haystack;
        match self.it.next() {
            // No further separator: the tail is the last piece.
            None => {
                self.finished = true;
                Some(haystack.slice(self.last_end, haystack.len()))
            }
            // Yield the text between the previous separator and this one.
            Some((from, to)) => {
                let piece = haystack.slice(self.last_end, from);
                self.last_end = to;
                Some(piece)
            }
        }
    }
}
// Helper for Unicode normalization: bubble-sorts `comb` by the class stored
// in each pair's second field, in place. Neighbours are only swapped when
// both classes are non-zero, so class-0 entries never move.
fn canonical_sort(comb: &mut [(char, u8)]) {
    use iter::range;
    use tuple::CopyableTuple;

    let len = comb.len();
    for pass in range(0, len) {
        let mut swapped = false;
        for right in range(1, len-pass) {
            let left = right - 1;
            let left_class = comb[left].second();
            let right_class = comb[right].second();
            let out_of_order =
                left_class != 0 && right_class != 0 && left_class > right_class;
            if out_of_order {
                comb.swap(left, right);
                swapped = true;
            }
        }
        // A pass without swaps means the slice is already in order.
        if !swapped { break; }
    }
}
// Selects which decomposition `NormalizationIterator` applies to each character.
#[deriving(Clone)]
enum NormalizationForm {
// Decompose with `char::decompose_canonical`
NFD,
// Decompose with `char::decompose_compatible`
NFKD
}
/// External iterator for a string's normalization's characters.
/// Use with the `std::iterator` module.
#[deriving(Clone)]
struct NormalizationIterator<'self> {
/// Which decomposition to apply
priv kind: NormalizationForm,
/// Source of the characters still to be decomposed
priv iter: CharIterator<'self>,
/// Decomposed characters awaiting output, each paired with its
/// canonical combining class
priv buffer: ~[(char, u8)],
/// Whether `buffer` has been put through `canonical_sort` since it was
/// last marked unsorted
priv sorted: bool
}
// Yields the decomposed characters of the source string, buffering them so
// that each batch is passed through `canonical_sort` before being emitted.
impl<'self> Iterator<char> for NormalizationIterator<'self> {
#[inline]
fn next(&mut self) -> Option<char> {
use unicode::decompose::canonical_combining_class;
// Fast path: emit the buffered head without reading more input.
match self.buffer.head_opt() {
// A class-0 head is emitted right away; the buffer is marked unsorted.
Some(&(c, 0)) => {
self.sorted = false;
self.buffer.shift();
return Some(c);
}
// A non-zero-class head is emitted only if the buffer is sorted.
Some(&(c, _)) if self.sorted => {
self.buffer.shift();
return Some(c);
}
// Empty buffer, or an unsorted non-zero-class head: refill below.
_ => self.sorted = false
}
let decomposer = match self.kind {
NFD => char::decompose_canonical,
NFKD => char::decompose_compatible
};
if !self.sorted {
// Pull source characters, pushing every decomposed character with its
// class. The first class-0 character seen triggers a sort of what is
// already buffered (before it is pushed); reading then stops once the
// current source character has been fully decomposed.
for ch in self.iter {
do decomposer(ch) |d| {
let class = canonical_combining_class(d);
if class == 0 && !self.sorted {
canonical_sort(self.buffer);
self.sorted = true;
}
self.buffer.push((d, class));
}
if self.sorted { break }
}
}
// The source ran out before a class-0 character appeared: sort what is left.
if !self.sorted {
canonical_sort(self.buffer);
self.sorted = true;
}
match self.buffer.shift_opt() {
// Emitting a class-0 character marks the buffer unsorted again.
Some((c, 0)) => {
self.sorted = false;
Some(c)
}
Some((c, _)) => Some(c),
None => None
}
}
// Only the inner iterator's lower bound is reported; no upper bound is given.
fn size_hint(&self) -> (uint, Option<uint>) {
let (lower, _) = self.iter.size_hint();
(lower, None)
}
}
/// Replace all occurrences of one string with another
///
/// # Arguments
///
/// * s - The string containing substrings to replace
/// * from - The string to replace
/// * to - The replacement string
///
/// # Return value
///
/// The original string with all occurrences of `from` replaced with `to`
pub fn replace(s: &str, from: &str, to: &str) -> ~str {
    let mut out = ~"";
    // Byte index up to which `s` has already been copied into `out`.
    let mut copied_upto = 0;
    for (match_start, match_end) in s.matches_index_iter(from) {
        let untouched = unsafe { raw::slice_bytes(s, copied_upto, match_start) };
        out.push_str(untouched);
        out.push_str(to);
        copied_upto = match_end;
    }
    let tail = unsafe { raw::slice_bytes(s, copied_upto, s.len()) };
    out.push_str(tail);
    out
}
/*
Section: Comparing strings
*/
/// Bytewise slice equality
#[cfg(not(test))]
2013-08-04 15:22:56 -05:00
#[lang="str_eq"]
#[inline]
pub fn eq_slice(a: &str, b: &str) -> bool {
do a.as_imm_buf |ap, alen| {
do b.as_imm_buf |bp, blen| {
if (alen != blen) { false }
else {
unsafe {
libc::memcmp(ap as *libc::c_void,
bp as *libc::c_void,
alen as libc::size_t) == 0
}
}
}
}
}
/// Bytewise slice equality
#[cfg(test)]
2013-08-04 15:22:56 -05:00
#[inline]
pub fn eq_slice(a: &str, b: &str) -> bool {
do a.as_imm_buf |ap, alen| {
do b.as_imm_buf |bp, blen| {
if (alen != blen) { false }
else {
unsafe {
libc::memcmp(ap as *libc::c_void,
bp as *libc::c_void,
alen as libc::size_t) == 0
}
}
}
}
}
/// Bytewise string equality
#[cfg(not(test))]
#[lang="uniq_str_eq"]
2013-05-02 02:49:11 -05:00
#[inline]
pub fn eq(a: &~str, b: &~str) -> bool {
eq_slice(*a, *b)
}
#[cfg(test)]
2013-05-02 02:49:11 -05:00
#[inline]
pub fn eq(a: &~str, b: &~str) -> bool {
eq_slice(*a, *b)
}
/*
Section: Searching
*/
// Utility used by various searching functions
fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
2012-03-06 22:48:40 -06:00
let mut i = at;
for c in needle.byte_iter() { if haystack[i] != c { return false; } i += 1u; }
2012-08-01 19:30:05 -05:00
return true;
}
/*
Section: Misc
*/
/// Determines if a vector of bytes contains valid UTF-8
2013-06-23 22:44:11 -05:00
pub fn is_utf8(v: &[u8]) -> bool {
2012-03-06 22:48:40 -06:00
let mut i = 0u;
let total = v.len();
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
unsafe { *xs.unsafe_ref(i) }
}
while i < total {
let v_i = unsafe_get(v, i);
if v_i < 128u8 {
i += 1u;
} else {
let w = utf8_char_width(v_i);
if w == 0u { return false; }
let nexti = i + w;
if nexti > total { return false; }
// 2-byte encoding is for codepoints \u0080 to \u07ff
// first C2 80 last DF BF
// 3-byte encoding is for codepoints \u0800 to \uffff
// first E0 A0 80 last EF BF BF
// excluding surrogates codepoints \ud800 to \udfff
// ED A0 80 to ED BF BF
// 4-byte encoding is for codepoints \u10000 to \u10ffff
// first F0 90 80 80 last F4 8F BF BF
//
// Use the UTF-8 syntax from the RFC
//
// https://tools.ietf.org/html/rfc3629
// UTF8-1 = %x00-7F
// UTF8-2 = %xC2-DF UTF8-tail
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
// %xF4 %x80-8F 2( UTF8-tail )
// UTF8-tail = %x80-BF
match w {
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
return false
},
3 => match (v_i,
unsafe_get(v, i + 1),
unsafe_get(v, i + 2) & 192u8) {
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (),
(0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
_ => return false,
},
_ => match (v_i,
unsafe_get(v, i + 1),
unsafe_get(v, i + 2) & 192u8,
unsafe_get(v, i + 3) & 192u8) {
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
_ => return false,
},
}
i = nexti;
}
}
true
}
/// Determines if a vector of `u16` contains valid UTF-16
///
/// Every code unit outside the surrogate range is accepted on its own. A
/// unit inside the surrogate range must be a leading surrogate
/// (`0xD800`-`0xDBFF`) immediately followed by a trailing surrogate
/// (`0xDC00`-`0xDFFF`).
pub fn is_utf16(v: &[u16]) -> bool {
    let len = v.len();
    let mut i = 0;
    while i < len {
        let u = v[i];
        if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
            i += 1;
        } else {
            // A surrogate needs a partner; one at the very end cannot have it.
            if i + 1 >= len { return false; }
            let u2 = v[i + 1];
            // `u` is known to be a surrogate here, so only the upper bound
            // separates a leading surrogate from a trailing one.
            if u > 0xDBFF_u16 { return false; }
            if u2 < 0xDC00_u16 || u2 > 0xDFFF_u16 { return false; }
            i += 2;
        }
    }
    true
}
/// Iterates over the utf-16 characters in the specified slice, yielding each
/// decoded unicode character to the function provided.
///
/// # Failures
///
/// * Fails on invalid utf-16 data
pub fn utf16_chars(v: &[u16], f: &fn(char)) {
2013-05-14 04:52:12 -05:00
let len = v.len();
2012-03-06 22:48:40 -06:00
let mut i = 0u;
2012-03-06 18:00:29 -06:00
while (i < len && v[i] != 0u16) {
2013-04-12 00:10:01 -05:00
let u = v[i];
if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
f(unsafe { cast::transmute(u as u32) });
i += 1u;
} else {
let u2 = v[i+1u];
2013-03-28 20:39:09 -05:00
assert!(u >= 0xD800_u16 && u <= 0xDBFF_u16);
assert!(u2 >= 0xDC00_u16 && u2 <= 0xDFFF_u16);
let mut c: u32 = (u - 0xD800_u16) as u32;
c = c << 10;
c |= (u2 - 0xDC00_u16) as u32;
c |= 0x1_0000_u32 as u32;
f(unsafe { cast::transmute(c) });
i += 2u;
}
}
}
/// Decodes the utf-16 slice provided into a newly allocated string
pub fn from_utf16(v: &[u16]) -> ~str {
    let mut decoded = ~"";
    // Reserve up front using the number of code units.
    decoded.reserve(v.len());
    utf16_chars(v, |c| decoded.push_char(c));
    decoded
}
/// Allocates a new string with the specified capacity. The string returned is
/// the empty string, but has capacity for much more.
#[inline]
pub fn with_capacity(capacity: uint) -> ~str {
    unsafe {
        // The backing store is a byte vector, so the element type must be
        // `u8`. Asking for `~[u8]` elements reserved a pointer-sized slot
        // per requested byte.
        cast::transmute(vec::with_capacity::<u8>(capacity))
    }
}
// https://tools.ietf.org/html/rfc3629
// Length in bytes of a UTF-8 sequence, indexed by its first byte. A 0 marks
// a byte that cannot start a sequence: 0x80-0xBF, 0xC0-0xC1 and 0xF5-0xFF.
static UTF8_CHAR_WIDTH: [u8, ..256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];
/// Given a first byte, determine how many bytes are in this UTF-8 character.
/// The answer is 0 for a byte that cannot begin a character.
pub fn utf8_char_width(b: u8) -> uint {
    let width = UTF8_CHAR_WIDTH[b];
    width as uint
}
#[allow(missing_doc)]
2012-11-26 22:05:19 -06:00
/// A decoded character together with a byte index into the string it came from
pub struct CharRange {
/// The decoded character
ch: char,
/// Byte index at which iteration continues: just past `ch` when decoding
/// forwards, or where `ch` begins when decoding in reverse
next: uint
}
// Return the initial codepoint accumulator for the first byte.
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
// for width 3, and 3 bits for width 4
// (the mask is 0x7F shifted right by the width, which yields exactly those bits)
macro_rules! utf8_first_byte(
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
)
// Return the value of $ch updated with continuation byte $byte: the
// accumulator is shifted left by 6 and the low 6 bits of the byte are OR-ed in
macro_rules! utf8_acc_cont_byte(
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
)
// Value of the top two bits (mask 192) of a UTF-8 continuation byte
static TAG_CONT_U8: u8 = 128u8;
/// Unsafe operations
2012-09-28 17:41:10 -05:00
pub mod raw {
use option::{Option, Some};
use cast;
use libc;
use ptr;
use str::is_utf8;
use vec;
use vec::MutableVector;
2013-08-04 15:22:56 -05:00
use unstable::raw::Slice;
/// Create a Rust string from a *u8 buffer of the given length
2013-08-04 15:22:56 -05:00
pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
let mut v: ~[u8] = vec::with_capacity(len);
do v.as_mut_buf |vbuf, _len| {
ptr::copy_memory(vbuf, buf as *u8, len)
};
vec::raw::set_len(&mut v, len);
assert!(is_utf8(v));
::cast::transmute(v)
}
// The `strdup_uniq` language item: copies `len` bytes starting at `ptr`
// into a new owned string by forwarding to `from_buf_len`.
#[lang="strdup_uniq"]
#[cfg(not(test))]
#[allow(missing_doc)]
#[inline]
pub unsafe fn strdup_uniq(ptr: *u8, len: uint) -> ~str {
from_buf_len(ptr, len)
}
/// Create a Rust string by copying a null-terminated C string
pub unsafe fn from_c_str(buf: *libc::c_char) -> ~str {
    // Walk forward to the terminating NUL to learn the length.
    let mut len = 0;
    let mut cursor = buf;
    while *cursor != 0 {
        len += 1;
        cursor = ptr::offset(buf, len);
    }
    from_buf_len(buf as *u8, len as uint)
}
/// Copies a vector of bytes into a new owned string.
pub unsafe fn from_utf8(v: &[u8]) -> ~str {
    v.as_imm_buf(|buf, len| from_buf_len(buf, len))
}
/// Converts an owned vector of bytes to a new owned string. This assumes
/// that the utf-8-ness of the vector has already been validated
2013-08-04 15:22:56 -05:00
#[inline]
pub unsafe fn from_utf8_owned(v: ~[u8]) -> ~str {
2013-08-04 15:22:56 -05:00
cast::transmute(v)
}
/// Converts a byte to a string by passing a one-element vector to `from_utf8`.
pub unsafe fn from_byte(u: u8) -> ~str { from_utf8([u]) }
/// Form a slice from a C string. Unsafe because the caller must ensure the
/// C string has the static lifetime, or else the return value may be
/// invalidated later.
2013-08-04 15:22:56 -05:00
pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
let s = s as *u8;
let mut curr = s;
let mut len = 0u;
while *curr != 0u8 {
len += 1u;
curr = ptr::offset(s, len as int);
}
let v = Slice { data: s, len: len };
assert!(is_utf8(::cast::transmute(v)));
::cast::transmute(v)
}
/// Takes a bytewise (not UTF-8) slice from a string.
///
/// Returns the substring from [`begin`..`end`).
///
/// # Failure
///
/// If begin is greater than end.
/// If end is greater than the length of the string.
2013-08-04 15:22:56 -05:00
#[inline]
pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
// Validate the byte range, then defer to the unchecked slicer. Nothing here
// checks that the bounds fall on character boundaries.
assert!(begin <= end);
assert!(end <= s.len());
slice_unchecked(s, begin, end)
}
2013-08-04 15:22:56 -05:00
/// Takes a bytewise (not UTF-8) slice from a string.
///
/// Returns the substring from [`begin`..`end`).
///
/// Caller must check slice boundaries!
#[inline]
pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
    do s.as_imm_buf |base, _unused| {
        // Build the slice representation by hand: start pointer plus length.
        let window = Slice {
            data: base.offset(begin as int),
            len: end - begin,
        };
        cast::transmute(window)
    }
}
/// Appends a byte to a string.
/// The caller must preserve the valid UTF-8 property.
#[inline]
pub unsafe fn push_byte(s: &mut ~str, b: u8) {
// Push onto the string's underlying byte vector.
as_owned_vec(s).push(b)
}
/// Appends a vector of bytes to a string.
/// The caller must preserve the valid UTF-8 property.
#[inline]
pub unsafe fn push_bytes(s: &mut ~str, bytes: &[u8]) {
    let buf = as_owned_vec(s);
    vec::bytes::push_bytes(buf, bytes);
}
/// Removes the last byte from a string and returns it.
/// The caller must preserve the valid UTF-8 property.
pub unsafe fn pop_byte(s: &mut ~str) -> u8 {
let len = s.len();
2013-03-28 20:39:09 -05:00
assert!((len > 0u));
2012-08-25 18:36:02 -05:00
let b = s[len - 1u];
set_len(s, len - 1u);
2012-08-25 18:36:02 -05:00
return b;
}
/// Removes the first byte from a string and returns it.
/// The caller must preserve the valid UTF-8 property.
pub unsafe fn shift_byte(s: &mut ~str) -> u8 {
let len = s.len();
2013-03-28 20:39:09 -05:00
assert!((len > 0u));
2012-08-25 18:36:02 -05:00
let b = s[0];
*s = s.slice(1, len).to_owned();
2012-08-25 18:36:02 -05:00
return b;
}
/// Access the str in its vector representation.
/// The caller must preserve the valid UTF-8 property when modifying.
#[inline]
pub unsafe fn as_owned_vec<'a>(s: &'a mut ~str) -> &'a mut ~[u8] {
// Only the static type of the reference changes.
cast::transmute(s)
}
2013-08-04 15:22:56 -05:00
/// Sets the length of a string
///
/// This will explicitly set the size of the string, without actually
/// modifying its buffers, so it is up to the caller to ensure that
2013-08-04 15:22:56 -05:00
/// the string is actually the specified size.
#[inline]
pub unsafe fn set_len(s: &mut ~str, new_len: uint) {
vec::raw::set_len(as_owned_vec(s), new_len)
2013-08-04 15:22:56 -05:00
}
/// Parses a C "multistring", eg windows env values or
/// the req->ptr result in a uv_fs_readdir() call.
/// Optionally, a `count` can be passed in, limiting the
/// parsing to only being done `count`-times.
#[inline]
pub unsafe fn from_c_multistring(buf: *libc::c_char, count: Option<uint>) -> ~[~str] {
    #[fixed_stack_segment]; #[inline(never)];

    let mut cursor: uint = buf as uint;
    let mut parsed = ~[];
    let mut taken = 0;
    loop {
        // Stop once the optional limit has been reached ...
        match count {
            Some(limit) if taken >= limit => break,
            _ => ()
        }
        // ... or at the empty string that terminates the list.
        if *(cursor as *libc::c_char) == 0 as libc::c_char { break; }

        let entry = from_c_str(cursor as *libc::c_char);
        parsed.push(entry);
        // Step over this string and its NUL terminator.
        cursor += libc::strlen(cursor as *libc::c_char) as uint + 1;
        taken += 1;
    }
    parsed
}
2013-08-04 15:22:56 -05:00
/// Sets the length of a string
///
/// This will explicitly set the size of the string, without actually
/// modifing its buffers, so it is up to the caller to ensure that
/// the string is actually the specified size.
#[test]
fn test_from_buf_len() {
unsafe {
let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
2012-09-12 19:45:23 -05:00
let b = vec::raw::to_ptr(a);
let c = from_buf_len(b, 3u);
assert_eq!(c, ~"AAA");
}
}
/// Parses a two-entry multistring ("zero", "one", then the empty
/// terminator) without a count limit and checks both entries.
#[test]
fn test_str_multistring_parsing() {
    use option::None;
    unsafe {
        let input = bytes!("zero", "\x00", "one", "\x00", "\x00");
        let ptr = vec::raw::to_ptr(input);
        let result = from_c_multistring(ptr as *libc::c_char, None);
        assert!(result.len() == 2);
        for (idx, x) in result.iter().enumerate() {
            match idx {
                0 => assert_eq!(x, &~"zero"),
                1 => assert_eq!(x, &~"one"),
                _ => fail!("shouldn't happen!")
            }
        }
    }
}
}
/*
Section: Trait implementations
*/
#[cfg(not(test))]
2012-09-28 17:41:10 -05:00
pub mod traits {
use ops::Add;
use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
use super::{Str, eq_slice};
2013-08-01 02:16:42 -05:00
use option::{Some, None};
impl<'self> Add<&'self str,~str> for &'self str {
#[inline]
fn add(&self, rhs: & &'self str) -> ~str {
let mut ret = self.to_owned();
ret.push_str(*rhs);
ret
}
}
impl<'self> TotalOrd for &'self str {
#[inline]
fn cmp(&self, other: & &'self str) -> Ordering {
for (s_b, o_b) in self.byte_iter().zip(other.byte_iter()) {
match s_b.cmp(&o_b) {
Greater => return Greater,
Less => return Less,
Equal => ()
}
}
self.len().cmp(&other.len())
}
}
impl TotalOrd for ~str {
#[inline]
fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
}
impl TotalOrd for @str {
#[inline]
fn cmp(&self, other: &@str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
}
impl<'self> Eq for &'self str {
#[inline]
fn eq(&self, other: & &'self str) -> bool {
eq_slice((*self), (*other))
}
#[inline]
fn ne(&self, other: & &'self str) -> bool { !(*self).eq(other) }
}
impl Eq for ~str {
#[inline]
fn eq(&self, other: &~str) -> bool {
eq_slice((*self), (*other))
}
}
impl Eq for @str {
#[inline]
fn eq(&self, other: &@str) -> bool {
eq_slice((*self), (*other))
}
}
impl<'self> TotalEq for &'self str {
#[inline]
fn equals(&self, other: & &'self str) -> bool {
eq_slice((*self), (*other))
}
}
impl TotalEq for ~str {
#[inline]
fn equals(&self, other: &~str) -> bool {
eq_slice((*self), (*other))
}
}
impl TotalEq for @str {
#[inline]
fn equals(&self, other: &@str) -> bool {
eq_slice((*self), (*other))
}
}
impl<'self> Ord for &'self str {
#[inline]
fn lt(&self, other: & &'self str) -> bool { self.cmp(other) == Less }
}
impl Ord for ~str {
#[inline]
fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
}
impl Ord for @str {
#[inline]
fn lt(&self, other: &@str) -> bool { self.cmp(other) == Less }
}
impl<'self, S: Str> Equiv<S> for &'self str {
#[inline]
fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
}
impl<'self, S: Str> Equiv<S> for @str {
#[inline]
fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
}
impl<'self, S: Str> Equiv<S> for ~str {
#[inline]
fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
}
}
#[cfg(test)]
2012-09-28 17:41:10 -05:00
pub mod traits {}
/// Any string that can be represented as a slice
///
/// Implemented in this file for `&str`, `~str` and `@str`.
pub trait Str {
/// Work with `self` as a slice.
fn as_slice<'a>(&'a self) -> &'a str;
/// Convert `self` into a ~str, not making a copy if possible
///
/// Takes `self` by value, so an implementor that already owns its
/// buffer can hand it over as-is.
fn into_owned(self) -> ~str;
}
/// `Str` for borrowed strings.
impl<'self> Str for &'self str {
/// The slice is the borrowed string itself.
#[inline]
fn as_slice<'a>(&'a self) -> &'a str { *self }
/// Produces an owned string via `to_owned`.
#[inline]
fn into_owned(self) -> ~str { self.to_owned() }
}
/// `Str` for owned strings.
impl<'self> Str for ~str {
    /// Borrows the owned string as a slice for the lifetime of `self`.
    #[inline]
    fn as_slice<'a>(&'a self) -> &'a str {
        let borrowed: &'a str = *self;
        borrowed
    }

    /// The string is already owned, so it is returned unchanged.
    #[inline]
    fn into_owned(self) -> ~str {
        self
    }
}
/// `Str` for managed strings.
impl<'self> Str for @str {
    /// Borrows the managed string as a slice for the lifetime of `self`.
    #[inline]
    fn as_slice<'a>(&'a self) -> &'a str {
        let borrowed: &'a str = *self;
        borrowed
    }

    /// Produces an owned string via `to_owned`.
    #[inline]
    fn into_owned(self) -> ~str {
        let owned = self.to_owned();
        owned
    }
}
impl<'self> Container for &'self str {
2013-08-04 15:22:56 -05:00
#[inline]
fn len(&self) -> uint {
do self.as_imm_buf |_p, n| { n }
}
}
impl Container for ~str {
/// Length of the string, delegated to the `&str` implementation.
#[inline]
fn len(&self) -> uint { self.as_slice().len() }
}
impl Container for @str {
/// Length of the string, delegated to the `&str` implementation.
#[inline]
fn len(&self) -> uint { self.as_slice().len() }
}
impl Mutable for ~str {
/// Remove all content, make the string empty
///
/// Implemented by setting the recorded length to zero with
/// `raw::set_len`.
#[inline]
fn clear(&mut self) {
unsafe {
raw::set_len(self, 0)
}
}
}
/// Methods on borrowed string slices.
///
/// The declarations carry no documentation of their own (the
/// `missing_doc` lint is silenced for this trait); each method is
/// described on the `&'self str` implementation that follows.
#[allow(missing_doc)]
pub trait StrSlice<'self> {
// Containment tests.
fn contains<'a>(&self, needle: &'a str) -> bool;
fn contains_char(&self, needle: char) -> bool;
// Character and byte iterators.
fn iter(&self) -> CharIterator<'self>;
fn rev_iter(&self) -> CharRevIterator<'self>;
fn byte_iter(&self) -> ByteIterator<'self>;
fn byte_rev_iter(&self) -> ByteRevIterator<'self>;
fn char_offset_iter(&self) -> CharOffsetIterator<'self>;
fn char_offset_rev_iter(&self) -> CharOffsetRevIterator<'self>;
// Splitting iterators.
fn split_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep>;
fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitNIterator<'self, Sep>;
fn split_terminator_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep>;
fn rsplit_iter<Sep: CharEq>(&self, sep: Sep) -> CharRSplitIterator<'self, Sep>;
fn rsplitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitNIterator<'self, Sep>;
fn matches_index_iter(&self, sep: &'self str) -> MatchesIndexIterator<'self>;
fn split_str_iter(&self, &'self str) -> StrSplitIterator<'self>;
fn line_iter(&self) -> CharSplitIterator<'self, char>;
fn any_line_iter(&self) -> AnyLineIterator<'self>;
fn word_iter(&self) -> WordIterator<'self>;
// Unicode decomposition iterators.
fn nfd_iter(&self) -> NormalizationIterator<'self>;
fn nfkd_iter(&self) -> NormalizationIterator<'self>;
// Predicates and measurements.
fn ends_with(&self, needle: &str) -> bool;
fn is_whitespace(&self) -> bool;
fn is_alphanumeric(&self) -> bool;
fn char_len(&self) -> uint;
// Slicing.
fn slice(&self, begin: uint, end: uint) -> &'self str;
fn slice_from(&self, begin: uint) -> &'self str;
fn slice_to(&self, end: uint) -> &'self str;
fn slice_chars(&self, begin: uint, end: uint) -> &'self str;
fn starts_with(&self, needle: &str) -> bool;
// Escaping and trimming.
fn escape_default(&self) -> ~str;
fn escape_unicode(&self) -> ~str;
fn trim(&self) -> &'self str;
fn trim_left(&self) -> &'self str;
fn trim_right(&self) -> &'self str;
fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'self str;
fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'self str;
fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'self str;
2013-06-11 06:46:40 -05:00
// Conversions.
fn replace(&self, from: &str, to: &str) -> ~str;
fn to_owned(&self) -> ~str;
fn to_managed(&self) -> @str;
fn to_utf16(&self) -> ~[u16];
fn to_send_str(&self) -> SendStr;
2013-06-10 08:01:45 -05:00
// Character access by byte index.
fn is_char_boundary(&self, index: uint) -> bool;
2013-06-10 06:46:36 -05:00
fn char_range_at(&self, start: uint) -> CharRange;
fn char_at(&self, i: uint) -> char;
2013-06-10 06:46:36 -05:00
fn char_range_at_reverse(&self, start: uint) -> CharRange;
fn char_at_reverse(&self, i: uint) -> char;
fn as_bytes(&self) -> &'self [u8];
// Searching.
fn find<C: CharEq>(&self, search: C) -> Option<uint>;
fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
fn find_str(&self, &str) -> Option<uint>;
2013-06-10 21:05:42 -05:00
// Miscellaneous.
fn repeat(&self, nn: uint) -> ~str;
fn slice_shift_char(&self) -> (char, &'self str);
fn lev_distance(&self, t: &str) -> uint;
fn subslice_offset(&self, inner: &str) -> uint;
2013-07-10 19:33:11 -05:00
fn as_imm_buf<T>(&self, f: &fn(*u8, uint) -> T) -> T;
}
/// Extension methods for strings
impl<'self> StrSlice<'self> for &'self str {
/// Returns true if one string contains another
///
/// # Arguments
///
/// * needle - The string to look for
2012-03-16 19:35:38 -05:00
#[inline]
fn contains<'a>(&self, needle: &'a str) -> bool {
self.find_str(needle).is_some()
2013-03-04 21:36:15 -06:00
}
/// Returns true if a string contains a char.
///
/// # Arguments
///
/// * needle - The char to look for
#[inline]
fn contains_char(&self, needle: char) -> bool {
    match self.find(needle) {
        Some(_) => true,
        None => false
    }
}
/// An iterator over the characters of `self`. Note, this iterates
/// over unicode code-points, not unicode graphemes.
///
/// # Example
///
/// ~~~ {.rust}
/// let v: ~[char] = "abc åäö".iter().collect();
/// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
/// ~~~
2013-04-18 07:50:55 -05:00
#[inline]
fn iter(&self) -> CharIterator<'self> {
CharIterator{string: *self}
2013-04-18 07:50:55 -05:00
}
/// An iterator over the characters of `self`, in reverse order.
///
/// This is the forward character iterator with its direction inverted.
#[inline]
fn rev_iter(&self) -> CharRevIterator<'self> {
self.iter().invert()
}
2013-04-18 07:50:55 -05:00
/// An iterator over the bytes of `self`
///
/// Bytes are yielded by value rather than by reference.
#[inline]
fn byte_iter(&self) -> ByteIterator<'self> {
    let bytes = self.as_bytes();
    bytes.iter().map(|&byte| byte)
}
/// An iterator over the bytes of `self`, in reverse order
///
/// This is the forward byte iterator with its direction inverted.
#[inline]
fn byte_rev_iter(&self) -> ByteRevIterator<'self> {
self.byte_iter().invert()
}
/// An iterator over the characters of `self` and their byte offsets.
///
/// The iterator is built from the string itself together with a
/// character iterator over it.
#[inline]
fn char_offset_iter(&self) -> CharOffsetIterator<'self> {
CharOffsetIterator{string: *self, iter: self.iter()}
}
/// An iterator over the characters of `self` and their byte offsets,
/// in reverse order.
///
/// This is `char_offset_iter` with its direction inverted.
#[inline]
fn char_offset_rev_iter(&self) -> CharOffsetRevIterator<'self> {
self.char_offset_iter().invert()
}
/// An iterator over substrings of `self`, separated by characters
/// matched by `sep`.
///
/// # Example
///
/// ~~~ {.rust}
/// let v: ~[&str] = "Mary had a little lamb".split_iter(' ').collect();
/// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
///
/// let v: ~[&str] = "abc1def2ghi".split_iter(|c: char| c.is_digit()).collect();
/// assert_eq!(v, ~["abc", "def", "ghi"]);
/// ~~~
#[inline]
fn split_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep> {
    // Ask the separator whether it only matches ASCII before it is
    // moved into the iterator.
    let ascii_only = sep.only_ascii();
    CharSplitIterator {
        string: *self,
        only_ascii: ascii_only,
        sep: sep,
        allow_trailing_empty: true,
        finished: false,
    }
}
/// An iterator over substrings of `self`, separated by characters
/// matched by `sep`, restricted to splitting at most `count`
/// times.
#[inline]
fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint)
    -> CharSplitNIterator<'self, Sep> {
    let unlimited = self.split_iter(sep);
    CharSplitNIterator {
        iter: unlimited,
        count: count,
        invert: false,
    }
}
/// An iterator over substrings of `self`, separated by characters
/// matched by `sep`.
///
/// Equivalent to `split_iter`, except that the trailing substring
/// is skipped if empty (terminator semantics).
///
/// # Example
///
/// ~~~ {.rust}
/// let v: ~[&str] = "A.B.".split_terminator_iter('.').collect();
/// assert_eq!(v, ~["A", "B"]);
/// ~~~
#[inline]
fn split_terminator_iter<Sep: CharEq>(&self, sep: Sep)
    -> CharSplitIterator<'self, Sep> {
    // Start from the plain splitter and switch off the trailing empty
    // substring.
    let mut splitter = self.split_iter(sep);
    splitter.allow_trailing_empty = false;
    splitter
}
/// An iterator over substrings of `self`, separated by characters
/// matched by `sep`, in reverse order
///
/// This is `split_iter` with its direction inverted.
///
/// # Example
///
/// ~~~ {.rust}
/// let v: ~[&str] = "Mary had a little lamb".rsplit_iter(' ').collect();
/// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
/// ~~~
#[inline]
fn rsplit_iter<Sep: CharEq>(&self, sep: Sep) -> CharRSplitIterator<'self, Sep> {
self.split_iter(sep).invert()
}
/// An iterator over substrings of `self`, separated by characters
/// matched by `sep`, starting from the end of the string.
/// Restricted to splitting at most `count` times.
#[inline]
fn rsplitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint)
    -> CharSplitNIterator<'self, Sep> {
    let unlimited = self.split_iter(sep);
    CharSplitNIterator {
        iter: unlimited,
        count: count,
        invert: true,
    }
}
/// An iterator over the start and end indices of each match of
/// `sep` within `self`.
///
/// The search starts at byte position 0.
///
/// # Failure
///
/// Fails if `sep` is empty.
#[inline]
fn matches_index_iter(&self, sep: &'self str) -> MatchesIndexIterator<'self> {
assert!(!sep.is_empty())
MatchesIndexIterator {
haystack: *self,
needle: sep,
position: 0
}
}
/// An iterator over the substrings of `self` separated by `sep`.
///
/// # Example
///
/// ~~~ {.rust}
/// let v: ~[&str] = "abcXXXabcYYYabc".split_str_iter("abc").collect();
/// assert_eq!(v, ~["", "XXX", "YYY", ""]);
/// ~~~
///
/// # Failure
///
/// Fails if `sep` is empty, because the underlying
/// `matches_index_iter` rejects an empty needle.
#[inline]
fn split_str_iter(&self, sep: &'self str) -> StrSplitIterator<'self> {
StrSplitIterator {
it: self.matches_index_iter(sep),
last_end: 0,
finished: false
}
}
/// An iterator over the lines of a string (subsequences separated
/// by `\n`).
///
/// `\n` is treated as a terminator, so an empty substring after a
/// final `\n` is not yielded.
#[inline]
fn line_iter(&self) -> CharSplitIterator<'self, char> {
self.split_terminator_iter('\n')
}
/// An iterator over the lines of a string, separated by either
/// `\n` or (`\r\n`).
///
/// Lines are split on `\n`; a single trailing `\r` is then removed
/// from each line that has one.
fn any_line_iter(&self) -> AnyLineIterator<'self> {
    self.line_iter().map(|line| {
        let len = line.len();
        let ends_in_cr = len > 0 && line[len - 1] == '\r' as u8;
        if ends_in_cr { line.slice(0, len - 1) } else { line }
    })
}
/// An iterator over the words of a string (subsequences separated
/// by any sequence of whitespace).
///
/// The string is split on every whitespace character and the empty
/// pieces that arise between adjacent separators are filtered out.
#[inline]
fn word_iter(&self) -> WordIterator<'self> {
self.split_iter(char::is_whitespace).filter(|s| !s.is_empty())
}
/// Returns the string in Unicode Normalization Form D (canonical decomposition)
///
/// The result is a lazy iterator over the characters of `self`,
/// starting with an empty, unsorted working buffer.
fn nfd_iter(&self) -> NormalizationIterator<'self> {
    let chars = self.iter();
    NormalizationIterator { iter: chars, buffer: ~[], sorted: false, kind: NFD }
}
/// Returns the string in Unicode Normalization Form KD (compatibility decomposition)
///
/// The result is a lazy iterator over the characters of `self`,
/// starting with an empty, unsorted working buffer.
fn nfkd_iter(&self) -> NormalizationIterator<'self> {
    let chars = self.iter();
    NormalizationIterator { iter: chars, buffer: ~[], sorted: false, kind: NFKD }
}
/// Returns true if the string contains only whitespace
///
/// Whitespace characters are determined by `char::is_whitespace`
2012-03-16 19:35:38 -05:00
#[inline]
fn is_whitespace(&self) -> bool { self.iter().all(char::is_whitespace) }
/// Returns true if the string contains only alphanumerics
///
/// Alphanumeric characters are determined by `char::is_alphanumeric`
2012-06-08 19:57:39 -05:00
#[inline]
fn is_alphanumeric(&self) -> bool { self.iter().all(char::is_alphanumeric) }
2013-03-15 02:33:12 -05:00
/// Returns the number of characters that a string holds
#[inline]
2013-08-09 22:30:03 -05:00
fn char_len(&self) -> uint { self.iter().len() }
/// Returns a slice of the given string from the byte range
/// [`begin`..`end`)
///
/// Fails when `begin` and `end` do not point to valid characters or
/// beyond the last character of the string
2012-03-16 19:35:38 -05:00
#[inline]
fn slice(&self, begin: uint, end: uint) -> &'self str {
2013-08-18 06:57:34 -05:00
assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
unsafe { raw::slice_bytes(*self, begin, end) }
2013-03-04 21:36:15 -06:00
}
/// Returns a slice of the string from `begin` to its end.
///
/// Fails when `begin` does not point to a valid character, or is
/// out of bounds.
2012-03-16 19:35:38 -05:00
#[inline]
fn slice_from(&self, begin: uint) -> &'self str {
self.slice(begin, self.len())
}
/// Returns a slice of the string from the beginning to byte
/// `end`.
///
/// Fails when `end` does not point to a valid character, or is
/// out of bounds.
#[inline]
fn slice_to(&self, end: uint) -> &'self str {
// Only `end` needs checking: offset 0 is always a character boundary.
assert!(self.is_char_boundary(end));
unsafe { raw::slice_bytes(*self, 0, end) }
}
/// Returns a slice of the string from the char range
/// [`begin`..`end`).
///
/// Fails if `begin` > `end` or the either `begin` or `end` are
/// beyond the last character of the string.
fn slice_chars(&self, begin: uint, end: uint) -> &'self str {
assert!(begin <= end);
2013-06-14 21:40:11 -05:00
let mut count = 0;
let mut begin_byte = None;
let mut end_byte = None;
// This could be even more efficient by not decoding,
// only finding the char boundaries
for (idx, _) in self.char_offset_iter() {
if count == begin { begin_byte = Some(idx); }
if count == end { end_byte = Some(idx); break; }
count += 1;
}
if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
match (begin_byte, end_byte) {
(None, _) => fail!("slice_chars: `begin` is beyond end of string"),
(_, None) => fail!("slice_chars: `end` is beyond end of string"),
(Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
}
}
/// Returns true if `needle` is a prefix of the string.
fn starts_with<'a>(&self, needle: &'a str) -> bool {
let (self_len, needle_len) = (self.len(), needle.len());
if needle_len == 0u { true }
else if needle_len > self_len { false }
else { match_at(*self, needle, 0u) }
2013-03-04 21:36:15 -06:00
}
/// Returns true if `needle` is a suffix of the string.
///
/// The empty string is a suffix of every string.
fn ends_with(&self, needle: &str) -> bool {
    let (hay_len, needle_len) = (self.len(), needle.len());
    if needle_len == 0u { return true; }
    if needle_len > hay_len { return false; }
    // Compare against the final `needle_len` bytes.
    match_at(*self, needle, hay_len - needle_len)
}
/// Escape each char in `s` with char::escape_default.
2013-06-11 07:13:23 -05:00
fn escape_default(&self) -> ~str {
let mut out: ~str = ~"";
out.reserve_at_least(self.len());
for c in self.iter() {
do c.escape_default |c| {
out.push_char(c);
}
2013-06-11 07:13:23 -05:00
}
out
}
/// Escape each char in `s` with char::escape_unicode.
2013-06-11 07:13:23 -05:00
fn escape_unicode(&self) -> ~str {
let mut out: ~str = ~"";
out.reserve_at_least(self.len());
for c in self.iter() {
do c.escape_unicode |c| {
out.push_char(c);
}
2013-06-11 07:13:23 -05:00
}
out
}
2012-07-23 13:51:12 -05:00
/// Returns a string with leading and trailing whitespace removed
#[inline]
2013-06-10 06:03:16 -05:00
fn trim(&self) -> &'self str {
self.trim_left().trim_right()
}
/// Returns a string with leading whitespace removed
#[inline]
2013-06-10 06:03:16 -05:00
fn trim_left(&self) -> &'self str {
self.trim_left_chars(&char::is_whitespace)
2013-06-10 06:03:16 -05:00
}
/// Returns a string with trailing whitespace removed
#[inline]
2013-06-10 06:03:16 -05:00
fn trim_right(&self) -> &'self str {
self.trim_right_chars(&char::is_whitespace)
2013-06-10 06:03:16 -05:00
}
/// Returns a string with characters that match `to_trim` removed
/// from both ends; matching characters in the interior are kept.
///
/// # Arguments
///
/// * to_trim - a character matcher
///
/// # Example
///
/// ~~~ {.rust}
/// assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar")
/// assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar")
/// assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar")
/// ~~~
#[inline]
fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'self str {
self.trim_left_chars(to_trim).trim_right_chars(to_trim)
}
/// Returns a string with leading `chars_to_trim` removed.
///
/// # Arguments
///
/// * to_trim - a character matcher
///
/// # Example
///
/// ~~~ {.rust}
/// assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11")
/// assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12")
/// assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123")
/// ~~~
#[inline]
fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'self str {
match self.find(|c: char| !to_trim.matches(c)) {
2013-06-10 06:03:16 -05:00
None => "",
Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
}
}
/// Returns a string with trailing `chars_to_trim` removed.
///
/// # Arguments
///
/// * to_trim - a character matcher
///
/// # Example
///
/// ~~~ {.rust}
/// assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar")
/// assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar")
/// assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar")
/// ~~~
#[inline]
fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'self str {
match self.rfind(|c: char| !to_trim.matches(c)) {
2013-06-10 06:03:16 -05:00
None => "",
Some(last) => {
2013-06-10 06:46:36 -05:00
let next = self.char_range_at(last).next;
unsafe { raw::slice_bytes(*self, 0u, next) }
2013-06-10 06:03:16 -05:00
}
}
}
/// Replace all occurrences of one string with another
///
/// # Arguments
///
/// * from - The string to replace
/// * to - The replacement string
///
/// # Return value
///
/// The original string with all occurrences of `from` replaced with `to`
///
/// # Failure
///
/// Fails if `from` is empty, because `matches_index_iter` rejects an
/// empty needle.
fn replace(&self, from: &str, to: &str) -> ~str {
    let mut out = ~"";
    // Byte offset up to which `self` has already been copied.
    let mut copied_to = 0;
    for (start, end) in self.matches_index_iter(from) {
        let unchanged = unsafe { raw::slice_bytes(*self, copied_to, start) };
        out.push_str(unchanged);
        out.push_str(to);
        copied_to = end;
    }
    // Whatever follows the final match is copied verbatim.
    let tail = unsafe { raw::slice_bytes(*self, copied_to, self.len()) };
    out.push_str(tail);
    out
}
/// Copy a slice into a new unique str
2013-08-04 15:22:56 -05:00
#[inline]
fn to_owned(&self) -> ~str {
do self.as_imm_buf |src, len| {
unsafe {
let mut v = vec::with_capacity(len);
do v.as_mut_buf |dst, _| {
ptr::copy_memory(dst, src, len);
}
vec::raw::set_len(&mut v, len);
::cast::transmute(v)
}
}
}
/// Copy a slice into a new managed str
///
/// The string is viewed as a byte slice, copied with
/// `at_vec::to_managed`, and the result is reinterpreted as an `@str`.
#[inline]
fn to_managed(&self) -> @str {
unsafe {
let v: *&[u8] = cast::transmute(self);
cast::transmute(at_vec::to_managed(*v))
}
}
/// Converts to a vector of `u16` encoded as UTF-16.
///
/// Code points that fit in 16 bits are emitted as a single unit; all
/// others are emitted as a high/low surrogate pair.
///
/// # Failure
///
/// Fails if a character lies in the surrogate range
/// (0xD800 to 0xDFFF) or above 0x10FFFF.
fn to_utf16(&self) -> ~[u16] {
let mut u = ~[];
for ch in self.iter() {
// Arithmetic with u32 literals is easier on the eyes than chars.
let mut ch = ch as u32;
if (ch & 0xFFFF_u32) == ch {
// The BMP falls through (assuming non-surrogate, as it
// should)
assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
u.push(ch as u16)
} else {
// Supplementary planes break into surrogates.
assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
// After subtracting 0x10000 the value is split into two 10-bit
// halves: the upper half goes in the 0xD800-based unit, the
// lower half in the 0xDC00-based unit.
ch -= 0x1_0000_u32;
let w1 = 0xD800_u16 | ((ch >> 10) as u16);
let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
u.push_all([w1, w2])
}
}
u
}
/// Converts to a `SendStr` by copying the slice into an owned string
/// and wrapping it in the `SendStrOwned` variant.
#[inline]
fn to_send_str(&self) -> SendStr {
SendStrOwned(self.to_owned())
}
/// Returns false if the index points into the middle of a multi-byte
/// character sequence.
2013-08-18 06:57:34 -05:00
#[inline]
2013-06-10 06:46:36 -05:00
fn is_char_boundary(&self, index: uint) -> bool {
if index == self.len() { return true; }
let b = self[index];
return b < 128u8 || b >= 192u8;
}
/// Pluck a character out of a string and return the index of the next
/// character.
///
/// This function can be used to iterate over the unicode characters of a
/// string.
///
/// # Example
///
/// ~~~ {.rust}
/// let s = "中华Việt Nam";
/// let i = 0u;
/// while i < s.len() {
/// let CharRange {ch, next} = s.char_range_at(i);
/// printfln!("%u: %c", i, ch);
/// i = next;
/// }
/// ~~~
///
/// # Example output
///
/// ~~~
/// 0: 中
/// 3: 华
/// 6: V
/// 7: i
/// 8: ệ
/// 11: t
/// 12:
/// 13: N
/// 14: a
/// 15: m
/// ~~~
///
/// # Arguments
///
/// * s - The string
/// * i - The byte offset of the char to extract
///
/// # Return value
///
/// A record {ch: char, next: uint} containing the char value and the byte
/// index of the next unicode character.
///
/// # Failure
///
/// If `i` is greater than or equal to the length of the string.
/// If `i` is not the index of the beginning of a valid UTF-8 character.
#[inline]
2013-06-10 06:46:36 -05:00
fn char_range_at(&self, i: uint) -> CharRange {
if (self[i] < 128u8) {
return CharRange {ch: self[i] as char, next: i + 1 };
2013-06-10 06:46:36 -05:00
}
// Multibyte case is a fn to allow char_range_at to inline cleanly
fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
let mut val = s[i] as uint;
let w = UTF8_CHAR_WIDTH[val] as uint;
assert!((w != 0));
val = utf8_first_byte!(val, w);
val = utf8_acc_cont_byte!(val, s[i + 1]);
if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
return CharRange {ch: unsafe { transmute(val as u32) }, next: i + w};
}
return multibyte_char_range_at(*self, i);
2013-06-10 06:46:36 -05:00
}
/// Plucks the character starting at the `i`th byte of a string
#[inline]
2013-06-10 06:46:36 -05:00
fn char_at(&self, i: uint) -> char { self.char_range_at(i).ch }
/// Given a byte position and a str, return the previous char and its position.
///
/// This function can be used to iterate over a unicode string in reverse.
///
/// Returns 0 for next index if called on start index 0.
2013-08-18 06:57:34 -05:00
#[inline]
2013-06-10 06:46:36 -05:00
fn char_range_at_reverse(&self, start: uint) -> CharRange {
let mut prev = start;
prev = prev.saturating_sub(1);
if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
2013-06-10 06:46:36 -05:00
// Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
fn multibyte_char_range_at_rev(s: &str, mut i: uint) -> CharRange {
// while there is a previous byte == 10......
while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
i -= 1u;
}
2013-06-10 06:46:36 -05:00
let mut val = s[i] as uint;
let w = UTF8_CHAR_WIDTH[val] as uint;
assert!((w != 0));
2013-06-10 06:46:36 -05:00
val = utf8_first_byte!(val, w);
val = utf8_acc_cont_byte!(val, s[i + 1]);
if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2013-06-10 06:46:36 -05:00
return CharRange {ch: unsafe { transmute(val as u32) }, next: i};
}
2013-06-10 06:46:36 -05:00
return multibyte_char_range_at_rev(*self, prev);
2013-06-10 06:46:36 -05:00
}
2013-06-10 06:46:36 -05:00
/// Plucks the character ending at the `i`th byte of a string
2013-03-15 02:32:11 -05:00
#[inline]
fn char_at_reverse(&self, i: uint) -> char {
2013-06-10 06:46:36 -05:00
self.char_range_at_reverse(i).ch
2013-03-15 02:32:11 -05:00
}
/// Work with the byte buffer of a string as a byte slice.
2013-08-04 15:22:56 -05:00
fn as_bytes(&self) -> &'self [u8] {
unsafe { cast::transmute(*self) }
}
/// Returns the byte index of the first character of `self` that matches `search`
///
/// # Return value
///
/// `Some` containing the byte index of the first matching character
/// or `None` if there is no match
fn find<C: CharEq>(&self, search: C) -> Option<uint> {
if search.only_ascii() {
// A matcher that reports itself as ASCII-only is tested against the
// raw bytes, so the position found is already a byte index.
self.byte_iter().position(|b| search.matches(b as char))
} else {
// Otherwise each character is decoded and tested, and the byte
// offset of the first match is returned.
for (index, c) in self.char_offset_iter() {
if search.matches(c) { return Some(index); }
}
None
}
}
/// Returns the byte index of the last character of `self` that matches `search`
///
/// # Return value
///
/// `Some` containing the byte index of the last matching character
/// or `None` if there is no match
fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
if search.only_ascii() {
// A matcher that reports itself as ASCII-only is tested against the
// raw bytes, scanning from the end.
self.byte_iter().rposition(|b| search.matches(b as char))
} else {
// Otherwise characters are decoded back to front and the byte
// offset of the first match encountered is returned.
for (index, c) in self.char_offset_rev_iter() {
if search.matches(c) { return Some(index); }
}
None
}
}
/// Returns the byte index of the first matching substring
///
/// # Arguments
///
/// * `needle` - The string to search for
///
/// # Return value
///
/// `Some` containing the byte index of the first matching substring
/// or `None` if there is no match. An empty `needle` matches at
/// index 0.
fn find_str(&self, needle: &str) -> Option<uint> {
    // The match iterator rejects an empty needle, so handle it here.
    if needle.is_empty() { return Some(0); }

    match self.matches_index_iter(needle).next() {
        Some((start, _end)) => Some(start),
        None => None
    }
}
2013-06-10 21:05:42 -05:00
/// Given a string, make a new string with repeated copies of it.
2013-08-04 15:22:56 -05:00
fn repeat(&self, nn: uint) -> ~str {
let mut ret = with_capacity(nn * self.len());
for _ in range(0, nn) {
ret.push_str(*self);
2013-08-04 15:22:56 -05:00
}
ret
2013-08-04 15:22:56 -05:00
}
/// Retrieves the first character from a string slice and returns
/// it. This does not allocate a new string; instead, it returns a
/// slice that points one character beyond the character that was
/// shifted.
///
/// # Failure
///
/// If the string does not contain any characters
#[inline]
fn slice_shift_char(&self) -> (char, &'self str) {
    let CharRange {ch, next} = self.char_range_at(0u);
    // `next` is the byte offset just past the first character.
    let rest = unsafe { raw::slice_bytes(*self, next, self.len()) };
    (ch, rest)
}
/// Levenshtein Distance between two strings.
fn lev_distance(&self, t: &str) -> uint {
let slen = self.len();
let tlen = t.len();
if slen == 0 { return tlen; }
if tlen == 0 { return slen; }
let mut dcol = vec::from_fn(tlen + 1, |x| x);
for (i, sc) in self.iter().enumerate() {
let mut current = i;
dcol[0] = current + 1;
for (j, tc) in t.iter().enumerate() {
let next = dcol[j + 1];
if sc == tc {
dcol[j + 1] = current;
} else {
dcol[j + 1] = ::cmp::min(current, next);
dcol[j + 1] = ::cmp::min(dcol[j + 1], dcol[j]) + 1;
}
current = next;
}
}
return dcol[tlen];
}
/// Returns the byte offset of an inner slice relative to an enclosing outer slice.
///
/// Fails if `inner` is not a direct slice contained within self.
///
/// The offset is computed from the buffer addresses of the two slices,
/// so `inner` must point into the very same memory as `self`; a string
/// with merely equal contents does not qualify.
///
/// # Example
///
/// ~~~ {.rust}
/// let string = "a\nb\nc";
/// let mut lines = ~[];
/// for line in string.line_iter() { lines.push(line) }
///
/// assert!(string.subslice_offset(lines[0]) == 0); // &"a"
/// assert!(string.subslice_offset(lines[1]) == 2); // &"b"
/// assert!(string.subslice_offset(lines[2]) == 4); // &"c"
/// ~~~
#[inline]
fn subslice_offset(&self, inner: &str) -> uint {
do self.as_imm_buf |a, a_len| {
do inner.as_imm_buf |b, b_len| {
let a_start: uint;
let a_end: uint;
let b_start: uint;
let b_end: uint;
unsafe {
// Turn both buffer pointers into integer addresses, giving the
// half-open address ranges [a_start, a_end) and [b_start, b_end).
a_start = cast::transmute(a); a_end = a_len + cast::transmute(a);
b_start = cast::transmute(b); b_end = b_len + cast::transmute(b);
}
// `inner` must lie entirely within `self`.
assert!(a_start <= b_start);
assert!(b_end <= a_end);
b_start - a_start
}
}
}
/// Work with the byte buffer and length of a slice.
///
/// The buffer does not have a null terminator.
2013-07-10 19:33:11 -05:00
#[inline]
fn as_imm_buf<T>(&self, f: &fn(*u8, uint) -> T) -> T {
let v: &[u8] = unsafe { cast::transmute(*self) };
v.as_imm_buf(f)
2013-07-10 19:33:11 -05:00
}
2012-03-16 19:35:38 -05:00
}
/// Methods on owned strings (`~str`), most of which modify the string
/// in place or consume it.
///
/// Apart from `as_mut_buf`, the declarations carry no documentation of
/// their own (the `missing_doc` lint is silenced for this trait); each
/// method is described on the `~str` implementation that follows.
#[allow(missing_doc)]
pub trait OwnedStr {
// Adding and removing content.
fn push_str_no_overallocate(&mut self, rhs: &str);
fn push_str(&mut self, rhs: &str);
fn push_char(&mut self, c: char);
fn pop_char(&mut self) -> char;
fn shift_char(&mut self) -> char;
fn unshift_char(&mut self, ch: char);
fn append(self, rhs: &str) -> ~str;
// Capacity management.
fn reserve(&mut self, n: uint);
fn reserve_at_least(&mut self, n: uint);
2013-06-11 06:43:29 -05:00
fn capacity(&self) -> uint;
2013-08-24 00:05:35 -05:00
fn truncate(&mut self, len: uint);
fn into_bytes(self) -> ~[u8];
/// Work with the mutable byte buffer and length of a slice.
///
/// The buffer does not have a null terminator.
///
/// The caller must make sure any mutations to this buffer keep the string
/// valid UTF-8!
fn as_mut_buf<T>(&mut self, f: &fn(*mut u8, uint) -> T) -> T;
}
impl OwnedStr for ~str {
/// Appends a string slice to the back of a string, without overallocating
///
/// Exactly the combined length is reserved before the append is
/// delegated to `push_str`.
#[inline]
fn push_str_no_overallocate(&mut self, rhs: &str) {
let new_cap = self.len() + rhs.len();
self.reserve(new_cap);
self.push_str(rhs);
}
/// Appends a string slice to the back of a string
2013-05-02 02:49:11 -05:00
#[inline]
fn push_str(&mut self, rhs: &str) {
unsafe {
raw::push_bytes(self, rhs.as_bytes());
}
}
/// Appends a character to the back of a string
2013-05-02 02:49:11 -05:00
#[inline]
fn push_char(&mut self, c: char) {
let cur_len = self.len();
// may use up to 4 bytes.
unsafe {
raw::as_owned_vec(self).reserve_additional(4);
// Attempt to not use an intermediate buffer by just pushing bytes
// directly onto this string.
let used = do self.as_mut_buf |buf, _| {
do vec::raw::mut_buf_as_slice(buf.offset(cur_len as int), 4) |slc| {
c.encode_utf8(slc)
}
};
raw::set_len(self, cur_len + used);
}
}
/// Remove the final character from a string and return it
///
/// # Failure
///
/// If the string does not contain any characters
fn pop_char(&mut self) -> char {
let end = self.len();
assert!(end > 0u);
// `next` is the byte offset at which the final character starts, which
// is exactly the length the string should have once it is removed.
let CharRange {ch, next} = self.char_range_at_reverse(end);
unsafe { raw::set_len(self, next); }
return ch;
}
/// Remove the first character from a string and return it
///
/// The remainder of the string is copied into a new allocation.
///
/// # Failure
///
/// If the string does not contain any characters
fn shift_char(&mut self) -> char {
    let CharRange {ch, next} = self.char_range_at(0u);
    // `next` is the byte offset just past the first character.
    let rest = self.slice(next, self.len()).to_owned();
    *self = rest;
    ch
}
/// Prepend a char to a string
///
/// A new string is built from `ch` followed by the existing content,
/// and then replaces `self`.
fn unshift_char(&mut self, ch: char) {
// This could be more efficient.
let mut new_str = ~"";
new_str.push_char(ch);
new_str.push_str(*self);
*self = new_str;
}
2013-06-10 21:01:45 -05:00
/// Concatenate two strings together.
///
/// Consumes `self`, appends `rhs` to it (reserving exactly the space
/// needed) and returns the result.
#[inline]
fn append(self, rhs: &str) -> ~str {
    let mut joined = self;
    joined.push_str_no_overallocate(rhs);
    joined
}
/// Reserves capacity for exactly `n` bytes in the given string.
///
/// Assuming single-byte characters, the resulting string will be large
/// enough to hold a string of length `n`.
///
/// If the capacity for `s` is already equal to or greater than the requested
/// capacity, then no action is taken.
///
/// # Arguments
///
/// * s - A string
/// * n - The number of bytes to reserve space for
2013-08-04 15:22:56 -05:00
#[inline]
fn reserve(&mut self, n: uint) {
2013-08-04 15:22:56 -05:00
unsafe {
raw::as_owned_vec(self).reserve(n)
2013-08-04 15:22:56 -05:00
}
}
/// Reserves capacity for at least `n` bytes in the given string.
///
/// Assuming single-byte characters, the resulting string will be large
/// enough to hold a string of length `n`.
2013-08-04 15:22:56 -05:00
///
/// This function will over-allocate in order to amortize the allocation costs
/// in scenarios where the caller may need to repeatedly reserve additional
/// space.
///
/// If the capacity for `s` is already equal to or greater than the requested
/// capacity, then no action is taken.
///
/// # Arguments
///
/// * s - A string
/// * n - The number of bytes to reserve space for
#[inline]
fn reserve_at_least(&mut self, n: uint) {
self.reserve(uint::next_power_of_two_opt(n).unwrap_or(n))
2013-08-04 15:22:56 -05:00
}
/// Returns the number of single-byte characters the string can hold without
/// reallocating
2013-08-04 15:22:56 -05:00
fn capacity(&self) -> uint {
unsafe {
let buf: &~[u8] = cast::transmute(self);
buf.capacity()
}
}
2013-08-24 00:05:35 -05:00
/// Shortens the string to `len` bytes.
///
/// # Failure
///
/// Fails if `len` is greater than the current length, or if `len` is not
/// on a character boundary.
#[inline]
fn truncate(&mut self, len: uint) {
    let cur_len = self.len();
    assert!(len <= cur_len);
    assert!(self.is_char_boundary(len));
    unsafe { raw::set_len(self, len); }
}
/// Consumes the string and hands back its underlying byte buffer.
///
/// The buffer does not have a null terminator.
#[inline]
fn into_bytes(self) -> ~[u8] {
    let bytes: ~[u8] = unsafe { cast::transmute(self) };
    bytes
}
2013-08-24 00:05:35 -05:00
/// Calls `f` with the mutable buffer pointer and length supplied by the
/// underlying byte vector, and returns whatever `f` returns.
#[inline]
fn as_mut_buf<T>(&mut self, f: &fn(*mut u8, uint) -> T) -> T {
    unsafe {
        let bytes = raw::as_owned_vec(self);
        bytes.as_mut_buf(f)
    }
}
}
2013-03-15 17:26:59 -05:00
impl Clone for ~str {
    /// Returns a copy of the string produced by `to_owned`.
    #[inline]
    fn clone(&self) -> ~str {
        let copy: ~str = self.to_owned();
        copy
    }
}
2013-08-16 05:17:02 -05:00
impl DeepClone for ~str {
    /// Returns a copy of the string produced by `to_owned`.
    #[inline]
    fn deep_clone(&self) -> ~str {
        let copy: ~str = self.to_owned();
        copy
    }
}
2013-07-02 14:47:32 -05:00
impl Clone for @str {
    /// Returns another handle to the same managed string.
    #[inline]
    fn clone(&self) -> @str {
        let handle: @str = *self;
        handle
    }
}
2013-08-16 05:17:02 -05:00
impl DeepClone for @str {
    /// Returns another handle to the same managed string; the contents are
    /// not copied.
    #[inline]
    fn deep_clone(&self) -> @str {
        let handle: @str = *self;
        handle
    }
}
impl FromIterator<char> for ~str {
    /// Builds an owned string from the characters yielded by `iterator`.
    #[inline]
    fn from_iterator<T: Iterator<char>>(iterator: &mut T) -> ~str {
        // Pre-size the buffer from the iterator's lower size bound.
        let (min_chars, _) = iterator.size_hint();
        let mut result = with_capacity(min_chars);
        result.extend(iterator);
        result
    }
}
impl Extendable<char> for ~str {
    /// Appends every character yielded by `iterator` to the string.
    #[inline]
    fn extend<T: Iterator<char>>(&mut self, iterator: &mut T) {
        // Reserve up front for the current contents plus the iterator's
        // lower size bound, then push the characters one by one.
        let (min_chars, _) = iterator.size_hint();
        let wanted = min_chars + self.len();
        self.reserve_at_least(wanted);
        for c in *iterator {
            self.push_char(c)
        }
    }
}
2013-06-17 02:05:51 -05:00
// This works because every lifetime is a sub-lifetime of 'static
2013-08-10 08:38:00 -05:00
impl<'self> Default for &'self str {
fn default() -> &'self str { "" }
2013-06-17 02:05:51 -05:00
}
2013-08-10 08:38:00 -05:00
impl Default for ~str {
    /// The default owned string is the empty string.
    fn default() -> ~str {
        ~""
    }
}
2013-08-10 08:38:00 -05:00
impl Default for @str {
    /// The default managed string is the empty string.
    fn default() -> @str {
        @""
    }
}
2012-01-17 19:28:21 -06:00
#[cfg(test)]
mod tests {
use container::Container;
use option::{None, Some};
use ptr;
use str::*;
use vec;
use vec::{Vector, ImmutableVector, CopyableVector};
2013-03-01 21:07:12 -06:00
use cmp::{TotalOrd, Less, Equal, Greater};
use send_str::{SendStrOwned, SendStrStatic};
2012-01-17 19:28:21 -06:00
// `eq` on owned strings: identical contents are equal, differing ones are not.
#[test]
fn test_eq() {
    assert!(eq(&~"", &~""));
    assert!(eq(&~"foo", &~"foo"));
    assert!(!eq(&~"foo", &~"bar"));
}
2012-09-03 12:47:10 -05:00
// `eq_slice` compares sub-slices by content.
#[test]
fn test_eq_slice() {
    let head = "foobar".slice(0, 3);
    let tail = "barfoo".slice(3, 6);
    assert!(eq_slice(head, "foo"));
    assert!(eq_slice(tail, "foo"));
    assert!(!eq_slice("foo1", "foo2"));
}
2012-01-17 19:28:21 -06:00
// Ordering and inequality operators on string slices.
#[test]
fn test_le() {
    let empty = "";
    let foo = "foo";
    assert!(empty <= "");
    assert!(empty <= foo);
    assert!(foo <= "foo");
    assert!(foo != "bar");
}
// `len` is checked against byte counts, `char_len` against character counts.
#[test]
fn test_len() {
    // Lengths in bytes.
    assert_eq!("".len(), 0u);
    assert_eq!("hello world".len(), 11u);
    assert_eq!("\x63".len(), 1u);
    assert_eq!("\xa2".len(), 2u);
    assert_eq!("\u03c0".len(), 2u);
    assert_eq!("\u2620".len(), 3u);
    assert_eq!("\U0001d11e".len(), 4u);

    // Lengths in characters.
    assert_eq!("".char_len(), 0u);
    assert_eq!("hello world".char_len(), 11u);
    assert_eq!("\x63".char_len(), 1u);
    assert_eq!("\xa2".char_len(), 1u);
    assert_eq!("\u03c0".char_len(), 1u);
    assert_eq!("\u2620".char_len(), 1u);
    assert_eq!("\U0001d11e".char_len(), 1u);
    assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
}
// `find` with a char and with a predicate; results are byte offsets.
#[test]
fn test_find() {
    let hello = "hello";
    assert_eq!(hello.find('l'), Some(2u));
    assert_eq!(hello.find(|c:char| c == 'o'), Some(4u));
    assert!(hello.find('x').is_none());
    assert!(hello.find(|c:char| c == 'x').is_none());

    let mixed = "ประเทศไทย中华Việt Nam";
    assert_eq!(mixed.find('华'), Some(30u));
    assert_eq!(mixed.find(|c: char| c == '华'), Some(30u));
}
// `rfind` with a char and with a predicate; results are byte offsets.
#[test]
fn test_rfind() {
    let hello = "hello";
    assert_eq!(hello.rfind('l'), Some(3u));
    assert_eq!(hello.rfind(|c:char| c == 'o'), Some(4u));
    assert!(hello.rfind('x').is_none());
    assert!(hello.rfind(|c:char| c == 'x').is_none());

    let mixed = "ประเทศไทย中华Việt Nam";
    assert_eq!(mixed.rfind('华'), Some(30u));
    assert_eq!(mixed.rfind(|c: char| c == '华'), Some(30u));
}
#[test]
2013-06-10 21:01:45 -05:00
fn test_push_str() {
let mut s = ~"";
s.push_str("");
assert_eq!(s.slice_from(0), "");
s.push_str("abc");
assert_eq!(s.slice_from(0), "abc");
s.push_str("ประเทศไทย中华Việt Nam");
assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
}
2013-06-10 21:01:45 -05:00
// `append` consumes the string and returns the concatenation.
#[test]
fn test_append() {
    let mut acc = ~"";
    acc = acc.append("");
    assert_eq!(acc.slice_from(0), "");

    acc = acc.append("abc");
    assert_eq!(acc.slice_from(0), "abc");

    acc = acc.append("ประเทศไทย中华Việt Nam");
    assert_eq!(acc.slice_from(0), "abcประเทศไทย中华Việt Nam");
}
// Popping removes and returns the final multi-byte character.
#[test]
fn test_pop_char() {
    let mut text = ~"ประเทศไทย中华";
    let popped = text.pop_char();
    assert_eq!(~"ประเทศไทย中", text);
    assert_eq!('华', popped);
}
// Popping the only character of a one-character string must return that
// character and leave the string empty.
#[test]
fn test_pop_char_2() {
    // The input has to contain the '华' that is expected back; starting from
    // an empty string would trip `pop_char`'s non-empty assertion instead.
    let mut data2 = ~"华";
    let cc2 = data2.pop_char();
    assert_eq!(~"", data2);
    assert_eq!('华', cc2);
}
// `pop_char` on an empty string is expected to fail.
#[test]
#[should_fail]
fn test_pop_char_fail() {
    let mut empty = ~"";
    let _popped = empty.pop_char();
}
// Pushes characters of every UTF-8 encoded width onto a string.
#[test]
fn test_push_char() {
    let mut text = ~"ประเทศไทย中";
    text.push_char('华');
    text.push_char('b'); // 1 byte
    text.push_char('¢'); // 2 byte
    text.push_char('€'); // 3 byte
    text.push_char('𤭢'); // 4 byte
    assert_eq!(~"ประเทศไทย中华b¢€𤭢", text);
}
// Shifting removes and returns the leading multi-byte character.
#[test]
fn test_shift_char() {
    let mut text = ~"ประเทศไทย中";
    let first = text.shift_char();
    assert_eq!(~"ระเทศไทย中", text);
    assert_eq!('ป', first);
}
// Unshifting places a character in front of the existing contents.
#[test]
fn test_unshift_char() {
    let mut text = ~"ประเทศไทย中";
    text.unshift_char('华');
    assert_eq!(~"华ประเทศไทย中", text);
}
// Collecting a char iterator into `~str` reproduces the source text.
#[test]
fn test_collect() {
    let empty = "";
    let from_empty: ~str = empty.iter().collect();
    assert_eq!(empty, from_empty.as_slice());

    let thai = "ประเทศไทย中";
    let from_thai: ~str = thai.iter().collect();
    assert_eq!(thai, from_thai.as_slice());
}
// Extending with a char iterator equals concatenating the source slice.
#[test]
fn test_extend() {
    let base = ~"ประเทศไทย中";
    let mut extended = base.clone();
    let suffix = "abc";

    let mut chars = suffix.iter();
    extended.extend(&mut chars);

    assert_eq!(extended, base + suffix);
}
// `clear` empties a string (already-empty or not) and leaves it usable.
#[test]
fn test_clear() {
    let mut empty = ~"";
    empty.clear();
    assert_eq!("", empty.as_slice());

    let mut data = ~"ประเทศไทย中";
    data.clear();
    assert_eq!("", data.as_slice());

    // A cleared string must accept new content: after pushing one character
    // it holds exactly that character, not the empty string.
    data.push_char('华');
    assert_eq!("华", data.as_slice());
}
// `into_bytes` yields the string's bytes.
#[test]
fn test_into_bytes() {
    let text = ~"asdf";
    let raw_bytes = text.into_bytes();
    assert_eq!(bytes!("asdf"), raw_bytes.as_slice());
}
2012-01-17 19:28:21 -06:00
#[test]
fn test_find_str() {
2012-02-13 02:17:59 -06:00
// byte positions
assert_eq!("".find_str(""), Some(0u));
assert!("banana".find_str("apple pie").is_none());
2012-02-16 21:16:08 -06:00
2013-05-23 11:39:17 -05:00
let data = "abcabc";
assert_eq!(data.slice(0u, 6u).find_str("ab"), Some(0u));
2013-06-10 08:01:45 -05:00
assert_eq!(data.slice(2u, 6u).find_str("ab"), Some(3u - 2u));
assert!(data.slice(2u, 4u).find_str("ab").is_none());
2012-02-13 05:07:29 -06:00
let mut data = ~"ประเทศไทย中华Việt Nam";
data = data + data;
assert!(data.find_str("ไท华").is_none());
assert_eq!(data.slice(0u, 43u).find_str(""), Some(0u));
assert_eq!(data.slice(6u, 43u).find_str(""), Some(6u - 6u));
assert_eq!(data.slice(0u, 43u).find_str("ประ"), Some( 0u));
assert_eq!(data.slice(0u, 43u).find_str("ทศไ"), Some(12u));
assert_eq!(data.slice(0u, 43u).find_str("ย中"), Some(24u));
assert_eq!(data.slice(0u, 43u).find_str("iệt"), Some(34u));
assert_eq!(data.slice(0u, 43u).find_str("Nam"), Some(40u));
assert_eq!(data.slice(43u, 86u).find_str("ประ"), Some(43u - 43u));
assert_eq!(data.slice(43u, 86u).find_str("ทศไ"), Some(55u - 43u));
assert_eq!(data.slice(43u, 86u).find_str("ย中"), Some(67u - 43u));
assert_eq!(data.slice(43u, 86u).find_str("iệt"), Some(77u - 43u));
assert_eq!(data.slice(43u, 86u).find_str("Nam"), Some(83u - 43u));
2012-01-17 19:28:21 -06:00
}
// `slice_chars` takes character indices rather than byte offsets.
#[test]
fn test_slice_chars() {
    // Checks that `expected` is found in `whole` starting at char `start`.
    fn check(whole: &str, expected: &str, start: uint) {
        let end = start + expected.char_len();
        assert_eq!(whole.slice_chars(start, end), expected);
    }
    check("", "", 0);
    check("hello", "llo", 2);
    check("hello", "el", 1);
    check("αβλ", "β", 1);
    check("αβλ", "", 3);
    assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
}
#[test]
fn test_concat() {
2012-09-21 20:36:32 -05:00
fn t(v: &[~str], s: &str) {
assert_eq!(v.concat(), s.to_str());
2012-09-21 20:36:32 -05:00
}
2013-05-23 11:39:17 -05:00
t([~"you", ~"know", ~"I'm", ~"no", ~"good"], "youknowI'mnogood");
let v: &[~str] = [];
t(v, "");
t([~"hi"], "hi");
2012-01-17 19:28:21 -06:00
}
#[test]
fn test_connect() {
2012-09-21 20:36:32 -05:00
fn t(v: &[~str], sep: &str, s: &str) {
assert_eq!(v.connect(sep), s.to_str());
2012-01-17 19:28:21 -06:00
}
2013-05-23 11:39:17 -05:00
t([~"you", ~"know", ~"I'm", ~"no", ~"good"],
" ", "you know I'm no good");
let v: &[~str] = [];
2013-05-23 11:39:17 -05:00
t(v, " ", "");
t([~"hi"], " ", "hi");
2012-01-17 19:28:21 -06:00
}
// `concat` over vectors of borrowed strings.
#[test]
fn test_concat_slices() {
    fn check(parts: &[&str], expected: &str) {
        assert_eq!(parts.concat(), expected.to_str());
    }
    check(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
    let none: &[&str] = [];
    check(none, "");
    check(["hi"], "hi");
}
// `connect` over vectors of borrowed strings.
#[test]
fn test_connect_slices() {
    fn check(parts: &[&str], sep: &str, expected: &str) {
        assert_eq!(parts.connect(sep), expected.to_str());
    }
    check(["you", "know", "I'm", "no", "good"],
          " ", "you know I'm no good");
    check([], " ", "");
    check(["hi"], " ", "hi");
}
// `repeat` with ASCII and multi-byte input, an empty input and a zero count.
#[test]
fn test_repeat() {
    assert_eq!("x".repeat(4), ~"xxxx");
    assert_eq!("hi".repeat(4), ~"hihihihi");
    assert_eq!("ไท华".repeat(3), ~"ไท华ไท华ไท华");

    // Degenerate cases yield the empty string.
    assert_eq!("".repeat(4), ~"");
    assert_eq!("hi".repeat(0), ~"");
}
2012-01-17 19:28:21 -06:00
#[test]
fn test_unsafe_slice() {
assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
2013-03-21 05:58:03 -05:00
fn a_million_letter_a() -> ~str {
let mut i = 0;
let mut rs = ~"";
while i < 100000 { rs.push_str("aaaaaaaaaa"); i += 1; }
2013-03-21 05:58:03 -05:00
rs
2012-01-17 19:28:21 -06:00
}
2013-03-21 05:58:03 -05:00
fn half_a_million_letter_a() -> ~str {
let mut i = 0;
let mut rs = ~"";
while i < 100000 { rs.push_str("aaaaa"); i += 1; }
2013-03-21 05:58:03 -05:00
rs
}
let letters = a_million_letter_a();
2013-03-28 20:39:09 -05:00
assert!(half_a_million_letter_a() ==
unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
2012-01-17 19:28:21 -06:00
}
// `starts_with`, including empty prefixes and prefixes longer than the input.
#[test]
fn test_starts_with() {
    assert!("".starts_with(""));
    assert!("abc".starts_with(""));
    assert!("abc".starts_with("a"));
    assert!(!"a".starts_with("abc"));
    assert!(!"".starts_with("abc"));
}
// `ends_with`, including empty suffixes and suffixes longer than the input.
#[test]
fn test_ends_with() {
    assert!("".ends_with(""));
    assert!("abc".ends_with(""));
    assert!("abc".ends_with("c"));
    assert!(!"a".ends_with("abc"));
    assert!(!"".ends_with("abc"));
}
// `is_empty` on an empty and a non-empty string.
#[test]
fn test_is_empty() {
    let empty = "";
    let single = "a";
    assert!(empty.is_empty());
    assert!(!single.is_empty());
}
// `replace` substitutes every occurrence of the pattern, including with an
// empty replacement.
#[test]
fn test_replace() {
    let a = "a";
    assert_eq!("".replace(a, "b"), ~"");
    assert_eq!("a".replace(a, "b"), ~"b");
    assert_eq!("ab".replace(a, "b"), ~"bb");

    let test = "test";
    assert!(" test test ".replace(test, "toast") ==
        ~" toast toast ");
    // Deleting both occurrences of "test" keeps all three surrounding
    // spaces, so the expected result is three spaces, not one.
    assert_eq!(" test test ".replace(test, ""), ~"   ");
}
// Replaces a multi-byte pattern found at the start of the text.
#[test]
fn test_replace_2a() {
    let data = ~"ประเทศไทย中华";
    let repl = ~"دولة الكويت";

    let pattern = ~"ประเ";
    let expected = ~"دولة الكويتทศไทย中华";
    assert_eq!(data.replace(pattern, repl), expected);
}
// Replaces a multi-byte pattern found in the middle of the text.
#[test]
fn test_replace_2b() {
    let data = ~"ประเทศไทย中华";
    let repl = ~"دولة الكويت";

    let pattern = ~"ะเ";
    let expected = ~"ปรدولة الكويتทศไทย中华";
    assert_eq!(data.replace(pattern, repl), expected);
}
// Replaces a multi-byte pattern found at the end of the text.
#[test]
fn test_replace_2c() {
    let data = ~"ประเทศไทย中华";
    let repl = ~"دولة الكويت";

    let pattern = ~"中华";
    let expected = ~"ประเทศไทยدولة الكويت";
    assert_eq!(data.replace(pattern, repl), expected);
}
// A pattern that does not occur leaves the text unchanged.
#[test]
fn test_replace_2d() {
    let data = ~"ประเทศไทย中华";
    let repl = ~"دولة الكويت";

    let absent = ~"ไท华";
    assert_eq!(data.replace(absent, repl), data);
}
#[test]
fn test_slice() {
assert_eq!("ab", "abc".slice(0, 2));
assert_eq!("bc", "abc".slice(1, 3));
assert_eq!("", "abc".slice(1, 1));
assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
2013-03-21 05:58:03 -05:00
let data = "ประเทศไทย中华";
assert_eq!("", data.slice(0, 3));
assert_eq!("", data.slice(3, 6));
assert_eq!("", data.slice(3, 3));
assert_eq!("", data.slice(30, 33));
fn a_million_letter_X() -> ~str {
let mut i = 0;
let mut rs = ~"";
2012-09-21 20:36:32 -05:00
while i < 100000 {
2013-03-21 05:58:03 -05:00
push_str(&mut rs, "华华华华华华华华华华");
2012-09-21 20:36:32 -05:00
i += 1;
}
2012-12-12 17:38:50 -06:00
rs
}
fn half_a_million_letter_X() -> ~str {
let mut i = 0;
let mut rs = ~"";
2013-03-21 05:58:03 -05:00
while i < 100000 { push_str(&mut rs, "华华华华华"); i += 1; }
2012-12-12 17:38:50 -06:00
rs
}
2013-03-21 05:58:03 -05:00
let letters = a_million_letter_X();
2013-03-28 20:39:09 -05:00
assert!(half_a_million_letter_X() ==
letters.slice(0u, 3u * 500000u).to_owned());
}
// `slice` on a string mixing 3-byte and 1-byte characters.
#[test]
fn test_slice_2() {
    let ss = "中华Việt Nam";

    // Bytes 3..6 hold the second character, bytes 0..3 the first one.
    assert_eq!("华", ss.slice(3u, 6u));
    assert_eq!("Việt Nam", ss.slice(6u, 16u));

    assert_eq!("ab", "abc".slice(0u, 2u));
    assert_eq!("bc", "abc".slice(1u, 3u));
    assert_eq!("", "abc".slice(1u, 1u));

    assert_eq!("中", ss.slice(0u, 3u));
    assert_eq!("华V", ss.slice(3u, 7u));
    assert_eq!("", ss.slice(3u, 3u));
    /* Byte offset of each character in `ss`:
       0: 中
       3: 华
       6: V
       7: i
       8: ệ
      11: t
      12: (space)
      13: N
      14: a
      15: m */
}
2012-01-17 19:28:21 -06:00
// Slicing into the middle of a multi-byte character is expected to fail.
#[test]
#[should_fail]
fn test_slice_fail() {
    let text = "中华Việt Nam";
    text.slice(0u, 2u);
}
// `slice_from` at the start, in the middle and at the end.
#[test]
fn test_slice_from() {
    let abcd = "abcd";
    assert_eq!(abcd.slice_from(0), "abcd");
    assert_eq!(abcd.slice_from(2), "cd");
    assert_eq!(abcd.slice_from(4), "");
}
// `slice_to` at the start, in the middle and at the end.
#[test]
fn test_slice_to() {
    let abcd = "abcd";
    assert_eq!(abcd.slice_to(0), "");
    assert_eq!(abcd.slice_to(2), "ab");
    assert_eq!(abcd.slice_to(4), "abcd");
}
2012-01-17 19:28:21 -06:00
// `trim_left_chars` with an empty set, char sets, a single char and a predicate.
#[test]
fn test_trim_left_chars() {
    // An empty set trims nothing.
    let no_chars: &[char] = &[];
    assert_eq!(" *** foo *** ".trim_left_chars(&no_chars), " *** foo *** ");

    assert_eq!(" *** foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
    assert_eq!(" *** *** ".trim_left_chars(& &['*', ' ']), "");
    assert_eq!("foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");

    assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11");
    assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12");
    assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123");
}
// `trim_right_chars` with an empty set, char sets, a single char and a predicate.
#[test]
fn test_trim_right_chars() {
    // An empty set trims nothing.
    let no_chars: &[char] = &[];
    assert_eq!(" *** foo *** ".trim_right_chars(&no_chars), " *** foo *** ");

    assert_eq!(" *** foo *** ".trim_right_chars(& &['*', ' ']), " *** foo");
    assert_eq!(" *** *** ".trim_right_chars(& &['*', ' ']), "");
    assert_eq!(" *** foo".trim_right_chars(& &['*', ' ']), " *** foo");

    assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar");
    assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar");
    assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar");
}
// `trim_chars` with an empty set, char sets, a single char and a predicate.
#[test]
fn test_trim_chars() {
    // An empty set trims nothing.
    let no_chars: &[char] = &[];
    assert_eq!(" *** foo *** ".trim_chars(&no_chars), " *** foo *** ");

    assert_eq!(" *** foo *** ".trim_chars(& &['*', ' ']), "foo");
    assert_eq!(" *** *** ".trim_chars(& &['*', ' ']), "");
    assert_eq!("foo".trim_chars(& &['*', ' ']), "foo");

    assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar");
    assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar");
    assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar");
}
#[test]
2012-01-17 19:28:21 -06:00
fn test_trim_left() {
2013-06-10 06:03:16 -05:00
assert_eq!("".trim_left(), "");
assert_eq!("a".trim_left(), "a");
assert_eq!(" ".trim_left(), "");
assert_eq!(" blah".trim_left(), "blah");
assert_eq!(" \u3000 wut".trim_left(), "wut");
assert_eq!("hey ".trim_left(), "hey ");
2012-01-17 19:28:21 -06:00
}
#[test]
fn test_trim_right() {
2013-06-10 06:03:16 -05:00
assert_eq!("".trim_right(), "");
assert_eq!("a".trim_right(), "a");
assert_eq!(" ".trim_right(), "");
assert_eq!("blah ".trim_right(), "blah");
assert_eq!("wut \u3000 ".trim_right(), "wut");
assert_eq!(" hey".trim_right(), " hey");
2012-01-17 19:28:21 -06:00
}
#[test]
fn test_trim() {
2013-06-10 06:03:16 -05:00
assert_eq!("".trim(), "");
assert_eq!("a".trim(), "a");
assert_eq!(" ".trim(), "");
assert_eq!(" blah ".trim(), "blah");
assert_eq!("\nwut \u3000 ".trim(), "wut");
assert_eq!(" hey dude ".trim(), "hey dude");
2012-01-17 19:28:21 -06:00
}
#[test]
fn test_is_whitespace() {
assert!("".is_whitespace());
assert!(" ".is_whitespace());
assert!("\u2009".is_whitespace()); // Thin space
assert!(" \n\t ".is_whitespace());
assert!(!" _ ".is_whitespace());
2012-01-17 19:28:21 -06:00
}
// `raw::push_byte` appends one raw byte.
#[test]
fn test_push_byte() {
    let mut text = ~"ABC";
    unsafe { raw::push_byte(&mut text, 'D' as u8) };
    assert_eq!(text, ~"ABCD");
}
2012-01-17 19:28:21 -06:00
#[test]
fn test_shift_byte() {
let mut s = ~"ABC";
2013-04-20 12:39:15 -05:00
let b = unsafe{raw::shift_byte(&mut s)};
assert_eq!(s, ~"BC");
assert_eq!(b, 65u8);
2012-01-17 19:28:21 -06:00
}
#[test]
fn test_pop_byte() {
let mut s = ~"ABC";
2013-04-20 12:39:15 -05:00
let b = unsafe{raw::pop_byte(&mut s)};
assert_eq!(s, ~"AB");
assert_eq!(b, 67u8);
2012-01-17 19:28:21 -06:00
}
#[test]
fn test_unsafe_from_utf8() {
let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8];
let b = unsafe { raw::from_utf8(a) };
assert_eq!(b, ~"AAAAAAA");
2012-01-17 19:28:21 -06:00
}
// `from_utf8` decodes valid UTF-8 byte sequences.
#[test]
fn test_from_utf8() {
    let expected = ~"ศไทย中华Việt Nam";
    // The UTF-8 encoding of `expected`.
    let encoded = ~[0xe0_u8, 0xb8_u8, 0xa8_u8,
                    0xe0_u8, 0xb9_u8, 0x84_u8,
                    0xe0_u8, 0xb8_u8, 0x97_u8,
                    0xe0_u8, 0xb8_u8, 0xa2_u8,
                    0xe4_u8, 0xb8_u8, 0xad_u8,
                    0xe5_u8, 0x8d_u8, 0x8e_u8,
                    0x56_u8, 0x69_u8, 0xe1_u8,
                    0xbb_u8, 0x87_u8, 0x74_u8,
                    0x20_u8, 0x4e_u8, 0x61_u8,
                    0x6d_u8];
    assert_eq!(expected, from_utf8(encoded));

    // Round trip through `bytes!` for a string mixing several scripts.
    assert_eq!(~"𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰",
               from_utf8(bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰")));
}
// `is_utf8` rejects overlong encodings and surrogates and accepts the
// well-formed sequences listed last.
#[test]
fn test_is_utf8() {
    // deny overlong encodings
    assert!(!is_utf8([0xc0, 0x80]));
    assert!(!is_utf8([0xc0, 0xae]));
    assert!(!is_utf8([0xe0, 0x80, 0x80]));
    assert!(!is_utf8([0xe0, 0x80, 0xaf]));
    assert!(!is_utf8([0xe0, 0x81, 0x81]));
    assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
    assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));

    // deny surrogates
    assert!(!is_utf8([0xED, 0xA0, 0x80]));
    assert!(!is_utf8([0xED, 0xBF, 0xBF]));

    // accepted two-, three- and four-byte sequences
    assert!(is_utf8([0xC2, 0x80]));
    assert!(is_utf8([0xDF, 0xBF]));
    assert!(is_utf8([0xE0, 0xA0, 0x80]));
    assert!(is_utf8([0xED, 0x9F, 0xBF]));
    assert!(is_utf8([0xEE, 0x80, 0x80]));
    assert!(is_utf8([0xEF, 0xBF, 0xBF]));
    assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
    assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
}
// Invalid input makes `from_utf8` raise the `not_utf8` condition with a
// message naming the first bad byte.
#[test]
fn test_from_utf8_fail() {
    use str::not_utf8::cond;

    // Same bytes as in `test_from_utf8`, except the first one is 0xff.
    let invalid = ~[0xff_u8, 0xb8_u8, 0xa8_u8,
                    0xe0_u8, 0xb9_u8, 0x84_u8,
                    0xe0_u8, 0xb8_u8, 0x97_u8,
                    0xe0_u8, 0xb8_u8, 0xa2_u8,
                    0xe4_u8, 0xb8_u8, 0xad_u8,
                    0xe5_u8, 0x8d_u8, 0x8e_u8,
                    0x56_u8, 0x69_u8, 0xe1_u8,
                    0xbb_u8, 0x87_u8, 0x74_u8,
                    0x20_u8, 0x4e_u8, 0x61_u8,
                    0x6d_u8];

    let mut error_happened = false;
    let _x = do cond.trap(|msg| {
        assert_eq!(msg, ~"from_utf8: input is not UTF-8; first bad byte is 255");
        error_happened = true;
        ~""
    }).inside {
        from_utf8(invalid)
    };
    assert!(error_happened);
}
2012-01-17 19:28:21 -06:00
#[test]
fn test_raw_from_c_str() {
unsafe {
let a = ~[65, 65, 65, 65, 65, 65, 65, 0];
2012-09-12 19:45:23 -05:00
let b = vec::raw::to_ptr(a);
let c = raw::from_c_str(b);
assert_eq!(c, ~"AAAAAAA");
}
2012-01-17 19:28:21 -06:00
}
// `as_bytes` exposes the UTF-8 bytes of a string.
#[test]
fn test_as_bytes() {
    // no null
    let expected = [
        224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
        184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
        109
    ];
    assert_eq!("".as_bytes(), &[]);
    assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);
    assert_eq!("ศไทย中华Việt Nam".as_bytes(), expected);
}
#[test]
#[should_fail]
fn test_as_bytes_fail() {
// Don't double free. (I'm not sure if this exercises the
// original problem code path anymore.)
let s = ~"";
2013-07-03 22:02:09 -05:00
let _bytes = s.as_bytes();
fail!();
}
2012-01-17 19:28:21 -06:00
#[test]
fn test_as_imm_buf() {
2013-08-04 15:22:56 -05:00
do "".as_imm_buf |_, len| {
assert_eq!(len, 0);
}
2012-01-17 19:28:21 -06:00
do "hello".as_imm_buf |buf, len| {
2013-08-04 15:22:56 -05:00
assert_eq!(len, 5);
unsafe {
assert_eq!(*ptr::offset(buf, 0), 'h' as u8);
assert_eq!(*ptr::offset(buf, 1), 'e' as u8);
assert_eq!(*ptr::offset(buf, 2), 'l' as u8);
assert_eq!(*ptr::offset(buf, 3), 'l' as u8);
assert_eq!(*ptr::offset(buf, 4), 'o' as u8);
}
}
2012-01-17 19:28:21 -06:00
}
2013-06-30 10:29:38 -05:00
#[test]
fn test_subslice_offset() {
let a = "kernelsprite";
let b = a.slice(7, a.len());
let c = a.slice(0, a.len() - 6);
assert_eq!(a.subslice_offset(b), 7);
assert_eq!(a.subslice_offset(c), 0);
2013-04-10 17:48:31 -05:00
let string = "a\nb\nc";
let mut lines = ~[];
for line in string.line_iter() { lines.push(line) }
assert_eq!(string.subslice_offset(lines[0]), 0);
assert_eq!(string.subslice_offset(lines[1]), 2);
assert_eq!(string.subslice_offset(lines[2]), 4);
}
// `subslice_offset` is expected to fail for two unrelated strings.
#[test]
#[should_fail]
fn test_subslice_offset_2() {
    let outer = "alchemiter";
    let unrelated = "cruxtruder";
    outer.subslice_offset(unrelated);
}
2012-01-17 19:28:21 -06:00
#[test]
fn vec_str_conversions() {
let s1: ~str = ~"All mimsy were the borogoves";
2012-01-17 19:28:21 -06:00
let v: ~[u8] = s1.as_bytes().to_owned();
let s2: ~str = from_utf8(v);
let mut i: uint = 0u;
let n1: uint = s1.len();
let n2: uint = v.len();
assert_eq!(n1, n2);
2012-01-17 19:28:21 -06:00
while i < n1 {
let a: u8 = s1[i];
let b: u8 = s2[i];
2013-03-08 14:39:42 -06:00
debug!(a);
debug!(b);
assert_eq!(a, b);
2012-01-17 19:28:21 -06:00
i += 1u;
}
}
#[test]
fn test_contains() {
assert!("abcde".contains("bcd"));
assert!("abcde".contains("abcd"));
assert!("abcde".contains("bcde"));
assert!("abcde".contains(""));
assert!("".contains(""));
assert!(!"abcde".contains("def"));
assert!(!"".contains("a"));
let data = ~"ประเทศไทย中华Việt Nam";
assert!(data.contains("ประเ"));
assert!(data.contains("ะเ"));
assert!(data.contains("中华"));
assert!(!data.contains("ไท华"));
2012-01-17 19:28:21 -06:00
}
// `contains_char` for present and absent characters.
#[test]
fn test_contains_char() {
    // Present.
    assert!("abc".contains_char('b'));
    assert!("a".contains_char('a'));
    // Absent.
    assert!(!"abc".contains_char('d'));
    assert!(!"".contains_char('a'));
}
2012-01-30 22:44:48 -06:00
#[test]
fn test_utf16() {
let pairs =
2013-05-23 11:39:17 -05:00
[(~"𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n",
~[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
2013-05-23 11:39:17 -05:00
0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
0xd800_u16, 0xdf30_u16, 0x000a_u16]),
(~"𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n",
~[0xd801_u16, 0xdc12_u16, 0xd801_u16,
2013-05-23 11:39:17 -05:00
0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
0x000a_u16]),
(~"𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n",
~[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
2013-05-23 11:39:17 -05:00
0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
(~"𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n",
~[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
2013-05-23 11:39:17 -05:00
0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
0x000a_u16 ]) ];
for p in pairs.iter() {
2013-07-02 14:47:32 -05:00
let (s, u) = (*p).clone();
assert!(s.to_utf16() == u);
2013-03-28 20:39:09 -05:00
assert!(from_utf16(u) == s);
assert!(from_utf16(s.to_utf16()) == s);
assert!(from_utf16(u).to_utf16() == u);
}
}
2012-03-30 00:28:26 -05:00
2013-03-15 02:32:11 -05:00
// Walks the string front to back with `char_at`, advancing by each
// character's byte length.
#[test]
fn test_char_at() {
    let text = ~"ศไทย中华Việt Nam";
    let expected = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
    let mut offset = 0;
    for want in expected.iter() {
        assert!(text.char_at(offset) == *want);
        offset += from_char(*want).len();
    }
}
// Walks the string back to front with `char_at_reverse`, stepping back
// by each character's byte length.
#[test]
fn test_char_at_reverse() {
    let text = ~"ศไทย中华Việt Nam";
    let expected = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
    let mut offset = text.len();
    for want in expected.rev_iter() {
        assert!(text.char_at_reverse(offset) == *want);
        offset -= from_char(*want).len();
    }
}
// `escape_unicode` escapes every character as \x, \u or \U.
#[test]
fn test_escape_unicode() {
    // Escaped as \xNN.
    assert_eq!("abc".escape_unicode(), ~"\\x61\\x62\\x63");
    assert_eq!("a c".escape_unicode(), ~"\\x61\\x20\\x63");
    assert_eq!("\r\n\t".escape_unicode(), ~"\\x0d\\x0a\\x09");
    assert_eq!("'\"\\".escape_unicode(), ~"\\x27\\x22\\x5c");
    assert_eq!("\x00\x01\xfe\xff".escape_unicode(), ~"\\x00\\x01\\xfe\\xff");

    // Escaped as \uNNNN or \UNNNNNNNN.
    assert_eq!("\u0100\uffff".escape_unicode(), ~"\\u0100\\uffff");
    assert_eq!("\U00010000\U0010ffff".escape_unicode(), ~"\\U00010000\\U0010ffff");

    // Mixed widths.
    assert_eq!("ab\ufb00".escape_unicode(), ~"\\x61\\x62\\ufb00");
    assert_eq!("\U0001d4ea\r".escape_unicode(), ~"\\U0001d4ea\\x0d");
}
// `escape_default` leaves these printable ASCII inputs untouched and
// escapes the control, quote, backslash and non-ASCII characters below.
#[test]
fn test_escape_default() {
    // Unchanged.
    assert_eq!("abc".escape_default(), ~"abc");
    assert_eq!("a c".escape_default(), ~"a c");

    // Named escapes.
    assert_eq!("\r\n\t".escape_default(), ~"\\r\\n\\t");
    assert_eq!("'\"\\".escape_default(), ~"\\'\\\"\\\\");

    // Numeric escapes.
    assert_eq!("\u0100\uffff".escape_default(), ~"\\u0100\\uffff");
    assert_eq!("\U00010000\U0010ffff".escape_default(), ~"\\U00010000\\U0010ffff");

    // Mixed.
    assert_eq!("ab\ufb00".escape_default(), ~"ab\\ufb00");
    assert_eq!("\U0001d4ea\r".escape_default(), ~"\\U0001d4ea\\r");
}
// `to_managed` copies a slice into a managed string.
#[test]
fn test_to_managed() {
    assert_eq!("abc".to_managed(), @"abc");

    let middle = "abcdef".slice(1, 5);
    assert_eq!(middle.to_managed(), @"bcde");
}
2013-03-01 21:07:12 -06:00
// `cmp` (TotalOrd) on string slices, covering prefixes of each other,
// equal strings, and strings that differ before the shorter one ends.
#[test]
fn test_total_ord() {
    // Each comparison is asserted; a bare `a.cmp(b) == X;` statement would
    // compute the result and throw it away, so the test could never fail.
    assert_eq!("1234".cmp(& &"123"), Greater);
    assert_eq!("123".cmp(& &"1234"), Less);
    assert_eq!("1234".cmp(& &"1234"), Equal);
    assert_eq!("12345555".cmp(& &"123456"), Less);
    assert_eq!("22".cmp(& &"1234"), Greater);
}
// `char_range_at` at the starting byte offset of every character in a
// string mixing 1-, 2-, 3- and 4-byte characters.
#[test]
fn test_char_range_at() {
    let text = ~"b¢€𤭢𤭢€¢b";
    assert_eq!('b', text.char_range_at(0).ch);
    assert_eq!('¢', text.char_range_at(1).ch);
    assert_eq!('€', text.char_range_at(3).ch);
    assert_eq!('𤭢', text.char_range_at(6).ch);
    assert_eq!('𤭢', text.char_range_at(10).ch);
    assert_eq!('€', text.char_range_at(14).ch);
    assert_eq!('¢', text.char_range_at(17).ch);
    assert_eq!('b', text.char_range_at(19).ch);
}
// `char_range_at_reverse(0)` must report `next == 0`.
#[test]
fn test_char_range_at_reverse_underflow() {
    let range = "abc".char_range_at_reverse(0);
    assert_eq!(range.next, 0);
}
#[test]
fn test_add() {
2013-06-27 10:45:24 -05:00
#[allow(unnecessary_allocation)];
macro_rules! t (
($s1:expr, $s2:expr, $e:expr) => { {
let s1 = $s1;
let s2 = $s2;
let e = $e;
assert_eq!(s1 + s2, e.to_owned());
assert_eq!(s1.to_owned() + s2, e.to_owned());
assert_eq!(s1.to_managed() + s2, e.to_owned());
} }
);
t!("foo", "bar", "foobar");
t!("foo", @"bar", "foobar");
t!("foo", ~"bar", "foobar");
t!("ศไทย中", "华Việt Nam", "ศไทย中华Việt Nam");
t!("ศไทย中", @"华Việt Nam", "ศไทย中华Việt Nam");
t!("ศไทย中", ~"华Việt Nam", "ศไทย中华Việt Nam");
}
2013-04-18 07:50:55 -05:00
#[test]
fn test_iterator() {
use iter::*;
2013-04-18 07:50:55 -05:00
let s = ~"ศไทย中华Việt Nam";
let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
let mut pos = 0;
let mut it = s.iter();
for c in it {
assert_eq!(c, v[pos]);
pos += 1;
}
assert_eq!(pos, v.len());
}
// `rev_iter` yields the characters in reverse order, and exactly as many
// as expected.
#[test]
fn test_rev_iterator() {
    use iter::*;
    let text = ~"ศไทย中华Việt Nam";
    let expected = ~['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];

    let mut seen = 0;
    let mut chars = text.rev_iter();
    for c in chars {
        assert_eq!(c, expected[seen]);
        seen += 1;
    }
    assert_eq!(seen, expected.len());
}
// After one `next`, the iterator and its clone are zipped and every pair
// of characters must be equal.
#[test]
fn test_iterator_clone() {
    let text = "ศไทย中华Việt Nam";
    let mut chars = text.iter();
    chars.next();
    assert!(chars.zip(chars.clone()).all(|(a, b)| a == b));
}
// `byte_iter` yields the UTF-8 bytes of the string in order.
#[test]
fn test_byte_iterator() {
    let s = ~"ศไทย中华Việt Nam";
    // The expected UTF-8 encoding of `s`.
    let v = [
        224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
        184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
        109
    ];
    let mut pos = 0;

    for b in s.byte_iter() {
        assert_eq!(b, v[pos]);
        pos += 1;
    }
    // Make sure every expected byte was produced; without this check an
    // iterator that stops early (or yields nothing) would still pass.
    assert_eq!(pos, v.len());
}
// `byte_rev_iter` yields the UTF-8 bytes of the string from last to first.
#[test]
fn test_byte_rev_iterator() {
    let s = ~"ศไทย中华Việt Nam";
    // The expected UTF-8 encoding of `s`, in forward order.
    let v = [
        224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
        184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
        109
    ];
    // Walk the expected bytes backwards in step with the iterator.
    let mut pos = v.len();

    for b in s.byte_rev_iter() {
        pos -= 1;
        assert_eq!(b, v[pos]);
    }
    // Make sure the iterator walked all the way back to the first byte;
    // without this check an iterator that stops early would still pass.
    assert_eq!(pos, 0);
}
// `char_offset_iter` yields (byte offset, char) pairs in order.
#[test]
fn test_char_offset_iterator() {
    use iter::*;
    let text = "ศไทย中华Việt Nam";
    let offsets = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27];
    let chars = ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];

    let mut seen = 0;
    let mut pairs = text.char_offset_iter();
    for pair in pairs {
        assert_eq!(pair, (offsets[seen], chars[seen]));
        seen += 1;
    }
    assert_eq!(seen, chars.len());
    assert_eq!(seen, offsets.len());
}
// `char_offset_rev_iter` yields (byte offset, char) pairs from the end.
#[test]
fn test_char_offset_rev_iterator() {
    use iter::*;
    let text = "ศไทย中华Việt Nam";
    let offsets = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0];
    let chars = ['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];

    let mut seen = 0;
    let mut pairs = text.char_offset_rev_iter();
    for pair in pairs {
        assert_eq!(pair, (offsets[seen], chars[seen]));
        seen += 1;
    }
    assert_eq!(seen, chars.len());
    assert_eq!(seen, offsets.len());
}
// `split_iter` / `rsplit_iter` with a char and with a predicate, for an
// ASCII and a non-ASCII separator.
#[test]
fn test_split_char_iterator() {
    let text = "\nMäry häd ä little lämb\nLittle lämb\n";

    // ASCII separator, as a char.
    let fwd: ~[&str] = text.split_iter(' ').collect();
    assert_eq!( fwd, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);

    let mut bwd: ~[&str] = text.rsplit_iter(' ').collect();
    bwd.reverse();
    assert_eq!(bwd, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);

    // ASCII separator, as a predicate.
    let fwd: ~[&str] = text.split_iter(|c: char| c == ' ').collect();
    assert_eq!( fwd, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);

    let mut bwd: ~[&str] = text.rsplit_iter(|c: char| c == ' ').collect();
    bwd.reverse();
    assert_eq!(bwd, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);

    // Unicode
    let fwd: ~[&str] = text.split_iter('ä').collect();
    assert_eq!( fwd, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);

    let mut bwd: ~[&str] = text.rsplit_iter('ä').collect();
    bwd.reverse();
    assert_eq!(bwd, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);

    let fwd: ~[&str] = text.split_iter(|c: char| c == 'ä').collect();
    assert_eq!( fwd, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);

    let mut bwd: ~[&str] = text.rsplit_iter(|c: char| c == 'ä').collect();
    bwd.reverse();
    assert_eq!(bwd, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
}
// `splitn_iter` with a count of 3 makes three splits and leaves the rest
// of the text as the final piece.
#[test]
fn test_splitn_char_iterator() {
    let text = "\nMäry häd ä little lämb\nLittle lämb\n";

    let pieces: ~[&str] = text.splitn_iter(' ', 3).collect();
    assert_eq!(pieces, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);

    let pieces: ~[&str] = text.splitn_iter(|c: char| c == ' ', 3).collect();
    assert_eq!(pieces, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);

    // Unicode
    let pieces: ~[&str] = text.splitn_iter('ä', 3).collect();
    assert_eq!(pieces, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);

    let pieces: ~[&str] = text.splitn_iter(|c: char| c == 'ä', 3).collect();
    assert_eq!(pieces, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
}
// `rsplitn_iter` with a count of 3: pieces are collected from the end of the
// string and reversed, so the unsplit remainder shows up first.
#[test]
fn test_rsplitn_char_iterator() {
    let text = "\nMäry häd ä little lämb\nLittle lämb\n";

    // ASCII separator: as a char, then as a predicate.
    let mut pieces: ~[&str] = text.rsplitn_iter(' ', 3).collect();
    pieces.reverse();
    assert_eq!(pieces, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);

    let mut pieces: ~[&str] = text.rsplitn_iter(|ch: char| ch == ' ', 3).collect();
    pieces.reverse();
    assert_eq!(pieces, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);

    // Unicode separator: as a char, then as a predicate.
    let mut pieces: ~[&str] = text.rsplitn_iter('ä', 3).collect();
    pieces.reverse();
    assert_eq!(pieces, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);

    let mut pieces: ~[&str] = text.rsplitn_iter(|ch: char| ch == 'ä', 3).collect();
    pieces.reverse();
    assert_eq!(pieces, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
}
// Contrasts `split_iter`, which is expected to yield a final empty piece after
// a trailing separator, with `split_terminator_iter`, which is not.
#[test]
fn test_split_char_iterator_no_trailing() {
    let text = "\nMäry häd ä little lämb\nLittle lämb\n";

    let with_trailing: ~[&str] = text.split_iter('\n').collect();
    assert_eq!(with_trailing, ~["", "Märy häd ä little lämb", "Little lämb", ""]);

    let without_trailing: ~[&str] = text.split_terminator_iter('\n').collect();
    assert_eq!(without_trailing, ~["", "Märy häd ä little lämb", "Little lämb"]);
}
// Same expectations as the forward no-trailing test, but the iterators are
// consumed through `invert()` and the collected pieces are reversed back.
#[test]
fn test_rev_split_char_iterator_no_trailing() {
    let text = "\nMäry häd ä little lämb\nLittle lämb\n";

    let mut with_trailing: ~[&str] = text.split_iter('\n').invert().collect();
    with_trailing.reverse();
    assert_eq!(with_trailing, ~["", "Märy häd ä little lämb", "Little lämb", ""]);

    let mut without_trailing: ~[&str] = text.split_terminator_iter('\n').invert().collect();
    without_trailing.reverse();
    assert_eq!(without_trailing, ~["", "Märy häd ä little lämb", "Little lämb"]);
}
// `word_iter` on input mixing spaces, tabs and newlines: only the words
// themselves are expected, with no empty pieces.
#[test]
fn test_word_iter() {
    let text = "\n \tMäry häd\tä little lämb\nLittle lämb\n";
    let found: ~[&str] = text.word_iter().collect();
    assert_eq!(found, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"]);
}
// Table of inputs and the strings `nfd_iter` is expected to produce for them.
#[test]
fn test_nfd_iter() {
    // Collects the `nfd_iter` output of `input` and compares it to `expected`.
    fn check(input: &str, expected: ~str) {
        assert_eq!(input.nfd_iter().collect::<~str>(), expected);
    }

    check("abc", ~"abc");
    check("\u1e0b\u01c4", ~"d\u0307\u01c4");
    check("\u2026", ~"\u2026");
    check("\u2126", ~"\u03a9");
    check("\u1e0b\u0323", ~"d\u0323\u0307");
    check("\u1e0d\u0307", ~"d\u0323\u0307");
    check("a\u0301", ~"a\u0301");
    check("\u0301a", ~"\u0301a");
    check("\ud4db", ~"\u1111\u1171\u11b6");
    check("\uac1c", ~"\u1100\u1162");
}
// Table of inputs and the strings `nfkd_iter` is expected to produce for them.
// Uses the same inputs as the NFD test; the expectations differ for
// "\u1e0b\u01c4" and "\u2026".
#[test]
fn test_nfkd_iter() {
    // Collects the `nfkd_iter` output of `input` and compares it to `expected`.
    fn check(input: &str, expected: ~str) {
        assert_eq!(input.nfkd_iter().collect::<~str>(), expected);
    }

    check("abc", ~"abc");
    check("\u1e0b\u01c4", ~"d\u0307DZ\u030c");
    check("\u2026", ~"...");
    check("\u2126", ~"\u03a9");
    check("\u1e0b\u0323", ~"d\u0323\u0307");
    check("\u1e0d\u0307", ~"d\u0323\u0307");
    check("a\u0301", ~"a\u0301");
    check("\u0301a", ~"\u0301a");
    check("\ud4db", ~"\u1111\u1171\u11b6");
    check("\uac1c", ~"\u1100\u1162");
}
// `line_iter` is expected to give the same lines whether or not the input
// ends with a newline.
#[test]
fn test_line_iter() {
    let terminated = "\nMäry häd ä little lämb\n\nLittle lämb\n";
    let found: ~[&str] = terminated.line_iter().collect();
    assert_eq!(found, ~["", "Märy häd ä little lämb", "", "Little lämb"]);

    let unterminated = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
    let found: ~[&str] = unterminated.line_iter().collect();
    assert_eq!(found, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
}
// Splits on a string separator and checks the pieces, covering leading,
// trailing, absent and overlapping occurrences of the separator.
#[test]
fn test_split_str_iterator() {
    // Splits `haystack` on `needle` and compares the pieces with `expected`.
    fn check<'a>(haystack: &str, needle: &'a str, expected: ~[&str]) {
        let pieces: ~[&str] = haystack.split_str_iter(needle).collect();
        assert_eq!(pieces, expected);
    }

    // Separator does not occur.
    check("--1233345--", "12345", ~["--1233345--"]);

    // Separator in the middle, at the start, at the end, and at both ends.
    check("abc::hello::there", "::", ~["abc", "hello", "there"]);
    check("::hello::there", "::", ~["", "hello", "there"]);
    check("hello::there::", "::", ~["hello", "there", ""]);
    check("::hello::there::", "::", ~["", "hello", "there", ""]);

    // Multi-byte text and separator.
    check("ประเทศไทย中华Việt Nam", "中华", ~["ประเทศไทย", "Việt Nam"]);

    check("zzXXXzzYYYzz", "zz", ~["", "XXX", "YYY", ""]);
    check("zzXXXzYYYz", "XXX", ~["zz", "zYYYz"]);
    check(".XXX.YYY.", ".", ~["", "XXX", "YYY", ""]);

    // Degenerate inputs.
    check("", ".", ~[""]);
    check("zz", "zz", ~["",""]);
    check("ok", "z", ~["ok"]);

    // Runs of the separator character that could match at overlapping positions.
    check("zzz", "zz", ~["","z"]);
    check("zzzzz", "zz", ~["","","z"]);
}
2013-06-17 02:05:51 -05:00
#[test]
2013-08-10 08:38:00 -05:00
fn test_str_default() {
use default::Default;
fn t<S: Default + Str>() {
let s: S = Default::default();
2013-06-17 02:05:51 -05:00
assert_eq!(s.as_slice(), "");
}
t::<&str>();
t::<@str>();
t::<~str>();
}
// Exercises `Container::len` generically over the string types by summing the
// lengths of small slices of strings.
#[test]
fn test_str_container() {
    // Adds up `len()` over every element of `items`.
    fn sum_len<S: Container>(items: &[S]) -> uint {
        let mut total = 0u;
        for item in items.iter() {
            total += item.len();
        }
        total
    }

    assert_eq!(5, sum_len(["012", "", "34"]));
    assert_eq!(5, sum_len([@"01", @"2", @"34", @""]));
    assert_eq!(5, sum_len([~"01", ~"2", ~"34", ~""]));

    // A borrowed view of an owned string counts like any other slice.
    let owned = ~"01234";
    assert_eq!(5, sum_len([owned.as_slice()]));
}
2013-08-24 00:05:35 -05:00
#[test]
fn test_str_truncate() {
let mut s = ~"12345";
s.truncate(5);
assert_eq!(s.as_slice(), "12345");
s.truncate(3);
assert_eq!(s.as_slice(), "123");
s.truncate(0);
assert_eq!(s.as_slice(), "");
let mut s = ~"12345";
let p = s.as_imm_buf(|p,_| p);
s.truncate(3);
s.push_str("6");
let p_ = s.as_imm_buf(|p,_| p);
assert_eq!(p_, p);
}
// Truncating to a length greater than the string's length is expected to fail.
#[test]
#[should_fail]
fn test_str_truncate_invalid_len() {
    let mut five_bytes = ~"12345";
    five_bytes.truncate(6);
}
// Truncating in the middle of a multi-byte character is expected to fail.
#[test]
#[should_fail]
fn test_str_truncate_split_codepoint() {
    // U+00FC takes two bytes in UTF-8, so a length of 1 lands inside it.
    let mut u_umlaut = ~"\u00FC"; // ü
    u_umlaut.truncate(1);
}
// `from_utf8_slice` on valid input: ASCII first, then multi-byte text.
#[test]
fn test_str_from_utf8_slice() {
    let ascii = bytes!("hello");
    assert_eq!(from_utf8_slice(ascii), "hello");

    let multibyte = bytes!("ศไทย中华Việt Nam");
    assert_eq!(from_utf8_slice(multibyte), "ศไทย中华Việt Nam");
}
// `from_utf8_slice` is expected to fail on input that is not valid UTF-8.
#[test]
#[should_fail]
fn test_str_from_utf8_slice_invalid() {
    // A lone 0xff byte never appears in well-formed UTF-8.
    let malformed = bytes!("hello", 0xff);
    let _ = from_utf8_slice(malformed);
}
// `from_utf8_slice_opt`: `Some` for valid input, `None` for malformed input.
#[test]
fn test_str_from_utf8_slice_opt() {
    let ascii = bytes!("hello");
    assert_eq!(from_utf8_slice_opt(ascii), Some("hello"));

    let multibyte = bytes!("ศไทย中华Việt Nam");
    assert_eq!(from_utf8_slice_opt(multibyte), Some("ศไทย中华Việt Nam"));

    // Trailing 0xff makes the input invalid.
    let malformed = bytes!("hello", 0xff);
    assert_eq!(from_utf8_slice_opt(malformed), None);
}
// `from_utf8` on valid input: ASCII first, then multi-byte text.
#[test]
fn test_str_from_utf8() {
    let ascii = bytes!("hello");
    assert_eq!(from_utf8(ascii), ~"hello");

    let multibyte = bytes!("ศไทย中华Việt Nam");
    assert_eq!(from_utf8(multibyte), ~"ศไทย中华Việt Nam");
}
// `from_utf8_opt`: `Some` with an owned string for valid input, `None` for
// malformed input. Every case passes the `bytes!` slice directly; no owned
// copy of the input is needed because the function is called with borrowed
// slices in the other cases as well.
#[test]
fn test_str_from_utf8_opt() {
    let xs = bytes!("hello");
    assert_eq!(from_utf8_opt(xs), Some(~"hello"));

    let xs = bytes!("ศไทย中华Việt Nam");
    assert_eq!(from_utf8_opt(xs), Some(~"ศไทย中华Việt Nam"));

    // Trailing 0xff makes the input invalid.
    let xs = bytes!("hello", 0xff);
    assert_eq!(from_utf8_opt(xs), None);
}
// `from_utf8_owned` on valid owned byte vectors: ASCII, then multi-byte text.
#[test]
fn test_str_from_utf8_owned() {
    let ascii = bytes!("hello").to_owned();
    assert_eq!(from_utf8_owned(ascii), ~"hello");

    let multibyte = bytes!("ศไทย中华Việt Nam").to_owned();
    assert_eq!(from_utf8_owned(multibyte), ~"ศไทย中华Việt Nam");
}
// `from_utf8_owned_opt`: `Some` for valid owned input, `None` for malformed.
#[test]
fn test_str_from_utf8_owned_opt() {
    let ascii = bytes!("hello").to_owned();
    assert_eq!(from_utf8_owned_opt(ascii), Some(~"hello"));

    let multibyte = bytes!("ศไทย中华Việt Nam").to_owned();
    assert_eq!(from_utf8_owned_opt(multibyte), Some(~"ศไทย中华Việt Nam"));

    // Trailing 0xff makes the input invalid.
    let malformed = bytes!("hello", 0xff).to_owned();
    assert_eq!(from_utf8_owned_opt(malformed), None);
}
// The result of `to_send_str` on a string literal is compared against both
// the static and the owned `SendStr` variant holding the same text.
#[test]
fn test_to_send_str() {
    assert_eq!("abcde".to_send_str(), SendStrStatic("abcde"));
    // NOTE(review): this passes only if equality ignores which variant holds
    // the text -- confirm against the `Eq` impl for `SendStr`.
    assert_eq!("abcde".to_send_str(), SendStrOwned(~"abcde"));
}
}
2013-07-22 12:52:38 -05:00
#[cfg(test)]
mod bench {
use extra::test::BenchHarness;
use super::*;
use prelude::*;
#[bench]
fn char_iterator(bh: &mut BenchHarness) {
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
let len = s.char_len();
do bh.iter {
assert_eq!(s.iter().len(), len);
}
}
#[bench]
fn char_iterator_ascii(bh: &mut BenchHarness) {
let s = "Mary had a little lamb, Little lamb
Mary had a little lamb, Little lamb
Mary had a little lamb, Little lamb
Mary had a little lamb, Little lamb
Mary had a little lamb, Little lamb
Mary had a little lamb, Little lamb";
let len = s.char_len();
do bh.iter {
assert_eq!(s.iter().len(), len);
}
}
#[bench]
fn char_iterator_rev(bh: &mut BenchHarness) {
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
let len = s.char_len();
do bh.iter {
assert_eq!(s.rev_iter().len(), len);
}
}
#[bench]
fn char_offset_iterator(bh: &mut BenchHarness) {
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
let len = s.char_len();
do bh.iter {
assert_eq!(s.char_offset_iter().len(), len);
}
}
#[bench]
fn char_offset_iterator_rev(bh: &mut BenchHarness) {
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
let len = s.char_len();
do bh.iter {
assert_eq!(s.char_offset_rev_iter().len(), len);
}
}
2013-07-22 12:52:38 -05:00
#[bench]
fn split_iter_unicode_ascii(bh: &mut BenchHarness) {
let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
do bh.iter {
assert_eq!(s.split_iter('V').len(), 3);
}
}
#[bench]
fn split_iter_unicode_not_ascii(bh: &mut BenchHarness) {
struct NotAscii(char);
impl CharEq for NotAscii {
fn matches(&self, c: char) -> bool {
**self == c
}
fn only_ascii(&self) -> bool { false }
}
let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
do bh.iter {
assert_eq!(s.split_iter(NotAscii('V')).len(), 3);
}
}
#[bench]
fn split_iter_ascii(bh: &mut BenchHarness) {
let s = "Mary had a little lamb, Little lamb, little-lamb.";
let len = s.split_iter(' ').len();
do bh.iter {
assert_eq!(s.split_iter(' ').len(), len);
}
}
#[bench]
fn split_iter_not_ascii(bh: &mut BenchHarness) {
struct NotAscii(char);
impl CharEq for NotAscii {
#[inline]
fn matches(&self, c: char) -> bool { **self == c }
fn only_ascii(&self) -> bool { false }
}
let s = "Mary had a little lamb, Little lamb, little-lamb.";
let len = s.split_iter(' ').len();
do bh.iter {
assert_eq!(s.split_iter(NotAscii(' ')).len(), len);
}
}
#[bench]
fn split_iter_extern_fn(bh: &mut BenchHarness) {
let s = "Mary had a little lamb, Little lamb, little-lamb.";
let len = s.split_iter(' ').len();
fn pred(c: char) -> bool { c == ' ' }
do bh.iter {
assert_eq!(s.split_iter(pred).len(), len);
}
}
#[bench]
fn split_iter_closure(bh: &mut BenchHarness) {
let s = "Mary had a little lamb, Little lamb, little-lamb.";
let len = s.split_iter(' ').len();
do bh.iter {
assert_eq!(s.split_iter(|c: char| c == ' ').len(), len);
}
}
#[bench]
fn split_iter_slice(bh: &mut BenchHarness) {
let s = "Mary had a little lamb, Little lamb, little-lamb.";
let len = s.split_iter(' ').len();
do bh.iter {
assert_eq!(s.split_iter(&[' ']).len(), len);
}
}
2013-07-22 12:52:38 -05:00
#[bench]
fn is_utf8_100_ascii(bh: &mut BenchHarness) {
let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
Lorem ipsum dolor sit amet, consectetur. ");
assert_eq!(100, s.len());
do bh.iter {
is_utf8(s);
2013-07-22 12:52:38 -05:00
}
}
#[bench]
fn is_utf8_100_multibyte(bh: &mut BenchHarness) {
let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
assert_eq!(100, s.len());
do bh.iter {
is_utf8(s);
2013-07-22 12:52:38 -05:00
}
}
#[bench]
fn bench_with_capacity(bh: &mut BenchHarness) {
do bh.iter {
with_capacity(100);
}
}
#[bench]
fn bench_push_str(bh: &mut BenchHarness) {
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
do bh.iter {
let mut r = ~"";
r.push_str(s);
}
}
#[bench]
fn bench_connect(bh: &mut BenchHarness) {
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
let sep = "";
let v = [s, s, s, s, s, s, s, s, s, s];
do bh.iter {
assert_eq!(v.connect(sep).len(), s.len() * 10 + sep.len() * 9);
}
}
2013-07-22 12:52:38 -05:00
}