2014-04-02 18:54:22 -05:00
|
|
|
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
|
|
|
// file at the top-level directory of this distribution and at
|
|
|
|
// http://rust-lang.org/COPYRIGHT.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
|
|
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
|
|
|
// option. This file may not be copied, modified, or distributed
|
|
|
|
// except according to those terms.
|
2014-07-14 22:46:04 -05:00
|
|
|
//
|
|
|
|
// ignore-lexer-test FIXME #15679
|
2014-04-02 18:54:22 -05:00
|
|
|
|
|
|
|
//! An owned, growable string that enforces that its contents are valid UTF-8.
|
|
|
|
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 20:50:12 -05:00
|
|
|
use core::prelude::*;
|
|
|
|
|
2014-11-21 16:10:42 -06:00
|
|
|
use core::borrow::{Cow, IntoCow};
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 20:50:12 -05:00
|
|
|
use core::default::Default;
|
|
|
|
use core::fmt;
|
2014-12-12 20:43:07 -06:00
|
|
|
use core::hash;
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 20:50:12 -05:00
|
|
|
use core::mem;
|
|
|
|
use core::ptr;
|
2014-09-26 23:46:22 -05:00
|
|
|
use core::ops;
|
2014-08-18 10:29:44 -05:00
|
|
|
use core::raw::Slice as RawSlice;
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
use unicode::str as unicode_str;
|
|
|
|
use unicode::str::Utf16Item;
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 20:50:12 -05:00
|
|
|
|
2014-12-11 11:44:17 -06:00
|
|
|
use slice::CloneSliceExt;
|
2014-12-10 21:46:38 -06:00
|
|
|
use str::{mod, CharRange, FromStr, Utf8Error};
|
2014-08-23 19:26:53 -05:00
|
|
|
use vec::{DerefVec, Vec, as_vec};
|
2014-04-02 18:54:22 -05:00
|
|
|
|
2014-04-10 05:55:34 -05:00
|
|
|
/// A growable string stored as a UTF-8 encoded buffer.
|
2014-11-20 23:14:05 -06:00
|
|
|
#[deriving(Clone, PartialOrd, Eq, Ord)]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[stable]
|
2014-05-22 18:57:53 -05:00
|
|
|
pub struct String {
|
2014-04-02 18:54:22 -05:00
|
|
|
// Byte buffer backing the string; per the type's documentation it stores
// the contents as UTF-8 encoded data.
vec: Vec<u8>,
|
|
|
|
}
|
|
|
|
|
2014-05-22 18:57:53 -05:00
|
|
|
impl String {
|
2014-04-20 23:49:39 -05:00
|
|
|
/// Creates a new string buffer initialized with the empty string.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::new();
|
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[stable]
|
2014-05-22 18:57:53 -05:00
|
|
|
pub fn new() -> String {
|
|
|
|
String {
|
2014-04-02 18:54:22 -05:00
|
|
|
// Begin with an empty byte vector, so the resulting string is the empty string.
vec: Vec::new(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a new string buffer with the given capacity.
|
2014-07-27 05:40:39 -05:00
|
|
|
/// The string will be able to hold exactly `capacity` bytes without
|
|
|
|
/// reallocating. If `capacity` is 0, the string will not allocate.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::with_capacity(10);
|
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following methods have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[stable]
|
2014-05-22 18:57:53 -05:00
|
|
|
pub fn with_capacity(capacity: uint) -> String {
|
|
|
|
String {
|
2014-04-02 18:54:22 -05:00
|
|
|
vec: Vec::with_capacity(capacity),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a new string buffer from the given string.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::from_str("hello");
|
|
|
|
/// assert_eq!(s.as_slice(), "hello");
|
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following methods have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[experimental = "needs investigation to see if to_string() can match perf"]
|
2014-05-22 18:57:53 -05:00
|
|
|
pub fn from_str(string: &str) -> String {
|
2014-09-17 14:56:31 -05:00
|
|
|
String { vec: string.as_bytes().to_vec() }
|
2014-04-02 18:54:22 -05:00
|
|
|
}
|
|
|
|
|
2014-05-14 18:55:24 -05:00
|
|
|
/// Returns the vector as a string buffer, if possible, taking care not to
|
|
|
|
/// copy it.
|
|
|
|
///
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
/// # Failure
|
|
|
|
///
|
|
|
|
/// If the given vector is not valid UTF-8, then the original vector and the
|
|
|
|
/// corresponding error are returned.
|
2014-06-30 09:41:30 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-06-30 09:41:30 -05:00
|
|
|
///
|
|
|
|
/// ```rust
|
2014-12-10 21:46:38 -06:00
|
|
|
/// # #![allow(deprecated)]
|
|
|
|
/// use std::str::Utf8Error;
|
|
|
|
///
|
2014-06-30 09:41:30 -05:00
|
|
|
/// let hello_vec = vec![104, 101, 108, 108, 111];
|
2014-07-27 05:40:39 -05:00
|
|
|
/// let s = String::from_utf8(hello_vec);
|
|
|
|
/// assert_eq!(s, Ok("hello".to_string()));
|
|
|
|
///
|
|
|
|
/// let invalid_vec = vec![240, 144, 128];
|
|
|
|
/// let s = String::from_utf8(invalid_vec);
|
2014-12-10 21:46:38 -06:00
|
|
|
/// assert_eq!(s, Err((vec![240, 144, 128], Utf8Error::TooShort)));
|
2014-06-30 09:41:30 -05:00
|
|
|
/// ```
|
2014-04-10 05:55:34 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following methods have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[unstable = "error type may change"]
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
pub fn from_utf8(vec: Vec<u8>) -> Result<String, (Vec<u8>, Utf8Error)> {
|
|
|
|
match str::from_utf8(vec.as_slice()) {
|
|
|
|
Ok(..) => Ok(String { vec: vec }),
|
|
|
|
Err(e) => Err((vec, e))
|
2014-04-10 05:55:34 -05:00
|
|
|
}
|
|
|
|
}
|
2014-07-10 11:21:16 -05:00
|
|
|
|
2014-08-04 05:48:39 -05:00
|
|
|
/// Converts a vector of bytes to a new UTF-8 string.
|
|
|
|
/// Any invalid UTF-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
|
2014-07-10 11:21:16 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-10 11:21:16 -05:00
|
|
|
///
|
|
|
|
/// ```rust
|
|
|
|
/// let input = b"Hello \xF0\x90\x80World";
|
2014-07-04 15:38:13 -05:00
|
|
|
/// let output = String::from_utf8_lossy(input);
|
2014-12-09 16:08:10 -06:00
|
|
|
/// assert_eq!(output.as_slice(), "Hello \u{FFFD}World");
|
2014-07-10 11:21:16 -05:00
|
|
|
/// ```
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following methods have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[unstable = "return type may change"]
|
2014-11-21 16:10:42 -06:00
|
|
|
pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> CowString<'a> {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExit
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
match str::from_utf8(v) {
|
|
|
|
Ok(s) => return Cow::Borrowed(s),
|
|
|
|
Err(..) => {}
|
2014-07-10 11:21:16 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
static TAG_CONT_U8: u8 = 128u8;
|
|
|
|
static REPLACEMENT: &'static [u8] = b"\xEF\xBF\xBD"; // U+FFFD in UTF-8
|
|
|
|
let mut i = 0;
|
|
|
|
let total = v.len();
|
|
|
|
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
|
2014-08-06 22:08:16 -05:00
|
|
|
unsafe { *xs.unsafe_get(i) }
|
2014-07-10 11:21:16 -05:00
|
|
|
}
|
|
|
|
fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
|
|
|
|
if i >= total {
|
|
|
|
0
|
|
|
|
} else {
|
|
|
|
unsafe_get(xs, i)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut res = String::with_capacity(total);
|
|
|
|
|
|
|
|
if i > 0 {
|
|
|
|
unsafe {
|
2014-09-24 06:41:09 -05:00
|
|
|
res.as_mut_vec().push_all(v[..i])
|
2014-07-10 11:21:16 -05:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
// subseqidx is the index of the first byte of the subsequence we're looking at.
|
|
|
|
// It's used to copy a bunch of contiguous good codepoints at once instead of copying
|
|
|
|
// them one by one.
|
|
|
|
let mut subseqidx = 0;
|
|
|
|
|
|
|
|
while i < total {
|
|
|
|
let i_ = i;
|
|
|
|
let byte = unsafe_get(v, i);
|
|
|
|
i += 1;
|
|
|
|
|
|
|
|
macro_rules! error(() => ({
|
|
|
|
unsafe {
|
|
|
|
if subseqidx != i_ {
|
2014-09-24 06:41:09 -05:00
|
|
|
res.as_mut_vec().push_all(v[subseqidx..i_]);
|
2014-07-10 11:21:16 -05:00
|
|
|
}
|
|
|
|
subseqidx = i;
|
2014-09-22 10:28:35 -05:00
|
|
|
res.as_mut_vec().push_all(REPLACEMENT);
|
2014-07-10 11:21:16 -05:00
|
|
|
}
|
2014-11-14 11:18:10 -06:00
|
|
|
}));
|
2014-07-10 11:21:16 -05:00
|
|
|
|
|
|
|
if byte < 128u8 {
|
|
|
|
// subseqidx handles this
|
|
|
|
} else {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExit
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
let w = unicode_str::utf8_char_width(byte);
|
2014-07-10 11:21:16 -05:00
|
|
|
|
|
|
|
match w {
|
|
|
|
2 => {
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
3 => {
|
|
|
|
match (byte, safe_get(v, i, total)) {
|
2014-09-26 23:13:20 -05:00
|
|
|
(0xE0 , 0xA0 ... 0xBF) => (),
|
|
|
|
(0xE1 ... 0xEC, 0x80 ... 0xBF) => (),
|
|
|
|
(0xED , 0x80 ... 0x9F) => (),
|
|
|
|
(0xEE ... 0xEF, 0x80 ... 0xBF) => (),
|
2014-07-10 11:21:16 -05:00
|
|
|
_ => {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
4 => {
|
|
|
|
match (byte, safe_get(v, i, total)) {
|
2014-09-26 23:13:20 -05:00
|
|
|
(0xF0 , 0x90 ... 0xBF) => (),
|
|
|
|
(0xF1 ... 0xF3, 0x80 ... 0xBF) => (),
|
|
|
|
(0xF4 , 0x80 ... 0x8F) => (),
|
2014-07-10 11:21:16 -05:00
|
|
|
_ => {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
_ => {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if subseqidx < total {
|
|
|
|
unsafe {
|
2014-09-24 06:41:09 -05:00
|
|
|
res.as_mut_vec().push_all(v[subseqidx..total])
|
2014-07-10 11:21:16 -05:00
|
|
|
};
|
|
|
|
}
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExit
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
Cow::Owned(res)
|
2014-07-10 11:21:16 -05:00
|
|
|
}
|
|
|
|
|
2014-07-04 15:38:13 -05:00
|
|
|
/// Decode a UTF-16 encoded vector `v` into a `String`, returning `None`
|
2014-07-10 10:43:03 -05:00
|
|
|
/// if `v` contains any invalid data.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-10 10:43:03 -05:00
|
|
|
///
|
|
|
|
/// ```rust
|
2014-07-04 15:38:13 -05:00
|
|
|
/// // 𝄞music
|
2014-11-17 02:39:01 -06:00
|
|
|
/// let mut v = &mut [0xD834, 0xDD1E, 0x006d, 0x0075,
|
|
|
|
/// 0x0073, 0x0069, 0x0063];
|
2014-07-04 15:38:13 -05:00
|
|
|
/// assert_eq!(String::from_utf16(v), Some("𝄞music".to_string()));
|
2014-07-10 10:43:03 -05:00
|
|
|
///
|
2014-07-04 15:38:13 -05:00
|
|
|
/// // 𝄞mu<invalid>ic
|
2014-07-10 10:43:03 -05:00
|
|
|
/// v[4] = 0xD800;
|
|
|
|
/// assert_eq!(String::from_utf16(v), None);
|
|
|
|
/// ```
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[unstable = "error value in return may change"]
|
2014-07-10 10:43:03 -05:00
|
|
|
pub fn from_utf16(v: &[u16]) -> Option<String> {
|
2014-10-03 15:20:04 -05:00
|
|
|
let mut s = String::with_capacity(v.len());
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExit
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
for c in unicode_str::utf16_items(v) {
|
2014-07-10 10:43:03 -05:00
|
|
|
match c {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExit
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
Utf16Item::ScalarValue(c) => s.push(c),
|
|
|
|
Utf16Item::LoneSurrogate(_) => return None
|
2014-07-10 10:43:03 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
Some(s)
|
|
|
|
}
|
2014-07-10 11:21:16 -05:00
|
|
|
|
2014-07-10 10:53:51 -05:00
|
|
|
/// Decode a UTF-16 encoded vector `v` into a string, replacing
|
|
|
|
/// invalid data with the replacement character (U+FFFD).
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
|
|
|
///
|
2014-07-10 10:53:51 -05:00
|
|
|
/// ```rust
|
2014-07-04 15:38:13 -05:00
|
|
|
/// // 𝄞mus<invalid>ic<invalid>
|
2014-11-17 02:39:01 -06:00
|
|
|
/// let v = &[0xD834, 0xDD1E, 0x006d, 0x0075,
|
|
|
|
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
|
|
|
|
/// 0xD834];
|
2014-07-10 10:53:51 -05:00
|
|
|
///
|
|
|
|
/// assert_eq!(String::from_utf16_lossy(v),
|
2014-12-09 16:08:10 -06:00
|
|
|
/// "𝄞mus\u{FFFD}ic\u{FFFD}".to_string());
|
2014-07-10 10:53:51 -05:00
|
|
|
/// ```
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[stable]
|
2014-07-10 10:53:51 -05:00
|
|
|
pub fn from_utf16_lossy(v: &[u16]) -> String {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExit
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
// Walk the UTF-16 items of `v`, turn each one into a `char` with
// `to_char_lossy()`, and collect the chars into the returned `String`.
// Never fails: there is no `Option`/`Result` in the return type.
unicode_str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
|
2014-07-10 10:53:51 -05:00
|
|
|
}
|
2014-07-04 15:18:11 -05:00
|
|
|
|
2014-08-04 05:48:39 -05:00
|
|
|
/// Convert a vector of `char`s to a `String`.
|
2014-07-04 14:55:58 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-04 14:55:58 -05:00
|
|
|
///
|
|
|
|
/// ```rust
|
2014-11-17 02:39:01 -06:00
|
|
|
/// let chars = &['h', 'e', 'l', 'l', 'o'];
|
2014-07-27 05:40:39 -05:00
|
|
|
/// let s = String::from_chars(chars);
|
|
|
|
/// assert_eq!(s.as_slice(), "hello");
|
2014-07-04 14:55:58 -05:00
|
|
|
/// ```
|
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[unstable = "may be removed in favor of .collect()"]
|
2014-07-04 14:55:58 -05:00
|
|
|
pub fn from_chars(chs: &[char]) -> String {
|
|
|
|
// Copy each `char` out of the borrowed slice (the closure dereferences the
// `&char` the slice iterator yields) and collect them, in order, into a
// new `String`. The input slice is left untouched.
chs.iter().map(|c| *c).collect()
|
|
|
|
}
|
2014-04-10 05:55:34 -05:00
|
|
|
|
2014-11-20 12:11:15 -06:00
|
|
|
/// Creates a new `String` from a length, capacity, and pointer.
|
|
|
|
///
|
|
|
|
/// This is unsafe because:
|
|
|
|
/// * We call `Vec::from_raw_parts` to get a `Vec<u8>`;
|
|
|
|
/// * We assume that the `Vec` contains valid UTF-8.
///
/// `buf`, `length` and `capacity` are forwarded unchanged, in that order, to
/// `Vec::from_raw_parts`, and the resulting vector becomes the string's
/// backing buffer. Nothing here validates the pointer or the bytes, so the
/// caller must meet whatever `Vec::from_raw_parts` requires of its arguments.
|
|
|
|
#[inline]
|
|
|
|
#[unstable = "function just moved from string::raw"]
|
|
|
|
pub unsafe fn from_raw_parts(buf: *mut u8, length: uint, capacity: uint) -> String {
|
|
|
|
String {
|
|
|
|
// The rebuilt vector is stored directly; no UTF-8 check is performed.
vec: Vec::from_raw_parts(buf, length, capacity),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a `String` from a null-terminated `*const u8` buffer.
|
|
|
|
///
|
|
|
|
/// This function is unsafe because we dereference memory until we find the
|
|
|
|
/// NUL character, which is not guaranteed to be present. Additionally, the
|
|
|
|
/// slice is not checked to see whether it contains valid UTF-8.
///
/// The result is an owned `String` built with `String::from_str` from the
/// string slice that `str::from_c_str` returns for `buf`.
|
|
|
|
#[unstable = "just renamed from `mod raw`"]
|
|
|
|
pub unsafe fn from_raw_buf(buf: *const u8) -> String {
|
|
|
|
// `str::from_c_str` is handed a `*const i8`, so the byte pointer is cast
// first; the cast only changes the pointee type, not the address.
String::from_str(str::from_c_str(buf as *const i8))
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a `String` from a `*const u8` buffer of the given length.
|
|
|
|
///
|
|
|
|
/// This function is unsafe because it blindly assumes the validity of the
|
|
|
|
/// pointer `buf` for `len` bytes of memory. This function will copy the
|
|
|
|
/// memory from `buf` into a new allocation (owned by the returned
|
|
|
|
/// `String`).
|
|
|
|
///
|
|
|
|
/// This function is also unsafe because it does not validate that the
|
|
|
|
/// buffer is valid UTF-8 encoded data.
|
|
|
|
#[unstable = "just renamed from `mod raw`"]
|
|
|
|
pub unsafe fn from_raw_buf_len(buf: *const u8, len: uint) -> String {
|
|
|
|
// Two steps: `Vec::from_raw_buf` produces a `Vec<u8>` from the raw
// pointer and length, then `from_utf8_unchecked` wraps that vector as a
// `String` without inspecting its contents.
String::from_utf8_unchecked(Vec::from_raw_buf(buf, len))
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Converts a vector of bytes to a new `String` without checking if
|
|
|
|
/// it contains valid UTF-8. This is unsafe because it assumes that
|
|
|
|
/// the UTF-8-ness of the vector has already been validated.
///
/// The vector is taken by value and becomes the string's backing buffer.
|
|
|
|
#[inline]
|
|
|
|
#[unstable = "awaiting stabilization"]
|
|
|
|
pub unsafe fn from_utf8_unchecked(bytes: Vec<u8>) -> String {
|
|
|
|
// Plain wrap: `bytes` is moved into the `vec` field as-is, with no
// validation pass over its contents.
String { vec: bytes }
|
|
|
|
}
|
|
|
|
|
2014-04-10 05:55:34 -05:00
|
|
|
/// Return the underlying byte buffer, encoded as UTF-8.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::from_str("hello");
|
|
|
|
/// let bytes = s.into_bytes();
|
|
|
|
/// assert_eq!(bytes, vec![104, 101, 108, 108, 111]);
|
|
|
|
/// ```
|
2014-04-10 05:55:34 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[stable]
|
2014-04-10 05:55:34 -05:00
|
|
|
pub fn into_bytes(self) -> Vec<u8> {
|
|
|
|
// `self` is taken by value, so the backing `Vec<u8>` is moved out to the
// caller and the `String` itself is consumed.
self.vec
|
|
|
|
}
|
|
|
|
|
2014-04-02 18:54:22 -05:00
|
|
|
/// Creates a string buffer by repeating a character `length` times.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::from_char(5, 'a');
|
|
|
|
/// assert_eq!(s.as_slice(), "aaaaa");
|
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[unstable = "may be replaced with iterators, questionable usability, and \
|
|
|
|
the name may change"]
|
2014-05-22 18:57:53 -05:00
|
|
|
pub fn from_char(length: uint, ch: char) -> String {
|
2014-04-02 18:54:22 -05:00
|
|
|
// Zero repetitions: return an empty string. This guard also keeps the
// `length - 1` computed below from being evaluated with `length == 0`.
if length == 0 {
|
2014-05-22 18:57:53 -05:00
|
|
|
return String::new()
|
2014-04-02 18:54:22 -05:00
|
|
|
}
|
|
|
|
|
2014-05-22 18:57:53 -05:00
|
|
|
let mut buf = String::new();
|
2014-09-22 10:28:35 -05:00
|
|
|
// Push the first copy up front so that `buf.len()` below reports how much
// space a single `ch` occupies in the buffer.
buf.push(ch);
|
2014-11-06 11:24:47 -06:00
|
|
|
// Space needed for the remaining `length - 1` copies.
// NOTE(review): this multiplication is not checked for overflow — confirm
// whether that matters for very large `length`.
let size = buf.len() * (length - 1);
|
|
|
|
|
// Presumably `reserve_exact` takes the *additional* amount to reserve, so
// the pushes in the loop below need not reallocate — TODO confirm.
buf.reserve_exact(size);
|
2014-04-02 18:54:22 -05:00
|
|
|
// Start at 1: the first copy was already pushed above.
for _ in range(1, length) {
|
2014-09-22 10:28:35 -05:00
|
|
|
buf.push(ch)
|
2014-04-02 18:54:22 -05:00
|
|
|
}
|
|
|
|
|
buf
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Pushes the given string onto this string buffer.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
|
|
|
/// s.push_str("bar");
|
|
|
|
/// assert_eq!(s.as_slice(), "foobar");
|
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[unstable = "extra variants of `push`, could possibly be based on iterators"]
|
2014-04-02 18:54:22 -05:00
|
|
|
pub fn push_str(&mut self, string: &str) {
|
|
|
|
self.vec.push_all(string.as_bytes())
|
|
|
|
}
|
|
|
|
|
2014-08-04 05:48:39 -05:00
|
|
|
/// Pushes `ch` onto the given string `count` times.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
|
|
|
/// s.grow(5, 'Z');
|
|
|
|
/// assert_eq!(s.as_slice(), "fooZZZZZ");
|
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[unstable = "duplicate of iterator-based functionality"]
|
2014-04-02 18:54:22 -05:00
|
|
|
pub fn grow(&mut self, count: uint, ch: char) {
    // Append `ch` one copy at a time until `count` copies have been pushed.
    let mut pushed = 0;
    while pushed < count {
        self.push(ch);
        pushed += 1;
    }
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
/// Returns the number of bytes that this string buffer can hold without reallocating.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::with_capacity(10);
|
2014-09-22 10:28:35 -05:00
|
|
|
/// assert!(s.capacity() >= 10);
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
/// ```
|
|
|
|
#[inline]
|
2014-11-06 11:24:47 -06:00
|
|
|
#[unstable = "matches collection reform specification, waiting for dust to settle"]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
pub fn capacity(&self) -> uint {
|
|
|
|
self.vec.capacity()
|
|
|
|
}
|
|
|
|
|
2014-11-06 11:24:47 -06:00
|
|
|
/// Deprecated: Renamed to `reserve`.
|
|
|
|
#[deprecated = "Renamed to `reserve`"]
|
2014-04-02 18:54:22 -05:00
|
|
|
pub fn reserve_additional(&mut self, extra: uint) {
|
2014-11-06 11:24:47 -06:00
|
|
|
self.vec.reserve(extra)
|
2014-04-02 18:54:22 -05:00
|
|
|
}
|
|
|
|
|
2014-11-06 11:24:47 -06:00
|
|
|
/// Reserves capacity for at least `additional` more bytes to be inserted in the given
|
|
|
|
/// `String`. The collection may reserve more space to avoid frequent reallocations.
|
|
|
|
///
|
|
|
|
/// # Panics
|
|
|
|
///
|
|
|
|
/// Panics if the new capacity overflows `uint`.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::new();
|
|
|
|
/// s.reserve(10);
|
2014-09-22 10:28:35 -05:00
|
|
|
/// assert!(s.capacity() >= 10);
|
2014-07-27 05:40:39 -05:00
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
2014-11-06 11:24:47 -06:00
|
|
|
#[unstable = "matches collection reform specification, waiting for dust to settle"]
|
|
|
|
pub fn reserve(&mut self, additional: uint) {
|
|
|
|
self.vec.reserve(additional)
|
2014-04-02 18:54:22 -05:00
|
|
|
}
|
|
|
|
|
2014-11-06 11:24:47 -06:00
|
|
|
/// Reserves the minimum capacity for exactly `additional` more bytes to be inserted in the
|
|
|
|
/// given `String`. Does nothing if the capacity is already sufficient.
|
|
|
|
///
|
|
|
|
/// Note that the allocator may give the collection more space than it requests. Therefore
|
|
|
|
/// capacity can not be relied upon to be precisely minimal. Prefer `reserve` if future
|
|
|
|
/// insertions are expected.
|
|
|
|
///
|
|
|
|
/// # Panics
|
|
|
|
///
|
|
|
|
/// Panics if the new capacity overflows `uint`.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::new();
|
2014-11-06 11:24:47 -06:00
|
|
|
/// s.reserve_exact(10);
|
|
|
|
/// assert!(s.capacity() >= 10);
|
2014-07-27 05:40:39 -05:00
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
2014-11-06 11:24:47 -06:00
|
|
|
#[unstable = "matches collection reform specification, waiting for dust to settle"]
|
|
|
|
pub fn reserve_exact(&mut self, additional: uint) {
|
|
|
|
self.vec.reserve_exact(additional)
|
2014-04-02 18:54:22 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Shrinks the capacity of this string buffer to match its length.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
|
|
|
/// s.reserve(100);
|
2014-09-22 10:28:35 -05:00
|
|
|
/// assert!(s.capacity() >= 100);
|
2014-07-27 05:40:39 -05:00
|
|
|
/// s.shrink_to_fit();
|
2014-09-22 10:28:35 -05:00
|
|
|
/// assert_eq!(s.capacity(), 3);
|
2014-07-27 05:40:39 -05:00
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
2014-11-06 11:24:47 -06:00
|
|
|
#[unstable = "matches collection reform specification, waiting for dust to settle"]
|
2014-04-02 18:54:22 -05:00
|
|
|
pub fn shrink_to_fit(&mut self) {
|
|
|
|
self.vec.shrink_to_fit()
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Adds the given character to the end of the string.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("abc");
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
/// s.push('1');
|
|
|
|
/// s.push('2');
|
|
|
|
/// s.push('3');
|
2014-07-27 05:40:39 -05:00
|
|
|
/// assert_eq!(s.as_slice(), "abc123");
|
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
2014-09-26 19:37:28 -05:00
|
|
|
#[stable = "function just renamed from push_char"]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
pub fn push(&mut self, ch: char) {
|
2014-12-20 11:17:58 -06:00
|
|
|
if (ch as u32) < 0x80 {
|
|
|
|
self.vec.push(ch as u8);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2014-04-02 18:54:22 -05:00
|
|
|
let cur_len = self.len();
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 20:50:12 -05:00
|
|
|
// This may use up to 4 bytes.
|
2014-11-06 11:24:47 -06:00
|
|
|
self.vec.reserve(4);
|
2014-04-02 18:54:22 -05:00
|
|
|
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 20:50:12 -05:00
|
|
|
unsafe {
|
2014-04-02 18:54:22 -05:00
|
|
|
// Attempt to not use an intermediate buffer by just pushing bytes
|
|
|
|
// directly onto this string.
|
2014-08-06 22:03:55 -05:00
|
|
|
let slice = RawSlice {
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 20:50:12 -05:00
|
|
|
data: self.vec.as_ptr().offset(cur_len as int),
|
|
|
|
len: 4,
|
|
|
|
};
|
2014-08-13 18:02:31 -05:00
|
|
|
let used = ch.encode_utf8(mem::transmute(slice)).unwrap_or(0);
|
2014-04-02 18:54:22 -05:00
|
|
|
self.vec.set_len(cur_len + used);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Works with the underlying buffer as a byte slice.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::from_str("hello");
|
2014-08-06 04:59:40 -05:00
|
|
|
/// let b: &[_] = &[104, 101, 108, 108, 111];
|
|
|
|
/// assert_eq!(s.as_bytes(), b);
|
2014-07-27 05:40:39 -05:00
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[stable]
|
2014-04-02 18:54:22 -05:00
|
|
|
pub fn as_bytes<'a>(&'a self) -> &'a [u8] {
|
|
|
|
self.vec.as_slice()
|
|
|
|
}
|
|
|
|
|
2014-08-04 05:48:39 -05:00
|
|
|
/// Shortens a string to the specified length.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-11-11 12:36:09 -06:00
|
|
|
/// # Panics
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-11-11 12:36:09 -06:00
|
|
|
/// Panics if `new_len` > current length,
|
2014-10-05 06:15:59 -05:00
|
|
|
/// or if `new_len` is not a character boundary.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("hello");
|
|
|
|
/// s.truncate(2);
|
|
|
|
/// assert_eq!(s.as_slice(), "he");
|
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
2014-10-09 14:17:22 -05:00
|
|
|
#[unstable = "the panic conventions for strings are under development"]
|
2014-10-05 06:15:59 -05:00
|
|
|
pub fn truncate(&mut self, new_len: uint) {
    // This assert is the only guard in this function: cutting the byte
    // vector anywhere other than a character boundary would leave a
    // partial UTF-8 sequence at the end of the string.
    assert!(self.is_char_boundary(new_len));

    let bytes = &mut self.vec;
    bytes.truncate(new_len);
}
|
|
|
|
|
2014-07-27 05:40:39 -05:00
|
|
|
/// Removes the last character from the string buffer and returns it.
|
|
|
|
/// Returns `None` if this string buffer is empty.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
/// assert_eq!(s.pop(), Some('o'));
|
|
|
|
/// assert_eq!(s.pop(), Some('o'));
|
|
|
|
/// assert_eq!(s.pop(), Some('f'));
|
|
|
|
/// assert_eq!(s.pop(), None);
|
2014-07-27 05:40:39 -05:00
|
|
|
/// ```
|
2014-05-08 15:42:40 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[unstable = "this function was just renamed from pop_char"]
|
|
|
|
pub fn pop(&mut self) -> Option<char> {
|
2014-05-08 15:42:40 -05:00
|
|
|
let len = self.len();
|
|
|
|
if len == 0 {
|
|
|
|
return None
|
|
|
|
}
|
|
|
|
|
2014-11-27 10:45:50 -06:00
|
|
|
let CharRange {ch, next} = self.char_range_at_reverse(len);
|
2014-05-08 15:42:40 -05:00
|
|
|
unsafe {
|
|
|
|
self.vec.set_len(next);
|
|
|
|
}
|
|
|
|
Some(ch)
|
|
|
|
}
|
|
|
|
|
2014-09-22 10:24:14 -05:00
|
|
|
/// Removes the character from the string buffer at byte position `idx` and
|
|
|
|
/// returns it. Returns `None` if `idx` is out of bounds.
|
2014-05-08 15:42:40 -05:00
|
|
|
///
|
|
|
|
/// # Warning
|
|
|
|
///
|
2014-11-16 10:28:13 -06:00
|
|
|
/// This is an O(n) operation as it requires copying every element in the
|
2014-09-22 10:24:14 -05:00
|
|
|
/// buffer.
|
|
|
|
///
|
2014-10-09 14:17:22 -05:00
|
|
|
/// # Panics
|
2014-09-22 10:24:14 -05:00
|
|
|
///
|
|
|
|
/// If `idx` does not lie on a character boundary, then this function will
|
2014-10-09 14:17:22 -05:00
|
|
|
/// panic.
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
2014-09-22 10:24:14 -05:00
|
|
|
/// assert_eq!(s.remove(0), Some('f'));
|
|
|
|
/// assert_eq!(s.remove(1), Some('o'));
|
|
|
|
/// assert_eq!(s.remove(0), Some('o'));
|
|
|
|
/// assert_eq!(s.remove(0), None);
|
2014-07-27 05:40:39 -05:00
|
|
|
/// ```
|
2014-10-09 14:17:22 -05:00
|
|
|
#[unstable = "the panic semantics of this function and return type \
|
2014-09-22 10:24:14 -05:00
|
|
|
may change"]
|
|
|
|
pub fn remove(&mut self, idx: uint) -> Option<char> {
|
2014-04-02 18:54:22 -05:00
|
|
|
let len = self.len();
|
2014-09-22 10:24:14 -05:00
|
|
|
if idx >= len { return None }
|
2014-04-02 18:54:22 -05:00
|
|
|
|
2014-11-27 10:45:50 -06:00
|
|
|
let CharRange { ch, next } = self.char_range_at(idx);
|
2014-05-08 15:42:40 -05:00
|
|
|
unsafe {
|
2014-09-22 10:24:14 -05:00
|
|
|
ptr::copy_memory(self.vec.as_mut_ptr().offset(idx as int),
|
|
|
|
self.vec.as_ptr().offset(next as int),
|
|
|
|
len - next);
|
|
|
|
self.vec.set_len(len - (next - idx));
|
2014-05-08 15:42:40 -05:00
|
|
|
}
|
|
|
|
Some(ch)
|
2014-04-02 18:54:22 -05:00
|
|
|
}
|
2014-04-12 07:44:31 -05:00
|
|
|
|
2014-09-22 10:24:14 -05:00
|
|
|
/// Insert a character into the string buffer at byte position `idx`.
|
|
|
|
///
|
|
|
|
/// # Warning
|
|
|
|
///
|
2014-11-16 10:28:13 -06:00
|
|
|
/// This is an O(n) operation as it requires copying every element in the
|
2014-09-22 10:24:14 -05:00
|
|
|
/// buffer.
|
|
|
|
///
|
2014-10-09 14:17:22 -05:00
|
|
|
/// # Panics
|
2014-09-22 10:24:14 -05:00
|
|
|
///
|
|
|
|
/// If `idx` does not lie on a character boundary or is out of bounds, then
|
2014-10-09 14:17:22 -05:00
|
|
|
/// this function will panic.
|
|
|
|
#[unstable = "the panic semantics of this function are uncertain"]
|
2014-09-22 10:24:14 -05:00
|
|
|
pub fn insert(&mut self, idx: uint, ch: char) {
|
|
|
|
let len = self.len();
|
|
|
|
assert!(idx <= len);
|
2014-11-27 10:45:50 -06:00
|
|
|
assert!(self.is_char_boundary(idx));
|
2014-11-06 11:24:47 -06:00
|
|
|
self.vec.reserve(4);
|
2014-09-22 10:24:14 -05:00
|
|
|
let mut bits = [0, ..4];
|
2014-11-17 02:39:01 -06:00
|
|
|
let amt = ch.encode_utf8(&mut bits).unwrap();
|
2014-09-22 10:24:14 -05:00
|
|
|
|
|
|
|
unsafe {
|
|
|
|
ptr::copy_memory(self.vec.as_mut_ptr().offset((idx + amt) as int),
|
|
|
|
self.vec.as_ptr().offset(idx as int),
|
|
|
|
len - idx);
|
|
|
|
ptr::copy_memory(self.vec.as_mut_ptr().offset(idx as int),
|
|
|
|
bits.as_ptr(),
|
|
|
|
amt);
|
|
|
|
self.vec.set_len(len + amt);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-12 07:44:31 -05:00
|
|
|
/// Views the string buffer as a mutable sequence of bytes.
|
|
|
|
///
|
2014-07-27 05:40:39 -05:00
|
|
|
/// This is unsafe because it does not check
|
|
|
|
/// to ensure that the resulting string will be valid UTF-8.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 05:40:39 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("hello");
|
|
|
|
/// unsafe {
|
|
|
|
/// let vec = s.as_mut_vec();
|
|
|
|
/// assert!(vec == &mut vec![104, 101, 108, 108, 111]);
|
|
|
|
/// vec.reverse();
|
|
|
|
/// }
|
|
|
|
/// assert_eq!(s.as_slice(), "olleh");
|
|
|
|
/// ```
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[unstable = "the name of this method may be changed"]
|
2014-04-12 07:44:31 -05:00
|
|
|
pub unsafe fn as_mut_vec<'a>(&'a mut self) -> &'a mut Vec<u8> {
|
|
|
|
&mut self.vec
|
|
|
|
}
|
2014-04-02 18:54:22 -05:00
|
|
|
|
2014-10-30 15:43:24 -05:00
|
|
|
/// Return the number of bytes in this string.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-10-30 15:43:24 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let a = "foo".to_string();
|
|
|
|
/// assert_eq!(a.len(), 3);
|
|
|
|
/// ```
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[stable]
|
2014-10-30 15:43:24 -05:00
|
|
|
pub fn len(&self) -> uint {
    // The length reported is that of the backing byte vector, so it counts
    // bytes rather than characters.
    let byte_count = self.vec.len();
    byte_count
}
|
2014-04-02 18:54:22 -05:00
|
|
|
|
2014-10-30 15:43:24 -05:00
|
|
|
/// Returns true if the string contains no bytes
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-10-30 15:43:24 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut v = String::new();
|
|
|
|
/// assert!(v.is_empty());
|
|
|
|
/// v.push('a');
|
|
|
|
/// assert!(!v.is_empty());
|
|
|
|
/// ```
|
|
|
|
pub fn is_empty(&self) -> bool {
    // Empty means a byte length of exactly zero.
    match self.len() {
        0 => true,
        _ => false,
    }
}
|
|
|
|
|
|
|
|
/// Truncates the string, returning it to 0 length.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-10-30 15:43:24 -05:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = "foo".to_string();
|
|
|
|
/// s.clear();
|
|
|
|
/// assert!(s.is_empty());
|
|
|
|
/// ```
|
2014-05-11 05:49:09 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[stable]
|
2014-10-30 15:43:24 -05:00
|
|
|
pub fn clear(&mut self) {
|
2014-05-11 05:49:09 -05:00
|
|
|
self.vec.clear()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[experimental = "waiting on FromIterator stabilization"]
|
2014-05-22 18:57:53 -05:00
|
|
|
impl FromIterator<char> for String {
|
|
|
|
fn from_iter<I:Iterator<char>>(iterator: I) -> String {
|
|
|
|
let mut buf = String::new();
|
2014-04-02 18:54:22 -05:00
|
|
|
buf.extend(iterator);
|
|
|
|
buf
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-12-07 14:32:00 -06:00
|
|
|
#[experimental = "waiting on FromIterator stabilization"]
|
|
|
|
impl<'a> FromIterator<&'a str> for String {
|
|
|
|
fn from_iter<I:Iterator<&'a str>>(iterator: I) -> String {
|
|
|
|
let mut buf = String::new();
|
|
|
|
buf.extend(iterator);
|
|
|
|
buf
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-07 18:39:39 -06:00
|
|
|
#[experimental = "waiting on Extend stabilization"]
|
|
|
|
impl Extend<char> for String {
|
2014-04-02 18:54:22 -05:00
|
|
|
fn extend<I:Iterator<char>>(&mut self, mut iterator: I) {
|
2014-12-07 14:31:24 -06:00
|
|
|
let (lower_bound, _) = iterator.size_hint();
|
|
|
|
self.reserve(lower_bound);
|
2014-04-02 18:54:22 -05:00
|
|
|
for ch in iterator {
|
2014-09-22 10:28:35 -05:00
|
|
|
self.push(ch)
|
2014-04-02 18:54:22 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-12-07 14:32:00 -06:00
|
|
|
#[experimental = "waiting on Extend stabilization"]
|
|
|
|
impl<'a> Extend<&'a str> for String {
|
|
|
|
fn extend<I: Iterator<&'a str>>(&mut self, mut iterator: I) {
|
|
|
|
// A guess that at least one byte per iterator element will be needed.
|
|
|
|
let (lower_bound, _) = iterator.size_hint();
|
|
|
|
self.reserve(lower_bound);
|
|
|
|
for s in iterator {
|
|
|
|
self.push_str(s)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-20 23:14:05 -06:00
|
|
|
impl PartialEq for String {
|
|
|
|
#[inline]
|
|
|
|
fn eq(&self, other: &String) -> bool { PartialEq::eq(&**self, &**other) }
|
|
|
|
#[inline]
|
|
|
|
fn ne(&self, other: &String) -> bool { PartialEq::ne(&**self, &**other) }
|
|
|
|
}
|
|
|
|
|
|
|
|
// Generates symmetric `PartialEq` impls between two string-like types.
//
// For `impl_eq! { L, R }` this emits both `PartialEq<R> for L` and
// `PartialEq<L> for R`. Each comparison dereferences both operands twice
// (`&**x`) and forwards to `PartialEq` on the results. The generated impls
// declare a lifetime `'a` which the type arguments may mention.
macro_rules! impl_eq {
    ($lhs:ty, $rhs: ty) => {
        // `$lhs` compared against `$rhs`.
        impl<'a> PartialEq<$rhs> for $lhs {
            #[inline]
            fn eq(&self, other: &$rhs) -> bool {
                let (left, right) = (&**self, &**other);
                PartialEq::eq(left, right)
            }

            #[inline]
            fn ne(&self, other: &$rhs) -> bool {
                let (left, right) = (&**self, &**other);
                PartialEq::ne(left, right)
            }
        }

        // The mirrored direction: `$rhs` compared against `$lhs`.
        impl<'a> PartialEq<$lhs> for $rhs {
            #[inline]
            fn eq(&self, other: &$lhs) -> bool {
                let (left, right) = (&**self, &**other);
                PartialEq::eq(left, right)
            }

            #[inline]
            fn ne(&self, other: &$lhs) -> bool {
                let (left, right) = (&**self, &**other);
                PartialEq::ne(left, right)
            }
        }

    }
}
|
|
|
|
|
2014-11-14 11:18:10 -06:00
|
|
|
// Comparisons between `String` and `&'a str`, in both directions
// (`'a` is the lifetime declared by the impls that `impl_eq!` generates).
impl_eq! { String, &'a str }
|
|
|
|
// Comparisons between `CowString<'a>` and `String`, in both directions.
impl_eq! { CowString<'a>, String }
|
2014-11-20 23:14:05 -06:00
|
|
|
|
|
|
|
impl<'a, 'b> PartialEq<&'b str> for CowString<'a> {
|
|
|
|
#[inline]
|
|
|
|
fn eq(&self, other: &&'b str) -> bool { PartialEq::eq(&**self, &**other) }
|
|
|
|
#[inline]
|
|
|
|
fn ne(&self, other: &&'b str) -> bool { PartialEq::ne(&**self, &**other) }
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mirror of `PartialEq<&'b str> for CowString<'a>`, so the comparison also
// works with the string slice on the left-hand side.
impl<'a, 'b> PartialEq<CowString<'a>> for &'b str {
    /// Returns `true` when the dereferenced contents of both operands are equal.
    #[inline]
    fn eq(&self, other: &CowString<'a>) -> bool {
        let (left, right) = (&**self, &**other);
        PartialEq::eq(left, right)
    }

    /// Returns `true` when the dereferenced contents of the operands differ.
    #[inline]
    fn ne(&self, other: &CowString<'a>) -> bool {
        let (left, right) = (&**self, &**other);
        PartialEq::ne(left, right)
    }
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[experimental = "waiting on Str stabilization"]
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
#[allow(deprecated)]
|
2014-05-22 18:57:53 -05:00
|
|
|
impl Str for String {
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[stable]
|
2014-04-02 18:54:22 -05:00
|
|
|
fn as_slice<'a>(&'a self) -> &'a str {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExit
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
unsafe { mem::transmute(self.vec.as_slice()) }
|
2014-05-19 19:23:26 -05:00
|
|
|
}
|
2014-04-02 18:54:22 -05:00
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
// `Default` support for `String`: the default value is whatever
// `String::new()` constructs.
#[stable]
|
2014-05-22 18:57:53 -05:00
|
|
|
impl Default for String {
|
2014-12-15 22:04:52 -06:00
|
|
|
#[stable]
|
2014-05-22 18:57:53 -05:00
|
|
|
// Forwards directly to the inherent constructor; no other work is done.
fn default() -> String {
|
|
|
|
String::new()
|
2014-05-20 01:19:56 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
// Formatting a `String` is delegated to the value it dereferences to.
#[experimental = "waiting on Show stabilization"]
|
2014-05-22 18:57:53 -05:00
|
|
|
impl fmt::Show for String {
|
2014-04-02 18:54:22 -05:00
|
|
|
// Writes `self` to the formatter `f` and returns the formatter's result.
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
2014-12-10 21:46:38 -06:00
|
|
|
// `self` is `&String`; the double deref goes through the `Deref<str>` impl
// in this file, so this presumably invokes the `str` formatting impl.
(**self).fmt(f)
|
2014-04-02 18:54:22 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
#[experimental = "waiting on Hash stabilization"]
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 20:50:12 -05:00
|
|
|
// Hashing a `String` with any `hash::Writer` is delegated to the value it
// dereferences to.
impl<H: hash::Writer> hash::Hash<H> for String {
|
2014-04-02 18:54:22 -05:00
|
|
|
#[inline]
|
|
|
|
// Feeds this string into `hasher`.
fn hash(&self, hasher: &mut H) {
|
2014-12-10 21:46:38 -06:00
|
|
|
// `**self` goes from `&String` through the `Deref<str>` impl in this file,
// so this presumably calls the `str` hashing impl (and thus should hash the
// same way as the equivalent `str` — confirm against that impl).
(**self).hash(hasher)
|
2014-04-02 18:54:22 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-26 22:50:12 -06:00
|
|
|
// Deprecated `Equiv` comparison between a `String` and any `S: Str`.
// The deprecation message points callers at the overloaded `PartialEq` instead.
// NOTE(review): `#[allow(deprecated)]` is presumably here because `Equiv`
// itself is deprecated — confirm.
// NOTE(review): the `'a` lifetime parameter is declared but not used below.
#[allow(deprecated)]
|
|
|
|
#[deprecated = "Use overloaded `core::cmp::PartialEq`"]
|
2014-05-22 18:57:53 -05:00
|
|
|
impl<'a, S: Str> Equiv<S> for String {
|
2014-05-16 12:45:16 -05:00
|
|
|
#[inline]
|
|
|
|
// Returns `true` when both sides are equal once viewed through `as_slice()`.
fn equiv(&self, other: &S) -> bool {
|
|
|
|
self.as_slice() == other.as_slice()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 09:12:10 -05:00
|
|
|
// `String + &str` concatenation.
#[experimental = "waiting on Add stabilization"]
|
2014-12-01 13:08:56 -06:00
|
|
|
impl<'a> Add<&'a str, String> for String {
|
|
|
|
// Takes the left-hand `String` by value, appends `other` to it with
// `push_str`, and hands the same value back as the result.
fn add(mut self, other: &str) -> String {
|
|
|
|
self.push_str(other);
|
|
|
|
self
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-04 17:59:10 -05:00
|
|
|
impl ops::Slice<uint, str> for String {
|
|
|
|
#[inline]
|
|
|
|
// Full-range slice: views the entire string as a `&str` tied to the borrow of `self`.
fn as_slice_<'a>(&'a self) -> &'a str {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExit
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
// Reinterprets the slice of the backing `vec` as `&str` without re-validating it.
// NOTE(review): this relies on `self.vec` always holding valid UTF-8 (the module
// docs say the type enforces that) and on the source slice (presumably `&[u8]`)
// sharing a representation with `&str` — confirm.
unsafe { mem::transmute(self.vec.as_slice()) }
|
2014-10-04 17:59:10 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
#[inline]
|
|
|
|
// Open-ended slice starting at `*from`, returned as a `&str` borrowed from `self`.
// NOTE(review): the `_or_fail` name suggests an invalid index fails at runtime
// inside the `str` slicing this forwards to — confirm.
fn slice_from_or_fail<'a>(&'a self, from: &uint) -> &'a str {
|
|
|
|
// `self[]` takes the full-range slice first (presumably the `&str` view of
// this string); the `[*from..]` range is then applied to that.
self[][*from..]
|
|
|
|
}
|
|
|
|
|
|
|
|
#[inline]
|
|
|
|
// Slice from the start up to `*to`, returned as a `&str` borrowed from `self`.
// NOTE(review): the `_or_fail` name suggests an invalid index fails at runtime
// inside the `str` slicing this forwards to — confirm.
fn slice_to_or_fail<'a>(&'a self, to: &uint) -> &'a str {
|
|
|
|
// `self[]` takes the full-range slice first (presumably the `&str` view of
// this string); the `[..*to]` range is then applied to that.
self[][..*to]
|
|
|
|
}
|
|
|
|
|
|
|
|
#[inline]
|
|
|
|
// Slice between `*from` and `*to`, returned as a `&str` borrowed from `self`.
// NOTE(review): the `_or_fail` name suggests invalid indices fail at runtime
// inside the `str` slicing this forwards to — confirm.
fn slice_or_fail<'a>(&'a self, from: &uint, to: &uint) -> &'a str {
|
|
|
|
// `self[]` takes the full-range slice first (presumably the `&str` view of
// this string); the `[*from..*to]` range is then applied to that.
self[][*from..*to]
|
|
|
|
}
|
|
|
|
}
|
2014-09-26 23:46:22 -05:00
|
|
|
|
2014-10-29 17:26:29 -05:00
|
|
|
#[experimental = "waiting on Deref stabilization"]
|
|
|
|
impl ops::Deref<str> for String {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
// Dereferences a `String` to the `&str` it contains, borrowed for as long as `self`.
fn deref<'a>(&'a self) -> &'a str {
|
|
|
|
// Reinterprets the full slice of the backing `vec` as `&str` with no UTF-8 check.
// NOTE(review): relies on `self.vec` always holding valid UTF-8 and on the
// sliced value (presumably `&[u8]`) sharing a representation with `&str` — confirm.
unsafe { mem::transmute(self.vec[]) }
|
|
|
|
}
|
2014-10-29 17:26:29 -05:00
|
|
|
}
|
|
|
|
|
2014-08-23 19:26:53 -05:00
|
|
|
/// Wrapper type providing a `&String` reference via `Deref`.
///
/// Values of this type are produced by `as_string`, which builds the inner
/// `DerefVec` from the bytes of a borrowed `&'a str`.
|
|
|
|
#[experimental]
|
|
|
|
pub struct DerefString<'a> {
|
|
|
|
// Borrowed byte-vector view; `as_string` fills it with `as_vec(x.as_bytes())`.
x: DerefVec<'a, u8>
|
|
|
|
}
|
|
|
|
|
|
|
|
// Lets a `DerefString` be used wherever a `&String` is expected.
impl<'a> Deref<String> for DerefString<'a> {
|
|
|
|
// Returns a `&String` that lives no longer than the borrow of `self`.
fn deref<'b>(&'b self) -> &'b String {
|
|
|
|
// `&*self.x` borrows the target of the inner `DerefVec` (presumably a
// `Vec<u8>`); the transmute then reinterprets that reference as `&String`.
// NOTE(review): sound only if `String` has the same layout as that target,
// i.e. is nothing more than a wrapper around its `vec` field — confirm.
unsafe { mem::transmute(&*self.x) }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Convert a string slice to a wrapper type providing a `&String` reference.
///
/// The returned `DerefString` borrows `x` for `'a`; it is built from
/// `as_vec(x.as_bytes())` and exposes a `&String` through `Deref`.
|
2014-12-07 17:47:00 -06:00
|
|
|
///
|
|
|
|
/// # Examples
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// use std::string::as_string;
|
|
|
|
///
|
|
|
|
/// fn string_consumer(s: String) {
|
|
|
|
/// assert_eq!(s, "foo".to_string());
|
|
|
|
/// }
|
|
|
|
///
|
|
|
|
/// let string = as_string("foo").clone();
|
|
|
|
/// string_consumer(string);
|
|
|
|
/// ```
|
2014-08-23 19:26:53 -05:00
|
|
|
#[experimental]
|
|
|
|
pub fn as_string<'a>(x: &'a str) -> DerefString<'a> {
|
|
|
|
// Wrap the slice's bytes with `as_vec`; the wrapper keeps the `'a` borrow of `x`.
DerefString { x: as_vec(x.as_bytes()) }
|
|
|
|
}
|
|
|
|
|
2014-11-14 22:52:00 -06:00
|
|
|
impl FromStr for String {
|
|
|
|
#[inline]
|
|
|
|
fn from_str(s: &str) -> Option<String> {
|
|
|
|
Some(String::from_str(s))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-15 06:57:54 -06:00
|
|
|
/// Conversion of a value into a `String` that takes ownership of the value.
pub trait IntoString {
    /// Consumes `self` and returns it as a `String`.
    fn into_string(self) -> String;
}
|
|
|
|
|
2014-11-15 19:38:03 -06:00
|
|
|
/// A generic trait for converting a borrowed value to an owned `String`.
pub trait ToString {
    /// Returns the value of `self` as a newly created `String`.
    fn to_string(&self) -> String;
}
|
|
|
|
|
|
|
|
impl<T: fmt::Show> ToString for T {
|
|
|
|
fn to_string(&self) -> String {
|
|
|
|
let mut buf = Vec::<u8>::new();
|
|
|
|
let _ = format_args!(|args| fmt::write(&mut buf, args), "{}", self);
|
|
|
|
String::from_utf8(buf).unwrap()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-21 16:10:42 -06:00
|
|
|
impl IntoCow<'static, String, str> for String {
|
|
|
|
fn into_cow(self) -> CowString<'static> {
|
|
|
|
Cow::Owned(self)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<'a> IntoCow<'a, String, str> for &'a str {
|
|
|
|
fn into_cow(self) -> CowString<'a> {
|
|
|
|
Cow::Borrowed(self)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-27 05:40:39 -05:00
|
|
|
/// Unsafe operations
|
2014-11-20 12:11:15 -06:00
|
|
|
#[deprecated]
|
2014-07-19 05:23:47 -05:00
|
|
|
pub mod raw {
|
|
|
|
use super::String;
|
|
|
|
use vec::Vec;
|
|
|
|
|
2014-08-04 05:48:39 -05:00
|
|
|
/// Creates a new `String` from a length, capacity, and pointer.
|
2014-07-21 13:44:56 -05:00
|
|
|
///
|
|
|
|
/// This is unsafe because:
|
2014-08-04 05:48:39 -05:00
|
|
|
/// * We call `Vec::from_raw_parts` to get a `Vec<u8>`;
|
|
|
|
/// * We assume that the `Vec` contains valid UTF-8.
|
2014-07-21 13:44:56 -05:00
|
|
|
#[inline]
|
2014-11-20 12:11:15 -06:00
|
|
|
#[deprecated = "renamed to String::from_raw_parts"]
|
2014-07-22 10:55:12 -05:00
|
|
|
pub unsafe fn from_parts(buf: *mut u8, length: uint, capacity: uint) -> String {
|
2014-11-20 12:11:15 -06:00
|
|
|
String::from_raw_parts(buf, length, capacity)
|
2014-07-21 13:44:56 -05:00
|
|
|
}
|
|
|
|
|
2014-08-04 05:48:39 -05:00
|
|
|
/// Creates a `String` from a `*const u8` buffer of the given length.
|
2014-07-20 05:08:40 -05:00
|
|
|
///
|
|
|
|
/// This function is unsafe because of two reasons:
|
2014-11-20 12:11:15 -06:00
|
|
|
///
|
2014-08-04 05:48:39 -05:00
|
|
|
/// * A raw pointer is dereferenced and transmuted to `&[u8]`;
|
|
|
|
/// * The slice is not checked to see whether it contains valid UTF-8.
|
2014-11-20 12:11:15 -06:00
|
|
|
#[deprecated = "renamed to String::from_raw_buf_len"]
|
2014-07-20 05:08:40 -05:00
|
|
|
pub unsafe fn from_buf_len(buf: *const u8, len: uint) -> String {
|
2014-11-20 12:11:15 -06:00
|
|
|
String::from_raw_buf_len(buf, len)
|
2014-07-20 05:08:40 -05:00
|
|
|
}
|
2014-07-22 10:55:12 -05:00
|
|
|
|
2014-08-04 05:48:39 -05:00
|
|
|
/// Creates a `String` from a null-terminated `*const u8` buffer.
|
2014-07-22 10:55:12 -05:00
|
|
|
///
|
|
|
|
/// This function is unsafe because we dereference memory until we find the NUL character,
|
2014-08-01 18:40:21 -05:00
|
|
|
/// which is not guaranteed to be present. Additionally, the slice is not checked to see
|
2014-07-22 10:55:12 -05:00
|
|
|
/// whether it contains valid UTF-8
|
2014-11-20 12:11:15 -06:00
|
|
|
#[deprecated = "renamed to String::from_raw_buf"]
|
2014-07-22 10:55:12 -05:00
|
|
|
pub unsafe fn from_buf(buf: *const u8) -> String {
|
2014-11-20 12:11:15 -06:00
|
|
|
String::from_raw_buf(buf)
|
2014-07-22 10:55:12 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Converts a vector of bytes to a new `String` without checking if
|
|
|
|
/// it contains valid UTF-8. This is unsafe because it assumes that
|
2014-08-04 05:48:39 -05:00
|
|
|
/// the UTF-8-ness of the vector has already been validated.
|
2014-07-22 10:55:12 -05:00
|
|
|
#[inline]
|
2014-11-20 12:11:15 -06:00
|
|
|
#[deprecated = "renamed to String::from_utf8_unchecked"]
|
2014-07-22 10:55:12 -05:00
|
|
|
pub unsafe fn from_utf8(bytes: Vec<u8>) -> String {
|
2014-11-20 12:11:15 -06:00
|
|
|
String::from_utf8_unchecked(bytes)
|
2014-07-22 10:55:12 -05:00
|
|
|
}
|
2014-07-19 05:23:47 -05:00
|
|
|
}
|
|
|
|
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the return type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 11:02:31 -06:00
|
|
|
/// A clone-on-write string
|
|
|
|
#[stable]
|
|
|
|
pub type CowString<'a> = Cow<'a, String, str>;
|
|
|
|
|
|
|
|
#[allow(deprecated)]
|
|
|
|
impl<'a> Str for CowString<'a> {
|
|
|
|
#[inline]
|
|
|
|
fn as_slice<'b>(&'b self) -> &'b str {
|
|
|
|
(**self).as_slice()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-02 18:54:22 -05:00
|
|
|
#[cfg(test)]
mod tests {
    use prelude::*;
    use test::Bencher;

    use str::{StrExt, Utf8Error};
    use str;
    use super::as_string;

    // `as_string` must expose the same text as the slice it wraps.
    #[test]
    fn test_as_string() {
        let x = "foo";
        assert_eq!(x, as_string(x).as_slice());
    }

    // Parsing a `String` from a `&str` yields the same text.
    #[test]
    fn test_from_str() {
        let owned: Option<::std::string::String> = from_str("string");
        assert_eq!(owned.as_ref().map(|s| s.as_slice()), Some("string"));
    }

    // Valid input converts; invalid input hands back the bytes and an error.
    #[test]
    fn test_from_utf8() {
        let xs = b"hello".to_vec();
        assert_eq!(String::from_utf8(xs),
                   Ok(String::from_str("hello")));

        let xs = "ศไทย中华Việt Nam".as_bytes().to_vec();
        assert_eq!(String::from_utf8(xs),
                   Ok(String::from_str("ศไทย中华Việt Nam")));

        let xs = b"hello\xFF".to_vec();
        assert_eq!(String::from_utf8(xs),
                   Err((b"hello\xFF".to_vec(), Utf8Error::TooShort)));
    }

    // Lossy conversion passes valid text through and substitutes U+FFFD
    // for invalid byte sequences.
    #[test]
    fn test_from_utf8_lossy() {
        let xs = b"hello";
        let ys: str::CowString = "hello".into_cow();
        assert_eq!(String::from_utf8_lossy(xs), ys);

        let xs = "ศไทย中华Việt Nam".as_bytes();
        let ys: str::CowString = "ศไทย中华Việt Nam".into_cow();
        assert_eq!(String::from_utf8_lossy(xs), ys);

        let xs = b"Hello\xC2 There\xFF Goodbye";
        assert_eq!(String::from_utf8_lossy(xs),
                   String::from_str("Hello\u{FFFD} There\u{FFFD} Goodbye").into_cow());

        let xs = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
        assert_eq!(String::from_utf8_lossy(xs),
                   String::from_str("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye").into_cow());

        let xs = b"\xF5foo\xF5\x80bar";
        assert_eq!(String::from_utf8_lossy(xs),
                   String::from_str("\u{FFFD}foo\u{FFFD}\u{FFFD}bar").into_cow());

        let xs = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
        assert_eq!(String::from_utf8_lossy(xs),
                   String::from_str("\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz").into_cow());

        let xs = b"\xF4foo\xF4\x80bar\xF4\xBFbaz";
        assert_eq!(String::from_utf8_lossy(xs),
                   String::from_str("\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz").into_cow());

        let xs = b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar";
        assert_eq!(String::from_utf8_lossy(xs), String::from_str("\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\
                                               foo\u{10000}bar").into_cow());

        // surrogates
        let xs = b"\xED\xA0\x80foo\xED\xBF\xBFbar";
        assert_eq!(String::from_utf8_lossy(xs), String::from_str("\u{FFFD}\u{FFFD}\u{FFFD}foo\
                                               \u{FFFD}\u{FFFD}\u{FFFD}bar").into_cow());
    }

    // Round-trips between strings and their UTF-16 encodings.
    #[test]
    fn test_from_utf16() {
        let pairs =
            [(String::from_str("𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n"),
              vec![0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
                0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
                0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
                0xd800_u16, 0xdf30_u16, 0x000a_u16]),

             (String::from_str("𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n"),
              vec![0xd801_u16, 0xdc12_u16, 0xd801_u16,
                0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
                0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
                0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
                0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
                0x000a_u16]),

             (String::from_str("𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n"),
              vec![0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
                0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
                0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
                0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
                0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
                0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
                0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),

             (String::from_str("𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n"),
              vec![0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
                0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
                0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
                0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
                0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
                0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
                0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
                0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
                0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
                0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
                0x000a_u16 ]),
             // Issue #12318, even-numbered non-BMP planes
             (String::from_str("\u{20000}"),
              vec![0xD840, 0xDC00])];

        for p in pairs.iter() {
            let (s, u) = (*p).clone();
            let s_as_utf16 = s.utf16_units().collect::<Vec<u16>>();
            let u_as_string = String::from_utf16(u.as_slice()).unwrap();

            assert!(::unicode::str::is_utf16(u.as_slice()));
            assert_eq!(s_as_utf16, u);

            assert_eq!(u_as_string, s);
            assert_eq!(String::from_utf16_lossy(u.as_slice()), s);

            assert_eq!(String::from_utf16(s_as_utf16.as_slice()).unwrap(), s);
            assert_eq!(u_as_string.utf16_units().collect::<Vec<u16>>(), u);
        }
    }

    // Malformed surrogate sequences are rejected by the strict decoder.
    #[test]
    fn test_utf16_invalid() {
        // completely positive cases tested above.
        // lead + eof
        assert_eq!(String::from_utf16(&[0xD800]), None);
        // lead + lead
        assert_eq!(String::from_utf16(&[0xD800, 0xD800]), None);

        // isolated trail
        assert_eq!(String::from_utf16(&[0x0061, 0xDC00]), None);

        // general
        assert_eq!(String::from_utf16(&[0xD800, 0xd801, 0xdc8b, 0xD800]), None);
    }

    // The lossy decoder replaces each malformed unit with U+FFFD.
    #[test]
    fn test_from_utf16_lossy() {
        // completely positive cases tested above.
        // lead + eof
        assert_eq!(String::from_utf16_lossy(&[0xD800]), String::from_str("\u{FFFD}"));
        // lead + lead
        assert_eq!(String::from_utf16_lossy(&[0xD800, 0xD800]),
                   String::from_str("\u{FFFD}\u{FFFD}"));

        // isolated trail
        assert_eq!(String::from_utf16_lossy(&[0x0061, 0xDC00]), String::from_str("a\u{FFFD}"));

        // general
        assert_eq!(String::from_utf16_lossy(&[0xD800, 0xd801, 0xdc8b, 0xD800]),
                   String::from_str("\u{FFFD}𐒋\u{FFFD}"));
    }

    // Only the first `len` bytes of the buffer are copied.
    #[test]
    fn test_from_buf_len() {
        unsafe {
            let a = vec![65u8, 65, 65, 65, 65, 65, 65, 0];
            assert_eq!(super::raw::from_buf_len(a.as_ptr(), 3), String::from_str("AAA"));
        }
    }

    // Bytes up to (not including) the terminating NUL are copied.
    #[test]
    fn test_from_buf() {
        unsafe {
            let a = vec![65, 65, 65, 65, 65, 65, 65, 0];
            let b = a.as_ptr();
            let c = super::raw::from_buf(b);
            assert_eq!(c, String::from_str("AAAAAAA"));
        }
    }

    // Bytes pushed through the underlying vector show up in the string.
    #[test]
    fn test_push_bytes() {
        let mut s = String::from_str("ABC");
        unsafe {
            let mv = s.as_mut_vec();
            mv.push_all(&[b'D']);
        }
        assert_eq!(s, "ABCD");
    }

    // Appending empty, ASCII, and multi-byte slices.
    #[test]
    fn test_push_str() {
        let mut s = String::new();
        s.push_str("");
        assert_eq!(s.slice_from(0), "");
        s.push_str("abc");
        assert_eq!(s.slice_from(0), "abc");
        s.push_str("ประเทศไทย中华Việt Nam");
        assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
    }

    // Pushing characters of every UTF-8 encoded length.
    #[test]
    fn test_push() {
        let mut data = String::from_str("ประเทศไทย中");
        data.push('华');
        data.push('b'); // 1 byte
        data.push('¢'); // 2 bytes
        data.push('€'); // 3 bytes
        data.push('𤭢'); // 4 bytes
        assert_eq!(data, "ประเทศไทย中华b¢€𤭢");
    }

    // Popping characters of every UTF-8 encoded length.
    #[test]
    fn test_pop() {
        let mut data = String::from_str("ประเทศไทย中华b¢€𤭢");
        assert_eq!(data.pop().unwrap(), '𤭢'); // 4 bytes
        assert_eq!(data.pop().unwrap(), '€'); // 3 bytes
        assert_eq!(data.pop().unwrap(), '¢'); // 2 bytes
        assert_eq!(data.pop().unwrap(), 'b'); // 1 byte
        assert_eq!(data.pop().unwrap(), '华');
        assert_eq!(data, "ประเทศไทย中");
    }

    #[test]
    fn test_str_truncate() {
        let mut s = String::from_str("12345");
        s.truncate(5);
        assert_eq!(s, "12345");
        s.truncate(3);
        assert_eq!(s, "123");
        s.truncate(0);
        assert_eq!(s, "");

        // Truncating and then pushing a byte that fits must leave the
        // buffer at the same address.
        let mut s = String::from_str("12345");
        let p = s.as_ptr();
        s.truncate(3);
        s.push_str("6");
        let p_ = s.as_ptr();
        assert_eq!(p_, p);
    }

    // Truncating to a length beyond the end must fail.
    #[test]
    #[should_fail]
    fn test_str_truncate_invalid_len() {
        let mut s = String::from_str("12345");
        s.truncate(6);
    }

    // Truncating in the middle of a multi-byte character must fail.
    #[test]
    #[should_fail]
    fn test_str_truncate_split_codepoint() {
        let mut s = String::from_str("\u{FC}"); // ü
        s.truncate(1);
    }

    #[test]
    fn test_str_clear() {
        let mut s = String::from_str("12345");
        s.clear();
        assert_eq!(s.len(), 0);
        assert_eq!(s, "");
    }

    // `String + &str` appends and can be chained.
    #[test]
    fn test_str_add() {
        let a = String::from_str("12345");
        let b = a + "2";
        let b = b + "2";
        assert_eq!(b.len(), 7);
        assert_eq!(b, "1234522");
    }

    // Removal by byte index: returns the removed character, or `None` when
    // the index is at or past the end.
    #[test]
    fn remove() {
        let mut s = "ศไทย中华Việt Nam; foobar".to_string();
        assert_eq!(s.remove(0), Some('ศ'));
        assert_eq!(s.len(), 33);
        assert_eq!(s, "ไทย中华Việt Nam; foobar");
        assert_eq!(s.remove(33), None);
        assert_eq!(s.remove(300), None);
        assert_eq!(s.remove(17), Some('ệ'));
        assert_eq!(s, "ไทย中华Vit Nam; foobar");
    }

    // Removing at an index inside a multi-byte character must fail.
    #[test] #[should_fail]
    fn remove_bad() {
        "ศ".to_string().remove(1);
    }

    // Insertion by byte index of multi-byte characters.
    #[test]
    fn insert() {
        let mut s = "foobar".to_string();
        s.insert(0, 'ệ');
        assert_eq!(s, "ệfoobar");
        s.insert(6, 'ย');
        assert_eq!(s, "ệfooยbar");
    }

    // Inserting past the end, or inside a multi-byte character, must fail.
    #[test] #[should_fail] fn insert_bad1() { "".to_string().insert(1, 't'); }
    #[test] #[should_fail] fn insert_bad2() { "ệ".to_string().insert(1, 't'); }

    #[test]
    fn test_slicing() {
        let s = "foobar".to_string();
        assert_eq!("foobar", s[]);
        assert_eq!("foo", s[..3]);
        assert_eq!("bar", s[3..]);
        assert_eq!("oob", s[1..4]);
    }

    // `to_string` on primitive values and on a `String` itself.
    #[test]
    fn test_simple_types() {
        assert_eq!(1i.to_string(), "1");
        assert_eq!((-1i).to_string(), "-1");
        assert_eq!(200u.to_string(), "200");
        assert_eq!(2u8.to_string(), "2");
        assert_eq!(true.to_string(), "true");
        assert_eq!(false.to_string(), "false");
        assert_eq!(().to_string(), "()");
        assert_eq!(("hi".to_string()).to_string(), "hi");
    }

    // `to_string` on empty, flat, and nested vectors.
    #[test]
    fn test_vectors() {
        let x: Vec<int> = vec![];
        assert_eq!(x.to_string(), "[]");
        assert_eq!((vec![1i]).to_string(), "[1]");
        assert_eq!((vec![1i, 2, 3]).to_string(), "[1, 2, 3]");
        assert_eq!((vec![vec![], vec![1i], vec![1i, 1]]).to_string(),
                   "[[], [1], [1, 1]]");
    }

    // Collecting into and extending a `String` from chars and from slices.
    #[test]
    fn test_from_iterator() {
        let s = "ศไทย中华Việt Nam".to_string();
        let t = "ศไทย中华";
        let u = "Việt Nam";

        let a: String = s.chars().collect();
        assert_eq!(s, a);

        let mut b = t.to_string();
        b.extend(u.chars());
        assert_eq!(s, b);

        let c: String = vec![t, u].into_iter().collect();
        assert_eq!(s, c);

        let mut d = t.to_string();
        d.extend(vec![u].into_iter());
        assert_eq!(s, d);
    }

    #[bench]
    fn bench_with_capacity(b: &mut Bencher) {
        b.iter(|| {
            String::with_capacity(100)
        });
    }

    #[bench]
    fn bench_push_str(b: &mut Bencher) {
        let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
        b.iter(|| {
            let mut r = String::new();
            r.push_str(s);
        });
    }

    // Number of pushes performed per iteration by the push benchmarks below.
    const REPETITIONS: u64 = 10_000;

    #[bench]
    fn bench_push_str_one_byte(b: &mut Bencher) {
        b.bytes = REPETITIONS;
        b.iter(|| {
            let mut r = String::new();
            for _ in range(0, REPETITIONS) {
                r.push_str("a")
            }
        });
    }

    #[bench]
    fn bench_push_char_one_byte(b: &mut Bencher) {
        b.bytes = REPETITIONS;
        b.iter(|| {
            let mut r = String::new();
            for _ in range(0, REPETITIONS) {
                r.push('a')
            }
        });
    }

    #[bench]
    fn bench_push_char_two_bytes(b: &mut Bencher) {
        // Each pushed character is two bytes long, hence the doubled count.
        b.bytes = REPETITIONS * 2;
        b.iter(|| {
            let mut r = String::new();
            for _ in range(0, REPETITIONS) {
                r.push('â')
            }
        });
    }

    #[bench]
    fn from_utf8_lossy_100_ascii(b: &mut Bencher) {
        let s = b"Hello there, the quick brown fox jumped over the lazy dog! \
                  Lorem ipsum dolor sit amet, consectetur. ";

        assert_eq!(100, s.len());
        b.iter(|| {
            let _ = String::from_utf8_lossy(s);
        });
    }

    #[bench]
    fn from_utf8_lossy_100_multibyte(b: &mut Bencher) {
        let s = "𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰".as_bytes();
        assert_eq!(100, s.len());
        b.iter(|| {
            let _ = String::from_utf8_lossy(s);
        });
    }

    #[bench]
    fn from_utf8_lossy_invalid(b: &mut Bencher) {
        let s = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
        b.iter(|| {
            let _ = String::from_utf8_lossy(s);
        });
    }

    #[bench]
    fn from_utf8_lossy_100_invalid(b: &mut Bencher) {
        let s = Vec::from_elem(100, 0xF5u8);
        b.iter(|| {
            let _ = String::from_utf8_lossy(s.as_slice());
        });
    }
}
|