2014-04-02 16:54:22 -07:00
|
|
|
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
|
|
|
// file at the top-level directory of this distribution and at
|
|
|
|
// http://rust-lang.org/COPYRIGHT.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
|
|
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
|
|
|
// option. This file may not be copied, modified, or distributed
|
|
|
|
// except according to those terms.
|
2014-07-14 20:46:04 -07:00
|
|
|
//
|
|
|
|
// ignore-lexer-test FIXME #15679
|
2014-04-02 16:54:22 -07:00
|
|
|
|
|
|
|
//! An owned, growable string that enforces that its contents are valid UTF-8.
|
|
|
|
|
2014-12-28 10:29:56 -08:00
|
|
|
#![stable]
|
|
|
|
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
use core::prelude::*;
|
|
|
|
|
2014-11-21 17:10:42 -05:00
|
|
|
use core::borrow::{Cow, IntoCow};
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
use core::default::Default;
|
|
|
|
use core::fmt;
|
2014-12-12 18:43:07 -08:00
|
|
|
use core::hash;
|
2014-12-22 09:04:23 -08:00
|
|
|
use core::iter::FromIterator;
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
use core::mem;
|
2015-01-04 17:43:24 +13:00
|
|
|
use core::ops::{self, Deref, Add, Index};
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
use core::ptr;
|
2014-08-18 08:29:44 -07:00
|
|
|
use core::raw::Slice as RawSlice;
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
use unicode::str as unicode_str;
|
|
|
|
use unicode::str::Utf16Item;
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
|
2015-01-03 22:42:21 -05:00
|
|
|
use str::{self, CharRange, FromStr, Utf8Error};
|
2014-08-23 20:26:53 -04:00
|
|
|
use vec::{DerefVec, Vec, as_vec};
|
2014-04-02 16:54:22 -07:00
|
|
|
|
2014-04-10 20:55:34 +10:00
|
|
|
/// A growable string stored as a UTF-8 encoded buffer.
|
2015-01-03 22:54:18 -05:00
|
|
|
#[derive(Clone, PartialOrd, Eq, Ord)]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-05-22 16:57:53 -07:00
|
|
|
pub struct String {
    // Backing byte buffer. The type is documented as a growable string stored
    // as a UTF-8 encoded buffer, so these bytes are expected to be valid UTF-8
    // at all times. The field is private, which keeps code outside this module
    // from writing arbitrary bytes into it.
    vec: Vec<u8>,
}
|
|
|
|
|
2014-12-28 10:29:56 -08:00
|
|
|
/// A possible error value from the `String::from_utf8` function.
|
|
|
|
#[stable]
|
|
|
|
pub struct FromUtf8Error {
    // Presumably the byte vector that failed UTF-8 validation, kept so the
    // caller can get its allocation back — confirm against `String::from_utf8`.
    bytes: Vec<u8>,
    // The `Utf8Error` describing the failure; presumably the one reported by
    // the UTF-8 validation of `bytes` — confirm against `String::from_utf8`.
    error: Utf8Error,
}
|
|
|
|
|
|
|
|
/// A possible error value from the `String::from_utf16` function.
|
|
|
|
#[stable]
|
|
|
|
#[allow(missing_copy_implementations)]
|
|
|
|
// The single `()` field carries no data. Because the field is not `pub`, code
// outside this module cannot construct a `FromUtf16Error` directly.
pub struct FromUtf16Error(());
|
|
|
|
|
2014-05-22 16:57:53 -07:00
|
|
|
impl String {
|
2014-04-21 00:49:39 -04:00
|
|
|
/// Creates a new string buffer initialized with the empty string.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::new();
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-05-22 16:57:53 -07:00
|
|
|
// Constructs a `String` whose backing byte vector comes straight from
// `Vec::new()`; nothing else is initialized here.
pub fn new() -> String {
|
|
|
|
String {
|
2014-04-02 16:54:22 -07:00
|
|
|
vec: Vec::new(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a new string buffer with the given capacity.
|
2014-07-27 12:40:39 +02:00
|
|
|
/// The string will be able to hold exactly `capacity` bytes without
|
|
|
|
/// reallocating. If `capacity` is 0, the string will not allocate.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::with_capacity(10);
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-05-22 16:57:53 -07:00
|
|
|
// The requested `capacity` (in bytes) is forwarded unchanged to
// `Vec::with_capacity`, which provides the backing buffer.
pub fn with_capacity(capacity: uint) -> String {
|
|
|
|
String {
|
2014-04-02 16:54:22 -07:00
|
|
|
vec: Vec::with_capacity(capacity),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a new string buffer from the given string.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::from_str("hello");
|
|
|
|
/// assert_eq!(s.as_slice(), "hello");
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[experimental = "needs investigation to see if to_string() can match perf"]
|
2014-05-22 16:57:53 -07:00
|
|
|
// Copies the bytes of `string` into a new vector via `SliceExt::to_vec` and
// wraps that vector directly. No UTF-8 check is performed in this function;
// the input is already typed as `&str`.
pub fn from_str(string: &str) -> String {
|
2015-01-02 09:12:27 -05:00
|
|
|
String { vec: ::slice::SliceExt::to_vec(string.as_bytes()) }
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
2014-05-14 16:55:24 -07:00
|
|
|
/// Returns the vector as a string buffer, if possible, taking care not to
|
|
|
|
/// copy it.
|
|
|
|
///
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
/// # Failure
|
|
|
|
///
|
|
|
|
/// If the given vector is not valid UTF-8, then the original vector and the
|
|
|
|
/// corresponding error are returned.
|
2014-06-30 16:41:30 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-06-30 16:41:30 +02:00
|
|
|
///
|
|
|
|
/// ```rust
|
2014-12-10 19:46:38 -08:00
|
|
|
/// use std::str::Utf8Error;
|
|
|
|
///
|
2014-06-30 16:41:30 +02:00
|
|
|
/// let hello_vec = vec![104, 101, 108, 108, 111];
|
2014-12-28 10:29:56 -08:00
|
|
|
/// let s = String::from_utf8(hello_vec).unwrap();
|
|
|
|
/// assert_eq!(s, "hello");
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// let invalid_vec = vec![240, 144, 128];
|
2014-12-28 10:29:56 -08:00
|
|
|
/// let s = String::from_utf8(invalid_vec).err().unwrap();
|
|
|
|
/// assert_eq!(s.utf8_error(), Utf8Error::TooShort);
|
|
|
|
/// assert_eq!(s.into_bytes(), vec![240, 144, 128]);
|
2014-06-30 16:41:30 +02:00
|
|
|
/// ```
|
2014-04-10 20:55:34 +10:00
|
|
|
#[inline]
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
|
|
|
// Takes ownership of `vec`, validates its contents with `str::from_utf8`, and
// either wraps the same vector in a `String` or hands it back inside a
// `FromUtf8Error` together with the validation error.
pub fn from_utf8(vec: Vec<u8>) -> Result<String, FromUtf8Error> {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExit
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
// Only the validation outcome matters: the `&str` produced on success is
// discarded (`Ok(..)`) and `vec` itself is moved into the result on both arms.
match str::from_utf8(vec.as_slice()) {
|
|
|
|
Ok(..) => Ok(String { vec: vec }),
|
2014-12-28 10:29:56 -08:00
|
|
|
Err(e) => Err(FromUtf8Error { bytes: vec, error: e })
|
2014-04-10 20:55:34 +10:00
|
|
|
}
|
|
|
|
}
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-08-04 22:48:39 +12:00
|
|
|
/// Converts a vector of bytes to a new UTF-8 string.
|
|
|
|
/// Any invalid UTF-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
|
2014-07-10 18:21:16 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-10 18:21:16 +02:00
|
|
|
///
|
|
|
|
/// ```rust
|
|
|
|
/// let input = b"Hello \xF0\x90\x80World";
|
2014-07-04 22:38:13 +02:00
|
|
|
/// let output = String::from_utf8_lossy(input);
|
2014-12-09 14:08:10 -08:00
|
|
|
/// assert_eq!(output.as_slice(), "Hello \u{FFFD}World");
|
2014-07-10 18:21:16 +02:00
|
|
|
/// ```
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
2014-11-21 17:10:42 -05:00
|
|
|
pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> CowString<'a> {
|
2015-01-01 01:22:43 -05:00
|
|
|
let mut i = 0;
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
match str::from_utf8(v) {
|
|
|
|
Ok(s) => return Cow::Borrowed(s),
|
2015-01-01 01:22:43 -05:00
|
|
|
Err(e) => {
|
|
|
|
if let Utf8Error::InvalidByte(firstbad) = e {
|
|
|
|
i = firstbad;
|
|
|
|
}
|
|
|
|
}
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static TAG_CONT_U8: u8 = 128u8;
|
|
|
|
static REPLACEMENT: &'static [u8] = b"\xEF\xBF\xBD"; // U+FFFD in UTF-8
|
|
|
|
let total = v.len();
|
|
|
|
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
|
2014-12-30 10:51:18 -08:00
|
|
|
unsafe { *xs.get_unchecked(i) }
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
|
|
|
fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
|
|
|
|
if i >= total {
|
|
|
|
0
|
|
|
|
} else {
|
|
|
|
unsafe_get(xs, i)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut res = String::with_capacity(total);
|
|
|
|
|
|
|
|
if i > 0 {
|
|
|
|
unsafe {
|
2015-01-07 11:58:31 -05:00
|
|
|
res.as_mut_vec().push_all(&v[0..i])
|
2014-07-10 18:21:16 +02:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
// subseqidx is the index of the first byte of the subsequence we're looking at.
|
|
|
|
// It's used to copy a bunch of contiguous good codepoints at once instead of copying
|
|
|
|
// them one by one.
|
2015-01-01 01:22:43 -05:00
|
|
|
let mut subseqidx = i;
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
while i < total {
|
|
|
|
let i_ = i;
|
|
|
|
let byte = unsafe_get(v, i);
|
|
|
|
i += 1;
|
|
|
|
|
2015-01-02 14:44:21 -08:00
|
|
|
macro_rules! error { () => ({
|
2014-07-10 18:21:16 +02:00
|
|
|
unsafe {
|
|
|
|
if subseqidx != i_ {
|
2015-01-07 11:58:31 -05:00
|
|
|
res.as_mut_vec().push_all(&v[subseqidx..i_]);
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
|
|
|
subseqidx = i;
|
2014-09-22 08:28:35 -07:00
|
|
|
res.as_mut_vec().push_all(REPLACEMENT);
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
2015-01-02 14:44:21 -08:00
|
|
|
})}
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
if byte < 128u8 {
|
|
|
|
// subseqidx handles this
|
|
|
|
} else {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
let w = unicode_str::utf8_char_width(byte);
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
match w {
|
|
|
|
2 => {
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
3 => {
|
|
|
|
match (byte, safe_get(v, i, total)) {
|
2014-09-26 21:13:20 -07:00
|
|
|
(0xE0 , 0xA0 ... 0xBF) => (),
|
|
|
|
(0xE1 ... 0xEC, 0x80 ... 0xBF) => (),
|
|
|
|
(0xED , 0x80 ... 0x9F) => (),
|
|
|
|
(0xEE ... 0xEF, 0x80 ... 0xBF) => (),
|
2014-07-10 18:21:16 +02:00
|
|
|
_ => {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
4 => {
|
|
|
|
match (byte, safe_get(v, i, total)) {
|
2014-09-26 21:13:20 -07:00
|
|
|
(0xF0 , 0x90 ... 0xBF) => (),
|
|
|
|
(0xF1 ... 0xF3, 0x80 ... 0xBF) => (),
|
|
|
|
(0xF4 , 0x80 ... 0x8F) => (),
|
2014-07-10 18:21:16 +02:00
|
|
|
_ => {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
_ => {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if subseqidx < total {
|
|
|
|
unsafe {
|
2015-01-07 11:58:31 -05:00
|
|
|
res.as_mut_vec().push_all(&v[subseqidx..total])
|
2014-07-10 18:21:16 +02:00
|
|
|
};
|
|
|
|
}
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
Cow::Owned(res)
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
|
|
|
|
2014-07-04 22:38:13 +02:00
|
|
|
/// Decode a UTF-16 encoded vector `v` into a `String`, returning `Err`
|
2014-07-10 17:43:03 +02:00
|
|
|
/// if `v` contains any invalid data.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-10 17:43:03 +02:00
|
|
|
///
|
|
|
|
/// ```rust
|
2014-07-04 22:38:13 +02:00
|
|
|
/// // 𝄞music
|
2014-11-17 21:39:01 +13:00
|
|
|
/// let mut v = &mut [0xD834, 0xDD1E, 0x006d, 0x0075,
|
|
|
|
/// 0x0073, 0x0069, 0x0063];
|
2014-12-28 10:29:56 -08:00
|
|
|
/// assert_eq!(String::from_utf16(v).unwrap(),
|
|
|
|
/// "𝄞music".to_string());
|
2014-07-10 17:43:03 +02:00
|
|
|
///
|
2014-07-04 22:38:13 +02:00
|
|
|
/// // 𝄞mu<invalid>ic
|
2014-07-10 17:43:03 +02:00
|
|
|
/// v[4] = 0xD800;
|
2014-12-28 10:29:56 -08:00
|
|
|
/// assert!(String::from_utf16(v).is_err());
|
2014-07-10 17:43:03 +02:00
|
|
|
/// ```
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
|
|
|
pub fn from_utf16(v: &[u16]) -> Result<String, FromUtf16Error> {
|
2014-10-03 21:20:04 +01:00
|
|
|
let mut s = String::with_capacity(v.len());
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
for c in unicode_str::utf16_items(v) {
|
2014-07-10 17:43:03 +02:00
|
|
|
match c {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
Utf16Item::ScalarValue(c) => s.push(c),
|
2014-12-28 10:29:56 -08:00
|
|
|
Utf16Item::LoneSurrogate(_) => return Err(FromUtf16Error(())),
|
2014-07-10 17:43:03 +02:00
|
|
|
}
|
|
|
|
}
|
2014-12-28 10:29:56 -08:00
|
|
|
Ok(s)
|
2014-07-10 17:43:03 +02:00
|
|
|
}
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-07-10 17:53:51 +02:00
|
|
|
/// Decode a UTF-16 encoded vector `v` into a string, replacing
|
|
|
|
/// invalid data with the replacement character (U+FFFD).
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
|
|
|
///
|
2014-07-10 17:53:51 +02:00
|
|
|
/// ```rust
|
2014-07-04 22:38:13 +02:00
|
|
|
/// // 𝄞mus<invalid>ic<invalid>
|
2014-11-17 21:39:01 +13:00
|
|
|
/// let v = &[0xD834, 0xDD1E, 0x006d, 0x0075,
|
|
|
|
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
|
|
|
|
/// 0xD834];
|
2014-07-10 17:53:51 +02:00
|
|
|
///
|
|
|
|
/// assert_eq!(String::from_utf16_lossy(v),
|
2014-12-09 14:08:10 -08:00
|
|
|
/// "𝄞mus\u{FFFD}ic\u{FFFD}".to_string());
|
2014-07-10 17:53:51 +02:00
|
|
|
/// ```
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices aren't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-07-10 17:53:51 +02:00
|
|
|
pub fn from_utf16_lossy(v: &[u16]) -> String {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
unicode_str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
|
2014-07-10 17:53:51 +02:00
|
|
|
}
|
2014-07-04 22:18:11 +02:00
|
|
|
|
2014-11-20 10:11:15 -08:00
|
|
|
/// Creates a new `String` from a length, capacity, and pointer.
|
|
|
|
///
|
|
|
|
/// This is unsafe because:
|
|
|
|
/// * We call `Vec::from_raw_parts` to get a `Vec<u8>`;
|
|
|
|
/// * We assume that the `Vec` contains valid UTF-8.
|
|
|
|
#[inline]
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
2014-11-20 10:11:15 -08:00
|
|
|
pub unsafe fn from_raw_parts(buf: *mut u8, length: uint, capacity: uint) -> String {
|
|
|
|
// The raw parts are handed straight to `Vec::from_raw_parts`; nothing here
// checks that the bytes are valid UTF-8, so the caller must guarantee it.
String {
|
|
|
|
vec: Vec::from_raw_parts(buf, length, capacity),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Converts a vector of bytes to a new `String` without checking if
|
|
|
|
/// it contains valid UTF-8. This is unsafe because it assumes that
|
|
|
|
/// the UTF-8-ness of the vector has already been validated.
|
|
|
|
#[inline]
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
2014-11-20 10:11:15 -08:00
|
|
|
pub unsafe fn from_utf8_unchecked(bytes: Vec<u8>) -> String {
|
|
|
|
// `bytes` is moved in as the backing buffer as-is; no validation is performed.
String { vec: bytes }
|
|
|
|
}
|
|
|
|
|
2014-04-10 20:55:34 +10:00
|
|
|
/// Return the underlying byte buffer, encoded as UTF-8.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::from_str("hello");
|
|
|
|
/// let bytes = s.into_bytes();
|
|
|
|
/// assert_eq!(bytes, vec![104, 101, 108, 108, 111]);
|
|
|
|
/// ```
|
2014-04-10 20:55:34 +10:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-04-10 20:55:34 +10:00
|
|
|
pub fn into_bytes(self) -> Vec<u8> {
|
|
|
|
// Consumes `self` and returns the backing byte vector directly.
self.vec
|
|
|
|
}
|
|
|
|
|
2014-04-02 16:54:22 -07:00
|
|
|
/// Pushes the given string onto this string buffer.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
|
|
|
/// s.push_str("bar");
|
|
|
|
/// assert_eq!(s.as_slice(), "foobar");
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
2014-04-02 16:54:22 -07:00
|
|
|
pub fn push_str(&mut self, string: &str) {
|
|
|
|
// Appends the raw bytes of `string` to the backing vector.
self.vec.push_all(string.as_bytes())
|
|
|
|
}
|
|
|
|
|
2014-12-28 10:29:56 -08:00
|
|
|
/// Returns the number of bytes that this string buffer can hold without
|
|
|
|
/// reallocating.
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::with_capacity(10);
|
2014-09-22 08:28:35 -07:00
|
|
|
/// assert!(s.capacity() >= 10);
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
/// ```
|
|
|
|
#[inline]
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
pub fn capacity(&self) -> uint {
|
|
|
|
// Reports the capacity of the backing `Vec<u8>`, so the value is in bytes.
self.vec.capacity()
|
|
|
|
}
|
|
|
|
|
2014-12-28 10:29:56 -08:00
|
|
|
/// Reserves capacity for at least `additional` more bytes to be inserted
|
|
|
|
/// in the given `String`. The collection may reserve more space to avoid
|
|
|
|
/// frequent reallocations.
|
2014-11-06 12:24:47 -05:00
|
|
|
///
|
|
|
|
/// # Panics
|
|
|
|
///
|
|
|
|
/// Panics if the new capacity overflows `uint`.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::new();
|
|
|
|
/// s.reserve(10);
|
2014-09-22 08:28:35 -07:00
|
|
|
/// assert!(s.capacity() >= 10);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
2014-11-06 12:24:47 -05:00
|
|
|
pub fn reserve(&mut self, additional: uint) {
|
|
|
|
// Forwards to `reserve` on the backing byte vector.
self.vec.reserve(additional)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
2014-12-28 10:29:56 -08:00
|
|
|
/// Reserves the minimum capacity for exactly `additional` more bytes to be
|
|
|
|
/// inserted in the given `String`. Does nothing if the capacity is already
|
|
|
|
/// sufficient.
|
2014-11-06 12:24:47 -05:00
|
|
|
///
|
2014-12-28 10:29:56 -08:00
|
|
|
/// Note that the allocator may give the collection more space than it
|
|
|
|
/// requests. Therefore capacity can not be relied upon to be precisely
|
|
|
|
/// minimal. Prefer `reserve` if future insertions are expected.
|
2014-11-06 12:24:47 -05:00
|
|
|
///
|
|
|
|
/// # Panics
|
|
|
|
///
|
|
|
|
/// Panics if the new capacity overflows `uint`.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::new();
|
2014-11-06 12:24:47 -05:00
|
|
|
/// s.reserve_exact(10);
|
|
|
|
/// assert!(s.capacity() >= 10);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
2014-11-06 12:24:47 -05:00
|
|
|
pub fn reserve_exact(&mut self, additional: uint) {
|
|
|
|
// Forwards to `reserve_exact` on the backing byte vector.
self.vec.reserve_exact(additional)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Shrinks the capacity of this string buffer to match its length.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
|
|
|
/// s.reserve(100);
|
2014-09-22 08:28:35 -07:00
|
|
|
/// assert!(s.capacity() >= 100);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// s.shrink_to_fit();
|
2014-09-22 08:28:35 -07:00
|
|
|
/// assert_eq!(s.capacity(), 3);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
2014-04-02 16:54:22 -07:00
|
|
|
pub fn shrink_to_fit(&mut self) {
|
|
|
|
// Forwards to `shrink_to_fit` on the backing byte vector.
self.vec.shrink_to_fit()
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Adds the given character to the end of the string.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("abc");
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
/// s.push('1');
|
|
|
|
/// s.push('2');
|
|
|
|
/// s.push('3');
|
2014-07-27 12:40:39 +02:00
|
|
|
/// assert_eq!(s.as_slice(), "abc123");
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
pub fn push(&mut self, ch: char) {
|
2014-12-20 17:17:58 +00:00
|
|
|
if (ch as u32) < 0x80 {
|
|
|
|
self.vec.push(ch as u8);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2014-04-02 16:54:22 -07:00
|
|
|
let cur_len = self.len();
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` contructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
// This may use up to 4 bytes.
|
2014-11-06 12:24:47 -05:00
|
|
|
self.vec.reserve(4);
|
2014-04-02 16:54:22 -07:00
|
|
|
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` contructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
unsafe {
|
2014-04-02 16:54:22 -07:00
|
|
|
// Attempt to not use an intermediate buffer by just pushing bytes
|
|
|
|
// directly onto this string.
|
2014-08-06 20:03:55 -07:00
|
|
|
let slice = RawSlice {
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` contructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
data: self.vec.as_ptr().offset(cur_len as int),
|
|
|
|
len: 4,
|
|
|
|
};
|
2014-08-14 01:02:31 +02:00
|
|
|
let used = ch.encode_utf8(mem::transmute(slice)).unwrap_or(0);
|
2014-04-02 16:54:22 -07:00
|
|
|
self.vec.set_len(cur_len + used);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Works with the underlying buffer as a byte slice.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::from_str("hello");
|
2014-08-06 11:59:40 +02:00
|
|
|
/// let b: &[_] = &[104, 101, 108, 108, 111];
|
|
|
|
/// assert_eq!(s.as_bytes(), b);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-04-02 16:54:22 -07:00
|
|
|
pub fn as_bytes<'a>(&'a self) -> &'a [u8] {
|
|
|
|
self.vec.as_slice()
|
|
|
|
}
|
|
|
|
|
2014-08-04 22:48:39 +12:00
|
|
|
/// Shortens a string to the specified length.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-11-12 03:36:09 +09:00
|
|
|
/// # Panics
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-11-12 03:36:09 +09:00
|
|
|
/// Panics if `new_len` > current length,
|
2014-10-05 12:15:59 +01:00
|
|
|
/// or if `new_len` is not a character boundary.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("hello");
|
|
|
|
/// s.truncate(2);
|
|
|
|
/// assert_eq!(s.as_slice(), "he");
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
2014-10-05 12:15:59 +01:00
|
|
|
pub fn truncate(&mut self, new_len: uint) {
    // Panic before touching the buffer if `is_char_boundary` rejects the
    // requested index.
    assert!(self.is_char_boundary(new_len));

    let bytes = &mut self.vec;
    bytes.truncate(new_len);
}
|
|
|
|
|
2014-07-27 12:40:39 +02:00
|
|
|
/// Removes the last character from the string buffer and returns it.
|
|
|
|
/// Returns `None` if this string buffer is empty.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
/// assert_eq!(s.pop(), Some('o'));
|
|
|
|
/// assert_eq!(s.pop(), Some('o'));
|
|
|
|
/// assert_eq!(s.pop(), Some('f'));
|
|
|
|
/// assert_eq!(s.pop(), None);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-05-08 21:42:40 +01:00
|
|
|
#[inline]
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
pub fn pop(&mut self) -> Option<char> {
|
2014-05-08 21:42:40 +01:00
|
|
|
let len = self.len();
|
|
|
|
if len == 0 {
|
|
|
|
return None
|
|
|
|
}
|
|
|
|
|
2014-11-27 11:45:50 -05:00
|
|
|
let CharRange {ch, next} = self.char_range_at_reverse(len);
|
2014-05-08 21:42:40 +01:00
|
|
|
unsafe {
|
|
|
|
self.vec.set_len(next);
|
|
|
|
}
|
|
|
|
Some(ch)
|
|
|
|
}
|
|
|
|
|
2014-09-22 08:24:14 -07:00
|
|
|
/// Removes the character from the string buffer at byte position `idx` and
|
2014-12-28 10:29:56 -08:00
|
|
|
/// returns it.
|
2014-05-08 21:42:40 +01:00
|
|
|
///
|
|
|
|
/// # Warning
|
|
|
|
///
|
2014-11-16 08:28:13 -08:00
|
|
|
/// This is an O(n) operation as it requires copying every element in the
|
2014-09-22 08:24:14 -07:00
|
|
|
/// buffer.
|
|
|
|
///
|
2014-10-09 15:17:22 -04:00
|
|
|
/// # Panics
|
2014-09-22 08:24:14 -07:00
|
|
|
///
|
2014-12-28 10:29:56 -08:00
|
|
|
/// If `idx` does not lie on a character boundary, or if it is out of
|
|
|
|
/// bounds, then this function will panic.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
2014-12-28 10:29:56 -08:00
|
|
|
/// assert_eq!(s.remove(0), 'f');
|
|
|
|
/// assert_eq!(s.remove(1), 'o');
|
|
|
|
/// assert_eq!(s.remove(0), 'o');
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
|
|
|
pub fn remove(&mut self, idx: uint) -> char {
|
2014-04-02 16:54:22 -07:00
|
|
|
let len = self.len();
|
2014-12-28 10:29:56 -08:00
|
|
|
assert!(idx <= len);
|
2014-04-02 16:54:22 -07:00
|
|
|
|
2014-11-27 11:45:50 -05:00
|
|
|
let CharRange { ch, next } = self.char_range_at(idx);
|
2014-05-08 21:42:40 +01:00
|
|
|
unsafe {
|
2014-09-22 08:24:14 -07:00
|
|
|
ptr::copy_memory(self.vec.as_mut_ptr().offset(idx as int),
|
|
|
|
self.vec.as_ptr().offset(next as int),
|
|
|
|
len - next);
|
|
|
|
self.vec.set_len(len - (next - idx));
|
2014-05-08 21:42:40 +01:00
|
|
|
}
|
2014-12-28 10:29:56 -08:00
|
|
|
ch
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
2014-04-12 22:44:31 +10:00
|
|
|
|
2014-09-22 08:24:14 -07:00
|
|
|
/// Insert a character into the string buffer at byte position `idx`.
|
|
|
|
///
|
|
|
|
/// # Warning
|
|
|
|
///
|
2014-11-16 08:28:13 -08:00
|
|
|
/// This is an O(n) operation as it requires copying every element in the
|
2014-09-22 08:24:14 -07:00
|
|
|
/// buffer.
|
|
|
|
///
|
2014-10-09 15:17:22 -04:00
|
|
|
/// # Panics
|
2014-09-22 08:24:14 -07:00
|
|
|
///
|
|
|
|
/// If `idx` does not lie on a character boundary or is out of bounds, then
|
2014-10-09 15:17:22 -04:00
|
|
|
/// this function will panic.
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
2014-09-22 08:24:14 -07:00
|
|
|
pub fn insert(&mut self, idx: uint, ch: char) {
|
|
|
|
let len = self.len();
|
|
|
|
assert!(idx <= len);
|
2014-11-27 11:45:50 -05:00
|
|
|
assert!(self.is_char_boundary(idx));
|
2014-11-06 12:24:47 -05:00
|
|
|
self.vec.reserve(4);
|
2014-12-30 21:19:41 +13:00
|
|
|
let mut bits = [0; 4];
|
2014-11-17 21:39:01 +13:00
|
|
|
let amt = ch.encode_utf8(&mut bits).unwrap();
|
2014-09-22 08:24:14 -07:00
|
|
|
|
|
|
|
unsafe {
|
|
|
|
ptr::copy_memory(self.vec.as_mut_ptr().offset((idx + amt) as int),
|
|
|
|
self.vec.as_ptr().offset(idx as int),
|
|
|
|
len - idx);
|
|
|
|
ptr::copy_memory(self.vec.as_mut_ptr().offset(idx as int),
|
|
|
|
bits.as_ptr(),
|
|
|
|
amt);
|
|
|
|
self.vec.set_len(len + amt);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-12 22:44:31 +10:00
|
|
|
/// Views the string buffer as a mutable sequence of bytes.
|
|
|
|
///
|
2014-07-27 12:40:39 +02:00
|
|
|
/// This is unsafe because it does not check
|
|
|
|
/// to ensure that the resulting string will be valid UTF-8.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("hello");
|
|
|
|
/// unsafe {
|
|
|
|
/// let vec = s.as_mut_vec();
|
|
|
|
/// assert!(vec == &mut vec![104, 101, 108, 108, 111]);
|
|
|
|
/// vec.reverse();
|
|
|
|
/// }
|
|
|
|
/// assert_eq!(s.as_slice(), "olleh");
|
|
|
|
/// ```
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
2014-04-12 22:44:31 +10:00
|
|
|
pub unsafe fn as_mut_vec<'a>(&'a mut self) -> &'a mut Vec<u8> {
|
|
|
|
&mut self.vec
|
|
|
|
}
|
2014-04-02 16:54:22 -07:00
|
|
|
|
2014-10-30 13:43:24 -07:00
|
|
|
/// Return the number of bytes in this string.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-10-30 13:43:24 -07:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let a = "foo".to_string();
|
|
|
|
/// assert_eq!(a.len(), 3);
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-10-30 13:43:24 -07:00
|
|
|
pub fn len(&self) -> uint {
    // The length is that of the underlying byte vector.
    let byte_len = self.vec.len();
    byte_len
}
|
2014-04-02 16:54:22 -07:00
|
|
|
|
2014-10-30 13:43:24 -07:00
|
|
|
/// Returns true if the string contains no bytes
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-10-30 13:43:24 -07:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut v = String::new();
|
|
|
|
/// assert!(v.is_empty());
|
|
|
|
/// v.push('a');
|
|
|
|
/// assert!(!v.is_empty());
|
|
|
|
/// ```
|
2014-12-28 10:29:56 -08:00
|
|
|
#[stable]
|
2014-10-30 13:43:24 -07:00
|
|
|
pub fn is_empty(&self) -> bool {
    // Empty means a byte length of zero.
    let byte_len = self.len();
    byte_len == 0
}
|
|
|
|
|
|
|
|
/// Truncates the string, returning it to 0 length.
|
|
|
|
///
|
2014-12-08 23:28:07 -06:00
|
|
|
/// # Examples
|
2014-10-30 13:43:24 -07:00
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = "foo".to_string();
|
|
|
|
/// s.clear();
|
|
|
|
/// assert!(s.is_empty());
|
|
|
|
/// ```
|
2014-05-11 03:49:09 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-10-30 13:43:24 -07:00
|
|
|
pub fn clear(&mut self) {
|
2014-05-11 03:49:09 -07:00
|
|
|
self.vec.clear()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-12-28 10:29:56 -08:00
|
|
|
impl FromUtf8Error {
|
|
|
|
/// Consume this error, returning the bytes that were attempted to make a
|
|
|
|
/// `String` with.
|
|
|
|
#[stable]
|
|
|
|
pub fn into_bytes(self) -> Vec<u8> { self.bytes }
|
|
|
|
|
|
|
|
/// Access the underlying UTF8-error that was the cause of this error.
|
|
|
|
#[stable]
|
|
|
|
pub fn utf8_error(&self) -> Utf8Error { self.error }
|
|
|
|
}
|
|
|
|
|
|
|
|
impl fmt::Show for FromUtf8Error {
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
2014-12-20 00:09:35 -08:00
|
|
|
fmt::String::fmt(self, f)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-07 14:58:31 -08:00
|
|
|
#[stable]
|
2014-12-20 00:09:35 -08:00
|
|
|
impl fmt::String for FromUtf8Error {
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
|
|
fmt::String::fmt(&self.error, f)
|
2014-12-28 10:29:56 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl fmt::Show for FromUtf16Error {
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
2014-12-20 00:09:35 -08:00
|
|
|
fmt::String::fmt(self, f)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-07 14:58:31 -08:00
|
|
|
#[stable]
|
2014-12-20 00:09:35 -08:00
|
|
|
impl fmt::String for FromUtf16Error {
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
|
|
fmt::String::fmt("invalid utf-16: lone surrogate found", f)
|
2014-12-28 10:29:56 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-04 16:16:55 -08:00
|
|
|
#[stable]
|
2014-05-22 16:57:53 -07:00
|
|
|
impl FromIterator<char> for String {
|
2015-01-01 23:15:35 -05:00
|
|
|
fn from_iter<I:Iterator<Item=char>>(iterator: I) -> String {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut buf = String::new();
|
2014-04-02 16:54:22 -07:00
|
|
|
buf.extend(iterator);
|
|
|
|
buf
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-04 16:16:55 -08:00
|
|
|
#[stable]
|
2014-12-07 21:32:00 +01:00
|
|
|
impl<'a> FromIterator<&'a str> for String {
|
2015-01-01 23:15:35 -05:00
|
|
|
fn from_iter<I:Iterator<Item=&'a str>>(iterator: I) -> String {
|
2014-12-07 21:32:00 +01:00
|
|
|
let mut buf = String::new();
|
|
|
|
buf.extend(iterator);
|
|
|
|
buf
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-08 01:39:39 +01:00
|
|
|
#[experimental = "waiting on Extend stabilization"]
|
|
|
|
impl Extend<char> for String {
|
2015-01-01 23:15:35 -05:00
|
|
|
fn extend<I:Iterator<Item=char>>(&mut self, mut iterator: I) {
|
2014-12-07 21:31:24 +01:00
|
|
|
let (lower_bound, _) = iterator.size_hint();
|
|
|
|
self.reserve(lower_bound);
|
2014-04-02 16:54:22 -07:00
|
|
|
for ch in iterator {
|
2014-09-22 08:28:35 -07:00
|
|
|
self.push(ch)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-12-07 21:32:00 +01:00
|
|
|
#[experimental = "waiting on Extend stabilization"]
|
|
|
|
impl<'a> Extend<&'a str> for String {
|
2015-01-01 23:15:35 -05:00
|
|
|
fn extend<I: Iterator<Item=&'a str>>(&mut self, mut iterator: I) {
|
2014-12-07 21:32:00 +01:00
|
|
|
// A guess that at least one byte per iterator element will be needed.
|
|
|
|
let (lower_bound, _) = iterator.size_hint();
|
|
|
|
self.reserve(lower_bound);
|
|
|
|
for s in iterator {
|
|
|
|
self.push_str(s)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-12-30 14:44:26 -08:00
|
|
|
#[stable]
|
2014-11-21 00:14:05 -05:00
|
|
|
impl PartialEq for String {
|
|
|
|
#[inline]
|
|
|
|
fn eq(&self, other: &String) -> bool { PartialEq::eq(&**self, &**other) }
|
|
|
|
#[inline]
|
|
|
|
fn ne(&self, other: &String) -> bool { PartialEq::ne(&**self, &**other) }
|
|
|
|
}
|
|
|
|
|
|
|
|
// Generates the two mirrored `PartialEq` impls (`$lhs == $rhs` and
// `$rhs == $lhs`) for a pair of types. Each operand is double-dereferenced
// and re-borrowed before the comparison is forwarded to `PartialEq`.
// The lifetime `'a` is available to the type arguments.
macro_rules! impl_eq {
    ($lhs:ty, $rhs: ty) => {
        #[stable]
        impl<'a> PartialEq<$rhs> for $lhs {
            #[inline]
            fn eq(&self, other: &$rhs) -> bool {
                let a = &**self;
                let b = &**other;
                PartialEq::eq(a, b)
            }
            #[inline]
            fn ne(&self, other: &$rhs) -> bool {
                let a = &**self;
                let b = &**other;
                PartialEq::ne(a, b)
            }
        }

        #[stable]
        impl<'a> PartialEq<$lhs> for $rhs {
            #[inline]
            fn eq(&self, other: &$lhs) -> bool {
                let a = &**self;
                let b = &**other;
                PartialEq::eq(a, b)
            }
            #[inline]
            fn ne(&self, other: &$lhs) -> bool {
                let a = &**self;
                let b = &**other;
                PartialEq::ne(a, b)
            }
        }

    }
}
|
|
|
|
|
2014-11-14 09:18:10 -08:00
|
|
|
// `String == &'a str` and `&'a str == String`.
impl_eq! { String, &'a str }
|
|
|
|
// `CowString<'a> == String` and `String == CowString<'a>`.
impl_eq! { CowString<'a>, String }
|
2014-11-21 00:14:05 -05:00
|
|
|
|
2014-12-30 14:44:26 -08:00
|
|
|
#[stable]
|
2014-11-21 00:14:05 -05:00
|
|
|
impl<'a, 'b> PartialEq<&'b str> for CowString<'a> {
|
|
|
|
#[inline]
|
|
|
|
fn eq(&self, other: &&'b str) -> bool { PartialEq::eq(&**self, &**other) }
|
|
|
|
#[inline]
|
|
|
|
fn ne(&self, other: &&'b str) -> bool { PartialEq::ne(&**self, &**other) }
|
|
|
|
}
|
|
|
|
|
2014-12-30 14:44:26 -08:00
|
|
|
#[stable]
|
2014-11-21 00:14:05 -05:00
|
|
|
impl<'a, 'b> PartialEq<CowString<'a>> for &'b str {
    // Mirror image of `CowString<'a> == &'b str`, again with independent
    // lifetimes on the two sides.
    #[inline]
    fn eq(&self, other: &CowString<'a>) -> bool {
        let lhs = &**self;
        let rhs = &**other;
        PartialEq::eq(lhs, rhs)
    }
    #[inline]
    fn ne(&self, other: &CowString<'a>) -> bool {
        let lhs = &**self;
        let rhs = &**other;
        PartialEq::ne(lhs, rhs)
    }
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[experimental = "waiting on Str stabilization"]
|
2014-05-22 16:57:53 -07:00
|
|
|
impl Str for String {
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-04-02 16:54:22 -07:00
|
|
|
fn as_slice<'a>(&'a self) -> &'a str {
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the return type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
unsafe { mem::transmute(self.vec.as_slice()) }
|
2014-05-19 17:23:26 -07:00
|
|
|
}
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
/// `Default` implementation for `String`.
#[stable]
|
2014-05-22 16:57:53 -07:00
|
|
|
impl Default for String {
|
2014-12-15 20:04:52 -08:00
|
|
|
/// Returns the value produced by `String::new()`.
#[stable]
|
2014-05-22 16:57:53 -07:00
|
|
|
fn default() -> String {
|
|
|
|
String::new()
|
2014-05-19 23:19:56 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-07 14:58:31 -08:00
|
|
|
/// `fmt::String` formatting for `String`.
#[stable]
|
2014-12-20 00:09:35 -08:00
|
|
|
impl fmt::String for String {
|
|
|
|
/// Writes `self` to the formatter `f` by forwarding to the `fmt::String`
/// implementation of the value behind `&**self`.
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
|
|
// `&**self` dereferences `&String` twice and re-borrows; presumably this
// yields the underlying `str` via `Deref` -- TODO confirm against the
// `Deref` impl elsewhere in this file.
fmt::String::fmt(&**self, f)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// `fmt::Show` formatting for `String`.
#[experimental = "waiting on fmt stabilization"]
|
2014-05-22 16:57:53 -07:00
|
|
|
impl fmt::Show for String {
|
2014-04-02 16:54:22 -07:00
|
|
|
/// Writes `self` to the formatter `f` by forwarding to the `fmt::Show`
/// implementation of the value behind `&**self`.
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
2014-12-20 00:09:35 -08:00
|
|
|
// Same delegation shape as the `fmt::String` impl: dereference twice and
// re-borrow (presumably to the underlying `str` -- TODO confirm).
fmt::Show::fmt(&**self, f)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] because it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method has been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
thought as to whether it belongs as a first-class method or
not (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[experimental = "waiting on Hash stabilization"]
|
std: Stabilize the std::hash module
This commit aims to prepare the `std::hash` module for alpha by formalizing its
current interface while holding off on adding `#[stable]` to the new APIs. The
current usage with the `HashMap` and `HashSet` types is also reconciled by
separating out composable parts of the design. The primary goal of this slight
redesign is to separate the concepts of a hasher's state from a hashing
algorithm itself.
The primary change of this commit is to separate the `Hasher` trait into a
`Hasher` and a `HashState` trait. Conceptually the old `Hasher` trait was
actually just a factory for various states, but hashing had very little control
over how these states were used. Additionally the old `Hasher` trait was
actually fairly unrelated to hashing.
This commit redesigns the existing `Hasher` trait to match what the notion of a
`Hasher` normally implies with the following definition:
trait Hasher {
type Output;
fn reset(&mut self);
fn finish(&self) -> Output;
}
This `Hasher` trait emphasizes that hashing algorithms may produce outputs other
than a `u64`, so the output type is made generic. Other than that, however, very
little is assumed about a particular hasher. It is left up to implementors to
provide specific methods or trait implementations to feed data into a hasher.
The corresponding `Hash` trait becomes:
trait Hash<H: Hasher> {
fn hash(&self, &mut H);
}
The old default of `SipState` was removed from this trait as it's not something
that we're willing to stabilize until the end of time, but the type parameter is
always required to implement `Hasher`. Note that the type parameter `H` remains
on the trait to enable multidispatch for specialization of hashing for
particular hashers.
Note that `Writer` is not mentioned in either of `Hash` or `Hasher`, it is
simply used as part of `derive` and the implementations for all primitive types.
With these definitions, the old `Hasher` trait is realized as a new `HashState`
trait in the `collections::hash_state` module as an unstable addition for
now. The current definition looks like:
trait HashState {
type Hasher: Hasher;
fn hasher(&self) -> Hasher;
}
The purpose of this trait is to emphasize that the one piece of functionality
for implementors is that new instances of `Hasher` can be created. This
conceptually represents the two keys from which more instances of a
`SipHasher` can be created, and a `HashState` is what's stored in a
`HashMap`, not a `Hasher`.
Implementors of custom hash algorithms should implement the `Hasher` trait, and
only hash algorithms intended for use in hash maps need to implement or worry
about the `HashState` trait.
The entire module and `HashState` infrastructure remains `#[unstable]` due to it
being recently redesigned, but some other stability decisions made for the
`std::hash` module are:
* The `Writer` trait remains `#[experimental]` as it's intended to be replaced
with an `io::Writer` (more details soon).
* The top-level `hash` function is `#[unstable]` as it is intended to be generic
over the hashing algorithm instead of hardwired to `SipHasher`
* The inner `sip` module is now private as its one export, `SipHasher` is
reexported in the `hash` module.
And finally, a few changes were made to the default parameters on `HashMap`.
* The `RandomSipHasher` default type parameter was renamed to `RandomState`.
This renaming emphasizes that it is not a hasher, but rather just state to
generate hashers. It also moves away from the name "sip" as it may not always
be implemented as `SipHasher`. This type lives in the
`std::collections::hash_map` module as `#[unstable]`
* The associated `Hasher` type of `RandomState` is creatively called...
`Hasher`! This concrete structure lives next to `RandomState` as an
implementation of the "default hashing algorithm" used for a `HashMap`. Under
the hood this is currently implemented as `SipHasher`, but it draws an
explicit interface for now and allows us to modify the implementation over
time if necessary.
There are many breaking changes outlined above, and as a result this commit is
a:
[breaking-change]
2014-12-09 12:37:23 -08:00
|
|
|
#[cfg(stage0)]
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` constructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
impl<H: hash::Writer> hash::Hash<H> for String {
|
std: Stabilize the std::hash module
This commit aims to prepare the `std::hash` module for alpha by formalizing its
current interface while holding off on adding `#[stable]` to the new APIs. The
current usage with the `HashMap` and `HashSet` types is also reconciled by
separating out composable parts of the design. The primary goal of this slight
redesign is to separate the concepts of a hasher's state from a hashing
algorithm itself.
The primary change of this commit is to separate the `Hasher` trait into a
`Hasher` and a `HashState` trait. Conceptually the old `Hasher` trait was
actually just a factory for various states, but hashing had very little control
over how these states were used. Additionally the old `Hasher` trait was
actually fairly unrelated to hashing.
This commit redesigns the existing `Hasher` trait to match what the notion of a
`Hasher` normally implies with the following definition:
trait Hasher {
type Output;
fn reset(&mut self);
fn finish(&self) -> Output;
}
This `Hasher` trait emphasizes that hashing algorithms may produce outputs other
than a `u64`, so the output type is made generic. Other than that, however, very
little is assumed about a particular hasher. It is left up to implementors to
provide specific methods or trait implementations to feed data into a hasher.
The corresponding `Hash` trait becomes:
trait Hash<H: Hasher> {
fn hash(&self, &mut H);
}
The old default of `SipState` was removed from this trait as it's not something
that we're willing to stabilize until the end of time, but the type parameter is
always required to implement `Hasher`. Note that the type parameter `H` remains
on the trait to enable multidispatch for specialization of hashing for
particular hashers.
Note that `Writer` is not mentioned in either of `Hash` or `Hasher`, it is
simply used as part of `derive` and the implementations for all primitive types.
With these definitions, the old `Hasher` trait is realized as a new `HashState`
trait in the `collections::hash_state` module as an unstable addition for
now. The current definition looks like:
trait HashState {
type Hasher: Hasher;
fn hasher(&self) -> Hasher;
}
The purpose of this trait is to emphasize that the one piece of functionality
for implementors is that new instances of `Hasher` can be created. This
conceptually represents the two keys from which more instances of a
`SipHasher` can be created, and a `HashState` is what's stored in a
`HashMap`, not a `Hasher`.
Implementors of custom hash algorithms should implement the `Hasher` trait, and
only hash algorithms intended for use in hash maps need to implement or worry
about the `HashState` trait.
The entire module and `HashState` infrastructure remains `#[unstable]` due to it
being recently redesigned, but some other stability decisions made for the
`std::hash` module are:
* The `Writer` trait remains `#[experimental]` as it's intended to be replaced
with an `io::Writer` (more details soon).
* The top-level `hash` function is `#[unstable]` as it is intended to be generic
over the hashing algorithm instead of hardwired to `SipHasher`
* The inner `sip` module is now private as its one export, `SipHasher` is
reexported in the `hash` module.
And finally, a few changes were made to the default parameters on `HashMap`.
* The `RandomSipHasher` default type parameter was renamed to `RandomState`.
This renaming emphasizes that it is not a hasher, but rather just state to
generate hashers. It also moves away from the name "sip" as it may not always
be implemented as `SipHasher`. This type lives in the
`std::collections::hash_map` module as `#[unstable]`
* The associated `Hasher` type of `RandomState` is creatively called...
`Hasher`! This concrete structure lives next to `RandomState` as an
implementation of the "default hashing algorithm" used for a `HashMap`. Under
the hood this is currently implemented as `SipHasher`, but it draws an
explicit interface for now and allows us to modify the implementation over
time if necessary.
There are many breaking changes outlined above, and as a result this commit is
a:
[breaking-change]
2014-12-09 12:37:23 -08:00
|
|
|
#[inline]
|
|
|
|
fn hash(&self, hasher: &mut H) {
|
|
|
|
(**self).hash(hasher)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Hashing support for `String`, generic over any hasher `H` that is both a
// `hash::Writer` and a `hash::Hasher`. Only compiled when the `stage0` cfg is
// not set.
#[experimental = "waiting on Hash stabilization"]
|
|
|
|
#[cfg(not(stage0))]
|
|
|
|
impl<H: hash::Writer + hash::Hasher> hash::Hash<H> for String {
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
|
|
|
fn hash(&self, hasher: &mut H) {
|
2014-12-10 19:46:38 -08:00
|
|
|
// `self` is `&String`; the double dereference goes through
// `Deref for String` (Target = str), so the hashing is forwarded to the
// `str` implementation rather than being re-implemented here.
(**self).hash(hasher)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-04 16:16:55 -08:00
|
|
|
// `String + &str`: takes the left-hand `String` by value, appends the
// right-hand slice to it, and hands the same `String` back as the result.
#[unstable = "recent addition, needs more experience"]
|
2014-12-31 15:45:13 -05:00
|
|
|
impl<'a> Add<&'a str> for String {
|
|
|
|
type Output = String;
|
|
|
|
|
2014-12-01 14:08:56 -05:00
|
|
|
fn add(mut self, other: &str) -> String {
|
|
|
|
// `self` is owned (and bound mutably), so it can be extended directly.
self.push_str(other);
|
|
|
|
self
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-04 17:43:24 +13:00
|
|
|
// Slicing a `String` with a `start..end` range; the result is a `str`.
impl ops::Index<ops::Range<uint>> for String {
|
|
|
|
type Output = str;
|
2014-10-05 11:59:10 +13:00
|
|
|
#[inline]
|
2015-01-02 13:56:28 +13:00
|
|
|
fn index(&self, index: &ops::Range<uint>) -> &str {
|
2015-01-07 11:58:31 -05:00
|
|
|
// Two steps: `self[]` takes the full-range slice (a `&str`, see the
// `FullRange` impl in this file), then the range is applied to that
// `str`. NOTE(review): range validation is presumably done by the
// `str` indexing impl, which is not visible here -- confirm.
&self[][*index]
|
2014-10-05 11:59:10 +13:00
|
|
|
}
|
2014-12-31 20:20:40 +13:00
|
|
|
}
|
2015-01-04 17:43:24 +13:00
|
|
|
// Slicing a `String` with a `..end` range; the result is a `str`.
impl ops::Index<ops::RangeTo<uint>> for String {
|
|
|
|
type Output = str;
|
2014-10-05 11:59:10 +13:00
|
|
|
#[inline]
|
2015-01-02 13:56:28 +13:00
|
|
|
fn index(&self, index: &ops::RangeTo<uint>) -> &str {
|
2015-01-07 11:58:31 -05:00
|
|
|
// Take the full-range `&str` view first (`self[]`), then index that
// `str` with the caller's range. NOTE(review): range validation is
// presumably done by the `str` indexing impl, not visible here.
&self[][*index]
|
2014-10-05 11:59:10 +13:00
|
|
|
}
|
2014-12-31 20:20:40 +13:00
|
|
|
}
|
2015-01-04 17:43:24 +13:00
|
|
|
// Slicing a `String` with a `start..` range; the result is a `str`.
impl ops::Index<ops::RangeFrom<uint>> for String {
|
|
|
|
type Output = str;
|
2014-10-05 11:59:10 +13:00
|
|
|
#[inline]
|
2015-01-02 13:56:28 +13:00
|
|
|
fn index(&self, index: &ops::RangeFrom<uint>) -> &str {
|
2015-01-07 11:58:31 -05:00
|
|
|
// Take the full-range `&str` view first (`self[]`), then index that
// `str` with the caller's range. NOTE(review): range validation is
// presumably done by the `str` indexing impl, not visible here.
&self[][*index]
|
2014-10-05 11:59:10 +13:00
|
|
|
}
|
2014-12-31 20:20:40 +13:00
|
|
|
}
|
2015-01-04 17:43:24 +13:00
|
|
|
// Full-range slicing (`s[]`): exposes the whole `String` as a `&str`. The
// range argument carries no information and is ignored.
impl ops::Index<ops::FullRange> for String {
|
|
|
|
type Output = str;
|
2014-10-05 11:59:10 +13:00
|
|
|
#[inline]
|
2015-01-02 13:56:28 +13:00
|
|
|
fn index(&self, _index: &ops::FullRange) -> &str {
|
2014-12-31 20:20:40 +13:00
|
|
|
// SAFETY: reinterprets the slice of the backing `vec` as a `str` without
// any check. This relies on the module-level invariant that a `String`'s
// contents are always valid UTF-8. NOTE(review): assumes `vec` holds raw
// bytes (`Vec<u8>`); the field declaration is not visible here -- confirm.
unsafe { mem::transmute(self.vec.as_slice()) }
|
2014-10-05 11:59:10 +13:00
|
|
|
}
|
|
|
|
}
|
2014-09-26 21:46:22 -07:00
|
|
|
|
2015-01-04 16:16:55 -08:00
|
|
|
#[stable]
|
2015-01-01 14:53:20 -05:00
|
|
|
impl ops::Deref for String {
|
|
|
|
type Target = str;
|
|
|
|
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
fn deref<'a>(&'a self) -> &'a str {
|
2015-01-07 11:58:31 -05:00
|
|
|
unsafe { mem::transmute(&self.vec[]) }
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
}
|
2014-10-29 15:26:29 -07:00
|
|
|
}
|
|
|
|
|
2014-08-23 20:26:53 -04:00
|
|
|
/// Wrapper type providing a `&String` reference via `Deref`.
///
/// Values are produced by `as_string`, which builds the wrapper from the
/// bytes of a borrowed `&str`.
|
|
|
|
#[experimental]
|
|
|
|
pub struct DerefString<'a> {
|
|
|
|
// Byte-vector wrapper that this type's `Deref` impl reinterprets as a
// `String`; `as_string` fills it via `as_vec(x.as_bytes())`.
x: DerefVec<'a, u8>
|
|
|
|
}
|
|
|
|
|
2015-01-01 14:53:20 -05:00
|
|
|
// Lets a `DerefString` be used wherever a `&String` is expected.
impl<'a> Deref for DerefString<'a> {
|
|
|
|
type Target = String;
|
|
|
|
|
2014-08-23 20:26:53 -04:00
|
|
|
fn deref<'b>(&'b self) -> &'b String {
|
|
|
|
// SAFETY: `&*self.x` is the deref target of the inner `DerefVec<'a, u8>`,
// and it is reinterpreted as a `&String` with no conversion.
// NOTE(review): this is only sound if `String` has the same layout as
// that target (presumably `Vec<u8>`) and the bytes are valid UTF-8; the
// only constructor visible here, `as_string`, takes them from a `&str`.
// Confirm against the `String` and `DerefVec` definitions.
unsafe { mem::transmute(&*self.x) }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Convert a string slice to a wrapper type providing a `&String` reference.
|
2014-12-07 17:47:00 -06:00
|
|
|
///
|
|
|
|
/// # Examples
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// use std::string::as_string;
|
|
|
|
///
|
|
|
|
/// fn string_consumer(s: String) {
|
|
|
|
/// assert_eq!(s, "foo".to_string());
|
|
|
|
/// }
|
|
|
|
///
|
|
|
|
/// let string = as_string("foo").clone();
|
|
|
|
/// string_consumer(string);
|
|
|
|
/// ```
|
2014-08-23 20:26:53 -04:00
|
|
|
#[experimental]
|
|
|
|
pub fn as_string<'a>(x: &'a str) -> DerefString<'a> {
|
|
|
|
DerefString { x: as_vec(x.as_bytes()) }
|
|
|
|
}
|
|
|
|
|
2014-11-15 15:52:00 +11:00
|
|
|
// Parsing a `String` out of a `&str`. The conversion is just a copy made by
// the inherent `String::from_str`, so the result is always `Some`.
impl FromStr for String {
|
|
|
|
#[inline]
|
|
|
|
fn from_str(s: &str) -> Option<String> {
|
|
|
|
// Never `None`: the `Option` exists only to satisfy the trait signature.
Some(String::from_str(s))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-16 12:38:03 +11:00
|
|
|
/// A generic trait for converting a value to a string
|
|
|
|
pub trait ToString {
|
|
|
|
/// Converts the value of `self` to an owned string
|
|
|
|
fn to_string(&self) -> String;
|
|
|
|
}
|
|
|
|
|
2014-12-20 00:09:35 -08:00
|
|
|
// Blanket implementation: every type that implements `fmt::String` gets
// `to_string` by formatting itself with `{}` into a fresh `String`.
impl<T: fmt::String> ToString for T {
|
|
|
|
fn to_string(&self) -> String {
|
|
|
|
// Brings `write_fmt` into scope for `String` (see the `fmt::Writer` impl
// for `String` in this file).
use core::fmt::Writer;
|
|
|
|
let mut buf = String::new();
|
|
|
|
// The formatting result is deliberately discarded. `String`'s own
// `write_str` in this file always returns `Ok(())`.
// NOTE(review): an error could presumably still come from the value's
// formatting impl, in which case the partial output is returned -- confirm
// that this is intended.
let _ = buf.write_fmt(format_args!("{}", self));
|
|
|
|
// Drop any spare capacity accumulated while appending.
buf.shrink_to_fit();
|
|
|
|
buf
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-21 17:10:42 -05:00
|
|
|
// An owned `String` converts into the `Owned` variant of a `CowString`; since
// nothing is borrowed, the result carries the `'static` lifetime.
impl IntoCow<'static, String, str> for String {
|
|
|
|
fn into_cow(self) -> CowString<'static> {
|
|
|
|
// Moves `self` into the `Cow` as-is.
Cow::Owned(self)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// A borrowed `&'a str` converts into the `Borrowed` variant of a `CowString`
// with the same lifetime `'a`.
impl<'a> IntoCow<'a, String, str> for &'a str {
|
|
|
|
fn into_cow(self) -> CowString<'a> {
|
|
|
|
// Wraps the slice itself; no `String` is created here.
Cow::Borrowed(self)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one
`StrExt` trait to be included in the prelude. This means that
`UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into
one `StrExt` exported by the standard library. Some functionality is currently
duplicated with the `StrExt` present in libcore.
This commit also currently avoids any methods which require any form of pattern
to operate. These functions will be stabilized via a separate RFC.
Next, stability of methods and structures are as follows:
Stable
* from_utf8_unchecked
* CowString - after moving to std::string
* StrExt::as_bytes
* StrExt::as_ptr
* StrExt::bytes/Bytes - also made a struct instead of a typedef
* StrExt::char_indices/CharIndices - CharOffsets was renamed
* StrExt::chars/Chars
* StrExt::is_empty
* StrExt::len
* StrExt::lines/Lines
* StrExt::lines_any/LinesAny
* StrExt::slice_unchecked
* StrExt::trim
* StrExt::trim_left
* StrExt::trim_right
* StrExt::words/Words - also made a struct instead of a typedef
Unstable
* from_utf8 - the error type was changed to a `Result`, but the error type has
yet to prove itself
* from_c_str - this function will be handled by the c_str RFC
* FromStr - this trait will have an associated error type eventually
* StrExt::escape_default - needs iterators at least, unsure if it should make
the cut
* StrExt::escape_unicode - needs iterators at least, unsure if it should make
the cut
* StrExt::slice_chars - this function has yet to prove itself
* StrExt::slice_shift_char - awaiting conventions about slicing and shifting
* StrExt::graphemes/Graphemes - this functionality may only be in libunicode
* StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in
libunicode
* StrExt::width - this functionality may only be in libunicode
* StrExt::utf16_units - this functionality may only be in libunicode
* StrExt::nfd_chars - this functionality may only be in libunicode
* StrExt::nfkd_chars - this functionality may only be in libunicode
* StrExt::nfc_chars - this functionality may only be in libunicode
* StrExt::nfkc_chars - this functionality may only be in libunicode
* StrExt::is_char_boundary - naming is uncertain with container conventions
* StrExt::char_range_at - naming is uncertain with container conventions
* StrExt::char_range_at_reverse - naming is uncertain with container conventions
* StrExt::char_at - naming is uncertain with container conventions
* StrExt::char_at_reverse - naming is uncertain with container conventions
* StrVector::concat - this functionality may be replaced with iterators, but
it's not certain at this time
* StrVector::connect - as with concat, may be deprecated in favor of iterators
Deprecated
* StrAllocating and UnicodeStrPrelude have been merged into StrExt
* eq_slice - compiler implementation detail
* from_str - use the inherent parse() method
* is_utf8 - call from_utf8 instead
* replace - call the method instead
* truncate_utf16_at_nul - this is an implementation detail of windows and does
not need to be exposed.
* utf8_char_width - moved to libunicode
* utf16_items - moved to libunicode
* is_utf16 - moved to libunicode
* Utf16Items - moved to libunicode
* Utf16Item - moved to libunicode
* Utf16Encoder - moved to libunicode
* AnyLines - renamed to LinesAny and made a struct
* SendStr - use CowString<'static> instead
* str::raw - all functionality is deprecated
* StrExt::into_string - call to_string() instead
* StrExt::repeat - use iterators instead
* StrExt::char_len - use .chars().count() instead
* StrExt::is_alphanumeric - use .chars().all(..)
* StrExt::is_whitespace - use .chars().all(..)
Pending deprecation -- while slicing syntax is being worked out, these methods
are all #[unstable]
* Str - while currently used for generic programming, this trait will be
replaced with one of [], deref coercions, or a generic conversion trait.
* StrExt::slice - use slicing syntax instead
* StrExt::slice_to - use slicing syntax instead
* StrExt::slice_from - use slicing syntax instead
* StrExt::lev_distance - deprecated with no replacement
Awaiting stabilization due to patterns and/or matching
* StrExt::contains
* StrExt::contains_char
* StrExt::split
* StrExt::splitn
* StrExt::split_terminator
* StrExt::rsplitn
* StrExt::match_indices
* StrExt::split_str
* StrExt::starts_with
* StrExt::ends_with
* StrExt::trim_chars
* StrExt::trim_left_chars
* StrExt::trim_right_chars
* StrExt::find
* StrExt::rfind
* StrExt::find_str
* StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
|
|
|
/// A clone-on-write string
|
|
|
|
#[stable]
|
|
|
|
pub type CowString<'a> = Cow<'a, String, str>;
|
|
|
|
|
|
|
|
// Lets a `CowString` be viewed as a `&str` through the `Str` trait.
impl<'a> Str for CowString<'a> {
|
|
|
|
#[inline]
|
|
|
|
fn as_slice<'b>(&'b self) -> &'b str {
|
|
|
|
// `self` is `&CowString`; the double dereference forwards to the
// `as_slice` of the dereferenced value.
// NOTE(review): presumably `Cow<'a, String, str>` derefs to `str`, so this
// works for both variants -- the `Cow` definition is not visible here.
(**self).as_slice()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-12-12 10:59:41 -08:00
|
|
|
// Makes `String` usable as a formatting sink: every chunk of text written to
// it is appended to the end of the string.
impl fmt::Writer for String {
|
|
|
|
fn write_str(&mut self, s: &str) -> fmt::Result {
|
|
|
|
self.push_str(s);
|
|
|
|
// Always reports success; this method has no error path.
Ok(())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-02 16:54:22 -07:00
|
|
|
#[cfg(test)]
|
|
|
|
mod tests {
|
2014-12-19 14:02:22 +02:00
|
|
|
use prelude::*;
|
2014-05-29 19:03:06 -07:00
|
|
|
use test::Bencher;
|
|
|
|
|
2014-12-29 16:38:07 -08:00
|
|
|
use str::Utf8Error;
|
2015-01-01 23:53:35 -08:00
|
|
|
use core::iter::repeat;
|
|
|
|
use super::{as_string, CowString};
|
2015-01-04 17:43:24 +13:00
|
|
|
use core::ops::FullRange;
|
2014-04-02 16:54:22 -07:00
|
|
|
|
2014-08-23 20:26:53 -04:00
|
|
|
#[test]
|
|
|
|
fn test_as_string() {
|
|
|
|
let x = "foo";
|
|
|
|
assert_eq!(x, as_string(x).as_slice());
|
|
|
|
}
|
|
|
|
|
2014-06-21 03:39:03 -07:00
|
|
|
#[test]
|
|
|
|
fn test_from_str() {
|
2015-01-01 23:53:35 -08:00
|
|
|
let owned: Option<::std::string::String> = "string".parse();
|
2014-06-21 03:39:03 -07:00
|
|
|
assert_eq!(owned.as_ref().map(|s| s.as_slice()), Some("string"));
|
|
|
|
}
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_from_utf8() {
|
2014-10-05 18:11:17 +08:00
|
|
|
let xs = b"hello".to_vec();
|
2014-12-28 10:29:56 -08:00
|
|
|
assert_eq!(String::from_utf8(xs).unwrap(),
|
|
|
|
String::from_str("hello"));
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-10-05 18:11:17 +08:00
|
|
|
let xs = "ศไทย中华Việt Nam".as_bytes().to_vec();
|
2014-12-28 10:29:56 -08:00
|
|
|
assert_eq!(String::from_utf8(xs).unwrap(),
|
|
|
|
String::from_str("ศไทย中华Việt Nam"));
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-10-05 18:11:17 +08:00
|
|
|
let xs = b"hello\xFF".to_vec();
|
2014-12-28 10:29:56 -08:00
|
|
|
let err = String::from_utf8(xs).err().unwrap();
|
|
|
|
assert_eq!(err.utf8_error(), Utf8Error::TooShort);
|
|
|
|
assert_eq!(err.into_bytes(), b"hello\xff".to_vec());
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_from_utf8_lossy() {
|
|
|
|
let xs = b"hello";
|
2015-01-01 23:53:35 -08:00
|
|
|
let ys: CowString = "hello".into_cow();
|
2014-11-21 01:20:04 -05:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs), ys);
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-07-04 22:38:13 +02:00
|
|
|
let xs = "ศไทย中华Việt Nam".as_bytes();
|
2015-01-01 23:53:35 -08:00
|
|
|
let ys: CowString = "ศไทย中华Việt Nam".into_cow();
|
2014-11-21 01:20:04 -05:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs), ys);
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
let xs = b"Hello\xC2 There\xFF Goodbye";
|
2014-07-04 22:38:13 +02:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs),
|
2014-12-09 14:08:10 -08:00
|
|
|
String::from_str("Hello\u{FFFD} There\u{FFFD} Goodbye").into_cow());
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
let xs = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
|
|
|
|
assert_eq!(String::from_utf8_lossy(xs),
|
2014-12-09 14:08:10 -08:00
|
|
|
String::from_str("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye").into_cow());
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
let xs = b"\xF5foo\xF5\x80bar";
|
2014-07-04 22:38:13 +02:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs),
|
2014-12-09 14:08:10 -08:00
|
|
|
String::from_str("\u{FFFD}foo\u{FFFD}\u{FFFD}bar").into_cow());
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
let xs = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
|
2014-07-04 22:38:13 +02:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs),
|
2014-12-09 14:08:10 -08:00
|
|
|
String::from_str("\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz").into_cow());
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
let xs = b"\xF4foo\xF4\x80bar\xF4\xBFbaz";
|
|
|
|
assert_eq!(String::from_utf8_lossy(xs),
|
2014-12-09 14:08:10 -08:00
|
|
|
String::from_str("\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz").into_cow());
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
let xs = b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar";
|
2014-12-09 14:08:10 -08:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs), String::from_str("\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\
|
|
|
|
foo\u{10000}bar").into_cow());
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
// surrogates
|
|
|
|
let xs = b"\xED\xA0\x80foo\xED\xBF\xBFbar";
|
2014-12-09 14:08:10 -08:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs), String::from_str("\u{FFFD}\u{FFFD}\u{FFFD}foo\
|
|
|
|
\u{FFFD}\u{FFFD}\u{FFFD}bar").into_cow());
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
|
|
|
|
2014-07-10 17:53:51 +02:00
|
|
|
#[test]
|
|
|
|
fn test_from_utf16() {
|
|
|
|
let pairs =
|
2014-07-04 22:38:13 +02:00
|
|
|
[(String::from_str("𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n"),
|
2014-07-10 17:53:51 +02:00
|
|
|
vec![0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
|
|
|
|
0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
|
|
|
|
0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
|
|
|
|
0xd800_u16, 0xdf30_u16, 0x000a_u16]),
|
|
|
|
|
2014-07-04 22:38:13 +02:00
|
|
|
(String::from_str("𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n"),
|
2014-07-10 17:53:51 +02:00
|
|
|
vec![0xd801_u16, 0xdc12_u16, 0xd801_u16,
|
|
|
|
0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
|
|
|
|
0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
|
|
|
|
0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
|
|
|
|
0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
|
|
|
|
0x000a_u16]),
|
|
|
|
|
2014-07-04 22:38:13 +02:00
|
|
|
(String::from_str("𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n"),
|
2014-07-10 17:53:51 +02:00
|
|
|
vec![0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
|
|
|
|
0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
|
|
|
|
0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
|
|
|
|
0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
|
|
|
|
0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
|
|
|
|
0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
|
|
|
|
0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
|
|
|
|
|
2014-07-04 22:38:13 +02:00
|
|
|
(String::from_str("𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n"),
|
2014-07-10 17:53:51 +02:00
|
|
|
vec![0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
|
|
|
|
0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
|
|
|
|
0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
|
|
|
|
0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
|
|
|
|
0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
|
|
|
|
0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
|
|
|
|
0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
|
|
|
|
0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
|
|
|
|
0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
|
|
|
|
0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
|
|
|
|
0x000a_u16 ]),
|
|
|
|
// Issue #12318, even-numbered non-BMP planes
|
2014-12-09 14:08:10 -08:00
|
|
|
(String::from_str("\u{20000}"),
|
2014-07-10 17:53:51 +02:00
|
|
|
vec![0xD840, 0xDC00])];
|
|
|
|
|
|
|
|
for p in pairs.iter() {
|
|
|
|
let (s, u) = (*p).clone();
|
2014-11-27 11:45:50 -05:00
|
|
|
let s_as_utf16 = s.utf16_units().collect::<Vec<u16>>();
|
2014-07-10 17:53:51 +02:00
|
|
|
let u_as_string = String::from_utf16(u.as_slice()).unwrap();
|
|
|
|
|
2014-12-10 19:46:38 -08:00
|
|
|
assert!(::unicode::str::is_utf16(u.as_slice()));
|
2014-07-10 17:53:51 +02:00
|
|
|
assert_eq!(s_as_utf16, u);
|
|
|
|
|
|
|
|
assert_eq!(u_as_string, s);
|
|
|
|
assert_eq!(String::from_utf16_lossy(u.as_slice()), s);
|
|
|
|
|
|
|
|
assert_eq!(String::from_utf16(s_as_utf16.as_slice()).unwrap(), s);
|
2014-11-27 11:45:50 -05:00
|
|
|
assert_eq!(u_as_string.utf16_units().collect::<Vec<u16>>(), u);
|
2014-07-10 17:53:51 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_utf16_invalid() {
|
|
|
|
// completely positive cases tested above.
|
|
|
|
// lead + eof
|
2014-12-28 10:29:56 -08:00
|
|
|
assert!(String::from_utf16(&[0xD800]).is_err());
|
2014-07-10 17:53:51 +02:00
|
|
|
// lead + lead
|
2014-12-28 10:29:56 -08:00
|
|
|
assert!(String::from_utf16(&[0xD800, 0xD800]).is_err());
|
2014-07-10 17:53:51 +02:00
|
|
|
|
|
|
|
// isolated trail
|
2014-12-28 10:29:56 -08:00
|
|
|
assert!(String::from_utf16(&[0x0061, 0xDC00]).is_err());
|
2014-07-10 17:53:51 +02:00
|
|
|
|
|
|
|
// general
|
2014-12-28 10:29:56 -08:00
|
|
|
assert!(String::from_utf16(&[0xD800, 0xd801, 0xdc8b, 0xD800]).is_err());
|
2014-07-10 17:53:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_from_utf16_lossy() {
|
|
|
|
// completely positive cases tested above.
|
|
|
|
// lead + eof
|
2014-12-09 14:08:10 -08:00
|
|
|
assert_eq!(String::from_utf16_lossy(&[0xD800]), String::from_str("\u{FFFD}"));
|
2014-07-10 17:53:51 +02:00
|
|
|
// lead + lead
|
2014-12-09 14:08:10 -08:00
|
|
|
assert_eq!(String::from_utf16_lossy(&[0xD800, 0xD800]),
|
|
|
|
String::from_str("\u{FFFD}\u{FFFD}"));
|
2014-07-10 17:53:51 +02:00
|
|
|
|
|
|
|
// isolated trail
|
2014-12-09 14:08:10 -08:00
|
|
|
assert_eq!(String::from_utf16_lossy(&[0x0061, 0xDC00]), String::from_str("a\u{FFFD}"));
|
2014-07-10 17:53:51 +02:00
|
|
|
|
|
|
|
// general
|
2014-11-17 21:39:01 +13:00
|
|
|
assert_eq!(String::from_utf16_lossy(&[0xD800, 0xd801, 0xdc8b, 0xD800]),
|
2014-12-09 14:08:10 -08:00
|
|
|
String::from_str("\u{FFFD}𐒋\u{FFFD}"));
|
2014-07-10 17:53:51 +02:00
|
|
|
}
|
2014-06-21 03:39:03 -07:00
|
|
|
|
2014-04-02 16:54:22 -07:00
|
|
|
#[test]
|
|
|
|
fn test_push_bytes() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut s = String::from_str("ABC");
|
2014-04-02 16:54:22 -07:00
|
|
|
unsafe {
|
2014-10-05 18:11:17 +08:00
|
|
|
let mv = s.as_mut_vec();
|
2014-11-17 21:39:01 +13:00
|
|
|
mv.push_all(&[b'D']);
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
2014-11-27 11:45:50 -05:00
|
|
|
assert_eq!(s, "ABCD");
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_push_str() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut s = String::new();
|
2014-04-02 16:54:22 -07:00
|
|
|
s.push_str("");
|
2014-11-27 11:45:50 -05:00
|
|
|
assert_eq!(s.slice_from(0), "");
|
2014-04-02 16:54:22 -07:00
|
|
|
s.push_str("abc");
|
2014-11-27 11:45:50 -05:00
|
|
|
assert_eq!(s.slice_from(0), "abc");
|
2014-04-02 16:54:22 -07:00
|
|
|
s.push_str("ประเทศไทย中华Việt Nam");
|
2014-11-27 11:45:50 -05:00
|
|
|
assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
2014-09-22 08:28:35 -07:00
|
|
|
fn test_push() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut data = String::from_str("ประเทศไทย中");
|
2014-09-22 08:28:35 -07:00
|
|
|
data.push('华');
|
|
|
|
data.push('b'); // 1 byte
|
|
|
|
data.push('¢'); // 2 byte
|
|
|
|
data.push('€'); // 3 byte
|
|
|
|
data.push('𤭢'); // 4 byte
|
2014-11-27 11:45:50 -05:00
|
|
|
assert_eq!(data, "ประเทศไทย中华b¢€𤭢");
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
2014-05-08 21:42:40 +01:00
|
|
|
#[test]
|
2014-10-05 18:11:17 +08:00
|
|
|
fn test_pop() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut data = String::from_str("ประเทศไทย中华b¢€𤭢");
|
2014-10-05 18:11:17 +08:00
|
|
|
assert_eq!(data.pop().unwrap(), '𤭢'); // 4 bytes
|
|
|
|
assert_eq!(data.pop().unwrap(), '€'); // 3 bytes
|
|
|
|
assert_eq!(data.pop().unwrap(), '¢'); // 2 bytes
|
|
|
|
assert_eq!(data.pop().unwrap(), 'b'); // 1 bytes
|
|
|
|
assert_eq!(data.pop().unwrap(), '华');
|
2014-11-27 11:45:50 -05:00
|
|
|
assert_eq!(data, "ประเทศไทย中");
|
2014-05-08 21:42:40 +01:00
|
|
|
}
|
|
|
|
|
2014-04-02 16:54:22 -07:00
|
|
|
/// Checks `truncate` at the full length, a shorter length and zero, then
/// checks that truncating and pushing again leaves the buffer pointer as is.
#[test]
fn test_str_truncate() {
    // (length to truncate to, expected contents afterwards)
    let steps = [(5, "12345"), (3, "123"), (0, "")];
    let mut digits = String::from_str("12345");
    for &(len, expected) in steps.iter() {
        digits.truncate(len);
        assert_eq!(digits, expected);
    }

    // Shrinking to 3 bytes and appending one byte must not move the data.
    let mut digits = String::from_str("12345");
    let before = digits.as_ptr();
    digits.truncate(3);
    digits.push_str("6");
    let after = digits.as_ptr();
    assert_eq!(after, before);
}
|
|
|
|
|
|
|
|
/// Truncating a 5-byte string to length 6 is expected to fail.
#[test]
#[should_fail]
fn test_str_truncate_invalid_len() {
    let mut digits = String::from_str("12345");
    // One past the current length.
    digits.truncate(6);
}
|
|
|
|
|
|
|
|
/// Truncating in the middle of a multi-byte character is expected to fail.
#[test]
#[should_fail]
fn test_str_truncate_split_codepoint() {
    // U+00FC (ü) takes two bytes in UTF-8, so length 1 would split it.
    let mut umlaut = String::from_str("\u{FC}");
    umlaut.truncate(1);
}
|
2014-05-11 03:49:09 -07:00
|
|
|
|
|
|
|
/// After `clear` the string compares equal to "" and reports length zero.
#[test]
fn test_str_clear() {
    let mut digits = String::from_str("12345");
    digits.clear();

    let remaining = digits.len();
    assert_eq!(remaining, 0);
    assert_eq!(digits, "");
}
|
2014-05-27 21:34:00 -07:00
|
|
|
|
|
|
|
/// Appends a string slice to an owned string twice with `+` and checks the
/// length and contents of the result.
#[test]
fn test_str_add() {
    // `+` associates to the left: ("12345" + "2") + "2".
    let joined = String::from_str("12345") + "2" + "2";

    assert_eq!(joined.len(), 7);
    assert_eq!(joined, "1234522");
}
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-09-22 08:24:14 -07:00
|
|
|
/// Removes a character from the front and from the middle of a multi-byte
/// string, checking the returned character and the remaining contents.
#[test]
fn remove() {
    let mut s = "ศไทย中华Việt Nam; foobar".to_string();

    // Removing the first character shortens the string to 33 bytes.
    assert_eq!(s.remove(0), 'ศ');
    assert_eq!(s.len(), 33);
    assert_eq!(s, "ไทย中华Việt Nam; foobar");

    // Byte index 17 is where 'ệ' starts in the shortened string.
    assert_eq!(s.remove(17), 'ệ');
    assert_eq!(s, "ไทย中华Vit Nam; foobar");
}
|
|
|
|
|
|
|
|
/// Removing at byte index 1 of "ศ" is expected to fail: that index lies
/// inside the encoding of the string's only character.
#[test]
#[should_fail]
fn remove_bad() {
    let mut single = "ศ".to_string();
    single.remove(1);
}
|
|
|
|
|
|
|
|
/// Inserts multi-byte characters at the front and in the middle of a string
/// and checks the contents after each insertion.
#[test]
fn insert() {
    // (byte index, character to insert, expected contents afterwards)
    let steps = [(0, 'ệ', "ệfoobar"), (6, 'ย', "ệfooยbar")];

    let mut buf = "foobar".to_string();
    for &(at, ch, expected) in steps.iter() {
        buf.insert(at, ch);
        assert_eq!(buf, expected);
    }
}
|
|
|
|
|
|
|
|
/// Inserting at index 1 of an empty string is expected to fail.
#[test]
#[should_fail]
fn insert_bad1() {
    let mut empty = "".to_string();
    empty.insert(1, 't');
}
|
|
|
|
/// Inserting at byte index 1 of "ệ" is expected to fail: that index lies
/// inside the encoding of the string's only character.
#[test]
#[should_fail]
fn insert_bad2() {
    let mut single = "ệ".to_string();
    single.insert(1, 't');
}
|
|
|
|
|
2014-09-26 21:46:22 -07:00
|
|
|
#[test]
|
|
|
|
fn test_slicing() {
|
|
|
|
let s = "foobar".to_string();
|
2015-01-04 17:43:24 +13:00
|
|
|
assert_eq!("foobar", &s[]);
|
|
|
|
assert_eq!("foo", &s[..3]);
|
|
|
|
assert_eq!("bar", &s[3..]);
|
|
|
|
assert_eq!("oob", &s[1..4]);
|
2014-09-26 21:46:22 -07:00
|
|
|
}
|
|
|
|
|
2014-11-16 12:38:03 +11:00
|
|
|
/// Checks `to_string` on integers, booleans and an owned string.
#[test]
fn test_simple_types() {
    // Signed and unsigned integers.
    let positive = 1i.to_string();
    let negative = (-1i).to_string();
    let unsigned = 200u.to_string();
    let byte = 2u8.to_string();
    assert_eq!(positive, "1");
    assert_eq!(negative, "-1");
    assert_eq!(unsigned, "200");
    assert_eq!(byte, "2");

    // Booleans.
    let yes = true.to_string();
    let no = false.to_string();
    assert_eq!(yes, "true");
    assert_eq!(no, "false");

    // A `String` converted again.
    let greeting = ("hi".to_string()).to_string();
    assert_eq!(greeting, "hi");
}
|
|
|
|
|
|
|
|
/// Checks the `{:?}` rendering of empty, flat and nested integer vectors.
#[test]
fn test_vectors() {
    let x: Vec<int> = vec![];
    assert_eq!(format!("{:?}", x), "[]");
    assert_eq!(format!("{:?}", vec![1i]), "[1i]");
    assert_eq!(format!("{:?}", vec![1i, 2, 3]), "[1i, 2i, 3i]");
    // `assert_eq!` like the checks above, so a failure prints both values.
    assert_eq!(format!("{:?}", vec![vec![], vec![1i], vec![1i, 1]]),
               "[[], [1i], [1i, 1i]]");
}
|
|
|
|
|
2014-12-07 21:43:11 +01:00
|
|
|
/// Builds the same string four ways, by collecting and by extending from
/// iterators of characters and of string slices, and checks each result
/// against the expected whole.
#[test]
fn test_from_iterator() {
    let whole = "ศไทย中华Việt Nam".to_string();
    let head = "ศไทย中华";
    let tail = "Việt Nam";

    // Collect from an iterator of characters.
    let from_chars = whole.chars().collect::<String>();
    assert_eq!(whole, from_chars);

    // Extend with an iterator of characters.
    let mut extended_chars = head.to_string();
    extended_chars.extend(tail.chars());
    assert_eq!(whole, extended_chars);

    // Collect from an iterator of string slices.
    let from_slices = vec![head, tail].into_iter().collect::<String>();
    assert_eq!(whole, from_slices);

    // Extend with an iterator of string slices.
    let mut extended_slices = head.to_string();
    extended_slices.extend(vec![tail].into_iter());
    assert_eq!(whole, extended_slices);
}
|
|
|
|
|
2014-07-10 18:21:16 +02:00
|
|
|
#[bench]
|
|
|
|
fn bench_with_capacity(b: &mut Bencher) {
|
|
|
|
b.iter(|| {
|
|
|
|
String::with_capacity(100)
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn bench_push_str(b: &mut Bencher) {
|
|
|
|
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
|
|
|
|
b.iter(|| {
|
|
|
|
let mut r = String::new();
|
|
|
|
r.push_str(s);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2014-12-20 17:17:58 +00:00
|
|
|
// Number of pushes performed per iteration by the push benchmarks below;
// they also use it to set `b.bytes`.
const REPETITIONS: u64 = 10_000;
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn bench_push_str_one_byte(b: &mut Bencher) {
|
|
|
|
b.bytes = REPETITIONS;
|
|
|
|
b.iter(|| {
|
|
|
|
let mut r = String::new();
|
|
|
|
for _ in range(0, REPETITIONS) {
|
|
|
|
r.push_str("a")
|
|
|
|
}
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn bench_push_char_one_byte(b: &mut Bencher) {
|
|
|
|
b.bytes = REPETITIONS;
|
|
|
|
b.iter(|| {
|
|
|
|
let mut r = String::new();
|
|
|
|
for _ in range(0, REPETITIONS) {
|
|
|
|
r.push('a')
|
|
|
|
}
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn bench_push_char_two_bytes(b: &mut Bencher) {
|
|
|
|
b.bytes = REPETITIONS * 2;
|
|
|
|
b.iter(|| {
|
|
|
|
let mut r = String::new();
|
|
|
|
for _ in range(0, REPETITIONS) {
|
|
|
|
r.push('â')
|
|
|
|
}
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2014-07-10 18:21:16 +02:00
|
|
|
#[bench]
|
|
|
|
fn from_utf8_lossy_100_ascii(b: &mut Bencher) {
|
|
|
|
let s = b"Hello there, the quick brown fox jumped over the lazy dog! \
|
|
|
|
Lorem ipsum dolor sit amet, consectetur. ";
|
|
|
|
|
|
|
|
assert_eq!(100, s.len());
|
|
|
|
b.iter(|| {
|
|
|
|
let _ = String::from_utf8_lossy(s);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn from_utf8_lossy_100_multibyte(b: &mut Bencher) {
|
2014-07-21 00:43:08 -05:00
|
|
|
let s = "𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰".as_bytes();
|
2014-07-10 18:21:16 +02:00
|
|
|
assert_eq!(100, s.len());
|
|
|
|
b.iter(|| {
|
|
|
|
let _ = String::from_utf8_lossy(s);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn from_utf8_lossy_invalid(b: &mut Bencher) {
|
|
|
|
let s = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
|
|
|
|
b.iter(|| {
|
|
|
|
let _ = String::from_utf8_lossy(s);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn from_utf8_lossy_100_invalid(b: &mut Bencher) {
|
2015-01-01 23:53:35 -08:00
|
|
|
let s = repeat(0xf5u8).take(100).collect::<Vec<_>>();
|
2014-07-10 18:21:16 +02:00
|
|
|
b.iter(|| {
|
|
|
|
let _ = String::from_utf8_lossy(s.as_slice());
|
|
|
|
});
|
|
|
|
}
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|