2014-04-02 16:54:22 -07:00
|
|
|
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
|
|
|
// file at the top-level directory of this distribution and at
|
|
|
|
// http://rust-lang.org/COPYRIGHT.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
|
|
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
|
|
|
// option. This file may not be copied, modified, or distributed
|
|
|
|
// except according to those terms.
|
2014-07-14 20:46:04 -07:00
|
|
|
//
|
|
|
|
// ignore-lexer-test FIXME #15679
|
2014-04-02 16:54:22 -07:00
|
|
|
|
|
|
|
//! An owned, growable string that enforces that its contents are valid UTF-8.
|
|
|
|
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` contructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
use core::prelude::*;
|
|
|
|
|
|
|
|
use core::default::Default;
|
|
|
|
use core::fmt;
|
|
|
|
use core::mem;
|
|
|
|
use core::ptr;
|
2014-09-26 21:46:22 -07:00
|
|
|
use core::ops;
|
2014-08-06 20:03:55 -07:00
|
|
|
// FIXME: ICE's abound if you import the `Slice` type while importing `Slice` trait
|
2014-08-18 08:29:44 -07:00
|
|
|
use core::raw::Slice as RawSlice;
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` contructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
|
|
|
|
use hash;
|
2014-11-02 17:04:32 -08:00
|
|
|
use slice::CloneSliceAllocPrelude;
|
2014-04-10 20:55:34 +10:00
|
|
|
use str;
|
2014-11-15 15:52:00 +11:00
|
|
|
use str::{CharRange, FromStr, StrAllocating, MaybeOwned, Owned};
|
2014-08-18 08:29:44 -07:00
|
|
|
use str::Slice as MaybeOwnedSlice; // So many `Slice`s...
|
2014-08-23 20:26:53 -04:00
|
|
|
use vec::{DerefVec, Vec, as_vec};
|
2014-04-02 16:54:22 -07:00
|
|
|
|
2014-04-10 20:55:34 +10:00
|
|
|
/// A growable string stored as a UTF-8 encoded buffer.
|
2014-05-31 10:43:52 -07:00
|
|
|
#[deriving(Clone, PartialEq, PartialOrd, Eq, Ord)]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-05-22 16:57:53 -07:00
|
|
|
pub struct String {
|
2014-04-02 16:54:22 -07:00
|
|
|
vec: Vec<u8>,
|
|
|
|
}
|
|
|
|
|
2014-05-22 16:57:53 -07:00
|
|
|
impl String {
|
2014-04-21 00:49:39 -04:00
|
|
|
/// Creates a new string buffer initialized with the empty string.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::new();
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-05-22 16:57:53 -07:00
|
|
|
pub fn new() -> String {
|
|
|
|
String {
|
2014-04-02 16:54:22 -07:00
|
|
|
vec: Vec::new(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a new string buffer with the given capacity.
|
2014-07-27 12:40:39 +02:00
|
|
|
/// The string will be able to hold exactly `capacity` bytes without
|
|
|
|
/// reallocating. If `capacity` is 0, the string will not allocate.
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::with_capacity(10);
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-05-22 16:57:53 -07:00
|
|
|
pub fn with_capacity(capacity: uint) -> String {
|
|
|
|
String {
|
2014-04-02 16:54:22 -07:00
|
|
|
vec: Vec::with_capacity(capacity),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a new string buffer from the given string.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::from_str("hello");
|
|
|
|
/// assert_eq!(s.as_slice(), "hello");
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[experimental = "needs investigation to see if to_string() can match perf"]
|
2014-05-22 16:57:53 -07:00
|
|
|
pub fn from_str(string: &str) -> String {
|
2014-09-17 12:56:31 -07:00
|
|
|
String { vec: string.as_bytes().to_vec() }
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
2014-05-14 16:55:24 -07:00
|
|
|
/// Returns the vector as a string buffer, if possible, taking care not to
|
|
|
|
/// copy it.
|
|
|
|
///
|
|
|
|
/// Returns `Err` with the original vector if the vector contains invalid
|
|
|
|
/// UTF-8.
|
2014-06-30 16:41:30 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```rust
|
|
|
|
/// let hello_vec = vec![104, 101, 108, 108, 111];
|
2014-07-27 12:40:39 +02:00
|
|
|
/// let s = String::from_utf8(hello_vec);
|
|
|
|
/// assert_eq!(s, Ok("hello".to_string()));
|
|
|
|
///
|
|
|
|
/// let invalid_vec = vec![240, 144, 128];
|
|
|
|
/// let s = String::from_utf8(invalid_vec);
|
|
|
|
/// assert_eq!(s, Err(vec![240, 144, 128]));
|
2014-06-30 16:41:30 +02:00
|
|
|
/// ```
|
2014-04-10 20:55:34 +10:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[unstable = "error type may change"]
|
2014-05-22 16:57:53 -07:00
|
|
|
pub fn from_utf8(vec: Vec<u8>) -> Result<String, Vec<u8>> {
|
2014-04-10 20:55:34 +10:00
|
|
|
if str::is_utf8(vec.as_slice()) {
|
2014-05-22 16:57:53 -07:00
|
|
|
Ok(String { vec: vec })
|
2014-04-10 20:55:34 +10:00
|
|
|
} else {
|
2014-05-14 16:55:24 -07:00
|
|
|
Err(vec)
|
2014-04-10 20:55:34 +10:00
|
|
|
}
|
|
|
|
}
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-08-04 22:48:39 +12:00
|
|
|
/// Converts a vector of bytes to a new UTF-8 string.
|
|
|
|
/// Any invalid UTF-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
|
2014-07-10 18:21:16 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```rust
|
|
|
|
/// let input = b"Hello \xF0\x90\x80World";
|
2014-07-04 22:38:13 +02:00
|
|
|
/// let output = String::from_utf8_lossy(input);
|
2014-07-10 18:21:16 +02:00
|
|
|
/// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
|
|
|
|
/// ```
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[unstable = "return type may change"]
|
2014-07-10 18:21:16 +02:00
|
|
|
pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
|
|
|
|
if str::is_utf8(v) {
|
2014-08-06 20:03:55 -07:00
|
|
|
return MaybeOwnedSlice(unsafe { mem::transmute(v) })
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static TAG_CONT_U8: u8 = 128u8;
|
|
|
|
static REPLACEMENT: &'static [u8] = b"\xEF\xBF\xBD"; // U+FFFD in UTF-8
|
|
|
|
let mut i = 0;
|
|
|
|
let total = v.len();
|
|
|
|
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
|
2014-08-06 20:08:16 -07:00
|
|
|
unsafe { *xs.unsafe_get(i) }
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
|
|
|
fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
|
|
|
|
if i >= total {
|
|
|
|
0
|
|
|
|
} else {
|
|
|
|
unsafe_get(xs, i)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut res = String::with_capacity(total);
|
|
|
|
|
|
|
|
if i > 0 {
|
|
|
|
unsafe {
|
2014-09-24 23:41:09 +12:00
|
|
|
res.as_mut_vec().push_all(v[..i])
|
2014-07-10 18:21:16 +02:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
// subseqidx is the index of the first byte of the subsequence we're looking at.
|
|
|
|
// It's used to copy a bunch of contiguous good codepoints at once instead of copying
|
|
|
|
// them one by one.
|
|
|
|
let mut subseqidx = 0;
|
|
|
|
|
|
|
|
while i < total {
|
|
|
|
let i_ = i;
|
|
|
|
let byte = unsafe_get(v, i);
|
|
|
|
i += 1;
|
|
|
|
|
|
|
|
macro_rules! error(() => ({
|
|
|
|
unsafe {
|
|
|
|
if subseqidx != i_ {
|
2014-09-24 23:41:09 +12:00
|
|
|
res.as_mut_vec().push_all(v[subseqidx..i_]);
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
|
|
|
subseqidx = i;
|
2014-09-22 08:28:35 -07:00
|
|
|
res.as_mut_vec().push_all(REPLACEMENT);
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
|
|
|
}))
|
|
|
|
|
|
|
|
if byte < 128u8 {
|
|
|
|
// subseqidx handles this
|
|
|
|
} else {
|
|
|
|
let w = str::utf8_char_width(byte);
|
|
|
|
|
|
|
|
match w {
|
|
|
|
2 => {
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
3 => {
|
|
|
|
match (byte, safe_get(v, i, total)) {
|
2014-09-26 21:13:20 -07:00
|
|
|
(0xE0 , 0xA0 ... 0xBF) => (),
|
|
|
|
(0xE1 ... 0xEC, 0x80 ... 0xBF) => (),
|
|
|
|
(0xED , 0x80 ... 0x9F) => (),
|
|
|
|
(0xEE ... 0xEF, 0x80 ... 0xBF) => (),
|
2014-07-10 18:21:16 +02:00
|
|
|
_ => {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
4 => {
|
|
|
|
match (byte, safe_get(v, i, total)) {
|
2014-09-26 21:13:20 -07:00
|
|
|
(0xF0 , 0x90 ... 0xBF) => (),
|
|
|
|
(0xF1 ... 0xF3, 0x80 ... 0xBF) => (),
|
|
|
|
(0xF4 , 0x80 ... 0x8F) => (),
|
2014-07-10 18:21:16 +02:00
|
|
|
_ => {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
_ => {
|
|
|
|
error!();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if subseqidx < total {
|
|
|
|
unsafe {
|
2014-09-24 23:41:09 +12:00
|
|
|
res.as_mut_vec().push_all(v[subseqidx..total])
|
2014-07-10 18:21:16 +02:00
|
|
|
};
|
|
|
|
}
|
|
|
|
Owned(res.into_string())
|
|
|
|
}
|
|
|
|
|
2014-07-04 22:38:13 +02:00
|
|
|
/// Decode a UTF-16 encoded vector `v` into a `String`, returning `None`
|
2014-07-10 17:43:03 +02:00
|
|
|
/// if `v` contains any invalid data.
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```rust
|
2014-07-04 22:38:13 +02:00
|
|
|
/// // 𝄞music
|
2014-11-17 21:39:01 +13:00
|
|
|
/// let mut v = &mut [0xD834, 0xDD1E, 0x006d, 0x0075,
|
|
|
|
/// 0x0073, 0x0069, 0x0063];
|
2014-07-04 22:38:13 +02:00
|
|
|
/// assert_eq!(String::from_utf16(v), Some("𝄞music".to_string()));
|
2014-07-10 17:43:03 +02:00
|
|
|
///
|
2014-07-04 22:38:13 +02:00
|
|
|
/// // 𝄞mu<invalid>ic
|
2014-07-10 17:43:03 +02:00
|
|
|
/// v[4] = 0xD800;
|
|
|
|
/// assert_eq!(String::from_utf16(v), None);
|
|
|
|
/// ```
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[unstable = "error value in return may change"]
|
2014-07-10 17:43:03 +02:00
|
|
|
pub fn from_utf16(v: &[u16]) -> Option<String> {
|
2014-10-03 21:20:04 +01:00
|
|
|
let mut s = String::with_capacity(v.len());
|
2014-07-10 17:43:03 +02:00
|
|
|
for c in str::utf16_items(v) {
|
|
|
|
match c {
|
2014-09-22 08:28:35 -07:00
|
|
|
str::ScalarValue(c) => s.push(c),
|
2014-07-10 17:43:03 +02:00
|
|
|
str::LoneSurrogate(_) => return None
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Some(s)
|
|
|
|
}
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-07-10 17:53:51 +02:00
|
|
|
/// Decode a UTF-16 encoded vector `v` into a string, replacing
|
|
|
|
/// invalid data with the replacement character (U+FFFD).
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
/// ```rust
|
2014-07-04 22:38:13 +02:00
|
|
|
/// // 𝄞mus<invalid>ic<invalid>
|
2014-11-17 21:39:01 +13:00
|
|
|
/// let v = &[0xD834, 0xDD1E, 0x006d, 0x0075,
|
|
|
|
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
|
|
|
|
/// 0xD834];
|
2014-07-10 17:53:51 +02:00
|
|
|
///
|
|
|
|
/// assert_eq!(String::from_utf16_lossy(v),
|
2014-07-04 22:38:13 +02:00
|
|
|
/// "𝄞mus\uFFFDic\uFFFD".to_string());
|
2014-07-10 17:53:51 +02:00
|
|
|
/// ```
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-07-10 17:53:51 +02:00
|
|
|
pub fn from_utf16_lossy(v: &[u16]) -> String {
|
|
|
|
str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
|
|
|
|
}
|
2014-07-04 22:18:11 +02:00
|
|
|
|
2014-08-04 22:48:39 +12:00
|
|
|
/// Convert a vector of `char`s to a `String`.
|
2014-07-04 21:55:58 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```rust
|
2014-11-17 21:39:01 +13:00
|
|
|
/// let chars = &['h', 'e', 'l', 'l', 'o'];
|
2014-07-27 12:40:39 +02:00
|
|
|
/// let s = String::from_chars(chars);
|
|
|
|
/// assert_eq!(s.as_slice(), "hello");
|
2014-07-04 21:55:58 +02:00
|
|
|
/// ```
|
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[unstable = "may be removed in favor of .collect()"]
|
2014-07-04 21:55:58 +02:00
|
|
|
pub fn from_chars(chs: &[char]) -> String {
|
|
|
|
chs.iter().map(|c| *c).collect()
|
|
|
|
}
|
2014-04-10 20:55:34 +10:00
|
|
|
|
|
|
|
/// Return the underlying byte buffer, encoded as UTF-8.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::from_str("hello");
|
|
|
|
/// let bytes = s.into_bytes();
|
|
|
|
/// assert_eq!(bytes, vec![104, 101, 108, 108, 111]);
|
|
|
|
/// ```
|
2014-04-10 20:55:34 +10:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-04-10 20:55:34 +10:00
|
|
|
pub fn into_bytes(self) -> Vec<u8> {
|
|
|
|
self.vec
|
|
|
|
}
|
|
|
|
|
2014-04-02 16:54:22 -07:00
|
|
|
/// Creates a string buffer by repeating a character `length` times.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::from_char(5, 'a');
|
|
|
|
/// assert_eq!(s.as_slice(), "aaaaa");
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[unstable = "may be replaced with iterators, questionable usability, and \
|
|
|
|
the name may change"]
|
2014-05-22 16:57:53 -07:00
|
|
|
pub fn from_char(length: uint, ch: char) -> String {
|
2014-04-02 16:54:22 -07:00
|
|
|
if length == 0 {
|
2014-05-22 16:57:53 -07:00
|
|
|
return String::new()
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut buf = String::new();
|
2014-09-22 08:28:35 -07:00
|
|
|
buf.push(ch);
|
2014-11-06 12:24:47 -05:00
|
|
|
let size = buf.len() * (length - 1);
|
|
|
|
buf.reserve_exact(size);
|
2014-04-02 16:54:22 -07:00
|
|
|
for _ in range(1, length) {
|
2014-09-22 08:28:35 -07:00
|
|
|
buf.push(ch)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
buf
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Pushes the given string onto this string buffer.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
|
|
|
/// s.push_str("bar");
|
|
|
|
/// assert_eq!(s.as_slice(), "foobar");
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[unstable = "extra variants of `push`, could possibly be based on iterators"]
|
2014-04-02 16:54:22 -07:00
|
|
|
pub fn push_str(&mut self, string: &str) {
|
|
|
|
self.vec.push_all(string.as_bytes())
|
|
|
|
}
|
|
|
|
|
2014-08-04 22:48:39 +12:00
|
|
|
/// Pushes `ch` onto the given string `count` times.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
|
|
|
/// s.grow(5, 'Z');
|
|
|
|
/// assert_eq!(s.as_slice(), "fooZZZZZ");
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[unstable = "duplicate of iterator-based functionality"]
|
2014-04-02 16:54:22 -07:00
|
|
|
pub fn grow(&mut self, count: uint, ch: char) {
|
|
|
|
for _ in range(0, count) {
|
2014-09-22 08:28:35 -07:00
|
|
|
self.push(ch)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
/// Returns the number of bytes that this string buffer can hold without reallocating.
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::with_capacity(10);
|
2014-09-22 08:28:35 -07:00
|
|
|
/// assert!(s.capacity() >= 10);
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
/// ```
|
|
|
|
#[inline]
|
2014-11-06 12:24:47 -05:00
|
|
|
#[unstable = "matches collection reform specification, waiting for dust to settle"]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
pub fn capacity(&self) -> uint {
|
|
|
|
self.vec.capacity()
|
|
|
|
}
|
|
|
|
|
2014-11-06 12:24:47 -05:00
|
|
|
/// Deprecated: Renamed to `reserve`.
|
|
|
|
#[deprecated = "Renamed to `reserve`"]
|
2014-04-02 16:54:22 -07:00
|
|
|
pub fn reserve_additional(&mut self, extra: uint) {
|
2014-11-06 12:24:47 -05:00
|
|
|
self.vec.reserve(extra)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
2014-11-06 12:24:47 -05:00
|
|
|
/// Reserves capacity for at least `additional` more bytes to be inserted in the given
|
|
|
|
/// `String`. The collection may reserve more space to avoid frequent reallocations.
|
|
|
|
///
|
|
|
|
/// # Panics
|
|
|
|
///
|
|
|
|
/// Panics if the new capacity overflows `uint`.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::new();
|
|
|
|
/// s.reserve(10);
|
2014-09-22 08:28:35 -07:00
|
|
|
/// assert!(s.capacity() >= 10);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
2014-11-06 12:24:47 -05:00
|
|
|
#[unstable = "matches collection reform specification, waiting for dust to settle"]
|
|
|
|
pub fn reserve(&mut self, additional: uint) {
|
|
|
|
self.vec.reserve(additional)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
2014-11-06 12:24:47 -05:00
|
|
|
/// Reserves the minimum capacity for exactly `additional` more bytes to be inserted in the
|
|
|
|
/// given `String`. Does nothing if the capacity is already sufficient.
|
|
|
|
///
|
|
|
|
/// Note that the allocator may give the collection more space than it requests. Therefore
|
|
|
|
/// capacity can not be relied upon to be precisely minimal. Prefer `reserve` if future
|
|
|
|
/// insertions are expected.
|
|
|
|
///
|
|
|
|
/// # Panics
|
|
|
|
///
|
|
|
|
/// Panics if the new capacity overflows `uint`.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::new();
|
2014-11-06 12:24:47 -05:00
|
|
|
/// s.reserve(10);
|
|
|
|
/// assert!(s.capacity() >= 10);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
2014-11-06 12:24:47 -05:00
|
|
|
#[unstable = "matches collection reform specification, waiting for dust to settle"]
|
|
|
|
pub fn reserve_exact(&mut self, additional: uint) {
|
|
|
|
self.vec.reserve_exact(additional)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Shrinks the capacity of this string buffer to match its length.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
|
|
|
/// s.reserve(100);
|
2014-09-22 08:28:35 -07:00
|
|
|
/// assert!(s.capacity() >= 100);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// s.shrink_to_fit();
|
2014-09-22 08:28:35 -07:00
|
|
|
/// assert_eq!(s.capacity(), 3);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
2014-11-06 12:24:47 -05:00
|
|
|
#[unstable = "matches collection reform specification, waiting for dust to settle"]
|
2014-04-02 16:54:22 -07:00
|
|
|
pub fn shrink_to_fit(&mut self) {
|
|
|
|
self.vec.shrink_to_fit()
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Adds the given character to the end of the string.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("abc");
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
/// s.push('1');
|
|
|
|
/// s.push('2');
|
|
|
|
/// s.push('3');
|
2014-07-27 12:40:39 +02:00
|
|
|
/// assert_eq!(s.as_slice(), "abc123");
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
2014-09-27 02:37:28 +02:00
|
|
|
#[stable = "function just renamed from push_char"]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
pub fn push(&mut self, ch: char) {
|
2014-04-02 16:54:22 -07:00
|
|
|
let cur_len = self.len();
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` contructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
// This may use up to 4 bytes.
|
2014-11-06 12:24:47 -05:00
|
|
|
self.vec.reserve(4);
|
2014-04-02 16:54:22 -07:00
|
|
|
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` contructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
unsafe {
|
2014-04-02 16:54:22 -07:00
|
|
|
// Attempt to not use an intermediate buffer by just pushing bytes
|
|
|
|
// directly onto this string.
|
2014-08-06 20:03:55 -07:00
|
|
|
let slice = RawSlice {
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` contructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
data: self.vec.as_ptr().offset(cur_len as int),
|
|
|
|
len: 4,
|
|
|
|
};
|
2014-08-14 01:02:31 +02:00
|
|
|
let used = ch.encode_utf8(mem::transmute(slice)).unwrap_or(0);
|
2014-04-02 16:54:22 -07:00
|
|
|
self.vec.set_len(cur_len + used);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Works with the underlying buffer as a byte slice.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let s = String::from_str("hello");
|
2014-08-06 11:59:40 +02:00
|
|
|
/// let b: &[_] = &[104, 101, 108, 108, 111];
|
|
|
|
/// assert_eq!(s.as_bytes(), b);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-04-02 16:54:22 -07:00
|
|
|
pub fn as_bytes<'a>(&'a self) -> &'a [u8] {
|
|
|
|
self.vec.as_slice()
|
|
|
|
}
|
|
|
|
|
2014-08-04 22:48:39 +12:00
|
|
|
/// Shortens a string to the specified length.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-11-12 03:36:09 +09:00
|
|
|
/// # Panics
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
2014-11-12 03:36:09 +09:00
|
|
|
/// Panics if `new_len` > current length,
|
2014-10-05 12:15:59 +01:00
|
|
|
/// or if `new_len` is not a character boundary.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("hello");
|
|
|
|
/// s.truncate(2);
|
|
|
|
/// assert_eq!(s.as_slice(), "he");
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
2014-10-09 15:17:22 -04:00
|
|
|
#[unstable = "the panic conventions for strings are under development"]
|
2014-10-05 12:15:59 +01:00
|
|
|
pub fn truncate(&mut self, new_len: uint) {
|
|
|
|
assert!(self.as_slice().is_char_boundary(new_len));
|
|
|
|
self.vec.truncate(new_len)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
2014-07-27 12:40:39 +02:00
|
|
|
/// Removes the last character from the string buffer and returns it.
|
|
|
|
/// Returns `None` if this string buffer is empty.
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
/// assert_eq!(s.pop(), Some('o'));
|
|
|
|
/// assert_eq!(s.pop(), Some('o'));
|
|
|
|
/// assert_eq!(s.pop(), Some('f'));
|
|
|
|
/// assert_eq!(s.pop(), None);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-05-08 21:42:40 +01:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[unstable = "this function was just renamed from pop_char"]
|
|
|
|
pub fn pop(&mut self) -> Option<char> {
|
2014-05-08 21:42:40 +01:00
|
|
|
let len = self.len();
|
|
|
|
if len == 0 {
|
|
|
|
return None
|
|
|
|
}
|
|
|
|
|
|
|
|
let CharRange {ch, next} = self.as_slice().char_range_at_reverse(len);
|
|
|
|
unsafe {
|
|
|
|
self.vec.set_len(next);
|
|
|
|
}
|
|
|
|
Some(ch)
|
|
|
|
}
|
|
|
|
|
2014-09-22 08:24:14 -07:00
|
|
|
/// Removes the character from the string buffer at byte position `idx` and
|
|
|
|
/// returns it. Returns `None` if `idx` is out of bounds.
|
2014-05-08 21:42:40 +01:00
|
|
|
///
|
|
|
|
/// # Warning
|
|
|
|
///
|
2014-11-16 08:28:13 -08:00
|
|
|
/// This is an O(n) operation as it requires copying every element in the
|
2014-09-22 08:24:14 -07:00
|
|
|
/// buffer.
|
|
|
|
///
|
2014-10-09 15:17:22 -04:00
|
|
|
/// # Panics
|
2014-09-22 08:24:14 -07:00
|
|
|
///
|
|
|
|
/// If `idx` does not lie on a character boundary, then this function will
|
2014-10-09 15:17:22 -04:00
|
|
|
/// panic.
|
2014-07-27 12:40:39 +02:00
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("foo");
|
2014-09-22 08:24:14 -07:00
|
|
|
/// assert_eq!(s.remove(0), Some('f'));
|
|
|
|
/// assert_eq!(s.remove(1), Some('o'));
|
|
|
|
/// assert_eq!(s.remove(0), Some('o'));
|
|
|
|
/// assert_eq!(s.remove(0), None);
|
2014-07-27 12:40:39 +02:00
|
|
|
/// ```
|
2014-10-09 15:17:22 -04:00
|
|
|
#[unstable = "the panic semantics of this function and return type \
|
2014-09-22 08:24:14 -07:00
|
|
|
may change"]
|
|
|
|
pub fn remove(&mut self, idx: uint) -> Option<char> {
|
2014-04-02 16:54:22 -07:00
|
|
|
let len = self.len();
|
2014-09-22 08:24:14 -07:00
|
|
|
if idx >= len { return None }
|
2014-04-02 16:54:22 -07:00
|
|
|
|
2014-09-22 08:24:14 -07:00
|
|
|
let CharRange { ch, next } = self.as_slice().char_range_at(idx);
|
2014-05-08 21:42:40 +01:00
|
|
|
unsafe {
|
2014-09-22 08:24:14 -07:00
|
|
|
ptr::copy_memory(self.vec.as_mut_ptr().offset(idx as int),
|
|
|
|
self.vec.as_ptr().offset(next as int),
|
|
|
|
len - next);
|
|
|
|
self.vec.set_len(len - (next - idx));
|
2014-05-08 21:42:40 +01:00
|
|
|
}
|
|
|
|
Some(ch)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
2014-04-12 22:44:31 +10:00
|
|
|
|
2014-09-22 08:24:14 -07:00
|
|
|
/// Insert a character into the string buffer at byte position `idx`.
|
|
|
|
///
|
|
|
|
/// # Warning
|
|
|
|
///
|
2014-11-16 08:28:13 -08:00
|
|
|
/// This is an O(n) operation as it requires copying every element in the
|
2014-09-22 08:24:14 -07:00
|
|
|
/// buffer.
|
|
|
|
///
|
2014-10-09 15:17:22 -04:00
|
|
|
/// # Panics
|
2014-09-22 08:24:14 -07:00
|
|
|
///
|
|
|
|
/// If `idx` does not lie on a character boundary or is out of bounds, then
|
2014-10-09 15:17:22 -04:00
|
|
|
/// this function will panic.
|
|
|
|
#[unstable = "the panic semantics of this function are uncertain"]
|
2014-09-22 08:24:14 -07:00
|
|
|
pub fn insert(&mut self, idx: uint, ch: char) {
|
|
|
|
let len = self.len();
|
|
|
|
assert!(idx <= len);
|
|
|
|
assert!(self.as_slice().is_char_boundary(idx));
|
2014-11-06 12:24:47 -05:00
|
|
|
self.vec.reserve(4);
|
2014-09-22 08:24:14 -07:00
|
|
|
let mut bits = [0, ..4];
|
2014-11-17 21:39:01 +13:00
|
|
|
let amt = ch.encode_utf8(&mut bits).unwrap();
|
2014-09-22 08:24:14 -07:00
|
|
|
|
|
|
|
unsafe {
|
|
|
|
ptr::copy_memory(self.vec.as_mut_ptr().offset((idx + amt) as int),
|
|
|
|
self.vec.as_ptr().offset(idx as int),
|
|
|
|
len - idx);
|
|
|
|
ptr::copy_memory(self.vec.as_mut_ptr().offset(idx as int),
|
|
|
|
bits.as_ptr(),
|
|
|
|
amt);
|
|
|
|
self.vec.set_len(len + amt);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-12 22:44:31 +10:00
|
|
|
/// Views the string buffer as a mutable sequence of bytes.
|
|
|
|
///
|
2014-07-27 12:40:39 +02:00
|
|
|
/// This is unsafe because it does not check
|
|
|
|
/// to ensure that the resulting string will be valid UTF-8.
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = String::from_str("hello");
|
|
|
|
/// unsafe {
|
|
|
|
/// let vec = s.as_mut_vec();
|
|
|
|
/// assert!(vec == &mut vec![104, 101, 108, 108, 111]);
|
|
|
|
/// vec.reverse();
|
|
|
|
/// }
|
|
|
|
/// assert_eq!(s.as_slice(), "olleh");
|
|
|
|
/// ```
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[unstable = "the name of this method may be changed"]
|
2014-04-12 22:44:31 +10:00
|
|
|
pub unsafe fn as_mut_vec<'a>(&'a mut self) -> &'a mut Vec<u8> {
|
|
|
|
&mut self.vec
|
|
|
|
}
|
2014-04-02 16:54:22 -07:00
|
|
|
|
2014-10-30 13:43:24 -07:00
|
|
|
/// Return the number of bytes in this string.
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let a = "foo".to_string();
|
|
|
|
/// assert_eq!(a.len(), 3);
|
|
|
|
/// ```
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-10-30 13:43:24 -07:00
|
|
|
pub fn len(&self) -> uint { self.vec.len() }
|
2014-04-02 16:54:22 -07:00
|
|
|
|
2014-10-30 13:43:24 -07:00
|
|
|
/// Returns true if the string contains no bytes
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut v = String::new();
|
|
|
|
/// assert!(v.is_empty());
|
|
|
|
/// v.push('a');
|
|
|
|
/// assert!(!v.is_empty());
|
|
|
|
/// ```
|
|
|
|
pub fn is_empty(&self) -> bool { self.len() == 0 }
|
|
|
|
|
|
|
|
/// Truncates the string, returning it to 0 length.
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// let mut s = "foo".to_string();
|
|
|
|
/// s.clear();
|
|
|
|
/// assert!(s.is_empty());
|
|
|
|
/// ```
|
2014-05-11 03:49:09 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-10-30 13:43:24 -07:00
|
|
|
pub fn clear(&mut self) {
|
2014-05-11 03:49:09 -07:00
|
|
|
self.vec.clear()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[experimental = "waiting on FromIterator stabilization"]
|
2014-05-22 16:57:53 -07:00
|
|
|
impl FromIterator<char> for String {
|
|
|
|
fn from_iter<I:Iterator<char>>(iterator: I) -> String {
|
|
|
|
let mut buf = String::new();
|
2014-04-02 16:54:22 -07:00
|
|
|
buf.extend(iterator);
|
|
|
|
buf
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-08 01:39:39 +01:00
|
|
|
#[experimental = "waiting on Extend stabilization"]
|
|
|
|
impl Extend<char> for String {
|
2014-04-02 16:54:22 -07:00
|
|
|
fn extend<I:Iterator<char>>(&mut self, mut iterator: I) {
|
|
|
|
for ch in iterator {
|
2014-09-22 08:28:35 -07:00
|
|
|
self.push(ch)
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[experimental = "waiting on Str stabilization"]
|
2014-05-22 16:57:53 -07:00
|
|
|
impl Str for String {
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-04-02 16:54:22 -07:00
|
|
|
fn as_slice<'a>(&'a self) -> &'a str {
|
|
|
|
unsafe {
|
core: Remove the cast module
This commit revisits the `cast` module in libcore and libstd, and scrutinizes
all functions inside of it. The result was to remove the `cast` module entirely,
folding all functionality into the `mem` module. Specifically, this is the fate
of each function in the `cast` module.
* transmute - This function was moved to `mem`, but it is now marked as
#[unstable]. This is due to planned changes to the `transmute`
function and how it can be invoked (see the #[unstable] comment).
For more information, see RFC 5 and #12898
* transmute_copy - This function was moved to `mem`, with clarification that is
is not an error to invoke it with T/U that are different
sizes, but rather that it is strongly discouraged. This
function is now #[stable]
* forget - This function was moved to `mem` and marked #[stable]
* bump_box_refcount - This function was removed due to the deprecation of
managed boxes as well as its questionable utility.
* transmute_mut - This function was previously deprecated, and removed as part
of this commit.
* transmute_mut_unsafe - This function doesn't serve much of a purpose when it
can be achieved with an `as` in safe code, so it was
removed.
* transmute_lifetime - This function was removed because it is likely a strong
indication that code is incorrect in the first place.
* transmute_mut_lifetime - This function was removed for the same reasons as
`transmute_lifetime`
* copy_lifetime - This function was moved to `mem`, but it is marked
`#[unstable]` now due to the likelihood of being removed in
the future if it is found to not be very useful.
* copy_mut_lifetime - This function was also moved to `mem`, but had the same
treatment as `copy_lifetime`.
* copy_lifetime_vec - This function was removed because it is not used today,
and its existence is not necessary with DST
(copy_lifetime will suffice).
In summary, the cast module was stripped down to these functions, and then the
functions were moved to the `mem` module.
transmute - #[unstable]
transmute_copy - #[stable]
forget - #[stable]
copy_lifetime - #[unstable]
copy_mut_lifetime - #[unstable]
[breaking-change]
2014-05-09 10:34:51 -07:00
|
|
|
mem::transmute(self.vec.as_slice())
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
}
|
2014-04-30 23:06:36 -07:00
|
|
|
}
|
2014-04-02 16:54:22 -07:00
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[experimental = "waiting on StrAllocating stabilization"]
|
2014-05-22 16:57:53 -07:00
|
|
|
impl StrAllocating for String {
|
2014-04-12 22:44:31 +10:00
|
|
|
#[inline]
|
2014-05-25 03:17:19 -07:00
|
|
|
fn into_string(self) -> String {
|
2014-05-19 17:23:26 -07:00
|
|
|
self
|
|
|
|
}
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[stable]
|
2014-05-22 16:57:53 -07:00
|
|
|
impl Default for String {
|
|
|
|
fn default() -> String {
|
|
|
|
String::new()
|
2014-05-19 23:19:56 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[experimental = "waiting on Show stabilization"]
|
2014-05-22 16:57:53 -07:00
|
|
|
impl fmt::Show for String {
|
2014-04-02 16:54:22 -07:00
|
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
|
|
self.as_slice().fmt(f)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[experimental = "waiting on Hash stabilization"]
|
std: Recreate a `collections` module
As with the previous commit with `librand`, this commit shuffles around some
`collections` code. The new state of the world is similar to that of librand:
* The libcollections crate now only depends on libcore and liballoc.
* The standard library has a new module, `std::collections`. All functionality
of libcollections is reexported through this module.
I would like to stress that this change is purely cosmetic. There are very few
alterations to these primitives.
There are a number of notable points about the new organization:
* std::{str, slice, string, vec} all moved to libcollections. There is no reason
that these primitives shouldn't be necessarily usable in a freestanding
context that has allocation. These are all reexported in their usual places in
the standard library.
* The `hashmap`, and transitively the `lru_cache`, modules no longer reside in
`libcollections`, but rather in libstd. The reason for this is because the
`HashMap::new` contructor requires access to the OSRng for initially seeding
the hash map. Beyond this requirement, there is no reason that the hashmap
could not move to libcollections.
I do, however, have a plan to move the hash map to the collections module. The
`HashMap::new` function could be altered to require that the `H` hasher
parameter ascribe to the `Default` trait, allowing the entire `hashmap` module
to live in libcollections. The key idea would be that the default hasher would
be different in libstd. Something along the lines of:
// src/libstd/collections/mod.rs
pub type HashMap<K, V, H = RandomizedSipHasher> =
core_collections::HashMap<K, V, H>;
This is not possible today because you cannot invoke static methods through
type aliases. If we modified the compiler, however, to allow invocation of
static methods through type aliases, then this type definition would
essentially be switching the default hasher from `SipHasher` in libcollections
to a libstd-defined `RandomizedSipHasher` type. This type's `Default`
implementation would randomly seed the `SipHasher` instance, and otherwise
perform the same as `SipHasher`.
This future state doesn't seem incredibly far off, but until that time comes,
the hashmap module will live in libstd to not compromise on functionality.
* In preparation for the hashmap moving to libcollections, the `hash` module has
moved from libstd to libcollections. A previously snapshotted commit enables a
distinct `Writer` trait to live in the `hash` module which `Hash`
implementations are now parameterized over.
Due to using a custom trait, the `SipHasher` implementation has lost its
specialized methods for writing integers. These can be re-added
backwards-compatibly in the future via default methods if necessary, but the
FNV hashing should satisfy much of the need for speedier hashing.
A list of breaking changes:
* HashMap::{get, get_mut} no longer fails with the key formatted into the error
message with `{:?}`, instead, a generic message is printed. With backtraces,
it should still be not-too-hard to track down errors.
* The HashMap, HashSet, and LruCache types are now available through
std::collections instead of the collections crate.
* Manual implementations of hash should be parameterized over `hash::Writer`
instead of just `Writer`.
[breaking-change]
2014-05-29 18:50:12 -07:00
|
|
|
impl<H: hash::Writer> hash::Hash<H> for String {
|
2014-04-02 16:54:22 -07:00
|
|
|
#[inline]
|
|
|
|
fn hash(&self, hasher: &mut H) {
|
|
|
|
self.as_slice().hash(hasher)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[experimental = "waiting on Equiv stabilization"]
|
2014-05-22 16:57:53 -07:00
|
|
|
impl<'a, S: Str> Equiv<S> for String {
|
2014-05-16 10:45:16 -07:00
|
|
|
#[inline]
|
|
|
|
fn equiv(&self, other: &S) -> bool {
|
|
|
|
self.as_slice() == other.as_slice()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[experimental = "waiting on Add stabilization"]
|
2014-05-27 21:34:00 -07:00
|
|
|
impl<S: Str> Add<S, String> for String {
|
|
|
|
fn add(&self, other: &S) -> String {
|
2014-06-21 03:39:03 -07:00
|
|
|
let mut s = String::from_str(self.as_slice());
|
2014-05-27 21:34:00 -07:00
|
|
|
s.push_str(other.as_slice());
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-05 11:59:10 +13:00
|
|
|
impl ops::Slice<uint, str> for String {
|
|
|
|
#[inline]
|
|
|
|
fn as_slice_<'a>(&'a self) -> &'a str {
|
|
|
|
self.as_slice()
|
|
|
|
}
|
|
|
|
|
|
|
|
#[inline]
|
|
|
|
fn slice_from_or_fail<'a>(&'a self, from: &uint) -> &'a str {
|
|
|
|
self[][*from..]
|
|
|
|
}
|
|
|
|
|
|
|
|
#[inline]
|
|
|
|
fn slice_to_or_fail<'a>(&'a self, to: &uint) -> &'a str {
|
|
|
|
self[][..*to]
|
|
|
|
}
|
|
|
|
|
|
|
|
#[inline]
|
|
|
|
fn slice_or_fail<'a>(&'a self, from: &uint, to: &uint) -> &'a str {
|
|
|
|
self[][*from..*to]
|
|
|
|
}
|
|
|
|
}
|
2014-09-26 21:46:22 -07:00
|
|
|
|
2014-10-29 15:26:29 -07:00
|
|
|
#[experimental = "waiting on Deref stabilization"]
|
|
|
|
impl ops::Deref<str> for String {
|
|
|
|
fn deref<'a>(&'a self) -> &'a str { self.as_slice() }
|
|
|
|
}
|
|
|
|
|
2014-08-23 20:26:53 -04:00
|
|
|
/// Wrapper type providing a `&String` reference via `Deref`.
|
|
|
|
#[experimental]
|
|
|
|
pub struct DerefString<'a> {
|
|
|
|
x: DerefVec<'a, u8>
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<'a> Deref<String> for DerefString<'a> {
|
|
|
|
fn deref<'b>(&'b self) -> &'b String {
|
|
|
|
unsafe { mem::transmute(&*self.x) }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Convert a string slice to a wrapper type providing a `&String` reference.
|
|
|
|
#[experimental]
|
|
|
|
pub fn as_string<'a>(x: &'a str) -> DerefString<'a> {
|
|
|
|
DerefString { x: as_vec(x.as_bytes()) }
|
|
|
|
}
|
|
|
|
|
2014-11-15 15:52:00 +11:00
|
|
|
impl FromStr for String {
|
|
|
|
#[inline]
|
|
|
|
fn from_str(s: &str) -> Option<String> {
|
|
|
|
Some(String::from_str(s))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-15 23:57:54 +11:00
|
|
|
/// Trait for converting a type to a string, consuming it in the process.
|
|
|
|
pub trait IntoString {
|
|
|
|
/// Consume and convert to a string.
|
|
|
|
fn into_string(self) -> String;
|
|
|
|
}
|
|
|
|
|
2014-11-16 12:38:03 +11:00
|
|
|
/// A generic trait for converting a value to a string
|
|
|
|
pub trait ToString {
|
|
|
|
/// Converts the value of `self` to an owned string
|
|
|
|
fn to_string(&self) -> String;
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<T: fmt::Show> ToString for T {
|
|
|
|
fn to_string(&self) -> String {
|
|
|
|
let mut buf = Vec::<u8>::new();
|
|
|
|
let _ = format_args!(|args| fmt::write(&mut buf, args), "{}", self);
|
|
|
|
String::from_utf8(buf).unwrap()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-27 12:40:39 +02:00
|
|
|
/// Unsafe operations
|
collections: Stabilize String
# Rationale
When dealing with strings, many functions deal with either a `char` (unicode
codepoint) or a byte (utf-8 encoding related). There is often an inconsistent
way in which methods are referred to as to whether they contain "byte", "char",
or nothing in their name. There are also issues open to rename *all* methods to
reflect that they operate on utf8 encodings or bytes (e.g. utf8_len() or
byte_len()).
The current state of String seems to largely be what is desired, so this PR
proposes the following rationale for methods dealing with bytes or characters:
> When constructing a string, the input encoding *must* be mentioned (e.g.
> from_utf8). This makes it clear what exactly the input type is expected to be
> in terms of encoding.
>
> When a method operates on anything related to an *index* within the string
> such as length, capacity, position, etc, the method *implicitly* operates on
> bytes. It is an understood fact that String is a utf-8 encoded string, and
> burdening all methods with "bytes" would be redundant.
>
> When a method operates on the *contents* of a string, such as push() or pop(),
> then "char" is the default type. A String can loosely be thought of as being a
> collection of unicode codepoints, but not all collection-related operations
> make sense because some can be woefully inefficient.
# Method stabilization
The following methods have been marked #[stable]
* The String type itself
* String::new
* String::with_capacity
* String::from_utf16_lossy
* String::into_bytes
* String::as_bytes
* String::len
* String::clear
* String::as_slice
The following methods have been marked #[unstable]
* String::from_utf8 - The error type in the returned `Result` may change to
provide a nicer message when it's `unwrap()`'d
* String::from_utf8_lossy - The returned `MaybeOwned` type still needs
stabilization
* String::from_utf16 - The return type may change to become a `Result` which
includes more contextual information like where the error
occurred.
* String::from_chars - This is equivalent to iter().collect(), but currently not
as ergonomic.
* String::from_char - This method is the equivalent of Vec::from_elem, and has
been marked #[unstable] becuase it can be seen as a
duplicate of iterator-based functionality as well as
possibly being renamed.
* String::push_str - This *can* be emulated with .extend(foo.chars()), but is
less efficient because of decoding/encoding. Due to the
desire to minimize API surface this may be able to be
removed in the future for something possibly generic with
no loss in performance.
* String::grow - This is a duplicate of iterator-based functionality, which may
become more ergonomic in the future.
* String::capacity - This function was just added.
* String::push - This function was just added.
* String::pop - This function was just added.
* String::truncate - The failure conventions around String methods and byte
indices isn't totally clear at this time, so the failure
semantics and return value of this method are subject to
change.
* String::as_mut_vec - the naming of this method may change.
* string::raw::* - these functions are all waiting on [an RFC][2]
[2]: https://github.com/rust-lang/rfcs/pull/240
The following method have been marked #[experimental]
* String::from_str - This function only exists as it's more efficient than
to_string(), but having a less ergonomic function for
performance reasons isn't the greatest reason to keep it
around. Like Vec::push_all, this has been marked
experimental for now.
The following methods have been #[deprecated]
* String::append - This method has been deprecated to remain consistent with the
deprecation of Vec::append. While convenient, it is one of
the only functional-style apis on String, and requires more
though as to whether it belongs as a first-class method or
now (and how it relates to other collections).
* String::from_byte - This is fairly rare functionality and can be emulated with
str::from_utf8 plus an assert plus a call to to_string().
Additionally, String::from_char could possibly be used.
* String::byte_capacity - Renamed to String::capacity due to the rationale
above.
* String::push_char - Renamed to String::push due to the rationale above.
* String::pop_char - Renamed to String::pop due to the rationale above.
* String::push_bytes - There are a number of `unsafe` functions on the `String`
type which allow bypassing utf-8 checks. These have all
been deprecated in favor of calling `.as_mut_vec()` and
then operating directly on the vector returned. These
methods were deprecated because naming them with relation
to other methods was difficult to rationalize and it's
arguably more composable to call .as_mut_vec().
* String::as_mut_bytes - See push_bytes
* String::push_byte - See push_bytes
* String::pop_byte - See push_bytes
* String::shift_byte - See push_bytes
# Reservation methods
This commit does not yet touch the methods for reserving bytes. The methods on
Vec have also not yet been modified. These methods are discussed in the upcoming
[Collections reform RFC][1]
[1]: https://github.com/aturon/rfcs/blob/collections-conventions/active/0000-collections-conventions.md#implicit-growth
2014-09-22 07:12:10 -07:00
|
|
|
#[unstable = "waiting on raw module conventions"]
|
2014-07-19 12:23:47 +02:00
|
|
|
pub mod raw {
|
2014-07-20 12:08:40 +02:00
|
|
|
use core::mem;
|
2014-07-22 17:55:12 +02:00
|
|
|
use core::ptr::RawPtr;
|
2014-07-20 12:08:40 +02:00
|
|
|
use core::raw::Slice;
|
|
|
|
|
2014-07-19 12:23:47 +02:00
|
|
|
use super::String;
|
|
|
|
use vec::Vec;
|
|
|
|
|
2014-08-04 22:48:39 +12:00
|
|
|
/// Creates a new `String` from a length, capacity, and pointer.
|
2014-07-21 20:44:56 +02:00
|
|
|
///
|
|
|
|
/// This is unsafe because:
|
2014-08-04 22:48:39 +12:00
|
|
|
/// * We call `Vec::from_raw_parts` to get a `Vec<u8>`;
|
|
|
|
/// * We assume that the `Vec` contains valid UTF-8.
|
2014-07-21 20:44:56 +02:00
|
|
|
#[inline]
|
2014-07-22 17:55:12 +02:00
|
|
|
pub unsafe fn from_parts(buf: *mut u8, length: uint, capacity: uint) -> String {
|
2014-07-21 20:44:56 +02:00
|
|
|
String {
|
2014-10-24 19:31:17 -07:00
|
|
|
vec: Vec::from_raw_parts(buf, length, capacity),
|
2014-07-21 20:44:56 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-08-04 22:48:39 +12:00
|
|
|
/// Creates a `String` from a `*const u8` buffer of the given length.
|
2014-07-20 12:08:40 +02:00
|
|
|
///
|
|
|
|
/// This function is unsafe because of two reasons:
|
2014-08-04 22:48:39 +12:00
|
|
|
/// * A raw pointer is dereferenced and transmuted to `&[u8]`;
|
|
|
|
/// * The slice is not checked to see whether it contains valid UTF-8.
|
2014-07-20 12:08:40 +02:00
|
|
|
pub unsafe fn from_buf_len(buf: *const u8, len: uint) -> String {
|
2014-11-02 17:04:32 -08:00
|
|
|
use slice::CloneSliceAllocPrelude;
|
2014-07-20 12:08:40 +02:00
|
|
|
let slice: &[u8] = mem::transmute(Slice {
|
|
|
|
data: buf,
|
|
|
|
len: len,
|
|
|
|
});
|
|
|
|
self::from_utf8(slice.to_vec())
|
|
|
|
}
|
2014-07-22 17:55:12 +02:00
|
|
|
|
2014-08-04 22:48:39 +12:00
|
|
|
/// Creates a `String` from a null-terminated `*const u8` buffer.
|
2014-07-22 17:55:12 +02:00
|
|
|
///
|
|
|
|
/// This function is unsafe because we dereference memory until we find the NUL character,
|
2014-08-01 19:40:21 -04:00
|
|
|
/// which is not guaranteed to be present. Additionally, the slice is not checked to see
|
2014-07-22 17:55:12 +02:00
|
|
|
/// whether it contains valid UTF-8
|
|
|
|
pub unsafe fn from_buf(buf: *const u8) -> String {
|
|
|
|
let mut len = 0;
|
|
|
|
while *buf.offset(len) != 0 {
|
|
|
|
len += 1;
|
|
|
|
}
|
|
|
|
self::from_buf_len(buf, len as uint)
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Converts a vector of bytes to a new `String` without checking if
|
|
|
|
/// it contains valid UTF-8. This is unsafe because it assumes that
|
2014-08-04 22:48:39 +12:00
|
|
|
/// the UTF-8-ness of the vector has already been validated.
|
2014-07-22 17:55:12 +02:00
|
|
|
#[inline]
|
|
|
|
pub unsafe fn from_utf8(bytes: Vec<u8>) -> String {
|
|
|
|
String { vec: bytes }
|
|
|
|
}
|
2014-07-19 12:23:47 +02:00
|
|
|
}
|
|
|
|
|
2014-04-02 16:54:22 -07:00
|
|
|
#[cfg(test)]
|
|
|
|
mod tests {
|
2014-05-29 19:03:06 -07:00
|
|
|
use std::prelude::*;
|
|
|
|
use test::Bencher;
|
|
|
|
|
2014-07-10 17:53:51 +02:00
|
|
|
use str;
|
2014-11-02 17:04:32 -08:00
|
|
|
use str::{Str, StrPrelude, Owned};
|
2014-11-16 12:38:03 +11:00
|
|
|
use super::{as_string, String, ToString};
|
2014-07-04 22:38:13 +02:00
|
|
|
use vec::Vec;
|
2014-11-02 17:04:32 -08:00
|
|
|
use slice::CloneSliceAllocPrelude;
|
2014-04-02 16:54:22 -07:00
|
|
|
|
2014-08-23 20:26:53 -04:00
|
|
|
#[test]
|
|
|
|
fn test_as_string() {
|
|
|
|
let x = "foo";
|
|
|
|
assert_eq!(x, as_string(x).as_slice());
|
|
|
|
}
|
|
|
|
|
2014-06-21 03:39:03 -07:00
|
|
|
#[test]
|
|
|
|
fn test_from_str() {
|
|
|
|
let owned: Option<::std::string::String> = from_str("string");
|
|
|
|
assert_eq!(owned.as_ref().map(|s| s.as_slice()), Some("string"));
|
|
|
|
}
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_from_utf8() {
|
2014-10-05 18:11:17 +08:00
|
|
|
let xs = b"hello".to_vec();
|
2014-07-04 22:38:13 +02:00
|
|
|
assert_eq!(String::from_utf8(xs), Ok(String::from_str("hello")));
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-10-05 18:11:17 +08:00
|
|
|
let xs = "ศไทย中华Việt Nam".as_bytes().to_vec();
|
2014-07-04 22:38:13 +02:00
|
|
|
assert_eq!(String::from_utf8(xs), Ok(String::from_str("ศไทย中华Việt Nam")));
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-10-05 18:11:17 +08:00
|
|
|
let xs = b"hello\xFF".to_vec();
|
2014-07-10 18:21:16 +02:00
|
|
|
assert_eq!(String::from_utf8(xs),
|
2014-10-05 18:11:17 +08:00
|
|
|
Err(b"hello\xFF".to_vec()));
|
2014-07-10 18:21:16 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_from_utf8_lossy() {
|
|
|
|
let xs = b"hello";
|
2014-09-11 17:07:49 +12:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs), str::Slice("hello"));
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-07-04 22:38:13 +02:00
|
|
|
let xs = "ศไทย中华Việt Nam".as_bytes();
|
2014-09-11 17:07:49 +12:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs), str::Slice("ศไทย中华Việt Nam"));
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
let xs = b"Hello\xC2 There\xFF Goodbye";
|
2014-07-04 22:38:13 +02:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs),
|
|
|
|
Owned(String::from_str("Hello\uFFFD There\uFFFD Goodbye")));
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
let xs = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
|
|
|
|
assert_eq!(String::from_utf8_lossy(xs),
|
|
|
|
Owned(String::from_str("Hello\uFFFD\uFFFD There\uFFFD Goodbye")));
|
|
|
|
|
|
|
|
let xs = b"\xF5foo\xF5\x80bar";
|
2014-07-04 22:38:13 +02:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs),
|
|
|
|
Owned(String::from_str("\uFFFDfoo\uFFFD\uFFFDbar")));
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
let xs = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
|
2014-07-04 22:38:13 +02:00
|
|
|
assert_eq!(String::from_utf8_lossy(xs),
|
|
|
|
Owned(String::from_str("\uFFFDfoo\uFFFDbar\uFFFDbaz")));
|
2014-07-10 18:21:16 +02:00
|
|
|
|
|
|
|
let xs = b"\xF4foo\xF4\x80bar\xF4\xBFbaz";
|
|
|
|
assert_eq!(String::from_utf8_lossy(xs),
|
|
|
|
Owned(String::from_str("\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz")));
|
|
|
|
|
|
|
|
let xs = b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar";
|
|
|
|
assert_eq!(String::from_utf8_lossy(xs), Owned(String::from_str("\uFFFD\uFFFD\uFFFD\uFFFD\
|
|
|
|
foo\U00010000bar")));
|
|
|
|
|
|
|
|
// surrogates
|
|
|
|
let xs = b"\xED\xA0\x80foo\xED\xBF\xBFbar";
|
|
|
|
assert_eq!(String::from_utf8_lossy(xs), Owned(String::from_str("\uFFFD\uFFFD\uFFFDfoo\
|
|
|
|
\uFFFD\uFFFD\uFFFDbar")));
|
|
|
|
}
|
|
|
|
|
2014-07-10 17:53:51 +02:00
|
|
|
#[test]
|
|
|
|
fn test_from_utf16() {
|
|
|
|
let pairs =
|
2014-07-04 22:38:13 +02:00
|
|
|
[(String::from_str("𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n"),
|
2014-07-10 17:53:51 +02:00
|
|
|
vec![0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
|
|
|
|
0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
|
|
|
|
0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
|
|
|
|
0xd800_u16, 0xdf30_u16, 0x000a_u16]),
|
|
|
|
|
2014-07-04 22:38:13 +02:00
|
|
|
(String::from_str("𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n"),
|
2014-07-10 17:53:51 +02:00
|
|
|
vec![0xd801_u16, 0xdc12_u16, 0xd801_u16,
|
|
|
|
0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
|
|
|
|
0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
|
|
|
|
0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
|
|
|
|
0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
|
|
|
|
0x000a_u16]),
|
|
|
|
|
2014-07-04 22:38:13 +02:00
|
|
|
(String::from_str("𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n"),
|
2014-07-10 17:53:51 +02:00
|
|
|
vec![0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
|
|
|
|
0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
|
|
|
|
0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
|
|
|
|
0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
|
|
|
|
0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
|
|
|
|
0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
|
|
|
|
0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
|
|
|
|
|
2014-07-04 22:38:13 +02:00
|
|
|
(String::from_str("𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n"),
|
2014-07-10 17:53:51 +02:00
|
|
|
vec![0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
|
|
|
|
0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
|
|
|
|
0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
|
|
|
|
0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
|
|
|
|
0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
|
|
|
|
0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
|
|
|
|
0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
|
|
|
|
0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
|
|
|
|
0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
|
|
|
|
0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
|
|
|
|
0x000a_u16 ]),
|
|
|
|
// Issue #12318, even-numbered non-BMP planes
|
|
|
|
(String::from_str("\U00020000"),
|
|
|
|
vec![0xD840, 0xDC00])];
|
|
|
|
|
|
|
|
for p in pairs.iter() {
|
|
|
|
let (s, u) = (*p).clone();
|
|
|
|
let s_as_utf16 = s.as_slice().utf16_units().collect::<Vec<u16>>();
|
|
|
|
let u_as_string = String::from_utf16(u.as_slice()).unwrap();
|
|
|
|
|
|
|
|
assert!(str::is_utf16(u.as_slice()));
|
|
|
|
assert_eq!(s_as_utf16, u);
|
|
|
|
|
|
|
|
assert_eq!(u_as_string, s);
|
|
|
|
assert_eq!(String::from_utf16_lossy(u.as_slice()), s);
|
|
|
|
|
|
|
|
assert_eq!(String::from_utf16(s_as_utf16.as_slice()).unwrap(), s);
|
|
|
|
assert_eq!(u_as_string.as_slice().utf16_units().collect::<Vec<u16>>(), u);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_utf16_invalid() {
|
|
|
|
// completely positive cases tested above.
|
|
|
|
// lead + eof
|
2014-11-17 21:39:01 +13:00
|
|
|
assert_eq!(String::from_utf16(&[0xD800]), None);
|
2014-07-10 17:53:51 +02:00
|
|
|
// lead + lead
|
2014-11-17 21:39:01 +13:00
|
|
|
assert_eq!(String::from_utf16(&[0xD800, 0xD800]), None);
|
2014-07-10 17:53:51 +02:00
|
|
|
|
|
|
|
// isolated trail
|
2014-11-17 21:39:01 +13:00
|
|
|
assert_eq!(String::from_utf16(&[0x0061, 0xDC00]), None);
|
2014-07-10 17:53:51 +02:00
|
|
|
|
|
|
|
// general
|
2014-11-17 21:39:01 +13:00
|
|
|
assert_eq!(String::from_utf16(&[0xD800, 0xd801, 0xdc8b, 0xD800]), None);
|
2014-07-10 17:53:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_from_utf16_lossy() {
|
|
|
|
// completely positive cases tested above.
|
|
|
|
// lead + eof
|
2014-11-17 21:39:01 +13:00
|
|
|
assert_eq!(String::from_utf16_lossy(&[0xD800]), String::from_str("\uFFFD"));
|
2014-07-10 17:53:51 +02:00
|
|
|
// lead + lead
|
2014-11-17 21:39:01 +13:00
|
|
|
assert_eq!(String::from_utf16_lossy(&[0xD800, 0xD800]), String::from_str("\uFFFD\uFFFD"));
|
2014-07-10 17:53:51 +02:00
|
|
|
|
|
|
|
// isolated trail
|
2014-11-17 21:39:01 +13:00
|
|
|
assert_eq!(String::from_utf16_lossy(&[0x0061, 0xDC00]), String::from_str("a\uFFFD"));
|
2014-07-10 17:53:51 +02:00
|
|
|
|
|
|
|
// general
|
2014-11-17 21:39:01 +13:00
|
|
|
assert_eq!(String::from_utf16_lossy(&[0xD800, 0xd801, 0xdc8b, 0xD800]),
|
2014-07-04 22:38:13 +02:00
|
|
|
String::from_str("\uFFFD𐒋\uFFFD"));
|
2014-07-10 17:53:51 +02:00
|
|
|
}
|
2014-06-21 03:39:03 -07:00
|
|
|
|
2014-07-20 12:08:40 +02:00
|
|
|
#[test]
|
|
|
|
fn test_from_buf_len() {
|
|
|
|
unsafe {
|
|
|
|
let a = vec![65u8, 65, 65, 65, 65, 65, 65, 0];
|
|
|
|
assert_eq!(super::raw::from_buf_len(a.as_ptr(), 3), String::from_str("AAA"));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-22 17:55:12 +02:00
|
|
|
#[test]
|
|
|
|
fn test_from_buf() {
|
|
|
|
unsafe {
|
|
|
|
let a = vec![65, 65, 65, 65, 65, 65, 65, 0];
|
|
|
|
let b = a.as_ptr();
|
|
|
|
let c = super::raw::from_buf(b);
|
|
|
|
assert_eq!(c, String::from_str("AAAAAAA"));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-02 16:54:22 -07:00
|
|
|
#[test]
|
|
|
|
fn test_push_bytes() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut s = String::from_str("ABC");
|
2014-04-02 16:54:22 -07:00
|
|
|
unsafe {
|
2014-10-05 18:11:17 +08:00
|
|
|
let mv = s.as_mut_vec();
|
2014-11-17 21:39:01 +13:00
|
|
|
mv.push_all(&[b'D']);
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|
|
|
|
assert_eq!(s.as_slice(), "ABCD");
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_push_str() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut s = String::new();
|
2014-04-02 16:54:22 -07:00
|
|
|
s.push_str("");
|
|
|
|
assert_eq!(s.as_slice().slice_from(0), "");
|
|
|
|
s.push_str("abc");
|
|
|
|
assert_eq!(s.as_slice().slice_from(0), "abc");
|
|
|
|
s.push_str("ประเทศไทย中华Việt Nam");
|
|
|
|
assert_eq!(s.as_slice().slice_from(0), "abcประเทศไทย中华Việt Nam");
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
2014-09-22 08:28:35 -07:00
|
|
|
fn test_push() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut data = String::from_str("ประเทศไทย中");
|
2014-09-22 08:28:35 -07:00
|
|
|
data.push('华');
|
|
|
|
data.push('b'); // 1 byte
|
|
|
|
data.push('¢'); // 2 byte
|
|
|
|
data.push('€'); // 3 byte
|
|
|
|
data.push('𤭢'); // 4 byte
|
2014-04-02 16:54:22 -07:00
|
|
|
assert_eq!(data.as_slice(), "ประเทศไทย中华b¢€𤭢");
|
|
|
|
}
|
|
|
|
|
2014-05-08 21:42:40 +01:00
|
|
|
#[test]
|
2014-10-05 18:11:17 +08:00
|
|
|
fn test_pop() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut data = String::from_str("ประเทศไทย中华b¢€𤭢");
|
2014-10-05 18:11:17 +08:00
|
|
|
assert_eq!(data.pop().unwrap(), '𤭢'); // 4 bytes
|
|
|
|
assert_eq!(data.pop().unwrap(), '€'); // 3 bytes
|
|
|
|
assert_eq!(data.pop().unwrap(), '¢'); // 2 bytes
|
|
|
|
assert_eq!(data.pop().unwrap(), 'b'); // 1 bytes
|
|
|
|
assert_eq!(data.pop().unwrap(), '华');
|
2014-05-08 21:42:40 +01:00
|
|
|
assert_eq!(data.as_slice(), "ประเทศไทย中");
|
|
|
|
}
|
|
|
|
|
2014-04-02 16:54:22 -07:00
|
|
|
#[test]
|
|
|
|
fn test_str_truncate() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut s = String::from_str("12345");
|
2014-04-02 16:54:22 -07:00
|
|
|
s.truncate(5);
|
|
|
|
assert_eq!(s.as_slice(), "12345");
|
|
|
|
s.truncate(3);
|
|
|
|
assert_eq!(s.as_slice(), "123");
|
|
|
|
s.truncate(0);
|
|
|
|
assert_eq!(s.as_slice(), "");
|
|
|
|
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut s = String::from_str("12345");
|
2014-04-02 16:54:22 -07:00
|
|
|
let p = s.as_slice().as_ptr();
|
|
|
|
s.truncate(3);
|
|
|
|
s.push_str("6");
|
|
|
|
let p_ = s.as_slice().as_ptr();
|
|
|
|
assert_eq!(p_, p);
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
#[should_fail]
|
|
|
|
fn test_str_truncate_invalid_len() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut s = String::from_str("12345");
|
2014-04-02 16:54:22 -07:00
|
|
|
s.truncate(6);
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
#[should_fail]
|
|
|
|
fn test_str_truncate_split_codepoint() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut s = String::from_str("\u00FC"); // ü
|
2014-04-02 16:54:22 -07:00
|
|
|
s.truncate(1);
|
|
|
|
}
|
2014-05-11 03:49:09 -07:00
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_str_clear() {
|
2014-05-22 16:57:53 -07:00
|
|
|
let mut s = String::from_str("12345");
|
2014-05-11 03:49:09 -07:00
|
|
|
s.clear();
|
|
|
|
assert_eq!(s.len(), 0);
|
|
|
|
assert_eq!(s.as_slice(), "");
|
|
|
|
}
|
2014-05-27 21:34:00 -07:00
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_str_add() {
|
|
|
|
let a = String::from_str("12345");
|
|
|
|
let b = a + "2";
|
|
|
|
let b = b + String::from_str("2");
|
|
|
|
assert_eq!(b.len(), 7);
|
|
|
|
assert_eq!(b.as_slice(), "1234522");
|
|
|
|
}
|
2014-07-10 18:21:16 +02:00
|
|
|
|
2014-09-22 08:24:14 -07:00
|
|
|
#[test]
|
|
|
|
fn remove() {
|
|
|
|
let mut s = "ศไทย中华Việt Nam; foobar".to_string();;
|
|
|
|
assert_eq!(s.remove(0), Some('ศ'));
|
|
|
|
assert_eq!(s.len(), 33);
|
|
|
|
assert_eq!(s.as_slice(), "ไทย中华Việt Nam; foobar");
|
|
|
|
assert_eq!(s.remove(33), None);
|
|
|
|
assert_eq!(s.remove(300), None);
|
|
|
|
assert_eq!(s.remove(17), Some('ệ'));
|
|
|
|
assert_eq!(s.as_slice(), "ไทย中华Vit Nam; foobar");
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test] #[should_fail]
|
|
|
|
fn remove_bad() {
|
|
|
|
"ศ".to_string().remove(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn insert() {
|
|
|
|
let mut s = "foobar".to_string();
|
|
|
|
s.insert(0, 'ệ');
|
|
|
|
assert_eq!(s.as_slice(), "ệfoobar");
|
|
|
|
s.insert(6, 'ย');
|
|
|
|
assert_eq!(s.as_slice(), "ệfooยbar");
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test] #[should_fail] fn insert_bad1() { "".to_string().insert(1, 't'); }
|
|
|
|
#[test] #[should_fail] fn insert_bad2() { "ệ".to_string().insert(1, 't'); }
|
|
|
|
|
2014-09-26 21:46:22 -07:00
|
|
|
#[test]
|
|
|
|
fn test_slicing() {
|
|
|
|
let s = "foobar".to_string();
|
|
|
|
assert_eq!("foobar", s[]);
|
|
|
|
assert_eq!("foo", s[..3]);
|
|
|
|
assert_eq!("bar", s[3..]);
|
|
|
|
assert_eq!("oob", s[1..4]);
|
|
|
|
}
|
|
|
|
|
2014-11-16 12:38:03 +11:00
|
|
|
#[test]
|
|
|
|
fn test_simple_types() {
|
|
|
|
assert_eq!(1i.to_string(), "1".to_string());
|
|
|
|
assert_eq!((-1i).to_string(), "-1".to_string());
|
|
|
|
assert_eq!(200u.to_string(), "200".to_string());
|
|
|
|
assert_eq!(2u8.to_string(), "2".to_string());
|
|
|
|
assert_eq!(true.to_string(), "true".to_string());
|
|
|
|
assert_eq!(false.to_string(), "false".to_string());
|
|
|
|
assert_eq!(().to_string(), "()".to_string());
|
|
|
|
assert_eq!(("hi".to_string()).to_string(), "hi".to_string());
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_vectors() {
|
|
|
|
let x: Vec<int> = vec![];
|
|
|
|
assert_eq!(x.to_string(), "[]".to_string());
|
|
|
|
assert_eq!((vec![1i]).to_string(), "[1]".to_string());
|
|
|
|
assert_eq!((vec![1i, 2, 3]).to_string(), "[1, 2, 3]".to_string());
|
|
|
|
assert!((vec![vec![], vec![1i], vec![1i, 1]]).to_string() ==
|
|
|
|
"[[], [1], [1, 1]]".to_string());
|
|
|
|
}
|
|
|
|
|
2014-07-10 18:21:16 +02:00
|
|
|
#[bench]
|
|
|
|
fn bench_with_capacity(b: &mut Bencher) {
|
|
|
|
b.iter(|| {
|
|
|
|
String::with_capacity(100)
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn bench_push_str(b: &mut Bencher) {
|
|
|
|
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
|
|
|
|
b.iter(|| {
|
|
|
|
let mut r = String::new();
|
|
|
|
r.push_str(s);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn from_utf8_lossy_100_ascii(b: &mut Bencher) {
|
|
|
|
let s = b"Hello there, the quick brown fox jumped over the lazy dog! \
|
|
|
|
Lorem ipsum dolor sit amet, consectetur. ";
|
|
|
|
|
|
|
|
assert_eq!(100, s.len());
|
|
|
|
b.iter(|| {
|
|
|
|
let _ = String::from_utf8_lossy(s);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn from_utf8_lossy_100_multibyte(b: &mut Bencher) {
|
2014-07-21 00:43:08 -05:00
|
|
|
let s = "𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰".as_bytes();
|
2014-07-10 18:21:16 +02:00
|
|
|
assert_eq!(100, s.len());
|
|
|
|
b.iter(|| {
|
|
|
|
let _ = String::from_utf8_lossy(s);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn from_utf8_lossy_invalid(b: &mut Bencher) {
|
|
|
|
let s = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
|
|
|
|
b.iter(|| {
|
|
|
|
let _ = String::from_utf8_lossy(s);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
#[bench]
|
|
|
|
fn from_utf8_lossy_100_invalid(b: &mut Bencher) {
|
|
|
|
let s = Vec::from_elem(100, 0xF5u8);
|
|
|
|
b.iter(|| {
|
|
|
|
let _ = String::from_utf8_lossy(s.as_slice());
|
|
|
|
});
|
|
|
|
}
|
2014-04-02 16:54:22 -07:00
|
|
|
}
|