From 4d73da92f0e6d00025609e42b1e7f59f00b73be9 Mon Sep 17 00:00:00 2001 From: Steve Klabnik Date: Fri, 2 Oct 2015 14:36:02 -0400 Subject: [PATCH] Improve documentation for the from_utf8 family Our docs were very basic for the various versions of from_utf8, so this commit beefs them up. It also improves docs for the &str variant's error, Utf8Error. --- src/libcollections/string.rs | 129 ++++++++++++++++++++++++++++++----- src/libcore/str/mod.rs | 124 +++++++++++++++++++++++++++++++-- 2 files changed, 230 insertions(+), 23 deletions(-) diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs index acbce825ecc..271dbffcc2e 100644 --- a/src/libcollections/string.rs +++ b/src/libcollections/string.rs @@ -92,26 +92,61 @@ pub fn from_str(_: &str) -> String { panic!("not available with cfg(test)"); } - /// Returns the vector as a string buffer, if possible, taking care not to - /// copy it. + /// Converts a vector of bytes to a `String`. + /// + /// A string slice (`&str`) is made of bytes (`u8`), and a vector of bytes + /// (`Vec<u8>`) is made of bytes, so this function converts between the + /// two. Not all byte slices are valid `String`s, however: `String` + /// requires that it is valid UTF-8. `from_utf8()` checks to ensure that + /// the bytes are valid UTF-8, and then does the conversion. + /// + /// If you are sure that the byte slice is valid UTF-8, and you don't want + /// to incur the overhead of the validity check, there is an unsafe version + /// of this function, [`from_utf8_unchecked()`][fromutf8], which has the + /// same behavior but skips the check. + /// + /// [fromutf8]: struct.String.html#method.from_utf8_unchecked + /// + /// This method will take care to not copy the vector, for efficiency's + /// sake. + /// + /// If you need a `&str` instead of a `String`, consider + /// [`str::from_utf8()`][str]. 
+ /// + /// [str]: ../str/fn.from_utf8.html /// /// # Failure /// - /// If the given vector is not valid UTF-8, then the original vector and the - /// corresponding error is returned. + /// Returns `Err` if the slice is not UTF-8 with a description as to why the + /// provided bytes are not UTF-8. The vector you moved in is also included. /// /// # Examples /// - /// ``` - /// let hello_vec = vec![104, 101, 108, 108, 111]; - /// let s = String::from_utf8(hello_vec).unwrap(); - /// assert_eq!(s, "hello"); + /// Basic usage: /// - /// let invalid_vec = vec![240, 144, 128]; - /// let s = String::from_utf8(invalid_vec).err().unwrap(); - /// let err = s.utf8_error(); - /// assert_eq!(s.into_bytes(), [240, 144, 128]); /// ``` + /// // some bytes, in a vector + /// let sparkle_heart = vec![240, 159, 146, 150]; + /// + /// // We know these bytes are valid, so just use `unwrap()`. + /// let sparkle_heart = String::from_utf8(sparkle_heart).unwrap(); + /// + /// assert_eq!("💖", sparkle_heart); + /// ``` + /// + /// Incorrect bytes: + /// + /// ``` + /// // some invalid bytes, in a vector + /// let sparkle_heart = vec![0, 159, 146, 150]; + /// + /// assert!(String::from_utf8(sparkle_heart).is_err()); + /// ``` + /// + /// See the docs for [`FromUtf8Error`][error] for more details on what you + /// can do with this error. + /// + /// [error]: struct.FromUtf8Error.html #[inline] #[stable(feature = "rust1", since = "1.0.0")] pub fn from_utf8(vec: Vec) -> Result { @@ -121,15 +156,49 @@ pub fn from_utf8(vec: Vec) -> Result { } } - /// Converts a vector of bytes to a new UTF-8 string. - /// Any invalid UTF-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER. + /// Converts a slice of bytes to a `String`, including invalid characters. + /// + /// A string slice (`&str`) is made of bytes (`u8`), and a slice of bytes + /// (`&[u8]`) is made of bytes, so this function converts between the two. 
+ /// Not all byte slices are valid string slices, however: `&str` requires + /// that it is valid UTF-8. During this conversion, `from_utf8_lossy()` + /// will replace any invalid UTF-8 sequences with + /// `U+FFFD REPLACEMENT CHARACTER`, which looks like this: οΏ½ + /// + /// If you are sure that the byte slice is valid UTF-8, and you don't want + /// to incur the overhead of the conversion, there is an unsafe version + /// of this function, [`from_utf8_unchecked()`][fromutf8], which has the + /// same behavior but skips the checks. + /// + /// [fromutf8]: struct.String.html#method.from_utf8_unchecked + /// + /// If you need a `&str` instead of a `String`, consider + /// [`str::from_utf8()`][str]. + /// + /// [str]: ../str/fn.from_utf8.html /// /// # Examples /// + /// Basic usage: + /// /// ``` + /// // some bytes, in a vector + /// let sparkle_heart = vec![240, 159, 146, 150]; + /// + /// // We know these bytes are valid, so just use `unwrap()`. + /// let sparkle_heart = String::from_utf8(sparkle_heart).unwrap(); + /// + /// assert_eq!("πŸ’–", sparkle_heart); + /// ``` + /// + /// Incorrect bytes: + /// + /// ``` + /// // some invalid bytes /// let input = b"Hello \xF0\x90\x80World"; /// let output = String::from_utf8_lossy(input); - /// assert_eq!(output, "Hello \u{FFFD}World"); + /// + /// assert_eq!("Hello οΏ½World", output); /// ``` #[stable(feature = "rust1", since = "1.0.0")] pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> Cow<'a, str> { @@ -309,9 +378,33 @@ pub unsafe fn from_raw_parts(buf: *mut u8, length: usize, capacity: usize) -> St } } - /// Converts a vector of bytes to a new `String` without checking if - /// it contains valid UTF-8. This is unsafe because it assumes that - /// the UTF-8-ness of the vector has already been validated. + /// Converts a vector of bytes to a `String` without checking that the + /// string contains valid UTF-8. + /// + /// See the safe version, [`from_utrf8()`][fromutf8], for more. 
+ /// + /// [fromutf8]: struct.String.html#method.from_utf8 + /// + /// # Unsafety + /// + /// This function is unsafe because it does not check that the bytes passed to + /// it are valid UTF-8. If this constraint is violated, undefined behavior + /// results, as the rest of Rust assumes that `String`s are valid UTF-8. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// // some bytes, in a vector + /// let sparkle_heart = vec![240, 159, 146, 150]; + /// + /// let sparkle_heart = unsafe { + /// String::from_utf8_unchecked(sparkle_heart) + /// }; + /// + /// assert_eq!("💖", sparkle_heart); + /// ``` #[inline] #[stable(feature = "rust1", since = "1.0.0")] pub unsafe fn from_utf8_unchecked(bytes: Vec) -> String { diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index be2186945d5..9f1439ea388 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -119,7 +119,11 @@ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { Section: Creating a string */ -/// Errors which can occur when attempting to interpret a byte slice as a `str`. +/// Errors which can occur when attempting to interpret a sequence of `u8` +/// as a string. +/// +/// As such, the `from_utf8` family of functions and methods for both `String`s +/// and `&str`s make use of this error, for example. #[derive(Copy, Eq, PartialEq, Clone, Debug)] #[stable(feature = "rust1", since = "1.0.0")] pub struct Utf8Error { @@ -132,21 +136,104 @@ impl Utf8Error { /// /// It is the maximum index such that `from_utf8(input[..index])` /// would return `Some(_)`. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(utf8_error)] + /// + /// use std::str; + /// + /// // some invalid bytes, in a vector + /// let sparkle_heart = vec![0, 159, 146, 150]; + /// + /// // std::str::from_utf8 returns a Utf8Error + /// let error = str::from_utf8(&sparkle_heart).unwrap_err(); + /// + /// // the second byte is invalid here + /// assert_eq!(1, error.valid_up_to()); + /// ``` #[unstable(feature = "utf8_error", reason = "method just added", issue = "27734")] pub fn valid_up_to(&self) -> usize { self.valid_up_to } } -/// Converts a slice of bytes to a string slice without performing any -/// allocations. +/// Converts a slice of bytes to a string slice. /// -/// Once the slice has been validated as UTF-8, it is transmuted in-place and -/// returned as a '&str' instead of a '&[u8]' +/// A string slice (`&str`) is made of bytes (`u8`), and a byte slice (`&[u8]`) +/// is made of bytes, so this function converts between the two. Not all byte +/// slices are valid string slices, however: `&str` requires that it is valid +/// UTF-8. `from_utf8()` checks to ensure that the bytes are valid UTF-8, and +/// then does the conversion. +/// +/// If you are sure that the byte slice is valid UTF-8, and you don't want to +/// incur the overhead of the validity check, there is an unsafe version of +/// this function, [`from_utf8_unchecked()`][fromutf8], which has the same +/// behavior but skips the check. +/// +/// [fromutf8]: fn.from_utf8_unchecked.html +/// +/// If you need a `String` instead of a `&str`, consider +/// [`String::from_utf8()`][string]. +/// +/// [string]: ../string/struct.String.html#method.from_utf8 +/// +/// Because you can stack-allocate a `[u8; N]`, and you can take a `&[u8]` of +/// it, this function is one way to have a stack-allocated string. There is +/// an example of this in the examples section below. 
+/// /// # Failure /// /// Returns `Err` if the slice is not UTF-8 with a description as to why the /// provided slice is not UTF-8. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::str; +/// +/// // some bytes, in a vector +/// let sparkle_heart = vec![240, 159, 146, 150]; +/// +/// // We know these bytes are valid, so just use `unwrap()`. +/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap(); +/// +/// assert_eq!("💖", sparkle_heart); +/// ``` +/// +/// Incorrect bytes: +/// +/// ``` +/// use std::str; +/// +/// // some invalid bytes, in a vector +/// let sparkle_heart = vec![0, 159, 146, 150]; +/// +/// assert!(str::from_utf8(&sparkle_heart).is_err()); +/// ``` +/// +/// See the docs for [`Utf8Error`][error] for more details on the kinds of +/// errors that can be returned. +/// +/// [error]: struct.Utf8Error.html +/// +/// A "stack-allocated string": +/// +/// ``` +/// use std::str; +/// +/// // some bytes, in a stack-allocated array +/// let sparkle_heart = [240, 159, 146, 150]; +/// +/// // We know these bytes are valid, so just use `unwrap()`. +/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap(); +/// +/// assert_eq!("💖", sparkle_heart); +/// ``` #[stable(feature = "rust1", since = "1.0.0")] pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { try!(run_utf8_validation_iterator(&mut v.iter())); @@ -155,6 +242,33 @@ pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { /// Converts a slice of bytes to a string slice without checking /// that the string contains valid UTF-8. +/// +/// See the safe version, [`from_utf8()`][fromutf8], for more. +/// +/// [fromutf8]: fn.from_utf8.html +/// +/// # Unsafety +/// +/// This function is unsafe because it does not check that the bytes passed to +/// it are valid UTF-8. If this constraint is violated, undefined behavior +/// results, as the rest of Rust assumes that `&str`s are valid UTF-8. 
+/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::str; +/// +/// // some bytes, in a vector +/// let sparkle_heart = vec![240, 159, 146, 150]; +/// +/// let sparkle_heart = unsafe { +/// str::from_utf8_unchecked(&sparkle_heart) +/// }; +/// +/// assert_eq!("💖", sparkle_heart); +/// ``` #[inline(always)] #[stable(feature = "rust1", since = "1.0.0")] pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {