diff --git a/src/compiletest/compiletest.rs b/src/compiletest/compiletest.rs index 073f16e354d..7348c85aa82 100644 --- a/src/compiletest/compiletest.rs +++ b/src/compiletest/compiletest.rs @@ -19,7 +19,6 @@ #![feature(unboxed_closures)] #![feature(std_misc)] #![feature(test)] -#![feature(unicode)] #![feature(core)] #![feature(path)] #![feature(io)] diff --git a/src/compiletest/errors.rs b/src/compiletest/errors.rs index 25f962c5785..2b0e7985229 100644 --- a/src/compiletest/errors.rs +++ b/src/compiletest/errors.rs @@ -71,7 +71,7 @@ fn parse_expected(last_nonfollow_error: Option, let letters = line[kind_start..].chars(); let kind = letters.skip_while(|c| c.is_whitespace()) .take_while(|c| !c.is_whitespace()) - .map(|c| c.to_lowercase()) + .flat_map(|c| c.to_lowercase()) .collect::(); let letters = line[kind_start..].chars(); let msg = letters.skip_while(|c| c.is_whitespace()) diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs index 99547b9c60a..9dd5e2dd9bc 100644 --- a/src/libcollections/str.rs +++ b/src/libcollections/str.rs @@ -12,13 +12,14 @@ //! Unicode string manipulation (the [`str`](../primitive.str.html) type). //! -//! Rust's [`str`](../primitive.str.html) type is one of the core primitive types of the -//! language. `&str` is the borrowed string type. This type of string can only be created -//! from other strings, unless it is a `&'static str` (see below). It is not possible to -//! move out of borrowed strings because they are owned elsewhere. +//! Rust's [`str`](../primitive.str.html) type is one of the core primitive +//! types of the language. `&str` is the borrowed string type. This type of +//! string can only be created from other strings, unless it is a `&'static str` +//! (see below). It is not possible to move out of borrowed strings because they +//! are owned elsewhere. //! -//! Basic operations are implemented directly by the compiler, but more advanced operations are -//! defined on the [`StrExt`](trait.StrExt.html) trait. 
+//! Basic operations are implemented directly by the compiler, but more advanced +//! operations are defined on the [`StrExt`](trait.StrExt.html) trait. //! //! # Examples //! @@ -28,8 +29,9 @@ //! let s = "Hello, world."; //! ``` //! -//! This `&str` is a `&'static str`, which is the type of string literals. They're `'static` -//! because literals are available for the entire lifetime of the program. +//! This `&str` is a `&'static str`, which is the type of string literals. +//! They're `'static` because literals are available for the entire lifetime of +//! the program. //! //! You can get a non-`'static` `&str` by taking a slice of a `String`: //! @@ -40,12 +42,13 @@ //! //! # Representation //! -//! Rust's string type, `str`, is a sequence of Unicode scalar values encoded as a stream of UTF-8 -//! bytes. All [strings](../../reference.html#literals) are guaranteed to be validly encoded UTF-8 -//! sequences. Additionally, strings are not null-terminated and can thus contain null bytes. +//! Rust's string type, `str`, is a sequence of Unicode scalar values encoded as +//! a stream of UTF-8 bytes. All [strings](../../reference.html#literals) are +//! guaranteed to be validly encoded UTF-8 sequences. Additionally, strings are +//! not null-terminated and can thus contain null bytes. //! -//! The actual representation of `str`s have direct mappings to slices: `&str` is the same as -//! `&[u8]`. +//! The actual representation of `str`s has a direct mapping to slices: `&str` +//! is the same as `&[u8]`. 
#![doc(primitive = "str")] #![stable(feature = "rust1", since = "1.0.0")] @@ -53,16 +56,16 @@ use self::RecompositionState::*; use self::DecompositionType::*; -use core::char::CharExt; use core::clone::Clone; use core::iter::AdditiveIterator; -use core::iter::{Iterator, IteratorExt}; +use core::iter::{Iterator, IteratorExt, Extend}; use core::ops::Index; use core::ops::RangeFull; use core::option::Option::{self, Some, None}; use core::result::Result; use core::slice::AsSlice; use core::str as core_str; +use unicode::char::CharExt; use unicode::str::{UnicodeStr, Utf16Encoder}; use vec_deque::VecDeque; @@ -836,17 +839,19 @@ pub trait StrExt: Index { /// Returns a slice of the string from the character range [`begin`..`end`). /// - /// That is, start at the `begin`-th code point of the string and continue to the `end`-th code - /// point. This does not detect or handle edge cases such as leaving a combining character as - /// the first code point of the string. + /// That is, start at the `begin`-th code point of the string and continue + /// to the `end`-th code point. This does not detect or handle edge cases + /// such as leaving a combining character as the first code point of the + /// string. /// - /// Due to the design of UTF-8, this operation is `O(end)`. See `slice`, `slice_to` and - /// `slice_from` for `O(1)` variants that use byte indices rather than code point indices. + /// Due to the design of UTF-8, this operation is `O(end)`. See `slice`, + /// `slice_to` and `slice_from` for `O(1)` variants that use byte indices + /// rather than code point indices. /// /// # Panics /// - /// Panics if `begin` > `end` or the either `begin` or `end` are beyond the last character of - /// the string. + /// Panics if `begin` > `end` or if either `begin` or `end` is beyond the + /// last character of the string. 
/// /// # Examples /// @@ -868,8 +873,8 @@ pub trait StrExt: Index { /// /// # Unsafety /// - /// Caller must check both UTF-8 character boundaries and the boundaries of the entire slice as - /// well. + /// Caller must check both UTF-8 character boundaries and the boundaries of + /// the entire slice as well. /// /// # Examples /// @@ -1506,6 +1511,32 @@ pub trait StrExt: Index { fn trim_right(&self) -> &str { UnicodeStr::trim_right(&self[..]) } + + /// Returns the lowercase equivalent of this string. + /// + /// # Examples + /// + /// let s = "HELLO"; + /// assert_eq!(s.to_lowercase(), "hello"); + #[unstable(feature = "collections")] + fn to_lowercase(&self) -> String { + let mut s = String::with_capacity(self.len()); + s.extend(self[..].chars().flat_map(|c| c.to_lowercase())); + return s; + } + + /// Returns the uppercase equivalent of this string. + /// + /// # Examples + /// + /// let s = "hello"; + /// assert_eq!(s.to_uppercase(), "HELLO"); + #[unstable(feature = "collections")] + fn to_uppercase(&self) -> String { + let mut s = String::with_capacity(self.len()); + s.extend(self[..].chars().flat_map(|c| c.to_uppercase())); + return s; + } } #[stable(feature = "rust1", since = "1.0.0")] diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 973070677d8..010415b364a 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -118,7 +118,7 @@ pub fn from_u32(i: u32) -> Option { /// assert_eq!(c, Some('4')); /// ``` #[inline] -#[unstable(feature = "core", reason = "pending integer conventions")] +#[stable(feature = "rust1", since = "1.0.0")] pub fn from_digit(num: u32, radix: u32) -> Option { if radix > 36 { panic!("from_digit: radix is too high (maximum 36)"); @@ -136,230 +136,25 @@ pub fn from_digit(num: u32, radix: u32) -> Option { } } -/// Basic `char` manipulations. 
-#[stable(feature = "rust1", since = "1.0.0")] +// NB: the stabilization and documentation for this trait are in +// libunicode/char.rs, not here +#[allow(missing_docs)] // docs in libunicode/char.rs pub trait CharExt { - /// Checks if a `char` parses as a numeric digit in the given radix. - /// - /// Compared to `is_numeric()`, this function only recognizes the characters - /// `0-9`, `a-z` and `A-Z`. - /// - /// # Return value - /// - /// Returns `true` if `c` is a valid digit under `radix`, and `false` - /// otherwise. - /// - /// # Panics - /// - /// Panics if given a radix > 36. - /// - /// # Examples - /// - /// ``` - /// let c = '1'; - /// - /// assert!(c.is_digit(10)); - /// - /// assert!('f'.is_digit(16)); - /// ``` - #[unstable(feature = "core", - reason = "pending integer conventions")] fn is_digit(self, radix: u32) -> bool; - - /// Converts a character to the corresponding digit. - /// - /// # Return value - /// - /// If `c` is between '0' and '9', the corresponding value between 0 and - /// 9. If `c` is 'a' or 'A', 10. If `c` is 'b' or 'B', 11, etc. Returns - /// none if the character does not refer to a digit in the given radix. - /// - /// # Panics - /// - /// Panics if given a radix outside the range [0..36]. - /// - /// # Examples - /// - /// ``` - /// let c = '1'; - /// - /// assert_eq!(c.to_digit(10), Some(1)); - /// - /// assert_eq!('f'.to_digit(16), Some(15)); - /// ``` - #[unstable(feature = "core", - reason = "pending integer conventions")] fn to_digit(self, radix: u32) -> Option; - - /// Returns an iterator that yields the hexadecimal Unicode escape of a character, as `char`s. - /// - /// All characters are escaped with Rust syntax of the form `\\u{NNNN}` where `NNNN` is the - /// shortest hexadecimal representation of the code point. 
- /// - /// # Examples - /// - /// ``` - /// for i in '❤'.escape_unicode() { - /// println!("{}", i); - /// } - /// ``` - /// - /// This prints: - /// - /// ```text - /// \ - /// u - /// { - /// 2 - /// 7 - /// 6 - /// 4 - /// } - /// ``` - /// - /// Collecting into a `String`: - /// - /// ``` - /// let heart: String = '❤'.escape_unicode().collect(); - /// - /// assert_eq!(heart, r"\u{2764}"); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] fn escape_unicode(self) -> EscapeUnicode; - - /// Returns an iterator that yields the 'default' ASCII and - /// C++11-like literal escape of a character, as `char`s. - /// - /// The default is chosen with a bias toward producing literals that are - /// legal in a variety of languages, including C++11 and similar C-family - /// languages. The exact rules are: - /// - /// * Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively. - /// * Single-quote, double-quote and backslash chars are backslash- - /// escaped. - /// * Any other chars in the range [0x20,0x7e] are not escaped. - /// * Any other chars are given hex Unicode escapes; see `escape_unicode`. - /// - /// # Examples - /// - /// ``` - /// for i in '"'.escape_default() { - /// println!("{}", i); - /// } - /// ``` - /// - /// This prints: - /// - /// ```text - /// \ - /// " - /// ``` - /// - /// Collecting into a `String`: - /// - /// ``` - /// let quote: String = '"'.escape_default().collect(); - /// - /// assert_eq!(quote, "\\\""); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] fn escape_default(self) -> EscapeDefault; - - /// Returns the number of bytes this character would need if encoded in UTF-8. - /// - /// # Examples - /// - /// ``` - /// let n = 'ß'.len_utf8(); - /// - /// assert_eq!(n, 2); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] fn len_utf8(self) -> usize; - - /// Returns the number of bytes this character would need if encoded in UTF-16. 
- /// - /// # Examples - /// - /// ``` - /// let n = 'ß'.len_utf16(); - /// - /// assert_eq!(n, 1); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] fn len_utf16(self) -> usize; - - /// Encodes this character as UTF-8 into the provided byte buffer, and then returns the number - /// of bytes written. - /// - /// If the buffer is not large enough, nothing will be written into it and a `None` will be - /// returned. - /// - /// # Examples - /// - /// In both of these examples, 'ß' takes two bytes to encode. - /// - /// ``` - /// let mut b = [0; 2]; - /// - /// let result = 'ß'.encode_utf8(&mut b); - /// - /// assert_eq!(result, Some(2)); - /// ``` - /// - /// A buffer that's too small: - /// - /// ``` - /// let mut b = [0; 1]; - /// - /// let result = 'ß'.encode_utf8(&mut b); - /// - /// assert_eq!(result, None); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] fn encode_utf8(self, dst: &mut [u8]) -> Option; - - /// Encodes this character as UTF-16 into the provided `u16` buffer, and then returns the - /// number of `u16`s written. - /// - /// If the buffer is not large enough, nothing will be written into it and a `None` will be - /// returned. - /// - /// # Examples - /// - /// In both of these examples, 'ß' takes one byte to encode. 
- /// - /// ``` - /// let mut b = [0; 1]; - /// - /// let result = 'ß'.encode_utf16(&mut b); - /// - /// assert_eq!(result, Some(1)); - /// ``` - /// - /// A buffer that's too small: - /// - /// ``` - /// let mut b = [0; 0]; - /// - /// let result = 'ß'.encode_utf8(&mut b); - /// - /// assert_eq!(result, None); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] fn encode_utf16(self, dst: &mut [u16]) -> Option; } -#[stable(feature = "rust1", since = "1.0.0")] impl CharExt for char { - #[unstable(feature = "core", - reason = "pending integer conventions")] fn is_digit(self, radix: u32) -> bool { self.to_digit(radix).is_some() } - #[unstable(feature = "core", - reason = "pending integer conventions")] fn to_digit(self, radix: u32) -> Option { if radix > 36 { panic!("to_digit: radix is too high (maximum 36)"); @@ -374,12 +169,10 @@ impl CharExt for char { else { None } } - #[stable(feature = "rust1", since = "1.0.0")] fn escape_unicode(self) -> EscapeUnicode { EscapeUnicode { c: self, state: EscapeUnicodeState::Backslash } } - #[stable(feature = "rust1", since = "1.0.0")] fn escape_default(self) -> EscapeDefault { let init_state = match self { '\t' => EscapeDefaultState::Backslash('t'), @@ -395,7 +188,6 @@ impl CharExt for char { } #[inline] - #[stable(feature = "rust1", since = "1.0.0")] fn len_utf8(self) -> usize { let code = self as u32; if code < MAX_ONE_B { @@ -410,22 +202,17 @@ impl CharExt for char { } #[inline] - #[stable(feature = "rust1", since = "1.0.0")] fn len_utf16(self) -> usize { let ch = self as u32; if (ch & 0xFFFF) == ch { 1 } else { 2 } } #[inline] - #[unstable(feature = "core", - reason = "pending decision about Iterator/Writer/Reader")] fn encode_utf8(self, dst: &mut [u8]) -> Option { encode_utf8_raw(self as u32, dst) } #[inline] - #[unstable(feature = "core", - reason = "pending decision about Iterator/Writer/Reader")] fn encode_utf16(self, dst: &mut [u16]) -> Option { encode_utf16_raw(self as u32, dst) } @@ -437,7 +224,6 @@ impl CharExt 
for char { /// If the buffer is not large enough, nothing will be written into it /// and a `None` will be returned. #[inline] -#[unstable(feature = "core")] pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option { // Marked #[inline] to allow llvm optimizing it away if code < MAX_ONE_B && dst.len() >= 1 { @@ -469,7 +255,6 @@ pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option { /// If the buffer is not large enough, nothing will be written into it /// and a `None` will be returned. #[inline] -#[unstable(feature = "core")] pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option { // Marked #[inline] to allow llvm optimizing it away if (ch & 0xFFFF) == ch && dst.len() >= 1 { @@ -497,7 +282,6 @@ pub struct EscapeUnicode { } #[derive(Clone)] -#[unstable(feature = "core")] enum EscapeUnicodeState { Backslash, Type, @@ -559,7 +343,6 @@ pub struct EscapeDefault { } #[derive(Clone)] -#[unstable(feature = "core")] enum EscapeDefaultState { Backslash(char), Char(char), diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs index 46d1f7ff3ae..65e941d160d 100644 --- a/src/libcoretest/char.rs +++ b/src/libcoretest/char.rs @@ -57,35 +57,47 @@ fn test_to_digit() { #[test] fn test_to_lowercase() { - assert_eq!('A'.to_lowercase(), 'a'); - assert_eq!('Ö'.to_lowercase(), 'ö'); - assert_eq!('ß'.to_lowercase(), 'ß'); - assert_eq!('Ü'.to_lowercase(), 'ü'); - assert_eq!('💩'.to_lowercase(), '💩'); - assert_eq!('Σ'.to_lowercase(), 'σ'); - assert_eq!('Τ'.to_lowercase(), 'τ'); - assert_eq!('Ι'.to_lowercase(), 'ι'); - assert_eq!('Γ'.to_lowercase(), 'γ'); - assert_eq!('Μ'.to_lowercase(), 'μ'); - assert_eq!('Α'.to_lowercase(), 'α'); - assert_eq!('Σ'.to_lowercase(), 'σ'); + fn lower(c: char) -> char { + let mut it = c.to_lowercase(); + let c = it.next().unwrap(); + assert!(it.next().is_none()); + c + } + assert_eq!(lower('A'), 'a'); + assert_eq!(lower('Ö'), 'ö'); + assert_eq!(lower('ß'), 'ß'); + assert_eq!(lower('Ü'), 'ü'); + assert_eq!(lower('💩'), '💩'); + 
assert_eq!(lower('Σ'), 'σ'); + assert_eq!(lower('Τ'), 'τ'); + assert_eq!(lower('Ι'), 'ι'); + assert_eq!(lower('Γ'), 'γ'); + assert_eq!(lower('Μ'), 'μ'); + assert_eq!(lower('Α'), 'α'); + assert_eq!(lower('Σ'), 'σ'); } #[test] fn test_to_uppercase() { - assert_eq!('a'.to_uppercase(), 'A'); - assert_eq!('ö'.to_uppercase(), 'Ö'); - assert_eq!('ß'.to_uppercase(), 'ß'); // not ẞ: Latin capital letter sharp s - assert_eq!('ü'.to_uppercase(), 'Ü'); - assert_eq!('💩'.to_uppercase(), '💩'); + fn upper(c: char) -> char { + let mut it = c.to_uppercase(); + let c = it.next().unwrap(); + assert!(it.next().is_none()); + c + } + assert_eq!(upper('a'), 'A'); + assert_eq!(upper('ö'), 'Ö'); + assert_eq!(upper('ß'), 'ß'); // not ẞ: Latin capital letter sharp s + assert_eq!(upper('ü'), 'Ü'); + assert_eq!(upper('💩'), '💩'); - assert_eq!('σ'.to_uppercase(), 'Σ'); - assert_eq!('τ'.to_uppercase(), 'Τ'); - assert_eq!('ι'.to_uppercase(), 'Ι'); - assert_eq!('γ'.to_uppercase(), 'Γ'); - assert_eq!('μ'.to_uppercase(), 'Μ'); - assert_eq!('α'.to_uppercase(), 'Α'); - assert_eq!('ς'.to_uppercase(), 'Σ'); + assert_eq!(upper('σ'), 'Σ'); + assert_eq!(upper('τ'), 'Τ'); + assert_eq!(upper('ι'), 'Ι'); + assert_eq!(upper('γ'), 'Γ'); + assert_eq!(upper('μ'), 'Μ'); + assert_eq!(upper('α'), 'Α'); + assert_eq!(upper('ς'), 'Σ'); } #[test] diff --git a/src/librustc_driver/lib.rs b/src/librustc_driver/lib.rs index c09b018ab63..716b1116a20 100644 --- a/src/librustc_driver/lib.rs +++ b/src/librustc_driver/lib.rs @@ -36,7 +36,6 @@ #![feature(rustc_private)] #![feature(unsafe_destructor)] #![feature(staged_api)] -#![feature(unicode)] #![feature(exit_status)] #![feature(path)] #![feature(io)] @@ -618,8 +617,7 @@ Available lint options: let print_lint_groups = |lints: Vec<(&'static str, Vec)>| { for (name, to) in lints { - let name = name.chars().map(|x| x.to_lowercase()) - .collect::().replace("_", "-"); + let name = name.to_lowercase().replace("_", "-"); let desc = to.into_iter().map(|x| x.as_str().replace("_", "-")) 
.collect::>().connect(", "); println!(" {} {}", diff --git a/src/librustc_lint/builtin.rs b/src/librustc_lint/builtin.rs index 1eea52fe1bb..7e4d3e3ca59 100644 --- a/src/librustc_lint/builtin.rs +++ b/src/librustc_lint/builtin.rs @@ -810,11 +810,11 @@ impl NonCamelCaseTypes { fn to_camel_case(s: &str) -> String { s.split('_').flat_map(|word| word.chars().enumerate().map(|(i, c)| if i == 0 { - c.to_uppercase() + c.to_uppercase().collect::() } else { - c + c.to_string() } - )).collect() + )).collect::>().concat() } let s = token::get_ident(ident); @@ -947,7 +947,7 @@ impl NonSnakeCase { buf = String::new(); } last_upper = ch.is_uppercase(); - buf.push(ch.to_lowercase()); + buf.extend(ch.to_lowercase()); } words.push(buf); } @@ -1064,8 +1064,7 @@ impl NonUpperCaseGlobals { let s = token::get_ident(ident); if s.chars().any(|c| c.is_lowercase()) { - let uc: String = NonSnakeCase::to_snake_case(&s).chars() - .map(|c| c.to_uppercase()).collect(); + let uc = NonSnakeCase::to_snake_case(&s).to_uppercase(); if uc != &s[..] 
{ cx.span_lint(NON_UPPER_CASE_GLOBALS, span, &format!("{} `{}` should have an upper case name such as `{}`", diff --git a/src/librustc_lint/lib.rs b/src/librustc_lint/lib.rs index 9781e9944f6..a49c9db07a0 100644 --- a/src/librustc_lint/lib.rs +++ b/src/librustc_lint/lib.rs @@ -41,7 +41,6 @@ #![feature(unsafe_destructor)] #![feature(staged_api)] #![feature(std_misc)] -#![feature(unicode)] #![cfg_attr(test, feature(test))] extern crate syntax; diff --git a/src/libstd/old_io/mod.rs b/src/libstd/old_io/mod.rs index 9ce888efceb..f2042b384ce 100644 --- a/src/libstd/old_io/mod.rs +++ b/src/libstd/old_io/mod.rs @@ -343,8 +343,7 @@ impl IoError { pub fn from_errno(errno: i32, detail: bool) -> IoError { let mut err = sys::decode_error(errno as i32); if detail && err.kind == OtherIoError { - err.detail = Some(os::error_string(errno).chars() - .map(|c| c.to_lowercase()).collect()) + err.detail = Some(os::error_string(errno).to_lowercase()); } err } diff --git a/src/libstd/old_io/process.rs b/src/libstd/old_io/process.rs index e02e863516a..a30dcd9d9f0 100644 --- a/src/libstd/old_io/process.rs +++ b/src/libstd/old_io/process.rs @@ -110,10 +110,11 @@ struct EnvKey(CString); #[cfg(windows)] impl hash::Hash for EnvKey { fn hash(&self, state: &mut H) { + use ascii::AsciiExt; let &EnvKey(ref x) = self; match str::from_utf8(x.as_bytes()) { Ok(s) => for ch in s.chars() { - (ch as u8 as char).to_lowercase().hash(state); + ch.to_ascii_lowercase().hash(state); }, Err(..) 
=> x.hash(state) } @@ -123,6 +124,7 @@ impl hash::Hash for EnvKey { #[cfg(windows)] impl PartialEq for EnvKey { fn eq(&self, other: &EnvKey) -> bool { + use ascii::AsciiExt; let &EnvKey(ref x) = self; let &EnvKey(ref y) = other; match (str::from_utf8(x.as_bytes()), str::from_utf8(y.as_bytes())) { @@ -131,7 +133,7 @@ impl PartialEq for EnvKey { return false } else { for (xch, ych) in xs.chars().zip(ys.chars()) { - if xch.to_lowercase() != ych.to_lowercase() { + if xch.to_ascii_lowercase() != ych.to_ascii_lowercase() { return false; } } diff --git a/src/libterm/lib.rs b/src/libterm/lib.rs index 1dad5d09092..d3be5b56830 100644 --- a/src/libterm/lib.rs +++ b/src/libterm/lib.rs @@ -61,7 +61,6 @@ #![feature(rustc_private)] #![feature(staged_api)] #![feature(std_misc)] -#![feature(unicode)] #![feature(path_ext)] #![cfg_attr(windows, feature(libc))] diff --git a/src/libunicode/u_char.rs b/src/libunicode/char.rs similarity index 61% rename from src/libunicode/u_char.rs rename to src/libunicode/char.rs index c0f45ca4d72..bcc2820e381 100644 --- a/src/libunicode/u_char.rs +++ b/src/libunicode/char.rs @@ -8,16 +8,39 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -//! Unicode-intensive `char` methods along with the `core` methods. +//! Character manipulation (`char` type, Unicode Scalar Value) //! -//! These methods implement functionality for `char` that requires knowledge of -//! Unicode definitions, including normalization, categorization, and display information. +//! This module provides the `CharExt` trait, as well as its +//! implementation for the primitive `char` type, in order to allow +//! basic character manipulation. +//! +//! A `char` actually represents a +//! *[Unicode Scalar +//! Value](http://www.unicode.org/glossary/#unicode_scalar_value)*, as it can +//! contain any Unicode code point except high-surrogate and low-surrogate code +//! points. +//! +//! 
As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\] +//! (inclusive) are allowed. A `char` can always be safely cast to a `u32`; +//! however the converse is not always true due to the above range limits +//! and, as such, should be performed via the `from_u32` function. + +#![stable(feature = "rust1", since = "1.0.0")] +#![doc(primitive = "char")] -use core::char; use core::char::CharExt as C; -use core::option::Option; +use core::option::Option::{self, Some}; +use core::iter::Iterator; use tables::{derived_property, property, general_category, conversions, charwidth}; +// stable reexports +pub use core::char::{MAX, from_u32, from_digit, EscapeUnicode, EscapeDefault}; + +// unstable reexports +pub use normalize::{decompose_canonical, decompose_compatible, compose}; +pub use tables::normalization::canonical_combining_class; +pub use tables::UNICODE_VERSION; + /// Functionality for manipulating `char`. #[stable(feature = "rust1", since = "1.0.0")] pub trait CharExt { @@ -34,8 +57,17 @@ pub trait CharExt { /// # Panics /// /// Panics if given a radix > 36. - #[unstable(feature = "unicode", - reason = "pending integer conventions")] + /// + /// # Examples + /// + /// ``` + /// let c = '1'; + /// + /// assert!(c.is_digit(10)); + /// + /// assert!('f'.is_digit(16)); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] fn is_digit(self, radix: u32) -> bool; /// Converts a character to the corresponding digit. @@ -49,18 +81,56 @@ pub trait CharExt { /// # Panics /// /// Panics if given a radix outside the range [0..36]. 
- #[unstable(feature = "unicode", - reason = "pending integer conventions")] + /// + /// # Examples + /// + /// ``` + /// let c = '1'; + /// + /// assert_eq!(c.to_digit(10), Some(1)); + /// + /// assert_eq!('f'.to_digit(16), Some(15)); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] fn to_digit(self, radix: u32) -> Option; - /// Returns an iterator that yields the hexadecimal Unicode escape - /// of a character, as `char`s. + /// Returns an iterator that yields the hexadecimal Unicode escape of a + /// character, as `char`s. /// /// All characters are escaped with Rust syntax of the form `\\u{NNNN}` /// where `NNNN` is the shortest hexadecimal representation of the code /// point. + /// + /// # Examples + /// + /// ``` + /// for i in '❤'.escape_unicode() { + /// println!("{}", i); + /// } + /// ``` + /// + /// This prints: + /// + /// ```text + /// \ + /// u + /// { + /// 2 + /// 7 + /// 6 + /// 4 + /// } + /// ``` + /// + /// Collecting into a `String`: + /// + /// ``` + /// let heart: String = '❤'.escape_unicode().collect(); + /// + /// assert_eq!(heart, r"\u{2764}"); + /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn escape_unicode(self) -> char::EscapeUnicode; + fn escape_unicode(self) -> EscapeUnicode; /// Returns an iterator that yields the 'default' ASCII and /// C++11-like literal escape of a character, as `char`s. @@ -74,33 +144,118 @@ pub trait CharExt { /// escaped. /// * Any other chars in the range [0x20,0x7e] are not escaped. /// * Any other chars are given hex Unicode escapes; see `escape_unicode`. 
+ /// + /// # Examples + /// + /// ``` + /// for i in '"'.escape_default() { + /// println!("{}", i); + /// } + /// ``` + /// + /// This prints: + /// + /// ```text + /// \ + /// " + /// ``` + /// + /// Collecting into a `String`: + /// + /// ``` + /// let quote: String = '"'.escape_default().collect(); + /// + /// assert_eq!(quote, "\\\""); + /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn escape_default(self) -> char::EscapeDefault; + fn escape_default(self) -> EscapeDefault; - /// Returns the amount of bytes this character would need if encoded in + /// Returns the number of bytes this character would need if encoded in /// UTF-8. + /// + /// # Examples + /// + /// ``` + /// let n = 'ß'.len_utf8(); + /// + /// assert_eq!(n, 2); + /// ``` #[stable(feature = "rust1", since = "1.0.0")] fn len_utf8(self) -> usize; - /// Returns the amount of bytes this character would need if encoded in - /// UTF-16. + /// Returns the number of 16-bit code units this character would need if + /// encoded in UTF-16. + /// + /// # Examples + /// + /// ``` + /// let n = 'ß'.len_utf16(); + /// + /// assert_eq!(n, 1); + /// ``` #[stable(feature = "rust1", since = "1.0.0")] fn len_utf16(self) -> usize; - /// Encodes this character as UTF-8 into the provided byte buffer, - /// and then returns the number of bytes written. + /// Encodes this character as UTF-8 into the provided byte buffer, and then + /// returns the number of bytes written. /// - /// If the buffer is not large enough, nothing will be written into it - /// and a `None` will be returned. + /// If the buffer is not large enough, nothing will be written into it and a + /// `None` will be returned. A buffer of length four is large enough to + /// encode any `char`. + /// + /// # Examples + /// + /// In both of these examples, 'ß' takes two bytes to encode. 
+ /// + /// ``` + /// let mut b = [0; 2]; + /// + /// let result = 'ß'.encode_utf8(&mut b); + /// + /// assert_eq!(result, Some(2)); + /// ``` + /// + /// A buffer that's too small: + /// + /// ``` + /// let mut b = [0; 1]; + /// + /// let result = 'ß'.encode_utf8(&mut b); + /// + /// assert_eq!(result, None); + /// ``` #[unstable(feature = "unicode", reason = "pending decision about Iterator/Writer/Reader")] fn encode_utf8(self, dst: &mut [u8]) -> Option; - /// Encodes this character as UTF-16 into the provided `u16` buffer, - /// and then returns the number of `u16`s written. + /// Encodes this character as UTF-16 into the provided `u16` buffer, and + /// then returns the number of `u16`s written. /// - /// If the buffer is not large enough, nothing will be written into it - /// and a `None` will be returned. + /// If the buffer is not large enough, nothing will be written into it and a + /// `None` will be returned. A buffer of length 2 is large enough to encode + /// any `char`. + /// + /// # Examples + /// + /// In both of these examples, 'ß' takes one `u16` to encode. + /// + /// ``` + /// let mut b = [0; 1]; + /// + /// let result = 'ß'.encode_utf16(&mut b); + /// + /// assert_eq!(result, Some(1)); + /// ``` + /// + /// A buffer that's too small: + /// + /// ``` + /// let mut b = [0; 0]; + /// + /// let result = 'ß'.encode_utf16(&mut b); + /// + /// assert_eq!(result, None); + /// ``` #[unstable(feature = "unicode", reason = "pending decision about Iterator/Writer/Reader")] fn encode_utf16(self, dst: &mut [u16]) -> Option; @@ -175,35 +330,35 @@ pub trait CharExt { /// /// # Return value /// - /// Returns the lowercase equivalent of the character, or the character - /// itself if no conversion is possible. - #[unstable(feature = "unicode", - reason = "pending case transformation decisions")] - fn to_lowercase(self) -> char; + /// Returns an iterator which yields the characters corresponding to the + /// lowercase equivalent of the character. 
If no conversion is possible then + /// the input character is returned. + #[stable(feature = "rust1", since = "1.0.0")] + fn to_lowercase(self) -> ToLowercase; /// Converts a character to its uppercase equivalent. /// /// The case-folding performed is the common or simple mapping: it maps - /// one Unicode codepoint (one character in Rust) to its uppercase - /// equivalent according to the Unicode database [1]. The additional - /// [`SpecialCasing.txt`] is not considered here, as it expands to multiple - /// codepoints in some cases. + /// one Unicode codepoint to its uppercase equivalent according to the + /// Unicode database [1]. The additional [`SpecialCasing.txt`] is not yet + /// considered here, but the iterator returned will soon support this form + /// of case folding. /// /// A full reference can be found here [2]. /// /// # Return value /// - /// Returns the uppercase equivalent of the character, or the character - /// itself if no conversion was made. + /// Returns an iterator which yields the characters corresponding to the + /// uppercase equivalent of the character. If no conversion is possible then + /// the input character is returned. /// /// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt /// /// [`SpecialCasing`.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt /// /// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992 - #[unstable(feature = "unicode", - reason = "pending case transformation decisions")] - fn to_uppercase(self) -> char; + #[stable(feature = "rust1", since = "1.0.0")] + fn to_uppercase(self) -> ToUppercase; /// Returns this character's displayed width in columns, or `None` if it is a /// control character other than `'\x00'`. 
@@ -221,28 +376,15 @@ pub trait CharExt { #[stable(feature = "rust1", since = "1.0.0")] impl CharExt for char { - #[unstable(feature = "unicode", - reason = "pending integer conventions")] fn is_digit(self, radix: u32) -> bool { C::is_digit(self, radix) } - #[unstable(feature = "unicode", - reason = "pending integer conventions")] fn to_digit(self, radix: u32) -> Option { C::to_digit(self, radix) } - #[stable(feature = "rust1", since = "1.0.0")] - fn escape_unicode(self) -> char::EscapeUnicode { C::escape_unicode(self) } - #[stable(feature = "rust1", since = "1.0.0")] - fn escape_default(self) -> char::EscapeDefault { C::escape_default(self) } - #[stable(feature = "rust1", since = "1.0.0")] + fn escape_unicode(self) -> EscapeUnicode { C::escape_unicode(self) } + fn escape_default(self) -> EscapeDefault { C::escape_default(self) } fn len_utf8(self) -> usize { C::len_utf8(self) } - #[stable(feature = "rust1", since = "1.0.0")] fn len_utf16(self) -> usize { C::len_utf16(self) } - #[unstable(feature = "unicode", - reason = "pending decision about Iterator/Writer/Reader")] fn encode_utf8(self, dst: &mut [u8]) -> Option { C::encode_utf8(self, dst) } - #[unstable(feature = "unicode", - reason = "pending decision about Iterator/Writer/Reader")] fn encode_utf16(self, dst: &mut [u16]) -> Option { C::encode_utf16(self, dst) } - #[stable(feature = "rust1", since = "1.0.0")] fn is_alphabetic(self) -> bool { match self { 'a' ... 'z' | 'A' ... 'Z' => true, @@ -251,15 +393,10 @@ impl CharExt for char { } } - #[unstable(feature = "unicode", - reason = "mainly needed for compiler internals")] fn is_xid_start(self) -> bool { derived_property::XID_Start(self) } - #[unstable(feature = "unicode", - reason = "mainly needed for compiler internals")] fn is_xid_continue(self) -> bool { derived_property::XID_Continue(self) } - #[stable(feature = "rust1", since = "1.0.0")] fn is_lowercase(self) -> bool { match self { 'a' ... 
'z' => true, @@ -268,7 +405,6 @@ impl CharExt for char { } } - #[stable(feature = "rust1", since = "1.0.0")] fn is_uppercase(self) -> bool { match self { 'A' ... 'Z' => true, @@ -277,7 +413,6 @@ impl CharExt for char { } } - #[stable(feature = "rust1", since = "1.0.0")] fn is_whitespace(self) -> bool { match self { ' ' | '\x09' ... '\x0d' => true, @@ -286,15 +421,12 @@ impl CharExt for char { } } - #[stable(feature = "rust1", since = "1.0.0")] fn is_alphanumeric(self) -> bool { self.is_alphabetic() || self.is_numeric() } - #[stable(feature = "rust1", since = "1.0.0")] fn is_control(self) -> bool { general_category::Cc(self) } - #[stable(feature = "rust1", since = "1.0.0")] fn is_numeric(self) -> bool { match self { '0' ... '9' => true, @@ -303,15 +435,35 @@ impl CharExt for char { } } - #[unstable(feature = "unicode", - reason = "pending case transformation decisions")] - fn to_lowercase(self) -> char { conversions::to_lower(self) } + fn to_lowercase(self) -> ToLowercase { + ToLowercase(Some(conversions::to_lower(self))) + } - #[unstable(feature = "unicode", - reason = "pending case transformation decisions")] - fn to_uppercase(self) -> char { conversions::to_upper(self) } + fn to_uppercase(self) -> ToUppercase { + ToUppercase(Some(conversions::to_upper(self))) + } - #[unstable(feature = "unicode", - reason = "needs expert opinion. is_cjk flag stands out as ugly")] fn width(self, is_cjk: bool) -> Option { charwidth::width(self, is_cjk) } } + +/// An iterator over the lowercase mapping of a given character, returned from +/// the `to_lowercase` method on characters. +#[stable(feature = "rust1", since = "1.0.0")] +pub struct ToLowercase(Option<char>); + +#[stable(feature = "rust1", since = "1.0.0")] +impl Iterator for ToLowercase { + type Item = char; + fn next(&mut self) -> Option<char> { self.0.take() } +} + +/// An iterator over the uppercase mapping of a given character, returned from +/// the `to_uppercase` method on characters.
+#[stable(feature = "rust1", since = "1.0.0")] +pub struct ToUppercase(Option<char>); + +#[stable(feature = "rust1", since = "1.0.0")] +impl Iterator for ToUppercase { + type Item = char; + fn next(&mut self) -> Option<char> { self.0.take() } +} diff --git a/src/libunicode/lib.rs b/src/libunicode/lib.rs index 2095b6921c8..fadf91f33bc 100644 --- a/src/libunicode/lib.rs +++ b/src/libunicode/lib.rs @@ -42,37 +42,8 @@ pub use tables::regex; mod normalize; mod tables; -mod u_char; mod u_str; - -// re-export char so that std et al see it correctly -/// Character manipulation (`char` type, Unicode Scalar Value) -/// -/// This module provides the `CharExt` trait, as well as its -/// implementation for the primitive `char` type, in order to allow -/// basic character manipulation. -/// -/// A `char` actually represents a -/// *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*, -/// as it can contain any Unicode code point except high-surrogate and -/// low-surrogate code points. -/// -/// As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\] -/// (inclusive) are allowed. A `char` can always be safely cast to a `u32`; -/// however the converse is not always true due to the above range limits -/// and, as such, should be performed via the `from_u32` function.
-#[stable(feature = "rust1", since = "1.0.0")] -#[doc(primitive = "char")] -pub mod char { - pub use core::char::{MAX, from_u32, from_digit}; - - pub use normalize::{decompose_canonical, decompose_compatible, compose}; - - pub use tables::normalization::canonical_combining_class; - pub use tables::UNICODE_VERSION; - - pub use u_char::CharExt; -} +pub mod char; pub mod str { pub use u_str::{UnicodeStr, Words, Graphemes, GraphemeIndices}; diff --git a/src/libunicode/u_str.rs b/src/libunicode/u_str.rs index 57439addeaa..9b3f4b0521d 100644 --- a/src/libunicode/u_str.rs +++ b/src/libunicode/u_str.rs @@ -26,7 +26,7 @@ use core::num::Int; use core::slice; use core::str::Split; -use u_char::CharExt as UCharExt; // conflicts with core::prelude::CharExt +use char::CharExt as UCharExt; // conflicts with core::prelude::CharExt use tables::grapheme::GraphemeCat; /// An iterator over the words of a string, separated by a sequence of whitespace