0f6a0b58f9
This commit performs another pass over the `std::char` module for stabilization. Some minor cleanup is performed such as migrating documentation from libcore to libunicode (where the `std`-facing trait resides) as well as a slight reorganization in libunicode itself. Otherwise, the stability modifications made are: * `char::from_digit` is now stable * `CharExt::is_digit` is now stable * `CharExt::to_digit` is now stable * `CharExt::to_{lower,upper}case` are now stable after being modified to return an iterator over characters. While the implementation today has not changed this should allow us to implement the full set of case conversions in unicode where some characters can map to multiple when doing an upper or lower case mapping. * `StrExt::to_{lower,upper}case` was added as unstable as a convenience so that you do not have to worry about characters expanding to more characters when you just want the whole string to get into upper or lower case. This is a breaking change due to the change in the signatures of the `CharExt::to_{upper,lower}case` methods. Code can be updated to use functions like `flat_map` or `collect` to handle the difference. [breaking-change]
372 lines
12 KiB
Rust
372 lines
12 KiB
Rust
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
|
// file at the top-level directory of this distribution and at
|
|
// http://rust-lang.org/COPYRIGHT.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
|
// option. This file may not be copied, modified, or distributed
|
|
// except according to those terms.
|
|
|
|
//! Character manipulation.
|
|
//!
|
|
//! For more details, see ::unicode::char (a.k.a. std::char)
|
|
|
|
#![allow(non_snake_case)]
|
|
#![doc(primitive = "char")]
|
|
|
|
use iter::Iterator;
|
|
use mem::transmute;
|
|
use option::Option::{None, Some};
|
|
use option::Option;
|
|
use slice::SliceExt;
|
|
|
|
// UTF-8 encoding parameters used by `len_utf8` and `encode_utf8_raw`.
//
// `TAG_*` are the marker bits OR-ed into each emitted byte: the prefix of
// a continuation byte, and the prefixes of the lead byte of a two-, three-
// and four-byte sequence respectively.
const TAG_CONT: u8 = 0x80; // 10xx_xxxx
const TAG_TWO_B: u8 = 0xC0; // 110x_xxxx
const TAG_THREE_B: u8 = 0xE0; // 1110_xxxx
const TAG_FOUR_B: u8 = 0xF0; // 1111_0xxx

// `MAX_*_B` are exclusive upper bounds: a code below the bound fits in a
// sequence of that many bytes (7, 11 and 16 payload bits respectively).
const MAX_ONE_B: u32 = 1 << 7;
const MAX_TWO_B: u32 = 1 << 11;
const MAX_THREE_B: u32 = 1 << 16;
|
|
|
|
/*
|
|
Lu Uppercase_Letter an uppercase letter
|
|
Ll Lowercase_Letter a lowercase letter
|
|
Lt Titlecase_Letter a digraphic character, with first part uppercase
|
|
Lm Modifier_Letter a modifier letter
|
|
Lo Other_Letter other letters, including syllables and ideographs
|
|
Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
|
|
Mc Spacing_Mark a spacing combining mark (positive advance width)
|
|
Me Enclosing_Mark an enclosing combining mark
|
|
Nd Decimal_Number a decimal digit
|
|
Nl Letter_Number a letterlike numeric character
|
|
No Other_Number a numeric character of other type
|
|
Pc Connector_Punctuation a connecting punctuation mark, like a tie
|
|
Pd Dash_Punctuation a dash or hyphen punctuation mark
|
|
Ps Open_Punctuation an opening punctuation mark (of a pair)
|
|
Pe Close_Punctuation a closing punctuation mark (of a pair)
|
|
Pi Initial_Punctuation an initial quotation mark
|
|
Pf Final_Punctuation a final quotation mark
|
|
Po Other_Punctuation a punctuation mark of other type
|
|
Sm Math_Symbol a symbol of primarily mathematical use
|
|
Sc Currency_Symbol a currency sign
|
|
Sk Modifier_Symbol a non-letterlike modifier symbol
|
|
So Other_Symbol a symbol of other type
|
|
Zs Space_Separator a space character (of various non-zero widths)
|
|
Zl Line_Separator U+2028 LINE SEPARATOR only
|
|
Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
|
|
Cc Control a C0 or C1 control code
|
|
Cf Format a format control character
|
|
Cs Surrogate a surrogate code point
|
|
Co Private_Use a private-use character
|
|
Cn Unassigned a reserved unassigned code point or a noncharacter
|
|
*/
|
|
|
|
/// The highest valid code point
|
|
#[stable(feature = "rust1", since = "1.0.0")]
|
|
// U+10FFFF; `from_u32` rejects every value above this.
pub const MAX: char = '\u{10FFFF}';
|
|
|
|
/// Converts a `u32` to an `Option<char>`.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use std::char;
|
|
///
|
|
/// let c = char::from_u32(10084); // produces `Some(❤)`
|
|
/// assert_eq!(c, Some('❤'));
|
|
/// ```
|
|
///
|
|
/// An invalid character:
|
|
///
|
|
/// ```
|
|
/// use std::char;
|
|
///
|
|
/// let none = char::from_u32(1114112);
|
|
/// assert_eq!(none, None);
|
|
/// ```
|
|
#[inline]
|
|
#[stable(feature = "rust1", since = "1.0.0")]
|
|
pub fn from_u32(i: u32) -> Option<char> {
|
|
// catch out-of-bounds and surrogates
|
|
if (i > MAX as u32) || (i >= 0xD800 && i <= 0xDFFF) {
|
|
None
|
|
} else {
|
|
Some(unsafe { transmute(i) })
|
|
}
|
|
}
|
|
|
|
/// Converts a number to the character representing it.
|
|
///
|
|
/// # Return value
|
|
///
|
|
/// Returns `Some(char)` if `num` represents one digit under `radix`,
|
|
/// using one character of `0-9` or `a-z`, or `None` if it doesn't.
|
|
///
|
|
/// # Panics
|
|
///
|
|
/// Panics if given a `radix` > 36.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use std::char;
|
|
///
|
|
/// let c = char::from_digit(4, 10);
|
|
///
|
|
/// assert_eq!(c, Some('4'));
|
|
/// ```
|
|
#[inline]
|
|
#[stable(feature = "rust1", since = "1.0.0")]
|
|
pub fn from_digit(num: u32, radix: u32) -> Option<char> {
    // Digits are drawn from `0-9` followed by `a-z`, so 36 is the largest
    // radix that can be represented.
    if radix > 36 {
        panic!("from_digit: radix is too high (maximum 36)");
    }
    // `num` is not a single digit in this radix.
    if num >= radix {
        return None;
    }
    // Here `num < radix <= 36`, so it fits in a `u8` and both sums below
    // stay within ASCII; a plain `u8 as char` cast is therefore enough and
    // no `unsafe` is required.
    let digit = num as u8;
    Some(if digit < 10 {
        (b'0' + digit) as char
    } else {
        (b'a' + (digit - 10)) as char
    })
}
|
|
|
|
// NB: the stabilization and documentation for this trait is in
|
|
// unicode/char.rs, not here
|
|
#[allow(missing_docs)] // docs in libunicode/u_char.rs
|
|
pub trait CharExt {
|
|
fn is_digit(self, radix: u32) -> bool;
|
|
fn to_digit(self, radix: u32) -> Option<u32>;
|
|
fn escape_unicode(self) -> EscapeUnicode;
|
|
fn escape_default(self) -> EscapeDefault;
|
|
fn len_utf8(self) -> usize;
|
|
fn len_utf16(self) -> usize;
|
|
fn encode_utf8(self, dst: &mut [u8]) -> Option<usize>;
|
|
fn encode_utf16(self, dst: &mut [u16]) -> Option<usize>;
|
|
}
|
|
|
|
impl CharExt for char {
|
|
fn is_digit(self, radix: u32) -> bool {
|
|
self.to_digit(radix).is_some()
|
|
}
|
|
|
|
fn to_digit(self, radix: u32) -> Option<u32> {
|
|
if radix > 36 {
|
|
panic!("to_digit: radix is too high (maximum 36)");
|
|
}
|
|
let val = match self {
|
|
'0' ... '9' => self as u32 - '0' as u32,
|
|
'a' ... 'z' => self as u32 - 'a' as u32 + 10,
|
|
'A' ... 'Z' => self as u32 - 'A' as u32 + 10,
|
|
_ => return None,
|
|
};
|
|
if val < radix { Some(val) }
|
|
else { None }
|
|
}
|
|
|
|
fn escape_unicode(self) -> EscapeUnicode {
|
|
EscapeUnicode { c: self, state: EscapeUnicodeState::Backslash }
|
|
}
|
|
|
|
fn escape_default(self) -> EscapeDefault {
|
|
let init_state = match self {
|
|
'\t' => EscapeDefaultState::Backslash('t'),
|
|
'\r' => EscapeDefaultState::Backslash('r'),
|
|
'\n' => EscapeDefaultState::Backslash('n'),
|
|
'\\' => EscapeDefaultState::Backslash('\\'),
|
|
'\'' => EscapeDefaultState::Backslash('\''),
|
|
'"' => EscapeDefaultState::Backslash('"'),
|
|
'\x20' ... '\x7e' => EscapeDefaultState::Char(self),
|
|
_ => EscapeDefaultState::Unicode(self.escape_unicode())
|
|
};
|
|
EscapeDefault { state: init_state }
|
|
}
|
|
|
|
#[inline]
|
|
fn len_utf8(self) -> usize {
|
|
let code = self as u32;
|
|
if code < MAX_ONE_B {
|
|
1
|
|
} else if code < MAX_TWO_B {
|
|
2
|
|
} else if code < MAX_THREE_B {
|
|
3
|
|
} else {
|
|
4
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
fn len_utf16(self) -> usize {
|
|
let ch = self as u32;
|
|
if (ch & 0xFFFF) == ch { 1 } else { 2 }
|
|
}
|
|
|
|
#[inline]
|
|
fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> {
|
|
encode_utf8_raw(self as u32, dst)
|
|
}
|
|
|
|
#[inline]
|
|
fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> {
|
|
encode_utf16_raw(self as u32, dst)
|
|
}
|
|
}
|
|
|
|
/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
|
|
/// and then returns the number of bytes written.
|
|
///
|
|
/// If the buffer is not large enough, nothing will be written into it
|
|
/// and a `None` will be returned.
|
|
#[inline]
|
|
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> {
|
|
// Marked #[inline] to allow llvm optimizing it away
|
|
if code < MAX_ONE_B && dst.len() >= 1 {
|
|
dst[0] = code as u8;
|
|
Some(1)
|
|
} else if code < MAX_TWO_B && dst.len() >= 2 {
|
|
dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
|
|
dst[1] = (code & 0x3F) as u8 | TAG_CONT;
|
|
Some(2)
|
|
} else if code < MAX_THREE_B && dst.len() >= 3 {
|
|
dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
|
|
dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
|
|
dst[2] = (code & 0x3F) as u8 | TAG_CONT;
|
|
Some(3)
|
|
} else if dst.len() >= 4 {
|
|
dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
|
|
dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
|
|
dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
|
|
dst[3] = (code & 0x3F) as u8 | TAG_CONT;
|
|
Some(4)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Writes the UTF-16 encoding of the raw `u32` value `ch` to the start of
/// `dst` and returns how many `u16`s were written.
///
/// Values that fit in 16 bits are stored as a single unit (they are not
/// checked for being surrogates); larger values are split into a
/// surrogate pair.
///
/// If `dst` is shorter than the encoding, nothing is written and `None` is
/// returned.
#[inline]
pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<usize> {
    // Marked #[inline] to allow llvm optimizing it away

    let single_unit = ch <= 0xFFFF;
    let needed = if single_unit { 1 } else { 2 };
    if dst.len() < needed {
        return None;
    }

    if single_unit {
        // The BMP is stored as-is (assuming non-surrogate, as it should).
        dst[0] = ch as u16;
    } else {
        // Supplementary planes break into surrogates: ten bits each in the
        // high and the low unit.
        ch -= 0x1_0000;
        dst[0] = 0xD800 | ((ch >> 10) as u16);
        dst[1] = 0xDC00 | ((ch as u16) & 0x3FF);
    }
    Some(needed)
}
|
|
|
|
/// An iterator over the characters that represent a `char`, as escaped by
|
|
/// Rust's unicode escaping rules.
|
|
#[derive(Clone)]
|
|
#[stable(feature = "rust1", since = "1.0.0")]
|
|
pub struct EscapeUnicode {
|
|
c: char,
|
|
state: EscapeUnicodeState
|
|
}
|
|
|
|
// Position of an `EscapeUnicode` iterator within its output; each variant
// names the piece that the next call to `next` will produce.
#[derive(Clone)]
enum EscapeUnicodeState {
    // The leading `\`.
    Backslash,
    // The `u` after the backslash.
    Type,
    // The opening `{`.
    LeftBrace,
    // A hex digit; the payload is the index of the nibble to print,
    // counted from the least significant one.
    Value(usize),
    // The closing `}`.
    RightBrace,
    // Nothing left to produce.
    Done,
}
|
|
|
|
#[stable(feature = "rust1", since = "1.0.0")]
|
|
impl Iterator for EscapeUnicode {
|
|
type Item = char;
|
|
|
|
fn next(&mut self) -> Option<char> {
|
|
match self.state {
|
|
EscapeUnicodeState::Backslash => {
|
|
self.state = EscapeUnicodeState::Type;
|
|
Some('\\')
|
|
}
|
|
EscapeUnicodeState::Type => {
|
|
self.state = EscapeUnicodeState::LeftBrace;
|
|
Some('u')
|
|
}
|
|
EscapeUnicodeState::LeftBrace => {
|
|
let mut n = 0;
|
|
while (self.c as u32) >> (4 * (n + 1)) != 0 {
|
|
n += 1;
|
|
}
|
|
self.state = EscapeUnicodeState::Value(n);
|
|
Some('{')
|
|
}
|
|
EscapeUnicodeState::Value(offset) => {
|
|
let v = match ((self.c as i32) >> (offset * 4)) & 0xf {
|
|
i @ 0 ... 9 => '0' as i32 + i,
|
|
i => 'a' as i32 + (i - 10)
|
|
};
|
|
if offset == 0 {
|
|
self.state = EscapeUnicodeState::RightBrace;
|
|
} else {
|
|
self.state = EscapeUnicodeState::Value(offset - 1);
|
|
}
|
|
Some(unsafe { transmute(v) })
|
|
}
|
|
EscapeUnicodeState::RightBrace => {
|
|
self.state = EscapeUnicodeState::Done;
|
|
Some('}')
|
|
}
|
|
EscapeUnicodeState::Done => None,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// An iterator over the characters that represent a `char`, escaped
|
|
/// for maximum portability.
|
|
#[derive(Clone)]
|
|
#[stable(feature = "rust1", since = "1.0.0")]
|
|
pub struct EscapeDefault {
|
|
state: EscapeDefaultState
|
|
}
|
|
|
|
#[derive(Clone)]
|
|
enum EscapeDefaultState {
|
|
Backslash(char),
|
|
Char(char),
|
|
Done,
|
|
Unicode(EscapeUnicode),
|
|
}
|
|
|
|
#[stable(feature = "rust1", since = "1.0.0")]
|
|
impl Iterator for EscapeDefault {
|
|
type Item = char;
|
|
|
|
fn next(&mut self) -> Option<char> {
|
|
match self.state {
|
|
EscapeDefaultState::Backslash(c) => {
|
|
self.state = EscapeDefaultState::Char(c);
|
|
Some('\\')
|
|
}
|
|
EscapeDefaultState::Char(c) => {
|
|
self.state = EscapeDefaultState::Done;
|
|
Some(c)
|
|
}
|
|
EscapeDefaultState::Done => None,
|
|
EscapeDefaultState::Unicode(ref mut iter) => iter.next()
|
|
}
|
|
}
|
|
}
|