From 72d85db6eea29004a467842923c07169bc304217 Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Sat, 30 May 2020 11:49:31 +0200 Subject: [PATCH] expose char::encode_utf8_raw for libstd --- src/libcore/char/methods.rs | 99 ++++++++++++++++++++++--------------- src/libcore/char/mod.rs | 4 ++ 2 files changed, 63 insertions(+), 40 deletions(-) diff --git a/src/libcore/char/methods.rs b/src/libcore/char/methods.rs index 5c5bc9adb5d..112e7e38e41 100644 --- a/src/libcore/char/methods.rs +++ b/src/libcore/char/methods.rs @@ -593,16 +593,7 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn len_utf8(self) -> usize { - let code = self as u32; - if code < MAX_ONE_B { - 1 - } else if code < MAX_TWO_B { - 2 - } else if code < MAX_THREE_B { - 3 - } else { - 4 - } + len_utf8(self as u32) } /// Returns the number of 16-bit code units this `char` would need if @@ -670,36 +661,7 @@ impl char { #[stable(feature = "unicode_encode_char", since = "1.15.0")] #[inline] pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str { - let code = self as u32; - let len = self.len_utf8(); - match (len, &mut dst[..]) { - (1, [a, ..]) => { - *a = code as u8; - } - (2, [a, b, ..]) => { - *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; - *b = (code & 0x3F) as u8 | TAG_CONT; - } - (3, [a, b, c, ..]) => { - *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; - *b = (code >> 6 & 0x3F) as u8 | TAG_CONT; - *c = (code & 0x3F) as u8 | TAG_CONT; - } - (4, [a, b, c, d, ..]) => { - *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; - *b = (code >> 12 & 0x3F) as u8 | TAG_CONT; - *c = (code >> 6 & 0x3F) as u8 | TAG_CONT; - *d = (code & 0x3F) as u8 | TAG_CONT; - } - _ => panic!( - "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", - len, - code, - dst.len(), - ), - }; - // SAFETY: We just wrote UTF-8 content in, so converting to str is fine. - unsafe { from_utf8_unchecked_mut(&mut dst[..len]) } + encode_utf8_raw(self as u32, dst) } /// Encodes this character as UTF-16 into the provided `u16` buffer, @@ -1673,3 +1635,60 @@ impl char { } } } + +#[inline] +fn len_utf8(code: u32) -> usize { + if code < MAX_ONE_B { + 1 + } else if code < MAX_TWO_B { + 2 + } else if code < MAX_THREE_B { + 3 + } else { + 4 + } +} + +/// Encodes a raw u32 value as UTF-8 into the provided byte buffer, +/// and then returns the subslice of the buffer that contains the encoded character. +/// +/// Unlike `char::encode_utf8`, this method can be called on codepoints in the surrogate range. +/// +/// # Panics +/// +/// Panics if the buffer is not large enough. +/// A buffer of length four is large enough to encode any `char`. +#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] +#[doc(hidden)] +#[inline] +pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str { + let len = len_utf8(code); + match (len, &mut dst[..]) { + (1, [a, ..]) => { + *a = code as u8; + } + (2, [a, b, ..]) => { + *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; + *b = (code & 0x3F) as u8 | TAG_CONT; + } + (3, [a, b, c, ..]) => { + *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; + *b = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *c = (code & 0x3F) as u8 | TAG_CONT; + } + (4, [a, b, c, d, ..]) => { + *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; + *b = (code >> 12 & 0x3F) as u8 | TAG_CONT; + *c = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *d = (code & 0x3F) as u8 | TAG_CONT; + } + _ => panic!( + "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", + len, + code, + dst.len(), + ), + }; + // SAFETY: We just wrote UTF-8 content in, so converting to str is fine. + unsafe { from_utf8_unchecked_mut(&mut dst[..len]) } +} diff --git a/src/libcore/char/mod.rs b/src/libcore/char/mod.rs index bf65c31e135..40b429b7496 100644 --- a/src/libcore/char/mod.rs +++ b/src/libcore/char/mod.rs @@ -37,6 +37,10 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error}; #[stable(feature = "unicode_version", since = "1.45.0")] pub use crate::unicode::UNICODE_VERSION; +// perma-unstable re-exports +#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] +pub use self::methods::encode_utf8_raw; + use crate::fmt::{self, Write}; use crate::iter::FusedIterator;