Rollup merge of #130659 - bjoernager:const-char-encode-utf16, r=dtolnay

Support `char::encode_utf16` in const contexts.

Relevant tracking issue: #130660

The method `char::encode_utf16` should be marked `const` to allow compile-time conversions.

This PR additionally rewrites the `encode_utf16_raw` function for better readability whilst also reducing the amount of unsafe code.

try-job: x86_64-msvc
This commit is contained in:
Matthias Krüger 2024-09-23 06:45:33 +02:00 committed by GitHub
commit c1ccdb7d0c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 37 additions and 26 deletions

View File

@ -638,8 +638,7 @@ pub const fn len_utf8(self) -> usize {
#[rustc_const_stable(feature = "const_char_len_utf", since = "1.52.0")] #[rustc_const_stable(feature = "const_char_len_utf", since = "1.52.0")]
#[inline] #[inline]
pub const fn len_utf16(self) -> usize { pub const fn len_utf16(self) -> usize {
let ch = self as u32; len_utf16(self as u32)
if (ch & 0xFFFF) == ch { 1 } else { 2 }
} }
/// Encodes this character as UTF-8 into the provided byte buffer, /// Encodes this character as UTF-8 into the provided byte buffer,
@ -709,8 +708,9 @@ pub const fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
/// '𝕊'.encode_utf16(&mut b); /// '𝕊'.encode_utf16(&mut b);
/// ``` /// ```
#[stable(feature = "unicode_encode_char", since = "1.15.0")] #[stable(feature = "unicode_encode_char", since = "1.15.0")]
#[rustc_const_unstable(feature = "const_char_encode_utf16", issue = "130660")]
#[inline] #[inline]
pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { pub const fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
encode_utf16_raw(self as u32, dst) encode_utf16_raw(self as u32, dst)
} }
@ -1747,7 +1747,12 @@ const fn len_utf8(code: u32) -> usize {
} }
} }
/// Encodes a raw u32 value as UTF-8 into the provided byte buffer, #[inline]
const fn len_utf16(code: u32) -> usize {
if (code & 0xFFFF) == code { 1 } else { 2 }
}
/// Encodes a raw `u32` value as UTF-8 into the provided byte buffer,
/// and then returns the subslice of the buffer that contains the encoded character. /// and then returns the subslice of the buffer that contains the encoded character.
/// ///
/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range. /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
@ -1801,7 +1806,7 @@ fn panic_at_rt(code: u32, len: usize, dst_len: usize) {
unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) } unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) }
} }
/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer, /// Encodes a raw `u32` value as UTF-16 into the provided `u16` buffer,
/// and then returns the subslice of the buffer that contains the encoded character. /// and then returns the subslice of the buffer that contains the encoded character.
/// ///
/// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range. /// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range.
@ -1812,28 +1817,33 @@ fn panic_at_rt(code: u32, len: usize, dst_len: usize) {
/// Panics if the buffer is not large enough. /// Panics if the buffer is not large enough.
/// A buffer of length 2 is large enough to encode any `char`. /// A buffer of length 2 is large enough to encode any `char`.
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
#[rustc_const_unstable(feature = "const_char_encode_utf16", issue = "130660")]
#[doc(hidden)] #[doc(hidden)]
#[inline] #[inline]
pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] { pub const fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
// SAFETY: each arm checks whether there are enough bits to write into const fn panic_at_const(_code: u32, _len: usize, _dst_len: usize) {
unsafe { // Note that we cannot format in constant expressions.
if (code & 0xFFFF) == code && !dst.is_empty() { panic!("encode_utf16: buffer does not have enough bytes to encode code point");
// The BMP falls through
*dst.get_unchecked_mut(0) = code as u16;
slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
} else if dst.len() >= 2 {
// Supplementary planes break into surrogates.
code -= 0x1_0000;
*dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16);
*dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF);
slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)
} else {
panic!(
"encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
char::from_u32_unchecked(code).len_utf16(),
code,
dst.len(),
)
}
} }
fn panic_at_rt(code: u32, len: usize, dst_len: usize) {
panic!(
"encode_utf16: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}",
);
}
let len = len_utf16(code);
match (len, &mut *dst) {
(1, [a, ..]) => {
*a = code as u16;
}
(2, [a, b, ..]) => {
code -= 0x1_0000;
*a = (code >> 10) as u16 | 0xD800;
*b = (code & 0x3FF) as u16 | 0xDC00;
}
// FIXME(const-hack): We would prefer to have streamlined panics when formatters become const-friendly.
_ => const_eval_select((code, len, dst.len()), panic_at_const, panic_at_rt),
};
// SAFETY: `<&mut [u16]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds.
unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) }
} }

View File

@ -119,6 +119,7 @@
#![feature(const_bigint_helper_methods)] #![feature(const_bigint_helper_methods)]
#![feature(const_black_box)] #![feature(const_black_box)]
#![feature(const_cell_into_inner)] #![feature(const_cell_into_inner)]
#![feature(const_char_encode_utf16)]
#![feature(const_char_encode_utf8)] #![feature(const_char_encode_utf8)]
#![feature(const_eval_select)] #![feature(const_eval_select)]
#![feature(const_exact_div)] #![feature(const_exact_div)]