Constify [u8]::is_ascii (unstably)

UTF-8 checking in `const fn` was stabilized back in 1.63, but somehow ASCII checking was never const-ified, despite being simpler.
2023-05-04 14:26:19 -07:00 · 2023-05-04 14:26:19 -07:00 · 370d31b93d
commit 370d31b93d
parent eb7a743421
4 changed files with 35 additions and 15 deletions
--- a/library/core/src/array/ascii.rs
+++ b/library/core/src/array/ascii.rs
@ -7,7 +7,7 @@ impl<const N: usize> [u8; N] {
    #[unstable(feature = "ascii_char", issue = "110998")]
    #[must_use]
    #[inline]
-    pub fn as_ascii(&self) -> Option<&[ascii::Char; N]> {
+    pub const fn as_ascii(&self) -> Option<&[ascii::Char; N]> {
        if self.is_ascii() {
            // SAFETY: Just checked that it's ASCII
            Some(unsafe { self.as_ascii_unchecked() })
--- a/library/core/src/lib.rs
+++ b/library/core/src/lib.rs
@ -149,6 +149,7 @@
 #![feature(const_slice_from_raw_parts_mut)]
 #![feature(const_slice_from_ref)]
 #![feature(const_slice_index)]
+#![feature(const_slice_is_ascii)]
 #![feature(const_slice_ptr_len)]
 #![feature(const_slice_split_at_mut)]
 #![feature(const_str_from_utf8_unchecked_mut)]
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@ -10,9 +10,10 @@ use crate::ops;
 impl [u8] {
    /// Checks if all bytes in this slice are within the ASCII range.
    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
+    #[rustc_const_unstable(feature = "const_slice_is_ascii", issue = "111090")]
    #[must_use]
    #[inline]
-    pub fn is_ascii(&self) -> bool {
+    pub const fn is_ascii(&self) -> bool {
        is_ascii(self)
    }

@ -21,7 +22,7 @@ impl [u8] {
    #[unstable(feature = "ascii_char", issue = "110998")]
    #[must_use]
    #[inline]
-    pub fn as_ascii(&self) -> Option<&[ascii::Char]> {
+    pub const fn as_ascii(&self) -> Option<&[ascii::Char]> {
        if self.is_ascii() {
            // SAFETY: Just checked that it's ASCII
            Some(unsafe { self.as_ascii_unchecked() })
@ -262,7 +263,7 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
 /// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
 /// from `../str/mod.rs`, which does something similar for utf8 validation.
 #[inline]
-fn contains_nonascii(v: usize) -> bool {
+const fn contains_nonascii(v: usize) -> bool {
    const NONASCII_MASK: usize = usize::repeat_u8(0x80);
    (NONASCII_MASK & v) != 0
 }
@ -280,7 +281,7 @@ fn contains_nonascii(v: usize) -> bool {
 /// If any of these loads produces something for which `contains_nonascii`
 /// (above) returns true, then we know the answer is false.
 #[inline]
-fn is_ascii(s: &[u8]) -> bool {
+const fn is_ascii(s: &[u8]) -> bool {
    const USIZE_SIZE: usize = mem::size_of::<usize>();

    let len = s.len();
@ -292,7 +293,16 @@ fn is_ascii(s: &[u8]) -> bool {
    // We also do this for architectures where `size_of::<usize>()` isn't
    // sufficient alignment for `usize`, because it's a weird edge case.
    if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() {
-        return s.iter().all(|b| b.is_ascii());
+        // FIXME: once iterators and closures can be used in `const fn`,
+        // return s.iter().all(|b| b.is_ascii());
+        let mut i = 0;
+        while i < len {
+            if !s[i].is_ascii() {
+                return false;
+            }
+            i += 1;
+        }
+        return true;
    }

    // We always read the first word unaligned, which means `align_offset` is
@ -321,18 +331,26 @@ fn is_ascii(s: &[u8]) -> bool {
    // Paranoia check about alignment, since we're about to do a bunch of
    // unaligned loads. In practice this should be impossible barring a bug in
    // `align_offset` though.
-    debug_assert_eq!(word_ptr.addr() % mem::align_of::<usize>(), 0);
+    // While this method is allowed to spuriously fail in CTFE, if it doesn't
+    // have alignment information it should have given a `usize::MAX` for
+    // `align_offset` earlier, sending things through the scalar path instead of
+    // this one, so this check should pass if it's reachable.
+    debug_assert!(word_ptr.is_aligned_to(mem::align_of::<usize>()));

    // Read subsequent words until the last aligned word, excluding the last
    // aligned word by itself to be done in tail check later, to ensure that
    // tail is always one `usize` at most to extra branch `byte_pos == len`.
    while byte_pos < len - USIZE_SIZE {
-        debug_assert!(
-            // Sanity check that the read is in bounds
-            (word_ptr.addr() + USIZE_SIZE) <= start.addr().wrapping_add(len) &&
-            // And that our assumptions about `byte_pos` hold.
-            (word_ptr.addr() - start.addr()) == byte_pos
-        );
+        // Sanity check that the read is in bounds
+        debug_assert!(byte_pos + USIZE_SIZE <= len);
+        // And that our assumptions about `byte_pos` hold.
+        debug_assert!(matches!(
+            word_ptr.cast::<u8>().guaranteed_eq(start.wrapping_add(byte_pos)),
+            // These are from the same allocation, so will hopefully always be
+            // known to match even in CTFE, but if it refuses to compare them
+            // that's ok since it's just a debug check anyway.
+            None | Some(true),
+        ));

        // SAFETY: We know `word_ptr` is properly aligned (because of
        // `align_offset`), and we know that we have enough bytes between `word_ptr` and the end
--- a/library/core/src/str/mod.rs
+++ b/library/core/src/str/mod.rs
@ -2358,9 +2358,10 @@ impl str {
    /// assert!(!non_ascii.is_ascii());
    /// ```
    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
+    #[rustc_const_unstable(feature = "const_slice_is_ascii", issue = "111090")]
    #[must_use]
    #[inline]
-    pub fn is_ascii(&self) -> bool {
+    pub const fn is_ascii(&self) -> bool {
        // We can treat each byte as character here: all multibyte characters
        // start with a byte that is not in the ASCII range, so we will stop
        // there already.
@ -2372,7 +2373,7 @@ impl str {
    #[unstable(feature = "ascii_char", issue = "110998")]
    #[must_use]
    #[inline]
-    pub fn as_ascii(&self) -> Option<&[ascii::Char]> {
+    pub const fn as_ascii(&self) -> Option<&[ascii::Char]> {
        // Like in `is_ascii`, we can work on the bytes directly.
        self.as_bytes().as_ascii()
    }