Rollup merge of #119172 - nnethercote:earlier-NulInCStr, r=petrochenkov

Detect `NulInCStr` error earlier, by making it an `EscapeError` instead of a `LitError`. This makes it like the other errors produced when checking string literal contents, e.g. for invalid escape sequences or bare CR chars. NOTE: this means these errors are issued earlier, before expansion, which changes behaviour. It will be possible to move the check back to the later point if desired. If that happens, it's likely that all the string literal content checks will be delayed together. One nice thing about this: the old approach had some code in `report_lit_error` to calculate the span of the nul char from a range. This code used a hardwired `+2` to account for the `c"` at the start of a C string literal, but this should have been `+3` for raw C string literals to account for the `cr"`, which meant that the caret in `cr"` nul error messages was one position short of where it should have been. The new approach doesn't need any of this and avoids the off-by-one error. r? ```@fee1-dead```
2024-01-18 10:34:17 +01:00 · 2024-01-18 10:34:17 +01:00 · ff8c7a7816
commit ff8c7a7816
parent c485ee7147 9018d2c455
11 changed files with 32 additions and 28 deletions
--- a/compiler/rustc_ast/src/util/literal.rs
+++ b/compiler/rustc_ast/src/util/literal.rs
@ -8,7 +8,6 @@
 };
 use rustc_span::symbol::{kw, sym, Symbol};
 use rustc_span::Span;
-use std::ops::Range;
 use std::{ascii, fmt, str};

 // Escapes a string, represented as a symbol. Reuses the original symbol,
@ -39,7 +38,6 @@ pub enum LitError {
    InvalidFloatSuffix,
    NonDecimalFloat(u32),
    IntTooLarge(u32),
-    NulInCStr(Range<usize>),
 }

 impl LitKind {
@ -156,10 +154,7 @@ pub fn from_token_lit(lit: token::Lit) -> Result<LitKind, LitError> {
                let s = symbol.as_str();
                let mut buf = Vec::with_capacity(s.len());
                let mut error = Ok(());
-                unescape_c_string(s, Mode::CStr, &mut |span, c| match c {
-                    Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => {
-                        error = Err(LitError::NulInCStr(span));
-                    }
+                unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
                    Ok(CStrUnit::Byte(b)) => buf.push(b),
                    Ok(CStrUnit::Char(c)) => {
                        buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
@ -179,10 +174,7 @@ pub fn from_token_lit(lit: token::Lit) -> Result<LitKind, LitError> {
                // can convert the symbol directly to a `Lrc<u8>` on success.
                let s = symbol.as_str();
                let mut error = Ok(());
-                unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c {
-                    Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => {
-                        error = Err(LitError::NulInCStr(span));
-                    }
+                unescape_c_string(s, Mode::RawCStr, &mut |_, c| match c {
                    Ok(_) => {}
                    Err(err) => {
                        if err.is_fatal() {
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@ -59,6 +59,9 @@ pub enum EscapeError {
    /// Non-ascii character in byte literal, byte string literal, or raw byte string literal.
    NonAsciiCharInByte,

+    /// Nul character (literal or escaped, e.g. `\0`) in a C string literal.
+    NulInCStr,
+
    /// After a line ending with '\', the next line contains whitespace
    /// characters that are not skipped.
    UnskippedWhitespaceWarning,
@ -122,10 +125,20 @@ pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
 {
    match mode {
        CStr => {
-            unescape_non_raw_common(src, mode, callback);
+            unescape_non_raw_common(src, mode, &mut |r, mut result| {
+                if let Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) = result {
+                    result = Err(EscapeError::NulInCStr);
+                }
+                callback(r, result)
+            });
        }
        RawCStr => {
-            check_raw_common(src, mode, &mut |r, result| callback(r, result.map(CStrUnit::Char)));
+            check_raw_common(src, mode, &mut |r, mut result| {
+                if let Ok('\0') = result {
+                    result = Err(EscapeError::NulInCStr);
+                }
+                callback(r, result.map(CStrUnit::Char))
+            });
        }
        Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
    }
--- a/compiler/rustc_parse/messages.ftl
+++ b/compiler/rustc_parse/messages.ftl
@ -616,6 +616,8 @@ parse_note_mut_pattern_usage = `mut` may be followed by `variable` and `variable

 parse_note_pattern_alternatives_use_single_vert = alternatives in or-patterns are separated with `|`, not `||`

+parse_nul_in_c_str = null characters in C string literals are not supported
+
 parse_or_pattern_not_allowed_in_fn_parameters = top-level or-patterns are not allowed in function parameters
 parse_or_pattern_not_allowed_in_let_binding = top-level or-patterns are not allowed in `let` bindings
 parse_out_of_range_hex_escape = out of range hex escape
--- a/compiler/rustc_parse/src/errors.rs
+++ b/compiler/rustc_parse/src/errors.rs
@ -2163,6 +2163,11 @@ pub enum UnescapeError {
        #[subdiagnostic]
        suggestion: MoreThanOneCharSugg,
    },
+    #[diag(parse_nul_in_c_str)]
+    NulInCStr {
+        #[primary_span]
+        span: Span,
+    },
 }

 #[derive(Subdiagnostic)]
--- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
+++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
@ -262,6 +262,9 @@ pub(crate) fn emit_unescape_error(
        EscapeError::LoneSlash => {
            dcx.emit_err(UnescapeError::LoneSlash(err_span));
        }
+        EscapeError::NulInCStr => {
+            dcx.emit_err(UnescapeError::NulInCStr { span: err_span });
+        }
        EscapeError::UnskippedWhitespaceWarning => {
            let (c, char_span) = last_char();
            dcx.emit_warn(UnescapeError::UnskippedWhitespace {
--- a/compiler/rustc_session/messages.ftl
+++ b/compiler/rustc_session/messages.ftl
@ -75,8 +75,6 @@ session_not_circumvent_feature = `-Zunleash-the-miri-inside-of-you` may not be u

 session_not_supported = not supported

-session_nul_in_c_str = null characters in C string literals are not supported
-
 session_octal_float_literal_not_supported = octal float literal is not supported

 session_optimization_fuel_exhausted = optimization-fuel-exhausted: {$msg}
--- a/compiler/rustc_session/src/errors.rs
+++ b/compiler/rustc_session/src/errors.rs
@ -6,7 +6,7 @@
    error_code, DiagCtxt, DiagnosticBuilder, DiagnosticMessage, IntoDiagnostic, Level, MultiSpan,
 };
 use rustc_macros::Diagnostic;
-use rustc_span::{BytePos, Span, Symbol};
+use rustc_span::{Span, Symbol};
 use rustc_target::spec::{SplitDebuginfo, StackProtector, TargetTriple};

 use crate::parse::ParseSess;
@ -346,13 +346,6 @@ pub(crate) struct BinaryFloatLiteralNotSupported {
    pub span: Span,
 }

-#[derive(Diagnostic)]
-#[diag(session_nul_in_c_str)]
-pub(crate) struct NulInCStr {
-    #[primary_span]
-    pub span: Span,
-}
-
 pub fn report_lit_error(sess: &ParseSess, err: LitError, lit: token::Lit, span: Span) {
    // Checks if `s` looks like i32 or u1234 etc.
    fn looks_like_width_suffix(first_chars: &[char], s: &str) -> bool {
@ -432,12 +425,6 @@ fn fix_base_capitalisation(prefix: &str, suffix: &str) -> Option<String> {
            };
            dcx.emit_err(IntLiteralTooLarge { span, limit });
        }
-        LitError::NulInCStr(range) => {
-            let lo = BytePos(span.lo().0 + range.start as u32 + 2);
-            let hi = BytePos(span.lo().0 + range.end as u32 + 2);
-            let span = span.with_lo(lo).with_hi(hi);
-            dcx.emit_err(NulInCStr { span });
-        }
    }
 }

--- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
+++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
@ -369,6 +369,7 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str {
            "non-ASCII character in byte string literal"
        }
        EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal",
+        EscapeError::NulInCStr => "null character in C string literal",
        EscapeError::UnskippedWhitespaceWarning => "",
        EscapeError::MultipleSkippedLinesWarning => "",
    }
--- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs
+++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs
@ -106,6 +106,9 @@ fn rustc_unescape_error_to_string(err: unescape::EscapeError) -> (&'static str,
        EE::NonAsciiCharInByte  => {
            "Byte literals must not contain non-ASCII characters"
        }
+        EE::NulInCStr  => {
+            "C strings literals must not contain null characters"
+        }
        EE::UnskippedWhitespaceWarning => "Whitespace after this escape is not skipped",
        EE::MultipleSkippedLinesWarning => "Multiple lines are skipped by this escape",

--- a/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs
+++ b/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs
--- a/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.stderr
+++ b/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.stderr