Rework CStrUnit.

- Rename it to `MixedUnit`, because it will soon be used in more than
  just C string literals.
- Change the `Byte` variant to `HighByte` and use it only for
  `\x80`..`\xff` cases. This fixes the old inexactness where ASCII chars
  could be encoded with either `Byte` or `Char`.
- Add useful comments.
- Remove `is_ascii`, in favour of `u8::is_ascii`.
This commit is contained in:
Nicholas Nethercote 2024-01-23 12:27:56 +11:00
parent ef1e2228cf
commit a1c07214f0
3 changed files with 52 additions and 42 deletions

View File

@ -3,7 +3,7 @@
use crate::ast::{self, LitKind, MetaItemLit, StrStyle}; use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
use crate::token::{self, Token}; use crate::token::{self, Token};
use rustc_lexer::unescape::{ use rustc_lexer::unescape::{
byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit,
Mode, Mode,
}; };
use rustc_span::symbol::{kw, sym, Symbol}; use rustc_span::symbol::{kw, sym, Symbol};
@ -127,10 +127,10 @@ impl LitKind {
let s = symbol.as_str(); let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len()); let mut buf = Vec::with_capacity(s.len());
unescape_c_string(s, Mode::CStr, &mut |_span, c| match c { unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
Ok(CStrUnit::Byte(b)) => buf.push(b), Ok(MixedUnit::Char(c)) => {
Ok(CStrUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
} }
Ok(MixedUnit::HighByte(b)) => buf.push(b),
Err(err) => { Err(err) => {
assert!(!err.is_fatal(), "failed to unescape C string literal") assert!(!err.is_fatal(), "failed to unescape C string literal")
} }

View File

@ -101,32 +101,45 @@ where
} }
} }
/// A unit within CStr. Must not be a nul character. /// Used for mixed utf8 string literals, i.e. those that allow both unicode
pub enum CStrUnit { /// chars and high bytes.
Byte(u8), pub enum MixedUnit {
/// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
/// and Unicode chars (written directly or via `\u` escapes).
///
/// For example, if '¥' appears in a string it is represented here as
/// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
/// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`.
Char(char), Char(char),
/// Used for high bytes (`\x80`..`\xff`).
///
/// For example, if `\xa5` appears in a string it is represented here as
/// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
/// byte string as the single byte `0xa5`.
HighByte(u8),
} }
impl From<u8> for CStrUnit { impl From<char> for MixedUnit {
fn from(value: u8) -> Self { fn from(c: char) -> Self {
CStrUnit::Byte(value) MixedUnit::Char(c)
} }
} }
impl From<char> for CStrUnit { impl From<u8> for MixedUnit {
fn from(value: char) -> Self { fn from(n: u8) -> Self {
CStrUnit::Char(value) if n.is_ascii() { MixedUnit::Char(n as char) } else { MixedUnit::HighByte(n) }
} }
} }
pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F) pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
where where
F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>), F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
{ {
match mode { match mode {
CStr => { CStr => {
unescape_non_raw_common(src, mode, &mut |r, mut result| { unescape_non_raw_common(src, mode, &mut |r, mut result| {
if let Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) = result { if let Ok(MixedUnit::Char('\0')) = result {
result = Err(EscapeError::NulInCStr); result = Err(EscapeError::NulInCStr);
} }
callback(r, result) callback(r, result)
@ -137,7 +150,8 @@ where
if let Ok('\0') = result { if let Ok('\0') = result {
result = Err(EscapeError::NulInCStr); result = Err(EscapeError::NulInCStr);
} }
callback(r, result.map(CStrUnit::Char)) // High bytes aren't possible in raw strings.
callback(r, result.map(MixedUnit::Char))
}); });
} }
Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(), Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
@ -217,20 +231,19 @@ impl Mode {
} }
} }
fn scan_escape<T: From<u8> + From<char>>( fn scan_escape<T: From<char> + From<u8>>(
chars: &mut Chars<'_>, chars: &mut Chars<'_>,
mode: Mode, mode: Mode,
) -> Result<T, EscapeError> { ) -> Result<T, EscapeError> {
// Previous character was '\\', unescape what follows. // Previous character was '\\', unescape what follows.
let res: u8 = match chars.next().ok_or(EscapeError::LoneSlash)? { let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
'"' => b'"', '"' => '"',
'n' => b'\n', 'n' => '\n',
'r' => b'\r', 'r' => '\r',
't' => b'\t', 't' => '\t',
'\\' => b'\\', '\\' => '\\',
'\'' => b'\'', '\'' => '\'',
'0' => b'\0', '0' => '\0',
'x' => { 'x' => {
// Parse hexadecimal character code. // Parse hexadecimal character code.
@ -240,15 +253,17 @@ fn scan_escape<T: From<u8> + From<char>>(
let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
let value = hi * 16 + lo; let value = (hi * 16 + lo) as u8;
if mode.ascii_escapes_should_be_ascii() && !is_ascii(value) { return if mode.ascii_escapes_should_be_ascii() && !value.is_ascii() {
return Err(EscapeError::OutOfRangeHexEscape); Err(EscapeError::OutOfRangeHexEscape)
} } else {
// This may be a high byte, but that will only happen if `T` is
value as u8 // `MixedUnit`, because of the `ascii_escapes_should_be_ascii`
// check above.
Ok(T::from(value as u8))
};
} }
'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(T::from), 'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(T::from),
_ => return Err(EscapeError::InvalidEscape), _ => return Err(EscapeError::InvalidEscape),
}; };
@ -336,7 +351,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
/// Takes the contents of a string literal (without quotes) and produces a /// Takes the contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors. /// sequence of escaped characters or errors.
fn unescape_non_raw_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F) fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
where where
F: FnMut(Range<usize>, Result<T, EscapeError>), F: FnMut(Range<usize>, Result<T, EscapeError>),
{ {
@ -430,7 +445,3 @@ pub fn byte_from_char(c: char) -> u8 {
debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
res as u8 res as u8
} }
fn is_ascii(x: u32) -> bool {
x <= 0x7F
}

View File

@ -6,7 +6,7 @@ use std::{
}; };
use rustc_lexer::unescape::{ use rustc_lexer::unescape::{
unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, Mode, unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit, Mode,
}; };
use crate::{ use crate::{
@ -336,10 +336,9 @@ impl ast::CString {
let mut buf = Vec::new(); let mut buf = Vec::new();
let mut prev_end = 0; let mut prev_end = 0;
let mut has_error = false; let mut has_error = false;
let mut char_buf = [0u8; 4]; let extend_unit = |buf: &mut Vec<u8>, unit: MixedUnit| match unit {
let mut extend_unit = |buf: &mut Vec<u8>, unit: CStrUnit| match unit { MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()),
CStrUnit::Byte(b) => buf.push(b), MixedUnit::HighByte(b) => buf.push(b),
CStrUnit::Char(c) => buf.extend(c.encode_utf8(&mut char_buf).as_bytes()),
}; };
unescape_c_string(text, Self::MODE, &mut |char_range, unescaped| match ( unescape_c_string(text, Self::MODE, &mut |char_range, unescaped| match (
unescaped, unescaped,