Auto merge of #91407 - the8472:deserialize-unchecked-utf8, r=michaelwoerister
Avoid string validation in rustc_serialize, check a marker byte instead. Since the serialization format isn't self-describing, we need a way to detect when the encoder and decoder don't match up. For strings, though, that check doesn't have to be UTF-8 validation, which currently costs a few percent of performance. Instead, we can write a marker byte after the string's bytes and check for it when decoding, which makes us reasonably sure that we're dealing with a string and that it wasn't overwritten in some way.
This commit is contained in:
commit
477fd7038c
@ -55,6 +55,13 @@ macro_rules! write_leb128 {
|
||||
}};
|
||||
}
|
||||
|
||||
/// A byte that [cannot occur in UTF8 sequences][utf8]. Used to mark the end of a string.
|
||||
/// This way we can skip validation and still be relatively sure that deserialization
|
||||
/// did not desynchronize.
|
||||
///
|
||||
/// [utf8]: https://en.wikipedia.org/w/index.php?title=UTF-8&oldid=1058865525#Codepage_layout
|
||||
const STR_SENTINEL: u8 = 0xC1;
|
||||
|
||||
impl serialize::Encoder for Encoder {
|
||||
type Error = !;
|
||||
|
||||
@ -150,7 +157,8 @@ fn emit_char(&mut self, v: char) -> EncodeResult {
|
||||
#[inline]
|
||||
fn emit_str(&mut self, v: &str) -> EncodeResult {
|
||||
self.emit_usize(v.len())?;
|
||||
self.emit_raw_bytes(v.as_bytes())
|
||||
self.emit_raw_bytes(v.as_bytes())?;
|
||||
self.emit_u8(STR_SENTINEL)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@ -502,7 +510,8 @@ fn emit_char(&mut self, v: char) -> FileEncodeResult {
|
||||
#[inline]
|
||||
fn emit_str(&mut self, v: &str) -> FileEncodeResult {
|
||||
self.emit_usize(v.len())?;
|
||||
self.emit_raw_bytes(v.as_bytes())
|
||||
self.emit_raw_bytes(v.as_bytes())?;
|
||||
self.emit_u8(STR_SENTINEL)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@ -656,8 +665,12 @@ fn read_char(&mut self) -> Result<char, Self::Error> {
|
||||
#[inline]
|
||||
fn read_str(&mut self) -> Result<Cow<'_, str>, Self::Error> {
|
||||
let len = self.read_usize()?;
|
||||
let s = std::str::from_utf8(&self.data[self.position..self.position + len]).unwrap();
|
||||
self.position += len;
|
||||
let sentinel = self.data[self.position + len];
|
||||
assert!(sentinel == STR_SENTINEL);
|
||||
let s = unsafe {
|
||||
std::str::from_utf8_unchecked(&self.data[self.position..self.position + len])
|
||||
};
|
||||
self.position += len + 1;
|
||||
Ok(Cow::Borrowed(s))
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user