From b09803e8694b3ad83f988e30b4f0a3903ebe2632 Mon Sep 17 00:00:00 2001 From: Paul Dicker Date: Tue, 19 Feb 2019 15:57:59 +0100 Subject: [PATCH] Address review comments --- src/libstd/sys/windows/stdio.rs | 58 ++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/src/libstd/sys/windows/stdio.rs b/src/libstd/sys/windows/stdio.rs index ccfe0ced82f..6b9b36f6a92 100644 --- a/src/libstd/sys/windows/stdio.rs +++ b/src/libstd/sys/windows/stdio.rs @@ -1,6 +1,7 @@ #![unstable(issue = "0", feature = "windows_stdio")] use cell::Cell; +use char::decode_utf16; use cmp; use io; use ptr; @@ -64,22 +65,27 @@ fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result { // // If the data is not valid UTF-8 we write out as many bytes as are valid. // Only when there are no valid bytes (which will happen on the next call), return an error. - let len = cmp::min(data.len(), MAX_BUFFER_SIZE); + let len = cmp::min(data.len(), MAX_BUFFER_SIZE / 2); let utf8 = match str::from_utf8(&data[..len]) { Ok(s) => s, Err(ref e) if e.valid_up_to() == 0 => { return Err(io::Error::new(io::ErrorKind::InvalidData, - "Windows stdio in console mode does not support non-UTF-8 byte sequences; \ - see https://github.com/rust-lang/rust/issues/23344")) + "Windows stdio in console mode does not support writing non-UTF-8 byte sequences")) }, Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(), }; - let utf16 = utf8.encode_utf16().collect::>(); + let mut utf16 = [0u16; MAX_BUFFER_SIZE / 2]; + let mut len_utf16 = 0; + for (chr, dest) in utf8.encode_utf16().zip(utf16.iter_mut()) { + *dest = chr; + len_utf16 += 1; + } + let utf16 = &utf16[..len_utf16]; let mut written = write_u16s(handle, &utf16)?; // Figure out how many bytes of as UTF-8 were written away as UTF-16. - if written >= utf16.len() { + if written == utf16.len() { Ok(utf8.len()) } else { // Make sure we didn't end up writing only half of a surrogate pair (even though the chance @@ -90,7 +96,7 @@ fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result { let first_char_remaining = utf16[written]; if first_char_remaining >= 0xDCEE && first_char_remaining <= 0xDFFF { // low surrogate // We just hope this works, and give up otherwise - let _ = write_u16s(handle, &utf16[written..written]); + let _ = write_u16s(handle, &utf16[written..written+1]); written += 1; } // Calculate the number of bytes of `utf8` that were actually written. @@ -103,6 +109,7 @@ fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result { _ => 3, }; } + debug_assert!(String::from_utf16(&utf16[..written]).unwrap() == utf8[..count]); Ok(count) } } @@ -137,7 +144,7 @@ impl Stdin { return Ok(0); } else if buf.len() < 4 { return Err(io::Error::new(io::ErrorKind::InvalidInput, - "Windows stdin in console mode does not support a buffer too small to; \ + "Windows stdin in console mode does not support a buffer too small to \ guarantee holding one arbitrary UTF-8 character (4 bytes)")) } @@ -147,27 +154,14 @@ impl Stdin { // lost. let amount = cmp::min(buf.len() / 3, utf16_buf.len()); let read = self.read_u16s_fixup_surrogates(handle, &mut utf16_buf, amount)?; - let utf16 = &utf16_buf[..read]; - // FIXME: it would be nice if we could directly decode into the buffer instead of doing an - // allocation. - let data = match String::from_utf16(&utf16) { - Ok(utf8) => utf8.into_bytes(), - Err(..) => { - // We can't really do any better than forget all data and return an error. - return Err(io::Error::new(io::ErrorKind::InvalidData, - "Windows stdin in console mode does not support non-UTF-16 input; \ - encountered unpaired surrogate")) - }, - }; - buf.copy_from_slice(&data); - Ok(data.len()) + utf16_to_utf8(&utf16_buf[..read], buf) } // We assume that if the last `u16` is an unpaired surrogate they got sliced apart by our // buffer size, and keep it around for the next read hoping to put them together. // This is a best effort, and may not work if we are not the only reader on Stdin. - pub fn read_u16s_fixup_surrogates(&self, handle: c::HANDLE, buf: &mut [u16], mut amount: usize) + fn read_u16s_fixup_surrogates(&self, handle: c::HANDLE, buf: &mut [u16], mut amount: usize) -> io::Result { // Insert possibly remaining unpaired surrogate from last read. @@ -223,6 +217,26 @@ fn read_u16s(handle: c::HANDLE, buf: &mut [u16]) -> io::Result { Ok(amount as usize) } +#[allow(unused)] +fn utf16_to_utf8(utf16: &[u16], utf8: &mut [u8]) -> io::Result { + let mut written = 0; + for chr in decode_utf16(utf16.iter().cloned()) { + match chr { + Ok(chr) => { + chr.encode_utf8(&mut utf8[written..]); + written += chr.len_utf8(); + } + Err(_) => { + // We can't really do any better than forget all data and return an error. + return Err(io::Error::new(io::ErrorKind::InvalidData, + "Windows stdin in console mode does not support non-UTF-16 input; \ + encountered unpaired surrogate")) + } + } + } + Ok(written) +} + impl Stdout { pub fn new() -> io::Result { Ok(Stdout)