diff --git a/library/std/src/fs.rs b/library/std/src/fs.rs index e999c84f80d..726c855c4fd 100644 --- a/library/std/src/fs.rs +++ b/library/std/src/fs.rs @@ -198,19 +198,10 @@ pub struct DirBuilder { recursive: bool, } -/// Indicates how large a buffer to pre-allocate before reading the entire file. -fn initial_buffer_size(file: &File) -> usize { - // Don't worry about `usize` overflow because reading will fail regardless - // in that case. - file.metadata().map(|m| m.len() as usize).unwrap_or(0) -} - /// Read the entire contents of a file into a bytes vector. /// /// This is a convenience function for using [`File::open`] and [`read_to_end`] -/// with fewer imports and without an intermediate variable. It pre-allocates a -/// buffer based on the file size when available, so it is generally faster than -/// reading into a vector created with [`Vec::new()`]. +/// with fewer imports and without an intermediate variable. /// /// [`read_to_end`]: Read::read_to_end /// @@ -237,7 +228,7 @@ fn initial_buffer_size(file: &File) -> usize { pub fn read>(path: P) -> io::Result> { fn inner(path: &Path) -> io::Result> { let mut file = File::open(path)?; - let mut bytes = Vec::with_capacity(initial_buffer_size(&file)); + let mut bytes = Vec::new(); file.read_to_end(&mut bytes)?; Ok(bytes) } @@ -247,9 +238,7 @@ fn inner(path: &Path) -> io::Result> { /// Read the entire contents of a file into a string. /// /// This is a convenience function for using [`File::open`] and [`read_to_string`] -/// with fewer imports and without an intermediate variable. It pre-allocates a -/// buffer based on the file size when available, so it is generally faster than -/// reading into a string created with [`String::new()`]. +/// with fewer imports and without an intermediate variable. /// /// [`read_to_string`]: Read::read_to_string /// @@ -278,7 +267,7 @@ fn inner(path: &Path) -> io::Result> { pub fn read_to_string>(path: P) -> io::Result { fn inner(path: &Path) -> io::Result { let mut file = File::open(path)?; - let mut string = String::with_capacity(initial_buffer_size(&file)); + let mut string = String::new(); file.read_to_string(&mut string)?; Ok(string) } @@ -615,6 +604,15 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { } } +/// Indicates how much extra capacity is needed to read the rest of the file. +fn buffer_capacity_required(mut file: &File) -> usize { + let size = file.metadata().map(|m| m.len()).unwrap_or(0); + let pos = file.stream_position().unwrap_or(0); + // Don't worry about `usize` overflow because reading will fail regardless + // in that case. + size.saturating_sub(pos) as usize +} + #[stable(feature = "rust1", since = "1.0.0")] impl Read for File { fn read(&mut self, buf: &mut [u8]) -> io::Result { @@ -635,6 +633,18 @@ unsafe fn initializer(&self) -> Initializer { // SAFETY: Read is guaranteed to work on uninitialized memory unsafe { Initializer::nop() } } + + // Reserves space in the buffer based on the file size when available. + fn read_to_end(&mut self, buf: &mut Vec) -> io::Result { + buf.reserve(buffer_capacity_required(self)); + io::default_read_to_end(self, buf) + } + + // Reserves space in the buffer based on the file size when available. + fn read_to_string(&mut self, buf: &mut String) -> io::Result { + buf.reserve(buffer_capacity_required(self)); + io::default_read_to_string(self, buf) + } } #[stable(feature = "rust1", since = "1.0.0")] impl Write for File { @@ -681,6 +691,18 @@ unsafe fn initializer(&self) -> Initializer { // SAFETY: Read is guaranteed to work on uninitialized memory unsafe { Initializer::nop() } } + + // Reserves space in the buffer based on the file size when available. + fn read_to_end(&mut self, buf: &mut Vec) -> io::Result { + buf.reserve(buffer_capacity_required(self)); + io::default_read_to_end(self, buf) + } + + // Reserves space in the buffer based on the file size when available. + fn read_to_string(&mut self, buf: &mut String) -> io::Result { + buf.reserve(buffer_capacity_required(self)); + io::default_read_to_string(self, buf) + } } #[stable(feature = "rust1", since = "1.0.0")] impl Write for &File { diff --git a/library/std/src/io/buffered/bufreader.rs b/library/std/src/io/buffered/bufreader.rs index 869ac1ec859..243207a6065 100644 --- a/library/std/src/io/buffered/bufreader.rs +++ b/library/std/src/io/buffered/bufreader.rs @@ -308,6 +308,51 @@ fn is_read_vectored(&self) -> bool { unsafe fn initializer(&self) -> Initializer { self.inner.initializer() } + + // The inner reader might have an optimized `read_to_end`. Drain our buffer and then + // delegate to the inner implementation. + fn read_to_end(&mut self, buf: &mut Vec) -> io::Result { + let nread = self.cap - self.pos; + buf.extend_from_slice(&self.buf[self.pos..self.cap]); + self.discard_buffer(); + Ok(nread + self.inner.read_to_end(buf)?) + } + + // The inner reader might have an optimized `read_to_end`. Drain our buffer and then + // delegate to the inner implementation. + fn read_to_string(&mut self, buf: &mut String) -> io::Result { + // In the general `else` case below we must read bytes into a side buffer, check + // that they are valid UTF-8, and then append them to `buf`. This requires a + // potentially large memcpy. + // + // If `buf` is empty--the most common case--we can leverage `append_to_string` + // to read directly into `buf`'s internal byte buffer, saving an allocation and + // a memcpy. + if buf.is_empty() { + // `append_to_string`'s safety relies on the buffer only being appended to since + // it only checks the UTF-8 validity of new data. If there were existing content in + // `buf` then an untrustworthy reader (i.e. `self.inner`) could not only append + // bytes but also modify existing bytes and render them invalid. On the other hand, + // if `buf` is empty then by definition any writes must be appends and + // `append_to_string` will validate all of the new bytes. + unsafe { crate::io::append_to_string(buf, |b| self.read_to_end(b)) } + } else { + // We cannot append our byte buffer directly onto the `buf` String as there could + // be an incomplete UTF-8 sequence that has only been partially read. We must read + // everything into a side buffer first and then call `from_utf8` on the complete + // buffer. + let mut bytes = Vec::new(); + self.read_to_end(&mut bytes)?; + let string = crate::str::from_utf8(&bytes).map_err(|_| { + io::Error::new_const( + io::ErrorKind::InvalidData, + &"stream did not contain valid UTF-8", + ) + })?; + *buf += string; + Ok(string.len()) + } + } } #[stable(feature = "rust1", since = "1.0.0")] diff --git a/library/std/src/io/buffered/tests.rs b/library/std/src/io/buffered/tests.rs index d290c3c4660..feb149c07a5 100644 --- a/library/std/src/io/buffered/tests.rs +++ b/library/std/src/io/buffered/tests.rs @@ -243,6 +243,28 @@ fn seek(&mut self, _: SeekFrom) -> io::Result { assert_eq!(reader.buffer().len(), 0); } +#[test] +fn test_buffered_reader_read_to_end_consumes_buffer() { + let data: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7]; + let mut reader = BufReader::with_capacity(3, data); + let mut buf = Vec::new(); + assert_eq!(reader.fill_buf().ok(), Some(&[0, 1, 2][..])); + assert_eq!(reader.read_to_end(&mut buf).ok(), Some(8)); + assert_eq!(&buf, &[0, 1, 2, 3, 4, 5, 6, 7]); + assert!(reader.buffer().is_empty()); +} + +#[test] +fn test_buffered_reader_read_to_string_consumes_buffer() { + let data: &[u8] = "deadbeef".as_bytes(); + let mut reader = BufReader::with_capacity(3, data); + let mut buf = String::new(); + assert_eq!(reader.fill_buf().ok(), Some("dea".as_bytes())); + assert_eq!(reader.read_to_string(&mut buf).ok(), Some(8)); + assert_eq!(&buf, "deadbeef"); + assert!(reader.buffer().is_empty()); +} + #[test] fn test_buffered_writer() { let inner = Vec::new(); diff --git a/library/std/src/io/mod.rs b/library/std/src/io/mod.rs index b4e49ca89d7..8c71138aa23 100644 --- a/library/std/src/io/mod.rs +++ b/library/std/src/io/mod.rs @@ -316,11 +316,12 @@ fn drop(&mut self) { } } -// A few methods below (read_to_string, read_line) will append data into a -// `String` buffer, but we need to be pretty careful when doing this. The -// implementation will just call `.as_mut_vec()` and then delegate to a -// byte-oriented reading method, but we must ensure that when returning we never -// leave `buf` in a state such that it contains invalid UTF-8 in its bounds. +// Several `read_to_string` and `read_line` methods in the standard library will +// append data into a `String` buffer, but we need to be pretty careful when +// doing this. The implementation will just call `.as_mut_vec()` and then +// delegate to a byte-oriented reading method, but we must ensure that when +// returning we never leave `buf` in a state such that it contains invalid UTF-8 +// in its bounds. // // To this end, we use an RAII guard (to protect against panics) which updates // the length of the string when it is dropped. This guard initially truncates @@ -334,21 +335,19 @@ fn drop(&mut self) { // 2. We're passing a raw buffer to the function `f`, and it is expected that // the function only *appends* bytes to the buffer. We'll get undefined // behavior if existing bytes are overwritten to have non-UTF-8 data. -fn append_to_string(buf: &mut String, f: F) -> Result +pub(crate) unsafe fn append_to_string(buf: &mut String, f: F) -> Result where F: FnOnce(&mut Vec) -> Result, { - unsafe { - let mut g = Guard { len: buf.len(), buf: buf.as_mut_vec() }; - let ret = f(g.buf); - if str::from_utf8(&g.buf[g.len..]).is_err() { - ret.and_then(|_| { - Err(Error::new_const(ErrorKind::InvalidData, &"stream did not contain valid UTF-8")) - }) - } else { - g.len = g.buf.len(); - ret - } + let mut g = Guard { len: buf.len(), buf: buf.as_mut_vec() }; + let ret = f(g.buf); + if str::from_utf8(&g.buf[g.len..]).is_err() { + ret.and_then(|_| { + Err(Error::new_const(ErrorKind::InvalidData, &"stream did not contain valid UTF-8")) + }) + } else { + g.len = g.buf.len(); + ret } } @@ -361,7 +360,7 @@ fn append_to_string(buf: &mut String, f: F) -> Result // // Because we're extending the buffer with uninitialized data for trusted // readers, we need to make sure to truncate that if any of this panics. -fn read_to_end(r: &mut R, buf: &mut Vec) -> Result { +pub(crate) fn default_read_to_end(r: &mut R, buf: &mut Vec) -> Result { let start_len = buf.len(); let start_cap = buf.capacity(); let mut g = Guard { len: buf.len(), buf }; @@ -426,6 +425,22 @@ fn read_to_end(r: &mut R, buf: &mut Vec) -> Result } } +pub(crate) fn default_read_to_string( + r: &mut R, + buf: &mut String, +) -> Result { + // Note that we do *not* call `r.read_to_end()` here. We are passing + // `&mut Vec` (the raw contents of `buf`) into the `read_to_end` + // method to fill it up. An arbitrary implementation could overwrite the + // entire contents of the vector, not just append to it (which is what + // we are expecting). + // + // To prevent extraneously checking the UTF-8-ness of the entire buffer + // we pass it to our hardcoded `default_read_to_end` implementation which + // we know is guaranteed to only read data into the end of the buffer. + unsafe { append_to_string(buf, |b| default_read_to_end(r, b)) } +} + pub(crate) fn default_read_vectored(read: F, bufs: &mut [IoSliceMut<'_>]) -> Result where F: FnOnce(&mut [u8]) -> Result, @@ -716,7 +731,7 @@ unsafe fn initializer(&self) -> Initializer { /// [`std::fs::read`]: crate::fs::read #[stable(feature = "rust1", since = "1.0.0")] fn read_to_end(&mut self, buf: &mut Vec) -> Result { - read_to_end(self, buf) + default_read_to_end(self, buf) } /// Read all bytes until EOF in this source, appending them to `buf`. @@ -759,16 +774,7 @@ fn read_to_end(&mut self, buf: &mut Vec) -> Result { /// [`std::fs::read_to_string`]: crate::fs::read_to_string #[stable(feature = "rust1", since = "1.0.0")] fn read_to_string(&mut self, buf: &mut String) -> Result { - // Note that we do *not* call `.read_to_end()` here. We are passing - // `&mut Vec` (the raw contents of `buf`) into the `read_to_end` - // method to fill it up. An arbitrary implementation could overwrite the - // entire contents of the vector, not just append to it (which is what - // we are expecting). - // - // To prevent extraneously checking the UTF-8-ness of the entire buffer - // we pass it to our hardcoded `read_to_end` implementation which we - // know is guaranteed to only read data into the end of the buffer. - append_to_string(buf, |b| read_to_end(self, b)) + default_read_to_string(self, buf) } /// Read the exact number of bytes required to fill `buf`. @@ -1005,6 +1011,11 @@ fn take(self, limit: u64) -> Take /// need more control over performance, and in those cases you should definitely use /// [`Read::read_to_string`] directly. /// +/// Note that in some special cases, such as when reading files, this function will +/// pre-allocate memory based on the size of the input it is reading. In those +/// cases, the performance should be as good as if you had used +/// [`Read::read_to_string`] with a manually pre-allocated buffer. +/// /// # Errors /// /// This function forces you to handle errors because the output (the `String`) @@ -2201,7 +2212,7 @@ fn read_line(&mut self, buf: &mut String) -> Result { // Note that we are not calling the `.read_until` method here, but // rather our hardcoded implementation. For more details as to why, see // the comments in `read_to_end`. - append_to_string(buf, |b| read_until(self, b'\n', b)) + unsafe { append_to_string(buf, |b| read_until(self, b'\n', b)) } } /// Returns an iterator over the contents of this reader split on the byte diff --git a/library/std/src/io/tests.rs b/library/std/src/io/tests.rs index 58b2d1d14c9..0321a2b60b1 100644 --- a/library/std/src/io/tests.rs +++ b/library/std/src/io/tests.rs @@ -290,7 +290,7 @@ fn bench_read_to_end(b: &mut test::Bencher) { b.iter(|| { let mut lr = repeat(1).take(10000000); let mut vec = Vec::with_capacity(1024); - super::read_to_end(&mut lr, &mut vec) + super::default_read_to_end(&mut lr, &mut vec) }); }