Auto merge of #78409 - pietroalbini:build-manifest-checksum-cache, r=Mark-Simulacrum

Add checksums cache to build-manifest

During the release process we're currently calculating the SHA256 of each file three times:

1. In `build-manifest`, to fill the `hash = "f00"` keys of the manifests.
2. In `promote-release`, to generate the `.sha256` files.
3. In `promote-release`, to generate the `.asc` GPG signatures.

Calculations 1. and 2. could be merged into a single one if there was a way for `build-manifest` to pass the checksums it generated over to `promote-release`. Unfortunately calculation 3. can't be merged as GPG requires extra metadata to be hashed.

This PR adds support for merging 1. and 2. by creating the `BUILD_MANIFEST_CHECKSUM_CACHE` environment variable, which points to a JSON file storing a cache of all the calculated checksums. `build-manifest` will load it at startup and avoid generating existing checksums, and it will dump its internal checksums cache into it when it exits successfully.

This PR also makes it possible to run `build-manifest` multiple times without having to wait for checksums to be recalculated on subsequent invocations. The speedup will allow working towards a fix for https://github.com/rust-lang/promote-release/issues/15 without impacting the release process duration or our storage costs.

This PR can be reviewed commit-by-commit.
r? `@Mark-Simulacrum`
This commit is contained in:
bors 2020-10-28 14:52:20 +00:00
commit 717eb6ccea
2 changed files with 106 additions and 50 deletions

View File

@ -0,0 +1,97 @@
use crate::manifest::{FileHash, Manifest};
use rayon::prelude::*;
use sha2::{Digest, Sha256};
use std::collections::{HashMap, HashSet};
use std::error::Error;
use std::fs::File;
use std::io::BufReader;
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use std::time::Instant;
/// Cache of SHA-256 checksums for release tarballs, optionally persisted to
/// the JSON file named by the `BUILD_MANIFEST_CHECKSUM_CACHE` environment
/// variable so later invocations (and promote-release) can reuse them.
pub(crate) struct Checksums {
    // Where to load/store the JSON cache; `None` when the env var is unset.
    cache_path: Option<PathBuf>,
    // Checksums gathered so far, keyed by canonicalized tarball path.
    // Behind a Mutex so rayon worker threads can insert concurrently.
    collected: Mutex<HashMap<PathBuf, String>>,
}
impl Checksums {
    /// Creates the checksum store, pre-populating it from the JSON cache file
    /// named by `BUILD_MANIFEST_CHECKSUM_CACHE` when that variable is set and
    /// the file exists.
    ///
    /// Returns an error if the cache file cannot be read or parsed.
    pub(crate) fn new() -> Result<Self, Box<dyn Error>> {
        let cache_path = std::env::var_os("BUILD_MANIFEST_CHECKSUM_CACHE").map(PathBuf::from);

        let mut collected = HashMap::new();
        if let Some(path) = &cache_path {
            // A missing file is not an error: first run simply starts empty.
            if path.is_file() {
                collected = serde_json::from_slice(&std::fs::read(path)?)?;
            }
        }

        Ok(Checksums { cache_path, collected: Mutex::new(collected) })
    }

    /// Writes every checksum gathered so far back to the cache file (no-op
    /// when `BUILD_MANIFEST_CHECKSUM_CACHE` is unset), so the next invocation
    /// and promote-release can skip recomputing them.
    pub(crate) fn store_cache(&self) -> Result<(), Box<dyn Error>> {
        if let Some(path) = &self.cache_path {
            std::fs::write(path, &serde_json::to_vec(&self.collected)?)?;
        }
        Ok(())
    }

    /// Replaces every `FileHash::Missing` entry in the manifest with the
    /// SHA-256 of the corresponding file, computing only the hashes that are
    /// not already cached.
    // NOTE: `&self` is sufficient for all of these methods — `collected` has
    // interior mutability through its Mutex, so no exclusive borrow is needed.
    pub(crate) fn fill_missing_checksums(&self, manifest: &mut Manifest) {
        let need_checksums = self.find_missing_checksums(manifest);
        if !need_checksums.is_empty() {
            self.collect_checksums(&need_checksums);
        }
        self.replace_checksums(manifest);
    }

    /// Returns the canonicalized paths referenced by the manifest whose
    /// checksum is not yet in the cache.
    fn find_missing_checksums(&self, manifest: &mut Manifest) -> HashSet<PathBuf> {
        let collected = self.collected.lock().unwrap();

        let mut need_checksums = HashSet::new();
        crate::manifest::visit_file_hashes(manifest, |file_hash| {
            if let FileHash::Missing(path) = file_hash {
                // Canonicalize so the same tarball always maps to one cache key.
                let path = std::fs::canonicalize(&path).unwrap_or_else(|err| {
                    panic!("failed to canonicalize {}: {}", path.display(), err)
                });
                if !collected.contains_key(&path) {
                    need_checksums.insert(path);
                }
            }
        });
        need_checksums
    }

    /// Substitutes each `FileHash::Missing` entry with the cached checksum.
    ///
    /// Panics if a required checksum is absent — that means hashing the file
    /// failed earlier (the error was already printed by `collect_checksums`).
    fn replace_checksums(&self, manifest: &mut Manifest) {
        let collected = self.collected.lock().unwrap();
        crate::manifest::visit_file_hashes(manifest, |file_hash| {
            if let FileHash::Missing(path) = file_hash {
                let path = std::fs::canonicalize(&path).unwrap_or_else(|err| {
                    panic!("failed to canonicalize {}: {}", path.display(), err)
                });
                match collected.get(&path) {
                    Some(hash) => *file_hash = FileHash::Present(hash.clone()),
                    None => panic!("missing hash for file {}", path.display()),
                }
            }
        });
    }

    /// Hashes `files` in parallel with rayon and records the results.
    /// Failures are reported on stderr and skipped; `replace_checksums` will
    /// panic later if the hash turns out to be required.
    fn collect_checksums(&self, files: &HashSet<PathBuf>) {
        let collection_start = Instant::now();
        println!(
            "collecting hashes for {} tarballs across {} threads",
            files.len(),
            rayon::current_num_threads().min(files.len()),
        );

        // Hash in parallel, then merge under a single lock acquisition instead
        // of having every worker thread lock the mutex once per file.
        let hashes: Vec<(PathBuf, String)> = files
            .par_iter()
            .filter_map(|path| match hash(path) {
                Ok(hash) => Some((path.clone(), hash)),
                Err(err) => {
                    eprintln!("error while fetching the hash for {}: {}", path.display(), err);
                    None
                }
            })
            .collect();
        self.collected.lock().unwrap().extend(hashes);

        println!("collected {} hashes in {:.2?}", files.len(), collection_start.elapsed());
    }
}
/// Computes the hex-encoded SHA-256 digest of the file at `path`,
/// streaming the contents through a buffered reader so large tarballs are
/// never held fully in memory.
fn hash(path: &Path) -> Result<String, Box<dyn Error>> {
    let mut reader = BufReader::new(File::open(path)?);
    let mut digest = Sha256::new();
    std::io::copy(&mut reader, &mut digest)?;
    let checksum = digest.finalize();
    Ok(hex::encode(checksum))
}

View File

@ -4,22 +4,19 @@
//! via `x.py dist hash-and-sign`; the cmdline arguments are set up
//! by rustbuild (in `src/bootstrap/dist.rs`).
mod checksum;
mod manifest;
mod versions;
use crate::manifest::{Component, FileHash, Manifest, Package, Rename, Target};
use crate::checksum::Checksums;
use crate::manifest::{Component, Manifest, Package, Rename, Target};
use crate::versions::{PkgType, Versions};
use rayon::prelude::*;
use sha2::Digest;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::env;
use std::error::Error;
use std::fs::{self, File};
use std::io::{self, BufReader, Read, Write};
use std::io::{self, Read, Write};
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::sync::Mutex;
use std::time::Instant;
static HOSTS: &[&str] = &[
"aarch64-apple-darwin",
@ -186,6 +183,7 @@ macro_rules! t {
struct Builder {
versions: Versions,
checksums: Checksums,
shipped_files: HashSet<String>,
input: PathBuf,
@ -240,6 +238,7 @@ fn main() {
Builder {
versions: Versions::new(&channel, &input).unwrap(),
checksums: t!(Checksums::new()),
shipped_files: HashSet::new(),
input,
@ -276,6 +275,8 @@ impl Builder {
if let Some(path) = std::env::var_os("BUILD_MANIFEST_SHIPPED_FILES_PATH") {
self.write_shipped_files(&Path::new(&path));
}
t!(self.checksums.store_cache());
}
/// If a tool does not pass its tests, don't ship it.
@ -321,7 +322,7 @@ impl Builder {
self.add_renames_to(&mut manifest);
manifest.pkg.insert("rust".to_string(), self.rust_package(&manifest));
self.fill_missing_hashes(&mut manifest);
self.checksums.fill_missing_checksums(&mut manifest);
manifest
}
@ -595,41 +596,6 @@ impl Builder {
assert!(t!(child.wait()).success());
}
// Legacy single-shot hashing (shown here as the code this commit removes):
// collects the SHA-256 of every tarball the manifest still marks as
// FileHash::Missing and patches the manifest in place. No cache is consulted,
// so every invocation recomputes all hashes.
fn fill_missing_hashes(&self, manifest: &mut Manifest) {
    // First collect all files that need hashes
    let mut need_hashes = HashSet::new();
    crate::manifest::visit_file_hashes(manifest, |file_hash| {
        if let FileHash::Missing(path) = file_hash {
            need_hashes.insert(path.clone());
        }
    });

    // Shared map the rayon workers insert into; locked once per file.
    let collected = Mutex::new(HashMap::new());
    let collection_start = Instant::now();
    println!(
        "collecting hashes for {} tarballs across {} threads",
        need_hashes.len(),
        rayon::current_num_threads().min(need_hashes.len()),
    );
    // Failures are only reported on stderr; the panic below surfaces them.
    need_hashes.par_iter().for_each(|path| match fetch_hash(path) {
        Ok(hash) => {
            collected.lock().unwrap().insert(path, hash);
        }
        Err(err) => eprintln!("error while fetching the hash for {}: {}", path.display(), err),
    });
    let collected = collected.into_inner().unwrap();
    println!("collected {} hashes in {:.2?}", collected.len(), collection_start.elapsed());

    // Second pass: substitute each Missing entry with its computed hash.
    crate::manifest::visit_file_hashes(manifest, |file_hash| {
        if let FileHash::Missing(path) = file_hash {
            match collected.get(path) {
                Some(hash) => *file_hash = FileHash::Present(hash.clone()),
                None => panic!("missing hash for file {}", path.display()),
            }
        }
    })
}
fn write_channel_files(&mut self, channel_name: &str, manifest: &Manifest) {
self.write(&toml::to_string(&manifest).unwrap(), channel_name, ".toml");
self.write(&manifest.date, channel_name, "-date.txt");
@ -660,10 +626,3 @@ impl Builder {
t!(std::fs::write(path, content.as_bytes()));
}
}
/// Computes the hex-encoded SHA-256 digest of the file at `path`, streaming
/// through a BufReader so the tarball is never fully buffered in memory.
/// (Removed by this commit in favor of `checksum::hash`.)
fn fetch_hash(path: &Path) -> Result<String, Box<dyn Error>> {
    let mut file = BufReader::new(File::open(path)?);
    let mut sha256 = sha2::Sha256::default();
    std::io::copy(&mut file, &mut sha256)?;
    Ok(hex::encode(sha256.finalize()))
}