Auto merge of #123246 - Kobzol:tarball-reproducible, r=Mark-Simulacrum

Make source tarball generation more reproducible

This PR makes several changes to source tarball generation (`x dist rustc-src`) to make it more reproducible (in light of the recent "xz backdoor"...). I want to follow up with a separate CI workflow for generating the tarball.

After this PR, running the following locally twice produces identical checksums:
```bash
$ ./x dist rustc-src
$ sha256sum build/dist/rustc-1.79.0-src.tar.gz

$ ./x dist rustc-src
$ sha256sum build/dist/rustc-1.79.0-src.tar.gz
```

r? `@Mark-Simulacrum`

Committed by bors on 2024-03-31 12:36:23 +00:00 (commit a8cfc83801)
3 changed files with 58 additions and 29 deletions

View File

```diff
@@ -995,9 +995,9 @@ fn run(self, builder: &Builder<'_>) -> GeneratedTarball {
         if builder.rust_info().is_managed_git_subrepository()
             || builder.rust_info().is_from_tarball()
         {
-            if builder.rust_info().is_managed_git_subrepository() {
-                // Ensure we have the submodules checked out.
-                builder.update_submodule(Path::new("src/tools/cargo"));
+            // Ensure we have all submodules from src and other directories checked out.
+            for submodule in builder.get_all_submodules() {
+                builder.update_submodule(Path::new(submodule));
             }

             // Vendor all Cargo dependencies
@@ -1028,6 +1028,20 @@ fn run(self, builder: &Builder<'_>) -> GeneratedTarball {
             builder.create(&cargo_config_dir.join("config.toml"), &config);
         }

+        // Delete extraneous directories
+        // FIXME: if we're managed by git, we should probably instead ask git if the given path
+        // is managed by it?
+        for entry in walkdir::WalkDir::new(tarball.image_dir())
+            .follow_links(true)
+            .into_iter()
+            .filter_map(|e| e.ok())
+        {
+            if entry.path().is_dir() && entry.path().file_name() == Some(OsStr::new("__pycache__"))
+            {
+                t!(fs::remove_dir_all(entry.path()));
+            }
+        }
+
         tarball.bare()
     }
 }
```
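
Standing alone, the cleanup added above boils down to the following sketch; the input path is hypothetical, and bootstrap's `t!` error-handling macro is replaced with plain `?` (assuming the `walkdir` crate):

```rust
use std::ffi::OsStr;
use std::fs;

use walkdir::WalkDir; // walkdir = "2" (assumed)

fn main() -> std::io::Result<()> {
    // Hypothetical stand-in for `tarball.image_dir()` in bootstrap.
    let image_dir = "build/dist-image";

    // Walk the image and delete every `__pycache__` directory, so generated
    // Python bytecode caches never end up in the source tarball.
    for entry in WalkDir::new(image_dir)
        .follow_links(true)
        .into_iter()
        .filter_map(|e| e.ok())
    {
        if entry.path().is_dir() && entry.path().file_name() == Some(OsStr::new("__pycache__")) {
            fs::remove_dir_all(entry.path())?;
        }
    }
    Ok(())
}
```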

View File

```diff
@@ -554,29 +554,7 @@ pub fn path(self, path: &str) -> Self {
     ///
     /// [`path`]: ShouldRun::path
     pub fn paths(mut self, paths: &[&str]) -> Self {
-        static SUBMODULES_PATHS: OnceLock<Vec<String>> = OnceLock::new();
-
-        let init_submodules_paths = |src: &PathBuf| {
-            let file = File::open(src.join(".gitmodules")).unwrap();
-
-            let mut submodules_paths = vec![];
-            for line in BufReader::new(file).lines() {
-                if let Ok(line) = line {
-                    let line = line.trim();
-
-                    if line.starts_with("path") {
-                        let actual_path =
-                            line.split(' ').last().expect("Couldn't get value of path");
-                        submodules_paths.push(actual_path.to_owned());
-                    }
-                }
-            }
-
-            submodules_paths
-        };
-
-        let submodules_paths =
-            SUBMODULES_PATHS.get_or_init(|| init_submodules_paths(&self.builder.src));
+        let submodules_paths = self.builder.get_all_submodules();

         self.paths.insert(PathSet::Set(
             paths
@@ -2151,6 +2129,37 @@ pub fn ensure<S: Step>(&'a self, step: S) -> S::Output {
         out
     }

+    /// Return paths of all submodules managed by git.
+    /// If the current checkout is not managed by git, returns an empty slice.
+    pub fn get_all_submodules(&self) -> &[String] {
+        if !self.rust_info().is_managed_git_subrepository() {
+            return &[];
+        }
+
+        static SUBMODULES_PATHS: OnceLock<Vec<String>> = OnceLock::new();
+
+        let init_submodules_paths = |src: &PathBuf| {
+            let file = File::open(src.join(".gitmodules")).unwrap();
+
+            let mut submodules_paths = vec![];
+            for line in BufReader::new(file).lines() {
+                if let Ok(line) = line {
+                    let line = line.trim();
+
+                    if line.starts_with("path") {
+                        let actual_path =
+                            line.split(' ').last().expect("Couldn't get value of path");
+                        submodules_paths.push(actual_path.to_owned());
+                    }
+                }
+            }
+
+            submodules_paths
+        };
+
+        &SUBMODULES_PATHS.get_or_init(|| init_submodules_paths(&self.src))
+    }
+
     /// Ensure that a given step is built *only if it's supposed to be built by default*, returning
     /// its output. This will cache the step, so it's safe (and good!) to call this as often as
     /// needed to ensure that all dependencies are build.
```
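
To make the parsing in `get_all_submodules` concrete, here is a self-contained sketch of the same `path`-key extraction run against an illustrative `.gitmodules` excerpt (the entries shown are examples, not the full submodule list):

```rust
use std::io::{BufRead, BufReader, Cursor};

fn main() {
    // Illustrative excerpt; the real file lives at the root of the checkout.
    let gitmodules = r#"
[submodule "src/tools/cargo"]
    path = src/tools/cargo
    url = https://github.com/rust-lang/cargo.git
[submodule "src/doc/book"]
    path = src/doc/book
    url = https://github.com/rust-lang/book.git
"#;

    // Same approach as above: keep the value of every `path` key.
    let mut submodule_paths = Vec::new();
    for line in BufReader::new(Cursor::new(gitmodules)).lines() {
        let line = line.unwrap();
        let line = line.trim();
        if line.starts_with("path") {
            let path = line.split(' ').last().expect("couldn't get value of path");
            submodule_paths.push(path.to_owned());
        }
    }

    assert_eq!(submodule_paths, ["src/tools/cargo", "src/doc/book"]);
}
```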

View File

```diff
@@ -2,7 +2,7 @@
 use std::fs::{read_link, symlink_metadata};
 use std::io::{BufWriter, Write};
 use std::path::Path;

-use tar::{Builder, Header};
+use tar::{Builder, Header, HeaderMode};
 use walkdir::WalkDir;

 use crate::{
@@ -53,14 +53,19 @@ pub fn run(self) -> Result<()> {
         // Sort files by their suffix, to group files with the same name from
         // different locations (likely identical) and files with the same
         // extension (likely containing similar data).
-        let (dirs, mut files) = get_recursive_paths(&self.work_dir, &self.input)
+        // Sorting of file and directory paths also helps with the reproducibility
+        // of the resulting archive.
+        let (mut dirs, mut files) = get_recursive_paths(&self.work_dir, &self.input)
             .context("failed to collect file paths")?;
+        dirs.sort();
         files.sort_by(|a, b| a.bytes().rev().cmp(b.bytes().rev()));

         // Write the tar into both encoded files. We write all directories
         // first, so files may be directly created. (See rust-lang/rustup.rs#1092.)
         let buf = BufWriter::with_capacity(1024 * 1024, encoder);
         let mut builder = Builder::new(buf);
+        // Make uid, gid and mtime deterministic to improve reproducibility
+        builder.mode(HeaderMode::Deterministic);

         let pool = rayon::ThreadPoolBuilder::new().num_threads(2).build().unwrap();
         pool.install(move || {
@@ -91,7 +96,8 @@ pub fn run(self) -> Result<()> {
 fn append_path<W: Write>(builder: &mut Builder<W>, src: &Path, path: &String) -> Result<()> {
     let stat = symlink_metadata(src)?;
     let mut header = Header::new_gnu();
-    header.set_metadata(&stat);
+    header.set_metadata_in_mode(&stat, HeaderMode::Deterministic);
     if stat.file_type().is_symlink() {
         let link = read_link(src)?;
         builder.append_link(&mut header, path, &link)?;
```
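
For illustration, here is a minimal standalone use of the `tar` crate's deterministic header mode that the last two hunks switch on. The input directory is hypothetical, and this sketch only archives the top-level entries rather than walking the tree recursively like the installer does; with `HeaderMode::Deterministic`, uid, gid and mtime are normalized, so archiving the same tree twice yields byte-identical output:

```rust
use std::fs;
use std::path::Path;

use tar::{Builder, HeaderMode}; // tar = "0.4" (assumed)

// Archive the top-level entries of `dir` into an in-memory tar with
// normalized metadata (uid, gid and mtime are made deterministic).
fn deterministic_tar(dir: &Path) -> std::io::Result<Vec<u8>> {
    let mut builder = Builder::new(Vec::new());
    builder.mode(HeaderMode::Deterministic);

    // Sort entries so the archive order does not depend on readdir order.
    let mut entries: Vec<_> = fs::read_dir(dir)?.collect::<Result<_, _>>()?;
    entries.sort_by_key(|e| e.path());
    for entry in entries {
        let path = entry.path();
        let name = path.strip_prefix(dir).unwrap().to_path_buf();
        builder.append_path_with_name(&path, &name)?;
    }
    builder.into_inner()
}

fn main() -> std::io::Result<()> {
    // Hypothetical input; point this at any directory of regular files.
    let dir = Path::new("build/dist-image");
    let (a, b) = (deterministic_tar(dir)?, deterministic_tar(dir)?);
    assert_eq!(a, b, "deterministic headers give byte-identical archives");
    Ok(())
}
```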