Update generate-copyright

This tool now scans for cargo dependencies and includes any important-looking license files.

We do this because cargo package metadata is not sufficient - the Apache-2.0 license says you have to include any NOTICE file, for example. And authors != copyright holders (cargo has the former, we must include the latter).
This commit is contained in:
Jonathan Pallant 2024-07-04 18:34:35 +01:00
parent 93ea767e29
commit ba0d6c9739
No known key found for this signature in database
7 changed files with 320 additions and 12 deletions

View File

@ -1408,6 +1408,8 @@ dependencies = [
"anyhow",
"serde",
"serde_json",
"tempfile",
"thiserror",
]
[[package]]

View File

@ -217,6 +217,8 @@ fn run(self, builder: &Builder<'_>) -> Self::Output {
let mut cmd = builder.tool_cmd(Tool::GenerateCopyright);
cmd.env("LICENSE_METADATA", &license_metadata);
cmd.env("DEST", &dest);
cmd.env("OUT_DIR", &builder.out);
cmd.env("CARGO", &builder.initial_cargo);
cmd.run(builder);
dest

View File

@ -2,6 +2,8 @@
name = "collect-license-metadata"
version = "0.1.0"
edition = "2021"
description = "Runs the reuse tool and caches the output, so rust toolchain devs don't need to have reuse installed"
license = "MIT OR Apache-2.0"
[dependencies]
anyhow = "1.0.65"

View File

@ -8,6 +8,11 @@
use crate::licenses::LicensesInterner;
/// The entry point to the binary.
///
/// You should probably let `bootstrap` execute this program instead of running it directly.
///
/// Run `x.py run collect-license-metadata`
fn main() -> Result<(), Error> {
let reuse_exe: PathBuf = std::env::var_os("REUSE_EXE").expect("Missing REUSE_EXE").into();
let dest: PathBuf = std::env::var_os("DEST").expect("Missing DEST").into();

View File

@ -2,6 +2,7 @@
name = "generate-copyright"
version = "0.1.0"
edition = "2021"
description = "Produces a manifest of all the copyrighted materials in the Rust Toolchain"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@ -9,3 +10,5 @@ edition = "2021"
anyhow = "1.0.65"
serde = { version = "1.0.147", features = ["derive"] }
serde_json = "1.0.85"
thiserror = "1"
tempfile = "3"

View File

@ -0,0 +1,196 @@
//! Gets metadata about a workspace from Cargo
use std::collections::{BTreeMap, BTreeSet};
use std::ffi::{OsStr, OsString};
use std::path::Path;
/// Describes how this module can fail
#[derive(Debug, thiserror::Error)]
pub enum Error {
#[error("Failed to run cargo metadata: {0:?}")]
LaunchingMetadata(#[from] std::io::Error),
#[error("Failed get output from cargo metadata: {0:?}")]
GettingMetadata(String),
#[error("Failed parse JSON output from cargo metadata: {0:?}")]
ParsingJson(#[from] serde_json::Error),
#[error("Failed find expected JSON element {0} in output from cargo metadata")]
MissingJsonElement(&'static str),
#[error("Failed find expected JSON element {0} in output from cargo metadata for package {1}")]
MissingJsonElementForPackage(String, String),
#[error("Failed to run cargo vendor: {0:?}")]
LaunchingVendor(std::io::Error),
#[error("Failed to complete cargo vendor")]
RunningVendor,
}
/// Describes one of our dependencies
///
/// The derived `Ord` compares fields in declaration order (name first, then
/// version, and so on), so a `BTreeSet<Dependency>` iterates sorted by name
/// and then version. Do not reorder the fields without considering that.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Dependency {
/// The name of the package
pub name: String,
/// The version number
pub version: String,
/// The license it is under (the `license` string from the package metadata)
pub license: String,
/// The list of authors from the package metadata
pub authors: Vec<String>,
/// A list of important files from the package, with their contents.
///
/// Keyed by the file's name as found in the vendored package directory.
///
/// This includes *COPYRIGHT*, *NOTICE*, *AUTHOR*, *LICENSE*, and *LICENCE* files, case-insensitive.
pub notices: BTreeMap<OsString, String>,
}
/// Use `cargo` to get a list of dependencies and their license data.
///
/// This will involve running `cargo vendor` into `${BUILD}/vendor` so we can
/// grab the license files.
///
/// Any dependency with a path beginning with `root_path` is ignored, as we
/// assume `reuse` has covered it already.
pub fn get(
cargo: &Path,
dest: &Path,
root_path: &Path,
manifest_paths: &[&Path],
) -> Result<BTreeSet<Dependency>, Error> {
let mut temp_set = BTreeSet::new();
// Look at the metadata for each manifest
for manifest_path in manifest_paths {
if manifest_path.file_name() != Some(OsStr::new("Cargo.toml")) {
panic!("cargo_manifest::get requires a path to a Cargo.toml file");
}
let metadata_json = get_metadata_json(cargo, manifest_path)?;
let packages = metadata_json["packages"]
.as_array()
.ok_or_else(|| Error::MissingJsonElement("packages array"))?;
for package in packages {
let package =
package.as_object().ok_or_else(|| Error::MissingJsonElement("package object"))?;
let manifest_path = package
.get("manifest_path")
.and_then(|v| v.as_str())
.map(Path::new)
.ok_or_else(|| Error::MissingJsonElement("package.manifest_path"))?;
if manifest_path.starts_with(&root_path) {
// it's an in-tree dependency and reuse covers it
continue;
}
// otherwise it's an out-of-tree dependency
let get_string = |field_name: &str, package_name: &str| {
package.get(field_name).and_then(|v| v.as_str()).ok_or_else(|| {
Error::MissingJsonElementForPackage(
format!("package.{field_name}"),
package_name.to_owned(),
)
})
};
let name = get_string("name", "unknown")?;
let license = get_string("license", name)?;
let version = get_string("version", name)?;
let authors_list = package
.get("authors")
.and_then(|v| v.as_array())
.ok_or_else(|| Error::MissingJsonElement("package.authors"))?;
let authors: Vec<String> =
authors_list.iter().filter_map(|v| v.as_str()).map(|s| s.to_owned()).collect();
temp_set.insert(Dependency {
name: name.to_owned(),
version: version.to_owned(),
license: license.to_owned(),
authors,
notices: BTreeMap::new(),
});
}
}
// Now do a cargo-vendor and grab everything
let vendor_path = dest.join("vendor");
println!("Vendoring deps into {}...", vendor_path.display());
run_cargo_vendor(cargo, &vendor_path, manifest_paths)?;
// Now for each dependency we found, go and grab any important looking files
let mut output = BTreeSet::new();
for mut dep in temp_set {
load_important_files(&mut dep, &vendor_path)?;
output.insert(dep);
}
Ok(output)
}
/// Get cargo-metdata for a package, as JSON
fn get_metadata_json(cargo: &Path, manifest_path: &Path) -> Result<serde_json::Value, Error> {
let metadata_output = std::process::Command::new(cargo)
.arg("metadata")
.arg("--format-version=1")
.arg("--all-features")
.arg("--manifest-path")
.arg(manifest_path)
.env("RUSTC_BOOTSTRAP", "1")
.output()
.map_err(|e| Error::LaunchingMetadata(e))?;
if !metadata_output.status.success() {
return Err(Error::GettingMetadata(
String::from_utf8(metadata_output.stderr).expect("UTF-8 output from cargo"),
));
}
let json = serde_json::from_slice(&metadata_output.stdout)?;
Ok(json)
}
/// Run cargo-vendor, fetching into the given dir
fn run_cargo_vendor(cargo: &Path, dest: &Path, manifest_paths: &[&Path]) -> Result<(), Error> {
let mut vendor_command = std::process::Command::new(cargo);
vendor_command.env("RUSTC_BOOTSTRAP", "1");
vendor_command.arg("vendor");
vendor_command.arg("--quiet");
vendor_command.arg("--versioned-dirs");
for manifest_path in manifest_paths {
vendor_command.arg("-s");
vendor_command.arg(manifest_path);
}
vendor_command.arg(dest);
let vendor_status = vendor_command.status().map_err(|e| Error::LaunchingVendor(e))?;
if !vendor_status.success() {
return Err(Error::RunningVendor);
}
Ok(())
}
/// Add important files off disk into this dependency.
///
/// Looks in `<vendor_root>/<name>-<version>` for entries whose lower-cased
/// name contains one of `copyright`, `licence`, `license`, `author` or
/// `notice`:
///
/// * a matching file is stored in `dep.notices` under its file name;
/// * a matching directory has each regular file directly inside it stored
///   under `<directory>/<file>` (sub-directories are not descended into).
///
/// Maybe one day Cargo.toml will contain enough information that we don't need
/// to do this manual scraping.
///
/// # Errors
///
/// Any I/O failure (missing directory, unreadable or non-UTF-8 file) is
/// propagated with `?` and converted into this module's [`Error`].
fn load_important_files(dep: &mut Dependency, vendor_root: &Path) -> Result<(), Error> {
    // Lower-case substrings that mark a file or directory name as important.
    const MARKERS: [&str; 5] = ["copyright", "licence", "license", "author", "notice"];

    let name_version = format!("{}-{}", dep.name, dep.version);
    println!("Scraping notices for {}...", name_version);
    let dep_vendor_path = vendor_root.join(name_version);
    for entry in std::fs::read_dir(dep_vendor_path)? {
        let entry = entry?;
        let metadata = entry.metadata()?;
        let filename = entry.file_name();
        let lc_filename = filename.to_ascii_lowercase();
        let lc_filename_str = lc_filename.to_string_lossy();
        if !MARKERS.iter().any(|m| lc_filename_str.contains(*m)) {
            continue;
        }
        if metadata.is_dir() {
            // scoop up whole directory (one level deep)
            for inner_entry in std::fs::read_dir(entry.path())? {
                let inner_entry = inner_entry?;
                if !inner_entry.metadata()?.is_file() {
                    continue;
                }
                // Always use `/` so the key is the same on every platform.
                let mut qualified_filename = filename.clone();
                qualified_filename.push("/");
                qualified_filename.push(inner_entry.file_name());
                println!("Scraping {}", qualified_filename.to_string_lossy());
                dep.notices
                    .insert(qualified_filename, std::fs::read_to_string(inner_entry.path())?);
            }
        } else if metadata.is_file() {
            println!("Scraping {}", filename.to_string_lossy());
            dep.notices.insert(filename, std::fs::read_to_string(entry.path())?);
        }
    }
    Ok(())
}

View File

@ -1,54 +1,114 @@
use std::io::Write;
use std::path::PathBuf;
use std::path::{Path, PathBuf};
use anyhow::Error;
mod cargo_metadata;
/// The entry point to the binary.
///
/// You should probably let `bootstrap` execute this program instead of running it directly.
///
/// Run `x.py run generate-copyright`
fn main() -> Result<(), Error> {
let dest = env_path("DEST")?;
let dest_file = env_path("DEST")?;
let out_dir = env_path("OUT_DIR")?;
let cargo = env_path("CARGO")?;
let license_metadata = env_path("LICENSE_METADATA")?;
let metadata: Metadata = serde_json::from_slice(&std::fs::read(&license_metadata)?)?;
let collected_tree_metadata: Metadata =
serde_json::from_slice(&std::fs::read(&license_metadata)?)?;
let root_path = std::path::absolute(".")?;
let workspace_paths = [
Path::new("./Cargo.toml"),
Path::new("./src/tools/cargo/Cargo.toml"),
Path::new("./library/std/Cargo.toml"),
];
let collected_cargo_metadata =
cargo_metadata::get(&cargo, &out_dir, &root_path, &workspace_paths)?;
let mut buffer = Vec::new();
render_recursive(&metadata.files, &mut buffer, 0)?;
std::fs::write(&dest, &buffer)?;
writeln!(buffer, "# COPYRIGHT for Rust")?;
writeln!(buffer)?;
writeln!(
buffer,
"This file describes the copyright and licensing information for the source code within The Rust Project git tree, and the third-party dependencies used when building the Rust toolchain (including the Rust Standard Library)"
)?;
writeln!(buffer)?;
writeln!(buffer, "## Table of Contents")?;
writeln!(buffer)?;
writeln!(buffer, "* [In-tree files](#in-tree-files)")?;
writeln!(buffer, "* [Out-of-tree files](#out-of-tree-files)")?;
// writeln!(buffer, "* [License Texts](#license-texts)")?;
writeln!(buffer)?;
writeln!(buffer, "## In-tree files")?;
writeln!(buffer)?;
writeln!(
buffer,
"The following licenses cover the in-tree source files that were used in this release:"
)?;
writeln!(buffer)?;
render_tree_recursive(&collected_tree_metadata.files, &mut buffer, 0)?;
writeln!(buffer)?;
writeln!(buffer, "## Out-of-tree files")?;
writeln!(buffer)?;
writeln!(
buffer,
"The following licenses cover the out-of-tree crates that were used in this release:"
)?;
writeln!(buffer)?;
render_deps(collected_cargo_metadata.iter(), &mut buffer)?;
std::fs::write(&dest_file, &buffer)?;
Ok(())
}
fn render_recursive(node: &Node, buffer: &mut Vec<u8>, depth: usize) -> Result<(), Error> {
/// Recursively draw the tree of files/folders we found on disk and their licenses, as
/// markdown, into the given Vec.
fn render_tree_recursive(node: &Node, buffer: &mut Vec<u8>, depth: usize) -> Result<(), Error> {
let prefix = std::iter::repeat("> ").take(depth + 1).collect::<String>();
match node {
Node::Root { children } => {
for child in children {
render_recursive(child, buffer, depth)?;
render_tree_recursive(child, buffer, depth)?;
}
}
Node::Directory { name, children, license } => {
render_license(&prefix, std::iter::once(name), license.as_ref(), buffer)?;
render_tree_license(&prefix, std::iter::once(name), license.as_ref(), buffer)?;
if !children.is_empty() {
writeln!(buffer, "{prefix}")?;
writeln!(buffer, "{prefix}*Exceptions:*")?;
for child in children {
writeln!(buffer, "{prefix}")?;
render_recursive(child, buffer, depth + 1)?;
render_tree_recursive(child, buffer, depth + 1)?;
}
}
}
Node::Group { files, directories, license } => {
render_license(&prefix, directories.iter().chain(files.iter()), Some(license), buffer)?;
render_tree_license(
&prefix,
directories.iter().chain(files.iter()),
Some(license),
buffer,
)?;
}
Node::File { name, license } => {
render_license(&prefix, std::iter::once(name), Some(license), buffer)?;
render_tree_license(&prefix, std::iter::once(name), Some(license), buffer)?;
}
}
Ok(())
}
fn render_license<'a>(
/// Draw a series of sibling files/folders, as markdown, into the given Vec.
fn render_tree_license<'a>(
prefix: &str,
names: impl Iterator<Item = &'a String>,
license: Option<&License>,
@ -67,11 +127,47 @@ fn render_license<'a>(
Ok(())
}
/// Write the out-of-tree dependencies into `buffer` as markdown.
///
/// Each dependency gets a level-three heading linking to its crates.io page,
/// a bullet for its authors and one for its license, followed by one
/// collapsible, fenced section per notice file.
fn render_deps<'a, 'b>(
    deps: impl Iterator<Item = &'a cargo_metadata::Dependency>,
    buffer: &'b mut Vec<u8>,
) -> Result<(), Error> {
    for krate in deps {
        // Backslash-escape the angle brackets found in the joined author list.
        let mut escaped_authors = String::new();
        for ch in krate.authors.join(", ").chars() {
            match ch {
                '<' => escaped_authors.push_str("\\<"),
                '>' => escaped_authors.push_str("\\>"),
                other => escaped_authors.push(other),
            }
        }

        // Heading plus the two metadata bullets.
        write!(
            buffer,
            concat!(
                "\n",
                "### [{name} {version}](https://crates.io/crates/{name}/{version})\n",
                "\n",
                "* Authors: {authors}\n",
                "* License: {license}\n",
            ),
            name = krate.name,
            version = krate.version,
            authors = escaped_authors,
            license = krate.license,
        )?;

        // One collapsible code block per scraped notice file.
        for (file_name, text) in krate.notices.iter() {
            write!(
                buffer,
                concat!(
                    "\n",
                    "#### {title}\n",
                    "\n",
                    "<details><summary>Click to expand</summary>\n",
                    "\n",
                    "```\n",
                    "{body}\n",
                    "```\n",
                    "\n",
                    "</details>\n",
                ),
                title = file_name.to_string_lossy(),
                body = text,
            )?;
        }
    }
    Ok(())
}
/// Describes a tree of metadata for our filesystem tree
///
/// `main` deserializes this from the JSON file named by the
/// `LICENSE_METADATA` environment variable.
#[derive(serde::Deserialize)]
struct Metadata {
/// The top node of the tree; rendering starts from here at depth 0.
files: Node,
}
/// Describes one node in our metadata tree
#[derive(serde::Deserialize)]
#[serde(rename_all = "kebab-case", tag = "type")]
pub(crate) enum Node {
@ -81,12 +177,14 @@ pub(crate) enum Node {
Group { files: Vec<String>, directories: Vec<String>, license: License },
}
/// A License has an SPDX license name and a list of copyright holders.
#[derive(serde::Deserialize)]
struct License {
/// The SPDX license name
spdx: String,
/// The copyright holders
copyright: Vec<String>,
}
/// Grab an environment variable as a PathBuf, or fail nicely.
fn env_path(var: &str) -> Result<PathBuf, Error> {
if let Some(var) = std::env::var_os(var) {
Ok(var.into())