Rollup merge of #128353 - ferrocene:jonathanpallant/add-dependencies-to-copyright-file, r=Kobzol

Change generate-copyright to generate HTML, with cargo dependencies included

`x.py run generate-copyright` now produces `build/COPYRIGHT.html`. This includes a new format for in-tree dependencies, and also adds out-of-tree cargo dependencies.

After consulting expert opinion, I have elected to include every top-level:

* `*NOTICE*`
* `*AUTHOR*`
* `*LICENSE*`
* `*LICENCE*`, and
* `*COPYRIGHT*` file I can find - case-insensitive.

This is because the cargo package metadata's `author` field is not a list of copyright holders and does not meet the requirements of the Apache-2.0 license (which says you must include a NOTICE file with the binary if one was supplied by the author) nor the MIT license (which says you must include 'the above copyright notice').

I believe it would be appropriate to include this file with every Rust release, in order to do an even better job of appropriately recognising the efforts of the authors of the first-party and third-party libraries we are using here.

The output includes something like 524 copies of the Apache-2.0 text because they are not all identical. I think I count about 50 different variations by shasum - some differ in whitespace, while some have the boilerplate block at the bottom erroneously modified (don't modify the copy in the license, modify the copy you paste into your own source code!). Running `gzip` on the HTML file largely makes this problem go away, and the average browser is far happier with a ~6 MiB HTML file than the average Markdown viewer is with a ~6 MiB markdown file. But, if someone wants to, they could submit a follow-up which de-dups the license text files and adds back-links to earlier identical copies (for some value of 'identical copy').

```console
$ xpy run generate-copyright
$ cd build
$ gzip -c COPYRIGHT.html > COPYRIGHT.gz
$ xz -c COPYRIGHT.html > COPYRIGHT.xz
$ ls -lh COPYRIGHT.*
-rw-r--r--  1 jonathan  staff   241K 29 Jul 17:19 COPYRIGHT.gz
-rw-r--r--@ 1 jonathan  staff   6.6M 29 Jul 11:30 COPYRIGHT.html
-rw-r--r--  1 jonathan  staff    59K 29 Jul 17:19 COPYRIGHT.xz
```

Here's an example [COPYRIGHT.gz](https://github.com/user-attachments/files/16416147/COPYRIGHT.gz).
This commit is contained in:
Matthias Krüger 2024-08-07 20:28:17 +02:00 committed by GitHub
commit e34229508a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 388 additions and 60 deletions

View File

@ -1406,8 +1406,11 @@ name = "generate-copyright"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"cargo_metadata 0.18.1",
"rinja",
"serde", "serde",
"serde_json", "serde_json",
"thiserror",
] ]
[[package]] [[package]]
@ -3094,7 +3097,10 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d3762e3740cdbf2fd2be465cc2c26d643ad17353cc2e0223d211c1b096118bd" checksum = "6d3762e3740cdbf2fd2be465cc2c26d643ad17353cc2e0223d211c1b096118bd"
dependencies = [ dependencies = [
"humansize",
"itoa", "itoa",
"num-traits",
"percent-encoding",
"rinja_derive", "rinja_derive",
] ]

View File

@ -163,7 +163,7 @@ SPDX-License-Identifier = "MIT OR Apache-2.0"
path = "src/llvm-project/**" path = "src/llvm-project/**"
precedence = "override" precedence = "override"
SPDX-FileCopyrightText = [ SPDX-FileCopyrightText = [
"2003-2019 by the contributors listed in [CREDITS.TXT](https://github.com/rust-lang/llvm-project/blob/7738295178045041669876bf32b0543ec8319a5c/llvm/CREDITS.TXT)", "2003-2019 by the contributors listed in CREDITS.TXT (https://github.com/rust-lang/llvm-project/blob/7738295178045041669876bf32b0543ec8319a5c/llvm/CREDITS.TXT)",
"2010 Apple Inc", "2010 Apple Inc",
"2003-2019 University of Illinois at Urbana-Champaign.", "2003-2019 University of Illinois at Urbana-Champaign.",
] ]

View File

@ -212,11 +212,13 @@ fn run(self, builder: &Builder<'_>) -> Self::Output {
let license_metadata = builder.ensure(CollectLicenseMetadata); let license_metadata = builder.ensure(CollectLicenseMetadata);
// Temporary location, it will be moved to the proper one once it's accurate. // Temporary location, it will be moved to the proper one once it's accurate.
let dest = builder.out.join("COPYRIGHT.md"); let dest = builder.out.join("COPYRIGHT.html");
let mut cmd = builder.tool_cmd(Tool::GenerateCopyright); let mut cmd = builder.tool_cmd(Tool::GenerateCopyright);
cmd.env("LICENSE_METADATA", &license_metadata); cmd.env("LICENSE_METADATA", &license_metadata);
cmd.env("DEST", &dest); cmd.env("DEST", &dest);
cmd.env("OUT_DIR", &builder.out);
cmd.env("CARGO", &builder.initial_cargo);
cmd.run(builder); cmd.run(builder);
dest dest

View File

@ -2,6 +2,8 @@
name = "collect-license-metadata" name = "collect-license-metadata"
version = "0.1.0" version = "0.1.0"
edition = "2021" edition = "2021"
description = "Runs the reuse tool and caches the output, so rust toolchain devs don't need to have reuse installed"
license = "MIT OR Apache-2.0"
[dependencies] [dependencies]
anyhow = "1.0.65" anyhow = "1.0.65"

View File

@ -8,6 +8,11 @@
use crate::licenses::LicensesInterner; use crate::licenses::LicensesInterner;
/// The entry point to the binary.
///
/// You should probably let `bootstrap` execute this program instead of running it directly.
///
/// Run `x.py run collect-license-metadata`
fn main() -> Result<(), Error> { fn main() -> Result<(), Error> {
let reuse_exe: PathBuf = std::env::var_os("REUSE_EXE").expect("Missing REUSE_EXE").into(); let reuse_exe: PathBuf = std::env::var_os("REUSE_EXE").expect("Missing REUSE_EXE").into();
let dest: PathBuf = std::env::var_os("DEST").expect("Missing DEST").into(); let dest: PathBuf = std::env::var_os("DEST").expect("Missing DEST").into();

View File

@ -2,10 +2,14 @@
name = "generate-copyright" name = "generate-copyright"
version = "0.1.0" version = "0.1.0"
edition = "2021" edition = "2021"
description = "Produces a manifest of all the copyrighted materials in the Rust Toolchain"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
anyhow = "1.0.65" anyhow = "1.0.65"
cargo_metadata = "0.18.1"
rinja = "0.3.0"
serde = { version = "1.0.147", features = ["derive"] } serde = { version = "1.0.147", features = ["derive"] }
serde_json = "1.0.85" serde_json = "1.0.85"
thiserror = "1"

View File

@ -0,0 +1,191 @@
//! Gets metadata about a workspace from Cargo
use std::collections::BTreeMap;
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
/// Describes how this module can fail
#[derive(Debug, thiserror::Error)]
pub enum Error {
#[error("I/O Error: {0:?}")]
Io(#[from] std::io::Error),
#[error("Failed get output from cargo-metadata: {0:?}")]
GettingMetadata(#[from] cargo_metadata::Error),
#[error("Failed to run cargo vendor: {0:?}")]
LaunchingVendor(std::io::Error),
#[error("Failed to complete cargo vendor")]
RunningVendor,
#[error("Bad path {0:?} whilst scraping files")]
Scraping(PathBuf),
}
/// Identifies one crates.io package by its name and version.
///
/// The derived ordering compares `name` first and then `version`, which lets
/// this type serve as a `BTreeMap` key.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Package {
    /// Package name, as reported by Cargo
    pub name: String,
    /// Package version, stored in its string form
    pub version: String,
}
/// Licensing details recorded for a single package.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct PackageMetadata {
    /// The license string taken from the package metadata
    pub license: String,
    /// The authors listed in the package metadata
    pub authors: Vec<String>,
    /// Important files shipped with the package, keyed by file name and
    /// holding the file contents.
    ///
    /// This covers *COPYRIGHT*, *NOTICE*, *AUTHOR*, *LICENSE*, and *LICENCE* files, case-insensitive.
    pub notices: BTreeMap<String, String>,
    /// `Some(true)` when this dependency is part of the Rust Standard Library;
    /// `None` until that has been determined.
    pub is_in_libstd: Option<bool>,
}
/// Collect dependency license data via `cargo metadata`, then attach the
/// notice files obtained through `cargo vendor`.
///
/// The dependencies are vendored into `dest/vendor`, and every package that
/// [`get_metadata`] reported has its important-looking files loaded from there.
///
/// Packages whose manifest path begins with `root_path` are left out, on the
/// assumption that `reuse` has covered them already.
pub fn get_metadata_and_notices(
    cargo: &Path,
    dest: &Path,
    root_path: &Path,
    manifest_paths: &[&Path],
) -> Result<BTreeMap<Package, PackageMetadata>, Error> {
    let mut packages = get_metadata(cargo, root_path, manifest_paths)?;

    // Fetch the sources of every dependency so their files can be inspected.
    let vendor_dir = dest.join("vendor");
    println!("Vendoring deps into {}...", vendor_dir.display());
    run_cargo_vendor(cargo, &vendor_dir, manifest_paths)?;

    // Stop at the first package whose files cannot be read.
    packages.iter_mut().try_for_each(|(package, metadata)| {
        load_important_files(package, metadata, &vendor_dir)
    })?;
    Ok(packages)
}
/// Ask `cargo metadata` about each given manifest and gather the license data
/// of every out-of-tree package it reports.
///
/// Packages whose manifest path begins with `root_path` are left out, on the
/// assumption that `reuse` has covered them already. A package without a
/// license field is recorded as `"Unspecified"`. The returned entries have no
/// notices and an undetermined `is_in_libstd`.
///
/// # Panics
///
/// Panics if any entry of `manifest_paths` is not a path to a `Cargo.toml` file.
pub fn get_metadata(
    cargo: &Path,
    root_path: &Path,
    manifest_paths: &[&Path],
) -> Result<BTreeMap<Package, PackageMetadata>, Error> {
    let mut collected = BTreeMap::new();
    for manifest in manifest_paths {
        assert!(
            manifest.file_name() == Some(OsStr::new("Cargo.toml")),
            "cargo_manifest::get requires a path to a Cargo.toml file"
        );
        let workspace = cargo_metadata::MetadataCommand::new()
            .cargo_path(cargo)
            .env("RUSTC_BOOTSTRAP", "1")
            .manifest_path(manifest)
            .exec()?;
        // In-tree packages are covered by reuse; only keep the out-of-tree ones.
        let out_of_tree = workspace
            .packages
            .into_iter()
            .filter(|pkg| !pkg.manifest_path.as_path().starts_with(root_path));
        // A package seen again (e.g. via another manifest) replaces the earlier entry.
        collected.extend(out_of_tree.map(|pkg| {
            (
                Package { name: pkg.name, version: pkg.version.to_string() },
                PackageMetadata {
                    license: pkg.license.unwrap_or_else(|| String::from("Unspecified")),
                    authors: pkg.authors,
                    notices: BTreeMap::new(),
                    is_in_libstd: None,
                },
            )
        }));
    }
    Ok(collected)
}
/// Run cargo-vendor, fetching into the given dir
fn run_cargo_vendor(cargo: &Path, dest: &Path, manifest_paths: &[&Path]) -> Result<(), Error> {
let mut vendor_command = std::process::Command::new(cargo);
vendor_command.env("RUSTC_BOOTSTRAP", "1");
vendor_command.arg("vendor");
vendor_command.arg("--quiet");
vendor_command.arg("--versioned-dirs");
for manifest_path in manifest_paths {
vendor_command.arg("-s");
vendor_command.arg(manifest_path);
}
vendor_command.arg(dest);
let vendor_status = vendor_command.status().map_err(Error::LaunchingVendor)?;
if !vendor_status.success() {
return Err(Error::RunningVendor);
}
Ok(())
}
/// Add important files off disk into this dependency.
///
/// Maybe one-day Cargo.toml will contain enough information that we don't need
/// to do this manual scraping.
fn load_important_files(
package: &Package,
dep: &mut PackageMetadata,
vendor_root: &Path,
) -> Result<(), Error> {
let name_version = format!("{}-{}", package.name, package.version);
println!("Scraping notices for {}...", name_version);
let dep_vendor_path = vendor_root.join(name_version);
for entry in std::fs::read_dir(dep_vendor_path)? {
let entry = entry?;
let metadata = entry.metadata()?;
let path = entry.path();
let Some(filename) = path.file_name() else {
return Err(Error::Scraping(path));
};
let lc_filename = filename.to_ascii_lowercase();
let lc_filename_str = lc_filename.to_string_lossy();
let mut keep = false;
for m in ["copyright", "licence", "license", "author", "notice"] {
if lc_filename_str.contains(m) {
keep = true;
break;
}
}
if keep {
if metadata.is_dir() {
for inner_entry in std::fs::read_dir(entry.path())? {
let inner_entry = inner_entry?;
if inner_entry.metadata()?.is_file() {
let inner_filename = inner_entry.file_name();
let inner_filename_str = inner_filename.to_string_lossy();
let qualified_filename =
format!("{}/{}", lc_filename_str, inner_filename_str);
println!("Scraping {}", qualified_filename);
dep.notices.insert(
qualified_filename.to_string(),
std::fs::read_to_string(inner_entry.path())?,
);
}
}
} else if metadata.is_file() {
let filename = filename.to_string_lossy();
println!("Scraping {}", filename);
dep.notices.insert(filename.to_string(), std::fs::read_to_string(path)?);
}
}
}
Ok(())
}

View File

@ -1,79 +1,70 @@
use std::io::Write; use std::collections::BTreeMap;
use std::path::PathBuf; use std::path::{Path, PathBuf};
use anyhow::Error; use anyhow::Error;
use rinja::Template;
mod cargo_metadata;
#[derive(Template)]
#[template(path = "COPYRIGHT.html")]
struct CopyrightTemplate {
in_tree: Node,
dependencies: BTreeMap<cargo_metadata::Package, cargo_metadata::PackageMetadata>,
}
/// The entry point to the binary.
///
/// You should probably let `bootstrap` execute this program instead of running it directly.
///
/// Run `x.py run generate-copyright`
fn main() -> Result<(), Error> { fn main() -> Result<(), Error> {
let dest = env_path("DEST")?; let dest_file = env_path("DEST")?;
let out_dir = env_path("OUT_DIR")?;
let cargo = env_path("CARGO")?;
let license_metadata = env_path("LICENSE_METADATA")?; let license_metadata = env_path("LICENSE_METADATA")?;
let metadata: Metadata = serde_json::from_slice(&std::fs::read(&license_metadata)?)?; let collected_tree_metadata: Metadata =
serde_json::from_slice(&std::fs::read(&license_metadata)?)?;
let mut buffer = Vec::new(); let root_path = std::path::absolute(".")?;
render_recursive(&metadata.files, &mut buffer, 0)?; let workspace_paths = [
Path::new("./Cargo.toml"),
Path::new("./src/tools/cargo/Cargo.toml"),
Path::new("./library/Cargo.toml"),
];
let mut collected_cargo_metadata =
cargo_metadata::get_metadata_and_notices(&cargo, &out_dir, &root_path, &workspace_paths)?;
std::fs::write(&dest, &buffer)?; let stdlib_set =
cargo_metadata::get_metadata(&cargo, &root_path, &[Path::new("./library/std/Cargo.toml")])?;
Ok(())
} for (key, value) in collected_cargo_metadata.iter_mut() {
value.is_in_libstd = Some(stdlib_set.contains_key(key));
fn render_recursive(node: &Node, buffer: &mut Vec<u8>, depth: usize) -> Result<(), Error> { }
let prefix = std::iter::repeat("> ").take(depth + 1).collect::<String>();
let template = CopyrightTemplate {
match node { in_tree: collected_tree_metadata.files,
Node::Root { children } => { dependencies: collected_cargo_metadata,
for child in children { };
render_recursive(child, buffer, depth)?;
} let output = template.render()?;
}
Node::Directory { name, children, license } => { std::fs::write(&dest_file, output)?;
render_license(&prefix, std::iter::once(name), license.as_ref(), buffer)?;
if !children.is_empty() {
writeln!(buffer, "{prefix}")?;
writeln!(buffer, "{prefix}*Exceptions:*")?;
for child in children {
writeln!(buffer, "{prefix}")?;
render_recursive(child, buffer, depth + 1)?;
}
}
}
Node::Group { files, directories, license } => {
render_license(&prefix, directories.iter().chain(files.iter()), Some(license), buffer)?;
}
Node::File { name, license } => {
render_license(&prefix, std::iter::once(name), Some(license), buffer)?;
}
}
Ok(())
}
fn render_license<'a>(
prefix: &str,
names: impl Iterator<Item = &'a String>,
license: Option<&License>,
buffer: &mut Vec<u8>,
) -> Result<(), Error> {
for name in names {
writeln!(buffer, "{prefix}**`{name}`** ")?;
}
if let Some(license) = license {
writeln!(buffer, "{prefix}License: `{}`", license.spdx)?;
for copyright in license.copyright.iter() {
writeln!(buffer, "{prefix}Copyright: {copyright}")?;
}
}
Ok(()) Ok(())
} }
/// Describes a tree of metadata for our filesystem tree
#[derive(serde::Deserialize)] #[derive(serde::Deserialize)]
struct Metadata { struct Metadata {
files: Node, files: Node,
} }
#[derive(serde::Deserialize)] /// Describes one node in our metadata tree
#[derive(serde::Deserialize, rinja::Template)]
#[serde(rename_all = "kebab-case", tag = "type")] #[serde(rename_all = "kebab-case", tag = "type")]
#[template(path = "Node.html")]
pub(crate) enum Node { pub(crate) enum Node {
Root { children: Vec<Node> }, Root { children: Vec<Node> },
Directory { name: String, children: Vec<Node>, license: Option<License> }, Directory { name: String, children: Vec<Node>, license: Option<License> },
@ -81,12 +72,14 @@ pub(crate) enum Node {
Group { files: Vec<String>, directories: Vec<String>, license: License }, Group { files: Vec<String>, directories: Vec<String>, license: License },
} }
/// A License has an SPDX license name and a list of copyright holders.
#[derive(serde::Deserialize)] #[derive(serde::Deserialize)]
struct License { struct License {
spdx: String, spdx: String,
copyright: Vec<String>, copyright: Vec<String>,
} }
/// Grab an environment variable as a PathBuf, or fail nicely.
fn env_path(var: &str) -> Result<PathBuf, Error> { fn env_path(var: &str) -> Result<PathBuf, Error> {
if let Some(var) = std::env::var_os(var) { if let Some(var) = std::env::var_os(var) {
Ok(var.into()) Ok(var.into())

View File

@ -0,0 +1,54 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Copyright notices for The Rust Toolchain</title>
</head>
<body>
{# Rendered from `CopyrightTemplate`: `in_tree` is the metadata tree for the
   in-tree files, `dependencies` maps each out-of-tree package to its metadata. #}
<h1>Copyright notices for The Rust Toolchain</h1>
<p>This file describes the copyright and licensing information for the source
code within The Rust Project git tree, and the third-party dependencies used
when building the Rust toolchain (including the Rust Standard Library).</p>
<h2>Table of Contents</h2>
<ul>
    <li><a href="#in-tree-files">In-tree files</a></li>
    <li><a href="#out-of-tree-dependencies">Out-of-tree dependencies</a></li>
</ul>
<h2 id="in-tree-files">In-tree files</h2>
<p>The following licenses cover the in-tree source files that were used in this
release:</p>
{{ in_tree|safe }}
<h2 id="out-of-tree-dependencies">Out-of-tree dependencies</h2>
<p>The following licenses cover the out-of-tree crates that were used in this
release:</p>
{% for (key, value) in dependencies %}
    <h3>📦 {{key.name}}-{{key.version}}</h3>
    <p><b>URL:</b> <a href="https://crates.io/crates/{{ key.name }}/{{ key.version }}">https://crates.io/crates/{{ key.name }}/{{ key.version }}</a></p>
    <p><b>In libstd:</b> {% if value.is_in_libstd.unwrap() %} Yes {% else %} No {% endif %}</p>
    <p><b>Authors:</b> {{ value.authors|join(", ") }}</p>
    <p><b>License:</b> {{ value.license }}</p>
    {% let len = value.notices.len() %}
    {% if len > 0 %}
        {# The paragraph is closed before the list: <details> and <pre> are not
           allowed inside <p>, so nesting them there produces invalid HTML. #}
        <p><b>Notices:</b></p>
        {% for (notice_name, notice_text) in value.notices %}
            <details>
                <summary><code>{{ notice_name }}</code></summary>
<pre>
{{ notice_text }}
</pre>
            </details>
        {% endfor %}
    {% endif %}
{% endfor %}
</body>
</html>

View File

@ -0,0 +1,71 @@
{#- Renders one `Node` of the in-tree license metadata. `Root` only renders its
    children; `Directory`, `File` and `Group` each render a bordered box with the
    covered names, the SPDX license and the copyright lines. A `Directory` also
    renders its children, under an "Exceptions" heading, by recursively
    rendering this same template. -#}
{% match self %}
{% when Node::Root { children } %}
{% for child in children %}
{{ child|safe }}
{% endfor %}
{% when Node::Directory { name, children, license } %}
<div style="border:1px solid black; padding: 5px;">
<p>
<b>File/Directory:</b> <code>{{ name }}</code>
</p>
{% if let Some(license) = license %}
<p><b>License:</b> {{ license.spdx }}</p>
{% for copyright in license.copyright.iter() %}
<p><b>Copyright:</b> {{ copyright }}</p>
{% endfor %}
{% endif %}
{% if !children.is_empty() %}
<p><b>Exceptions:</b></p>
{% for child in children %}
{{ child|safe }}
{% endfor %}
{% endif %}
</div>
{% when Node::File { name, license } %}
<div style="border:1px solid black; padding: 5px;">
<p>
<b>File/Directory:</b> <code>{{ name }}</code>
</p>
<p><b>License:</b> {{ license.spdx }}</p>
{% for copyright in license.copyright.iter() %}
<p><b>Copyright:</b> {{ copyright }}</p>
{% endfor %}
</div>
{% when Node::Group { files, directories, license } %}
<div style="border:1px solid black; padding: 5px;">
<p>
<b>File/Directory:</b>
{% for name in files %}
<code>{{ name }}</code>
{% endfor %}
{% for name in directories %}
<code>{{ name }}</code>
{% endfor %}
</p>
<p><b>License:</b> {{ license.spdx }}</p>
{% for copyright in license.copyright.iter() %}
<p><b>Copyright:</b> {{ copyright }}</p>
{% endfor %}
</div>
{% endmatch %}