add tool to collect license metadata from REUSE

This commit is contained in:
Pietro Albini 2022-11-15 10:19:17 +01:00
parent 17ee25d775
commit 13efb20846
No known key found for this signature in database
GPG Key ID: CD76B35F7734769E
10 changed files with 522 additions and 0 deletions

View File

@ -563,6 +563,7 @@ dependencies = [
"libc",
"num-integer",
"num-traits",
"serde",
"time",
"winapi",
]
@ -712,6 +713,16 @@ dependencies = [
"rustc-semver",
]
[[package]]
name = "collect-license-metadata"
version = "0.1.0"
dependencies = [
"anyhow",
"serde",
"serde_json",
"spdx-rs",
]
[[package]]
name = "color-eyre"
version = "0.6.2"
@ -4628,6 +4639,35 @@ dependencies = [
"winapi",
]
[[package]]
name = "spdx-expression"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53d7ac03c67c572d85049d6db815e20a4a19b41b3d5cca732ac582342021ad77"
dependencies = [
"nom",
"serde",
"thiserror",
"tracing",
]
[[package]]
name = "spdx-rs"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3c02f6eb7e7b4100c272f685a9ccaccaab302324e8c7ec3e2ee72340fb29ff3"
dependencies = [
"chrono",
"log",
"nom",
"serde",
"spdx-expression",
"strum",
"strum_macros",
"thiserror",
"uuid",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
@ -4731,6 +4771,25 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "strum"
version = "0.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f"
[[package]]
name = "strum_macros"
version = "0.24.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
dependencies = [
"heck",
"proc-macro2",
"quote",
"rustversion",
"syn",
]
[[package]]
name = "syn"
version = "1.0.102"
@ -5357,6 +5416,15 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8772a4ccbb4e89959023bc5b7cb8623a795caa7092d99f3aa9501b9484d4557d"
[[package]]
name = "uuid"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
dependencies = [
"getrandom 0.2.0",
]
[[package]]
name = "valuable"
version = "0.1.0"

View File

@ -39,6 +39,7 @@ members = [
"src/tools/bump-stage0",
"src/tools/replace-version-placeholder",
"src/tools/lld-wrapper",
"src/tools/collect-license-metadata",
]
exclude = [

View File

@ -753,6 +753,7 @@ macro_rules! describe {
run::BumpStage0,
run::ReplaceVersionPlaceholder,
run::Miri,
run::CollectLicenseMetadata,
),
// These commands either don't use paths, or they're special-cased in Build::build()
Kind::Clean | Kind::Format | Kind::Setup => vec![],

View File

@ -1,3 +1,4 @@
use std::path::PathBuf;
use std::process::Command;
use crate::builder::{Builder, RunConfig, ShouldRun, Step};
@ -189,3 +190,35 @@ fn run(self, builder: &Builder<'_>) {
builder.run(&mut miri);
}
}
/// Bootstrap step that runs the `collect-license-metadata` tool and returns the path of the
/// JSON file the tool was asked to write.
#[derive(Debug, PartialOrd, Ord, Copy, Clone, Hash, PartialEq, Eq)]
pub struct CollectLicenseMetadata;

impl Step for CollectLicenseMetadata {
    type Output = PathBuf;
    const ONLY_HOSTS: bool = true;

    /// The step is selected through the path of the tool's source directory.
    fn should_run(run: ShouldRun<'_>) -> ShouldRun<'_> {
        run.path("src/tools/collect-license-metadata")
    }

    fn make_run(run: RunConfig<'_>) {
        run.builder.ensure(CollectLicenseMetadata);
    }

    /// Invokes the tool, passing the REUSE executable and the output location through the
    /// `REUSE_EXE` and `DEST` environment variables.
    ///
    /// # Panics
    ///
    /// Panics when no REUSE executable is configured.
    fn run(self, builder: &Builder<'_>) -> Self::Output {
        let reuse = match &builder.config.reuse {
            Some(path) => path,
            None => panic!("REUSE is required to collect the license metadata"),
        };

        // Temporary location, it will be moved to src/etc once it's accurate.
        let dest = builder.out.join("license-metadata.json");

        let mut cmd = builder.tool_cmd(Tool::CollectLicenseMetadata);
        cmd.env("REUSE_EXE", reuse).env("DEST", &dest);
        builder.run(&mut cmd);

        dest
    }
}

View File

@ -380,6 +380,7 @@ fn run(self, builder: &Builder<'_>) -> PathBuf {
HtmlChecker, "src/tools/html-checker", "html-checker";
BumpStage0, "src/tools/bump-stage0", "bump-stage0";
ReplaceVersionPlaceholder, "src/tools/replace-version-placeholder", "replace-version-placeholder";
CollectLicenseMetadata, "src/tools/collect-license-metadata", "collect-license-metadata";
);
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, Ord, PartialOrd)]

View File

@ -0,0 +1,10 @@
[package]
name = "collect-license-metadata"
version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1.0.65"
serde = { version = "1.0.147", features = ["derive"] }
serde_json = "1.0.85"
spdx-rs = "0.5.1"

View File

@ -0,0 +1,37 @@
use std::collections::HashMap;
/// Deduplicates [`License`] values, handing out a [`LicenseId`] for each distinct license so
/// that the rest of the tool can copy and compare cheap identifiers instead of whole structs.
pub(crate) struct LicensesInterner {
// Every interned license, stored at the index wrapped by its `LicenseId`.
by_id: Vec<License>,
// Reverse lookup from a license to its index in `by_id`.
by_struct: HashMap<License, usize>,
}
impl LicensesInterner {
pub(crate) fn new() -> Self {
LicensesInterner { by_id: Vec::new(), by_struct: HashMap::new() }
}
pub(crate) fn intern(&mut self, license: License) -> LicenseId {
if let Some(id) = self.by_struct.get(&license) {
LicenseId(*id)
} else {
let id = self.by_id.len();
self.by_id.push(license.clone());
self.by_struct.insert(license, id);
LicenseId(id)
}
}
pub(crate) fn resolve(&self, id: LicenseId) -> &License {
&self.by_id[id.0]
}
}
/// Cheap, copyable handle to a [`License`] stored in a [`LicensesInterner`].
///
/// It wraps the index handed out by `LicensesInterner::intern` and, because of
/// `#[serde(transparent)]`, serializes as that bare integer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize)]
#[serde(transparent)]
pub(crate) struct LicenseId(usize);
/// Licensing metadata attached to one or more files.
#[derive(Clone, Hash, PartialEq, Eq, serde::Serialize)]
pub(crate) struct License {
// License expression, stored as text.
pub(crate) spdx: String,
// Copyright statements, one entry per line of the original copyright text.
pub(crate) copyright: Vec<String>,
}

View File

@ -0,0 +1,30 @@
mod licenses;
mod path_tree;
mod reuse;
use crate::licenses::LicensesInterner;
use anyhow::Error;
use std::path::PathBuf;
fn main() -> Result<(), Error> {
let reuse_exe: PathBuf = std::env::var_os("REUSE_EXE").expect("Missing REUSE_EXE").into();
let dest: PathBuf = std::env::var_os("DEST").expect("Missing DEST").into();
let mut interner = LicensesInterner::new();
let paths = crate::reuse::collect(&reuse_exe, &mut interner)?;
let mut tree = crate::path_tree::build(paths);
tree.simplify();
if let Some(parent) = dest.parent() {
std::fs::create_dir_all(parent)?;
}
std::fs::write(
&dest,
&serde_json::to_vec_pretty(&serde_json::json!({
"files": crate::path_tree::strip_interning(tree, &interner),
}))?,
)?;
Ok(())
}

View File

@ -0,0 +1,292 @@
//! Tools like REUSE output per-file licensing information, but we need to condense it in the
//! minimum amount of data that still represents the same licensing metadata. This module is
//! responsible for that, by turning the list of paths into a tree and executing simplification
//! passes over the tree to remove redundant information.
use crate::licenses::{License, LicenseId, LicensesInterner};
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
/// A node of the licensing tree, generic over how a license is represented (`L`).
///
/// When serialized, each node carries a `type` tag holding the kebab-case name of its variant.
#[derive(serde::Serialize)]
#[serde(rename_all = "kebab-case", tag = "type")]
pub(crate) enum Node<L> {
/// The top of the tree; it only holds other nodes.
Root { childs: Vec<Node<L>> },
/// A directory, optionally carrying a license of its own.
Directory { name: PathBuf, childs: Vec<Node<L>>, license: Option<L> },
/// A single file and its license.
File { name: PathBuf, license: L },
/// Several files sharing the same license.
FileGroup { names: Vec<PathBuf>, license: L },
/// Placeholder left behind by the simplification passes for a node marked for deletion.
Empty,
}
impl Node<LicenseId> {
pub(crate) fn simplify(&mut self) {
self.merge_directories();
self.collapse_in_licensed_directories();
self.merge_directory_licenses();
self.merge_file_groups();
self.remove_empty();
}
/// Initially, trees are built by the build() function with each file practically having a
/// separate directory tree, like so:
///
/// ```text
/// ┌─► ./ ──► compiler/ ──► rustc/ ──► src/ ──► main.rs
/// │
/// <root> ─┼─► ./ ──► compiler/ ──► rustc/ ──► Cargo.toml
/// │
/// └─► ./ ──► library/ ───► std/ ──► Cargo.toml
/// ```
///
/// This pass is responsible for turning that into a proper directory tree:
///
/// ```text
/// ┌─► compiler/ ──► rustc/ ──┬─► src/ ──► main.rs
/// │ │
/// <root> ──► ./ ──┤ └─► Cargo.toml
/// │
/// └─► library/ ───► std/ ──► Cargo.toml
/// ```
fn merge_directories(&mut self) {
match self {
Node::Root { childs } | Node::Directory { childs, license: None, .. } => {
let mut directories = BTreeMap::new();
let mut files = Vec::new();
for child in childs.drain(..) {
match child {
Node::Directory { name, mut childs, license: None } => {
directories.entry(name).or_insert_with(Vec::new).append(&mut childs);
}
file @ Node::File { .. } => {
files.push(file);
}
Node::Empty => {}
Node::Root { .. } => {
panic!("can't have a root inside another element");
}
Node::FileGroup { .. } => {
panic!("FileGroup should not be present at this stage");
}
Node::Directory { license: Some(_), .. } => {
panic!("license should not be set at this stage");
}
}
}
childs.extend(directories.into_iter().map(|(name, childs)| Node::Directory {
name,
childs,
license: None,
}));
childs.append(&mut files);
for child in &mut *childs {
child.merge_directories();
}
}
Node::Empty => {}
Node::File { .. } => {}
Node::FileGroup { .. } => {
panic!("FileGroup should not be present at this stage");
}
Node::Directory { license: Some(_), .. } => {
panic!("license should not be set at this stage");
}
}
}
/// In our codebase, most files in a directory have the same license as the other files in that
/// same directory, so it's redundant to store licensing metadata for all the files. Instead,
/// we can add a license for a whole directory, and only record the exceptions to a directory
/// licensing metadata.
///
/// We cannot instead record only the difference to Rust's standard licensing, as the majority
/// of the files in our repository are *not* licensed under Rust's standard licensing due to
/// our inclusion of LLVM.
fn collapse_in_licensed_directories(&mut self) {
match self {
Node::Directory { childs, license, .. } => {
for child in &mut *childs {
child.collapse_in_licensed_directories();
}
let mut licenses_count = BTreeMap::new();
for child in &*childs {
let Some(license) = child.license() else { continue };
*licenses_count.entry(license).or_insert(0) += 1;
}
let most_popular_license = licenses_count
.into_iter()
.max_by_key(|(_, count)| *count)
.map(|(license, _)| license);
if let Some(most_popular_license) = most_popular_license {
childs.retain(|child| child.license() != Some(most_popular_license));
*license = Some(most_popular_license);
}
}
Node::Root { childs } => {
for child in &mut *childs {
child.collapse_in_licensed_directories();
}
}
Node::File { .. } => {}
Node::FileGroup { .. } => {}
Node::Empty => {}
}
}
/// Reduce the depth of the tree by merging subdirectories with the same license as their
/// parent directory into their parent, and adjusting the paths of the childs accordingly.
fn merge_directory_licenses(&mut self) {
match self {
Node::Root { childs } => {
for child in &mut *childs {
child.merge_directory_licenses();
}
}
Node::Directory { childs, license, .. } => {
let mut to_add = Vec::new();
for child in &mut *childs {
child.merge_directory_licenses();
let Node::Directory {
name: child_name,
childs: child_childs,
license: child_license,
} = child else { continue };
if child_license != license {
continue;
}
for mut child_child in child_childs.drain(..) {
match &mut child_child {
Node::Root { .. } => {
panic!("can't have a root inside another element");
}
Node::FileGroup { .. } => {
panic!("FileGroup should not be present at this stage");
}
Node::Directory { name: child_child_name, .. } => {
*child_child_name = child_name.join(&child_child_name);
}
Node::File { name: child_child_name, .. } => {
*child_child_name = child_name.join(&child_child_name);
}
Node::Empty => {}
}
to_add.push(child_child);
}
*child = Node::Empty;
}
childs.append(&mut to_add);
}
Node::Empty => {}
Node::File { .. } => {}
Node::FileGroup { .. } => {}
}
}
/// This pass groups multiple files in a directory with the same license into a single
/// "FileGroup", so that the license of all those files can be reported as a group.
///
/// Crucially this pass runs after collapse_in_licensed_directories, so the most common license
/// will already be marked as the directory's license and won't be turned into a group.
fn merge_file_groups(&mut self) {
match self {
Node::Root { childs } | Node::Directory { childs, .. } => {
let mut grouped = BTreeMap::new();
for child in &mut *childs {
child.merge_file_groups();
if let Node::File { name, license } = child {
grouped.entry(*license).or_insert_with(Vec::new).push(name.clone());
*child = Node::Empty;
}
}
for (license, mut names) in grouped.into_iter() {
if names.len() == 1 {
childs.push(Node::File { license, name: names.pop().unwrap() });
} else {
childs.push(Node::FileGroup { license, names });
}
}
}
Node::File { .. } => {}
Node::FileGroup { .. } => panic!("FileGroup should not be present at this stage"),
Node::Empty => {}
}
}
/// Some nodes were replaced with Node::Empty to mark them for deletion. As the last step, make
/// sure to remove them from the tree.
fn remove_empty(&mut self) {
match self {
Node::Root { childs } | Node::Directory { childs, .. } => {
for child in &mut *childs {
child.remove_empty();
}
childs.retain(|child| !matches!(child, Node::Empty));
}
Node::FileGroup { .. } => {}
Node::File { .. } => {}
Node::Empty => {}
}
}
/// Returns the license that fully describes this node, if there is one.
///
/// That is the case for files, and for directories that carry a license and have no childs
/// left. Every other node yields `None`.
fn license(&self) -> Option<LicenseId> {
    match self {
        Node::File { license, .. } => Some(*license),
        // An unlicensed empty directory naturally yields `None` here.
        Node::Directory { childs, license, .. } if childs.is_empty() => *license,
        Node::Directory { .. } | Node::Root { .. } | Node::FileGroup { .. } | Node::Empty => {
            None
        }
    }
}
}
/// Builds the initial, unsimplified tree out of a list of file paths and their licenses.
///
/// Every file is wrapped in its own chain of directory nodes (one per component of its parent
/// path) hanging directly off the root; merging those chains is left to `Node::simplify`.
///
/// # Panics
///
/// Panics if a path has no file name.
pub(crate) fn build(mut input: Vec<(PathBuf, LicenseId)>) -> Node<LicenseId> {
    // Ensure reproducibility of all future steps.
    input.sort();

    let childs = input
        .into_iter()
        .map(|(path, license)| {
            let leaf = Node::File { name: path.file_name().unwrap().into(), license };
            let parent = path.parent().unwrap_or_else(|| Path::new("."));

            // Wrap the file from the innermost directory outwards.
            parent.components().rev().fold(leaf, |inner, component| Node::Directory {
                name: component.as_os_str().into(),
                childs: vec![inner],
                license: None,
            })
        })
        .collect();

    Node::Root { childs }
}
/// Converts a tree of interned license ids into the same tree referencing the actual
/// licenses stored in `interner`.
///
/// The structure of the tree is left untouched; only the license payload of each node changes.
pub(crate) fn strip_interning(
    node: Node<LicenseId>,
    interner: &LicensesInterner,
) -> Node<&License> {
    // Shared by the two variants that own child nodes.
    let strip_all = |nodes: Vec<Node<LicenseId>>| {
        nodes.into_iter().map(|child| strip_interning(child, interner)).collect::<Vec<_>>()
    };

    match node {
        Node::Root { childs } => Node::Root { childs: strip_all(childs) },
        Node::Directory { name, childs, license } => Node::Directory {
            name,
            childs: strip_all(childs),
            license: license.map(|id| interner.resolve(id)),
        },
        Node::File { name, license } => Node::File { name, license: interner.resolve(license) },
        Node::FileGroup { names, license } => {
            Node::FileGroup { names, license: interner.resolve(license) }
        }
        Node::Empty => Node::Empty,
    }
}

View File

@ -0,0 +1,49 @@
use crate::licenses::{License, LicenseId, LicensesInterner};
use anyhow::Error;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::time::Instant;
/// Runs REUSE and returns every file it reports, paired with the id of its interned license.
///
/// Progress and timing information is printed to stderr.
///
/// # Errors
///
/// Returns an error if REUSE cannot be executed successfully or if its output cannot be parsed
/// as an SPDX tag-value document.
pub(crate) fn collect(
    reuse_exe: &Path,
    interner: &mut LicensesInterner,
) -> Result<Vec<(PathBuf, LicenseId)>, Error> {
    eprintln!("gathering license information from REUSE");
    let start = Instant::now();
    let raw = obtain_spdx_document(reuse_exe)?;
    eprintln!("finished gathering the license information from REUSE in {:.2?}", start.elapsed());

    let document = spdx_rs::parsers::spdx_from_tag_value(&raw)?;

    let result: Vec<(PathBuf, LicenseId)> = document
        .file_information
        .into_iter()
        .map(|file| {
            let license = interner.intern(License {
                spdx: file.concluded_license.to_string(),
                // One entry per line of the copyright text.
                copyright: file.copyright_text.split('\n').map(String::from).collect(),
            });
            (PathBuf::from(file.file_name), license)
        })
        .collect();

    Ok(result)
}
/// Invokes `reuse spdx` and returns the document it prints on stdout.
///
/// # Errors
///
/// Returns an error if the process cannot be spawned, exits unsuccessfully (a hint about the
/// required REUSE features is printed to stderr in that case) or emits invalid UTF-8.
fn obtain_spdx_document(reuse_exe: &Path) -> Result<String, Error> {
    let mut command = Command::new(reuse_exe);
    command.args(["spdx", "--add-license-concluded", "--creator-person=bors"]);
    // Only stdout is captured; the other streams keep their default configuration.
    command.stdout(Stdio::piped());

    let output = command.spawn()?.wait_with_output()?;
    if output.status.success() {
        return Ok(String::from_utf8(output.stdout)?);
    }

    let hint = [
        "",
        "Note that Rust requires some REUSE features that might not be present in the",
        "release you're using. Make sure your REUSE release includes these PRs:",
        "",
        " - https://github.com/fsfe/reuse-tool/pull/623",
        "",
    ];
    for line in hint {
        eprintln!("{}", line);
    }
    anyhow::bail!("collecting licensing information with REUSE failed");
}