diff --git a/Cargo.lock b/Cargo.lock index bdbafb8623b..35a9fdccf9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -722,6 +722,18 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +[[package]] +name = "coverage-dump" +version = "0.1.0" +dependencies = [ + "anyhow", + "leb128", + "md-5", + "miniz_oxide", + "regex", + "rustc-demangle", +] + [[package]] name = "coverage_test_macros" version = "0.0.0" @@ -2041,6 +2053,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "leb128" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67" + [[package]] name = "levenshtein" version = "1.0.5" diff --git a/Cargo.toml b/Cargo.toml index d2e84d5426f..9b11ae8744b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ members = [ "src/tools/generate-windows-sys", "src/tools/rustdoc-gui-test", "src/tools/opt-dist", + "src/tools/coverage-dump", ] exclude = [ diff --git a/src/bootstrap/builder.rs b/src/bootstrap/builder.rs index b3666192853..73c0a192cef 100644 --- a/src/bootstrap/builder.rs +++ b/src/bootstrap/builder.rs @@ -703,7 +703,8 @@ macro_rules! describe { llvm::Lld, llvm::CrtBeginEnd, tool::RustdocGUITest, - tool::OptimizedDist + tool::OptimizedDist, + tool::CoverageDump, ), Kind::Check | Kind::Clippy | Kind::Fix => describe!( check::Std, diff --git a/src/bootstrap/tool.rs b/src/bootstrap/tool.rs index 07ff3da6b4a..f094dd9d7c9 100644 --- a/src/bootstrap/tool.rs +++ b/src/bootstrap/tool.rs @@ -306,6 +306,7 @@ fn run(self, builder: &Builder<'_>) -> PathBuf { GenerateWindowsSys, "src/tools/generate-windows-sys", "generate-windows-sys"; RustdocGUITest, "src/tools/rustdoc-gui-test", "rustdoc-gui-test", is_unstable_tool = true, allow_features = "test"; OptimizedDist, "src/tools/opt-dist", "opt-dist"; + CoverageDump, "src/tools/coverage-dump", "coverage-dump"; ); #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, Ord, PartialOrd)] diff --git a/src/tools/coverage-dump/Cargo.toml b/src/tools/coverage-dump/Cargo.toml new file mode 100644 index 00000000000..7f14286b5d0 --- /dev/null +++ b/src/tools/coverage-dump/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "coverage-dump" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1.0.71" +leb128 = "0.2.5" +md5 = { package = "md-5" , version = "0.10.5" } +miniz_oxide = "0.7.1" +regex = "1.8.4" +rustc-demangle = "0.1.23" diff --git a/src/tools/coverage-dump/README.md b/src/tools/coverage-dump/README.md new file mode 100644 index 00000000000..e2625d5adf2 --- /dev/null +++ b/src/tools/coverage-dump/README.md @@ -0,0 +1,8 @@ +This tool extracts coverage mapping information from an LLVM IR assembly file +(`.ll`), and prints it in a more human-readable form that can be used for +snapshot tests. + +The output format is mostly arbitrary, so it's OK to change the output as long +as any affected tests are also re-blessed. However, the output should be +consistent across different executions on different platforms, so avoid +printing any information that is platform-specific or non-deterministic. diff --git a/src/tools/coverage-dump/src/covfun.rs b/src/tools/coverage-dump/src/covfun.rs new file mode 100644 index 00000000000..3a5866dea3e --- /dev/null +++ b/src/tools/coverage-dump/src/covfun.rs @@ -0,0 +1,296 @@ +use crate::parser::{unescape_llvm_string_contents, Parser}; +use anyhow::{anyhow, Context}; +use regex::Regex; +use std::collections::HashMap; +use std::fmt::{self, Debug, Write as _}; +use std::sync::OnceLock; + +pub(crate) fn dump_covfun_mappings( + llvm_ir: &str, + function_names: &HashMap, +) -> anyhow::Result<()> { + // Extract function coverage entries from the LLVM IR assembly, and associate + // each entry with its (demangled) name. + let mut covfun_entries = llvm_ir + .lines() + .filter_map(covfun_line_data) + .map(|line_data| (function_names.get(&line_data.name_hash).map(String::as_str), line_data)) + .collect::>(); + covfun_entries.sort_by(|a, b| { + // Sort entries primarily by name, to help make the order consistent + // across platforms and relatively insensitive to changes. + // (Sadly we can't use `sort_by_key` because we would need to return references.) + Ord::cmp(&a.0, &b.0) + .then_with(|| Ord::cmp(&a.1.is_used, &b.1.is_used)) + .then_with(|| Ord::cmp(a.1.payload.as_slice(), b.1.payload.as_slice())) + }); + + for (name, line_data) in &covfun_entries { + let name = name.unwrap_or("(unknown)"); + let unused = if line_data.is_used { "" } else { " (unused)" }; + println!("Function name: {name}{unused}"); + + let payload: &[u8] = &line_data.payload; + println!("Raw bytes ({len}): 0x{payload:02x?}", len = payload.len()); + + let mut parser = Parser::new(payload); + + let num_files = parser.read_uleb128_u32()?; + println!("Number of files: {num_files}"); + + for i in 0..num_files { + let global_file_id = parser.read_uleb128_u32()?; + println!("- file {i} => global file {global_file_id}"); + } + + let num_expressions = parser.read_uleb128_u32()?; + println!("Number of expressions: {num_expressions}"); + + let mut expression_resolver = ExpressionResolver::new(); + for i in 0..num_expressions { + let lhs = parser.read_simple_term()?; + let rhs = parser.read_simple_term()?; + println!("- expression {i} operands: lhs = {lhs:?}, rhs = {rhs:?}"); + expression_resolver.push_operands(lhs, rhs); + } + + for i in 0..num_files { + let num_mappings = parser.read_uleb128_u32()?; + println!("Number of file {i} mappings: {num_mappings}"); + + for _ in 0..num_mappings { + let (kind, region) = parser.read_mapping_kind_and_region()?; + println!("- {kind:?} at {region:?}"); + + match kind { + // Also print expression mappings in resolved form. + MappingKind::Code(term @ CovTerm::Expression { .. }) + | MappingKind::Gap(term @ CovTerm::Expression { .. }) => { + println!(" = {}", expression_resolver.format_term(term)); + } + // If the mapping is a branch region, print both of its arms + // in resolved form (even if they aren't expressions). + MappingKind::Branch { r#true, r#false } => { + println!(" true = {}", expression_resolver.format_term(r#true)); + println!(" false = {}", expression_resolver.format_term(r#false)); + } + _ => (), + } + } + } + + parser.ensure_empty()?; + println!(); + } + Ok(()) +} + +struct CovfunLineData { + name_hash: u64, + is_used: bool, + payload: Vec, +} + +/// Checks a line of LLVM IR assembly to see if it contains an `__llvm_covfun` +/// entry, and if so extracts relevant data in a `CovfunLineData`. +fn covfun_line_data(line: &str) -> Option { + let re = { + // We cheat a little bit and match variable names `@__covrec_[HASH]u` + // rather than the section name, because the section name is harder to + // extract and differs across Linux/Windows/macOS. We also extract the + // symbol name hash from the variable name rather than the data, since + // it's easier and both should match. + static RE: OnceLock = OnceLock::new(); + RE.get_or_init(|| { + Regex::new( + r#"^@__covrec_(?[0-9A-Z]+)(?u)? = .*\[[0-9]+ x i8\] c"(?[^"]*)".*$"#, + ) + .unwrap() + }) + }; + + let captures = re.captures(line)?; + let name_hash = u64::from_str_radix(&captures["name_hash"], 16).unwrap(); + let is_used = captures.name("is_used").is_some(); + let payload = unescape_llvm_string_contents(&captures["payload"]); + + Some(CovfunLineData { name_hash, is_used, payload }) +} + +// Extra parser methods only needed when parsing `covfun` payloads. +impl<'a> Parser<'a> { + fn read_simple_term(&mut self) -> anyhow::Result { + let raw_term = self.read_uleb128_u32()?; + CovTerm::decode(raw_term).context("decoding term") + } + + fn read_mapping_kind_and_region(&mut self) -> anyhow::Result<(MappingKind, MappingRegion)> { + let mut kind = self.read_raw_mapping_kind()?; + let mut region = self.read_raw_mapping_region()?; + + const HIGH_BIT: u32 = 1u32 << 31; + if region.end_column & HIGH_BIT != 0 { + region.end_column &= !HIGH_BIT; + kind = match kind { + MappingKind::Code(term) => MappingKind::Gap(term), + // LLVM's coverage mapping reader will actually handle this + // case without complaint, but the result is almost certainly + // a meaningless implementation artifact. + _ => return Err(anyhow!("unexpected base kind for gap region: {kind:?}")), + } + } + + Ok((kind, region)) + } + + fn read_raw_mapping_kind(&mut self) -> anyhow::Result { + let raw_mapping_kind = self.read_uleb128_u32()?; + if let Some(term) = CovTerm::decode(raw_mapping_kind) { + return Ok(MappingKind::Code(term)); + } + + assert_eq!(raw_mapping_kind & 0b11, 0); + assert_ne!(raw_mapping_kind, 0); + + let (high, is_expansion) = (raw_mapping_kind >> 3, raw_mapping_kind & 0b100 != 0); + if is_expansion { + Ok(MappingKind::Expansion(high)) + } else { + match high { + 0 => unreachable!("zero kind should have already been handled as a code mapping"), + 2 => Ok(MappingKind::Skip), + 4 => { + let r#true = self.read_simple_term()?; + let r#false = self.read_simple_term()?; + Ok(MappingKind::Branch { r#true, r#false }) + } + _ => Err(anyhow!("unknown mapping kind: {raw_mapping_kind:#x}")), + } + } + } + + fn read_raw_mapping_region(&mut self) -> anyhow::Result { + let start_line_offset = self.read_uleb128_u32()?; + let start_column = self.read_uleb128_u32()?; + let end_line_offset = self.read_uleb128_u32()?; + let end_column = self.read_uleb128_u32()?; + Ok(MappingRegion { start_line_offset, start_column, end_line_offset, end_column }) + } +} + +/// Enum that can hold a constant zero value, the ID of an physical coverage +/// counter, or the ID (and operation) of a coverage-counter expression. +/// +/// Terms are used as the operands of coverage-counter expressions, as the arms +/// of branch mappings, and as the value of code/gap mappings. +#[derive(Clone, Copy, Debug)] +pub(crate) enum CovTerm { + Zero, + Counter(u32), + Expression(u32, Op), +} + +/// Operator (addition or subtraction) used by an expression. +#[derive(Clone, Copy, Debug)] +pub(crate) enum Op { + Sub, + Add, +} + +impl CovTerm { + pub(crate) fn decode(input: u32) -> Option { + let (high, tag) = (input >> 2, input & 0b11); + match tag { + 0b00 if high == 0 => Some(Self::Zero), + 0b01 => Some(Self::Counter(high)), + 0b10 => Some(Self::Expression(high, Op::Sub)), + 0b11 => Some(Self::Expression(high, Op::Add)), + // When reading expression operands or branch arms, the LLVM coverage + // mapping reader will always interpret a `0b00` tag as a zero + // term, even when the high bits are non-zero. + // We treat that case as failure instead, so that this code can be + // shared by the full mapping-kind reader as well. + _ => None, + } + } +} + +#[derive(Debug)] +enum MappingKind { + Code(CovTerm), + Gap(CovTerm), + Expansion(u32), + Skip, + // Using raw identifiers here makes the dump output a little bit nicer + // (via the derived Debug), at the expense of making this tool's source + // code a little bit uglier. + Branch { r#true: CovTerm, r#false: CovTerm }, +} + +struct MappingRegion { + /// Offset of this region's start line, relative to the *start line* of + /// the *previous mapping* (or 0). Line numbers are 1-based. + start_line_offset: u32, + /// This region's start column, absolute and 1-based. + start_column: u32, + /// Offset of this region's end line, relative to the *this mapping's* + /// start line. Line numbers are 1-based. + end_line_offset: u32, + /// This region's end column, absolute, 1-based, and exclusive. + /// + /// If the highest bit is set, that bit is cleared and the associated + /// mapping becomes a gap region mapping. + end_column: u32, +} + +impl Debug for MappingRegion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "(prev + {}, {}) to (start + {}, {})", + self.start_line_offset, self.start_column, self.end_line_offset, self.end_column + ) + } +} + +/// Helper type that prints expressions in a "resolved" form, so that +/// developers reading the dump don't need to resolve expressions by hand. +struct ExpressionResolver { + operands: Vec<(CovTerm, CovTerm)>, +} + +impl ExpressionResolver { + fn new() -> Self { + Self { operands: Vec::new() } + } + + fn push_operands(&mut self, lhs: CovTerm, rhs: CovTerm) { + self.operands.push((lhs, rhs)); + } + + fn format_term(&self, term: CovTerm) -> String { + let mut output = String::new(); + self.write_term(&mut output, term); + output + } + + fn write_term(&self, output: &mut String, term: CovTerm) { + match term { + CovTerm::Zero => output.push_str("Zero"), + CovTerm::Counter(id) => write!(output, "c{id}").unwrap(), + CovTerm::Expression(id, op) => { + let (lhs, rhs) = self.operands[id as usize]; + let op = match op { + Op::Sub => "-", + Op::Add => "+", + }; + + output.push('('); + self.write_term(output, lhs); + write!(output, " {op} ").unwrap(); + self.write_term(output, rhs); + output.push(')'); + } + } + } +} diff --git a/src/tools/coverage-dump/src/main.rs b/src/tools/coverage-dump/src/main.rs new file mode 100644 index 00000000000..93fed1799e0 --- /dev/null +++ b/src/tools/coverage-dump/src/main.rs @@ -0,0 +1,17 @@ +mod covfun; +mod parser; +mod prf_names; + +fn main() -> anyhow::Result<()> { + use anyhow::Context as _; + + let args = std::env::args().collect::>(); + + let llvm_ir_path = args.get(1).context("LLVM IR file not specified")?; + let llvm_ir = std::fs::read_to_string(llvm_ir_path).context("couldn't read LLVM IR file")?; + + let function_names = crate::prf_names::make_function_names_table(&llvm_ir)?; + crate::covfun::dump_covfun_mappings(&llvm_ir, &function_names)?; + + Ok(()) +} diff --git a/src/tools/coverage-dump/src/parser.rs b/src/tools/coverage-dump/src/parser.rs new file mode 100644 index 00000000000..eefac1a4f94 --- /dev/null +++ b/src/tools/coverage-dump/src/parser.rs @@ -0,0 +1,80 @@ +#[cfg(test)] +mod tests; + +use anyhow::ensure; +use regex::bytes; +use std::sync::OnceLock; + +/// Given the raw contents of a string literal in LLVM IR assembly, decodes any +/// backslash escapes and returns a vector containing the resulting byte string. +pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec { + let escape_re = { + static RE: OnceLock = OnceLock::new(); + // LLVM IR supports two string escapes: `\\` and `\xx`. + RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Za-z]{2})").unwrap()) + }; + + fn u8_from_hex_digits(digits: &[u8]) -> u8 { + // We know that the input contains exactly 2 hex digits, so these calls + // should never fail. + assert_eq!(digits.len(), 2); + let digits = std::str::from_utf8(digits).unwrap(); + u8::from_str_radix(digits, 16).unwrap() + } + + escape_re + .replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| { + let byte = match captures.get(1) { + None => b'\\', + Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()), + }; + [byte] + }) + .into_owned() +} + +pub(crate) struct Parser<'a> { + rest: &'a [u8], +} + +impl<'a> Parser<'a> { + pub(crate) fn new(input: &'a [u8]) -> Self { + Self { rest: input } + } + + pub(crate) fn ensure_empty(self) -> anyhow::Result<()> { + ensure!(self.rest.is_empty(), "unparsed bytes: 0x{:02x?}", self.rest); + Ok(()) + } + + pub(crate) fn read_n_bytes(&mut self, n: usize) -> anyhow::Result<&'a [u8]> { + ensure!(n <= self.rest.len()); + + let (bytes, rest) = self.rest.split_at(n); + self.rest = rest; + Ok(bytes) + } + + pub(crate) fn read_uleb128_u32(&mut self) -> anyhow::Result { + self.read_uleb128_u64_and_convert() + } + + pub(crate) fn read_uleb128_usize(&mut self) -> anyhow::Result { + self.read_uleb128_u64_and_convert() + } + + fn read_uleb128_u64_and_convert(&mut self) -> anyhow::Result + where + T: TryFrom + 'static, + T::Error: std::error::Error + Send + Sync, + { + let mut temp_rest = self.rest; + let raw_value: u64 = leb128::read::unsigned(&mut temp_rest)?; + let converted_value = T::try_from(raw_value)?; + + // Only update `self.rest` if the above steps succeeded, so that the + // parser position can be used for error reporting if desired. + self.rest = temp_rest; + Ok(converted_value) + } +} diff --git a/src/tools/coverage-dump/src/parser/tests.rs b/src/tools/coverage-dump/src/parser/tests.rs new file mode 100644 index 00000000000..a673606b9c4 --- /dev/null +++ b/src/tools/coverage-dump/src/parser/tests.rs @@ -0,0 +1,38 @@ +use super::unescape_llvm_string_contents; + +// WARNING: These tests don't necessarily run in CI, and were mainly used to +// help track down problems when originally developing this tool. +// (The tool is still tested indirectly by snapshot tests that rely on it.) + +// Tests for `unescape_llvm_string_contents`: + +#[test] +fn unescape_empty() { + assert_eq!(unescape_llvm_string_contents(""), &[]); +} + +#[test] +fn unescape_noop() { + let input = "The quick brown fox jumps over the lazy dog."; + assert_eq!(unescape_llvm_string_contents(input), input.as_bytes()); +} + +#[test] +fn unescape_backslash() { + let input = r"\\Hello\\world\\"; + assert_eq!(unescape_llvm_string_contents(input), r"\Hello\world\".as_bytes()); +} + +#[test] +fn unescape_hex() { + let input = r"\01\02\03\04\0a\0b\0C\0D\fd\fE\FF"; + let expected: &[u8] = &[0x01, 0x02, 0x03, 0x04, 0x0a, 0x0b, 0x0c, 0x0d, 0xfd, 0xfe, 0xff]; + assert_eq!(unescape_llvm_string_contents(input), expected); +} + +#[test] +fn unescape_mixed() { + let input = r"\\01.\5c\5c"; + let expected: &[u8] = br"\01.\\"; + assert_eq!(unescape_llvm_string_contents(input), expected); +} diff --git a/src/tools/coverage-dump/src/prf_names.rs b/src/tools/coverage-dump/src/prf_names.rs new file mode 100644 index 00000000000..d3f7b819e48 --- /dev/null +++ b/src/tools/coverage-dump/src/prf_names.rs @@ -0,0 +1,87 @@ +use crate::parser::{unescape_llvm_string_contents, Parser}; +use anyhow::{anyhow, ensure}; +use regex::Regex; +use std::collections::HashMap; +use std::sync::OnceLock; + +/// Scans through the contents of an LLVM IR assembly file to find `__llvm_prf_names` +/// entries, decodes them, and creates a table that maps name hash values to +/// (demangled) function names. +pub(crate) fn make_function_names_table(llvm_ir: &str) -> anyhow::Result> { + fn prf_names_payload(line: &str) -> Option<&str> { + let re = { + // We cheat a little bit and match the variable name `@__llvm_prf_nm` + // rather than the section name, because the section name is harder + // to extract and differs across Linux/Windows/macOS. + static RE: OnceLock = OnceLock::new(); + RE.get_or_init(|| { + Regex::new(r#"^@__llvm_prf_nm =.*\[[0-9]+ x i8\] c"([^"]*)".*$"#).unwrap() + }) + }; + + let payload = re.captures(line)?.get(1).unwrap().as_str(); + Some(payload) + } + + /// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to + /// 64 bits as a way to associate data stored in different tables/sections. + fn truncated_md5(bytes: &[u8]) -> u64 { + use md5::{Digest, Md5}; + let mut hasher = Md5::new(); + hasher.update(bytes); + let hash: [u8; 8] = hasher.finalize().as_slice()[..8].try_into().unwrap(); + // The truncated hash is explicitly little-endian, regardless of host + // or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.) + u64::from_le_bytes(hash) + } + + fn demangle_if_able(symbol_name_bytes: &[u8]) -> anyhow::Result { + // In practice, raw symbol names should always be ASCII. + let symbol_name_str = std::str::from_utf8(symbol_name_bytes)?; + match rustc_demangle::try_demangle(symbol_name_str) { + Ok(d) => Ok(format!("{d:#}")), + // If demangling failed, don't treat it as an error. This lets us + // run the dump tool against non-Rust coverage maps produced by + // `clang`, for testing purposes. + Err(_) => Ok(format!("(couldn't demangle) {symbol_name_str}")), + } + } + + let mut map = HashMap::new(); + + for payload in llvm_ir.lines().filter_map(prf_names_payload).map(unescape_llvm_string_contents) + { + let mut parser = Parser::new(&payload); + let uncompressed_len = parser.read_uleb128_usize()?; + let compressed_len = parser.read_uleb128_usize()?; + + let uncompressed_bytes_vec; + let uncompressed_bytes: &[u8] = if compressed_len == 0 { + // The symbol name bytes are uncompressed, so read them directly. + parser.read_n_bytes(uncompressed_len)? + } else { + // The symbol name bytes are compressed, so read and decompress them. + let compressed_bytes = parser.read_n_bytes(compressed_len)?; + + uncompressed_bytes_vec = miniz_oxide::inflate::decompress_to_vec_zlib_with_limit( + compressed_bytes, + uncompressed_len, + ) + .map_err(|e| anyhow!("{e:?}"))?; + ensure!(uncompressed_bytes_vec.len() == uncompressed_len); + + &uncompressed_bytes_vec + }; + + // Symbol names in the payload are separated by `0x01` bytes. + for raw_name in uncompressed_bytes.split(|&b| b == 0x01) { + let hash = truncated_md5(raw_name); + let demangled = demangle_if_able(raw_name)?; + map.insert(hash, demangled); + } + + parser.ensure_empty()?; + } + + Ok(map) +}