From 6f228e3420787f123aff9b06247fefcb91a941ed Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote
Date: Fri, 16 Jun 2023 15:17:01 +1000
Subject: [PATCH] Inline before merging CGUs.

Because CGU merging relies on CGU sizes, but the CGU sizes before
inlining aren't accurate.

This requires tweaking how the sizes are updated during merging: if CGU
A and B both have an inlined function F, then `size(A + B)` will be a
little less than `size(A) + size(B)`, because `A + B` will only have one
copy of F. Also, the minimum CGU size is increased because it now has to
account for inlined functions.

This change doesn't have much effect on compile perf, but it makes
follow-on changes that involve more sophisticated reasoning about CGU
sizes much easier.
---
 compiler/rustc_middle/src/mir/mono.rs      |  4 ---
 .../rustc_monomorphize/src/partitioning.rs | 26 ++++++++++---------
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs
index 1511e255235..1cbf1a36283 100644
--- a/compiler/rustc_middle/src/mir/mono.rs
+++ b/compiler/rustc_middle/src/mir/mono.rs
@@ -335,10 +335,6 @@ impl<'tcx> CodegenUnit<'tcx> {
             .expect("create_size_estimate must be called before getting a size_estimate")
     }
 
-    pub fn modify_size_estimate(&mut self, delta: usize) {
-        *self.size_estimate.as_mut().unwrap() += delta;
-    }
-
     pub fn contains_item(&self, item: &MonoItem<'tcx>) -> bool {
         self.items().contains_key(item)
     }
diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index cc4ea16cf50..3be0908fc7f 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -166,15 +166,6 @@ where
         placed
     };
 
-    // Merge until we have at most `max_cgu_count` codegen units.
-    // `merge_codegen_units` is responsible for updating the CGU size
-    // estimates.
-    {
-        let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus");
-        merge_codegen_units(cx, &mut codegen_units);
-        debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats);
-    }
-
     // Use the usage map to put additional mono items in each codegen unit:
     // drop-glue, functions from external crates, and local functions the
     // definition of which is marked with `#[inline]`.
@@ -189,6 +180,15 @@ where
         debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats);
     }
 
+    // Merge until we have at most `max_cgu_count` codegen units.
+    // `merge_codegen_units` is responsible for updating the CGU size
+    // estimates.
+    {
+        let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus");
+        merge_codegen_units(cx, &mut codegen_units);
+        debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats);
+    }
+
     // Make as many symbols "internal" as possible, so LLVM has more freedom to
     // optimize.
     if !tcx.sess.link_dead_code() {
@@ -313,7 +313,7 @@ fn merge_codegen_units<'tcx>(
     // worse generated code. So we don't allow CGUs smaller than this (unless
     // there is just one CGU, of course). Note that CGU sizes of 100,000+ are
     // common in larger programs, so this isn't all that large.
-    const NON_INCR_MIN_CGU_SIZE: usize = 1000;
+    const NON_INCR_MIN_CGU_SIZE: usize = 1800;
 
     // Repeatedly merge the two smallest codegen units as long as:
     // - we have more CGUs than the upper limit, or
@@ -337,9 +337,11 @@ fn merge_codegen_units<'tcx>(
         let mut smallest = codegen_units.pop().unwrap();
         let second_smallest = codegen_units.last_mut().unwrap();
 
-        // Move the mono-items from `smallest` to `second_smallest`
-        second_smallest.modify_size_estimate(smallest.size_estimate());
+        // Move the items from `smallest` to `second_smallest`. Some of them
+        // may be duplicate inlined items, in which case the destination CGU is
+        // unaffected. Recalculate size estimates afterwards.
         second_smallest.items_mut().extend(smallest.items_mut().drain());
+        second_smallest.create_size_estimate(cx.tcx);
 
         // Record that `second_smallest` now contains all the stuff that was
         // in `smallest` before.
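
A minimal standalone sketch of the size arithmetic from the commit message (the
`Cgu` type and the item sizes below are invented for illustration, not rustc's
types): when CGUs A and B both contain an inlined function F, recomputing the
estimate from the merged item set counts the shared copy of F only once, so
`size(A + B) < size(A) + size(B)`.

    // Standalone sketch, not rustc code: `Cgu` and the sizes are made up.
    use std::collections::HashMap;

    #[derive(Default)]
    struct Cgu {
        items: HashMap<&'static str, usize>, // item name -> estimated size
    }

    impl Cgu {
        // Analogue of recomputing the estimate from the item set: a shared
        // item cannot be counted twice because the map holds it only once.
        fn size_estimate(&self) -> usize {
            self.items.values().sum()
        }

        // Merge `other` into `self`; duplicate inlined items collapse.
        fn merge(&mut self, other: Cgu) {
            self.items.extend(other.items);
        }
    }

    fn main() {
        let mut a = Cgu::default();
        a.items.insert("root_a", 500);
        a.items.insert("inlined_f", 200); // F inlined into A

        let mut b = Cgu::default();
        b.items.insert("root_b", 300);
        b.items.insert("inlined_f", 200); // F inlined into B as well

        let naive_sum = a.size_estimate() + b.size_estimate(); // 1200
        a.merge(b);
        // Recomputed size is 1000: the shared copy of F is counted once.
        assert!(a.size_estimate() < naive_sum);
        println!("merged = {}, naive sum = {}", a.size_estimate(), naive_sum);
    }

This is the same reason the patch drops `modify_size_estimate` (which simply
added the two old estimates) and instead calls `create_size_estimate` on the
destination CGU after the items have been moved across.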