[dart/vm] Improved inlining methods and heuristics

Rationale:
Previous method cached graph information (instruction and call site
counts) on a per-function level, not accounting for potential
specializations. The improved method runs an extra constant folding
pass, and only caches per-function information for non-specialized
cases. As a result, we inling much better, see for example, the
added test as illustration.

Since we no longer cache for constants, compile-time may be increased
a bit due to the extra scan. In the long run we should consider
for common constant "situations" as the call site.

https://github.com/dart-lang/sdk/issues/36880

Change-Id: I19f007c7f1860ad0ea88fafb38695dc154189ad5
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/105460
Commit-Queue: Aart Bik <ajcbik@google.com>
Reviewed-by: Martin Kustermann <kustermann@google.com>
Reviewed-by: Alexander Markov <alexmarkov@google.com>
diff --git a/runtime/vm/compiler/backend/constant_propagator.cc b/runtime/vm/compiler/backend/constant_propagator.cc
index dc78a0e..a3f6a6e 100644
--- a/runtime/vm/compiler/backend/constant_propagator.cc
+++ b/runtime/vm/compiler/backend/constant_propagator.cc
@@ -1453,7 +1453,8 @@
 static void RemovePushArguments(StaticCallInstr* call) {
   for (intptr_t i = 0; i < call->ArgumentCount(); ++i) {
     PushArgumentInstr* push = call->PushArgumentAt(i);
-    ASSERT(push->input_use_list() == nullptr);
+    ASSERT(push->input_use_list() == nullptr);           // no direct uses
+    push->ReplaceUsesWith(push->value()->definition());  // cleanup env uses
     push->RemoveFromGraph();
   }
 }
diff --git a/runtime/vm/compiler/backend/inliner.cc b/runtime/vm/compiler/backend/inliner.cc
index 5235005..6dd87b1 100644
--- a/runtime/vm/compiler/backend/inliner.cc
+++ b/runtime/vm/compiler/backend/inliner.cc
@@ -779,6 +779,8 @@
   };
 
   // Inlining heuristics based on Cooper et al. 2008.
+  // TODO(ajcbik): with the better specialized computation of counts,
+  //               do we still want to special-case const_arg_count here?
   InliningDecision ShouldWeInline(const Function& callee,
                                   intptr_t instr_count,
                                   intptr_t call_site_count,
@@ -955,11 +957,19 @@
       return false;
     }
 
+    // Apply early heuristics. For a specialized case
+    // (constants_arg_counts > 0), don't use a previously
+    // estimate of the call site and instruction counts.
+    // Note that at this point, optional constant parameters
+    // are not counted yet, which makes this decision approximate.
     GrowableArray<Value*>* arguments = call_data->arguments;
-    const intptr_t constant_arguments = CountConstants(*arguments);
+    const intptr_t constant_arg_count = CountConstants(*arguments);
+    const intptr_t instruction_count =
+        constant_arg_count == 0 ? function.optimized_instruction_count() : 0;
+    const intptr_t call_site_count =
+        constant_arg_count == 0 ? function.optimized_call_site_count() : 0;
     InliningDecision decision = ShouldWeInline(
-        function, function.optimized_instruction_count(),
-        function.optimized_call_site_count(), constant_arguments);
+        function, instruction_count, call_site_count, constant_arg_count);
     if (!decision.value) {
       TRACE_INLINING(
           THR_Print("     Bailout: early heuristics (%s) with "
@@ -967,9 +977,8 @@
                     "call sites: %" Pd ", "
                     "inlining depth of callee: %d, "
                     "const args: %" Pd "\n",
-                    decision.reason, function.optimized_instruction_count(),
-                    function.optimized_call_site_count(),
-                    function.inlining_depth(), constant_arguments));
+                    decision.reason, instruction_count, call_site_count,
+                    function.inlining_depth(), constant_arg_count));
       PRINT_INLINING_TREE("Early heuristic", &call_data->caller, &function,
                           call_data->call);
       return false;
@@ -1193,26 +1202,30 @@
           printer.PrintBlocks();
         }
 
-        // Collect information about the call site and caller graph.
-        // TODO(zerny): Do this after CP and dead code elimination.
+        // Collect information about the call site and caller graph. At this
+        // point, optional constant parameters are counted too, making the
+        // specialized vs. non-specialized decision accurate.
         intptr_t constants_count = 0;
-        for (intptr_t i = 0; i < param_stubs->length(); ++i) {
+        for (intptr_t i = 0, n = param_stubs->length(); i < n; ++i) {
           if ((*param_stubs)[i]->IsConstant()) ++constants_count;
         }
-
-        FlowGraphInliner::CollectGraphInfo(callee_graph);
-        const intptr_t size = function.optimized_instruction_count();
-        const intptr_t call_site_count = function.optimized_call_site_count();
+        intptr_t instruction_count = 0;
+        intptr_t call_site_count = 0;
+        FlowGraphInliner::CollectGraphInfo(callee_graph, constants_count,
+                                           /*force*/ false, &instruction_count,
+                                           &call_site_count);
 
         // Use heuristics do decide if this call should be inlined.
-        InliningDecision decision =
-            ShouldWeInline(function, size, call_site_count, constants_count);
+        InliningDecision decision = ShouldWeInline(
+            function, instruction_count, call_site_count, constants_count);
         if (!decision.value) {
           // If size is larger than all thresholds, don't consider it again.
-          if ((size > FLAG_inlining_size_threshold) &&
+          if ((instruction_count > FLAG_inlining_size_threshold) &&
               (call_site_count > FLAG_inlining_callee_call_sites_threshold) &&
-              (size > FLAG_inlining_constant_arguments_min_size_threshold) &&
-              (size > FLAG_inlining_constant_arguments_max_size_threshold)) {
+              (instruction_count >
+               FLAG_inlining_constant_arguments_min_size_threshold) &&
+              (instruction_count >
+               FLAG_inlining_constant_arguments_max_size_threshold)) {
             function.set_is_inlinable(false);
           }
           TRACE_INLINING(
@@ -1221,7 +1234,7 @@
                         "call sites: %" Pd ", "
                         "inlining depth of callee: %d, "
                         "const args: %" Pd "\n",
-                        decision.reason, size, call_site_count,
+                        decision.reason, instruction_count, call_site_count,
                         function.inlining_depth(), constants_count));
           PRINT_INLINING_TREE("Heuristic fail", &call_data->caller, &function,
                               call_data->call);
@@ -1231,6 +1244,7 @@
         // If requested, a stricter heuristic is applied to this inlining. This
         // heuristic always scans the method (rather than possibly reusing
         // cached results) to make sure all specializations are accounted for.
+        // TODO(ajcbik): with the now better bookkeeping, explore removing this
         if (stricter_heuristic) {
           if (!IsSmallLeaf(callee_graph)) {
             TRACE_INLINING(
@@ -1254,7 +1268,7 @@
 
         // Build succeeded so we restore the bailout jump.
         inlined_ = true;
-        inlined_size_ += size;
+        inlined_size_ += instruction_count;
         if (is_recursive_call) {
           inlined_recursive_call_ = true;
         }
@@ -1284,8 +1298,7 @@
         TRACE_INLINING(THR_Print("     Success\n"));
         TRACE_INLINING(THR_Print(
             "       with reason %s, code size %" Pd ", call sites: %" Pd "\n",
-            decision.reason, function.optimized_instruction_count(),
-            call_site_count));
+            decision.reason, instruction_count, call_site_count));
         PRINT_INLINING_TREE(NULL, &call_data->caller, &function, call);
         return true;
       } else {
@@ -1481,6 +1494,10 @@
   }
 
   bool InlineClosureCalls() {
+    // Under this flag, tear off testing closure calls appear before the
+    // StackOverflowInstr, which breaks assertions in our compiler when inlined.
+    // TODO(sjindel): move testing closure calls after first check
+    if (FLAG_enable_testing_pragmas) return false;  // keep all closures
     bool inlined = false;
     const GrowableArray<CallSites::ClosureCallInfo>& call_info =
         inlining_call_sites_->closure_calls();
@@ -2237,15 +2254,37 @@
       speculative_policy_(speculative_policy),
       precompiler_(precompiler) {}
 
-void FlowGraphInliner::CollectGraphInfo(FlowGraph* flow_graph, bool force) {
+void FlowGraphInliner::CollectGraphInfo(FlowGraph* flow_graph,
+                                        intptr_t constants_count,
+                                        bool force,
+                                        intptr_t* instruction_count,
+                                        intptr_t* call_site_count) {
   const Function& function = flow_graph->function();
+  // For OSR, don't even bother.
+  if (flow_graph->IsCompiledForOsr()) {
+    *instruction_count = 0;
+    *call_site_count = 0;
+    return;
+  }
+  // Specialized case: always recompute, never cache.
+  if (constants_count > 0) {
+    ASSERT(!force);
+    GraphInfoCollector info;
+    info.Collect(*flow_graph);
+    *instruction_count = info.instruction_count();
+    *call_site_count = info.call_site_count();
+    return;
+  }
+  // Non-specialized case: unless forced, only recompute on a cache miss.
+  ASSERT(constants_count == 0);
   if (force || (function.optimized_instruction_count() == 0)) {
     GraphInfoCollector info;
     info.Collect(*flow_graph);
-
     function.SetOptimizedInstructionCountClamped(info.instruction_count());
     function.SetOptimizedCallSiteCountClamped(info.call_site_count());
   }
+  *instruction_count = function.optimized_instruction_count();
+  *call_site_count = function.optimized_call_site_count();
 }
 
 // TODO(srdjan): This is only needed when disassembling and/or profiling.
@@ -2312,9 +2351,15 @@
 }
 
 int FlowGraphInliner::Inline() {
-  // Collect graph info and store it on the function.
-  // We might later use it for an early bailout from the inlining.
-  CollectGraphInfo(flow_graph_);
+  // Collect some early graph information assuming it is non-specialized
+  // so that the cached approximation may be used later for an early
+  // bailout from inlining.
+  intptr_t instruction_count = 0;
+  intptr_t call_site_count = 0;
+  FlowGraphInliner::CollectGraphInfo(flow_graph_,
+                                     /*constants_count*/ 0,
+                                     /*force*/ false, &instruction_count,
+                                     &call_site_count);
 
   const Function& top = flow_graph_->function();
   if ((FLAG_inlining_filter != NULL) &&
diff --git a/runtime/vm/compiler/backend/inliner.h b/runtime/vm/compiler/backend/inliner.h
index 34366cf..fecd8d7 100644
--- a/runtime/vm/compiler/backend/inliner.h
+++ b/runtime/vm/compiler/backend/inliner.h
@@ -96,8 +96,21 @@
   // depth that we inlined.
   int Inline();
 
-  // Compute graph info if it was not already computed or if 'force' is true.
-  static void CollectGraphInfo(FlowGraph* flow_graph, bool force = false);
+  // Computes graph information (instruction and call site count).
+  // For the non-specialized cases (num_constants_args == 0), the
+  // method uses a cache to avoid recomputing the counts (the cached
+  // value may still be approximate but close). The 'force' flag is
+  // used to update the cached value at the end of running the full pipeline
+  // on non-specialized cases. Specialized cases (num_constants_args > 0)
+  // always recompute the counts without caching.
+  //
+  // TODO(ajcbik): cache for specific constant argument combinations too?
+  static void CollectGraphInfo(FlowGraph* flow_graph,
+                               intptr_t num_constant_args,
+                               bool force,
+                               intptr_t* instruction_count,
+                               intptr_t* call_site_count);
+
   static void SetInliningId(FlowGraph* flow_graph, intptr_t inlining_id);
 
   bool AlwaysInline(const Function& function);
diff --git a/runtime/vm/compiler/compiler_pass.cc b/runtime/vm/compiler/compiler_pass.cc
index 340b149..d101470 100644
--- a/runtime/vm/compiler/compiler_pass.cc
+++ b/runtime/vm/compiler/compiler_pass.cc
@@ -217,6 +217,9 @@
   INVOKE_PASS(TypePropagation);
   INVOKE_PASS(ApplyICData);
   INVOKE_PASS(Canonicalize);
+  // Run constant propagation to make sure we specialize for
+  // (optional) constant arguments passed into the inlined method.
+  INVOKE_PASS(ConstantPropagation);
   // Optimize (a << b) & c patterns, merge instructions. Must occur
   // before 'SelectRepresentations' which inserts conversion nodes.
   INVOKE_PASS(TryOptimizePatterns);
@@ -475,10 +478,17 @@
               { WriteBarrierElimination(flow_graph); });
 
 COMPILER_PASS(FinalizeGraph, {
-  // Compute and store graph informations (call & instruction counts)
-  // to be later used by the inliner.
-  FlowGraphInliner::CollectGraphInfo(flow_graph, true);
+  // At the end of the pipeline, force recomputing and caching graph
+  // information (instruction and call site counts) for the (assumed)
+  // non-specialized case with better values, for future inlining.
+  intptr_t instruction_count = 0;
+  intptr_t call_site_count = 0;
+  FlowGraphInliner::CollectGraphInfo(flow_graph,
+                                     /*constants_count*/ 0,
+                                     /*force*/ true, &instruction_count,
+                                     &call_site_count);
   flow_graph->function().set_inlining_depth(state->inlining_depth);
+  // Remove redefinitions for the rest of the pipeline.
   flow_graph->RemoveRedefinitions();
 });
 
diff --git a/tests/language_2/vm/inline_heuristic_test.dart b/tests/language_2/vm/inline_heuristic_test.dart
new file mode 100755
index 0000000..ca25125
--- /dev/null
+++ b/tests/language_2/vm/inline_heuristic_test.dart
@@ -0,0 +1,303 @@
+// Copyright (c) 2019, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+// VMOptions=--deterministic --optimization_counter_threshold=10 --enable-inlining-annotations
+
+// Test on specialized vs non-specialized inlining.
+
+import 'dart:core';
+import "package:expect/expect.dart";
+
+const String NeverInline = 'NeverInline';
+
+// To inline or not to inline, that is the question?
+int foo(int k) {
+  switch (k) {
+    case 0:
+      return 1;
+    case 1:
+      return 2;
+    case 2:
+      return 3;
+    case 3:
+      return 4;
+    case 4:
+      return 5;
+    case 5:
+      return 6;
+    case 6:
+      return 7;
+    case 7:
+      return 8;
+    case 8:
+      return 9;
+    case 9:
+      return 10;
+    case 10:
+      return 11;
+    case 11:
+      return 12;
+    case 12:
+      return 13;
+    case 13:
+      return 14;
+    case 14:
+      return 15;
+    case 15:
+      return 16;
+    case 16:
+      return 17;
+    case 17:
+      return 18;
+    case 18:
+      return 19;
+    case 19:
+      return 20;
+    case 20:
+      return 21;
+    case 21:
+      return 22;
+    case 22:
+      return 23;
+    case 23:
+      return 24;
+    case 24:
+      return 25;
+    case 25:
+      return 26;
+    case 26:
+      return 27;
+    case 27:
+      return 28;
+    case 28:
+      return 29;
+    case 29:
+      return 30;
+    case 30:
+      return 31;
+    case 31:
+      return 32;
+    case 32:
+      return 33;
+    case 33:
+      return 34;
+    case 34:
+      return 35;
+    case 35:
+      return 36;
+    case 36:
+      return 37;
+    case 37:
+      return 38;
+    case 38:
+      return 39;
+    case 39:
+      return 40;
+    case 40:
+      return 41;
+    case 41:
+      return 42;
+    case 42:
+      return 43;
+    case 43:
+      return 44;
+    case 44:
+      return 45;
+    case 45:
+      return 46;
+    case 46:
+      return 47;
+    case 47:
+      return 48;
+    case 48:
+      return 49;
+    case 49:
+      return 50;
+    case 50:
+      return 51;
+    case 51:
+      return 52;
+    case 52:
+      return 53;
+    case 53:
+      return 54;
+    case 54:
+      return 55;
+    case 55:
+      return 56;
+    case 56:
+      return 57;
+    case 57:
+      return 58;
+    case 58:
+      return 59;
+    case 59:
+      return 60;
+    case 60:
+      return 61;
+    case 61:
+      return 62;
+    case 62:
+      return 63;
+    case 63:
+      return 64;
+    case 64:
+      return 65;
+    case 65:
+      return 66;
+    case 66:
+      return 67;
+    case 67:
+      return 68;
+    case 68:
+      return 69;
+    case 69:
+      return 70;
+    case 70:
+      return 71;
+    case 71:
+      return 72;
+    case 72:
+      return 73;
+    case 73:
+      return 74;
+    case 74:
+      return 75;
+    case 75:
+      return 76;
+    case 76:
+      return 77;
+    case 77:
+      return 78;
+    case 78:
+      return 79;
+    case 79:
+      return 80;
+    case 80:
+      return 81;
+    case 81:
+      return 82;
+    case 82:
+      return 83;
+    case 83:
+      return 84;
+    case 84:
+      return 85;
+    case 85:
+      return 86;
+    case 86:
+      return 87;
+    case 87:
+      return 88;
+    case 88:
+      return 89;
+    case 89:
+      return 90;
+    case 90:
+      return 91;
+    case 91:
+      return 92;
+    case 92:
+      return 93;
+    case 93:
+      return 94;
+    case 94:
+      return 95;
+    case 95:
+      return 96;
+    case 96:
+      return 97;
+    case 97:
+      return 98;
+    case 98:
+      return 99;
+    case 99:
+      return 100;
+    case 100:
+      return 101;
+    case 101:
+      return 102;
+    case 102:
+      return 103;
+    case 103:
+      return 104;
+    case 104:
+      return 105;
+    case 105:
+      return 106;
+    case 106:
+      return 107;
+    case 107:
+      return 108;
+    case 108:
+      return 109;
+    case 109:
+      return 110;
+    case 110:
+      return 111;
+    case 111:
+      return 112;
+    case 112:
+      return 113;
+    case 113:
+      return 114;
+    case 114:
+      return 115;
+    case 115:
+      return 116;
+    case 116:
+      return 117;
+    case 117:
+      return 118;
+    case 118:
+      return 119;
+    case 119:
+      return 120;
+    case 120:
+      return 121;
+    case 121:
+      return 122;
+    case 122:
+      return 123;
+    case 123:
+      return 124;
+    case 124:
+      return 125;
+    case 125:
+      return 126;
+    case 126:
+      return 127;
+    case 127:
+      return 128;
+    default:
+      return -1;
+  }
+}
+
+@NeverInline
+int bar() {
+  // Here we should inline! The inlined size is very small
+  // after specialization for the constant arguments.
+  return foo(1) + foo(12);
+}
+
+@NeverInline
+int baz(int i) {
+  // Here we should not inline! The inlined size is too large,
+  // just keep the original method. In fact, we can use the cached
+  // estimate of foo()'s size from the previous compilation at this
+  // point, which enables the "early" bail heuristic!
+  return foo(i);
+}
+
+main() {
+  // Repeat tests to enter JIT (when applicable).
+  for (int i = 0; i < 20; i++) {
+    Expect.equals(15, bar());
+    for (int i = -150; i <= 150; i++) {
+      int e = (i < 0 || i > 127) ? -1 : i + 1;
+      Expect.equals(e, baz(i));
+    }
+  }
+}