[dart/vm] Improved inlining methods and heuristics
Rationale:
Previous method cached graph information (instruction and call site
counts) on a per-function level, not accounting for potential
specializations. The improved method runs an extra constant folding
pass, and only caches per-function information for non-specialized
cases. As a result, we inling much better, see for example, the
added test as illustration.
Since we no longer cache for constants, compile-time may be increased
a bit due to the extra scan. In the long run we should consider
for common constant "situations" as the call site.
https://github.com/dart-lang/sdk/issues/36880
Change-Id: I19f007c7f1860ad0ea88fafb38695dc154189ad5
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/105460
Commit-Queue: Aart Bik <ajcbik@google.com>
Reviewed-by: Martin Kustermann <kustermann@google.com>
Reviewed-by: Alexander Markov <alexmarkov@google.com>
diff --git a/runtime/vm/compiler/backend/constant_propagator.cc b/runtime/vm/compiler/backend/constant_propagator.cc
index dc78a0e..a3f6a6e 100644
--- a/runtime/vm/compiler/backend/constant_propagator.cc
+++ b/runtime/vm/compiler/backend/constant_propagator.cc
@@ -1453,7 +1453,8 @@
static void RemovePushArguments(StaticCallInstr* call) {
for (intptr_t i = 0; i < call->ArgumentCount(); ++i) {
PushArgumentInstr* push = call->PushArgumentAt(i);
- ASSERT(push->input_use_list() == nullptr);
+ ASSERT(push->input_use_list() == nullptr); // no direct uses
+ push->ReplaceUsesWith(push->value()->definition()); // cleanup env uses
push->RemoveFromGraph();
}
}
diff --git a/runtime/vm/compiler/backend/inliner.cc b/runtime/vm/compiler/backend/inliner.cc
index 5235005..6dd87b1 100644
--- a/runtime/vm/compiler/backend/inliner.cc
+++ b/runtime/vm/compiler/backend/inliner.cc
@@ -779,6 +779,8 @@
};
// Inlining heuristics based on Cooper et al. 2008.
+ // TODO(ajcbik): with the better specialized computation of counts,
+ // do we still want to special-case const_arg_count here?
InliningDecision ShouldWeInline(const Function& callee,
intptr_t instr_count,
intptr_t call_site_count,
@@ -955,11 +957,19 @@
return false;
}
+ // Apply early heuristics. For a specialized case
+ // (constants_arg_counts > 0), don't use a previously
+ // estimate of the call site and instruction counts.
+ // Note that at this point, optional constant parameters
+ // are not counted yet, which makes this decision approximate.
GrowableArray<Value*>* arguments = call_data->arguments;
- const intptr_t constant_arguments = CountConstants(*arguments);
+ const intptr_t constant_arg_count = CountConstants(*arguments);
+ const intptr_t instruction_count =
+ constant_arg_count == 0 ? function.optimized_instruction_count() : 0;
+ const intptr_t call_site_count =
+ constant_arg_count == 0 ? function.optimized_call_site_count() : 0;
InliningDecision decision = ShouldWeInline(
- function, function.optimized_instruction_count(),
- function.optimized_call_site_count(), constant_arguments);
+ function, instruction_count, call_site_count, constant_arg_count);
if (!decision.value) {
TRACE_INLINING(
THR_Print(" Bailout: early heuristics (%s) with "
@@ -967,9 +977,8 @@
"call sites: %" Pd ", "
"inlining depth of callee: %d, "
"const args: %" Pd "\n",
- decision.reason, function.optimized_instruction_count(),
- function.optimized_call_site_count(),
- function.inlining_depth(), constant_arguments));
+ decision.reason, instruction_count, call_site_count,
+ function.inlining_depth(), constant_arg_count));
PRINT_INLINING_TREE("Early heuristic", &call_data->caller, &function,
call_data->call);
return false;
@@ -1193,26 +1202,30 @@
printer.PrintBlocks();
}
- // Collect information about the call site and caller graph.
- // TODO(zerny): Do this after CP and dead code elimination.
+ // Collect information about the call site and caller graph. At this
+ // point, optional constant parameters are counted too, making the
+ // specialized vs. non-specialized decision accurate.
intptr_t constants_count = 0;
- for (intptr_t i = 0; i < param_stubs->length(); ++i) {
+ for (intptr_t i = 0, n = param_stubs->length(); i < n; ++i) {
if ((*param_stubs)[i]->IsConstant()) ++constants_count;
}
-
- FlowGraphInliner::CollectGraphInfo(callee_graph);
- const intptr_t size = function.optimized_instruction_count();
- const intptr_t call_site_count = function.optimized_call_site_count();
+ intptr_t instruction_count = 0;
+ intptr_t call_site_count = 0;
+ FlowGraphInliner::CollectGraphInfo(callee_graph, constants_count,
+ /*force*/ false, &instruction_count,
+ &call_site_count);
// Use heuristics do decide if this call should be inlined.
- InliningDecision decision =
- ShouldWeInline(function, size, call_site_count, constants_count);
+ InliningDecision decision = ShouldWeInline(
+ function, instruction_count, call_site_count, constants_count);
if (!decision.value) {
// If size is larger than all thresholds, don't consider it again.
- if ((size > FLAG_inlining_size_threshold) &&
+ if ((instruction_count > FLAG_inlining_size_threshold) &&
(call_site_count > FLAG_inlining_callee_call_sites_threshold) &&
- (size > FLAG_inlining_constant_arguments_min_size_threshold) &&
- (size > FLAG_inlining_constant_arguments_max_size_threshold)) {
+ (instruction_count >
+ FLAG_inlining_constant_arguments_min_size_threshold) &&
+ (instruction_count >
+ FLAG_inlining_constant_arguments_max_size_threshold)) {
function.set_is_inlinable(false);
}
TRACE_INLINING(
@@ -1221,7 +1234,7 @@
"call sites: %" Pd ", "
"inlining depth of callee: %d, "
"const args: %" Pd "\n",
- decision.reason, size, call_site_count,
+ decision.reason, instruction_count, call_site_count,
function.inlining_depth(), constants_count));
PRINT_INLINING_TREE("Heuristic fail", &call_data->caller, &function,
call_data->call);
@@ -1231,6 +1244,7 @@
// If requested, a stricter heuristic is applied to this inlining. This
// heuristic always scans the method (rather than possibly reusing
// cached results) to make sure all specializations are accounted for.
+ // TODO(ajcbik): with the now better bookkeeping, explore removing this
if (stricter_heuristic) {
if (!IsSmallLeaf(callee_graph)) {
TRACE_INLINING(
@@ -1254,7 +1268,7 @@
// Build succeeded so we restore the bailout jump.
inlined_ = true;
- inlined_size_ += size;
+ inlined_size_ += instruction_count;
if (is_recursive_call) {
inlined_recursive_call_ = true;
}
@@ -1284,8 +1298,7 @@
TRACE_INLINING(THR_Print(" Success\n"));
TRACE_INLINING(THR_Print(
" with reason %s, code size %" Pd ", call sites: %" Pd "\n",
- decision.reason, function.optimized_instruction_count(),
- call_site_count));
+ decision.reason, instruction_count, call_site_count));
PRINT_INLINING_TREE(NULL, &call_data->caller, &function, call);
return true;
} else {
@@ -1481,6 +1494,10 @@
}
bool InlineClosureCalls() {
+ // Under this flag, tear off testing closure calls appear before the
+ // StackOverflowInstr, which breaks assertions in our compiler when inlined.
+ // TODO(sjindel): move testing closure calls after first check
+ if (FLAG_enable_testing_pragmas) return false; // keep all closures
bool inlined = false;
const GrowableArray<CallSites::ClosureCallInfo>& call_info =
inlining_call_sites_->closure_calls();
@@ -2237,15 +2254,37 @@
speculative_policy_(speculative_policy),
precompiler_(precompiler) {}
-void FlowGraphInliner::CollectGraphInfo(FlowGraph* flow_graph, bool force) {
+void FlowGraphInliner::CollectGraphInfo(FlowGraph* flow_graph,
+ intptr_t constants_count,
+ bool force,
+ intptr_t* instruction_count,
+ intptr_t* call_site_count) {
const Function& function = flow_graph->function();
+ // For OSR, don't even bother.
+ if (flow_graph->IsCompiledForOsr()) {
+ *instruction_count = 0;
+ *call_site_count = 0;
+ return;
+ }
+ // Specialized case: always recompute, never cache.
+ if (constants_count > 0) {
+ ASSERT(!force);
+ GraphInfoCollector info;
+ info.Collect(*flow_graph);
+ *instruction_count = info.instruction_count();
+ *call_site_count = info.call_site_count();
+ return;
+ }
+ // Non-specialized case: unless forced, only recompute on a cache miss.
+ ASSERT(constants_count == 0);
if (force || (function.optimized_instruction_count() == 0)) {
GraphInfoCollector info;
info.Collect(*flow_graph);
-
function.SetOptimizedInstructionCountClamped(info.instruction_count());
function.SetOptimizedCallSiteCountClamped(info.call_site_count());
}
+ *instruction_count = function.optimized_instruction_count();
+ *call_site_count = function.optimized_call_site_count();
}
// TODO(srdjan): This is only needed when disassembling and/or profiling.
@@ -2312,9 +2351,15 @@
}
int FlowGraphInliner::Inline() {
- // Collect graph info and store it on the function.
- // We might later use it for an early bailout from the inlining.
- CollectGraphInfo(flow_graph_);
+ // Collect some early graph information assuming it is non-specialized
+ // so that the cached approximation may be used later for an early
+ // bailout from inlining.
+ intptr_t instruction_count = 0;
+ intptr_t call_site_count = 0;
+ FlowGraphInliner::CollectGraphInfo(flow_graph_,
+ /*constants_count*/ 0,
+ /*force*/ false, &instruction_count,
+ &call_site_count);
const Function& top = flow_graph_->function();
if ((FLAG_inlining_filter != NULL) &&
diff --git a/runtime/vm/compiler/backend/inliner.h b/runtime/vm/compiler/backend/inliner.h
index 34366cf..fecd8d7 100644
--- a/runtime/vm/compiler/backend/inliner.h
+++ b/runtime/vm/compiler/backend/inliner.h
@@ -96,8 +96,21 @@
// depth that we inlined.
int Inline();
- // Compute graph info if it was not already computed or if 'force' is true.
- static void CollectGraphInfo(FlowGraph* flow_graph, bool force = false);
+ // Computes graph information (instruction and call site count).
+ // For the non-specialized cases (num_constants_args == 0), the
+ // method uses a cache to avoid recomputing the counts (the cached
+ // value may still be approximate but close). The 'force' flag is
+ // used to update the cached value at the end of running the full pipeline
+ // on non-specialized cases. Specialized cases (num_constants_args > 0)
+ // always recompute the counts without caching.
+ //
+ // TODO(ajcbik): cache for specific constant argument combinations too?
+ static void CollectGraphInfo(FlowGraph* flow_graph,
+ intptr_t num_constant_args,
+ bool force,
+ intptr_t* instruction_count,
+ intptr_t* call_site_count);
+
static void SetInliningId(FlowGraph* flow_graph, intptr_t inlining_id);
bool AlwaysInline(const Function& function);
diff --git a/runtime/vm/compiler/compiler_pass.cc b/runtime/vm/compiler/compiler_pass.cc
index 340b149..d101470 100644
--- a/runtime/vm/compiler/compiler_pass.cc
+++ b/runtime/vm/compiler/compiler_pass.cc
@@ -217,6 +217,9 @@
INVOKE_PASS(TypePropagation);
INVOKE_PASS(ApplyICData);
INVOKE_PASS(Canonicalize);
+ // Run constant propagation to make sure we specialize for
+ // (optional) constant arguments passed into the inlined method.
+ INVOKE_PASS(ConstantPropagation);
// Optimize (a << b) & c patterns, merge instructions. Must occur
// before 'SelectRepresentations' which inserts conversion nodes.
INVOKE_PASS(TryOptimizePatterns);
@@ -475,10 +478,17 @@
{ WriteBarrierElimination(flow_graph); });
COMPILER_PASS(FinalizeGraph, {
- // Compute and store graph informations (call & instruction counts)
- // to be later used by the inliner.
- FlowGraphInliner::CollectGraphInfo(flow_graph, true);
+ // At the end of the pipeline, force recomputing and caching graph
+ // information (instruction and call site counts) for the (assumed)
+ // non-specialized case with better values, for future inlining.
+ intptr_t instruction_count = 0;
+ intptr_t call_site_count = 0;
+ FlowGraphInliner::CollectGraphInfo(flow_graph,
+ /*constants_count*/ 0,
+ /*force*/ true, &instruction_count,
+ &call_site_count);
flow_graph->function().set_inlining_depth(state->inlining_depth);
+ // Remove redefinitions for the rest of the pipeline.
flow_graph->RemoveRedefinitions();
});
diff --git a/tests/language_2/vm/inline_heuristic_test.dart b/tests/language_2/vm/inline_heuristic_test.dart
new file mode 100755
index 0000000..ca25125
--- /dev/null
+++ b/tests/language_2/vm/inline_heuristic_test.dart
@@ -0,0 +1,303 @@
+// Copyright (c) 2019, the Dart project authors. Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+// VMOptions=--deterministic --optimization_counter_threshold=10 --enable-inlining-annotations
+
+// Test on specialized vs non-specialized inlining.
+
+import 'dart:core';
+import "package:expect/expect.dart";
+
+const String NeverInline = 'NeverInline';
+
+// To inline or not to inline, that is the question?
+int foo(int k) {
+ switch (k) {
+ case 0:
+ return 1;
+ case 1:
+ return 2;
+ case 2:
+ return 3;
+ case 3:
+ return 4;
+ case 4:
+ return 5;
+ case 5:
+ return 6;
+ case 6:
+ return 7;
+ case 7:
+ return 8;
+ case 8:
+ return 9;
+ case 9:
+ return 10;
+ case 10:
+ return 11;
+ case 11:
+ return 12;
+ case 12:
+ return 13;
+ case 13:
+ return 14;
+ case 14:
+ return 15;
+ case 15:
+ return 16;
+ case 16:
+ return 17;
+ case 17:
+ return 18;
+ case 18:
+ return 19;
+ case 19:
+ return 20;
+ case 20:
+ return 21;
+ case 21:
+ return 22;
+ case 22:
+ return 23;
+ case 23:
+ return 24;
+ case 24:
+ return 25;
+ case 25:
+ return 26;
+ case 26:
+ return 27;
+ case 27:
+ return 28;
+ case 28:
+ return 29;
+ case 29:
+ return 30;
+ case 30:
+ return 31;
+ case 31:
+ return 32;
+ case 32:
+ return 33;
+ case 33:
+ return 34;
+ case 34:
+ return 35;
+ case 35:
+ return 36;
+ case 36:
+ return 37;
+ case 37:
+ return 38;
+ case 38:
+ return 39;
+ case 39:
+ return 40;
+ case 40:
+ return 41;
+ case 41:
+ return 42;
+ case 42:
+ return 43;
+ case 43:
+ return 44;
+ case 44:
+ return 45;
+ case 45:
+ return 46;
+ case 46:
+ return 47;
+ case 47:
+ return 48;
+ case 48:
+ return 49;
+ case 49:
+ return 50;
+ case 50:
+ return 51;
+ case 51:
+ return 52;
+ case 52:
+ return 53;
+ case 53:
+ return 54;
+ case 54:
+ return 55;
+ case 55:
+ return 56;
+ case 56:
+ return 57;
+ case 57:
+ return 58;
+ case 58:
+ return 59;
+ case 59:
+ return 60;
+ case 60:
+ return 61;
+ case 61:
+ return 62;
+ case 62:
+ return 63;
+ case 63:
+ return 64;
+ case 64:
+ return 65;
+ case 65:
+ return 66;
+ case 66:
+ return 67;
+ case 67:
+ return 68;
+ case 68:
+ return 69;
+ case 69:
+ return 70;
+ case 70:
+ return 71;
+ case 71:
+ return 72;
+ case 72:
+ return 73;
+ case 73:
+ return 74;
+ case 74:
+ return 75;
+ case 75:
+ return 76;
+ case 76:
+ return 77;
+ case 77:
+ return 78;
+ case 78:
+ return 79;
+ case 79:
+ return 80;
+ case 80:
+ return 81;
+ case 81:
+ return 82;
+ case 82:
+ return 83;
+ case 83:
+ return 84;
+ case 84:
+ return 85;
+ case 85:
+ return 86;
+ case 86:
+ return 87;
+ case 87:
+ return 88;
+ case 88:
+ return 89;
+ case 89:
+ return 90;
+ case 90:
+ return 91;
+ case 91:
+ return 92;
+ case 92:
+ return 93;
+ case 93:
+ return 94;
+ case 94:
+ return 95;
+ case 95:
+ return 96;
+ case 96:
+ return 97;
+ case 97:
+ return 98;
+ case 98:
+ return 99;
+ case 99:
+ return 100;
+ case 100:
+ return 101;
+ case 101:
+ return 102;
+ case 102:
+ return 103;
+ case 103:
+ return 104;
+ case 104:
+ return 105;
+ case 105:
+ return 106;
+ case 106:
+ return 107;
+ case 107:
+ return 108;
+ case 108:
+ return 109;
+ case 109:
+ return 110;
+ case 110:
+ return 111;
+ case 111:
+ return 112;
+ case 112:
+ return 113;
+ case 113:
+ return 114;
+ case 114:
+ return 115;
+ case 115:
+ return 116;
+ case 116:
+ return 117;
+ case 117:
+ return 118;
+ case 118:
+ return 119;
+ case 119:
+ return 120;
+ case 120:
+ return 121;
+ case 121:
+ return 122;
+ case 122:
+ return 123;
+ case 123:
+ return 124;
+ case 124:
+ return 125;
+ case 125:
+ return 126;
+ case 126:
+ return 127;
+ case 127:
+ return 128;
+ default:
+ return -1;
+ }
+}
+
+@NeverInline
+int bar() {
+ // Here we should inline! The inlined size is very small
+ // after specialization for the constant arguments.
+ return foo(1) + foo(12);
+}
+
+@NeverInline
+int baz(int i) {
+ // Here we should not inline! The inlined size is too large,
+ // just keep the original method. In fact, we can use the cached
+ // estimate of foo()'s size from the previous compilation at this
+ // point, which enables the "early" bail heuristic!
+ return foo(i);
+}
+
+main() {
+ // Repeat tests to enter JIT (when applicable).
+ for (int i = 0; i < 20; i++) {
+ Expect.equals(15, bar());
+ for (int i = -150; i <= 150; i++) {
+ int e = (i < 0 || i > 127) ? -1 : i + 1;
+ Expect.equals(e, baz(i));
+ }
+ }
+}