[vm/compiler] Use loop framework for AOT inline heuristics
Rationale:
Without proper execution counters, the inline AOT inliner
marks every call site "cold", effectively disabling inlining
altogether. This change introduces loop-based static heuristic
that assumes statements nested inside loops are executed more
frequently. This results in more inlining.
Note:
Conservative version is used for now which yields
more performance without increasing code size too much.
There is still a lot of performance left at the table
which we could exploit if we fine tune heuristics
regarding code size.
Bug:
https://github.com/dart-lang/sdk/issues/34473
https://github.com/dart-lang/sdk/issues/32167
Change-Id: I86ba60f93bdab363cd22ab6bdbcf6688f2042fea
Reviewed-on: https://dart-review.googlesource.com/c/81187
Commit-Queue: Aart Bik <ajcbik@google.com>
Reviewed-by: Alexander Markov <alexmarkov@google.com>
diff --git a/runtime/vm/compiler/backend/il.cc b/runtime/vm/compiler/backend/il.cc
index 9688cae..2b5574f 100644
--- a/runtime/vm/compiler/backend/il.cc
+++ b/runtime/vm/compiler/backend/il.cc
@@ -1600,6 +1600,10 @@
return loop_info_ != nullptr && loop_info_->header() == this;
}
+intptr_t BlockEntryInstr::NestingDepth() const {
+ return loop_info_ == nullptr ? 0 : loop_info_->NestingDepth();
+}
+
// Helper to mutate the graph during inlining. This block should be
// replaced with new_block as a predecessor of all of this block's
// successors. For each successor, the predecessors will be reordered
diff --git a/runtime/vm/compiler/backend/il.h b/runtime/vm/compiler/backend/il.h
index 33d1434..0574f07 100644
--- a/runtime/vm/compiler/backend/il.h
+++ b/runtime/vm/compiler/backend/il.h
@@ -1360,6 +1360,7 @@
LoopInfo* loop_info() const { return loop_info_; }
void set_loop_info(LoopInfo* loop_info) { loop_info_ = loop_info; }
bool IsLoopHeader() const;
+ intptr_t NestingDepth() const;
virtual BlockEntryInstr* GetBlock() { return this; }
diff --git a/runtime/vm/compiler/backend/inliner.cc b/runtime/vm/compiler/backend/inliner.cc
index bdbf35a..f3f4a52 100644
--- a/runtime/vm/compiler/backend/inliner.cc
+++ b/runtime/vm/compiler/backend/inliner.cc
@@ -254,7 +254,7 @@
// A collection of call sites to consider for inlining.
class CallSites : public ValueObject {
public:
- explicit CallSites(FlowGraph* flow_graph, intptr_t threshold)
+ explicit CallSites(intptr_t threshold)
: inlining_depth_threshold_(threshold),
static_calls_(),
closure_calls_(),
@@ -264,9 +264,14 @@
PolymorphicInstanceCallInstr* call;
double ratio;
const FlowGraph* caller_graph;
+ intptr_t nesting_depth;
InstanceCallInfo(PolymorphicInstanceCallInstr* call_arg,
- FlowGraph* flow_graph)
- : call(call_arg), ratio(0.0), caller_graph(flow_graph) {}
+ FlowGraph* flow_graph,
+ intptr_t depth)
+ : call(call_arg),
+ ratio(0.0),
+ caller_graph(flow_graph),
+ nesting_depth(depth) {}
const Function& caller() const { return caller_graph->function(); }
};
@@ -274,8 +279,14 @@
StaticCallInstr* call;
double ratio;
FlowGraph* caller_graph;
- StaticCallInfo(StaticCallInstr* value, FlowGraph* flow_graph)
- : call(value), ratio(0.0), caller_graph(flow_graph) {}
+ intptr_t nesting_depth;
+ StaticCallInfo(StaticCallInstr* value,
+ FlowGraph* flow_graph,
+ intptr_t depth)
+ : call(value),
+ ratio(0.0),
+ caller_graph(flow_graph),
+ nesting_depth(depth) {}
const Function& caller() const { return caller_graph->function(); }
};
@@ -315,6 +326,32 @@
instance_calls_.Clear();
}
+ // Heuristic that maps the loop nesting depth to a static estimate of number
+ // of times code at that depth is executed (code at each higher nesting
+ // depth is assumed to execute 10x more often up to depth 3).
+ static intptr_t AotCallCountApproximation(intptr_t nesting_depth) {
+ switch (nesting_depth) {
+ case 0:
+ // Note that we use value 0, and not 1, i.e. any straightline code
+ // outside a loop is assumed to be very cold. With value 1, inlining
+ // inside loops is still favored over inlining inside straightline
+ // code, but for a method without loops, *all* call sites are inlined
+ // (potentially more performance, at the expense of larger code size).
+ // TODO(ajcbik): use 1 and fine tune other heuristics
+ return 0;
+ case 1:
+ return 10;
+ case 2:
+ return 100;
+ default:
+ return 1000;
+ }
+ }
+
+ // Computes the ratio for each call site in a method, defined as the
+ // number of times a call site is executed over the maximum number of
+ // times any call site is executed in the method. JIT uses actual call
+ // counts whereas AOT uses a static estimate based on nesting depth.
void ComputeCallSiteRatio(intptr_t static_call_start_ix,
intptr_t instance_call_start_ix) {
const intptr_t num_static_calls =
@@ -325,21 +362,26 @@
intptr_t max_count = 0;
GrowableArray<intptr_t> instance_call_counts(num_instance_calls);
for (intptr_t i = 0; i < num_instance_calls; ++i) {
- const intptr_t aggregate_count =
- instance_calls_[i + instance_call_start_ix].call->CallCount();
+ const InstanceCallInfo& info =
+ instance_calls_[i + instance_call_start_ix];
+ intptr_t aggregate_count =
+ FLAG_precompiled_mode ? AotCallCountApproximation(info.nesting_depth)
+ : info.call->CallCount();
instance_call_counts.Add(aggregate_count);
if (aggregate_count > max_count) max_count = aggregate_count;
}
GrowableArray<intptr_t> static_call_counts(num_static_calls);
for (intptr_t i = 0; i < num_static_calls; ++i) {
+ const StaticCallInfo& info = static_calls_[i + static_call_start_ix];
intptr_t aggregate_count =
- static_calls_[i + static_call_start_ix].call->CallCount();
+ FLAG_precompiled_mode ? AotCallCountApproximation(info.nesting_depth)
+ : info.call->CallCount();
static_call_counts.Add(aggregate_count);
if (aggregate_count > max_count) max_count = aggregate_count;
}
- // max_count can be 0 if none of the calls was executed.
+ // Note that max_count can be 0 if none of the calls was executed.
for (intptr_t i = 0; i < num_instance_calls; ++i) {
const double ratio =
(max_count == 0)
@@ -404,12 +446,18 @@
const bool inline_only_recognized_methods =
(depth == inlining_depth_threshold_);
+ // In AOT, compute loop hierarchy.
+ if (FLAG_precompiled_mode) {
+ graph->GetLoopHierarchy();
+ }
+
const intptr_t instance_call_start_ix = instance_calls_.length();
const intptr_t static_call_start_ix = static_calls_.length();
for (BlockIterator block_it = graph->postorder_iterator(); !block_it.Done();
block_it.Advance()) {
- for (ForwardInstructionIterator it(block_it.Current()); !it.Done();
- it.Advance()) {
+ BlockEntryInstr* entry = block_it.Current();
+ const intptr_t depth = entry->NestingDepth();
+ for (ForwardInstructionIterator it(entry); !it.Done(); it.Advance()) {
Instruction* current = it.Current();
if (current->IsPolymorphicInstanceCall()) {
PolymorphicInstanceCallInstr* instance_call =
@@ -417,7 +465,7 @@
if (!inline_only_recognized_methods ||
instance_call->IsSureToCallSingleRecognizedTarget() ||
instance_call->HasOnlyDispatcherOrImplicitAccessorTargets()) {
- instance_calls_.Add(InstanceCallInfo(instance_call, graph));
+ instance_calls_.Add(InstanceCallInfo(instance_call, graph, depth));
} else {
// Method not inlined because inlining too deep and method
// not recognized.
@@ -433,7 +481,7 @@
if (!inline_only_recognized_methods ||
static_call->function().IsRecognized() ||
static_call->function().IsDispatcherOrImplicitAccessor()) {
- static_calls_.Add(StaticCallInfo(static_call, graph));
+ static_calls_.Add(StaticCallInfo(static_call, graph, depth));
} else {
// Method not inlined because inlining too deep and method
// not recognized.
@@ -751,8 +799,8 @@
return;
}
// Create two call site collections to swap between.
- CallSites sites1(caller_graph_, inlining_depth_threshold_);
- CallSites sites2(caller_graph_, inlining_depth_threshold_);
+ CallSites sites1(inlining_depth_threshold_);
+ CallSites sites2(inlining_depth_threshold_);
CallSites* call_sites_temp = NULL;
collected_call_sites_ = &sites1;
inlining_call_sites_ = &sites2;