[dart/vm] Peephole optimizer (window size one) on stack code
Rationale:
A basic peephole optimizer with window size one that avoids
redundant push-pop sequences already results in substantial
savings in code size and runtime, without any noticeable impact
on compile time (the peephole is very, very fast).
Performance:
Golem unoptimized code (which typically runs -90% compared to
optimized code), sees 5-60% improvements
https://github.com/dart-lang/sdk/issues/36409
Change-Id: I08db4b3dbc92377d89340a4969db6e664e54bceb
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/102980
Commit-Queue: Aart Bik <ajcbik@google.com>
Reviewed-by: Vyacheslav Egorov <vegorov@google.com>
Reviewed-by: Ryan Macnak <rmacnak@google.com>
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler.cc b/runtime/vm/compiler/backend/flow_graph_compiler.cc
index 73c21d8..2ef9877 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler.cc
@@ -42,6 +42,8 @@
false,
"Inlining interval diagnostics");
+DEFINE_FLAG(bool, enable_peephole, true, "Enable peephole optimization");
+
#if !defined(DART_PRECOMPILED_RUNTIME)
DEFINE_FLAG(bool,
@@ -511,6 +513,35 @@
line.ToCString());
}
+#if !defined(TARGET_ARCH_DBC)
+
+static bool IsPusher(Instruction* instr) {
+ if (auto def = instr->AsDefinition()) {
+ return def->HasTemp();
+ }
+ return false;
+}
+
+static bool IsPopper(Instruction* instr) {
+ // TODO(ajcbik): even allow deopt targets by making environment aware?
+ if (!instr->CanBecomeDeoptimizationTarget()) {
+ return !instr->IsPushArgument() && instr->ArgumentCount() == 0 &&
+ instr->InputCount() > 0;
+ }
+ return false;
+}
+
+#endif
+
+bool FlowGraphCompiler::IsPeephole(Instruction* instr) const {
+#if !defined(TARGET_ARCH_DBC)
+ if (FLAG_enable_peephole && !is_optimizing()) {
+ return IsPusher(instr) && IsPopper(instr->next());
+ }
+#endif
+ return false;
+}
+
void FlowGraphCompiler::VisitBlocks() {
CompactBlocks();
if (Assembler::EmittingComments()) {
@@ -585,7 +616,12 @@
pending_deoptimization_env_ = instr->env();
instr->EmitNativeCode(this);
pending_deoptimization_env_ = NULL;
- EmitInstructionEpilogue(instr);
+ if (IsPeephole(instr)) {
+ ASSERT(top_of_stack_ == nullptr);
+ top_of_stack_ = instr->AsDefinition();
+ } else {
+ EmitInstructionEpilogue(instr);
+ }
EndCodeSourceRange(instr->token_pos());
}
@@ -1470,6 +1506,21 @@
bool blocked_registers[kNumberOfCpuRegisters];
+ // Connect input with peephole output for some special cases. All other
+ // cases are handled by simply allocating registers and generating code.
+ if (top_of_stack_ != nullptr) {
+ const intptr_t p = locs->input_count() - 1;
+ Location peephole = top_of_stack_->locs()->out(0);
+ if (locs->in(p).IsUnallocated() || locs->in(p).IsConstant()) {
+ // If input is unallocated, match with an output register, if set. Also,
+ // if input is a direct constant, but the peephole output is a register,
+ // use that register to avoid wasting the already generated code.
+ if (peephole.IsRegister()) {
+ locs->set_in(p, Location::RegisterLocation(peephole.reg()));
+ }
+ }
+ }
+
// Block all registers globally reserved by the assembler, etc and mark
// the rest as free.
for (intptr_t i = 0; i < kNumberOfCpuRegisters; i++) {
@@ -1518,10 +1569,17 @@
}
ASSERT(reg != kNoRegister || loc.IsConstant());
- // Inputs are consumed from the simulated frame. In case of a call argument
- // we leave it until the call instruction.
+ // Inputs are consumed from the simulated frame (or a peephole push/pop).
+ // In case of a call argument we leave it until the call instruction.
if (should_pop) {
- if (loc.IsConstant()) {
+ if (top_of_stack_ != nullptr) {
+ if (!loc.IsConstant()) {
+ // Moves top of stack location of the peephole into the required
+ // input. None of the required moves needs a temp register allocator.
+ EmitMove(locs->in(i), top_of_stack_->locs()->out(0), nullptr);
+ }
+ top_of_stack_ = nullptr; // consumed!
+ } else if (loc.IsConstant()) {
assembler()->Drop(1);
} else {
assembler()->PopRegister(reg);
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler.h b/runtime/vm/compiler/backend/flow_graph_compiler.h
index 8795377..1d9dbd8 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler.h
+++ b/runtime/vm/compiler/backend/flow_graph_compiler.h
@@ -1025,6 +1025,10 @@
void FrameStateClear();
#endif
+ // Returns true if instruction lookahead (window size one)
+ // is amenable to a peephole optimization.
+ bool IsPeephole(Instruction* instr) const;
+
// This struct contains either function or code, the other one being NULL.
class StaticCallsStruct : public ZoneAllocated {
public:
@@ -1089,6 +1093,11 @@
bool fully_intrinsified_ = false;
CodeStatistics* stats_;
+ // The definition whose value is supposed to be at the top of the
+ // expression stack. Used by peephole optimization (window size one)
+ // to eliminate redundant push/pop pairs.
+ Definition* top_of_stack_ = nullptr;
+
const Class& double_class_;
const Class& mint_class_;
const Class& float32x4_class_;
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc b/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
index d968031..cd1dbf1 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
@@ -1309,6 +1309,8 @@
void FlowGraphCompiler::EmitMove(Location destination,
Location source,
TemporaryRegisterAllocator* allocator) {
+ if (destination.Equals(source)) return;
+
if (source.IsRegister()) {
if (destination.IsRegister()) {
__ mov(destination.reg(), Operand(source.reg()));
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc b/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
index 3eeac84..b629fec 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
@@ -1297,6 +1297,8 @@
void FlowGraphCompiler::EmitMove(Location destination,
Location source,
TemporaryRegisterAllocator* allocator) {
+ if (destination.Equals(source)) return;
+
if (source.IsRegister()) {
if (destination.IsRegister()) {
__ mov(destination.reg(), source.reg());
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc b/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
index 5cf94a8..a7e1572 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
@@ -1182,6 +1182,8 @@
void FlowGraphCompiler::EmitMove(Location destination,
Location source,
TemporaryRegisterAllocator* tmp) {
+ if (destination.Equals(source)) return;
+
if (source.IsRegister()) {
if (destination.IsRegister()) {
__ movl(destination.reg(), source.reg());