[dart/vm] Peephole optimizer (window size one) on stack code

Rationale:
A basic peephole optimizer with window size one that avoids
redundant push-pop sequences already results in substantial
savings in code size and runtime, without any noticeable impact
on compile time (the peephole is very, very fast).
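
For illustration (register names are made up; actual output depends
on the target and the allocator), a push whose value is immediately
popped again

  add R0, R1, R2
  Push R0          // epilogue of the defining instruction
  Pop R3           // prologue of the consuming instruction

is reduced to a plain register move

  add R0, R1, R2
  mov R3, R0       // elided as well when R3 == R0 (EmitMove no-op)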

Performance:
Golem unoptimized code (which typically runs -90% compared to
optimized code) sees 5-60% improvements.

https://github.com/dart-lang/sdk/issues/36409
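
A standalone toy sketch of the rewrite (hypothetical Op type; the
real implementation in the diff below never emits the push/pop pair
in the first place, fusing producer and consumer at code generation
time via top_of_stack_):

  #include <cstddef>
  #include <vector>

  // Toy stack ops: kPush copies src onto the stack, kPop moves the
  // top of stack into dst, kMov copies src into dst.
  struct Op {
    enum Kind { kPush, kPop, kMov } kind;
    int dst = -1;
    int src = -1;
  };

  // Window-size-one peephole: a push immediately followed by a pop
  // becomes a plain register move, and disappears entirely when both
  // name the same register.
  std::vector<Op> Peephole(const std::vector<Op>& code) {
    std::vector<Op> out;
    for (size_t i = 0; i < code.size(); ++i) {
      if (code[i].kind == Op::kPush && i + 1 < code.size() &&
          code[i + 1].kind == Op::kPop) {
        if (code[i].src != code[i + 1].dst) {
          out.push_back({Op::kMov, code[i + 1].dst, code[i].src});
        }
        ++i;  // the pop is consumed together with the push
      } else {
        out.push_back(code[i]);
      }
    }
    return out;
  }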

Change-Id: I08db4b3dbc92377d89340a4969db6e664e54bceb
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/102980
Commit-Queue: Aart Bik <ajcbik@google.com>
Reviewed-by: Vyacheslav Egorov <vegorov@google.com>
Reviewed-by: Ryan Macnak <rmacnak@google.com>
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler.cc b/runtime/vm/compiler/backend/flow_graph_compiler.cc
index 73c21d8..2ef9877 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler.cc
@@ -42,6 +42,8 @@
             false,
             "Inlining interval diagnostics");
 
+DEFINE_FLAG(bool, enable_peephole, true, "Enable peephole optimization");
+
 #if !defined(DART_PRECOMPILED_RUNTIME)
 
 DEFINE_FLAG(bool,
@@ -511,6 +513,35 @@
                        line.ToCString());
 }
 
+#if !defined(TARGET_ARCH_DBC)
+
+static bool IsPusher(Instruction* instr) {
+  if (auto def = instr->AsDefinition()) {
+    return def->HasTemp();
+  }
+  return false;
+}
+
+static bool IsPopper(Instruction* instr) {
+  // TODO(ajcbik): even allow deopt targets by making environment aware?
+  if (!instr->CanBecomeDeoptimizationTarget()) {
+    return !instr->IsPushArgument() && instr->ArgumentCount() == 0 &&
+           instr->InputCount() > 0;
+  }
+  return false;
+}
+
+#endif
+
+bool FlowGraphCompiler::IsPeephole(Instruction* instr) const {
+#if !defined(TARGET_ARCH_DBC)
+  if (FLAG_enable_peephole && !is_optimizing()) {
+    return IsPusher(instr) && IsPopper(instr->next());
+  }
+#endif
+  return false;
+}
+
 void FlowGraphCompiler::VisitBlocks() {
   CompactBlocks();
   if (Assembler::EmittingComments()) {
@@ -585,7 +616,12 @@
         pending_deoptimization_env_ = instr->env();
         instr->EmitNativeCode(this);
         pending_deoptimization_env_ = NULL;
-        EmitInstructionEpilogue(instr);
+        if (IsPeephole(instr)) {
+          ASSERT(top_of_stack_ == nullptr);
+          top_of_stack_ = instr->AsDefinition();
+        } else {
+          EmitInstructionEpilogue(instr);
+        }
         EndCodeSourceRange(instr->token_pos());
       }
 
@@ -1470,6 +1506,21 @@
 
   bool blocked_registers[kNumberOfCpuRegisters];
 
+  // Connect input with peephole output for some special cases. All other
+  // cases are handled by simply allocating registers and generating code.
+  if (top_of_stack_ != nullptr) {
+    const intptr_t p = locs->input_count() - 1;
+    Location peephole = top_of_stack_->locs()->out(0);
+    if (locs->in(p).IsUnallocated() || locs->in(p).IsConstant()) {
+      // If input is unallocated, match with an output register, if set. Also,
+      // if input is a direct constant, but the peephole output is a register,
+      // use that register to avoid wasting the already generated code.
+      if (peephole.IsRegister()) {
+        locs->set_in(p, Location::RegisterLocation(peephole.reg()));
+      }
+    }
+  }
+
   // Block all registers globally reserved by the assembler, etc and mark
   // the rest as free.
   for (intptr_t i = 0; i < kNumberOfCpuRegisters; i++) {
@@ -1518,10 +1569,17 @@
     }
     ASSERT(reg != kNoRegister || loc.IsConstant());
 
-    // Inputs are consumed from the simulated frame. In case of a call argument
-    // we leave it until the call instruction.
+    // Inputs are consumed from the simulated frame (or a peephole push/pop).
+    // In case of a call argument we leave it until the call instruction.
     if (should_pop) {
-      if (loc.IsConstant()) {
+      if (top_of_stack_ != nullptr) {
+        if (!loc.IsConstant()) {
+          // Moves top of stack location of the peephole into the required
+          // input. None of the required moves needs a temp register allocator.
+          EmitMove(locs->in(i), top_of_stack_->locs()->out(0), nullptr);
+        }
+        top_of_stack_ = nullptr;  // consumed!
+      } else if (loc.IsConstant()) {
         assembler()->Drop(1);
       } else {
         assembler()->PopRegister(reg);
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler.h b/runtime/vm/compiler/backend/flow_graph_compiler.h
index 8795377..1d9dbd8 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler.h
+++ b/runtime/vm/compiler/backend/flow_graph_compiler.h
@@ -1025,6 +1025,10 @@
   void FrameStateClear();
 #endif
 
+  // Returns true if instruction lookahead (window size one)
+  // is amenable to a peephole optimization.
+  bool IsPeephole(Instruction* instr) const;
+
   // This struct contains either function or code, the other one being NULL.
   class StaticCallsStruct : public ZoneAllocated {
    public:
@@ -1089,6 +1093,11 @@
   bool fully_intrinsified_ = false;
   CodeStatistics* stats_;
 
+  // The definition whose value is supposed to be at the top of the
+  // expression stack. Used by peephole optimization (window size one)
+  // to eliminate redundant push/pop pairs.
+  Definition* top_of_stack_ = nullptr;
+
   const Class& double_class_;
   const Class& mint_class_;
   const Class& float32x4_class_;
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc b/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
index d968031..cd1dbf1 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
@@ -1309,6 +1309,8 @@
 void FlowGraphCompiler::EmitMove(Location destination,
                                  Location source,
                                  TemporaryRegisterAllocator* allocator) {
+  if (destination.Equals(source)) return;
+
   if (source.IsRegister()) {
     if (destination.IsRegister()) {
       __ mov(destination.reg(), Operand(source.reg()));
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc b/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
index 3eeac84..b629fec 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
@@ -1297,6 +1297,8 @@
 void FlowGraphCompiler::EmitMove(Location destination,
                                  Location source,
                                  TemporaryRegisterAllocator* allocator) {
+  if (destination.Equals(source)) return;
+
   if (source.IsRegister()) {
     if (destination.IsRegister()) {
       __ mov(destination.reg(), source.reg());
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc b/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
index 5cf94a8..a7e1572 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
@@ -1182,6 +1182,8 @@
 void FlowGraphCompiler::EmitMove(Location destination,
                                  Location source,
                                  TemporaryRegisterAllocator* tmp) {
+  if (destination.Equals(source)) return;
+
   if (source.IsRegister()) {
     if (destination.IsRegister()) {
       __ movl(destination.reg(), source.reg());