[vm] Consolidate generic move code between ParallelMoveResolver and FFICall

Will be useful for FFI callbacks.
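
The new shared entry point is FlowGraphCompiler::EmitMove(Location dst,
Location src, TemporaryRegisterAllocator* temp). A minimal sketch of how the
FfiCall sites now use it (condensed from the il_*.cc changes below; whether a
NoTemporaryAllocator or a ConstantTemporaryAllocator is passed depends on the
architecture):

  // Rebase incoming locations from FP onto the saved frame pointer, then
  // let the compiler emit each generic move.
  FrameRebase rebase(/*old_base=*/FPREG, /*new_base=*/saved_fp,
                     /*stack_delta=*/0);
  for (intptr_t i = 0, n = NativeArgCount(); i < n; ++i) {
    const Location origin = rebase.Rebase(locs()->in(i));
    const Location target = arg_locations_[i];
    NoTemporaryAllocator no_temp;
    compiler->EmitMove(target, origin, &no_temp);
  }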

Cq-Include-Trybots: luci.dart.try:vm-ffi-android-debug-arm-try
Change-Id: I7940d4a58a036e4cb868fc89311f659039f5d23f
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/100381
Commit-Queue: Samir Jindel <sjindel@google.com>
Reviewed-by: Vyacheslav Egorov <vegorov@google.com>
Reviewed-by: Aart Bik <ajcbik@google.com>
diff --git a/runtime/vm/compiler/assembler/assembler_ia32.cc b/runtime/vm/compiler/assembler/assembler_ia32.cc
index 4ca49e3..8794ba8 100644
--- a/runtime/vm/compiler/assembler/assembler_ia32.cc
+++ b/runtime/vm/compiler/assembler/assembler_ia32.cc
@@ -2262,6 +2262,11 @@
   label->BindTo(bound);
 }
 
+void Assembler::MoveMemoryToMemory(Address dst, Address src, Register tmp) {
+  movl(tmp, src);
+  movl(dst, tmp);
+}
+
 #ifndef PRODUCT
 void Assembler::MaybeTraceAllocation(intptr_t cid,
                                      Register temp_reg,
diff --git a/runtime/vm/compiler/assembler/assembler_ia32.h b/runtime/vm/compiler/assembler/assembler_ia32.h
index 9aca7a9..39086a3 100644
--- a/runtime/vm/compiler/assembler/assembler_ia32.h
+++ b/runtime/vm/compiler/assembler/assembler_ia32.h
@@ -723,6 +723,10 @@
   void Bind(Label* label);
   void Jump(Label* label) { jmp(label); }
 
+  // Moves one word from the memory at [from] to the memory at [to].
+  // Needs a temporary register.
+  void MoveMemoryToMemory(Address to, Address from, Register tmp);
+
   bool has_single_entry_point() const { return true; }
 
   // Set up a Dart frame on entry with a frame pointer and PC information to
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler.cc b/runtime/vm/compiler/backend/flow_graph_compiler.cc
index dff5f3b..b99bc59 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler.cc
@@ -1681,6 +1681,30 @@
   compiler_->EndCodeSourceRange(TokenPosition::kParallelMove);
 }
 
+#if !defined(TARGET_ARCH_DBC)
+void ParallelMoveResolver::EmitMove(int index) {
+  MoveOperands* const move = moves_[index];
+  const Location dst = move->dest();
+  if (dst.IsStackSlot() || dst.IsDoubleStackSlot()) {
+    ASSERT((dst.base_reg() != FPREG) ||
+           ((-compiler::target::frame_layout.VariableIndexForFrameSlot(
+                dst.stack_index())) < compiler_->StackSize()));
+  }
+  const Location src = move->src();
+  ParallelMoveResolver::TemporaryAllocator temp(this, /*blocked=*/kNoRegister);
+  compiler_->EmitMove(dst, src, &temp);
+#if defined(DEBUG)
+  // Allocating a scratch register here may cause stack spilling. Neither the
+  // source nor destination register should be SP-relative in that case.
+  for (const Location loc : {dst, src}) {
+    ASSERT(!temp.DidAllocateTemporary() || !loc.HasStackIndex() ||
+           loc.base_reg() != SPREG);
+  }
+#endif
+  move->Eliminate();
+}
+#endif
+
 bool ParallelMoveResolver::IsScratchLocation(Location loc) {
   for (int i = 0; i < moves_.length(); ++i) {
     if (moves_[i]->Blocks(loc)) {
@@ -1753,12 +1777,19 @@
   }
 }
 
-ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
+ParallelMoveResolver::TemporaryAllocator::TemporaryAllocator(
     ParallelMoveResolver* resolver,
     Register blocked)
-    : resolver_(resolver), reg_(kNoRegister), spilled_(false) {
-  uword blocked_mask = RegMaskBit(blocked) | kReservedCpuRegisters;
-  if (resolver->compiler_->intrinsic_mode()) {
+    : resolver_(resolver),
+      blocked_(blocked),
+      reg_(kNoRegister),
+      spilled_(false) {}
+
+Register ParallelMoveResolver::TemporaryAllocator::AllocateTemporary() {
+  ASSERT(reg_ == kNoRegister);
+
+  uword blocked_mask = RegMaskBit(blocked_) | kReservedCpuRegisters;
+  if (resolver_->compiler_->intrinsic_mode()) {
     // Block additional registers that must be preserved for intrinsics.
     blocked_mask |= RegMaskBit(ARGS_DESC_REG);
 #if !defined(TARGET_ARCH_IA32)
@@ -1772,14 +1803,29 @@
                                          kNumberOfCpuRegisters - 1, &spilled_));
 
   if (spilled_) {
-    resolver->SpillScratch(reg_);
+    resolver_->SpillScratch(reg_);
   }
+
+  DEBUG_ONLY(allocated_ = true;)
+  return reg_;
 }
 
-ParallelMoveResolver::ScratchRegisterScope::~ScratchRegisterScope() {
+void ParallelMoveResolver::TemporaryAllocator::ReleaseTemporary() {
   if (spilled_) {
     resolver_->RestoreScratch(reg_);
   }
+  reg_ = kNoRegister;
+}
+
+ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
+    ParallelMoveResolver* resolver,
+    Register blocked)
+    : allocator_(resolver, blocked) {
+  reg_ = allocator_.AllocateTemporary();
+}
+
+ParallelMoveResolver::ScratchRegisterScope::~ScratchRegisterScope() {
+  allocator_.ReleaseTemporary();
 }
 
 const ICData* FlowGraphCompiler::GetOrAddInstanceCallICData(
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler.h b/runtime/vm/compiler/backend/flow_graph_compiler.h
index 56046747..7f5c1a3 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler.h
+++ b/runtime/vm/compiler/backend/flow_graph_compiler.h
@@ -28,6 +28,32 @@
 class ParsedFunction;
 class SpeculativeInliningPolicy;
 
+// Used in methods which need conditional access to a temporary register.
+// May only be used to allocate a single temporary register.
+class TemporaryRegisterAllocator : public ValueObject {
+ public:
+  virtual ~TemporaryRegisterAllocator() {}
+  virtual Register AllocateTemporary() = 0;
+  virtual void ReleaseTemporary() = 0;
+};
+
+class ConstantTemporaryAllocator : public TemporaryRegisterAllocator {
+ public:
+  explicit ConstantTemporaryAllocator(Register tmp) : tmp_(tmp) {}
+
+  Register AllocateTemporary() override { return tmp_; }
+  void ReleaseTemporary() override {}
+
+ private:
+  Register const tmp_;
+};
+
+class NoTemporaryAllocator : public TemporaryRegisterAllocator {
+ public:
+  Register AllocateTemporary() override { UNREACHABLE(); }
+  void ReleaseTemporary() override { UNREACHABLE(); }
+};
+
 class ParallelMoveResolver : public ValueObject {
  public:
   explicit ParallelMoveResolver(FlowGraphCompiler* compiler);
@@ -50,6 +76,24 @@
     bool spilled_;
   };
 
+  class TemporaryAllocator : public TemporaryRegisterAllocator {
+   public:
+    TemporaryAllocator(ParallelMoveResolver* resolver, Register blocked);
+
+    Register AllocateTemporary() override;
+    void ReleaseTemporary() override;
+    DEBUG_ONLY(bool DidAllocateTemporary() { return allocated_; })
+
+    virtual ~TemporaryAllocator() { ASSERT(reg_ == kNoRegister); }
+
+   private:
+    ParallelMoveResolver* const resolver_;
+    const Register blocked_;
+    Register reg_;
+    bool spilled_;
+    DEBUG_ONLY(bool allocated_ = false);
+  };
+
   class ScratchRegisterScope : public ValueObject {
    public:
     ScratchRegisterScope(ParallelMoveResolver* resolver, Register blocked);
@@ -58,9 +102,8 @@
     Register reg() const { return reg_; }
 
    private:
-    ParallelMoveResolver* resolver_;
+    TemporaryAllocator allocator_;
     Register reg_;
-    bool spilled_;
   };
 
   bool IsScratchLocation(Location loc);
@@ -427,6 +470,9 @@
   // Returns 'true' if regular code generation should be skipped.
   bool TryIntrinsify();
 
+  // Emits code for a generic move from a location 'src' to a location 'dst'.
+  void EmitMove(Location dst, Location src, TemporaryRegisterAllocator* temp);
+
   void GenerateAssertAssignable(TokenPosition token_pos,
                                 intptr_t deopt_id,
                                 const AbstractType& dst_type,
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc b/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
index c53c257..0252b7b 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
@@ -1313,13 +1313,11 @@
 }
 
 #undef __
-#define __ compiler_->assembler()->
+#define __ assembler()->
 
-void ParallelMoveResolver::EmitMove(int index) {
-  MoveOperands* move = moves_[index];
-  const Location source = move->src();
-  const Location destination = move->dest();
-
+void FlowGraphCompiler::EmitMove(Location destination,
+                                 Location source,
+                                 TemporaryRegisterAllocator* allocator) {
   if (source.IsRegister()) {
     if (destination.IsRegister()) {
       __ mov(destination.reg(), Operand(source.reg()));
@@ -1351,24 +1349,32 @@
         __ vmovd(EvenDRegisterOf(destination.fpu_reg()),
                  EvenDRegisterOf(source.fpu_reg()));
       }
+    } else if (destination.IsStackSlot()) {
+      // 32-bit float
+      const intptr_t dest_offset = destination.ToStackSlotOffset();
+      const SRegister src = EvenSRegisterOf(EvenDRegisterOf(source.fpu_reg()));
+      __ StoreSToOffset(src, destination.base_reg(), dest_offset);
+    } else if (destination.IsDoubleStackSlot()) {
+      const intptr_t dest_offset = destination.ToStackSlotOffset();
+      DRegister src = EvenDRegisterOf(source.fpu_reg());
+      __ StoreDToOffset(src, destination.base_reg(), dest_offset);
     } else {
-      if (destination.IsDoubleStackSlot()) {
-        const intptr_t dest_offset = destination.ToStackSlotOffset();
-        DRegister src = EvenDRegisterOf(source.fpu_reg());
-        __ StoreDToOffset(src, destination.base_reg(), dest_offset);
-      } else {
-        ASSERT(destination.IsQuadStackSlot());
-        const intptr_t dest_offset = destination.ToStackSlotOffset();
-        const DRegister dsrc0 = EvenDRegisterOf(source.fpu_reg());
-        __ StoreMultipleDToOffset(dsrc0, 2, destination.base_reg(),
-                                  dest_offset);
-      }
+      ASSERT(destination.IsQuadStackSlot());
+      const intptr_t dest_offset = destination.ToStackSlotOffset();
+      const DRegister dsrc0 = EvenDRegisterOf(source.fpu_reg());
+      __ StoreMultipleDToOffset(dsrc0, 2, destination.base_reg(), dest_offset);
     }
   } else if (source.IsDoubleStackSlot()) {
     if (destination.IsFpuRegister()) {
       const intptr_t source_offset = source.ToStackSlotOffset();
       const DRegister dst = EvenDRegisterOf(destination.fpu_reg());
       __ LoadDFromOffset(dst, source.base_reg(), source_offset);
+    } else if (destination.IsStackSlot()) {
+      // 32-bit float
+      const intptr_t source_offset = source.ToStackSlotOffset();
+      const intptr_t dest_offset = destination.ToStackSlotOffset();
+      __ LoadSFromOffset(STMP, source.base_reg(), source_offset);
+      __ StoreSToOffset(STMP, destination.base_reg(), dest_offset);
     } else {
       ASSERT(destination.IsDoubleStackSlot());
       const intptr_t source_offset = source.ToStackSlotOffset();
@@ -1389,22 +1395,27 @@
       __ LoadMultipleDFromOffset(dtmp0, 2, source.base_reg(), source_offset);
       __ StoreMultipleDToOffset(dtmp0, 2, destination.base_reg(), dest_offset);
     }
+  } else if (source.IsPairLocation()) {
+    ASSERT(destination.IsPairLocation());
+    for (intptr_t i : {0, 1}) {
+      EmitMove(destination.Component(i), source.Component(i), allocator);
+    }
   } else {
     ASSERT(source.IsConstant());
-
     if (destination.IsFpuRegister() || destination.IsDoubleStackSlot() ||
         destination.IsStackSlot()) {
-      ScratchRegisterScope scratch(this, kNoRegister);
-      source.constant_instruction()->EmitMoveToLocation(compiler_, destination,
-                                                        scratch.reg());
+      Register tmp = allocator->AllocateTemporary();
+      source.constant_instruction()->EmitMoveToLocation(this, destination, tmp);
+      allocator->ReleaseTemporary();
     } else {
-      source.constant_instruction()->EmitMoveToLocation(compiler_, destination);
+      source.constant_instruction()->EmitMoveToLocation(this, destination);
     }
   }
-
-  move->Eliminate();
 }
 
+#undef __
+#define __ compiler_->assembler()->
+
 void ParallelMoveResolver::EmitSwap(int index) {
   MoveOperands* move = moves_[index];
   const Location source = move->src();
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc b/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
index 8e76737..0458549 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
@@ -1305,13 +1305,11 @@
 }
 
 #undef __
-#define __ compiler_->assembler()->
+#define __ assembler()->
 
-void ParallelMoveResolver::EmitMove(int index) {
-  MoveOperands* move = moves_[index];
-  const Location source = move->src();
-  const Location destination = move->dest();
-
+void FlowGraphCompiler::EmitMove(Location destination,
+                                 Location source,
+                                 TemporaryRegisterAllocator* allocator) {
   if (source.IsRegister()) {
     if (destination.IsRegister()) {
       __ mov(destination.reg(), source.reg());
@@ -1328,15 +1326,17 @@
       ASSERT(destination.IsStackSlot());
       const intptr_t source_offset = source.ToStackSlotOffset();
       const intptr_t dest_offset = destination.ToStackSlotOffset();
-      ScratchRegisterScope tmp(this, kNoRegister);
-      __ LoadFromOffset(tmp.reg(), source.base_reg(), source_offset);
-      __ StoreToOffset(tmp.reg(), destination.base_reg(), dest_offset);
+      Register tmp = allocator->AllocateTemporary();
+      __ LoadFromOffset(tmp, source.base_reg(), source_offset);
+      __ StoreToOffset(tmp, destination.base_reg(), dest_offset);
+      allocator->ReleaseTemporary();
     }
   } else if (source.IsFpuRegister()) {
     if (destination.IsFpuRegister()) {
       __ vmov(destination.fpu_reg(), source.fpu_reg());
     } else {
-      if (destination.IsDoubleStackSlot()) {
+      if (destination.IsStackSlot() /*32-bit float*/ ||
+          destination.IsDoubleStackSlot()) {
         const intptr_t dest_offset = destination.ToStackSlotOffset();
         VRegister src = source.fpu_reg();
         __ StoreDToOffset(src, destination.base_reg(), dest_offset);
@@ -1353,7 +1353,8 @@
       const VRegister dst = destination.fpu_reg();
       __ LoadDFromOffset(dst, source.base_reg(), source_offset);
     } else {
-      ASSERT(destination.IsDoubleStackSlot());
+      ASSERT(destination.IsDoubleStackSlot() ||
+             destination.IsStackSlot() /*32-bit float*/);
       const intptr_t source_offset = source.ToStackSlotOffset();
       const intptr_t dest_offset = destination.ToStackSlotOffset();
       __ LoadDFromOffset(VTMP, source.base_reg(), source_offset);
@@ -1374,17 +1375,18 @@
   } else {
     ASSERT(source.IsConstant());
     if (destination.IsStackSlot()) {
-      ScratchRegisterScope scratch(this, kNoRegister);
-      source.constant_instruction()->EmitMoveToLocation(compiler_, destination,
-                                                        scratch.reg());
+      Register tmp = allocator->AllocateTemporary();
+      source.constant_instruction()->EmitMoveToLocation(this, destination, tmp);
+      allocator->ReleaseTemporary();
     } else {
-      source.constant_instruction()->EmitMoveToLocation(compiler_, destination);
+      source.constant_instruction()->EmitMoveToLocation(this, destination);
     }
   }
-
-  move->Eliminate();
 }
 
+#undef __
+#define __ compiler_->assembler()->
+
 void ParallelMoveResolver::EmitSwap(int index) {
   MoveOperands* move = moves_[index];
   const Location source = move->src();
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc b/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
index 5fdf8f9..0336895 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
@@ -1174,13 +1174,11 @@
 }
 
 #undef __
-#define __ compiler_->assembler()->
+#define __ assembler()->
 
-void ParallelMoveResolver::EmitMove(int index) {
-  MoveOperands* move = moves_[index];
-  const Location source = move->src();
-  const Location destination = move->dest();
-
+void FlowGraphCompiler::EmitMove(Location destination,
+                                 Location source,
+                                 TemporaryRegisterAllocator* tmp) {
   if (source.IsRegister()) {
     if (destination.IsRegister()) {
       __ movl(destination.reg(), source.reg());
@@ -1193,8 +1191,10 @@
       __ movl(destination.reg(), LocationToStackSlotAddress(source));
     } else {
       ASSERT(destination.IsStackSlot());
-      MoveMemoryToMemory(LocationToStackSlotAddress(destination),
-                         LocationToStackSlotAddress(source));
+      Register scratch = tmp->AllocateTemporary();
+      __ MoveMemoryToMemory(LocationToStackSlotAddress(destination),
+                            LocationToStackSlotAddress(source), scratch);
+      tmp->ReleaseTemporary();
     }
   } else if (source.IsFpuRegister()) {
     if (destination.IsFpuRegister()) {
@@ -1204,6 +1204,9 @@
     } else {
       if (destination.IsDoubleStackSlot()) {
         __ movsd(LocationToStackSlotAddress(destination), source.fpu_reg());
+      } else if (destination.IsStackSlot()) {
+        // 32-bit float
+        __ movss(LocationToStackSlotAddress(destination), source.fpu_reg());
       } else {
         ASSERT(destination.IsQuadStackSlot());
         __ movups(LocationToStackSlotAddress(destination), source.fpu_reg());
@@ -1212,6 +1215,10 @@
   } else if (source.IsDoubleStackSlot()) {
     if (destination.IsFpuRegister()) {
       __ movsd(destination.fpu_reg(), LocationToStackSlotAddress(source));
+    } else if (destination.IsStackSlot()) {
+      // Source holds a 32-bit float; take only the lower 32 bits.
+      __ movss(FpuTMP, LocationToStackSlotAddress(source));
+      __ movss(LocationToStackSlotAddress(destination), FpuTMP);
     } else {
       ASSERT(destination.IsDoubleStackSlot());
       __ movsd(FpuTMP, LocationToStackSlotAddress(source));
@@ -1225,14 +1232,20 @@
       __ movups(FpuTMP, LocationToStackSlotAddress(source));
       __ movups(LocationToStackSlotAddress(destination), FpuTMP);
     }
+  } else if (source.IsPairLocation()) {
+    ASSERT(destination.IsPairLocation());
+    for (intptr_t i : {0, 1}) {
+      EmitMove(destination.Component(i), source.Component(i), tmp);
+    }
   } else {
     ASSERT(source.IsConstant());
-    source.constant_instruction()->EmitMoveToLocation(compiler_, destination);
+    source.constant_instruction()->EmitMoveToLocation(this, destination);
   }
-
-  move->Eliminate();
 }
 
+#undef __
+#define __ compiler_->assembler()->
+
 void ParallelMoveResolver::EmitSwap(int index) {
   MoveOperands* move = moves_[index];
   const Location source = move->src();
@@ -1314,8 +1327,7 @@
 void ParallelMoveResolver::MoveMemoryToMemory(const Address& dst,
                                               const Address& src) {
   ScratchRegisterScope ensure_scratch(this, kNoRegister);
-  __ movl(ensure_scratch.reg(), src);
-  __ movl(dst, ensure_scratch.reg());
+  __ MoveMemoryToMemory(dst, src, ensure_scratch.reg());
 }
 
 void ParallelMoveResolver::Exchange(Register reg, const Address& mem) {
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_x64.cc b/runtime/vm/compiler/backend/flow_graph_compiler_x64.cc
index 46e9f35..91fe976 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_x64.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_x64.cc
@@ -1300,33 +1300,31 @@
 }
 
 #undef __
-#define __ compiler_->assembler()->
+#define __ assembler()->
 
-void ParallelMoveResolver::EmitMove(int index) {
-  MoveOperands* move = moves_[index];
-  const Location source = move->src();
-  const Location destination = move->dest();
+void FlowGraphCompiler::EmitMove(Location destination,
+                                 Location source,
+                                 TemporaryRegisterAllocator* tmp) {
+  if (destination.Equals(source)) return;
 
   if (source.IsRegister()) {
     if (destination.IsRegister()) {
       __ movq(destination.reg(), source.reg());
     } else {
       ASSERT(destination.IsStackSlot());
-      ASSERT((destination.base_reg() != FPREG) ||
-             ((-compiler::target::frame_layout.VariableIndexForFrameSlot(
-                  destination.stack_index())) < compiler_->StackSize()));
       __ movq(LocationToStackSlotAddress(destination), source.reg());
     }
   } else if (source.IsStackSlot()) {
-    ASSERT((source.base_reg() != FPREG) ||
-           ((-compiler::target::frame_layout.VariableIndexForFrameSlot(
-                source.stack_index())) < compiler_->StackSize()));
     if (destination.IsRegister()) {
       __ movq(destination.reg(), LocationToStackSlotAddress(source));
+    } else if (destination.IsFpuRegister()) {
+      // 32-bit float
+      __ movq(TMP, LocationToStackSlotAddress(source));
+      __ movq(destination.fpu_reg(), TMP);
     } else {
       ASSERT(destination.IsStackSlot());
-      MoveMemoryToMemory(LocationToStackSlotAddress(destination),
-                         LocationToStackSlotAddress(source));
+      __ MoveMemoryToMemory(LocationToStackSlotAddress(destination),
+                            LocationToStackSlotAddress(source));
     }
   } else if (source.IsFpuRegister()) {
     if (destination.IsFpuRegister()) {
@@ -1345,7 +1343,8 @@
     if (destination.IsFpuRegister()) {
       __ movsd(destination.fpu_reg(), LocationToStackSlotAddress(source));
     } else {
-      ASSERT(destination.IsDoubleStackSlot());
+      ASSERT(destination.IsDoubleStackSlot() ||
+             destination.IsStackSlot() /*32-bit float*/);
       __ movsd(FpuTMP, LocationToStackSlotAddress(source));
       __ movsd(LocationToStackSlotAddress(destination), FpuTMP);
     }
@@ -1360,17 +1359,19 @@
   } else {
     ASSERT(source.IsConstant());
     if (destination.IsFpuRegister() || destination.IsDoubleStackSlot()) {
-      ScratchRegisterScope scratch(this, kNoRegister);
-      source.constant_instruction()->EmitMoveToLocation(compiler_, destination,
-                                                        scratch.reg());
+      Register scratch = tmp->AllocateTemporary();
+      source.constant_instruction()->EmitMoveToLocation(this, destination,
+                                                        scratch);
+      tmp->ReleaseTemporary();
     } else {
-      source.constant_instruction()->EmitMoveToLocation(compiler_, destination);
+      source.constant_instruction()->EmitMoveToLocation(this, destination);
     }
   }
-
-  move->Eliminate();
 }
 
+#undef __
+#define __ compiler_->assembler()->
+
 void ParallelMoveResolver::EmitSwap(int index) {
   MoveOperands* move = moves_[index];
   const Location source = move->src();
diff --git a/runtime/vm/compiler/backend/il_arm.cc b/runtime/vm/compiler/backend/il_arm.cc
index 15fb9a6..5750130 100644
--- a/runtime/vm/compiler/backend/il_arm.cc
+++ b/runtime/vm/compiler/backend/il_arm.cc
@@ -1002,45 +1002,13 @@
   __ ReserveAlignedFrameSpace(compiler::ffi::NumStackSlots(arg_locations_) *
                               kWordSize);
 
-  // Load a 32-bit argument, or a 32-bit component of a 64-bit argument.
-  auto load_single_slot = [&](Location from, Location to) {
-    if (!to.IsStackSlot()) return;
-    if (from.IsRegister()) {
-      __ str(from.reg(), LocationToStackSlotAddress(to));
-    } else if (from.IsFpuRegister()) {
-      __ vstrs(EvenSRegisterOf(EvenDRegisterOf(from.fpu_reg())),
-               LocationToStackSlotAddress(to));
-    } else if (from.IsStackSlot() || from.IsDoubleStackSlot()) {
-      ASSERT(from.base_reg() == FPREG);
-      __ ldr(TMP, Address(saved_fp, from.ToStackSlotOffset()));
-      __ str(TMP, LocationToStackSlotAddress(to));
-    } else {
-      UNREACHABLE();
-    }
-  };
-
+  FrameRebase rebase(/*old_base=*/FPREG, /*new_base=*/saved_fp,
+                     /*stack_delta=*/0);
   for (intptr_t i = 0, n = NativeArgCount(); i < n; ++i) {
-    Location origin = locs()->in(i);
-    Location target = arg_locations_[i];
-
-    if (target.IsStackSlot()) {
-      load_single_slot(origin, target);
-    } else if (target.IsDoubleStackSlot()) {
-      if (origin.IsFpuRegister()) {
-        __ vstrd(EvenDRegisterOf(origin.fpu_reg()),
-                 LocationToStackSlotAddress(target));
-      } else {
-        ASSERT(origin.IsDoubleStackSlot() && origin.base_reg() == FPREG);
-        __ vldrd(DTMP, Address(saved_fp, origin.ToStackSlotOffset()));
-        __ vstrd(DTMP, LocationToStackSlotAddress(target));
-      }
-    } else if (target.IsPairLocation()) {
-      ASSERT(origin.IsPairLocation());
-      load_single_slot(origin.AsPairLocation()->At(0),
-                       target.AsPairLocation()->At(0));
-      load_single_slot(origin.AsPairLocation()->At(1),
-                       target.AsPairLocation()->At(1));
-    }
+    const Location origin = rebase.Rebase(locs()->in(i));
+    const Location target = arg_locations_[i];
+    NoTemporaryAllocator no_temp;
+    compiler->EmitMove(target, origin, &no_temp);
   }
 
   // We need to copy the return address up into the dummy stack frame so the
diff --git a/runtime/vm/compiler/backend/il_arm64.cc b/runtime/vm/compiler/backend/il_arm64.cc
index 4270030..0af7995 100644
--- a/runtime/vm/compiler/backend/il_arm64.cc
+++ b/runtime/vm/compiler/backend/il_arm64.cc
@@ -894,24 +894,13 @@
   __ ReserveAlignedFrameSpace(compiler::ffi::NumStackSlots(arg_locations_) *
                               kWordSize);
 
+  FrameRebase rebase(/*old_base=*/FPREG, /*new_base=*/saved_fp,
+                     /*stack_delta=*/0);
   for (intptr_t i = 0, n = NativeArgCount(); i < n; ++i) {
-    Location origin = locs()->in(i);
-    Location target = arg_locations_[i];
-
-    if (target.IsStackSlot()) {
-      if (origin.IsRegister()) {
-        __ StoreToOffset(origin.reg(), SPREG, target.ToStackSlotOffset());
-      } else if (origin.IsFpuRegister()) {
-        __ StoreDToOffset(origin.fpu_reg(), SPREG, target.ToStackSlotOffset());
-      } else if (origin.IsStackSlot() || origin.IsDoubleStackSlot()) {
-        // The base register cannot be SPREG because we've moved it.
-        ASSERT(origin.base_reg() == FPREG);
-        __ LoadFromOffset(TMP, saved_fp, origin.ToStackSlotOffset());
-        __ StoreToOffset(TMP, SPREG, target.ToStackSlotOffset());
-      }
-    } else {
-      ASSERT(origin.Equals(target));
-    }
+    const Location origin = rebase.Rebase(locs()->in(i));
+    const Location target = arg_locations_[i];
+    ConstantTemporaryAllocator temp_alloc(temp);
+    compiler->EmitMove(target, origin, &temp_alloc);
   }
 
   // We need to copy a dummy return address up into the dummy stack frame so the
diff --git a/runtime/vm/compiler/backend/il_ia32.cc b/runtime/vm/compiler/backend/il_ia32.cc
index 03ab961..03ef64f 100644
--- a/runtime/vm/compiler/backend/il_ia32.cc
+++ b/runtime/vm/compiler/backend/il_ia32.cc
@@ -846,9 +846,9 @@
 }
 
 void FfiCallInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
-  Register saved_fp = locs()->temp(0).reg();  // volatile
-  Register branch = locs()->in(TargetAddressIndex()).reg();
-  Register tmp = locs()->temp(1).reg();  // callee-saved
+  const Register saved_fp = locs()->temp(0).reg();  // volatile
+  const Register branch = locs()->in(TargetAddressIndex()).reg();
+  const Register tmp = locs()->temp(1).reg();  // callee-saved
 
   // Save frame pointer because we're going to update it when we enter the exit
   // frame.
@@ -866,45 +866,13 @@
     __ andl(SPREG, Immediate(~(OS::ActivationFrameAlignment() - 1)));
   }
 
-  // Load a 32-bit argument, or a 32-bit component of a 64-bit argument.
-  auto load_single_slot = [&](Location from, Location to) {
-    ASSERT(to.IsStackSlot());
-    if (from.IsRegister()) {
-      __ movl(LocationToStackSlotAddress(to), from.reg());
-    } else if (from.IsFpuRegister()) {
-      __ movss(LocationToStackSlotAddress(to), from.fpu_reg());
-    } else if (from.IsStackSlot() || from.IsDoubleStackSlot()) {
-      ASSERT(from.base_reg() == FPREG);
-      __ movl(tmp, Address(saved_fp, from.ToStackSlotOffset()));
-      __ movl(LocationToStackSlotAddress(to), tmp);
-    } else {
-      UNREACHABLE();
-    }
-  };
-
+  FrameRebase rebase(/*old_base=*/FPREG, /*new_base=*/saved_fp,
+                     /*stack_delta=*/0);
   for (intptr_t i = 0, n = NativeArgCount(); i < n; ++i) {
-    Location origin = locs()->in(i);
-    Location target = arg_locations_[i];
-
-    if (target.IsStackSlot()) {
-      load_single_slot(origin, target);
-    } else if (target.IsDoubleStackSlot()) {
-      if (origin.IsFpuRegister()) {
-        __ movsd(LocationToStackSlotAddress(target), origin.fpu_reg());
-      } else {
-        ASSERT(origin.IsDoubleStackSlot() && origin.base_reg() == FPREG);
-        __ movl(tmp, Address(saved_fp, origin.ToStackSlotOffset()));
-        __ movl(LocationToStackSlotAddress(target), tmp);
-        __ movl(tmp, Address(saved_fp, origin.ToStackSlotOffset() + 4));
-        __ movl(Address(SPREG, target.ToStackSlotOffset() + 4), tmp);
-      }
-    } else if (target.IsPairLocation()) {
-      ASSERT(origin.IsPairLocation());
-      load_single_slot(origin.AsPairLocation()->At(0),
-                       target.AsPairLocation()->At(0));
-      load_single_slot(origin.AsPairLocation()->At(1),
-                       target.AsPairLocation()->At(1));
-    }
+    const Location origin = rebase.Rebase(locs()->in(i));
+    const Location target = arg_locations_[i];
+    ConstantTemporaryAllocator tmp_alloc(tmp);
+    compiler->EmitMove(target, origin, &tmp_alloc);
   }
 
   // We need to copy a dummy return address up into the dummy stack frame so the
diff --git a/runtime/vm/compiler/backend/il_x64.cc b/runtime/vm/compiler/backend/il_x64.cc
index 780cf7a..61194b0 100644
--- a/runtime/vm/compiler/backend/il_x64.cc
+++ b/runtime/vm/compiler/backend/il_x64.cc
@@ -902,25 +902,13 @@
     __ andq(SPREG, Immediate(~(OS::ActivationFrameAlignment() - 1)));
   }
 
+  FrameRebase rebase(/*old_base=*/FPREG, /*new_base=*/saved_fp,
+                     /*stack_delta=*/0);
   for (intptr_t i = 0, n = NativeArgCount(); i < n; ++i) {
-    Location origin = locs()->in(i);
-    Location target = arg_locations_[i];
-
-    if (target.IsStackSlot()) {
-      if (origin.IsRegister()) {
-        __ movq(LocationToStackSlotAddress(target), origin.reg());
-      } else if (origin.IsFpuRegister()) {
-        __ movq(TMP, origin.fpu_reg());
-        __ movq(LocationToStackSlotAddress(target), TMP);
-      } else if (origin.IsStackSlot() || origin.IsDoubleStackSlot()) {
-        // The base register cannot be SPREG because we've moved it.
-        ASSERT(origin.base_reg() == FPREG);
-        __ movq(TMP, Address(saved_fp, origin.ToStackSlotOffset()));
-        __ movq(LocationToStackSlotAddress(target), TMP);
-      }
-    } else {
-      ASSERT(origin.Equals(target));
-    }
+    const Location origin = rebase.Rebase(locs()->in(i));
+    const Location target = arg_locations_[i];
+    NoTemporaryAllocator temp;
+    compiler->EmitMove(target, origin, &temp);
   }
 
   // We need to copy a dummy return address up into the dummy stack frame so the
diff --git a/runtime/vm/compiler/backend/locations.h b/runtime/vm/compiler/backend/locations.h
index c37640d..d127790 100644
--- a/runtime/vm/compiler/backend/locations.h
+++ b/runtime/vm/compiler/backend/locations.h
@@ -62,6 +62,8 @@
 static constexpr Representation kUnboxedIntPtr =
     compiler::target::kWordSize == 4 ? kUnboxedInt32 : kUnboxedInt64;
 
+class FrameRebase;
+
 // Location objects are used to connect register allocator and code generator.
 // Instruction templates used by code generator have a corresponding
 // LocationSummary object which specifies expected location for every input
@@ -218,6 +220,11 @@
   TemplatePairLocation<TemplateLocation<Register, FpuRegister>>*
   AsPairLocation() const;
 
+  // For pair locations, returns the ith component (for i in {0, 1}).
+  TemplateLocation<Register, FpuRegister> Component(intptr_t i) const {
+    return AsPairLocation()->At(i);
+  }
+
   // Unallocated locations.
   enum Policy {
     kAny,
@@ -403,11 +410,25 @@
  private:
   explicit TemplateLocation(uword value) : value_(value) {}
 
+  void set_stack_index(intptr_t index) {
+    ASSERT(HasStackIndex());
+    value_ = PayloadField::update(
+        StackIndexField::update(EncodeStackIndex(index), payload()), value_);
+  }
+
+  void set_base_reg(Register reg) {
+    ASSERT(HasStackIndex());
+    value_ = PayloadField::update(StackSlotBaseField::update(reg, payload()),
+                                  value_);
+  }
+
   TemplateLocation(Kind kind, uword payload)
       : value_(KindField::encode(kind) | PayloadField::encode(payload)) {}
 
   uword payload() const { return PayloadField::decode(value_); }
 
+  friend class FrameRebase;
+
   class KindField : public BitField<uword, Kind, kKindBitsPos, kKindBitsSize> {
   };
   class PayloadField
@@ -805,6 +826,36 @@
 #endif
 };
 
+// Describes a change of stack frame where the base register or the stack
+// offset may change. This class allows easily rebasing stack locations across
+// frame manipulations.
+//
+// If the stack offset register matches 'old_base', it is changed to 'new_base'
+// and 'stack_delta' (# of slots) is applied.
+class FrameRebase : public ValueObject {
+ public:
+  FrameRebase(Register old_base, Register new_base, intptr_t stack_delta)
+      : old_base_(old_base), new_base_(new_base), stack_delta_(stack_delta) {}
+
+  Location Rebase(Location loc) {
+    if (loc.IsPairLocation()) {
+      return Location::Pair(Rebase(loc.Component(0)), Rebase(loc.Component(1)));
+    }
+    if (!loc.HasStackIndex() || loc.base_reg() != old_base_) {
+      return loc;
+    }
+
+    loc.set_base_reg(new_base_);
+    loc.set_stack_index(loc.stack_index() + stack_delta_);
+    return loc;
+  }
+
+ private:
+  Register old_base_;
+  Register new_base_;
+  intptr_t stack_delta_;
+};
+
 }  // namespace dart
 
 #endif  // RUNTIME_VM_COMPILER_BACKEND_LOCATIONS_H_