[vm/precomp] Extend support for slow-path code sharing to ARM and ARM64.

Test Plan:

Covered by existing slow-path code sharing tests.

Cq-Include-Trybots: luci.dart.try:vm-kernel-precomp-linux-debug-x64-try;luci.dart.try:vm-kernel-precomp-linux-release-simarm-try;luci.dart.try:vm-kernel-precomp-linux-release-simarm64-try;luci.dart.try:vm-kernel-precomp-linux-release-x64-try;luci.dart.try:vm-kernel-precomp-win-release-x64-try
Change-Id: Iab999e0abfa7677d60ebb259a1d3d128bb49f879
Reviewed-on: https://dart-review.googlesource.com/62903
Commit-Queue: Samir Jindel <sjindel@google.com>
Reviewed-by: Alexander Markov <alexmarkov@google.com>
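
For context before the diff: in precompiled mode, the NullError and stack-overflow slow paths no longer expand a private save-registers/call-runtime/restore sequence at every call site. Instead the call site loads one of two per-Thread entry points (with or without FPU register saving) and jumps to a shared stub that does the saving itself, which is what shrinks AOT code. A minimal standalone sketch of that shape, assuming nothing about the real VM (every name below is an illustrative stand-in for the Thread fields and stubs this CL wires up on ARM/ARM64):

// Illustrative model only, not VM code.
#include <cstdio>

struct Thread {
  // In the VM these are entry-point addresses cached on the Thread and read
  // via ldr from THR; plain function pointers stand in for them here.
  void (*stack_overflow_shared_with_fpu_regs)();
  void (*stack_overflow_shared_without_fpu_regs)();
};

static void WithFpuRegs() { std::puts("stub: save CPU+FPU regs, call runtime"); }
static void WithoutFpuRegs() { std::puts("stub: save CPU regs, call runtime"); }

// Stand-in for the few instructions emitted at each slow-path call site:
// pick the variant by whether any FPU registers are live, then call through
// the thread instead of expanding a private save/call/restore sequence.
void EmitStackOverflowCheckSlowPath(Thread* thread, int live_fpu_regs) {
  (live_fpu_regs > 0 ? thread->stack_overflow_shared_with_fpu_regs
                     : thread->stack_overflow_shared_without_fpu_regs)();
}

int main() {
  Thread thread{WithFpuRegs, WithoutFpuRegs};
  EmitStackOverflowCheckSlowPath(&thread, /*live_fpu_regs=*/0);
  EmitStackOverflowCheckSlowPath(&thread, /*live_fpu_regs=*/2);
}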
diff --git a/runtime/vm/compiler/assembler/assembler_arm.cc b/runtime/vm/compiler/assembler/assembler_arm.cc
index 49bded1..70bee17 100644
--- a/runtime/vm/compiler/assembler/assembler_arm.cc
+++ b/runtime/vm/compiler/assembler/assembler_arm.cc
@@ -6,6 +6,7 @@
 #if defined(TARGET_ARCH_ARM) && !defined(DART_PRECOMPILED_RUNTIME)
 
 #include "vm/compiler/assembler/assembler.h"
+#include "vm/compiler/backend/locations.h"
 #include "vm/cpu.h"
 #include "vm/longjump.h"
 #include "vm/runtime_entry.h"
@@ -2225,6 +2226,70 @@
   ldm(IA_W, SP, regs, cond);
 }
 
+void Assembler::PushRegisters(const RegisterSet& regs) {
+  const intptr_t fpu_regs_count = regs.FpuRegisterCount();
+  if (fpu_regs_count > 0) {
+    AddImmediate(SP, -(fpu_regs_count * kFpuRegisterSize));
+    // Store fpu registers with the lowest register number at the lowest
+    // address.
+    intptr_t offset = 0;
+    mov(TMP, Operand(SP));
+    for (intptr_t i = 0; i < kNumberOfFpuRegisters; ++i) {
+      QRegister fpu_reg = static_cast<QRegister>(i);
+      if (regs.ContainsFpuRegister(fpu_reg)) {
+        DRegister d = EvenDRegisterOf(fpu_reg);
+        ASSERT(d + 1 == OddDRegisterOf(fpu_reg));
+        vstmd(IA_W, TMP, d, 2);
+        offset += kFpuRegisterSize;
+      }
+    }
+    ASSERT(offset == (fpu_regs_count * kFpuRegisterSize));
+  }
+
+  // The order in which the registers are pushed must match the order
+  // in which the registers are encoded in the safe point's stack map.
+  // NOTE: This matches the order of ARM's multi-register push.
+  RegList reg_list = 0;
+  for (intptr_t i = kNumberOfCpuRegisters - 1; i >= 0; --i) {
+    Register reg = static_cast<Register>(i);
+    if (regs.ContainsRegister(reg)) {
+      reg_list |= (1 << reg);
+    }
+  }
+  if (reg_list != 0) {
+    PushList(reg_list);
+  }
+}
+
+void Assembler::PopRegisters(const RegisterSet& regs) {
+  RegList reg_list = 0;
+  for (intptr_t i = kNumberOfCpuRegisters - 1; i >= 0; --i) {
+    Register reg = static_cast<Register>(i);
+    if (regs.ContainsRegister(reg)) {
+      reg_list |= (1 << reg);
+    }
+  }
+  if (reg_list != 0) {
+    PopList(reg_list);
+  }
+
+  const intptr_t fpu_regs_count = regs.FpuRegisterCount();
+  if (fpu_regs_count > 0) {
+    // Fpu registers have the lowest register number at the lowest address.
+    intptr_t offset = 0;
+    for (intptr_t i = 0; i < kNumberOfFpuRegisters; ++i) {
+      QRegister fpu_reg = static_cast<QRegister>(i);
+      if (regs.ContainsFpuRegister(fpu_reg)) {
+        DRegister d = EvenDRegisterOf(fpu_reg);
+        ASSERT(d + 1 == OddDRegisterOf(fpu_reg));
+        vldmd(IA_W, SP, d, 2);
+        offset += kFpuRegisterSize;
+      }
+    }
+    ASSERT(offset == (fpu_regs_count * kFpuRegisterSize));
+  }
+}
+
 void Assembler::MoveRegister(Register rd, Register rm, Condition cond) {
   if (rd != rm) {
     mov(rd, Operand(rm), cond);
@@ -2406,6 +2471,15 @@
   blx(IP);
 }
 
+void Assembler::CallNullErrorShared(bool save_fpu_registers) {
+  uword entry_point_offset =
+      save_fpu_registers
+          ? Thread::null_error_shared_with_fpu_regs_entry_point_offset()
+          : Thread::null_error_shared_without_fpu_regs_entry_point_offset();
+  ldr(LR, Address(THR, entry_point_offset));
+  blx(LR);
+}
+
 void Assembler::BranchLinkWithEquivalence(const StubEntry& stub_entry,
                                           const Object& equivalence) {
   const Code& target = Code::ZoneHandle(stub_entry.code());
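
A note on the NOTE above: PushList emits a single ARM multi-register push (stmdb), and the hardware stores the lowest-numbered register in the mask at the lowest address no matter how the mask was assembled, so OR-ing bits in any iteration order yields a layout the stack map can describe. A small plain-C++ illustration, with invented register numbers:

// Illustrative sketch, not VM code.
#include <cstdint>
#include <cstdio>

using RegList = uint32_t;

int main() {
  // Pretend r0, r4 and r11 are live; only membership matters when building
  // the mask, not the order the bits were OR-ed in.
  const int live[] = {11, 4, 0};
  RegList reg_list = 0;
  for (int reg : live) reg_list |= (1u << reg);

  // After one PushList-style store, the lowest-numbered register sits at the
  // lowest address -- the exact order the safe point's stack map encodes.
  int offset = 0;
  for (int reg = 0; reg < 16; ++reg) {
    if (reg_list & (1u << reg)) {
      std::printf("r%d saved at [SP, #%d]\n", reg, offset);
      offset += 4;  // kWordSize on 32-bit ARM.
    }
  }
}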
diff --git a/runtime/vm/compiler/assembler/assembler_arm.h b/runtime/vm/compiler/assembler/assembler_arm.h
index 1916d62..4de25ee 100644
--- a/runtime/vm/compiler/assembler/assembler_arm.h
+++ b/runtime/vm/compiler/assembler/assembler_arm.h
@@ -22,6 +22,7 @@
 // Forward declarations.
 class RuntimeEntry;
 class StubEntry;
+class RegisterSet;
 
 // Instruction encoding bits.
 enum {
@@ -698,7 +699,7 @@
   void BranchLink(const Code& code, Patchability patchable);
   void BranchLinkToRuntime();
 
-  void CallNullErrorShared(bool save_fpu_registers) { UNREACHABLE(); }
+  void CallNullErrorShared(bool save_fpu_registers);
 
   // Branch and link to an entry address. Call sequence can be patched.
   void BranchLinkPatchable(const StubEntry& stub_entry);
@@ -910,6 +911,9 @@
   void PushList(RegList regs, Condition cond = AL);
   void PopList(RegList regs, Condition cond = AL);
 
+  void PushRegisters(const RegisterSet& regs);
+  void PopRegisters(const RegisterSet& regs);
+
   void CompareRegisters(Register rn, Register rm) { cmp(rn, Operand(rm)); }
   void BranchIf(Condition condition, Label* label) { b(label, condition); }
 
diff --git a/runtime/vm/compiler/assembler/assembler_arm64.cc b/runtime/vm/compiler/assembler/assembler_arm64.cc
index dee219c..cc16da2 100644
--- a/runtime/vm/compiler/assembler/assembler_arm64.cc
+++ b/runtime/vm/compiler/assembler/assembler_arm64.cc
@@ -6,6 +6,7 @@
 #if defined(TARGET_ARCH_ARM64) && !defined(DART_PRECOMPILED_RUNTIME)
 
 #include "vm/compiler/assembler/assembler.h"
+#include "vm/compiler/backend/locations.h"
 #include "vm/cpu.h"
 #include "vm/longjump.h"
 #include "vm/runtime_entry.h"
@@ -655,6 +656,15 @@
   blr(TMP);
 }
 
+void Assembler::CallNullErrorShared(bool save_fpu_registers) {
+  uword entry_point_offset =
+      save_fpu_registers
+          ? Thread::null_error_shared_with_fpu_regs_entry_point_offset()
+          : Thread::null_error_shared_without_fpu_regs_entry_point_offset();
+  ldr(LR, Address(THR, entry_point_offset));
+  blr(LR);
+}
+
 void Assembler::AddImmediate(Register dest, Register rn, int64_t imm) {
   Operand op;
   if (imm == 0) {
@@ -1580,6 +1590,49 @@
   UNIMPLEMENTED();
 }
 
+void Assembler::PushRegisters(const RegisterSet& regs) {
+  const intptr_t fpu_regs_count = regs.FpuRegisterCount();
+  if (fpu_regs_count > 0) {
+    // Store fpu registers with the lowest register number at the lowest
+    // address.
+    for (intptr_t i = kNumberOfVRegisters - 1; i >= 0; --i) {
+      VRegister fpu_reg = static_cast<VRegister>(i);
+      if (regs.ContainsFpuRegister(fpu_reg)) {
+        PushQuad(fpu_reg);
+      }
+    }
+  }
+
+  // The order in which the registers are pushed must match the order
+  // in which the registers are encoded in the safe point's stack map.
+  for (intptr_t i = kNumberOfCpuRegisters - 1; i >= 0; --i) {
+    Register reg = static_cast<Register>(i);
+    if (regs.ContainsRegister(reg)) {
+      Push(reg);
+    }
+  }
+}
+
+void Assembler::PopRegisters(const RegisterSet& regs) {
+  for (intptr_t i = 0; i < kNumberOfCpuRegisters; ++i) {
+    Register reg = static_cast<Register>(i);
+    if (regs.ContainsRegister(reg)) {
+      Pop(reg);
+    }
+  }
+
+  const intptr_t fpu_regs_count = regs.FpuRegisterCount();
+  if (fpu_regs_count > 0) {
+    // Fpu registers have the lowest register number at the lowest address.
+    for (intptr_t i = 0; i < kNumberOfVRegisters; ++i) {
+      VRegister fpu_reg = static_cast<VRegister>(i);
+      if (regs.ContainsFpuRegister(fpu_reg)) {
+        PopQuad(fpu_reg);
+      }
+    }
+  }
+}
+
 }  // namespace dart
 
 #endif  // defined(TARGET_ARCH_ARM64) && !defined(DART_PRECOMPILED_RUNTIME)
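
ARM64 has no multi-register push, hence the per-register loops above. Two details carry the correctness: pushing in descending register order still leaves the lowest-numbered register at the lowest address (as the stack map expects), and PopRegisters walking in ascending order is exactly the inverse traversal, so a LIFO stack hands every value back to the register it came from. A toy round trip with a fake register file:

// Illustrative sketch, not VM code.
#include <cassert>
#include <stack>

int main() {
  const int kNumRegs = 8;
  const bool live[kNumRegs] = {true, false, true, false, false, true, false, true};
  int regs[kNumRegs];
  for (int i = 0; i < kNumRegs; ++i) regs[i] = 100 + i;  // Fake register file.

  std::stack<int> memory;
  for (int i = kNumRegs - 1; i >= 0; --i) {  // Push descending, as on ARM64.
    if (live[i]) memory.push(regs[i]);
  }
  for (int i = 0; i < kNumRegs; ++i) regs[i] = -1;  // Simulate clobbering.
  for (int i = 0; i < kNumRegs; ++i) {  // Pop ascending: the inverse walk.
    if (live[i]) {
      regs[i] = memory.top();
      memory.pop();
    }
  }
  for (int i = 0; i < kNumRegs; ++i) {
    if (live[i]) assert(regs[i] == 100 + i);  // Every live register restored.
  }
}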
diff --git a/runtime/vm/compiler/assembler/assembler_arm64.h b/runtime/vm/compiler/assembler/assembler_arm64.h
index ba5897e..2bcfb14 100644
--- a/runtime/vm/compiler/assembler/assembler_arm64.h
+++ b/runtime/vm/compiler/assembler/assembler_arm64.h
@@ -22,6 +22,7 @@
 // Forward declarations.
 class RuntimeEntry;
 class StubEntry;
+class RegisterSet;
 
 class Immediate : public ValueObject {
  public:
@@ -430,6 +431,9 @@
   void PushRegister(Register r) { Push(r); }
   void PopRegister(Register r) { Pop(r); }
 
+  void PushRegisters(const RegisterSet& registers);
+  void PopRegisters(const RegisterSet& registers);
+
   void Drop(intptr_t stack_elements) {
     add(SP, SP, Operand(stack_elements * kWordSize));
   }
@@ -1379,7 +1383,7 @@
   void BranchLinkPatchable(const StubEntry& stub_entry);
   void BranchLinkToRuntime();
 
-  void CallNullErrorShared(bool save_fpu_registers) { UNREACHABLE(); }
+  void CallNullErrorShared(bool save_fpu_registers);
 
   // Emit a call that shares its object pool entries with other calls
   // that have the same equivalence marker.
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc b/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
index 6233ee3..c536e9c 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
@@ -1137,69 +1137,12 @@
   locs->CheckWritableInputs();
   ClobberDeadTempRegisters(locs);
 #endif
-
   // TODO(vegorov): consider saving only caller save (volatile) registers.
-  const intptr_t fpu_regs_count = locs->live_registers()->FpuRegisterCount();
-  if (fpu_regs_count > 0) {
-    __ AddImmediate(SP, -(fpu_regs_count * kFpuRegisterSize));
-    // Store fpu registers with the lowest register number at the lowest
-    // address.
-    intptr_t offset = 0;
-    __ mov(IP, Operand(SP));
-    for (intptr_t i = 0; i < kNumberOfFpuRegisters; ++i) {
-      QRegister fpu_reg = static_cast<QRegister>(i);
-      if (locs->live_registers()->ContainsFpuRegister(fpu_reg)) {
-        DRegister d = EvenDRegisterOf(fpu_reg);
-        ASSERT(d + 1 == OddDRegisterOf(fpu_reg));
-        __ vstmd(IA_W, IP, d, 2);
-        offset += kFpuRegisterSize;
-      }
-    }
-    ASSERT(offset == (fpu_regs_count * kFpuRegisterSize));
-  }
-
-  // The order in which the registers are pushed must match the order
-  // in which the registers are encoded in the safe point's stack map.
-  // NOTE: This matches the order of ARM's multi-register push.
-  RegList reg_list = 0;
-  for (intptr_t i = kNumberOfCpuRegisters - 1; i >= 0; --i) {
-    Register reg = static_cast<Register>(i);
-    if (locs->live_registers()->ContainsRegister(reg)) {
-      reg_list |= (1 << reg);
-    }
-  }
-  if (reg_list != 0) {
-    __ PushList(reg_list);
-  }
+  __ PushRegisters(*locs->live_registers());
 }
 
 void FlowGraphCompiler::RestoreLiveRegisters(LocationSummary* locs) {
-  RegList reg_list = 0;
-  for (intptr_t i = kNumberOfCpuRegisters - 1; i >= 0; --i) {
-    Register reg = static_cast<Register>(i);
-    if (locs->live_registers()->ContainsRegister(reg)) {
-      reg_list |= (1 << reg);
-    }
-  }
-  if (reg_list != 0) {
-    __ PopList(reg_list);
-  }
-
-  const intptr_t fpu_regs_count = locs->live_registers()->FpuRegisterCount();
-  if (fpu_regs_count > 0) {
-    // Fpu registers have the lowest register number at the lowest address.
-    intptr_t offset = 0;
-    for (intptr_t i = 0; i < kNumberOfFpuRegisters; ++i) {
-      QRegister fpu_reg = static_cast<QRegister>(i);
-      if (locs->live_registers()->ContainsFpuRegister(fpu_reg)) {
-        DRegister d = EvenDRegisterOf(fpu_reg);
-        ASSERT(d + 1 == OddDRegisterOf(fpu_reg));
-        __ vldmd(IA_W, SP, d, 2);
-        offset += kFpuRegisterSize;
-      }
-    }
-    ASSERT(offset == (fpu_regs_count * kFpuRegisterSize));
-  }
+  __ PopRegisters(*locs->live_registers());
 }
 
 #if defined(DEBUG)
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc b/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
index d46f351..15e28ee 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
@@ -1111,48 +1111,12 @@
   locs->CheckWritableInputs();
   ClobberDeadTempRegisters(locs);
 #endif
-
   // TODO(vegorov): consider saving only caller save (volatile) registers.
-  const intptr_t fpu_regs_count = locs->live_registers()->FpuRegisterCount();
-  if (fpu_regs_count > 0) {
-    // Store fpu registers with the lowest register number at the lowest
-    // address.
-    for (intptr_t i = kNumberOfVRegisters - 1; i >= 0; --i) {
-      VRegister fpu_reg = static_cast<VRegister>(i);
-      if (locs->live_registers()->ContainsFpuRegister(fpu_reg)) {
-        __ PushQuad(fpu_reg);
-      }
-    }
-  }
-
-  // The order in which the registers are pushed must match the order
-  // in which the registers are encoded in the safe point's stack map.
-  for (intptr_t i = kNumberOfCpuRegisters - 1; i >= 0; --i) {
-    Register reg = static_cast<Register>(i);
-    if (locs->live_registers()->ContainsRegister(reg)) {
-      __ Push(reg);
-    }
-  }
+  __ PushRegisters(*locs->live_registers());
 }
 
 void FlowGraphCompiler::RestoreLiveRegisters(LocationSummary* locs) {
-  for (intptr_t i = 0; i < kNumberOfCpuRegisters; ++i) {
-    Register reg = static_cast<Register>(i);
-    if (locs->live_registers()->ContainsRegister(reg)) {
-      __ Pop(reg);
-    }
-  }
-
-  const intptr_t fpu_regs_count = locs->live_registers()->FpuRegisterCount();
-  if (fpu_regs_count > 0) {
-    // Fpu registers have the lowest register number at the lowest address.
-    for (intptr_t i = 0; i < kNumberOfVRegisters; ++i) {
-      VRegister fpu_reg = static_cast<VRegister>(i);
-      if (locs->live_registers()->ContainsFpuRegister(fpu_reg)) {
-        __ PopQuad(fpu_reg);
-      }
-    }
-  }
+  __ PopRegisters(*locs->live_registers());
 }
 
 #if defined(DEBUG)
diff --git a/runtime/vm/compiler/backend/il.h b/runtime/vm/compiler/backend/il.h
index ae7d1fa..b92c693 100644
--- a/runtime/vm/compiler/backend/il.h
+++ b/runtime/vm/compiler/backend/il.h
@@ -1011,7 +1011,8 @@
   void Unsupported(FlowGraphCompiler* compiler);
 
   static bool SlowPathSharingSupported(bool is_optimizing) {
-#if defined(TARGET_ARCH_X64)
+#if defined(TARGET_ARCH_X64) || defined(TARGET_ARCH_ARM) ||                    \
+    defined(TARGET_ARCH_ARM64)
     return FLAG_enable_slow_path_sharing && FLAG_precompiled_mode &&
            is_optimizing;
 #else
diff --git a/runtime/vm/compiler/backend/il_arm.cc b/runtime/vm/compiler/backend/il_arm.cc
index 8d16db6..694f8f2 100644
--- a/runtime/vm/compiler/backend/il_arm.cc
+++ b/runtime/vm/compiler/backend/il_arm.cc
@@ -2949,16 +2949,39 @@
     }
     __ Comment("CheckStackOverflowSlowPath");
     __ Bind(entry_label());
-    compiler->SaveLiveRegisters(instruction()->locs());
+    const bool using_shared_stub =
+        instruction()->locs()->call_on_shared_slow_path();
+    if (!using_shared_stub) {
+      compiler->SaveLiveRegisters(instruction()->locs());
+    }
     // pending_deoptimization_env_ is needed to generate a runtime call that
     // may throw an exception.
     ASSERT(compiler->pending_deoptimization_env_ == NULL);
     Environment* env =
         compiler->SlowPathEnvironmentFor(instruction(), kNumSlowPathArgs);
     compiler->pending_deoptimization_env_ = env;
-    compiler->GenerateRuntimeCall(
-        instruction()->token_pos(), instruction()->deopt_id(),
-        kStackOverflowRuntimeEntry, kNumSlowPathArgs, instruction()->locs());
+
+    if (using_shared_stub) {
+      uword entry_point_offset =
+          instruction()->locs()->live_registers()->FpuRegisterCount() > 0
+              ? Thread::stack_overflow_shared_with_fpu_regs_entry_point_offset()
+              : Thread::
+                    stack_overflow_shared_without_fpu_regs_entry_point_offset();
+      ASSERT(instruction()->locs()->temp(1).IsRegister() &&
+             instruction()->locs()->temp(1).reg() == LR);
+      __ ldr(LR, Address(THR, entry_point_offset));
+      __ blx(LR);
+      compiler->RecordSafepoint(instruction()->locs(), kNumSlowPathArgs);
+      compiler->EmitCatchEntryState();
+      compiler->AddDescriptor(
+          RawPcDescriptors::kOther, compiler->assembler()->CodeSize(),
+          instruction()->deopt_id(), instruction()->token_pos(),
+          compiler->CurrentTryIndex());
+    } else {
+      compiler->GenerateRuntimeCall(
+          instruction()->token_pos(), instruction()->deopt_id(),
+          kStackOverflowRuntimeEntry, kNumSlowPathArgs, instruction()->locs());
+    }
 
     if (compiler->isolate()->use_osr() && !compiler->is_optimizing() &&
         instruction()->in_loop()) {
@@ -2968,7 +2991,9 @@
                                      TokenPosition::kNoSource);
     }
     compiler->pending_deoptimization_env_ = NULL;
-    compiler->RestoreLiveRegisters(instruction()->locs());
+    if (!using_shared_stub) {
+      compiler->RestoreLiveRegisters(instruction()->locs());
+    }
     __ b(exit_label());
   }
 
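The same shape is mirrored in il_arm64.cc below, with blr in place of blx. Two points are easy to miss: the shared stub saves and restores all non-reserved registers itself, so the call site must skip SaveLiveRegisters/RestoreLiveRegisters, and the ASSERT about temp(1) relies on the extra LR temp appended in the locations.cc hunk further down, which keeps the register allocator from parking a live value in LR across the call. A toy restatement of that invariant (invented types, not VM code):

#include <cassert>
#include <vector>

enum Register { R0, R1, LR_REG };

struct LocationSummary {
  bool call_on_shared_slow_path = false;
  std::vector<Register> temps;
};

void EmitSharedSlowPathCall(const LocationSummary& locs) {
  if (locs.call_on_shared_slow_path) {
    // Mirrors the diff's ASSERT: the extra temp appended for shared slow
    // paths must be LR, because the call sequence clobbers it.
    assert(!locs.temps.empty() && locs.temps.back() == LR_REG);
    // ... ldr LR, [THR, #entry_point_offset]; blx/blr LR; record safepoint ...
  }
}

int main() {
  LocationSummary locs{/*call_on_shared_slow_path=*/true, {LR_REG}};
  EmitSharedSlowPathCall(locs);
}
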
diff --git a/runtime/vm/compiler/backend/il_arm64.cc b/runtime/vm/compiler/backend/il_arm64.cc
index c6c8498..ee58c12 100644
--- a/runtime/vm/compiler/backend/il_arm64.cc
+++ b/runtime/vm/compiler/backend/il_arm64.cc
@@ -2711,16 +2711,39 @@
     }
     __ Comment("CheckStackOverflowSlowPath");
     __ Bind(entry_label());
-    compiler->SaveLiveRegisters(instruction()->locs());
+    const bool using_shared_stub =
+        instruction()->locs()->call_on_shared_slow_path();
+    if (!using_shared_stub) {
+      compiler->SaveLiveRegisters(instruction()->locs());
+    }
     // pending_deoptimization_env_ is needed to generate a runtime call that
     // may throw an exception.
     ASSERT(compiler->pending_deoptimization_env_ == NULL);
     Environment* env =
         compiler->SlowPathEnvironmentFor(instruction(), kNumSlowPathArgs);
     compiler->pending_deoptimization_env_ = env;
-    compiler->GenerateRuntimeCall(
-        instruction()->token_pos(), instruction()->deopt_id(),
-        kStackOverflowRuntimeEntry, kNumSlowPathArgs, instruction()->locs());
+
+    if (using_shared_stub) {
+      uword entry_point_offset =
+          instruction()->locs()->live_registers()->FpuRegisterCount() > 0
+              ? Thread::stack_overflow_shared_with_fpu_regs_entry_point_offset()
+              : Thread::
+                    stack_overflow_shared_without_fpu_regs_entry_point_offset();
+      ASSERT(instruction()->locs()->temp(1).IsRegister() &&
+             instruction()->locs()->temp(1).reg() == LR);
+      __ ldr(LR, Address(THR, entry_point_offset));
+      __ blr(LR);
+      compiler->RecordSafepoint(instruction()->locs(), kNumSlowPathArgs);
+      compiler->EmitCatchEntryState();
+      compiler->AddDescriptor(
+          RawPcDescriptors::kOther, compiler->assembler()->CodeSize(),
+          instruction()->deopt_id(), instruction()->token_pos(),
+          compiler->CurrentTryIndex());
+    } else {
+      compiler->GenerateRuntimeCall(
+          instruction()->token_pos(), instruction()->deopt_id(),
+          kStackOverflowRuntimeEntry, kNumSlowPathArgs, instruction()->locs());
+    }
 
     if (compiler->isolate()->use_osr() && !compiler->is_optimizing() &&
         instruction()->in_loop()) {
@@ -2730,7 +2753,9 @@
                                      TokenPosition::kNoSource);
     }
     compiler->pending_deoptimization_env_ = NULL;
-    compiler->RestoreLiveRegisters(instruction()->locs());
+    if (!using_shared_stub) {
+      compiler->RestoreLiveRegisters(instruction()->locs());
+    }
     __ b(exit_label());
   }
 
diff --git a/runtime/vm/compiler/backend/il_x64.cc b/runtime/vm/compiler/backend/il_x64.cc
index 92ba643..5c5658f 100644
--- a/runtime/vm/compiler/backend/il_x64.cc
+++ b/runtime/vm/compiler/backend/il_x64.cc
@@ -2636,8 +2636,6 @@
     __ Bind(entry_label());
     const bool using_shared_stub =
         instruction()->locs()->call_on_shared_slow_path();
-    const bool live_fpu_regs =
-        instruction()->locs()->live_registers()->FpuRegisterCount() > 0;
     if (!using_shared_stub) {
       compiler->SaveLiveRegisters(instruction()->locs());
     }
@@ -2650,7 +2648,7 @@
 
     if (using_shared_stub) {
       uword entry_point_offset =
-          live_fpu_regs
+          instruction()->locs()->live_registers()->FpuRegisterCount() > 0
               ? Thread::stack_overflow_shared_with_fpu_regs_entry_point_offset()
               : Thread::
                     stack_overflow_shared_without_fpu_regs_entry_point_offset();
diff --git a/runtime/vm/compiler/backend/locations.cc b/runtime/vm/compiler/backend/locations.cc
index f1435e5..76fb649 100644
--- a/runtime/vm/compiler/backend/locations.cc
+++ b/runtime/vm/compiler/backend/locations.cc
@@ -28,7 +28,11 @@
                                  intptr_t temp_count,
                                  LocationSummary::ContainsCall contains_call)
     : num_inputs_(input_count),
+#if defined(TARGET_ARCH_ARM) || defined(TARGET_ARCH_ARM64)
+      num_temps_(temp_count + (contains_call == kCallOnSharedSlowPath ? 1 : 0)),
+#else
       num_temps_(temp_count),
+#endif
       stack_bitmap_(NULL),
       contains_call_(contains_call),
       live_registers_() {
@@ -37,6 +41,14 @@
 #endif
   input_locations_ = zone->Alloc<Location>(num_inputs_);
   temp_locations_ = zone->Alloc<Location>(num_temps_);
+
+#if defined(TARGET_ARCH_ARM) || defined(TARGET_ARCH_ARM64)
+  if (contains_call == kCallOnSharedSlowPath) {
+    // TODO(sjindel): Mitigate the negative effect on the fast-path of blocking
+    // LR.
+    set_temp(temp_count, Location::RegisterLocation(LR));
+  }
+#endif
 }
 
 LocationSummary* LocationSummary::Make(
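
A minimal model of the constructor change above (invented types): marking a summary kCallOnSharedSlowPath grows the temp array by one and pins the extra, final slot to LR, which is exactly the slot the ARM and ARM64 backends assert on. The price, as the TODO notes, is that the fast path can no longer allocate LR.

// Illustrative sketch, not VM code.
#include <cassert>
#include <vector>

enum ContainsCall { kNoCall, kCall, kCallOnSharedSlowPath };
enum Register { R0, R1, LR_REG };

struct LocationSummary {
  std::vector<Register> temps;
  LocationSummary(int temp_count, ContainsCall contains_call)
      : temps(temp_count + (contains_call == kCallOnSharedSlowPath ? 1 : 0)) {
    if (contains_call == kCallOnSharedSlowPath) {
      // The extra slot is the last one, which is what the backends assert.
      temps.back() = LR_REG;
    }
  }
};

int main() {
  LocationSummary locs(/*temp_count=*/1, kCallOnSharedSlowPath);
  assert(locs.temps.size() == 2 && locs.temps.back() == LR_REG);
}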
diff --git a/runtime/vm/compiler/backend/locations.h b/runtime/vm/compiler/backend/locations.h
index 2d66b54..0c12d62 100644
--- a/runtime/vm/compiler/backend/locations.h
+++ b/runtime/vm/compiler/backend/locations.h
@@ -516,15 +516,17 @@
     ASSERT(kNumberOfFpuRegisters <= (kWordSize * kBitsPerByte));
   }
 
-  void AddAllNonReservedRegisters() {
-    for (intptr_t i = kNumberOfFpuRegisters - 1; i >= 0; --i) {
-      Add(Location::FpuRegisterLocation(static_cast<FpuRegister>(i)));
-    }
-
+  void AddAllNonReservedRegisters(bool include_fpu_registers) {
     for (intptr_t i = kNumberOfCpuRegisters - 1; i >= 0; --i) {
       if (kReservedCpuRegisters & (1 << i)) continue;
       Add(Location::RegisterLocation(static_cast<Register>(i)));
     }
+
+    if (include_fpu_registers) {
+      for (intptr_t i = kNumberOfFpuRegisters - 1; i >= 0; --i) {
+        Add(Location::FpuRegisterLocation(static_cast<FpuRegister>(i)));
+      }
+    }
   }
 
   void Add(Location loc, Representation rep = kTagged) {
diff --git a/runtime/vm/stub_code_arm.cc b/runtime/vm/stub_code_arm.cc
index d9fede6..555fb3a 100644
--- a/runtime/vm/stub_code_arm.cc
+++ b/runtime/vm/stub_code_arm.cc
@@ -112,26 +112,92 @@
   __ Ret();
 }
 
+void StubCode::GenerateSharedStub(Assembler* assembler,
+                                  bool save_fpu_registers,
+                                  const RuntimeEntry* target,
+                                  intptr_t self_code_stub_offset_from_thread,
+                                  bool allow_return) {
+  __ Push(LR);
+
+  // We want the saved registers to appear like part of the caller's frame, so
+  // we push them before calling EnterStubFrame.
+  //
+  // TODO(sjindel): We could skip saving LR (and thus remove one bit from the
+  // stackmap of the callsite), but this would add ARM-specific complexity to
+  // FlowGraphCompiler::RecordSafepoint and
+  // FlowGraphCompiler::SlowPathEnvironmentFor.
+  RegisterSet all_registers;
+  all_registers.AddAllNonReservedRegisters(save_fpu_registers);
+  __ PushRegisters(all_registers);
+
+  const intptr_t kSavedCpuRegisterSlots =
+      Utils::CountOneBitsWord(kDartAvailableCpuRegs);
+
+  const intptr_t kSavedFpuRegisterSlots =
+      save_fpu_registers ? kNumberOfFpuRegisters * kFpuRegisterSize / kWordSize
+                         : 0;
+
+  const intptr_t kAllSavedRegistersSlots =
+      kSavedCpuRegisterSlots + kSavedFpuRegisterSlots;
+
+  // Copy down the return address so the stack layout is correct.
+  __ ldr(TMP, Address(SPREG, kAllSavedRegistersSlots * kWordSize));
+  __ Push(TMP);
+
+  __ ldr(CODE_REG, Address(THR, self_code_stub_offset_from_thread));
+
+  __ EnterStubFrame();
+
+  __ ldr(CODE_REG, Address(THR, Thread::call_to_runtime_stub_offset()));
+  __ ldr(R9, Address(THR, Thread::OffsetFromThread(target)));
+  __ mov(R4, Operand(/*argument_count=*/0));
+  __ ldr(TMP, Address(THR, Thread::call_to_runtime_entry_point_offset()));
+  __ blx(TMP);
+
+  if (!allow_return) {
+    __ Breakpoint();
+    return;
+  }
+  __ LeaveStubFrame();
+
+  // Drop "official" return address -- we can just use the one stored above the
+  // saved registers.
+  __ Drop(1);
+
+  __ PopRegisters(all_registers);
+
+  __ Pop(LR);
+  __ bx(LR);
+}
+
 void StubCode::GenerateNullErrorSharedWithoutFPURegsStub(Assembler* assembler) {
-  // TODO(sjindel): implement.
-  __ Breakpoint();
+  GenerateSharedStub(assembler, /*save_fpu_registers=*/false,
+                     &kNullErrorRuntimeEntry,
+                     Thread::null_error_shared_without_fpu_regs_stub_offset(),
+                     /*allow_return=*/false);
 }
 
 void StubCode::GenerateNullErrorSharedWithFPURegsStub(Assembler* assembler) {
-  // TODO(sjindel): implement.
-  __ Breakpoint();
+  GenerateSharedStub(assembler, /*save_fpu_registers=*/true,
+                     &kNullErrorRuntimeEntry,
+                     Thread::null_error_shared_with_fpu_regs_stub_offset(),
+                     /*allow_return=*/false);
 }
 
 void StubCode::GenerateStackOverflowSharedWithoutFPURegsStub(
     Assembler* assembler) {
-  // TODO(sjindel): implement.
-  __ Breakpoint();
+  GenerateSharedStub(
+      assembler, /*save_fpu_registers=*/false, &kStackOverflowRuntimeEntry,
+      Thread::stack_overflow_shared_without_fpu_regs_stub_offset(),
+      /*allow_return=*/true);
 }
 
 void StubCode::GenerateStackOverflowSharedWithFPURegsStub(
     Assembler* assembler) {
-  // TODO(sjindel): implement.
-  __ Breakpoint();
+  GenerateSharedStub(assembler, /*save_fpu_registers=*/true,
+                     &kStackOverflowRuntimeEntry,
+                     Thread::stack_overflow_shared_with_fpu_regs_stub_offset(),
+                     /*allow_return=*/true);
 }
 
 // Input parameters:
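
The fiddly part of GenerateSharedStub is the slot arithmetic: the stub pushes LR, then the full non-reserved register set, then a copy of the return address fetched from above the register block, so EnterStubFrame sees a normal-looking frame while the saved registers read as part of the caller's frame in the stack map. This only works because AddAllNonReservedRegisters pushes exactly the registers that kSavedCpuRegisterSlots counts, so the two must stay in sync. A compile-time restatement of the arithmetic, with placeholder constants (the real values come from popcount(kDartAvailableCpuRegs) and each target's FPU register file):

// Placeholder constants -- illustrative only, not the real target values.
#include <cstdio>

constexpr int kWordSize = 4;           // 32-bit ARM.
constexpr int kFpuRegisterSize = 16;   // One Q register.
constexpr int kNumberOfFpuRegisters = 16;
constexpr int kAvailableCpuRegs = 10;  // Stand-in for popcount(kDartAvailableCpuRegs).

constexpr int SavedRegisterSlots(bool save_fpu_registers) {
  const int cpu_slots = kAvailableCpuRegs;
  const int fpu_slots =
      save_fpu_registers ? kNumberOfFpuRegisters * kFpuRegisterSize / kWordSize
                         : 0;
  return cpu_slots + fpu_slots;
}

int main() {
  // The caller's return address sits this many slots above SP after the
  // pushes, which is exactly where the stub re-reads it before EnterStubFrame.
  std::printf("slots (no FPU): %d\n", SavedRegisterSlots(false));
  std::printf("slots (FPU):    %d\n", SavedRegisterSlots(true));
}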
diff --git a/runtime/vm/stub_code_arm64.cc b/runtime/vm/stub_code_arm64.cc
index 7610428..12f4302 100644
--- a/runtime/vm/stub_code_arm64.cc
+++ b/runtime/vm/stub_code_arm64.cc
@@ -131,24 +131,87 @@
   __ ret();
 }
 
+void StubCode::GenerateSharedStub(Assembler* assembler,
+                                  bool save_fpu_registers,
+                                  const RuntimeEntry* target,
+                                  intptr_t self_code_stub_offset_from_thread,
+                                  bool allow_return) {
+  __ Push(LR);
+
+  // We want the saved registers to appear like part of the caller's frame, so
+  // we push them before calling EnterStubFrame.
+  RegisterSet all_registers;
+  all_registers.AddAllNonReservedRegisters(save_fpu_registers);
+  __ PushRegisters(all_registers);
+
+  const intptr_t kSavedCpuRegisterSlots =
+      Utils::CountOneBitsWord(kDartAvailableCpuRegs);
+
+  const intptr_t kSavedFpuRegisterSlots =
+      save_fpu_registers ? kNumberOfFpuRegisters * kFpuRegisterSize / kWordSize
+                         : 0;
+
+  const intptr_t kAllSavedRegistersSlots =
+      kSavedCpuRegisterSlots + kSavedFpuRegisterSlots;
+
+  // Copy down the return address so the stack layout is correct.
+  __ ldr(TMP, Address(SPREG, kAllSavedRegistersSlots * kWordSize));
+  __ Push(TMP);
+
+  __ ldr(CODE_REG, Address(THR, self_code_stub_offset_from_thread));
+
+  __ EnterStubFrame();
+
+  __ ldr(CODE_REG, Address(THR, Thread::call_to_runtime_stub_offset()));
+  __ ldr(R5, Address(THR, Thread::OffsetFromThread(target)));
+  __ LoadImmediate(R4, /*argument_count=*/0);
+  __ ldr(TMP, Address(THR, Thread::call_to_runtime_entry_point_offset()));
+  __ blr(TMP);
+
+  if (!allow_return) {
+    __ Breakpoint();
+    return;
+  }
+  __ LeaveStubFrame();
+
+  // Drop "official" return address -- we can just use the one stored above the
+  // saved registers.
+  __ Drop(1);
+
+  __ PopRegisters(all_registers);
+
+  __ Pop(LR);
+  __ ret(LR);
+}
+
 void StubCode::GenerateNullErrorSharedWithoutFPURegsStub(Assembler* assembler) {
-  __ Breakpoint();
+  GenerateSharedStub(assembler, /*save_fpu_registers=*/false,
+                     &kNullErrorRuntimeEntry,
+                     Thread::null_error_shared_without_fpu_regs_stub_offset(),
+                     /*allow_return=*/false);
 }
 
 void StubCode::GenerateNullErrorSharedWithFPURegsStub(Assembler* assembler) {
-  __ Breakpoint();
+  GenerateSharedStub(assembler, /*save_fpu_registers=*/true,
+                     &kNullErrorRuntimeEntry,
+                     Thread::null_error_shared_with_fpu_regs_stub_offset(),
+                     /*allow_return=*/false);
 }
 
 void StubCode::GenerateStackOverflowSharedWithoutFPURegsStub(
     Assembler* assembler) {
-  // TODO(sjindel): implement.
-  __ Breakpoint();
+  GenerateSharedStub(
+      assembler, /*save_fpu_registers=*/false, &kStackOverflowRuntimeEntry,
+      Thread::stack_overflow_shared_without_fpu_regs_stub_offset(),
+      /*allow_return=*/true);
 }
 
 void StubCode::GenerateStackOverflowSharedWithFPURegsStub(
     Assembler* assembler) {
-  // TODO(sjindel): implement.
-  __ Breakpoint();
+  GenerateSharedStub(assembler, /*save_fpu_registers=*/true,
+                     &kStackOverflowRuntimeEntry,
+                     Thread::stack_overflow_shared_with_fpu_regs_stub_offset(),
+                     /*allow_return=*/true);
 }
 
 void StubCode::GeneratePrintStopMessageStub(Assembler* assembler) {