[VM/AOT] Fix shared spilling stubs on arm/arm64 and enable it

This reduces flutter_gallery_total_size slightly less than 1%

Change-Id: I44f670f693ccc97d1140673ad8559f70e460c6d2
Reviewed-on: https://dart-review.googlesource.com/c/89146
Reviewed-by: Vyacheslav Egorov <vegorov@google.com>
Commit-Queue: Martin Kustermann <kustermann@google.com>
diff --git a/runtime/vm/compiler/backend/il_arm.cc b/runtime/vm/compiler/backend/il_arm.cc
index 3bc424f..94c7519 100644
--- a/runtime/vm/compiler/backend/il_arm.cc
+++ b/runtime/vm/compiler/backend/il_arm.cc
@@ -2988,8 +2988,12 @@
                                                               bool opt) const {
   const intptr_t kNumInputs = 0;
   const intptr_t kNumTemps = 1;
-  LocationSummary* summary = new (zone) LocationSummary(
-      zone, kNumInputs, kNumTemps, LocationSummary::kCallOnSlowPath);
+  const bool using_shared_stub = UseSharedSlowPathStub(opt);
+  ASSERT((kReservedCpuRegisters & (1 << LR)) != 0);
+  LocationSummary* summary = new (zone)
+      LocationSummary(zone, kNumInputs, kNumTemps,
+                      using_shared_stub ? LocationSummary::kCallOnSharedSlowPath
+                                        : LocationSummary::kCallOnSlowPath);
   summary->set_temp(0, Location::RequiresRegister());
   return summary;
 }
@@ -3030,7 +3034,6 @@
               ? Thread::stack_overflow_shared_with_fpu_regs_entry_point_offset()
               : Thread::
                     stack_overflow_shared_without_fpu_regs_entry_point_offset();
-      ASSERT(kReservedCpuRegisters & (1 << LR));
       __ ldr(LR, Address(THR, entry_point_offset));
       __ blx(LR);
       compiler->RecordSafepoint(instruction()->locs(), kNumSlowPathArgs);
diff --git a/runtime/vm/compiler/backend/il_arm64.cc b/runtime/vm/compiler/backend/il_arm64.cc
index a3a7868..9c03a18 100644
--- a/runtime/vm/compiler/backend/il_arm64.cc
+++ b/runtime/vm/compiler/backend/il_arm64.cc
@@ -2676,8 +2676,12 @@
                                                               bool opt) const {
   const intptr_t kNumInputs = 0;
   const intptr_t kNumTemps = 1;
-  LocationSummary* summary = new (zone) LocationSummary(
-      zone, kNumInputs, kNumTemps, LocationSummary::kCallOnSlowPath);
+  const bool using_shared_stub = UseSharedSlowPathStub(opt);
+  ASSERT((kReservedCpuRegisters & (1 << LR)) != 0);
+  LocationSummary* summary = new (zone)
+      LocationSummary(zone, kNumInputs, kNumTemps,
+                      using_shared_stub ? LocationSummary::kCallOnSharedSlowPath
+                                        : LocationSummary::kCallOnSlowPath);
   summary->set_temp(0, Location::RequiresRegister());
   return summary;
 }
@@ -2718,8 +2722,6 @@
               ? Thread::stack_overflow_shared_with_fpu_regs_entry_point_offset()
               : Thread::
                     stack_overflow_shared_without_fpu_regs_entry_point_offset();
-      ASSERT(instruction()->locs()->temp(1).IsRegister() &&
-             instruction()->locs()->temp(1).reg() == LR);
       __ ldr(LR, Address(THR, entry_point_offset));
       __ blr(LR);
       compiler->RecordSafepoint(instruction()->locs(), kNumSlowPathArgs);
diff --git a/runtime/vm/stub_code_arm.cc b/runtime/vm/stub_code_arm.cc
index 5a4ce97..6709e31 100644
--- a/runtime/vm/stub_code_arm.cc
+++ b/runtime/vm/stub_code_arm.cc
@@ -124,30 +124,17 @@
                                   const RuntimeEntry* target,
                                   intptr_t self_code_stub_offset_from_thread,
                                   bool allow_return) {
-  __ Push(LR);
-
   // We want the saved registers to appear like part of the caller's frame, so
   // we push them before calling EnterStubFrame.
   RegisterSet all_registers;
   all_registers.AddAllNonReservedRegisters(save_fpu_registers);
+
+  // To make the stack map calculation architecture independent we do the same
+  // as on intel.
+  __ Push(LR);
+
   __ PushRegisters(all_registers);
-
-  const intptr_t kSavedCpuRegisterSlots =
-      Utils::CountOneBitsWord(kDartAvailableCpuRegs);
-
-  const intptr_t kSavedFpuRegisterSlots =
-      save_fpu_registers ? kNumberOfFpuRegisters * kFpuRegisterSize / kWordSize
-                         : 0;
-
-  const intptr_t kAllSavedRegistersSlots =
-      kSavedCpuRegisterSlots + kSavedFpuRegisterSlots;
-
-  // Copy down the return address so the stack layout is correct.
-  __ ldr(TMP, Address(SPREG, kAllSavedRegistersSlots * kWordSize));
-  __ Push(TMP);
-
   __ ldr(CODE_REG, Address(THR, self_code_stub_offset_from_thread));
-
   __ EnterStubFrame();
   __ CallRuntime(*target, /*argument_count=*/0);
   if (!allow_return) {
@@ -155,13 +142,7 @@
     return;
   }
   __ LeaveStubFrame();
-
-  // Drop "official" return address -- we can just use the one stored above the
-  // saved registers.
-  __ Drop(1);
-
   __ PopRegisters(all_registers);
-
   __ Pop(LR);
   __ bx(LR);
 }
diff --git a/runtime/vm/stub_code_arm64.cc b/runtime/vm/stub_code_arm64.cc
index 3e5928f..182a355f 100644
--- a/runtime/vm/stub_code_arm64.cc
+++ b/runtime/vm/stub_code_arm64.cc
@@ -148,30 +148,16 @@
                                   const RuntimeEntry* target,
                                   intptr_t self_code_stub_offset_from_thread,
                                   bool allow_return) {
-  __ Push(LR);
-
   // We want the saved registers to appear like part of the caller's frame, so
   // we push them before calling EnterStubFrame.
   RegisterSet all_registers;
   all_registers.AddAllNonReservedRegisters(save_fpu_registers);
+
+  // To make the stack map calculation architecture independent we do the same
+  // as on intel.
+  __ Push(LR);
   __ PushRegisters(all_registers);
-
-  const intptr_t kSavedCpuRegisterSlots =
-      Utils::CountOneBitsWord(kDartAvailableCpuRegs);
-
-  const intptr_t kSavedFpuRegisterSlots =
-      save_fpu_registers ? kNumberOfFpuRegisters * kFpuRegisterSize / kWordSize
-                         : 0;
-
-  const intptr_t kAllSavedRegistersSlots =
-      kSavedCpuRegisterSlots + kSavedFpuRegisterSlots;
-
-  // Copy down the return address so the stack layout is correct.
-  __ ldr(TMP, Address(SPREG, kAllSavedRegistersSlots * kWordSize));
-  __ Push(TMP);
-
   __ ldr(CODE_REG, Address(THR, self_code_stub_offset_from_thread));
-
   __ EnterStubFrame();
   __ CallRuntime(*target, /*argument_count=*/0);
   if (!allow_return) {
@@ -179,13 +165,7 @@
     return;
   }
   __ LeaveStubFrame();
-
-  // Drop "official" return address -- we can just use the one stored above the
-  // saved registers.
-  __ Drop(1);
-
   __ PopRegisters(all_registers);
-
   __ Pop(LR);
   __ ret(LR);
 }