[vm/compiler] Fix unwinding issues

We used to emit an unwinding rule for caller's SP, however
this does not really work as we expected it to work. Instead there is
an unwritten rule that caller's SP is equal to the CFA which overwrites
our unwinding rule (at least in libunwindstack library used by
Android).

This leads to unwinding issues on the boundary between C++ and Dart:
when we unwind through the InvokeDartCode stub frame we arrive to
C++ code with SP equal to callee's CFA, which is set to FP by our
unwinding rules. In this case if C++ code does not use FP based frames
we will fail to unwind C++ frame property - because SP will be
incorrect. This manifest in simpleperf profiles as stacks which
abruptly truncate at DartEntry::InvokeCode.

This CL rewrites unwinding rules to fix this: instead of using
FP as CFA we define CFA to be caller's SP (FP+2*kWordSize). This ensures
that unwinding on the boundary between C++ and Dart works correctly:
as CFA will always be consistently set to caller's SP.

Additionally this CL fixes InvokeDartCode stub to remove a value
that this stub pushes above saved return address in PRODUCT mode.

This value acts as a profiler marker used by builtin profiler
which is only included in non-PRODUCT builds - but the presence of this
value interfers with unwinding (caller's SP will be off by 1 due the
presence of this value).

We could produce custom unwinding information for this stub instead, but
just removing this value in PRODUCT builds should solve the problem
as we usually only profile Flutter release builds with simpleperf.

Fixes b/220804295

TEST=manually profiled a Flutter app and confirmed correct flamegraph.

Change-Id: I2094afaab6e54e89625c0b5a89aebc6b9823d67c
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/235226
Reviewed-by: Martin Kustermann <kustermann@google.com>
Commit-Queue: Slava Egorov <vegorov@google.com>
diff --git a/runtime/vm/compiler/stub_code_compiler_arm.cc b/runtime/vm/compiler/stub_code_compiler_arm.cc
index a37565a..3c65323 100644
--- a/runtime/vm/compiler/stub_code_compiler_arm.cc
+++ b/runtime/vm/compiler/stub_code_compiler_arm.cc
@@ -1195,7 +1195,8 @@
 //   R2 : arguments array.
 //   R3 : current thread.
 void StubCodeCompiler::GenerateInvokeDartCodeStub(Assembler* assembler) {
-  READS_RETURN_ADDRESS_FROM_LR(__ Push(LR));  // Marker for the profiler.
+  // Marker for the profiler.
+  NOT_IN_PRODUCT(READS_RETURN_ADDRESS_FROM_LR(__ Push(LR)));
   SPILLS_LR_TO_FRAME(__ EnterFrame((1 << FP) | (1 << LR), 0));
 
   // Push code object to PC marker slot.
@@ -1318,7 +1319,7 @@
 
   // Restore the frame pointer and return.
   RESTORES_LR_FROM_FRAME(__ LeaveFrame((1 << FP) | (1 << LR)));
-  __ Drop(1);
+  NOT_IN_PRODUCT(__ Drop(1));  // Drop profiler marker.
   __ Ret();
 }
 
diff --git a/runtime/vm/compiler/stub_code_compiler_arm64.cc b/runtime/vm/compiler/stub_code_compiler_arm64.cc
index 6dc4dbe..e32707b 100644
--- a/runtime/vm/compiler/stub_code_compiler_arm64.cc
+++ b/runtime/vm/compiler/stub_code_compiler_arm64.cc
@@ -1365,7 +1365,8 @@
   // from over-writing Dart frames.
   __ mov(SP, CSP);
   __ SetupCSPFromThread(R3);
-  READS_RETURN_ADDRESS_FROM_LR(__ Push(LR));  // Marker for the profiler.
+  // Marker for the profiler.
+  NOT_IN_PRODUCT(READS_RETURN_ADDRESS_FROM_LR(__ Push(LR)));
   __ EnterFrame(0);
 
   // Push code object to PC marker slot.
@@ -1488,7 +1489,7 @@
 
   // Restore the frame pointer and C stack pointer and return.
   __ LeaveFrame();
-  __ Drop(1);
+  NOT_IN_PRODUCT(__ Drop(1));  // Drop profiler marker.
   __ RestoreCSP();
   __ ret();
 }
diff --git a/runtime/vm/compiler/stub_code_compiler_ia32.cc b/runtime/vm/compiler/stub_code_compiler_ia32.cc
index 68b3b4c..52f6ed0 100644
--- a/runtime/vm/compiler/stub_code_compiler_ia32.cc
+++ b/runtime/vm/compiler/stub_code_compiler_ia32.cc
@@ -969,7 +969,7 @@
   const intptr_t kArgumentsOffset = 5 * target::kWordSize;
   const intptr_t kThreadOffset = 6 * target::kWordSize;
 
-  __ pushl(Address(ESP, 0));  // Marker for the profiler.
+  NOT_IN_PRODUCT(__ pushl(Address(ESP, 0)));  // Marker for the profiler.
   __ EnterFrame(0);
 
   // Push code object to PC marker slot.
@@ -1084,7 +1084,7 @@
 
   // Restore the frame pointer.
   __ LeaveFrame();
-  __ popl(ECX);
+  NOT_IN_PRODUCT(__ popl(ECX));  // Drop profiler marker.
 
   __ ret();
 }
diff --git a/runtime/vm/compiler/stub_code_compiler_riscv.cc b/runtime/vm/compiler/stub_code_compiler_riscv.cc
index 14dd9f3..f4a17da 100644
--- a/runtime/vm/compiler/stub_code_compiler_riscv.cc
+++ b/runtime/vm/compiler/stub_code_compiler_riscv.cc
@@ -1276,7 +1276,7 @@
 void StubCodeCompiler::GenerateInvokeDartCodeStub(Assembler* assembler) {
   __ Comment("InvokeDartCodeStub");
 
-  __ PushRegister(RA);  // Marker for the profiler.
+  NOT_IN_PRODUCT(__ PushRegister(RA));  // Marker for the profiler.
   __ EnterFrame(0);
 
   // Push code object to PC marker slot.
@@ -1396,7 +1396,7 @@
 
   // Restore the frame pointer and C stack pointer and return.
   __ LeaveFrame();
-  __ Drop(1);
+  __ NOT_IN_PRODUCT(__ Drop(1));  // Drop profiler marker.
   __ ret();
 }
 
diff --git a/runtime/vm/compiler/stub_code_compiler_x64.cc b/runtime/vm/compiler/stub_code_compiler_x64.cc
index de42236..0268696 100644
--- a/runtime/vm/compiler/stub_code_compiler_x64.cc
+++ b/runtime/vm/compiler/stub_code_compiler_x64.cc
@@ -1375,7 +1375,7 @@
 //   RDX : arguments array.
 //   RCX : current thread.
 void StubCodeCompiler::GenerateInvokeDartCodeStub(Assembler* assembler) {
-  __ pushq(Address(RSP, 0));  // Marker for the profiler.
+  NOT_IN_PRODUCT(__ pushq(Address(RSP, 0)));  // Marker for the profiler.
   __ EnterFrame(0);
 
   const Register kTargetReg = CallingConventions::kArg1Reg;
@@ -1518,7 +1518,7 @@
 
   // Restore the frame pointer.
   __ LeaveFrame();
-  __ popq(RCX);
+  NOT_IN_PRODUCT(__ popq(RCX));  // Drop profiler marker.
 
   __ ret();
 }
diff --git a/runtime/vm/elf.cc b/runtime/vm/elf.cc
index 77d6949..7f87db9 100644
--- a/runtime/vm/elf.cc
+++ b/runtime/vm/elf.cc
@@ -1373,7 +1373,7 @@
 
   // Multiplier which will be used to scale operands of DW_CFA_offset and
   // DW_CFA_val_offset.
-  const intptr_t kDataAlignment = compiler::target::kWordSize;
+  const intptr_t kDataAlignment = -compiler::target::kWordSize;
 
   static const uint8_t DW_EH_PE_pcrel = 0x10;
   static const uint8_t DW_EH_PE_sdata4 = 0x0b;
@@ -1395,12 +1395,20 @@
         ConcreteRegister(LINK_REGISTER));  // Return address register
     dwarf_stream.uleb128(1);               // Augmentation size
     dwarf_stream.u1(DW_EH_PE_pcrel | DW_EH_PE_sdata4);  // FDE encoding.
-    // CFA is FP+0
+    // CFA is caller's SP (FP+kCallerSpSlotFromFp*kWordSize)
     dwarf_stream.u1(Dwarf::DW_CFA_def_cfa);
     dwarf_stream.uleb128(FP);
-    dwarf_stream.uleb128(0);
+    dwarf_stream.uleb128(kCallerSpSlotFromFp * compiler::target::kWordSize);
   });
 
+  // Emit rule defining that |reg| value is stored at CFA+offset.
+  const auto cfa_offset = [&](Register reg, intptr_t offset) {
+    const intptr_t scaled_offset = offset / kDataAlignment;
+    RELEASE_ASSERT(scaled_offset >= 0);
+    dwarf_stream.u1(Dwarf::DW_CFA_offset | reg);
+    dwarf_stream.uleb128(scaled_offset);
+  };
+
   // Emit an FDE covering each .text section.
   for (const auto& portion : text_section->portions()) {
     ASSERT(portion.symbol_name != nullptr);  // Needed for relocations.
@@ -1413,27 +1421,17 @@
       dwarf_stream.u4(portion.size);           // Size.
       dwarf_stream.u1(0);                      // Augmentation Data length.
 
-      // FP at FP+kSavedCallerPcSlotFromFp*kWordSize
-      COMPILE_ASSERT(kSavedCallerFpSlotFromFp >= 0);
-      dwarf_stream.u1(Dwarf::DW_CFA_offset | FP);
-      dwarf_stream.uleb128(kSavedCallerFpSlotFromFp);
+      // Caller FP at FP+kSavedCallerPcSlotFromFp*kWordSize,
+      // where FP is CFA - kCallerSpSlotFromFp*kWordSize.
+      COMPILE_ASSERT((kSavedCallerFpSlotFromFp - kCallerSpSlotFromFp) <= 0);
+      cfa_offset(FP,
+                 (kSavedCallerFpSlotFromFp - kCallerSpSlotFromFp) * kWordSize);
 
-      // LR at FP+kSavedCallerPcSlotFromFp*kWordSize
-      COMPILE_ASSERT(kSavedCallerPcSlotFromFp >= 0);
-      dwarf_stream.u1(Dwarf::DW_CFA_offset | ConcreteRegister(LINK_REGISTER));
-      dwarf_stream.uleb128(kSavedCallerPcSlotFromFp);
-
-      // SP is FP+kCallerSpSlotFromFp*kWordSize
-      COMPILE_ASSERT(kCallerSpSlotFromFp >= 0);
-      dwarf_stream.u1(Dwarf::DW_CFA_val_offset);
-#if defined(TARGET_ARCH_ARM64)
-      dwarf_stream.uleb128(ConcreteRegister(CSP));
-#elif defined(TARGET_ARCH_ARM)
-      dwarf_stream.uleb128(SP);
-#else
-#error "Unsupported .eh_frame architecture"
-#endif
-      dwarf_stream.uleb128(kCallerSpSlotFromFp);
+      // Caller LR at FP+kSavedCallerPcSlotFromFp*kWordSize,
+      // where FP is CFA - kCallerSpSlotFromFp*kWordSize
+      COMPILE_ASSERT((kSavedCallerPcSlotFromFp - kCallerSpSlotFromFp) <= 0);
+      cfa_offset(ConcreteRegister(LINK_REGISTER),
+                 (kSavedCallerPcSlotFromFp - kCallerSpSlotFromFp) * kWordSize);
     });
   }
 
diff --git a/runtime/vm/image_snapshot.cc b/runtime/vm/image_snapshot.cc
index 3661d07..4528f28 100644
--- a/runtime/vm/image_snapshot.cc
+++ b/runtime/vm/image_snapshot.cc
@@ -1294,97 +1294,31 @@
   // CFA = Canonical frame address
   assembly_stream_->WriteString(".cfi_startproc\n");
 
-#if defined(TARGET_ARCH_X64)
-  assembly_stream_->WriteString(".cfi_def_cfa rbp, 0\n");  // CFA is fp+0
-  assembly_stream_->WriteString(
-      ".cfi_offset rbp, 0\n");  // saved fp is *(CFA+0)
-  assembly_stream_->WriteString(
-      ".cfi_offset rip, 8\n");  // saved pc is *(CFA+8)
-  // saved sp is CFA+16
-  // Would prefer to use ".cfi_value_offset sp, 16", but this requires gcc
-  // newer than late 2016. Can't emit .cfi_value_offset using .cfi_scape
-  // because DW_CFA_val_offset uses scaled operand and we don't know what
-  // data alignment factor will be choosen by the assembler when emitting CIE.
-  // DW_CFA_expression          0x10
-  // uleb128 register (rsp)        7   (DWARF register number)
-  // uleb128 size of operation     2
-  // DW_OP_plus_uconst          0x23
-  // uleb128 addend               16
-  assembly_stream_->WriteString(".cfi_escape 0x10, 31, 2, 0x23, 16\n");
+  // Below .cfi_def_cfa defines CFA as caller's SP, while .cfi_offset R, offs
+  // tells unwinder that caller's value of register R is stored at address
+  // CFA+offs.
 
+#if defined(TARGET_ARCH_X64)
+  assembly_stream_->WriteString(".cfi_def_cfa rbp, 16\n");
+  assembly_stream_->WriteString(".cfi_offset rbp, -16\n");
+  assembly_stream_->WriteString(".cfi_offset rip, -8\n");
 #elif defined(TARGET_ARCH_ARM64)
   COMPILE_ASSERT(R29 == FP);
   COMPILE_ASSERT(R30 == LINK_REGISTER);
-  assembly_stream_->WriteString(".cfi_def_cfa x29, 0\n");  // CFA is fp+0
-  assembly_stream_->WriteString(
-      ".cfi_offset x29, 0\n");  // saved fp is *(CFA+0)
-  assembly_stream_->WriteString(
-      ".cfi_offset x30, 8\n");  // saved pc is *(CFA+8)
-  // saved sp is CFA+16
-  // Would prefer to use ".cfi_value_offset sp, 16", but this requires gcc
-  // newer than late 2016. Can't emit .cfi_value_offset using .cfi_scape
-  // because DW_CFA_val_offset uses scaled operand and we don't know what
-  // data alignment factor will be choosen by the assembler when emitting CIE.
-#if defined(DART_TARGET_OS_ANDROID)
-  // On Android libunwindstack has a bug (b/191113792): it does not push
-  // CFA value to the expression stack before evaluating expression given
-  // to DW_CFA_expression. We have to workaround this bug by manually pushing
-  // CFA (R11) to the stack using DW_OP_breg29 0.
-  // DW_CFA_expression          0x10
-  // uleb128 register (x31)       31
-  // uleb128 size of operation     4
-  // DW_OP_breg11               0x8d (0x70 + 29)
-  // sleb128 offset                0
-  // DW_OP_plus_uconst          0x23
-  // uleb128 addend               16
-  assembly_stream_->WriteString(".cfi_escape 0x10, 31, 4, 0x8d, 0, 0x23, 16\n");
-#else
-  // DW_CFA_expression          0x10
-  // uleb128 register (x31)       31
-  // uleb128 size of operation     2
-  // DW_OP_plus_uconst          0x23
-  // uleb128 addend               16
-  assembly_stream_->WriteString(".cfi_escape 0x10, 31, 2, 0x23, 16\n");
-#endif
-
+  assembly_stream_->WriteString(".cfi_def_cfa x29, 16\n");
+  assembly_stream_->WriteString(".cfi_offset x29, -16\n");
+  assembly_stream_->WriteString(".cfi_offset x30, -8\n");
 #elif defined(TARGET_ARCH_ARM)
 #if defined(DART_TARGET_OS_MACOS) || defined(DART_TARGET_OS_MACOS_IOS)
   COMPILE_ASSERT(FP == R7);
-  assembly_stream_->WriteString(".cfi_def_cfa r7, 0\n");  // CFA is fp+0
-  assembly_stream_->WriteString(".cfi_offset r7, 0\n");  // saved fp is *(CFA+0)
+  assembly_stream_->WriteString(".cfi_def_cfa r7, 8\n");
+  assembly_stream_->WriteString(".cfi_offset r7, -8\n");
 #else
   COMPILE_ASSERT(FP == R11);
-  assembly_stream_->WriteString(".cfi_def_cfa r11, 0\n");  // CFA is fp+0
-  assembly_stream_->WriteString(
-      ".cfi_offset r11, 0\n");  // saved fp is *(CFA+0)
+  assembly_stream_->WriteString(".cfi_def_cfa r11, 8\n");
+  assembly_stream_->WriteString(".cfi_offset r11, -8\n");
 #endif
-  assembly_stream_->WriteString(".cfi_offset lr, 4\n");  // saved pc is *(CFA+4)
-  // saved sp is CFA+8
-  // Would prefer to use ".cfi_value_offset sp, 16", but this requires gcc
-  // newer than late 2016. Can't emit .cfi_value_offset using .cfi_scape
-  // because DW_CFA_val_offset uses scaled operand and we don't know what
-  // data alignment factor will be choosen by the assembler when emitting CIE.
-#if defined(DART_TARGET_OS_ANDROID)
-  // On Android libunwindstack has a bug (b/191113792): it does not push
-  // CFA value to the expression stack before evaluating expression given
-  // to DW_CFA_expression. We have to workaround this bug by manually pushing
-  // CFA (R11) to the stack using DW_OP_breg11 0.
-  // DW_CFA_expression          0x10
-  // uleb128 register (sp)        13
-  // uleb128 size of operation     4
-  // DW_OP_breg11               0x7b (0x70 + 11)
-  // sleb128 offset                0
-  // DW_OP_plus_uconst          0x23
-  // uleb128 addend                8
-  assembly_stream_->WriteString(".cfi_escape 0x10, 31, 4, 0x7b, 0, 0x23, 16\n");
-#else
-  // DW_CFA_expression          0x10
-  // uleb128 register (sp)        13
-  // uleb128 size of operation     2
-  // DW_OP_plus_uconst          0x23
-  // uleb128 addend                8
-  assembly_stream_->WriteString(".cfi_escape 0x10, 13, 2, 0x23, 8\n");
-#endif
+  assembly_stream_->WriteString(".cfi_offset lr, -4\n");
 
 // libunwind on ARM may use .ARM.exidx instead of .debug_frame
 #if !defined(DART_TARGET_OS_MACOS) && !defined(DART_TARGET_OS_MACOS_IOS)