[vm/ffi] SimDBC64 align stack frame

And regression test for dartbug.com/37069

Fixes: https://github.com/dart-lang/sdk/issues/37176
Follow up of: https://dart-review.googlesource.com/c/sdk/+/103814
Bug: https://github.com/dart-lang/sdk/issues/37069
Change-Id: I2c791271f936580d66a48485a60ab6cddb1ba4f8
Cq-Include-Trybots: luci.dart.try:vm-ffi-android-debug-arm-try, app-kernel-linux-debug-x64-try, vm-kernel-linux-debug-simdbc64-try,vm-kernel-mac-debug-simdbc64-try,vm-kernel-reload-mac-debug-simdbc64-try,vm-kernel-linux-debug-ia32-try,vm-dartkb-linux-debug-simarm64-try,vm-kernel-win-debug-x64-try
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/104824
Commit-Queue: Daco Harkes <dacoharkes@google.com>
Reviewed-by: Samir Jindel <sjindel@google.com>
diff --git a/runtime/bin/ffi_test_functions.cc b/runtime/bin/ffi_test_functions.cc
index 52b4512..48757cf 100644
--- a/runtime/bin/ffi_test_functions.cc
+++ b/runtime/bin/ffi_test_functions.cc
@@ -149,7 +149,7 @@
 }
 
 // Sums many ints.
-// Used for testing calling conventions. With so many doubles we are using all
+// Used for testing calling conventions. With so many integers we are using all
 // normal parameter registers and some stack slots.
 DART_EXPORT intptr_t SumManyInts(intptr_t a,
                                  intptr_t b,
@@ -169,6 +169,28 @@
   return retval;
 }
 
+// Sums an odd number of ints.
+// Used for testing calling conventions. With so many arguments, and an odd
+// number of arguments, we are testing stack alignment on various architectures.
+DART_EXPORT intptr_t SumManyIntsOdd(intptr_t a,
+                                    intptr_t b,
+                                    intptr_t c,
+                                    intptr_t d,
+                                    intptr_t e,
+                                    intptr_t f,
+                                    intptr_t g,
+                                    intptr_t h,
+                                    intptr_t i,
+                                    intptr_t j,
+                                    intptr_t k) {
+  std::cout << "SumManyInts(" << a << ", " << b << ", " << c << ", " << d
+            << ", " << e << ", " << f << ", " << g << ", " << h << ", " << i
+            << ", " << j << ", " << k << ")\n";
+  intptr_t retval = a + b + c + d + e + f + g + h + i + j + k;
+  std::cout << "returning " << retval << "\n";
+  return retval;
+}
+
 // Sums many doubles.
 // Used for testing calling conventions. With so many doubles we are using all
 // xmm parameter registers and some stack slots.
@@ -485,6 +507,22 @@
   Dart_ExecuteInternalCommand("gc-now");
 }
 
+// Triggers GC. Has 11 dummy arguments as unboxed odd integers which should be
+// ignored by GC.
+DART_EXPORT void Regress37069(uint64_t a,
+                              uint64_t b,
+                              uint64_t c,
+                              uint64_t d,
+                              uint64_t e,
+                              uint64_t f,
+                              uint64_t g,
+                              uint64_t h,
+                              uint64_t i,
+                              uint64_t j,
+                              uint64_t k) {
+  Dart_ExecuteInternalCommand("gc-now");
+}
+
 // Calls a Dart function to allocate 'count' objects.
 // Used for stress-testing GC when re-entering the API.
 DART_EXPORT void AllocateThroughDart() {
diff --git a/runtime/lib/ffi.cc b/runtime/lib/ffi.cc
index 5dd4367..f2ac373 100644
--- a/runtime/lib/ffi.cc
+++ b/runtime/lib/ffi.cc
@@ -5,6 +5,7 @@
 #include "lib/ffi.h"
 
 #include "include/dart_api.h"
+#include "platform/globals.h"
 #include "vm/bootstrap_natives.h"
 #include "vm/class_finalizer.h"
 #include "vm/compiler/assembler/assembler.h"
@@ -677,6 +678,10 @@
   data_[kOffsetNumStackSlots] = num_args;
 }
 
+void FfiMarshalledArguments::SetAlignmentMask(uint64_t alignment_mask) const {
+  data_[kOffsetAlignmentMask] = alignment_mask;
+}
+
 intptr_t FfiMarshalledArguments::GetNumStackSlots() const {
   return data_[kOffsetNumStackSlots];
 }
@@ -691,6 +696,7 @@
     const compiler::ffi::FfiSignatureDescriptor& signature,
     const uint64_t* arg_values) {
   const intptr_t num_stack_slots = signature.num_stack_slots();
+  const uint64_t alignment_mask = ~(OS::ActivationFrameAlignment() - 1);
   const intptr_t size =
       FfiMarshalledArguments::kOffsetStackSlotValues + num_stack_slots;
   uint64_t* data = Thread::Current()->GetFfiMarshalledArguments(size);
@@ -699,6 +705,7 @@
   descr.SetFunctionAddress(arg_values[compiler::ffi::kFunctionAddressRegister]);
   const intptr_t num_args = signature.length();
   descr.SetNumStackSlots(num_stack_slots);
+  descr.SetAlignmentMask(alignment_mask);
   for (int i = 0; i < num_args; i++) {
     uint64_t arg_value = arg_values[compiler::ffi::kFirstArgumentRegister + i];
     HostLocation loc = signature.LocationAt(i);
@@ -737,10 +744,12 @@
                  RegisterNames::FpuRegisterName(
                      host::CallingConventions::FpuArgumentRegisters[i]));
   }
-  const intptr_t index = kOffsetNumStackSlots;
-  const intptr_t num_stack_slots = data_[index];
-  OS::PrintErr("  %02" Pd " 0x%" Pp " (number of stack slots)\n", index,
-               num_stack_slots);
+  const intptr_t alignment_mask = data_[kOffsetAlignmentMask];
+  OS::PrintErr("  %02" Pd " 0x%" Pp " (stack alignment mask)\n",
+               kOffsetAlignmentMask, alignment_mask);
+  const intptr_t num_stack_slots = data_[kOffsetNumStackSlots];
+  OS::PrintErr("  %02" Pd " 0x%" Pp " (number of stack slots)\n",
+               kOffsetNumStackSlots, num_stack_slots);
   for (intptr_t i = 0; i < num_stack_slots; i++) {
     const intptr_t index = kOffsetStackSlotValues + i;
     OS::PrintErr("  %02" Pd " 0x%016" Px64 " (stack slot %" Pd ")\n", index,
diff --git a/runtime/lib/ffi.h b/runtime/lib/ffi.h
index 00c527f..4cfa855 100644
--- a/runtime/lib/ffi.h
+++ b/runtime/lib/ffi.h
@@ -47,6 +47,7 @@
   void SetFunctionAddress(uint64_t value) const;
   void SetRegister(::dart::host::Register reg, uint64_t value) const;
   void SetFpuRegister(::dart::host::FpuRegister reg, uint64_t value) const;
+  void SetAlignmentMask(uint64_t kOffsetAlignmentMask) const;
   void SetNumStackSlots(intptr_t num_args) const;
   intptr_t GetNumStackSlots() const;
   void SetStackSlotValue(intptr_t index, uint64_t value) const;
@@ -59,8 +60,9 @@
   static const intptr_t kOffsetRegisters = 1;
   static const intptr_t kOffsetFpuRegisters =
       kOffsetRegisters + ::dart::host::CallingConventions::kNumArgRegs;
-  static const intptr_t kOffsetNumStackSlots =
+  static const intptr_t kOffsetAlignmentMask =
       kOffsetFpuRegisters + ::dart::host::CallingConventions::kNumFpuArgRegs;
+  static const intptr_t kOffsetNumStackSlots = kOffsetAlignmentMask + 1;
   static const intptr_t kOffsetStackSlotValues = kOffsetNumStackSlots + 1;
 
   static const intptr_t kOffsetIntResult = 0;
diff --git a/runtime/vm/compiler/ffi_dbc_trampoline_x64_linux_mac.S b/runtime/vm/compiler/ffi_dbc_trampoline_x64_linux_mac.S
index e8d1918..4f42279 100644
--- a/runtime/vm/compiler/ffi_dbc_trampoline_x64_linux_mac.S
+++ b/runtime/vm/compiler/ffi_dbc_trampoline_x64_linux_mac.S
@@ -12,20 +12,33 @@
 _FfiTrampolineCall:
 #endif
 
-push  rbx
+/* Save argument in scratch register. */
+push  rbx              /* Backup caller saved register. */
 mov   rbx,  rdi        /* Save argument in scratch register. */
 
+/* Enter frame. */
+push  rbp
+mov   rbp,  rsp
+
+/* Reserve framespace for arguments. */
+mov   rax,  [rbx+0x80] /* Load number of stack arguments. */
+shl   rax,  3          /* Multiply by size (8 bytes). */
+sub   rsp,  rax        /* Reserve num_stack_args stack slots. */
+
+/* Stack alignment. */
+and   rsp,  [rbx+0x78] /* Align stack with stack alignment mask. */
+
 /* Copy stack arguments. */
-mov   rax,  [rbx+0x78] /* Load number of stack arguments. */
 cmp   rax,  0x0        /* Check if number of stack arguments is 0. */
 jz    .done            /* Skip loop if no stack arguments. */
-add   rbx,  0x78       /* Offset RBX to point to stack arguments */
+add   rsp,  rax        /* Unreserve stack slots so we can push arguments. */
+add   rbx,  0x80       /* Offset RBX to point to stack arguments */
 .loop:                 /* Copy stack arguments loop. */
-push  [rbx+0x8*rax]    /* Push stack argument. */
-sub   rax,  0x1        /* Decrement stack argument iterator. */
+push  [rbx+rax]        /* Push stack argument. */
+sub   rax,  0x8        /* Decrement stack argument iterator. */
 cmp   rax,  0x0        /* Compare iterator with 0 */
 jnz   .loop            /* Loop while iterator is not 0 */
-sub   rbx,  0x78       /* Restore RBX to original value. */
+sub   rbx,  0x80       /* Restore RBX to original value. */
 .done:                 /* End stack arguments loop. */
 
 /* Copy registers and fpu registers. */
@@ -52,12 +65,13 @@
 mov   [rbx],   rax  /* Move integer result in kOffsetIntResult */
 movsd [rbx+8], xmm0 /* Move double result in kOffsetDoubleResult */
 
-/* Clean up stack arguments. */
-mov   rax,  [rbx+0x78] /* Load number of stack arguments. */
-imul  rax,  0x8        /* Multiply by stack argument size. */
-add   rsp,  rax        /* Clean up the stack. */
+/* leave frame */
+mov   rsp,  rbp
+pop   rbp
 
+/* Restore caller saved register. */
 pop   rbx
+
 ret
 
 #endif /* HOST_ARCH_X64 */
diff --git a/runtime/vm/os_android.cc b/runtime/vm/os_android.cc
index 196620f..807b7c9 100644
--- a/runtime/vm/os_android.cc
+++ b/runtime/vm/os_android.cc
@@ -175,9 +175,13 @@
 // into a architecture specific file e.g: os_ia32_linux.cc
 intptr_t OS::ActivationFrameAlignment() {
 #if defined(TARGET_ARCH_IA32) || defined(TARGET_ARCH_X64) ||                   \
-    defined(TARGET_ARCH_ARM64)
+    defined(TARGET_ARCH_ARM64) ||                                              \
+    defined(TARGET_ARCH_DBC) &&                                                \
+        (defined(HOST_ARCH_IA32) || defined(HOST_ARCH_X64) ||                  \
+         defined(HOST_ARCH_ARM64))
   const int kMinimumAlignment = 16;
-#elif defined(TARGET_ARCH_ARM) || defined(TARGET_ARCH_DBC)
+#elif defined(TARGET_ARCH_ARM) ||                                              \
+    defined(TARGET_ARCH_DBC) && defined(HOST_ARCH_ARM)
   const int kMinimumAlignment = 8;
 #else
 #error Unsupported architecture.
diff --git a/runtime/vm/os_fuchsia.cc b/runtime/vm/os_fuchsia.cc
index 869c5b2..517f3c4 100644
--- a/runtime/vm/os_fuchsia.cc
+++ b/runtime/vm/os_fuchsia.cc
@@ -130,9 +130,13 @@
 // into a architecture specific file e.g: os_ia32_fuchsia.cc
 intptr_t OS::ActivationFrameAlignment() {
 #if defined(TARGET_ARCH_IA32) || defined(TARGET_ARCH_X64) ||                   \
-    defined(TARGET_ARCH_ARM64)
+    defined(TARGET_ARCH_ARM64) ||                                              \
+    defined(TARGET_ARCH_DBC) &&                                                \
+        (defined(HOST_ARCH_IA32) || defined(HOST_ARCH_X64) ||                  \
+         defined(HOST_ARCH_ARM64))
   const int kMinimumAlignment = 16;
-#elif defined(TARGET_ARCH_ARM) || defined(TARGET_ARCH_DBC)
+#elif defined(TARGET_ARCH_ARM) ||                                              \
+    defined(TARGET_ARCH_DBC) && defined(HOST_ARCH_ARM)
   const int kMinimumAlignment = 8;
 #else
 #error Unsupported architecture.
diff --git a/runtime/vm/os_linux.cc b/runtime/vm/os_linux.cc
index 2ca3bb6..fb65cd1 100644
--- a/runtime/vm/os_linux.cc
+++ b/runtime/vm/os_linux.cc
@@ -497,6 +497,7 @@
 intptr_t OS::ActivationFrameAlignment() {
 #if defined(TARGET_ARCH_IA32) || defined(TARGET_ARCH_X64) ||                   \
     defined(TARGET_ARCH_ARM64) || defined(TARGET_ARCH_DBC)
+  // DBC alignment should be at least as much as host architecture.
   const int kMinimumAlignment = 16;
 #elif defined(TARGET_ARCH_ARM)
   const int kMinimumAlignment = 8;
diff --git a/runtime/vm/os_macos.cc b/runtime/vm/os_macos.cc
index be6455d..ddbd18f 100644
--- a/runtime/vm/os_macos.cc
+++ b/runtime/vm/os_macos.cc
@@ -16,7 +16,7 @@
 #include <sys/time.h>        // NOLINT
 #include <unistd.h>          // NOLINT
 #if HOST_OS_IOS
-#include <syslog.h>      // NOLINT
+#include <syslog.h>  // NOLINT
 #endif
 
 #include "platform/utils.h"
@@ -140,7 +140,7 @@
 #elif TARGET_ARCH_X64
   return 16;  // iOS simulator
 #elif TARGET_ARCH_DBC
-  return 16;
+  return 16;  // Should be at least as much as any host architecture.
 #else
 #error Unimplemented
 #endif
@@ -212,7 +212,6 @@
       __builtin_extract_return_addr(__builtin_return_address(0)));
 }
 
-
 void OS::Print(const char* format, ...) {
 #if HOST_OS_IOS
   va_list args;
diff --git a/runtime/vm/os_win.cc b/runtime/vm/os_win.cc
index 1ff4eb9..c0f8e39 100644
--- a/runtime/vm/os_win.cc
+++ b/runtime/vm/os_win.cc
@@ -175,9 +175,11 @@
 }
 
 intptr_t OS::ActivationFrameAlignment() {
-#if defined(TARGET_ARCH_ARM64)
+#if defined(TARGET_ARCH_ARM64) ||                                              \
+    defined(TARGET_ARCH_DBC) && defined(HOST_ARCH_ARM64)
   return 16;
-#elif defined(TARGET_ARCH_ARM)
+#elif defined(TARGET_ARCH_ARM) ||                                              \
+    defined(TARGET_ARCH_DBC) && defined(HOST_ARCH_ARM)
   return 8;
 #elif defined(_WIN64)
   // Windows 64-bit ABI requires the stack to be 16-byte aligned.
diff --git a/samples/ffi/sample_ffi_functions.dart b/samples/ffi/sample_ffi_functions.dart
index 4a3cf98..9d326e9 100644
--- a/samples/ffi/sample_ffi_functions.dart
+++ b/samples/ffi/sample_ffi_functions.dart
@@ -18,7 +18,7 @@
 typedef NativeFunc4 = ffi.IntPtr Function(ffi.IntPtr);
 typedef NativeDoubleUnaryOp = ffi.Double Function(ffi.Double);
 typedef NativeFloatUnaryOp = ffi.Float Function(ffi.Float);
-typedef NativeOctenaryOp = ffi.IntPtr Function(
+typedef NativeDecenaryOp = ffi.IntPtr Function(
     ffi.IntPtr,
     ffi.IntPtr,
     ffi.IntPtr,
@@ -29,7 +29,7 @@
     ffi.IntPtr,
     ffi.IntPtr,
     ffi.IntPtr);
-typedef NativeDoubleOctenaryOp = ffi.Double Function(
+typedef NativeDoubleDecenaryOp = ffi.Double Function(
     ffi.Double,
     ffi.Double,
     ffi.Double,
@@ -65,9 +65,9 @@
     ffi.Pointer<ffi.Int64>);
 typedef QuadOp = int Function(int, int, int, int);
 typedef DoubleUnaryOp = double Function(double);
-typedef OctenaryOp = int Function(
+typedef DecenaryOp = int Function(
     int, int, int, int, int, int, int, int, int, int);
-typedef DoubleOctenaryOp = double Function(double, double, double, double,
+typedef DoubleDecenaryOp = double Function(double, double, double, double,
     double, double, double, double, double, double);
 typedef VigesimalOp = double Function(
     int,
@@ -162,8 +162,8 @@
 
   {
     // function with many arguments: arguments get passed in registers and stack
-    OctenaryOp sumManyInts = ffiTestFunctions
-        .lookupFunction<NativeOctenaryOp, OctenaryOp>("SumManyInts");
+    DecenaryOp sumManyInts = ffiTestFunctions
+        .lookupFunction<NativeDecenaryOp, DecenaryOp>("SumManyInts");
     var result = sumManyInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
     print(result);
     print(result.runtimeType);
@@ -171,8 +171,8 @@
 
   {
     // function with many double arguments
-    DoubleOctenaryOp sumManyDoubles = ffiTestFunctions.lookupFunction<
-        NativeDoubleOctenaryOp, DoubleOctenaryOp>("SumManyDoubles");
+    DoubleDecenaryOp sumManyDoubles = ffiTestFunctions.lookupFunction<
+        NativeDoubleDecenaryOp, DoubleDecenaryOp>("SumManyDoubles");
     var result =
         sumManyDoubles(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0);
     print(result);
diff --git a/tests/ffi/function_gc_test.dart b/tests/ffi/function_gc_test.dart
index 1799ce1..789ecfb3 100644
--- a/tests/ffi/function_gc_test.dart
+++ b/tests/ffi/function_gc_test.dart
@@ -23,6 +23,7 @@
   testBoxDouble();
   testBoxPointer();
   testAllocateInNative();
+  testRegress37069();
   testAllocateInDart();
 }
 
@@ -35,11 +36,25 @@
 typedef NativeNullaryOpPtr = ffi.Pointer<ffi.Void> Function();
 typedef NativeNullaryOp = ffi.Void Function();
 typedef NativeUnaryOp = ffi.Void Function(ffi.Uint64);
+typedef NativeUndenaryOp = ffi.Uint64 Function(
+    ffi.Uint64,
+    ffi.Uint64,
+    ffi.Uint64,
+    ffi.Uint64,
+    ffi.Uint64,
+    ffi.Uint64,
+    ffi.Uint64,
+    ffi.Uint64,
+    ffi.Uint64,
+    ffi.Uint64,
+    ffi.Uint64);
 typedef NullaryOp = int Function();
 typedef NullaryOpDbl = double Function();
 typedef NullaryOpPtr = ffi.Pointer<ffi.Void> Function();
 typedef UnaryOp = void Function(int);
 typedef NullaryOpVoid = void Function();
+typedef UndenaryOp = int Function(
+    int, int, int, int, int, int, int, int, int, int, int);
 
 //// These functions return values that require boxing into different types.
 
@@ -88,6 +103,21 @@
 // Test GC in the FFI call path by calling a C function which triggers GC
 // directly.
 void testAllocateInNative() => triggerGc();
+// This also works as a regression test for 37176.
+
+final regress37069 = ffiTestFunctions
+    .lookupFunction<NativeUndenaryOp, UndenaryOp>("Regress37069");
+
+// Test GC in the FFI call path by calling a C function which triggers GC
+// directly.
+void testRegress37069() {
+  regress37069(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+}
+
+class C {
+  final int i;
+  C(this.i);
+}
 
 @pragma("vm:entry-point", "call")
 void testAllocationsInDartHelper() => triggerGc();
diff --git a/tests/ffi/function_test.dart b/tests/ffi/function_test.dart
index 014c66e..2865d30 100644
--- a/tests/ffi/function_test.dart
+++ b/tests/ffi/function_test.dart
@@ -29,6 +29,7 @@
     testNativeFunctionManyArguments1();
     testNativeFunctionManyArguments2();
     testNativeFunctionManyArguments3();
+    testNativeFunctionManyArguments4();
     testNativeFunctionPointer();
     testNullInt();
     testNullDouble();
@@ -225,7 +226,7 @@
   Expect.approxEquals(1337.0, times1_337Float(1000.0));
 }
 
-typedef NativeOctenaryOp = ffi.IntPtr Function(
+typedef NativeDecenaryOp = ffi.IntPtr Function(
     ffi.IntPtr,
     ffi.IntPtr,
     ffi.IntPtr,
@@ -236,17 +237,39 @@
     ffi.IntPtr,
     ffi.IntPtr,
     ffi.IntPtr);
-typedef OctenaryOp = int Function(
+typedef DecenaryOp = int Function(
     int, int, int, int, int, int, int, int, int, int);
 
-OctenaryOp sumManyInts = ffiTestFunctions
-    .lookupFunction<NativeOctenaryOp, OctenaryOp>("SumManyInts");
+DecenaryOp sumManyInts = ffiTestFunctions
+    .lookupFunction<NativeDecenaryOp, DecenaryOp>("SumManyInts");
 
 void testNativeFunctionManyArguments1() {
   Expect.equals(55, sumManyInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
 }
 
-typedef NativeDoubleOctenaryOp = ffi.Double Function(
+typedef NativeUndenaryOp = ffi.IntPtr Function(
+    ffi.IntPtr,
+    ffi.IntPtr,
+    ffi.IntPtr,
+    ffi.IntPtr,
+    ffi.IntPtr,
+    ffi.IntPtr,
+    ffi.IntPtr,
+    ffi.IntPtr,
+    ffi.IntPtr,
+    ffi.IntPtr,
+    ffi.IntPtr);
+typedef UndenaryOp = int Function(
+    int, int, int, int, int, int, int, int, int, int, int);
+
+UndenaryOp sumManyIntsOdd = ffiTestFunctions
+    .lookupFunction<NativeUndenaryOp, UndenaryOp>("SumManyIntsOdd");
+
+void testNativeFunctionManyArguments4() {
+  Expect.equals(66, sumManyIntsOdd(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11));
+}
+
+typedef NativeDoubleDecenaryOp = ffi.Double Function(
     ffi.Double,
     ffi.Double,
     ffi.Double,
@@ -257,11 +280,11 @@
     ffi.Double,
     ffi.Double,
     ffi.Double);
-typedef DoubleOctenaryOp = double Function(double, double, double, double,
+typedef DoubleDecenaryOp = double Function(double, double, double, double,
     double, double, double, double, double, double);
 
-DoubleOctenaryOp sumManyDoubles = ffiTestFunctions
-    .lookupFunction<NativeDoubleOctenaryOp, DoubleOctenaryOp>("SumManyDoubles");
+DoubleDecenaryOp sumManyDoubles = ffiTestFunctions
+    .lookupFunction<NativeDoubleDecenaryOp, DoubleDecenaryOp>("SumManyDoubles");
 
 void testNativeFunctionManyArguments2() {
   Expect.approxEquals(