[vm/ffi] Introduce `CCallInstr` and replace handle instructions

The CCallInstr takes unboxed word-sized integers as arguments and calls
into C without any VM transitions.

This can be used for the following:
1) Leaf runtime entries. This CL removes the EnterHandleScope,
   ExitHandleScope, and AllocateHandle instructions, replacing them with
   CCall-based lowerings (see the example below).
2) Compilation of isolate-independent code where all definitions in IL
   are unboxed integers.
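
For example, EnterHandleScope is now lowered to a CCall of the
corresponding leaf runtime entry (abridged from the kernel_to_il.cc
change below):

  Fragment FlowGraphBuilder::EnterHandleScope() {
    Fragment body;
    body += LoadThread();                              // Thread* argument.
    body += ConvertUntaggedToUnboxed(kUnboxedIntPtr);
    // LoadThread again to fetch the target address; it is not cached in a
    // temp, see the comment in kernel_to_il.cc.
    body += LoadThread();
    body += LoadUntagged(compiler::target::Thread::OffsetFromThread(
        &kEnterHandleScopeRuntimeEntry));
    body += ConvertUntaggedToUnboxed(kUnboxedFfiIntPtr);  // Target address.
    body += CCall(/*num_arguments=*/1);
    return body;
  }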

Currently, the CCallInstr expects all its arguments and the return type
to be word-sized.
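
The signature for such a call can be derived from a single representation
shared by all arguments and the return value, as the new
FlowGraphBuilder::CCall helper does (a sketch based on the changes below;
kUnboxedFfiIntPtr is the default):

  const auto& native_function_type =
      *compiler::ffi::NativeFunctionType::FromUnboxedRepresentation(
          Z, /*num_arguments=*/1, kUnboxedFfiIntPtr);
  const auto& native_calling_convention =
      compiler::ffi::NativeCallingConvention::FromSignature(
          Z, native_function_type);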

This CL also:
- Adds a LoadImmediate overload taking an Immediate and a CallCFunction
  overload taking a register target to each assembler (see the sketch
  below).
- Adds a CSE pass to the force-optimized pipeline so that LoadThreadInstr
  gets deduplicated.
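
A sketch of the new assembler surface (the register and immediate values
here are illustrative, not taken from the CL):

  // Load a word-sized argument and call through a register; CallCFunction
  // lowers to blx/blr/call/jalr depending on the architecture.
  __ LoadImmediate(R0, compiler::Immediate(42));
  __ CallCFunction(target_address_reg);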

TEST=tests/ffi(_2)/*

Change-Id: I67c6aaa1b7d6aa7d60e274477686a54e5a331216
Cq-Include-Trybots: luci.dart.try:vm-kernel-linux-debug-simriscv64-try,vm-kernel-precomp-linux-debug-simriscv64-try,vm-kernel-nnbd-mac-debug-arm64-try,vm-kernel-nnbd-mac-debug-x64-try,vm-kernel-win-debug-ia32-try,vm-kernel-win-debug-x64-try
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/246240
Reviewed-by: Martin Kustermann <kustermann@google.com>
Commit-Queue: Daco Harkes <dacoharkes@google.com>
Reviewed-by: Ryan Macnak <rmacnak@google.com>
diff --git a/runtime/vm/compiler/assembler/assembler_arm.cc b/runtime/vm/compiler/assembler/assembler_arm.cc
index aa43b50..67f8fb8 100644
--- a/runtime/vm/compiler/assembler/assembler_arm.cc
+++ b/runtime/vm/compiler/assembler/assembler_arm.cc
@@ -2810,6 +2810,10 @@
   }
 }
 
+void Assembler::LoadImmediate(Register rd, Immediate value, Condition cond) {
+  LoadImmediate(rd, value.value(), cond);
+}
+
 void Assembler::LoadImmediate(Register rd, int32_t value, Condition cond) {
   Operand o;
   if (Operand::CanHold(value, &o)) {
diff --git a/runtime/vm/compiler/assembler/assembler_arm.h b/runtime/vm/compiler/assembler/assembler_arm.h
index 419dca2..00dc85b 100644
--- a/runtime/vm/compiler/assembler/assembler_arm.h
+++ b/runtime/vm/compiler/assembler/assembler_arm.h
@@ -47,6 +47,16 @@
 
 namespace compiler {
 
+class Immediate : public ValueObject {
+ public:
+  explicit Immediate(int32_t value) : value_(value) {}
+
+  int32_t value() const { return value_; }
+
+ private:
+  const int32_t value_;
+};
+
 // Instruction encoding bits.
 enum {
   H = 1 << 5,   // halfword (or byte)
@@ -803,6 +813,10 @@
 
   void CallCFunction(Address target) { Call(target); }
 
+  void CallCFunction(Register target, Condition cond = AL) {
+    blx(target, cond);
+  }
+
   // Add signed immediate value to rd. May clobber IP.
   void AddImmediate(Register rd, int32_t value, Condition cond = AL) {
     AddImmediate(rd, rd, value, cond);
@@ -867,6 +881,7 @@
   // These three do not clobber IP.
   void LoadPatchableImmediate(Register rd, int32_t value, Condition cond = AL);
   void LoadDecodableImmediate(Register rd, int32_t value, Condition cond = AL);
+  void LoadImmediate(Register rd, Immediate value, Condition cond = AL);
   void LoadImmediate(Register rd, int32_t value, Condition cond = AL);
   // These two may clobber IP.
   void LoadSImmediate(SRegister sd, float value, Condition cond = AL);
diff --git a/runtime/vm/compiler/assembler/assembler_arm64.h b/runtime/vm/compiler/assembler/assembler_arm64.h
index e11ed94..3e79152 100644
--- a/runtime/vm/compiler/assembler/assembler_arm64.h
+++ b/runtime/vm/compiler/assembler/assembler_arm64.h
@@ -1790,7 +1790,15 @@
   }
   void Call(const Code& code) { BranchLink(code); }
 
-  void CallCFunction(Address target) { Call(target); }
+  // Clobbers LR.
+  void CallCFunction(Address target) {
+    Call(target);
+  }
+  void CallCFunction(Register target) {
+#define __ this->
+    CLOBBERS_LR({ blr(target); });
+#undef __
+  }
 
   void AddImmediate(Register dest, int64_t imm) {
     AddImmediate(dest, dest, imm);
@@ -2054,6 +2062,9 @@
   void LoadUniqueObject(Register dst, const Object& obj);
   // Note: the function never clobbers TMP, TMP2 scratch registers.
   void LoadImmediate(Register reg, int64_t imm);
+  void LoadImmediate(Register reg, Immediate imm) {
+    LoadImmediate(reg, imm.value());
+  }
 
   void LoadDImmediate(VRegister reg, double immd);
   void LoadQImmediate(VRegister reg, simd128_value_t immq);
@@ -2080,6 +2091,9 @@
     LoadImmediate(TMP, immediate);
     Push(TMP);
   }
+  void PushImmediate(Immediate immediate) {
+    PushImmediate(immediate.value());
+  }
   void CompareObject(Register reg, const Object& object);
 
   void ExtractClassIdFromTags(Register result, Register tags);
diff --git a/runtime/vm/compiler/assembler/assembler_ia32.cc b/runtime/vm/compiler/assembler/assembler_ia32.cc
index 2ead117..b5f2b99 100644
--- a/runtime/vm/compiler/assembler/assembler_ia32.cc
+++ b/runtime/vm/compiler/assembler/assembler_ia32.cc
@@ -9,6 +9,7 @@
 
 #include "vm/class_id.h"
 #include "vm/compiler/assembler/assembler.h"
+#include "vm/compiler/backend/locations.h"
 #include "vm/cpu.h"
 #include "vm/instructions.h"
 #include "vm/tags.h"
diff --git a/runtime/vm/compiler/assembler/assembler_ia32.h b/runtime/vm/compiler/assembler/assembler_ia32.h
index 48f6a49..b97296c 100644
--- a/runtime/vm/compiler/assembler/assembler_ia32.h
+++ b/runtime/vm/compiler/assembler/assembler_ia32.h
@@ -21,6 +21,7 @@
 #include "vm/pointer_tagging.h"
 
 namespace dart {
+
 namespace compiler {
 
 class Immediate : public ValueObject {
@@ -747,6 +748,10 @@
     }
   }
 
+  void LoadImmediate(Register reg, Immediate immediate) {
+    LoadImmediate(reg, immediate.value());
+  }
+
   void LoadDImmediate(XmmRegister dst, double value);
 
   void Drop(intptr_t stack_elements);
@@ -874,6 +879,10 @@
 
   void CallCFunction(Address target) { Call(target); }
 
+  void CallCFunction(Register target) {
+    call(target);
+  }
+
   void Jmp(const Code& code);
   void J(Condition condition, const Code& code);
 
diff --git a/runtime/vm/compiler/assembler/assembler_riscv.cc b/runtime/vm/compiler/assembler/assembler_riscv.cc
index 863adf4..0c1caab 100644
--- a/runtime/vm/compiler/assembler/assembler_riscv.cc
+++ b/runtime/vm/compiler/assembler/assembler_riscv.cc
@@ -2719,6 +2719,10 @@
   jalr(RA);
 }
 
+void Assembler::Call(Register target) {
+  jalr(target);
+}
+
 void Assembler::AddImmediate(Register rd,
                              Register rs1,
                              intx_t imm,
diff --git a/runtime/vm/compiler/assembler/assembler_riscv.h b/runtime/vm/compiler/assembler/assembler_riscv.h
index ac1e920..914301b 100644
--- a/runtime/vm/compiler/assembler/assembler_riscv.h
+++ b/runtime/vm/compiler/assembler/assembler_riscv.h
@@ -929,9 +929,13 @@
       CodeEntryKind entry_kind = CodeEntryKind::kNormal);
 
   void Call(Address target);
+  void Call(Register target);
   void Call(const Code& code) { JumpAndLink(code); }
 
   void CallCFunction(Address target) { Call(target); }
+  void CallCFunction(Register target) {
+    Call(target);
+  }
 
   void AddImmediate(Register dest, intx_t imm) {
     AddImmediate(dest, dest, imm);
diff --git a/runtime/vm/compiler/assembler/assembler_x64.h b/runtime/vm/compiler/assembler/assembler_x64.h
index 6680a77..9f3fb6e 100644
--- a/runtime/vm/compiler/assembler/assembler_x64.h
+++ b/runtime/vm/compiler/assembler/assembler_x64.h
@@ -1246,7 +1246,7 @@
   // offset is not yet known and needs therefore relocation to the right place
   // before the code can be used.
   //
-  // The neccessary information for the "linker" (i.e. the relocation
+  // The necessary information for the "linker" (i.e. the relocation
   // information) is stored in [UntaggedCode::static_calls_target_table_]: an
   // entry of the form
   //
diff --git a/runtime/vm/compiler/backend/constant_propagator.cc b/runtime/vm/compiler/backend/constant_propagator.cc
index 63528f15..51af8e2 100644
--- a/runtime/vm/compiler/backend/constant_propagator.cc
+++ b/runtime/vm/compiler/backend/constant_propagator.cc
@@ -752,15 +752,7 @@
   SetValue(instr, non_constant_);
 }
 
-void ConstantPropagator::VisitEnterHandleScope(EnterHandleScopeInstr* instr) {
-  SetValue(instr, non_constant_);
-}
-
-void ConstantPropagator::VisitExitHandleScope(ExitHandleScopeInstr* instr) {
-  // Nothing to do.
-}
-
-void ConstantPropagator::VisitAllocateHandle(AllocateHandleInstr* instr) {
+void ConstantPropagator::VisitCCall(CCallInstr* instr) {
   SetValue(instr, non_constant_);
 }
 
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler.cc b/runtime/vm/compiler/backend/flow_graph_compiler.cc
index 04712ef..32c36ac 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler.cc
@@ -3606,7 +3606,6 @@
   }
 }
 
-
 // The assignment to loading units here must match that in
 // AssignLoadingUnitsCodeVisitor, which runs after compilation is done.
 static intptr_t LoadingUnitOf(Zone* zone, const Function& function) {
diff --git a/runtime/vm/compiler/backend/il.cc b/runtime/vm/compiler/backend/il.cc
index 98a44f1..8e50cc3 100644
--- a/runtime/vm/compiler/backend/il.cc
+++ b/runtime/vm/compiler/backend/il.cc
@@ -4,6 +4,7 @@
 
 #include "vm/compiler/backend/il.h"
 
+#include "platform/assert.h"
 #include "vm/bit_vector.h"
 #include "vm/bootstrap.h"
 #include "vm/compiler/aot/dispatch_table_generator.h"
@@ -6800,101 +6801,6 @@
   __ Comment("EmitReturnMovesEnd");
 }
 
-static Location FirstArgumentLocation() {
-#ifdef TARGET_ARCH_IA32
-  return Location::StackSlot(0, SPREG);
-#else
-  return Location::RegisterLocation(CallingConventions::ArgumentRegisters[0]);
-#endif
-}
-
-LocationSummary* EnterHandleScopeInstr::MakeLocationSummary(
-    Zone* zone,
-    bool is_optimizing) const {
-  LocationSummary* summary =
-      new (zone) LocationSummary(zone, /*num_inputs=*/0,
-                                 /*num_temps=*/0, LocationSummary::kCall);
-  summary->set_out(0,
-                   Location::RegisterLocation(CallingConventions::kReturnReg));
-  return summary;
-}
-
-void EnterHandleScopeInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
-  ASSERT(kEnterHandleScopeRuntimeEntry.is_leaf());
-
-  Location arg_loc = FirstArgumentLocation();
-  __ EnterCFrame(arg_loc.IsRegister() ? 0 : compiler::target::kWordSize);
-  NoTemporaryAllocator no_temp;
-  compiler->EmitMove(arg_loc, Location::RegisterLocation(THR), &no_temp);
-  __ CallCFunction(
-      compiler::Address(THR, compiler::target::Thread::OffsetFromThread(
-                                 &kEnterHandleScopeRuntimeEntry)));
-  __ LeaveCFrame();
-}
-
-LocationSummary* ExitHandleScopeInstr::MakeLocationSummary(
-    Zone* zone,
-    bool is_optimizing) const {
-  LocationSummary* summary =
-      new (zone) LocationSummary(zone, /*num_inputs=*/0,
-                                 /*num_temps=*/0, LocationSummary::kCall);
-  return summary;
-}
-
-void ExitHandleScopeInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
-  ASSERT(kEnterHandleScopeRuntimeEntry.is_leaf());
-
-  Location arg_loc = FirstArgumentLocation();
-  __ EnterCFrame(arg_loc.IsRegister() ? 0 : compiler::target::kWordSize);
-  NoTemporaryAllocator no_temp;
-  compiler->EmitMove(arg_loc, Location::RegisterLocation(THR), &no_temp);
-  __ CallCFunction(
-      compiler::Address(THR, compiler::target::Thread::OffsetFromThread(
-                                 &kExitHandleScopeRuntimeEntry)));
-  __ LeaveCFrame();
-}
-
-LocationSummary* AllocateHandleInstr::MakeLocationSummary(
-    Zone* zone,
-    bool is_optimizing) const {
-  LocationSummary* summary =
-      new (zone) LocationSummary(zone, /*num_inputs=*/1,
-                                 /*num_temps=*/0, LocationSummary::kCall);
-
-  Location arg_loc = FirstArgumentLocation();
-  // Assign input to a register that does not conflict with anything if
-  // argument is passed on the stack.
-  const Register scope_reg =
-      arg_loc.IsStackSlot() ? CallingConventions::kSecondNonArgumentRegister
-                            : arg_loc.reg();
-
-  summary->set_in(kScope, Location::RegisterLocation(scope_reg));
-  summary->set_out(0,
-                   Location::RegisterLocation(CallingConventions::kReturnReg));
-  return summary;
-}
-
-Representation AllocateHandleInstr::RequiredInputRepresentation(
-    intptr_t idx) const {
-  ASSERT(idx == kScope);
-  return kUnboxedIntPtr;
-}
-
-void AllocateHandleInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
-  ASSERT(kEnterHandleScopeRuntimeEntry.is_leaf());
-
-  Location arg_loc = FirstArgumentLocation();
-  __ EnterCFrame(arg_loc.IsRegister() ? 0 : compiler::target::kWordSize);
-  if (arg_loc.IsStackSlot()) {
-    NoTemporaryAllocator no_temp;
-    compiler->EmitMove(arg_loc, locs()->in(kScope), &no_temp);
-  }
-  __ CallCFunction(
-      compiler::Address(THR, compiler::target::Thread::OffsetFromThread(
-                                 &kAllocateHandleRuntimeEntry)));
-  __ LeaveCFrame();
-}
-
 LocationSummary* RawStoreFieldInstr::MakeLocationSummary(
     Zone* zone,
     bool is_optimizing) const {
@@ -7049,6 +6955,126 @@
   __ MoveRegister(out, THR);
 }
 
+LocationSummary* CCallInstr::MakeLocationSummaryInternal(
+    Zone* zone,
+    const RegList temps) const {
+  LocationSummary* summary =
+      new (zone) LocationSummary(zone, /*num_inputs=*/InputCount(),
+                                 /*num_temps=*/Utils::CountOneBitsWord(temps),
+                                 LocationSummary::kNativeLeafCall);
+
+  intptr_t reg_i = 0;
+  for (intptr_t reg = 0; reg < kNumberOfCpuRegisters; reg++) {
+    if ((temps & (1 << reg)) != 0) {
+      summary->set_temp(reg_i,
+                        Location::RegisterLocation(static_cast<Register>(reg)));
+      reg_i++;
+    }
+  }
+
+  summary->set_in(TargetAddressIndex(),
+                  Location::RegisterLocation(
+                      CallingConventions::kFirstNonArgumentRegister));
+
+  const auto& argument_locations =
+      native_calling_convention_.argument_locations();
+  for (intptr_t i = 0, n = argument_locations.length(); i < n; ++i) {
+    const auto& argument_location = *argument_locations.At(i);
+    if (argument_location.IsRegisters()) {
+      const auto& reg_location = argument_location.AsRegisters();
+      ASSERT(reg_location.num_regs() == 1);
+      summary->set_in(i, reg_location.AsLocation());
+    } else if (argument_location.IsFpuRegisters()) {
+      UNIMPLEMENTED();
+    } else if (argument_location.IsStack()) {
+      summary->set_in(i, Location::Any());
+    } else {
+      UNIMPLEMENTED();
+    }
+  }
+  const auto& return_location = native_calling_convention_.return_location();
+  ASSERT(return_location.IsRegisters());
+  summary->set_out(0, return_location.AsLocation());
+  return summary;
+}
+
+CCallInstr::CCallInstr(
+    Zone* zone,
+    const compiler::ffi::NativeCallingConvention& native_calling_convention,
+    InputsArray* inputs)
+    : Definition(DeoptId::kNone),
+      zone_(zone),
+      native_calling_convention_(native_calling_convention),
+      inputs_(inputs) {
+#ifdef DEBUG
+  const intptr_t num_inputs =
+      native_calling_convention.argument_locations().length() + 1;
+  ASSERT(num_inputs == inputs->length());
+#endif
+  for (intptr_t i = 0, n = inputs_->length(); i < n; ++i) {
+    SetInputAt(i, (*inputs_)[i]);
+  }
+}
+
+Representation CCallInstr::RequiredInputRepresentation(intptr_t idx) const {
+  if (idx < native_calling_convention_.argument_locations().length()) {
+    const auto& argument_type =
+        native_calling_convention_.argument_locations().At(idx)->payload_type();
+    ASSERT(argument_type.IsExpressibleAsRepresentation());
+    return argument_type.AsRepresentation();
+  }
+  ASSERT(idx == TargetAddressIndex());
+  return kUnboxedFfiIntPtr;
+}
+
+void CCallInstr::EmitParamMoves(FlowGraphCompiler* compiler,
+                                Register saved_fp,
+                                Register temp0) {
+  if (native_calling_convention_.StackTopInBytes() == 0) {
+    return;
+  }
+
+  ConstantTemporaryAllocator temp_alloc(temp0);
+  compiler::ffi::FrameRebase rebase(zone_, /*old_base=*/FPREG,
+                                    /*new_base=*/saved_fp,
+                                    /*stack_delta=*/0);
+
+  __ Comment("EmitParamMoves");
+  const auto& argument_locations =
+      native_calling_convention_.argument_locations();
+  for (intptr_t i = 0, n = argument_locations.length(); i < n; ++i) {
+    const auto& argument_location = *argument_locations.At(i);
+    if (argument_location.IsRegisters()) {
+      const auto& reg_location = argument_location.AsRegisters();
+      ASSERT(reg_location.num_regs() == 1);
+      const Location src_loc = rebase.Rebase(locs()->in(i));
+      const Representation src_rep = RequiredInputRepresentation(i);
+      compiler->EmitMoveToNative(argument_location, src_loc, src_rep,
+                                 &temp_alloc);
+    } else if (argument_location.IsFpuRegisters()) {
+      UNIMPLEMENTED();
+    } else if (argument_location.IsStack()) {
+      const Location src_loc = rebase.Rebase(locs()->in(i));
+      const Representation src_rep = RequiredInputRepresentation(i);
+      __ Comment("Param %" Pd ": %s %s -> %s", i, src_loc.ToCString(),
+                 RepresentationToCString(src_rep),
+                 argument_location.ToCString());
+      compiler->EmitMoveToNative(argument_location, src_loc, src_rep,
+                                 &temp_alloc);
+    } else {
+      UNIMPLEMENTED();
+    }
+  }
+  __ Comment("EmitParamMovesEnd");
+}
+
+Representation CCallInstr::representation() const {
+  const auto& return_type =
+      native_calling_convention_.return_location().payload_type();
+  ASSERT(return_type.IsExpressibleAsRepresentation());
+  return return_type.AsRepresentation();
+}
+
 // SIMD
 
 SimdOpInstr::Kind SimdOpInstr::KindForOperator(MethodRecognizer::Kind kind) {
@@ -7206,7 +7232,7 @@
   Representation inputs[4];
 };
 
-// Make representaion from type name used by SIMD_OP_LIST.
+// Make representation from type name used by SIMD_OP_LIST.
 #define REP(T) (kUnboxed##T)
 static const Representation kUnboxedBool = kTagged;
 static const Representation kUnboxedInt8 = kUnboxedInt32;
diff --git a/runtime/vm/compiler/backend/il.h b/runtime/vm/compiler/backend/il.h
index c4192ba..9f7562f 100644
--- a/runtime/vm/compiler/backend/il.h
+++ b/runtime/vm/compiler/backend/il.h
@@ -428,9 +428,7 @@
   M(SpecialParameter, kNoGC)                                                   \
   M(ClosureCall, _)                                                            \
   M(FfiCall, _)                                                                \
-  M(EnterHandleScope, kNoGC)                                                   \
-  M(ExitHandleScope, kNoGC)                                                    \
-  M(AllocateHandle, kNoGC)                                                     \
+  M(CCall, kNoGC)                                                              \
   M(RawStoreField, kNoGC)                                                      \
   M(InstanceCall, _)                                                           \
   M(PolymorphicInstanceCall, _)                                                \
@@ -5349,50 +5347,53 @@
   DISALLOW_COPY_AND_ASSIGN(FfiCallInstr);
 };
 
-class EnterHandleScopeInstr : public TemplateDefinition<0, NoThrow> {
+// The target address is passed in a register as the last input in IL.
+class CCallInstr : public Definition {
  public:
-  EnterHandleScopeInstr() {}
+  CCallInstr(
+      Zone* zone,
+      const compiler::ffi::NativeCallingConvention& native_calling_convention,
+      InputsArray* inputs);
 
-  DECLARE_INSTRUCTION(EnterHandleScope)
+  DECLARE_INSTRUCTION(CCall)
 
-  virtual Representation representation() const { return kUnboxedIntPtr; }
+  LocationSummary* MakeLocationSummaryInternal(Zone* zone,
+                                               const RegList temps) const;
+
+  // Input index of the function pointer to invoke.
+  intptr_t TargetAddressIndex() const {
+    return native_calling_convention_.argument_locations().length();
+  }
+
+  virtual intptr_t InputCount() const { return inputs_->length(); }
+  virtual Value* InputAt(intptr_t i) const { return inputs_->At(i); }
+  virtual bool MayThrow() const { return false; }
+
   virtual bool ComputeCanDeoptimize() const { return false; }
-  virtual bool HasUnknownSideEffects() const { return false; }
+
+  virtual bool HasUnknownSideEffects() const { return true; }
+
+  virtual bool CanCallDart() const { return false; }
+
+  virtual Representation RequiredInputRepresentation(intptr_t idx) const;
+  virtual Representation representation() const;
+
+  void EmitParamMoves(FlowGraphCompiler* compiler,
+                      Register saved_fp,
+                      Register temp0);
 
   PRINT_OPERANDS_TO_SUPPORT
 
  private:
-  DISALLOW_COPY_AND_ASSIGN(EnterHandleScopeInstr);
-};
+  virtual void RawSetInputAt(intptr_t i, Value* value) {
+    (*inputs_)[i] = value;
+  }
 
-class ExitHandleScopeInstr : public TemplateInstruction<0, NoThrow> {
- public:
-  ExitHandleScopeInstr() {}
+  Zone* const zone_;
+  const compiler::ffi::NativeCallingConvention& native_calling_convention_;
+  InputsArray* inputs_;
 
-  DECLARE_INSTRUCTION(ExitHandleScope)
-
-  virtual bool ComputeCanDeoptimize() const { return false; }
-  virtual bool HasUnknownSideEffects() const { return false; }
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(ExitHandleScopeInstr);
-};
-
-class AllocateHandleInstr : public TemplateDefinition<1, NoThrow> {
- public:
-  explicit AllocateHandleInstr(Value* scope) { SetInputAt(kScope, scope); }
-
-  enum { kScope = 0 };
-
-  DECLARE_INSTRUCTION(AllocateHandle)
-
-  virtual Representation RequiredInputRepresentation(intptr_t idx) const;
-  virtual Representation representation() const { return kUnboxedIntPtr; }
-  virtual bool ComputeCanDeoptimize() const { return false; }
-  virtual bool HasUnknownSideEffects() const { return false; }
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(AllocateHandleInstr);
+  DISALLOW_COPY_AND_ASSIGN(CCallInstr);
 };
 
 // Populates the untagged base + offset outside the heap with a tagged value.
diff --git a/runtime/vm/compiler/backend/il_arm.cc b/runtime/vm/compiler/backend/il_arm.cc
index 8794fb4..d42f1c2 100644
--- a/runtime/vm/compiler/backend/il_arm.cc
+++ b/runtime/vm/compiler/backend/il_arm.cc
@@ -1714,6 +1714,33 @@
   FunctionEntryInstr::EmitNativeCode(compiler);
 }
 
+#define R(r) (1 << r)
+
+LocationSummary* CCallInstr::MakeLocationSummary(Zone* zone,
+                                                 bool is_optimizing) const {
+  constexpr Register saved_fp = CallingConventions::kSecondNonArgumentRegister;
+  return MakeLocationSummaryInternal(zone, (R(saved_fp)));
+}
+
+#undef R
+
+void CCallInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
+  const Register saved_fp = locs()->temp(0).reg();
+  const Register temp0 = TMP;
+
+  __ MoveRegister(saved_fp, FPREG);
+
+  const intptr_t frame_space = native_calling_convention_.StackTopInBytes();
+  __ EnterCFrame(frame_space);
+
+  EmitParamMoves(compiler, saved_fp, temp0);
+
+  const Register target_address = locs()->in(TargetAddressIndex()).reg();
+  __ CallCFunction(target_address);
+
+  __ LeaveCFrame();
+}
+
 LocationSummary* OneByteStringFromCharCodeInstr::MakeLocationSummary(
     Zone* zone,
     bool opt) const {
diff --git a/runtime/vm/compiler/backend/il_arm64.cc b/runtime/vm/compiler/backend/il_arm64.cc
index 6c1cc4f..2dfcd42 100644
--- a/runtime/vm/compiler/backend/il_arm64.cc
+++ b/runtime/vm/compiler/backend/il_arm64.cc
@@ -1577,6 +1577,41 @@
   FunctionEntryInstr::EmitNativeCode(compiler);
 }
 
+#define R(r) (1 << r)
+
+LocationSummary* CCallInstr::MakeLocationSummary(Zone* zone,
+                                                 bool is_optimizing) const {
+  constexpr Register saved_csp = kAbiFirstPreservedCpuReg;
+  ASSERT(IsAbiPreservedRegister(saved_csp));
+  return MakeLocationSummaryInternal(zone, (R(saved_csp)));
+}
+
+#undef R
+
+void CCallInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
+  const Register saved_fp = TMP2;
+  const Register temp0 = TMP;
+  const Register saved_csp = locs()->temp(0).reg();
+
+  __ MoveRegister(saved_fp, FPREG);
+
+  const intptr_t frame_space = native_calling_convention_.StackTopInBytes();
+  __ EnterCFrame(frame_space);
+  ASSERT(IsAbiPreservedRegister(saved_csp));
+  __ mov(saved_csp, CSP);
+  __ mov(CSP, SP);
+
+  EmitParamMoves(compiler, saved_fp, temp0);
+
+  const Register target_address = locs()->in(TargetAddressIndex()).reg();
+  __ CallCFunction(target_address);
+
+  // We don't use the Dart SP; we leave the frame immediately after this.
+  // However, we need to set CSP to a 16-byte-aligned value far above the SP.
+  __ mov(CSP, saved_csp);
+  __ LeaveCFrame();
+}
+
 LocationSummary* OneByteStringFromCharCodeInstr::MakeLocationSummary(
     Zone* zone,
     bool opt) const {
diff --git a/runtime/vm/compiler/backend/il_ia32.cc b/runtime/vm/compiler/backend/il_ia32.cc
index 1915491..1513ce0 100644
--- a/runtime/vm/compiler/backend/il_ia32.cc
+++ b/runtime/vm/compiler/backend/il_ia32.cc
@@ -1248,6 +1248,34 @@
   FunctionEntryInstr::EmitNativeCode(compiler);
 }
 
+#define R(r) (1 << r)
+
+LocationSummary* CCallInstr::MakeLocationSummary(Zone* zone,
+                                                 bool is_optimizing) const {
+  constexpr Register saved_fp = CallingConventions::kSecondNonArgumentRegister;
+  constexpr Register temp0 = CallingConventions::kFfiAnyNonAbiRegister;
+  static_assert(saved_fp < temp0, "Unexpected ordering of registers in set.");
+  return MakeLocationSummaryInternal(zone, (R(saved_fp) | R(temp0)));
+}
+
+#undef R
+
+void CCallInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
+  const Register saved_fp = locs()->temp(0).reg();
+  const Register temp0 = locs()->temp(1).reg();
+
+  __ MoveRegister(saved_fp, FPREG);
+  const intptr_t frame_space = native_calling_convention_.StackTopInBytes();
+  __ EnterCFrame(frame_space);
+
+  EmitParamMoves(compiler, saved_fp, temp0);
+
+  const Register target_address = locs()->in(TargetAddressIndex()).reg();
+  __ CallCFunction(target_address);
+
+  __ LeaveCFrame();
+}
+
 static bool CanBeImmediateIndex(Value* value, intptr_t cid) {
   ConstantInstr* constant = value->definition()->AsConstant();
   if ((constant == NULL) ||
diff --git a/runtime/vm/compiler/backend/il_printer.cc b/runtime/vm/compiler/backend/il_printer.cc
index f901a37f..8f983bd 100644
--- a/runtime/vm/compiler/backend/il_printer.cc
+++ b/runtime/vm/compiler/backend/il_printer.cc
@@ -1286,8 +1286,20 @@
   }
 }
 
-void EnterHandleScopeInstr::PrintOperandsTo(BaseTextBuffer* f) const {
-  f->AddString("<enter handle scope>");
+void CCallInstr::PrintOperandsTo(BaseTextBuffer* f) const {
+  f->AddString(" target_address=");
+  InputAt(TargetAddressIndex())->PrintTo(f);
+
+  const auto& argument_locations =
+      native_calling_convention_.argument_locations();
+  for (intptr_t i = 0; i < argument_locations.length(); i++) {
+    const auto& arg_location = *argument_locations.At(i);
+    f->AddString(", ");
+    InputAt(i)->PrintTo(f);
+    f->AddString(" (@");
+    arg_location.PrintTo(f);
+    f->AddString(")");
+  }
 }
 
 void NativeReturnInstr::PrintOperandsTo(BaseTextBuffer* f) const {
diff --git a/runtime/vm/compiler/backend/il_riscv.cc b/runtime/vm/compiler/backend/il_riscv.cc
index 00b0cdb..73580e9 100644
--- a/runtime/vm/compiler/backend/il_riscv.cc
+++ b/runtime/vm/compiler/backend/il_riscv.cc
@@ -1426,13 +1426,7 @@
 
 #define R(r) (1 << r)
 
-LocationSummary* FfiCallInstr::MakeLocationSummary(Zone* zone,
-                                                   bool is_optimizing) const {
-  LocationSummary* summary = MakeLocationSummaryInternal(
-      zone, is_optimizing,
-      (R(CallingConventions::kSecondNonArgumentRegister) |
-       R(CallingConventions::kFfiAnyNonAbiRegister) | R(CALLEE_SAVED_TEMP2)));
-
+static void RemapA3A4A5(LocationSummary* summary) {
   // A3/A4/A5 are unavailable in normal register allocation because they are
   // assigned to TMP/TMP2/PP. This assignment is important for reducing code
   // size. We can't just override the normal blockage of these registers because
@@ -1450,6 +1444,18 @@
       summary->set_in(i, Location::RegisterLocation(T5));
     }
   }
+}
+
+#define R(r) (1 << r)
+
+LocationSummary* FfiCallInstr::MakeLocationSummary(Zone* zone,
+                                                   bool is_optimizing) const {
+  LocationSummary* summary = MakeLocationSummaryInternal(
+      zone, is_optimizing,
+      (R(CallingConventions::kSecondNonArgumentRegister) |
+       R(CallingConventions::kFfiAnyNonAbiRegister) | R(CALLEE_SAVED_TEMP2)));
+
+  RemapA3A4A5(summary);
   return summary;
 }
 
@@ -1759,6 +1765,47 @@
   FunctionEntryInstr::EmitNativeCode(compiler);
 }
 
+#define R(r) (1 << r)
+
+LocationSummary* CCallInstr::MakeLocationSummary(Zone* zone,
+                                                 bool is_optimizing) const {
+  constexpr Register saved_fp = CallingConventions::kSecondNonArgumentRegister;
+  constexpr Register temp0 = CallingConventions::kFfiAnyNonAbiRegister;
+  static_assert(saved_fp < temp0, "Unexpected ordering of registers in set.");
+  LocationSummary* summary =
+      MakeLocationSummaryInternal(zone, (R(saved_fp) | R(temp0)));
+  RemapA3A4A5(summary);
+  return summary;
+}
+
+#undef R
+
+void CCallInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
+  const Register saved_fp = locs()->temp(0).reg();
+  const Register temp0 = locs()->temp(1).reg();
+
+  // Beware! Do not use CODE_REG/TMP/TMP2/PP within CCallInstr as they are
+  // assigned to A2/A3/A4/A5, which may be in use as argument registers.
+  __ set_constant_pool_allowed(false);
+
+  __ MoveRegister(saved_fp, FPREG);
+
+  const intptr_t frame_space = native_calling_convention_.StackTopInBytes();
+  __ EnterCFrame(frame_space);
+
+  // This also performs the A3/A4/A5 remapping.
+  EmitParamMoves(compiler, saved_fp, temp0);
+
+  const Register target_address = locs()->in(TargetAddressIndex()).reg();
+  __ CallCFunction(target_address);
+
+  __ LeaveCFrame();
+
+  // PP is a volatile register, so it must be restored even for leaf FFI calls.
+  __ RestorePoolPointer();
+  __ set_constant_pool_allowed(true);
+}
+
 LocationSummary* OneByteStringFromCharCodeInstr::MakeLocationSummary(
     Zone* zone,
     bool opt) const {
diff --git a/runtime/vm/compiler/backend/il_x64.cc b/runtime/vm/compiler/backend/il_x64.cc
index b6b58ce..55fc995 100644
--- a/runtime/vm/compiler/backend/il_x64.cc
+++ b/runtime/vm/compiler/backend/il_x64.cc
@@ -1444,6 +1444,34 @@
   FunctionEntryInstr::EmitNativeCode(compiler);
 }
 
+#define R(r) (1 << r)
+
+LocationSummary* CCallInstr::MakeLocationSummary(Zone* zone,
+                                                 bool is_optimizing) const {
+  constexpr Register saved_fp = CallingConventions::kSecondNonArgumentRegister;
+  return MakeLocationSummaryInternal(zone, (R(saved_fp)));
+}
+
+#undef R
+
+void CCallInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
+  const Register saved_fp = locs()->temp(0).reg();
+  const Register temp0 = TMP;
+
+  // TODO(http://dartbug.com/47778): If we knew whether the stack was aligned
+  // at this point, we could omit having a frame.
+  __ MoveRegister(saved_fp, FPREG);
+
+  const intptr_t frame_space = native_calling_convention_.StackTopInBytes();
+  __ EnterCFrame(frame_space);
+
+  EmitParamMoves(compiler, saved_fp, temp0);
+  const Register target_address = locs()->in(TargetAddressIndex()).reg();
+  __ CallCFunction(target_address);
+
+  __ LeaveCFrame();
+}
+
 static bool CanBeImmediateIndex(Value* index, intptr_t cid) {
   if (!index->definition()->IsConstant()) return false;
   const Object& constant = index->definition()->AsConstant()->value();
diff --git a/runtime/vm/compiler/backend/redundancy_elimination.cc b/runtime/vm/compiler/backend/redundancy_elimination.cc
index c9245a5..7d37d1f 100644
--- a/runtime/vm/compiler/backend/redundancy_elimination.cc
+++ b/runtime/vm/compiler/backend/redundancy_elimination.cc
@@ -1876,7 +1876,7 @@
 
     // 1) Populate 'gen' sets with places which are initialized at each basic
     // block. Optimize lazy initializer calls within basic block and
-    // figure out if there are lazy intializer calls left to optimize.
+    // figure out if there are lazy initializer calls left to optimize.
     bool has_lazy_initializer_calls = false;
     for (BlockIterator block_it = graph_->reverse_postorder_iterator();
          !block_it.Done(); block_it.Advance()) {
@@ -4351,7 +4351,6 @@
   if (MayHaveVisibleEffect(current) || current->CanDeoptimize() ||
       current == block->last_instruction() || current->IsMaterializeObject() ||
       current->IsCheckStackOverflow() || current->IsReachabilityFence() ||
-      current->IsEnterHandleScope() || current->IsExitHandleScope() ||
       current->IsRawStoreField()) {
     return false;
   }
diff --git a/runtime/vm/compiler/compiler_pass.cc b/runtime/vm/compiler/compiler_pass.cc
index 2ea7890..f6a4dd3 100644
--- a/runtime/vm/compiler/compiler_pass.cc
+++ b/runtime/vm/compiler/compiler_pass.cc
@@ -311,6 +311,7 @@
   INVOKE_PASS(TypePropagation);
   INVOKE_PASS(WidenSmiToInt32);
   INVOKE_PASS(SelectRepresentations_Final);
+  INVOKE_PASS(CSE);
   INVOKE_PASS(TypePropagation);
   INVOKE_PASS(TryCatchOptimization);
   INVOKE_PASS(EliminateEnvironments);
diff --git a/runtime/vm/compiler/ffi/native_location.cc b/runtime/vm/compiler/ffi/native_location.cc
index ef92266..ca039d3 100644
--- a/runtime/vm/compiler/ffi/native_location.cc
+++ b/runtime/vm/compiler/ffi/native_location.cc
@@ -318,7 +318,12 @@
 }
 
 void NativeStackLocation::PrintTo(BaseTextBuffer* f) const {
-  f->Printf("S%+" Pd, offset_in_bytes_);
+  if (base_register_ != SPREG) {
+    f->Printf("S(%s)+%" Pd, RegisterNames::RegisterAbiName(base_register_),
+              offset_in_bytes_);
+  } else {
+    f->Printf("S+%" Pd, offset_in_bytes_);
+  }
   PrintRepresentations(f, *this);
 }
 
diff --git a/runtime/vm/compiler/ffi/native_type.cc b/runtime/vm/compiler/ffi/native_type.cc
index 6838082..a6cf2e4 100644
--- a/runtime/vm/compiler/ffi/native_type.cc
+++ b/runtime/vm/compiler/ffi/native_type.cc
@@ -8,6 +8,7 @@
 #include "platform/globals.h"
 #include "vm/class_id.h"
 #include "vm/compiler/ffi/abi.h"
+#include "vm/compiler/runtime_api.h"
 #include "vm/constants.h"
 #include "vm/zone_text_buffer.h"
 
@@ -583,6 +584,23 @@
                                                            Representation rep) {
   return *new (zone) NativePrimitiveType(fundamental_rep(rep));
 }
+
+const NativeFunctionType* NativeFunctionType::FromUnboxedRepresentation(
+    Zone* zone,
+    intptr_t num_arguments,
+    Representation representation) {
+  const auto& intptr_type =
+      compiler::ffi::NativePrimitiveType::FromUnboxedRepresentation(
+          zone, representation);
+  auto& argument_representations =
+      *new (zone) ZoneGrowableArray<const compiler::ffi::NativeType*>(
+          zone, num_arguments);
+  for (intptr_t i = 0; i < num_arguments; i++) {
+    argument_representations.Add(&intptr_type);
+  }
+  return new (zone)
+      compiler::ffi::NativeFunctionType(argument_representations, intptr_type);
+}
 #endif  // !defined(DART_PRECOMPILED_RUNTIME) && !defined(FFI_UNIT_TESTS)
 
 const char* NativeType::ToCString(Zone* zone,
diff --git a/runtime/vm/compiler/ffi/native_type.h b/runtime/vm/compiler/ffi/native_type.h
index a3a03bd..e85f091 100644
--- a/runtime/vm/compiler/ffi/native_type.h
+++ b/runtime/vm/compiler/ffi/native_type.h
@@ -404,6 +404,13 @@
                      const NativeType& return_type)
       : argument_types_(argument_types), return_type_(return_type) {}
 
+#if !defined(DART_PRECOMPILED_RUNTIME) && !defined(FFI_UNIT_TESTS)
+  static const NativeFunctionType* FromUnboxedRepresentation(
+      Zone* zone,
+      intptr_t num_arguments,
+      Representation representation);
+#endif
+
   const NativeTypes& argument_types() const { return argument_types_; }
   const NativeType& return_type() const { return return_type_; }
 
diff --git a/runtime/vm/compiler/frontend/kernel_to_il.cc b/runtime/vm/compiler/frontend/kernel_to_il.cc
index fda2a9b..8a20c37 100644
--- a/runtime/vm/compiler/frontend/kernel_to_il.cc
+++ b/runtime/vm/compiler/frontend/kernel_to_il.cc
@@ -31,6 +31,7 @@
 #include "vm/object_store.h"
 #include "vm/report.h"
 #include "vm/resolver.h"
+#include "vm/runtime_entry.h"
 #include "vm/scopes.h"
 #include "vm/stack_frame.h"
 #include "vm/symbols.h"
@@ -407,6 +408,37 @@
   return body;
 }
 
+Fragment FlowGraphBuilder::CCall(
+    const compiler::ffi::NativeCallingConvention& native_calling_convention) {
+  Fragment body;
+
+  const intptr_t num_arguments =
+      native_calling_convention.argument_locations().length() + 1;
+  InputsArray* arguments = new (Z) InputsArray(num_arguments);
+  arguments->FillWith(nullptr, 0, num_arguments);
+  for (intptr_t i = num_arguments - 1; i >= 0; --i) {
+    (*arguments)[i] = Pop();
+  }
+  auto* const call =
+      new (Z) CCallInstr(Z, native_calling_convention, arguments);
+
+  Push(call);
+  body <<= call;
+
+  return body;
+}
+
+Fragment FlowGraphBuilder::CCall(intptr_t num_arguments,
+                                 Representation representation) {
+  const auto& native_function_type =
+      *compiler::ffi::NativeFunctionType::FromUnboxedRepresentation(
+          Z, num_arguments, representation);
+  const auto& native_calling_convention =
+      compiler::ffi::NativeCallingConvention::FromSignature(
+          Z, native_function_type);
+  return CCall(native_calling_convention);
+}
+
 Fragment FlowGraphBuilder::RethrowException(TokenPosition position,
                                             int catch_try_index) {
   Fragment instructions;
@@ -3982,9 +4014,20 @@
 }
 
 Fragment FlowGraphBuilder::EnterHandleScope() {
-  auto* instr = new (Z) EnterHandleScopeInstr();
-  Push(instr);
-  return Fragment(instr);
+  Fragment body;
+  body += LoadThread();
+  body += ConvertUntaggedToUnboxed(kUnboxedIntPtr);  // argument.
+
+  // LoadThread again, we can't store it in a temp because it will end up
+  // in the environment of the FfiCall as untagged then.
+  body += LoadThread();
+  body += LoadUntagged(compiler::target::Thread::OffsetFromThread(
+      &kEnterHandleScopeRuntimeEntry));
+  body += ConvertUntaggedToUnboxed(kUnboxedFfiIntPtr);  // function address.
+
+  body += CCall(/*num_arguments=*/1);
+
+  return body;
 }
 
 Fragment FlowGraphBuilder::GetTopHandleScope() {
@@ -3996,18 +4039,33 @@
 }
 
 Fragment FlowGraphBuilder::ExitHandleScope() {
-  auto* instr = new (Z) ExitHandleScopeInstr();
-  return Fragment(instr);
+  Fragment code;
+  code += LoadThread();
+  code += ConvertUntaggedToUnboxed(kUnboxedIntPtr);  // argument.
+
+  code += LoadThread();
+  code += LoadUntagged(compiler::target::Thread::OffsetFromThread(
+      &kExitHandleScopeRuntimeEntry));
+  code += ConvertUntaggedToUnboxed(kUnboxedFfiIntPtr);  // function address.
+
+  code += CCall(/*num_arguments=*/1);
+
+  code += Drop();
+  return code;
 }
 
 Fragment FlowGraphBuilder::AllocateHandle() {
   Fragment code;
   // Get a reference to the top handle scope.
   code += GetTopHandleScope();
-  Value* api_local_scope_value = Pop();
-  auto* instr = new (Z) AllocateHandleInstr(api_local_scope_value);
-  Push(instr);
-  code <<= instr;
+
+  code += LoadThread();
+  code += LoadUntagged(
+      compiler::target::Thread::OffsetFromThread(&kAllocateHandleRuntimeEntry));
+  code += ConvertUntaggedToUnboxed(kUnboxedFfiIntPtr);  // function address.
+
+  code += CCall(/*num_arguments=*/1);
+
   return code;
 }
 
diff --git a/runtime/vm/compiler/frontend/kernel_to_il.h b/runtime/vm/compiler/frontend/kernel_to_il.h
index 697be82..ff641cc 100644
--- a/runtime/vm/compiler/frontend/kernel_to_il.h
+++ b/runtime/vm/compiler/frontend/kernel_to_il.h
@@ -196,6 +196,11 @@
 
   Fragment FfiCall(const compiler::ffi::CallMarshaller& marshaller);
 
+  Fragment CCall(
+      const compiler::ffi::NativeCallingConvention& native_calling_convention);
+  Fragment CCall(intptr_t num_arguments,
+                 Representation representation = kUnboxedFfiIntPtr);
+
   Fragment RethrowException(TokenPosition position, int catch_try_index);
   Fragment LoadLocal(LocalVariable* variable);
   Fragment IndirectGoto(intptr_t target_count);
diff --git a/runtime/vm/runtime_entry.cc b/runtime/vm/runtime_entry.cc
index ac9d41a..a3fcc07 100644
--- a/runtime/vm/runtime_entry.cc
+++ b/runtime/vm/runtime_entry.cc
@@ -3722,7 +3722,6 @@
   return GetThreadForNativeCallback(callback_id, 0);
 }
 
-// This is called directly by EnterHandleScopeInstr.
 extern "C" ApiLocalScope* DLRT_EnterHandleScope(Thread* thread) {
   CHECK_STACK_ALIGNMENT;
   TRACE_RUNTIME_CALL("EnterHandleScope %p", thread);
@@ -3737,7 +3736,6 @@
     false /* is_float */,
     reinterpret_cast<RuntimeFunction>(&DLRT_EnterHandleScope));
 
-// This is called directly by ExitHandleScopeInstr.
 extern "C" void DLRT_ExitHandleScope(Thread* thread) {
   CHECK_STACK_ALIGNMENT;
   TRACE_RUNTIME_CALL("ExitHandleScope %p", thread);
@@ -3750,7 +3748,6 @@
     false /* is_float */,
     reinterpret_cast<RuntimeFunction>(&DLRT_ExitHandleScope));
 
-// This is called directly by AllocateHandleInstr.
 extern "C" LocalHandle* DLRT_AllocateHandle(ApiLocalScope* scope) {
   CHECK_STACK_ALIGNMENT;
   TRACE_RUNTIME_CALL("AllocateHandle %p", scope);