[vm] Enable type stubs based type checks in JIT mode for some types.

Relanding 4be50d6fa1737dcb8402041ecb7334e5a34ba3a1 with fixes to DBC
and location summaries: AssertAssignable must save FPU registers.

For now we are limiting this to type checks against type parameter types.


In Dart 1 mode Dart2JS compiles itself in 28s when running from source
and in 23s when running from ideal app-jit snapshot (trained on the
same workload).

Before this change, the Dart 2 mode numbers were 51s and 57s, respectively.

After this change, the Dart 2 mode numbers are 38s and 32s, meaning
that the regression is reduced by 50%.

Issue https://github.com/dart-lang/sdk/issues/31798
Issue https://github.com/dart-lang/sdk/issues/33257

Change-Id: Ifb55f86453bfdf36a2e03bcd7f3197cfde257103
Reviewed-on: https://dart-review.googlesource.com/57980
Commit-Queue: Vyacheslav Egorov <vegorov@google.com>
Reviewed-by: Régis Crelier <regis@google.com>
diff --git a/runtime/vm/clustered_snapshot.cc b/runtime/vm/clustered_snapshot.cc
index 989960e..4cce63f 100644
--- a/runtime/vm/clustered_snapshot.cc
+++ b/runtime/vm/clustered_snapshot.cc
@@ -33,6 +33,10 @@
   return RawObject::FromAddr(address);
 }
 
+static bool SnapshotContainsTypeTestingStubs(Snapshot::Kind kind) {
+  return kind == Snapshot::kFullAOT || kind == Snapshot::kFullJIT;
+}
+
 void Deserializer::InitializeHeader(RawObject* raw,
                                     intptr_t class_id,
                                     intptr_t size,
@@ -3058,6 +3062,8 @@
 
   void WriteFill(Serializer* s) {
     const bool is_vm_isolate = s->isolate() == Dart::vm_isolate();
+    const bool should_write_type_testing_stub =
+        SnapshotContainsTypeTestingStubs(s->kind());
 
     intptr_t count = canonical_objects_.length();
     for (intptr_t i = 0; i < count; i++) {
@@ -3069,7 +3075,7 @@
       }
       s->WriteTokenPosition(type->ptr()->token_pos_);
       s->Write<int8_t>(type->ptr()->type_state_);
-      if (s->kind() == Snapshot::kFullAOT) {
+      if (should_write_type_testing_stub) {
         RawInstructions* instr = type_testing_stubs_.LookupByAddresss(
             type->ptr()->type_test_stub_entry_point_);
         s->WriteInstructions(instr, Code::null());
@@ -3085,7 +3091,7 @@
       }
       s->WriteTokenPosition(type->ptr()->token_pos_);
       s->Write<int8_t>(type->ptr()->type_state_);
-      if (s->kind() == Snapshot::kFullAOT) {
+      if (should_write_type_testing_stub) {
         RawInstructions* instr = type_testing_stubs_.LookupByAddresss(
             type->ptr()->type_test_stub_entry_point_);
         s->WriteInstructions(instr, Code::null());
@@ -3094,7 +3100,7 @@
 
     // The dynamic/void objects are not serialized, so we manually send
     // the type testing stub for it.
-    if (s->kind() == Snapshot::kFullAOT && is_vm_isolate) {
+    if (should_write_type_testing_stub && is_vm_isolate) {
       RawInstructions* dynamic_instr = type_testing_stubs_.LookupByAddresss(
           Type::dynamic_type().type_test_stub_entry_point());
       s->WriteInstructions(dynamic_instr, Code::null());
@@ -3137,6 +3143,8 @@
 
   void ReadFill(Deserializer* d) {
     const bool is_vm_isolate = d->isolate() == Dart::vm_isolate();
+    const bool should_read_type_testing_stub =
+        SnapshotContainsTypeTestingStubs(d->kind());
 
     for (intptr_t id = canonical_start_index_; id < canonical_stop_index_;
          id++) {
@@ -3150,7 +3158,7 @@
       }
       type->ptr()->token_pos_ = d->ReadTokenPosition();
       type->ptr()->type_state_ = d->Read<int8_t>();
-      if (d->kind() == Snapshot::kFullAOT) {
+      if (should_read_type_testing_stub) {
         instr_ = d->ReadInstructions();
         type_ = type;
         type_.SetTypeTestingStub(instr_);
@@ -3168,7 +3176,7 @@
       }
       type->ptr()->token_pos_ = d->ReadTokenPosition();
       type->ptr()->type_state_ = d->Read<int8_t>();
-      if (d->kind() == Snapshot::kFullAOT) {
+      if (should_read_type_testing_stub) {
         instr_ = d->ReadInstructions();
         type_ = type;
         type_.SetTypeTestingStub(instr_);
@@ -3177,7 +3185,7 @@
 
     // The dynamic/void objects are not serialized, so we manually send
     // the type testing stub for it.
-    if (d->kind() == Snapshot::kFullAOT && is_vm_isolate) {
+    if (should_read_type_testing_stub && is_vm_isolate) {
       instr_ = d->ReadInstructions();
       Type::dynamic_type().SetTypeTestingStub(instr_);
       instr_ = d->ReadInstructions();
@@ -3186,7 +3194,7 @@
   }
 
   void PostLoad(const Array& refs, Snapshot::Kind kind, Zone* zone) {
-    if (kind != Snapshot::kFullAOT) {
+    if (!SnapshotContainsTypeTestingStubs(kind)) {
       for (intptr_t id = canonical_start_index_; id < canonical_stop_index_;
            id++) {
         type_ ^= refs.At(id);
@@ -3237,6 +3245,9 @@
   }
 
   void WriteFill(Serializer* s) {
+    const bool should_write_type_testing_stub =
+        SnapshotContainsTypeTestingStubs(s->kind());
+
     intptr_t count = objects_.length();
     for (intptr_t i = 0; i < count; i++) {
       RawTypeRef* type = objects_[i];
@@ -3245,7 +3256,7 @@
       for (RawObject** p = from; p <= to; p++) {
         s->WriteRef(*p);
       }
-      if (s->kind() == Snapshot::kFullAOT) {
+      if (should_write_type_testing_stub) {
         RawInstructions* instr = type_testing_stubs_.LookupByAddresss(
             type->ptr()->type_test_stub_entry_point_);
         s->WriteInstructions(instr, Code::null());
@@ -3276,7 +3287,9 @@
   }
 
   void ReadFill(Deserializer* d) {
-    bool is_vm_object = d->isolate() == Dart::vm_isolate();
+    const bool is_vm_object = d->isolate() == Dart::vm_isolate();
+    const bool should_read_type_testing_stub =
+        SnapshotContainsTypeTestingStubs(d->kind());
 
     for (intptr_t id = start_index_; id < stop_index_; id++) {
       RawTypeRef* type = reinterpret_cast<RawTypeRef*>(d->Ref(id));
@@ -3287,7 +3300,7 @@
       for (RawObject** p = from; p <= to; p++) {
         *p = d->ReadRef();
       }
-      if (d->kind() == Snapshot::kFullAOT) {
+      if (should_read_type_testing_stub) {
         instr_ = d->ReadInstructions();
         type_ = type;
         type_.SetTypeTestingStub(instr_);
@@ -3295,6 +3308,16 @@
     }
   }
 
+  void PostLoad(const Array& refs, Snapshot::Kind kind, Zone* zone) {
+    if (!SnapshotContainsTypeTestingStubs(kind)) {
+      for (intptr_t id = start_index_; id < stop_index_; id++) {
+        type_ ^= refs.At(id);
+        instr_ = TypeTestingStubGenerator::DefaultCodeForType(type_);
+        type_.SetTypeTestingStub(instr_);
+      }
+    }
+  }
+
  private:
   AbstractType& type_;
   Instructions& instr_;
@@ -3331,6 +3354,9 @@
   }
 
   void WriteFill(Serializer* s) {
+    const bool should_write_type_testing_stub =
+        SnapshotContainsTypeTestingStubs(s->kind());
+
     intptr_t count = objects_.length();
     for (intptr_t i = 0; i < count; i++) {
       RawTypeParameter* type = objects_[i];
@@ -3343,7 +3369,7 @@
       s->WriteTokenPosition(type->ptr()->token_pos_);
       s->Write<int16_t>(type->ptr()->index_);
       s->Write<int8_t>(type->ptr()->type_state_);
-      if (s->kind() == Snapshot::kFullAOT) {
+      if (should_write_type_testing_stub) {
         RawInstructions* instr = type_testing_stubs_.LookupByAddresss(
             type->ptr()->type_test_stub_entry_point_);
         s->WriteInstructions(instr, Code::null());
@@ -3376,6 +3402,8 @@
 
   void ReadFill(Deserializer* d) {
     bool is_vm_object = d->isolate() == Dart::vm_isolate();
+    const bool should_read_type_testing_stub =
+        SnapshotContainsTypeTestingStubs(d->kind());
 
     for (intptr_t id = start_index_; id < stop_index_; id++) {
       RawTypeParameter* type = reinterpret_cast<RawTypeParameter*>(d->Ref(id));
@@ -3390,7 +3418,7 @@
       type->ptr()->token_pos_ = d->ReadTokenPosition();
       type->ptr()->index_ = d->Read<int16_t>();
       type->ptr()->type_state_ = d->Read<int8_t>();
-      if (d->kind() == Snapshot::kFullAOT) {
+      if (should_read_type_testing_stub) {
         instr_ = d->ReadInstructions();
         type_ = type;
         type_.SetTypeTestingStub(instr_);
@@ -3399,7 +3427,7 @@
   }
 
   void PostLoad(const Array& refs, Snapshot::Kind kind, Zone* zone) {
-    if (kind != Snapshot::kFullAOT) {
+    if (!SnapshotContainsTypeTestingStubs(kind)) {
       for (intptr_t id = start_index_; id < stop_index_; id++) {
         type_ ^= refs.At(id);
         instr_ = TypeTestingStubGenerator::DefaultCodeForType(type_);
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler.cc b/runtime/vm/compiler/backend/flow_graph_compiler.cc
index 199e17c..187e12f 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler.cc
@@ -1934,7 +1934,12 @@
   }
 }
 
-void FlowGraphCompiler::GenerateAssertAssignableAOT(
+bool FlowGraphCompiler::ShouldUseTypeTestingStubFor(bool optimizing,
+                                                    const AbstractType& type) {
+  return FLAG_precompiled_mode || (optimizing && type.IsTypeParameter());
+}
+
+void FlowGraphCompiler::GenerateAssertAssignableViaTypeTestingStub(
     const AbstractType& dst_type,
     const String& dst_name,
     const Register instance_reg,
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler.h b/runtime/vm/compiler/backend/flow_graph_compiler.h
index b4123f2..8185f32 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler.h
+++ b/runtime/vm/compiler/backend/flow_graph_compiler.h
@@ -364,21 +364,28 @@
                                 const AbstractType& dst_type,
                                 const String& dst_name,
                                 LocationSummary* locs);
-  void GenerateAssertAssignableAOT(TokenPosition token_pos,
-                                   intptr_t deopt_id,
-                                   const AbstractType& dst_type,
-                                   const String& dst_name,
-                                   LocationSummary* locs);
 
-  void GenerateAssertAssignableAOT(const AbstractType& dst_type,
-                                   const String& dst_name,
-                                   const Register instance_reg,
-                                   const Register instantiator_type_args_reg,
-                                   const Register function_type_args_reg,
-                                   const Register subtype_cache_reg,
-                                   const Register dst_type_reg,
-                                   const Register scratch_reg,
-                                   Label* done);
+  // Returns true if we can use a type testing stub based assert
+  // assignable code pattern for the given type.
+  static bool ShouldUseTypeTestingStubFor(bool optimizing,
+                                          const AbstractType& type);
+
+  void GenerateAssertAssignableViaTypeTestingStub(TokenPosition token_pos,
+                                                  intptr_t deopt_id,
+                                                  const AbstractType& dst_type,
+                                                  const String& dst_name,
+                                                  LocationSummary* locs);
+
+  void GenerateAssertAssignableViaTypeTestingStub(
+      const AbstractType& dst_type,
+      const String& dst_name,
+      const Register instance_reg,
+      const Register instantiator_type_args_reg,
+      const Register function_type_args_reg,
+      const Register subtype_cache_reg,
+      const Register dst_type_reg,
+      const Register scratch_reg,
+      Label* done);
 
 // DBC emits calls very differently from all other architectures due to its
 // interpreted nature.
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc b/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
index a27e76c..e41694f 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_arm.cc
@@ -663,7 +663,8 @@
   }
 
   if (FLAG_precompiled_mode) {
-    GenerateAssertAssignableAOT(token_pos, deopt_id, dst_type, dst_name, locs);
+    GenerateAssertAssignableViaTypeTestingStub(token_pos, deopt_id, dst_type,
+                                               dst_name, locs);
   } else {
     Label is_assignable_fast, is_assignable, runtime_call;
 
@@ -691,10 +692,11 @@
     __ PushObject(dst_name);  // Push the name of the destination.
     __ LoadUniqueObject(R0, test_cache);
     __ Push(R0);
-    GenerateRuntimeCall(token_pos, deopt_id, kTypeCheckRuntimeEntry, 6, locs);
+    __ PushObject(Smi::ZoneHandle(zone(), Smi::New(kTypeCheckFromInline)));
+    GenerateRuntimeCall(token_pos, deopt_id, kTypeCheckRuntimeEntry, 7, locs);
     // Pop the parameters supplied to the runtime entry. The result of the
     // type check runtime call is the checked value.
-    __ Drop(6);
+    __ Drop(7);
     __ Pop(R0);
     __ Bind(&is_assignable);
     __ PopList((1 << kFunctionTypeArgumentsReg) |
@@ -703,7 +705,7 @@
   }
 }
 
-void FlowGraphCompiler::GenerateAssertAssignableAOT(
+void FlowGraphCompiler::GenerateAssertAssignableViaTypeTestingStub(
     TokenPosition token_pos,
     intptr_t deopt_id,
     const AbstractType& dst_type,
@@ -719,10 +721,10 @@
 
   Label done;
 
-  GenerateAssertAssignableAOT(dst_type, dst_name, kInstanceReg,
-                              kInstantiatorTypeArgumentsReg,
-                              kFunctionTypeArgumentsReg, kSubtypeTestCacheReg,
-                              kDstTypeReg, kScratchReg, &done);
+  GenerateAssertAssignableViaTypeTestingStub(
+      dst_type, dst_name, kInstanceReg, kInstantiatorTypeArgumentsReg,
+      kFunctionTypeArgumentsReg, kSubtypeTestCacheReg, kDstTypeReg, kScratchReg,
+      &done);
   // We use 2 consecutive entries in the pool for the subtype cache and the
   // destination name.  The second entry, namely [dst_name] seems to be unused,
   // but it will be used by the code throwing a TypeError if the type test fails
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc b/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
index c76e56d..a74cec8 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_arm64.cc
@@ -643,8 +643,9 @@
     return;
   }
 
-  if (FLAG_precompiled_mode) {
-    GenerateAssertAssignableAOT(token_pos, deopt_id, dst_type, dst_name, locs);
+  if (ShouldUseTypeTestingStubFor(is_optimizing(), dst_type)) {
+    GenerateAssertAssignableViaTypeTestingStub(token_pos, deopt_id, dst_type,
+                                               dst_name, locs);
   } else {
     Label is_assignable_fast, is_assignable, runtime_call;
 
@@ -669,10 +670,11 @@
     __ PushObject(dst_name);  // Push the name of the destination.
     __ LoadUniqueObject(R0, test_cache);
     __ Push(R0);
-    GenerateRuntimeCall(token_pos, deopt_id, kTypeCheckRuntimeEntry, 6, locs);
+    __ PushObject(Smi::ZoneHandle(zone(), Smi::New(kTypeCheckFromInline)));
+    GenerateRuntimeCall(token_pos, deopt_id, kTypeCheckRuntimeEntry, 7, locs);
     // Pop the parameters supplied to the runtime entry. The result of the
     // type check runtime call is the checked value.
-    __ Drop(6);
+    __ Drop(7);
     __ Pop(R0);
     __ Bind(&is_assignable);
     __ PopPair(kFunctionTypeArgumentsReg, kInstantiatorTypeArgumentsReg);
@@ -680,7 +682,7 @@
   }
 }
 
-void FlowGraphCompiler::GenerateAssertAssignableAOT(
+void FlowGraphCompiler::GenerateAssertAssignableViaTypeTestingStub(
     TokenPosition token_pos,
     intptr_t deopt_id,
     const AbstractType& dst_type,
@@ -696,10 +698,10 @@
 
   Label done;
 
-  GenerateAssertAssignableAOT(dst_type, dst_name, kInstanceReg,
-                              kInstantiatorTypeArgumentsReg,
-                              kFunctionTypeArgumentsReg, kSubtypeTestCacheReg,
-                              kDstTypeReg, kScratchReg, &done);
+  GenerateAssertAssignableViaTypeTestingStub(
+      dst_type, dst_name, kInstanceReg, kInstantiatorTypeArgumentsReg,
+      kFunctionTypeArgumentsReg, kSubtypeTestCacheReg, kDstTypeReg, kScratchReg,
+      &done);
 
   // We use 2 consecutive entries in the pool for the subtype cache and the
   // destination name.  The second entry, namely [dst_name] seems to be unused,
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc b/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
index 0580e5c..f19eea3 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_ia32.cc
@@ -686,10 +686,11 @@
   __ PushObject(dst_name);  // Push the name of the destination.
   __ LoadObject(EAX, test_cache);
   __ pushl(EAX);
-  GenerateRuntimeCall(token_pos, deopt_id, kTypeCheckRuntimeEntry, 6, locs);
+  __ PushObject(Smi::ZoneHandle(zone(), Smi::New(kTypeCheckFromInline)));
+  GenerateRuntimeCall(token_pos, deopt_id, kTypeCheckRuntimeEntry, 7, locs);
   // Pop the parameters supplied to the runtime entry. The result of the
   // type check runtime call is the checked value.
-  __ Drop(6);
+  __ Drop(7);
   __ popl(EAX);
 
   __ Bind(&is_assignable);
diff --git a/runtime/vm/compiler/backend/flow_graph_compiler_x64.cc b/runtime/vm/compiler/backend/flow_graph_compiler_x64.cc
index 4a2c58b..0ef54a5 100644
--- a/runtime/vm/compiler/backend/flow_graph_compiler_x64.cc
+++ b/runtime/vm/compiler/backend/flow_graph_compiler_x64.cc
@@ -654,8 +654,9 @@
     return;
   }
 
-  if (FLAG_precompiled_mode) {
-    GenerateAssertAssignableAOT(token_pos, deopt_id, dst_type, dst_name, locs);
+  if (ShouldUseTypeTestingStubFor(is_optimizing(), dst_type)) {
+    GenerateAssertAssignableViaTypeTestingStub(token_pos, deopt_id, dst_type,
+                                               dst_name, locs);
   } else {
     Label is_assignable, runtime_call;
 
@@ -678,16 +679,17 @@
     __ PushObject(dst_name);  // Push the name of the destination.
     __ LoadUniqueObject(RAX, test_cache);
     __ pushq(RAX);
-    GenerateRuntimeCall(token_pos, deopt_id, kTypeCheckRuntimeEntry, 6, locs);
+    __ PushObject(Smi::ZoneHandle(zone(), Smi::New(kTypeCheckFromInline)));
+    GenerateRuntimeCall(token_pos, deopt_id, kTypeCheckRuntimeEntry, 7, locs);
     // Pop the parameters supplied to the runtime entry. The result of the
     // type check runtime call is the checked value.
-    __ Drop(6);
+    __ Drop(7);
     __ popq(RAX);
     __ Bind(&is_assignable);
   }
 }
 
-void FlowGraphCompiler::GenerateAssertAssignableAOT(
+void FlowGraphCompiler::GenerateAssertAssignableViaTypeTestingStub(
     TokenPosition token_pos,
     intptr_t deopt_id,
     const AbstractType& dst_type,
@@ -702,10 +704,10 @@
   const Register subtype_cache_reg = R9;
   const Register kScratchReg = RBX;
 
-  GenerateAssertAssignableAOT(dst_type, dst_name, kInstanceReg,
-                              kInstantiatorTypeArgumentsReg,
-                              kFunctionTypeArgumentsReg, subtype_cache_reg,
-                              kScratchReg, kScratchReg, &done);
+  GenerateAssertAssignableViaTypeTestingStub(
+      dst_type, dst_name, kInstanceReg, kInstantiatorTypeArgumentsReg,
+      kFunctionTypeArgumentsReg, subtype_cache_reg, kScratchReg, kScratchReg,
+      &done);
 
   // We use 2 consecutive entries in the pool for the subtype cache and the
   // destination name.  The second entry, namely [dst_name] seems to be unused,
diff --git a/runtime/vm/compiler/backend/il_arm.cc b/runtime/vm/compiler/backend/il_arm.cc
index 3d7b61d..e9a04e7 100644
--- a/runtime/vm/compiler/backend/il_arm.cc
+++ b/runtime/vm/compiler/backend/il_arm.cc
@@ -396,31 +396,45 @@
 
 LocationSummary* AssertAssignableInstr::MakeLocationSummary(Zone* zone,
                                                             bool opt) const {
-  // In AOT mode, we want to prevent spilling of the function/instantiator type
-  // argument vectors, since we preserve them.  So we make this a `kNoCall`
-  // summary.  Though most other registers can be modified by the type testing
-  // stubs we are calling.  To tell the register allocator about it, we reserve
+  // When using a type testing stub, we want to prevent spilling of the
+  // function/instantiator type argument vectors, since stub preserves them. So
+  // we make this a `kNoCall` summary, even though most other registers can be
+  // modified by the stub. To tell the register allocator about it, we reserve
   // all the other registers as temporary registers.
   // TODO(http://dartbug.com/32788): Simplify this.
   const Register kInstanceReg = R0;
   const Register kInstantiatorTypeArgumentsReg = R2;
   const Register kFunctionTypeArgumentsReg = R1;
 
+  const bool using_stub =
+      FlowGraphCompiler::ShouldUseTypeTestingStubFor(opt, dst_type());
+
   const intptr_t kNonChangeableInputRegs =
       (1 << kInstanceReg) | (1 << kInstantiatorTypeArgumentsReg) |
       (1 << kFunctionTypeArgumentsReg);
 
   const intptr_t kNumInputs = 3;
 
-  const intptr_t kNumTemps =
-      FLAG_precompiled_mode ? (Utils::CountOneBits64(kDartAvailableCpuRegs) -
-                               Utils::CountOneBits64(kNonChangeableInputRegs))
-                            : 0;
+  // We invoke a stub that can potentially clobber any CPU register
+  // but can only clobber FPU registers on the slow path when
+  // entering runtime. Preserve all FPU registers that are
+  // not guaranteed to be preserved by the ABI.
+  const intptr_t kCpuRegistersToPreserve =
+      kDartAvailableCpuRegs & ~kNonChangeableInputRegs;
+  const intptr_t kFpuRegistersToPreserve =
+      Utils::SignedNBitMask(kNumberOfFpuRegisters) &
+      ~(Utils::SignedNBitMask(kAbiPreservedFpuRegCount)
+        << kAbiFirstPreservedFpuReg) &
+      ~(1 << FpuTMP);
 
-  LocationSummary* summary = new (zone)
-      LocationSummary(zone, kNumInputs, kNumTemps,
-                      FLAG_precompiled_mode ? LocationSummary::kCallCalleeSafe
-                                            : LocationSummary::kCall);
+  const intptr_t kNumTemps =
+      using_stub ? (Utils::CountOneBits64(kCpuRegistersToPreserve) +
+                    Utils::CountOneBits64(kFpuRegistersToPreserve))
+                 : 0;
+
+  LocationSummary* summary = new (zone) LocationSummary(
+      zone, kNumInputs, kNumTemps,
+      using_stub ? LocationSummary::kCallCalleeSafe : LocationSummary::kCall);
   summary->set_in(0, Location::RegisterLocation(kInstanceReg));  // Value.
   summary->set_in(1,
                   Location::RegisterLocation(
@@ -432,17 +446,24 @@
   // once register allocator no longer hits assertion.
   summary->set_out(0, Location::RegisterLocation(kInstanceReg));
 
-  if (FLAG_precompiled_mode) {
+  if (using_stub) {
     // Let's reserve all registers except for the input ones.
     intptr_t next_temp = 0;
     for (intptr_t i = 0; i < kNumberOfCpuRegisters; ++i) {
-      const bool is_allocatable = ((1 << i) & kDartAvailableCpuRegs) != 0;
-      const bool is_input = ((1 << i) & kNonChangeableInputRegs) != 0;
-      if (is_allocatable && !is_input) {
+      const bool should_preserve = ((1 << i) & kCpuRegistersToPreserve) != 0;
+      if (should_preserve) {
         summary->set_temp(next_temp++,
                           Location::RegisterLocation(static_cast<Register>(i)));
       }
     }
+
+    for (intptr_t i = 0; i < kNumberOfFpuRegisters; i++) {
+      const bool should_preserve = ((1 << i) & kFpuRegistersToPreserve) != 0;
+      if (should_preserve) {
+        summary->set_temp(next_temp++, Location::FpuRegisterLocation(
+                                           static_cast<FpuRegister>(i)));
+      }
+    }
   }
 
   return summary;
diff --git a/runtime/vm/compiler/backend/il_arm64.cc b/runtime/vm/compiler/backend/il_arm64.cc
index 9a9bc82..204dd69 100644
--- a/runtime/vm/compiler/backend/il_arm64.cc
+++ b/runtime/vm/compiler/backend/il_arm64.cc
@@ -394,31 +394,43 @@
 
 LocationSummary* AssertAssignableInstr::MakeLocationSummary(Zone* zone,
                                                             bool opt) const {
-  // In AOT mode, we want to prevent spilling of the function/instantiator type
-  // argument vectors, since we preserve them.  So we make this a `kNoCall`
-  // summary.  Though most other registers can be modified by the type testing
-  // stubs we are calling.  To tell the register allocator about it, we reserve
+  // When using a type testing stub, we want to prevent spilling of the
+  // function/instantiator type argument vectors, since stub preserves them. So
+  // we make this a `kNoCall` summary, even though most other registers can be
+  // modified by the stub. To tell the register allocator about it, we reserve
   // all the other registers as temporary registers.
   // TODO(http://dartbug.com/32788): Simplify this.
   const Register kInstanceReg = R0;
   const Register kInstantiatorTypeArgumentsReg = R1;
   const Register kFunctionTypeArgumentsReg = R2;
 
+  const bool using_stub =
+      FlowGraphCompiler::ShouldUseTypeTestingStubFor(opt, dst_type());
+
   const intptr_t kNonChangeableInputRegs =
       (1 << kInstanceReg) | (1 << kInstantiatorTypeArgumentsReg) |
       (1 << kFunctionTypeArgumentsReg);
 
   const intptr_t kNumInputs = 3;
 
-  const intptr_t kNumTemps =
-      FLAG_precompiled_mode ? (Utils::CountOneBits64(kDartAvailableCpuRegs) -
-                               Utils::CountOneBits64(kNonChangeableInputRegs))
-                            : 0;
+  // We invoke a stub that can potentially clobber any CPU register
+  // but can only clobber FPU registers on the slow path when
+  // entering runtime. ARM64 ABI only guarantees that lower
+  // 64 bits of each V register are preserved, so we block all
+  // of them except for FpuTMP.
+  const intptr_t kCpuRegistersToPreserve =
+      kDartAvailableCpuRegs & ~kNonChangeableInputRegs;
+  const intptr_t kFpuRegistersToPreserve =
+      Utils::SignedNBitMask(kNumberOfFpuRegisters) & ~(1l << FpuTMP);
 
-  LocationSummary* summary = new (zone)
-      LocationSummary(zone, kNumInputs, kNumTemps,
-                      FLAG_precompiled_mode ? LocationSummary::kCallCalleeSafe
-                                            : LocationSummary::kCall);
+  const intptr_t kNumTemps =
+      using_stub ? (Utils::CountOneBits64(kCpuRegistersToPreserve) +
+                    Utils::CountOneBits64(kFpuRegistersToPreserve))
+                 : 0;
+
+  LocationSummary* summary = new (zone) LocationSummary(
+      zone, kNumInputs, kNumTemps,
+      using_stub ? LocationSummary::kCallCalleeSafe : LocationSummary::kCall);
   summary->set_in(0, Location::RegisterLocation(kInstanceReg));  // Value.
   summary->set_in(1,
                   Location::RegisterLocation(
@@ -430,17 +442,24 @@
   // once register allocator no longer hits assertion.
   summary->set_out(0, Location::RegisterLocation(kInstanceReg));
 
-  if (FLAG_precompiled_mode) {
+  if (using_stub) {
     // Let's reserve all registers except for the input ones.
     intptr_t next_temp = 0;
     for (intptr_t i = 0; i < kNumberOfCpuRegisters; ++i) {
-      const bool is_allocatable = ((1 << i) & kDartAvailableCpuRegs) != 0;
-      const bool is_input = ((1 << i) & kNonChangeableInputRegs) != 0;
-      if (is_allocatable && !is_input) {
+      const bool should_preserve = ((1 << i) & kCpuRegistersToPreserve) != 0;
+      if (should_preserve) {
         summary->set_temp(next_temp++,
                           Location::RegisterLocation(static_cast<Register>(i)));
       }
     }
+
+    for (intptr_t i = 0; i < kNumberOfFpuRegisters; i++) {
+      const bool should_preserve = ((1l << i) & kFpuRegistersToPreserve) != 0;
+      if (should_preserve) {
+        summary->set_temp(next_temp++, Location::FpuRegisterLocation(
+                                           static_cast<FpuRegister>(i)));
+      }
+    }
   }
 
   return summary;
diff --git a/runtime/vm/compiler/backend/il_x64.cc b/runtime/vm/compiler/backend/il_x64.cc
index cbbd640..57490ec 100644
--- a/runtime/vm/compiler/backend/il_x64.cc
+++ b/runtime/vm/compiler/backend/il_x64.cc
@@ -362,31 +362,42 @@
 
 LocationSummary* AssertAssignableInstr::MakeLocationSummary(Zone* zone,
                                                             bool opt) const {
-  // In AOT mode, we want to prevent spilling of the function/instantiator type
-  // argument vectors, since we preserve them.  So we make this a `kNoCall`
-  // summary.  Though most other registers can be modified by the type testing
-  // stubs we are calling.  To tell the register allocator about it, we reserve
+  // When using a type testing stub, we want to prevent spilling of the
+  // function/instantiator type argument vectors, since stub preserves them. So
+  // we make this a `kNoCall` summary, even though most other registers can be
+  // modified by the stub. To tell the register allocator about it, we reserve
   // all the other registers as temporary registers.
   // TODO(http://dartbug.com/32788): Simplify this.
   const Register kInstanceReg = RAX;
   const Register kInstantiatorTypeArgumentsReg = RDX;
   const Register kFunctionTypeArgumentsReg = RCX;
 
+  const bool using_stub =
+      FlowGraphCompiler::ShouldUseTypeTestingStubFor(opt, dst_type());
+
   const intptr_t kNonChangeableInputRegs =
       (1 << kInstanceReg) | (1 << kInstantiatorTypeArgumentsReg) |
       (1 << kFunctionTypeArgumentsReg);
 
   const intptr_t kNumInputs = 3;
 
-  const intptr_t kNumTemps =
-      FLAG_precompiled_mode ? (Utils::CountOneBits64(kDartAvailableCpuRegs) -
-                               Utils::CountOneBits64(kNonChangeableInputRegs))
-                            : 0;
+  // We invoke a stub that can potentially clobber any CPU register
+  // but can only clobber FPU registers on the slow path when
+  // entering runtime. Preserve all FPU registers that are
+  // not guaranteed to be preserved by the ABI.
+  const intptr_t kCpuRegistersToPreserve =
+      kDartAvailableCpuRegs & ~kNonChangeableInputRegs;
+  const intptr_t kFpuRegistersToPreserve =
+      CallingConventions::kVolatileXmmRegisters & ~(1 << FpuTMP);
 
-  LocationSummary* summary = new (zone)
-      LocationSummary(zone, kNumInputs, kNumTemps,
-                      FLAG_precompiled_mode ? LocationSummary::kCallCalleeSafe
-                                            : LocationSummary::kCall);
+  const intptr_t kNumTemps =
+      using_stub ? (Utils::CountOneBits64(kCpuRegistersToPreserve) +
+                    Utils::CountOneBits64(kFpuRegistersToPreserve))
+                 : 0;
+
+  LocationSummary* summary = new (zone) LocationSummary(
+      zone, kNumInputs, kNumTemps,
+      using_stub ? LocationSummary::kCallCalleeSafe : LocationSummary::kCall);
   summary->set_in(0, Location::RegisterLocation(kInstanceReg));  // Value.
   summary->set_in(1,
                   Location::RegisterLocation(
@@ -398,17 +409,24 @@
   // once register allocator no longer hits assertion.
   summary->set_out(0, Location::RegisterLocation(kInstanceReg));
 
-  if (FLAG_precompiled_mode) {
+  if (using_stub) {
     // Let's reserve all registers except for the input ones.
     intptr_t next_temp = 0;
     for (intptr_t i = 0; i < kNumberOfCpuRegisters; ++i) {
-      const bool is_allocatable = ((1 << i) & kDartAvailableCpuRegs) != 0;
-      const bool is_input = ((1 << i) & kNonChangeableInputRegs) != 0;
-      if (is_allocatable && !is_input) {
+      const bool should_preserve = ((1 << i) & kCpuRegistersToPreserve) != 0;
+      if (should_preserve) {
         summary->set_temp(next_temp++,
                           Location::RegisterLocation(static_cast<Register>(i)));
       }
     }
+
+    for (intptr_t i = 0; i < kNumberOfFpuRegisters; i++) {
+      const bool should_preserve = ((1 << i) & kFpuRegistersToPreserve) != 0;
+      if (should_preserve) {
+        summary->set_temp(next_temp++, Location::FpuRegisterLocation(
+                                           static_cast<FpuRegister>(i)));
+      }
+    }
   }
 
   return summary;
diff --git a/runtime/vm/instructions.h b/runtime/vm/instructions.h
index e0ec51f..eb39a5d 100644
--- a/runtime/vm/instructions.h
+++ b/runtime/vm/instructions.h
@@ -28,7 +28,7 @@
 
 bool DecodeLoadObjectFromPoolOrThread(uword pc, const Code& code, Object* obj);
 
-#if defined(DART_PRECOMPILER) || defined(DART_PRECOMPILED_RUNTIME)
+#if !defined(TARGET_ARCH_IA32) && !defined(TARGET_ARCH_DBC)
 
 class TypeTestingStubCallPattern : public ValueObject {
  public:
@@ -40,7 +40,7 @@
   const uword pc_;
 };
 
-#endif  // defined(DART_PRECOMPILER) || defined(DART_PRECOMPILED_RUNTIME)
+#endif  // !defined(TARGET_ARCH_IA32) && !defined(TARGET_ARCH_DBC)
 
 }  // namespace dart
 
diff --git a/runtime/vm/instructions_arm.cc b/runtime/vm/instructions_arm.cc
index f62215f..38dbee5 100644
--- a/runtime/vm/instructions_arm.cc
+++ b/runtime/vm/instructions_arm.cc
@@ -285,8 +285,6 @@
   return false;
 }
 
-#if defined(DART_PRECOMPILER) || defined(DART_PRECOMPILED_RUNTIME)
-
 intptr_t TypeTestingStubCallPattern::GetSubtypeTestCachePoolIndex() {
   // Calls to the type testing stubs look like:
   //   ldr R3, [PP+idx]
@@ -304,8 +302,6 @@
   return pool_index;
 }
 
-#endif  // defined(DART_PRECOMPILER) || defined(DART_PRECOMPILED_RUNTIME)
-
 }  // namespace dart
 
 #endif  // defined TARGET_ARCH_ARM
diff --git a/runtime/vm/instructions_arm64.cc b/runtime/vm/instructions_arm64.cc
index f728ffd..a08eda0 100644
--- a/runtime/vm/instructions_arm64.cc
+++ b/runtime/vm/instructions_arm64.cc
@@ -351,8 +351,6 @@
   return bx_lr->InstructionBits() == instruction;
 }
 
-#if defined(DART_PRECOMPILER) || defined(DART_PRECOMPILED_RUNTIME)
-
 intptr_t TypeTestingStubCallPattern::GetSubtypeTestCachePoolIndex() {
   // Calls to the type testing stubs look like:
   //   ldr R3, [PP+idx]
@@ -370,8 +368,6 @@
   return pool_index;
 }
 
-#endif  // defined(DART_PRECOMPILER) || defined(DART_PRECOMPILED_RUNTIME)
-
 }  // namespace dart
 
 #endif  // defined TARGET_ARCH_ARM64
diff --git a/runtime/vm/instructions_x64.cc b/runtime/vm/instructions_x64.cc
index 3c63a75..4765a52 100644
--- a/runtime/vm/instructions_x64.cc
+++ b/runtime/vm/instructions_x64.cc
@@ -64,8 +64,6 @@
   return false;
 }
 
-#if defined(DART_PRECOMPILER) || defined(DART_PRECOMPILED_RUNTIME)
-
 intptr_t TypeTestingStubCallPattern::GetSubtypeTestCachePoolIndex() {
   const intptr_t kCallPatternSize = 10;
   static int16_t pattern[kCallPatternSize] = {
@@ -78,8 +76,6 @@
   return IndexFromPPLoad(start + 3);
 }
 
-#endif  // defined(DART_PRECOMPILER) || defined(DART_PRECOMPILED_RUNTIME)
-
 }  // namespace dart
 
 #endif  // defined TARGET_ARCH_X64
diff --git a/runtime/vm/interpreter.cc b/runtime/vm/interpreter.cc
index d8b68f7..dba91ad 100644
--- a/runtime/vm/interpreter.cc
+++ b/runtime/vm/interpreter.cc
@@ -3368,8 +3368,9 @@
       SP[4] = args[2];  // function type args
       SP[5] = args[4];  // name
       SP[6] = cache;
-      Exit(thread, FP, SP + 7, pc);
-      NativeArguments native_args(thread, 6, SP + 1, SP - 4);
+      SP[7] = Smi::New(kTypeCheckFromInline);
+      Exit(thread, FP, SP + 8, pc);
+      NativeArguments native_args(thread, 7, SP + 1, SP - 4);
       INVOKE_RUNTIME(DRT_TypeCheck, native_args);
     }
 
diff --git a/runtime/vm/object.cc b/runtime/vm/object.cc
index 8be76ae..ed6ecb1 100644
--- a/runtime/vm/object.cc
+++ b/runtime/vm/object.cc
@@ -18247,6 +18247,8 @@
   cloned_ref_type = ref_type.CloneUninstantiated(new_owner, trail);
   ASSERT(!cloned_ref_type.IsTypeRef());
   cloned_type_ref.set_type(cloned_ref_type);
+  cloned_type_ref.SetTypeTestingStub(Instructions::Handle(
+      TypeTestingStubGenerator::DefaultCodeForType(cloned_type_ref)));
   return cloned_type_ref.raw();
 }
 
diff --git a/runtime/vm/raw_object.h b/runtime/vm/raw_object.h
index e62baec..f1c80c5 100644
--- a/runtime/vm/raw_object.h
+++ b/runtime/vm/raw_object.h
@@ -1811,6 +1811,13 @@
     kFinalizedInstantiated,    // Instantiated type ready for use.
     kFinalizedUninstantiated,  // Uninstantiated type ready for use.
   };
+
+  // Note: we don't handle this field in GC in any special way.
+  // Instead we rely on two things:
+  //   (1) GC not moving code objects and
+  //   (2) lifetime of optimized stubs exceeding that of types.
+  // Practically (2) means that optimized stubs never die because
+  // canonical types to which they are attached never die.
   uword type_test_stub_entry_point_;  // Accessed from generated code.
 
  private:
diff --git a/runtime/vm/raw_object_snapshot.cc b/runtime/vm/raw_object_snapshot.cc
index e8af8b8..e5571e4 100644
--- a/runtime/vm/raw_object_snapshot.cc
+++ b/runtime/vm/raw_object_snapshot.cc
@@ -9,6 +9,7 @@
 #include "vm/snapshot.h"
 #include "vm/stub_code.h"
 #include "vm/symbols.h"
+#include "vm/type_testing_stubs.h"
 #include "vm/visitor.h"
 
 namespace dart {
@@ -236,6 +237,11 @@
     type.SetCanonical();
   }
 
+  // Fill in the type testing stub.
+  Instructions& instr = *reader->InstructionsHandle();
+  instr = TypeTestingStubGenerator::DefaultCodeForType(type);
+  type.SetTypeTestingStub(instr);
+
   return type.raw();
 }
 
@@ -306,6 +312,11 @@
   READ_OBJECT_FIELDS(type_ref, type_ref.raw()->from(), type_ref.raw()->to(),
                      kAsReference);
 
+  // Fill in the type testing stub.
+  Instructions& instr = *reader->InstructionsHandle();
+  instr = TypeTestingStubGenerator::DefaultCodeForType(type_ref);
+  type_ref.SetTypeTestingStub(instr);
+
   return type_ref.raw();
 }
 
@@ -357,6 +368,11 @@
       Class::RawCast(reader->ReadObjectImpl(kAsReference));
   type_parameter.set_parameterized_class(*reader->ClassHandle());
 
+  // Fill in the type testing stub.
+  Instructions& instr = *reader->InstructionsHandle();
+  instr = TypeTestingStubGenerator::DefaultCodeForType(type_parameter);
+  type_parameter.SetTypeTestingStub(instr);
+
   return type_parameter.raw();
 }
 
diff --git a/runtime/vm/runtime_entry.cc b/runtime/vm/runtime_entry.cc
index 645f1ea..da751b3 100644
--- a/runtime/vm/runtime_entry.cc
+++ b/runtime/vm/runtime_entry.cc
@@ -26,6 +26,7 @@
 #include "vm/stack_frame.h"
 #include "vm/symbols.h"
 #include "vm/thread_registry.h"
+#include "vm/type_testing_stubs.h"
 #include "vm/verifier.h"
 
 namespace dart {
@@ -680,8 +681,9 @@
 // Arg3: type arguments of the function of the type being assigned to.
 // Arg4: name of variable being assigned to.
 // Arg5: SubtypeTestCache.
+// Arg6: invocation mode (see TypeCheckMode).
 // Return value: instance if a subtype, otherwise throw a TypeError.
-DEFINE_RUNTIME_ENTRY(TypeCheck, 6) {
+DEFINE_RUNTIME_ENTRY(TypeCheck, 7) {
   const Instance& src_instance =
       Instance::CheckedHandle(zone, arguments.ArgAt(0));
   AbstractType& dst_type =
@@ -698,6 +700,13 @@
   cache ^= arguments.ArgAt(5);
   ASSERT(cache.IsNull() || cache.IsSubtypeTestCache());
 
+  const TypeCheckMode mode = static_cast<TypeCheckMode>(
+      Smi::CheckedHandle(zone, arguments.ArgAt(6)).Value());
+
+#if defined(TARGET_ARCH_IA32) || defined(TARGET_ARCH_DBC)
+  ASSERT(mode == kTypeCheckFromInline);
+#endif
+
   ASSERT(!dst_type.IsMalformed());    // Already checked in code generator.
   ASSERT(!dst_type.IsMalbounded());   // Already checked in code generator.
   ASSERT(!dst_type.IsDynamicType());  // No need to check assignment.
@@ -731,8 +740,10 @@
       bound_error_message = String::New(bound_error.ToErrorCString());
     }
     if (dst_name.IsNull()) {
-#if !defined(TARGET_ARCH_DBC) && !defined(TARGET_ARCH_IA32) &&                 \
-    (defined(DART_PRECOMPILER) || defined(DART_PRECOMPILED_RUNTIME))
+#if !defined(TARGET_ARCH_DBC) && !defined(TARGET_ARCH_IA32)
+      // Can only come here from type testing stub.
+      ASSERT(mode != kTypeCheckFromInline);
+
       // Grab the [dst_name] from the pool.  It's stored at one pool slot after
       // the subtype-test-cache.
       DartFrameIterator iterator(thread,
@@ -756,14 +767,20 @@
     UNREACHABLE();
   }
 
-  if (cache.IsNull()) {
+  bool should_update_cache = true;
 #if !defined(TARGET_ARCH_DBC) && !defined(TARGET_ARCH_IA32) &&                 \
-    (defined(DART_PRECOMPILER) || defined(DART_PRECOMPILED_RUNTIME))
+    !defined(DART_PRECOMPILED_RUNTIME)
+  if (mode == kTypeCheckFromLazySpecializeStub) {
+    TypeTestingStubGenerator::SpecializeStubFor(thread, dst_type);
+    // Only create the cache when we come from a normal stub.
+    should_update_cache = false;
+  }
+#endif
 
-#if defined(DART_PRECOMPILER)
-    if (FLAG_precompiled_mode) {
-#endif  // defined(DART_PRECOMPILER)
-
+  if (should_update_cache) {
+    if (cache.IsNull()) {
+#if !defined(TARGET_ARCH_DBC) && !defined(TARGET_ARCH_IA32)
+      ASSERT(mode == kTypeCheckFromSlowStub);
       // We lazily create [SubtypeTestCache] for those call sites which actually
       // need one and will patch the pool entry.
       DartFrameIterator iterator(thread,
@@ -780,37 +797,15 @@
       ASSERT(pool.ObjectAt(stc_pool_idx) == Object::null());
       cache = SubtypeTestCache::New();
       pool.SetObjectAt(stc_pool_idx, cache);
-
-#if defined(DART_PRECOMPILER)
-    }
-#endif  // defined(DART_PRECOMPILER)
-
 #else
-    // WARNING: If we ever come here, it's a really bad sign, because it means
-    // that there was a type test, which generated code could not handle but we
-    // have no subtype cache.  Which means that this successfully-passing type
-    // check will always go to runtime.
-    //
-    // Currently there is one known case when this happens:
-    //
-    // The [FlowGraphCompiler::GenerateInstantiatedTypeNoArgumentsTest] is
-    // handling type checks against int/num specially: It generates a number of
-    // class-id checks.  Unfortunately it handles only normal implementations of
-    // 'int', such as kSmiCid, kMintCid, kBigintCid.  It will signal that there
-    // is no subtype-cache necessary on that call site, because all integer
-    // types have been handled.
-    //
-    // -> Though this is not true, due to (from runtime/lib/array_patch.dart):
-    //
-    //    class _GrowableArrayMarker implements int { }
-    //
-    // Because of this, we cannot have an `UNREACHABLE()` here, but rather just
-    // have a NOP and return `true`, to signal the type check passed.
-#endif  // defined(DART_PRECOMPILER) || defined(DART_PRECOMPILED_RUNTIME)
+      UNREACHABLE();
+#endif
+    }
+
+    UpdateTypeTestCache(src_instance, dst_type, instantiator_type_arguments,
+                        function_type_arguments, Bool::True(), cache);
   }
 
-  UpdateTypeTestCache(src_instance, dst_type, instantiator_type_arguments,
-                      function_type_arguments, Bool::True(), cache);
   arguments.SetReturn(src_instance);
 }
 
diff --git a/runtime/vm/simulator_dbc.cc b/runtime/vm/simulator_dbc.cc
index 487143f..46e467b 100644
--- a/runtime/vm/simulator_dbc.cc
+++ b/runtime/vm/simulator_dbc.cc
@@ -3060,8 +3060,9 @@
       SP[4] = args[2];  // function type args
       SP[5] = args[4];  // name
       SP[6] = cache;
-      Exit(thread, FP, SP + 7, pc);
-      NativeArguments native_args(thread, 6, SP + 1, SP - 4);
+      SP[7] = Smi::New(kTypeCheckFromInline);
+      Exit(thread, FP, SP + 8, pc);
+      NativeArguments native_args(thread, 7, SP + 1, SP - 4);
       INVOKE_RUNTIME(DRT_TypeCheck, native_args);
     }
 
diff --git a/runtime/vm/snapshot.cc b/runtime/vm/snapshot.cc
index d3667ce..d1fb4ba 100644
--- a/runtime/vm/snapshot.cc
+++ b/runtime/vm/snapshot.cc
@@ -192,6 +192,7 @@
       old_space_(thread_->isolate()->heap()->old_space()),
       cls_(Class::Handle(zone_)),
       code_(Code::Handle(zone_)),
+      instructions_(Instructions::Handle(zone_)),
       obj_(Object::Handle(zone_)),
       pobj_(PassiveObject::Handle(zone_)),
       array_(Array::Handle(zone_)),
diff --git a/runtime/vm/snapshot.h b/runtime/vm/snapshot.h
index 60a944f..b630cc3 100644
--- a/runtime/vm/snapshot.h
+++ b/runtime/vm/snapshot.h
@@ -335,6 +335,7 @@
   Array* ArrayHandle() { return &array_; }
   Class* ClassHandle() { return &cls_; }
   Code* CodeHandle() { return &code_; }
+  Instructions* InstructionsHandle() { return &instructions_; }
   String* StringHandle() { return &str_; }
   AbstractType* TypeHandle() { return &type_; }
   TypeArguments* TypeArgumentsHandle() { return &type_arguments_; }
@@ -440,20 +441,21 @@
 
   bool is_vm_isolate() const;
 
-  Snapshot::Kind kind_;   // Indicates type of snapshot(full, script, message).
-  Thread* thread_;        // Current thread.
-  Zone* zone_;            // Zone for allocations while reading snapshot.
-  Heap* heap_;            // Heap of the current isolate.
-  PageSpace* old_space_;  // Old space of the current isolate.
-  Class& cls_;            // Temporary Class handle.
-  Code& code_;            // Temporary Code handle.
-  Object& obj_;           // Temporary Object handle.
-  PassiveObject& pobj_;   // Temporary PassiveObject handle.
-  Array& array_;          // Temporary Array handle.
-  Field& field_;          // Temporary Field handle.
-  String& str_;           // Temporary String handle.
-  Library& library_;      // Temporary library handle.
-  AbstractType& type_;    // Temporary type handle.
+  Snapshot::Kind kind_;            // Indicates type of the snapshot.
+  Thread* thread_;                 // Current thread.
+  Zone* zone_;                     // Zone for allocations while reading.
+  Heap* heap_;                     // Heap of the current isolate.
+  PageSpace* old_space_;           // Old space of the current isolate.
+  Class& cls_;                     // Temporary Class handle.
+  Code& code_;                     // Temporary Code handle.
+  Instructions& instructions_;     // Temporary Instructions handle.
+  Object& obj_;                    // Temporary Object handle.
+  PassiveObject& pobj_;            // Temporary PassiveObject handle.
+  Array& array_;                   // Temporary Array handle.
+  Field& field_;                   // Temporary Field handle.
+  String& str_;                    // Temporary String handle.
+  Library& library_;               // Temporary library handle.
+  AbstractType& type_;             // Temporary type handle.
   TypeArguments& type_arguments_;  // Temporary type argument handle.
   GrowableObjectArray& tokens_;    // Temporary tokens handle.
   TokenStream& stream_;            // Temporary token stream handle.
diff --git a/runtime/vm/stub_code.h b/runtime/vm/stub_code.h
index 01ffc6f..b6c1c3c 100644
--- a/runtime/vm/stub_code.h
+++ b/runtime/vm/stub_code.h
@@ -74,6 +74,7 @@
   V(TypeRefTypeTest)                                                           \
   V(UnreachableTypeTest)                                                       \
   V(SlowTypeTest)                                                              \
+  V(LazySpecializeTypeTest)                                                    \
   V(CallClosureNoSuchMethod)                                                   \
   V(FrameAwaitingMaterialization)                                              \
   V(AsynchronousGapMarker)
@@ -94,6 +95,7 @@
   V(TypeRefTypeTest)                                                           \
   V(UnreachableTypeTest)                                                       \
   V(SlowTypeTest)                                                              \
+  V(LazySpecializeTypeTest)                                                    \
   V(FrameAwaitingMaterialization)                                              \
   V(AsynchronousGapMarker)
 
@@ -215,6 +217,23 @@
 
 enum DeoptStubKind { kLazyDeoptFromReturn, kLazyDeoptFromThrow, kEagerDeopt };
 
+// Invocation mode for the TypeCheck runtime entry. It describes
+// where the runtime entry is being called from.
+enum TypeCheckMode {
+  // TypeCheck is invoked from the LazySpecializeTypeTest stub.
+  // It should replace the stub on the type with a specialized version.
+  kTypeCheckFromLazySpecializeStub,
+
+  // TypeCheck is invoked from the SlowTypeTest stub.
+  // This means that the cache can be lazily created (if needed)
+  // and dst_name can be fetched from the pool.
+  kTypeCheckFromSlowStub,
+
+  // TypeCheck is invoked from a normal inline AssertAssignable.
+  // Both the cache and dst_name must already be populated.
+  kTypeCheckFromInline
+};
+
 // Zap value used to indicate unused CODE_REG in deopt.
 static const uword kZapCodeReg = 0xf1f1f1f1;
 
diff --git a/runtime/vm/stub_code_arm.cc b/runtime/vm/stub_code_arm.cc
index a97ccc5..6aa0494 100644
--- a/runtime/vm/stub_code_arm.cc
+++ b/runtime/vm/stub_code_arm.cc
@@ -1952,11 +1952,54 @@
   __ Breakpoint();
 }
 
+static void InvokeTypeCheckFromTypeTestStub(Assembler* assembler,
+                                            TypeCheckMode mode) {
+  const Register kInstanceReg = R0;
+  const Register kInstantiatorTypeArgumentsReg = R2;
+  const Register kFunctionTypeArgumentsReg = R1;
+  const Register kDstTypeReg = R8;
+  const Register kSubtypeTestCacheReg = R3;
+
+  __ PushObject(Object::null_object());  // Make room for result.
+  __ Push(kInstanceReg);
+  __ Push(kDstTypeReg);
+  __ Push(kInstantiatorTypeArgumentsReg);
+  __ Push(kFunctionTypeArgumentsReg);
+  __ PushObject(Object::null_object());  // dst_name (passed as null)
+  __ Push(kSubtypeTestCacheReg);
+  __ PushObject(Smi::ZoneHandle(Smi::New(mode)));  // mode
+  __ CallRuntime(kTypeCheckRuntimeEntry, 7);
+  __ Drop(1);  // mode
+  __ Pop(kSubtypeTestCacheReg);
+  __ Drop(1);  // dst_name
+  __ Pop(kFunctionTypeArgumentsReg);
+  __ Pop(kInstantiatorTypeArgumentsReg);
+  __ Pop(kDstTypeReg);
+  __ Pop(kInstanceReg);
+  __ Drop(1);  // Discard return value.
+}
+
+void StubCode::GenerateLazySpecializeTypeTestStub(Assembler* assembler) {
+  const Register kInstanceReg = R0;
+  Label done;
+
+  __ CompareObject(kInstanceReg, Object::null_object());
+  __ BranchIf(EQUAL, &done);  // Fast case for 'null'.
+
+  __ ldr(CODE_REG,
+         Address(THR, Thread::lazy_specialize_type_test_stub_offset()));
+  __ EnterStubFrame();
+  InvokeTypeCheckFromTypeTestStub(assembler, kTypeCheckFromLazySpecializeStub);
+  __ LeaveStubFrame();
+
+  __ Bind(&done);
+  __ Ret();
+}
+
 void StubCode::GenerateSlowTypeTestStub(Assembler* assembler) {
   Label done, call_runtime;
 
   const Register kInstanceReg = R0;
-  const Register kInstantiatorTypeArgumentsReg = R2;
   const Register kFunctionTypeArgumentsReg = R1;
   const Register kDstTypeReg = R8;
   const Register kSubtypeTestCacheReg = R3;
@@ -2031,21 +2074,8 @@
   __ CompareObject(kDstTypeReg, Type::Handle(Type::ObjectType()));
   __ BranchIf(EQUAL, &done);
 
-  __ PushObject(Object::null_object());  // Make room for result.
-  __ Push(kInstanceReg);
-  __ Push(kDstTypeReg);
-  __ Push(kInstantiatorTypeArgumentsReg);
-  __ Push(kFunctionTypeArgumentsReg);
-  __ PushObject(Object::null_object());
-  __ Push(kSubtypeTestCacheReg);
-  __ CallRuntime(kTypeCheckRuntimeEntry, 6);
-  __ Pop(kSubtypeTestCacheReg);
-  __ Drop(1);  // dst_name
-  __ Pop(kFunctionTypeArgumentsReg);
-  __ Pop(kInstantiatorTypeArgumentsReg);
-  __ Pop(kDstTypeReg);
-  __ Pop(kInstanceReg);
-  __ Drop(1);  // Discard return value.
+  InvokeTypeCheckFromTypeTestStub(assembler, kTypeCheckFromSlowStub);
+
   __ Bind(&done);
   __ LeaveStubFrame();
   __ Ret();
diff --git a/runtime/vm/stub_code_arm64.cc b/runtime/vm/stub_code_arm64.cc
index df85dd5..5d07eb1 100644
--- a/runtime/vm/stub_code_arm64.cc
+++ b/runtime/vm/stub_code_arm64.cc
@@ -2007,12 +2007,56 @@
       kFunctionTypeArgumentsReg, kOwnTypeArgumentValue, check_failed);
 }
 
+static void InvokeTypeCheckFromTypeTestStub(Assembler* assembler,
+                                            TypeCheckMode mode) {
+  const Register kInstanceReg = R0;
+  const Register kInstantiatorTypeArgumentsReg = R1;
+  const Register kFunctionTypeArgumentsReg = R2;
+
+  const Register kSubtypeTestCacheReg = R3;
+  const Register kDstTypeReg = R8;
+
+  __ PushObject(Object::null_object());  // Make room for result.
+  __ Push(kInstanceReg);
+  __ Push(kDstTypeReg);
+  __ Push(kInstantiatorTypeArgumentsReg);
+  __ Push(kFunctionTypeArgumentsReg);
+  __ PushObject(Object::null_object());  // dst_name (passed as null)
+  __ Push(kSubtypeTestCacheReg);
+  __ PushObject(Smi::ZoneHandle(Smi::New(mode)));  // mode
+  __ CallRuntime(kTypeCheckRuntimeEntry, 7);
+  __ Drop(1);  // mode
+  __ Pop(kSubtypeTestCacheReg);
+  __ Drop(1);  // dst_name
+  __ Pop(kFunctionTypeArgumentsReg);
+  __ Pop(kInstantiatorTypeArgumentsReg);
+  __ Pop(kDstTypeReg);
+  __ Pop(kInstanceReg);
+  __ Drop(1);  // Discard return value.
+}
+
+void StubCode::GenerateLazySpecializeTypeTestStub(Assembler* assembler) {
+  const Register kInstanceReg = R0;
+  Label done;
+
+  __ CompareObject(kInstanceReg, Object::null_object());
+  __ BranchIf(EQUAL, &done);  // Fast case for 'null'.
+
+  __ ldr(CODE_REG,
+         Address(THR, Thread::lazy_specialize_type_test_stub_offset()));
+  __ EnterStubFrame();
+  InvokeTypeCheckFromTypeTestStub(assembler, kTypeCheckFromLazySpecializeStub);
+  __ LeaveStubFrame();
+
+  __ Bind(&done);
+  __ Ret();
+}
+
 void StubCode::GenerateSlowTypeTestStub(Assembler* assembler) {
   Label done, call_runtime;
 
   const Register kInstanceReg = R0;
   const Register kInstantiatorTypeArgumentsReg = R1;
-  const Register kFunctionTypeArgumentsReg = R2;
 
   const Register kSubtypeTestCacheReg = R3;
   const Register kDstTypeReg = R8;
@@ -2083,21 +2127,8 @@
   __ CompareObject(kDstTypeReg, Type::Handle(Type::ObjectType()));
   __ BranchIf(EQUAL, &done);
 
-  __ PushObject(Object::null_object());  // Make room for result.
-  __ Push(kInstanceReg);
-  __ Push(kDstTypeReg);
-  __ Push(kInstantiatorTypeArgumentsReg);
-  __ Push(kFunctionTypeArgumentsReg);
-  __ PushObject(Object::null_object());
-  __ Push(kSubtypeTestCacheReg);
-  __ CallRuntime(kTypeCheckRuntimeEntry, 6);
-  __ Pop(kSubtypeTestCacheReg);
-  __ Drop(1);  // dst_name
-  __ Pop(kFunctionTypeArgumentsReg);
-  __ Pop(kInstantiatorTypeArgumentsReg);
-  __ Pop(kDstTypeReg);
-  __ Pop(kInstanceReg);
-  __ Drop(1);  // Discard return value.
+  InvokeTypeCheckFromTypeTestStub(assembler, kTypeCheckFromSlowStub);
+
   __ Bind(&done);
   __ LeaveStubFrame();
   __ Ret();
diff --git a/runtime/vm/stub_code_dbc.cc b/runtime/vm/stub_code_dbc.cc
index f78a5d1..130e4f8 100644
--- a/runtime/vm/stub_code_dbc.cc
+++ b/runtime/vm/stub_code_dbc.cc
@@ -104,6 +104,11 @@
 }
 
 // TODO(kustermann): Don't generate this stub.
+void StubCode::GenerateLazySpecializeTypeTestStub(Assembler* assembler) {
+  __ Trap();
+}
+
+// TODO(kustermann): Don't generate this stub.
 void StubCode::GenerateSlowTypeTestStub(Assembler* assembler) {
   __ Trap();
 }
diff --git a/runtime/vm/stub_code_ia32.cc b/runtime/vm/stub_code_ia32.cc
index feb5068..ae5b070 100644
--- a/runtime/vm/stub_code_ia32.cc
+++ b/runtime/vm/stub_code_ia32.cc
@@ -1788,27 +1788,32 @@
 }
 
 void StubCode::GenerateDefaultTypeTestStub(Assembler* assembler) {
-  // Only used in AOT and therefore not on ia32.
+  // Not implemented on ia32.
   __ Breakpoint();
 }
 
 void StubCode::GenerateTopTypeTypeTestStub(Assembler* assembler) {
-  // Only used in AOT and therefore not on ia32.
+  // Not implemented on ia32.
   __ Breakpoint();
 }
 
 void StubCode::GenerateTypeRefTypeTestStub(Assembler* assembler) {
-  // Only used in AOT and therefore not on ia32.
+  // Not implemented on ia32.
   __ Breakpoint();
 }
 
 void StubCode::GenerateUnreachableTypeTestStub(Assembler* assembler) {
-  // Only used in AOT and therefore not on ia32.
+  // Not implemented on ia32.
+  __ Breakpoint();
+}
+
+void StubCode::GenerateLazySpecializeTypeTestStub(Assembler* assembler) {
+  // Not implemented on ia32.
   __ Breakpoint();
 }
 
 void StubCode::GenerateSlowTypeTestStub(Assembler* assembler) {
-  // Only used in AOT and therefore not on ia32.
+  // Not implemented on ia32.
   __ Breakpoint();
 }
 
diff --git a/runtime/vm/stub_code_x64.cc b/runtime/vm/stub_code_x64.cc
index d87f045..c5d5151 100644
--- a/runtime/vm/stub_code_x64.cc
+++ b/runtime/vm/stub_code_x64.cc
@@ -2121,12 +2121,56 @@
       kFunctionTypeArgumentsReg, kOwnTypeArgumentValue, check_failed);
 }
 
+static void InvokeTypeCheckFromTypeTestStub(Assembler* assembler,
+                                            TypeCheckMode mode) {
+  const Register kInstanceReg = RAX;
+  const Register kInstantiatorTypeArgumentsReg = RDX;
+  const Register kFunctionTypeArgumentsReg = RCX;
+  const Register kDstTypeReg = RBX;
+  const Register kSubtypeTestCacheReg = R9;
+
+  __ PushObject(Object::null_object());  // Make room for result.
+  __ pushq(kInstanceReg);
+  __ pushq(kDstTypeReg);
+  __ pushq(kInstantiatorTypeArgumentsReg);
+  __ pushq(kFunctionTypeArgumentsReg);
+  __ PushObject(Object::null_object());  // dst_name (passed as null)
+  __ pushq(kSubtypeTestCacheReg);
+  __ PushObject(Smi::ZoneHandle(Smi::New(mode)));  // mode
+  __ CallRuntime(kTypeCheckRuntimeEntry, 7);
+  __ Drop(1);  // mode
+  __ popq(kSubtypeTestCacheReg);
+  __ Drop(1);  // dst_name
+  __ popq(kFunctionTypeArgumentsReg);
+  __ popq(kInstantiatorTypeArgumentsReg);
+  __ popq(kDstTypeReg);
+  __ popq(kInstanceReg);
+  __ Drop(1);  // Discard return value.
+}
+
+void StubCode::GenerateLazySpecializeTypeTestStub(Assembler* assembler) {
+  const Register kInstanceReg = RAX;
+
+  Label done;
+
+  // Fast case for 'null'.
+  __ CompareObject(kInstanceReg, Object::null_object());
+  __ BranchIf(EQUAL, &done);
+
+  __ movq(CODE_REG,
+          Address(THR, Thread::lazy_specialize_type_test_stub_offset()));
+  __ EnterStubFrame();
+  InvokeTypeCheckFromTypeTestStub(assembler, kTypeCheckFromLazySpecializeStub);
+  __ LeaveStubFrame();
+
+  __ Bind(&done);
+  __ Ret();
+}
+
 void StubCode::GenerateSlowTypeTestStub(Assembler* assembler) {
   Label done, call_runtime;
 
   const Register kInstanceReg = RAX;
-  const Register kInstantiatorTypeArgumentsReg = RDX;
-  const Register kFunctionTypeArgumentsReg = RCX;
   const Register kDstTypeReg = RBX;
   const Register kSubtypeTestCacheReg = R9;
 
@@ -2191,21 +2235,8 @@
   __ CompareObject(kDstTypeReg, Type::Handle(Type::ObjectType()));
   __ BranchIf(EQUAL, &done);
 
-  __ PushObject(Object::null_object());  // Make room for result.
-  __ pushq(kInstanceReg);
-  __ pushq(kDstTypeReg);
-  __ pushq(kInstantiatorTypeArgumentsReg);
-  __ pushq(kFunctionTypeArgumentsReg);
-  __ PushObject(Object::null_object());
-  __ pushq(kSubtypeTestCacheReg);
-  __ CallRuntime(kTypeCheckRuntimeEntry, 6);
-  __ popq(kSubtypeTestCacheReg);
-  __ Drop(1);
-  __ popq(kFunctionTypeArgumentsReg);
-  __ popq(kInstantiatorTypeArgumentsReg);
-  __ popq(kDstTypeReg);
-  __ popq(kInstanceReg);
-  __ Drop(1);  // Discard return value.
+  InvokeTypeCheckFromTypeTestStub(assembler, kTypeCheckFromSlowStub);
+
   __ Bind(&done);
   __ LeaveStubFrame();
   __ Ret();
diff --git a/runtime/vm/thread.h b/runtime/vm/thread.h
index 802233a..4f176a1 100644
--- a/runtime/vm/thread.h
+++ b/runtime/vm/thread.h
@@ -102,7 +102,9 @@
   V(RawCode*, lazy_deopt_from_throw_stub_,                                     \
     StubCode::DeoptimizeLazyFromThrow_entry()->code(), NULL)                   \
   V(RawCode*, slow_type_test_stub_, StubCode::SlowTypeTest_entry()->code(),    \
-    NULL)
+    NULL)                                                                      \
+  V(RawCode*, lazy_specialize_type_test_stub_,                                 \
+    StubCode::LazySpecializeTypeTest_entry()->code(), NULL)
 
 #endif
 
diff --git a/runtime/vm/type_testing_stubs.cc b/runtime/vm/type_testing_stubs.cc
index 2a3b85c..3ec5386 100644
--- a/runtime/vm/type_testing_stubs.cc
+++ b/runtime/vm/type_testing_stubs.cc
@@ -92,7 +92,8 @@
 }
 
 RawInstructions* TypeTestingStubGenerator::DefaultCodeForType(
-    const AbstractType& type) {
+    const AbstractType& type,
+    bool lazy_specialize /* = true */) {
   // During bootstrapping we have no access to stubs yet, so we'll just return
   // `null` and patch these later in `Object::FinishInitOnce()`.
   if (!StubCode::HasBeenInitialized()) {
@@ -112,13 +113,27 @@
   }
 
   if (type.IsType() || type.IsTypeParameter()) {
-    return Code::InstructionsOf(StubCode::DefaultTypeTest_entry()->code());
+    const bool should_specialize = !FLAG_precompiled_mode && lazy_specialize;
+    return Code::InstructionsOf(
+        should_specialize ? StubCode::LazySpecializeTypeTest_entry()->code()
+                          : StubCode::DefaultTypeTest_entry()->code());
   } else {
     ASSERT(type.IsBoundedType() || type.IsMixinAppType());
     return Code::InstructionsOf(StubCode::UnreachableTypeTest_entry()->code());
   }
 }
 
+#if !defined(DART_PRECOMPILED_RUNTIME)
+void TypeTestingStubGenerator::SpecializeStubFor(Thread* thread,
+                                                 const AbstractType& type) {
+  HierarchyInfo hi(thread);
+  TypeTestingStubGenerator generator;
+  const Instructions& instr = Instructions::Handle(
+      thread->zone(), generator.OptimizedCodeForType(type));
+  type.SetTypeTestingStub(instr);  // Install the optimized stub on the type.
+}
+#endif  // !defined(DART_PRECOMPILED_RUNTIME)
+
 TypeTestingStubGenerator::TypeTestingStubGenerator()
     : object_store_(Isolate::Current()->object_store()),
       array_(GrowableObjectArray::Handle()),
@@ -167,7 +182,7 @@
     }
   }
 #endif  // !defined(TARGET_ARCH_DBC) && !defined(TARGET_ARCH_IA32)
-  return TypeTestingStubGenerator::DefaultCodeForType(type);
+  return TypeTestingStubGenerator::DefaultCodeForType(type, false);
 }
 
 TypeTestingStubFinder::TypeTestingStubFinder()
@@ -189,6 +204,10 @@
   if (entry_point == code_.UncheckedEntryPoint()) {
     return code_.instructions();
   }
+  code_ = StubCode::LazySpecializeTypeTest_entry()->code();
+  if (entry_point == code_.UncheckedEntryPoint()) {
+    return code_.instructions();
+  }
   code_ = StubCode::TopTypeTypeTest_entry()->code();
   if (entry_point == code_.UncheckedEntryPoint()) {
     return code_.instructions();
@@ -213,6 +232,10 @@
   if (entry_point == code_.UncheckedEntryPoint()) {
     return "TypeTestingStub_Default";
   }
+  code_ = StubCode::LazySpecializeTypeTest_entry()->code();
+  if (entry_point == code_.UncheckedEntryPoint()) {
+    return "TypeTestingStub_LazySpecialize";
+  }
   code_ = StubCode::TopTypeTypeTest_entry()->code();
   if (entry_point == code_.UncheckedEntryPoint()) {
     return "TypeTestingStub_Top";
@@ -331,7 +354,7 @@
 #ifndef PRODUCT
   if (FLAG_support_disassembler && FLAG_disassemble_stubs) {
     LogBlock lb;
-    THR_Print("Code for stub '%s': {\n", name);
+    THR_Print("Code for stub '%s' (type = %s): {\n", name, type.ToCString());
     DisassembleToStdout formatter;
     code.Disassemble(&formatter);
     THR_Print("}\n");
diff --git a/runtime/vm/type_testing_stubs.h b/runtime/vm/type_testing_stubs.h
index d03b48a..ce589c8 100644
--- a/runtime/vm/type_testing_stubs.h
+++ b/runtime/vm/type_testing_stubs.h
@@ -36,7 +36,12 @@
   // During bootstrapping it will return `null` for a whitelisted set of types,
   // otherwise it will return a default stub which tail-calls
   // subtypingtest/runtime code.
-  static RawInstructions* DefaultCodeForType(const AbstractType& type);
+  static RawInstructions* DefaultCodeForType(const AbstractType& type,
+                                             bool lazy_specialize = true);
+
+#if !defined(DART_PRECOMPILED_RUNTIME)
+  static void SpecializeStubFor(Thread* thread, const AbstractType& type);
+#endif  // !defined(DART_PRECOMPILED_RUNTIME)
 
   TypeTestingStubGenerator();