[vm/compiler] Allow WB elimination for small arrays.

Previously all arrays were excluded from WB elimination pass
to avoid invariant restoration code creating excessive work
for the GC and to avoid dealing with card marking in the
invariant restoration code.

It seems reasonable to enable this for small arrays of up to 8
elements. The cut of point of 8 elements was chosen based on the
cut of point for literal<N> specialisations provided by the
core library for creating small literal arrays.

TEST=vm/cc/IRTest_WriteBarrierElimination_Arrays

Cq-Include-Trybots: luci.dart.try:vm-kernel-precomp-dwarf-linux-product-x64-try,vm-kernel-precomp-linux-debug-simarm_x64-try,vm-kernel-precomp-linux-debug-x64-try,vm-kernel-precomp-linux-debug-x64c-try,vm-kernel-precomp-linux-product-x64-try,vm-kernel-precomp-linux-release-simarm-try,vm-kernel-precomp-linux-release-simarm64-try,vm-kernel-precomp-linux-release-simarm_x64-try,vm-kernel-precomp-linux-release-x64-try
Change-Id: I2b3169865f07c3ff95820c1bc6718943e96bd33b
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/229903
Reviewed-by: Alexander Markov <alexmarkov@google.com>
Reviewed-by: Ryan Macnak <rmacnak@google.com>
Commit-Queue: Slava Egorov <vegorov@google.com>
diff --git a/runtime/platform/utils.h b/runtime/platform/utils.h
index 0e7a462..c22fcc0 100644
--- a/runtime/platform/utils.h
+++ b/runtime/platform/utils.h
@@ -84,7 +84,7 @@
   }
 
   template <typename T>
-  static inline T RoundDown(T x, intptr_t n) {
+  static constexpr inline T RoundDown(T x, intptr_t n) {
     ASSERT(IsPowerOfTwo(n));
     return (x & -n);
   }
@@ -95,7 +95,7 @@
   }
 
   template <typename T>
-  static inline T RoundUp(T x, intptr_t n) {
+  static constexpr inline T RoundUp(T x, intptr_t n) {
     return RoundDown(x + n - 1, n);
   }
 
diff --git a/runtime/vm/compiler/write_barrier_elimination.cc b/runtime/vm/compiler/write_barrier_elimination.cc
index 855cffb..86f58bd 100644
--- a/runtime/vm/compiler/write_barrier_elimination.cc
+++ b/runtime/vm/compiler/write_barrier_elimination.cc
@@ -114,7 +114,7 @@
 
   // Bitvector with all non-Array-allocation instructions set. Used to
   // un-mark Array allocations as usable.
-  BitVector* array_allocations_mask_;
+  BitVector* large_array_allocations_mask_;
 
   // Bitvectors for each block of which allocations are new or remembered
   // at the start (after Phis).
@@ -189,8 +189,21 @@
   }
 }
 
+static bool IsCreateLargeArray(Definition* defn) {
+  if (auto create_array = defn->AsCreateArray()) {
+    static_assert(!Array::UseCardMarkingForAllocation(
+                      Array::kMaxLengthForWriteBarrierElimination),
+                  "Invariant restoration code does not handle card marking.");
+    // Note: IsUsable would reject CreateArray instructions with non-constant
+    // number of elements.
+    return create_array->GetConstantNumElements() >
+           Array::kMaxLengthForWriteBarrierElimination;
+  }
+  return false;
+}
+
 void WriteBarrierElimination::IndexDefinitions(Zone* zone) {
-  BitmapBuilder array_allocations;
+  BitmapBuilder large_array_allocations;
 
   GrowableArray<Definition*> create_array_worklist;
 
@@ -198,7 +211,7 @@
     BlockEntryInstr* const block = block_order_->At(i);
     if (auto join_block = block->AsJoinEntry()) {
       for (PhiIterator it(join_block); !it.Done(); it.Advance()) {
-        array_allocations.Set(definition_count_, false);
+        large_array_allocations.Set(definition_count_, false);
         definition_indices_.Insert({it.Current(), definition_count_++});
 #if defined(DEBUG)
         if (tracing_) {
@@ -211,10 +224,10 @@
     for (ForwardInstructionIterator it(block); !it.Done(); it.Advance()) {
       if (Definition* current = it.Current()->AsDefinition()) {
         if (IsUsable(current)) {
-          const bool is_create_array = current->IsCreateArray();
-          array_allocations.Set(definition_count_, is_create_array);
+          const bool is_create_large_array = IsCreateLargeArray(current);
+          large_array_allocations.Set(definition_count_, is_create_large_array);
           definition_indices_.Insert({current, definition_count_++});
-          if (is_create_array) {
+          if (is_create_large_array) {
             create_array_worklist.Add(current);
           }
 #if defined(DEBUG)
@@ -234,8 +247,9 @@
          it.Advance()) {
       if (auto phi_use = it.Current()->instruction()->AsPhi()) {
         const intptr_t index = Index(phi_use);
-        if (!array_allocations.Get(index)) {
-          array_allocations.Set(index, /*can_be_create_array=*/true);
+        if (!large_array_allocations.Get(index)) {
+          large_array_allocations.Set(index,
+                                      /*can_be_create_large_array=*/true);
           create_array_worklist.Add(phi_use);
         }
       }
@@ -244,9 +258,9 @@
 
   vector_ = new (zone) BitVector(zone, definition_count_);
   vector_->SetAll();
-  array_allocations_mask_ = new (zone) BitVector(zone, definition_count_);
+  large_array_allocations_mask_ = new (zone) BitVector(zone, definition_count_);
   for (intptr_t i = 0; i < definition_count_; ++i) {
-    if (!array_allocations.Get(i)) array_allocations_mask_->Add(i);
+    if (!large_array_allocations.Get(i)) large_array_allocations_mask_->Add(i);
   }
 }
 
@@ -388,9 +402,9 @@
     if (current->CanCallDart()) {
       vector_->Clear();
     } else if (current->CanTriggerGC()) {
-      // Clear array allocations. These are not added to the remembered set
-      // by Thread::RememberLiveTemporaries() after a scavenge.
-      vector_->Intersect(array_allocations_mask_);
+      // Clear large array allocations. These are not added to the remembered
+      // set by Thread::RememberLiveTemporaries() after a scavenge.
+      vector_->Intersect(large_array_allocations_mask_);
     }
 
     if (AllocationInstr* const alloc = current->AsAllocation()) {
diff --git a/runtime/vm/compiler/write_barrier_elimination_test.cc b/runtime/vm/compiler/write_barrier_elimination_test.cc
index 1275b14..284d112 100644
--- a/runtime/vm/compiler/write_barrier_elimination_test.cc
+++ b/runtime/vm/compiler/write_barrier_elimination_test.cc
@@ -138,14 +138,14 @@
   EXPECT(store->ShouldEmitStoreBarrier() == true);
 }
 
-ISOLATE_UNIT_TEST_CASE(IRTest_WriteBarrierElimination_Arrays) {
+static void TestWBEForArrays(int length) {
   DEBUG_ONLY(
       SetFlagScope<bool> sfs(&FLAG_trace_write_barrier_elimination, true));
   const char* nullable_tag = TestCase::NullableTag();
 
-  // Test that array allocations are not considered usable after a
-  // may-trigger-GC instruction (in this case CheckStackOverflow), unlike
-  // normal allocations, which are only interruped by a Dart call.
+  // Test that array allocations are considered usable after a
+  // may-trigger-GC instruction (in this case CheckStackOverflow) iff they
+  // are small.
   // clang-format off
   auto kScript =
       Utils::CStringUniquePtr(OS::SCreate(nullptr, R"(
@@ -159,7 +159,8 @@
       foo(int x) {
         C c = C();
         C n = C();
-        List<C%s> array = List<C%s>.filled(1, null);
+        List<C%s> array = List<C%s>.filled(%d, null);
+        array[0] = c;
         while (x --> 0) {
           c.next = n;
           n = c;
@@ -170,10 +171,15 @@
       }
 
       main() { foo(10); }
-      )", TestCase::LateTag(), nullable_tag, nullable_tag), std::free);
+      )", TestCase::LateTag(), nullable_tag, nullable_tag, length), std::free);
   // clang-format on
 
-  const auto& root_library = Library::Handle(LoadTestScript(kScript.get()));
+  // Generate a length dependent test library uri.
+  char lib_uri[256];
+  snprintf(lib_uri, sizeof(lib_uri), "%s%d", RESOLVED_USER_TEST_URI, length);
+
+  const auto& root_library = Library::Handle(
+      LoadTestScript(kScript.get(), /*resolver=*/nullptr, lib_uri));
 
   Invoke(root_library, "main");
 
@@ -185,11 +191,14 @@
   EXPECT(entry != nullptr);
 
   StoreInstanceFieldInstr* store_into_c = nullptr;
-  StoreIndexedInstr* store_into_array = nullptr;
+  StoreIndexedInstr* store_into_array_before_loop = nullptr;
+  StoreIndexedInstr* store_into_array_after_loop = nullptr;
 
   ILMatcher cursor(flow_graph, entry);
   RELEASE_ASSERT(cursor.TryMatch({
       kMoveGlob,
+      {kMatchAndMoveStoreIndexed, &store_into_array_before_loop},
+      kMoveGlob,
       kMatchAndMoveGoto,
       kMoveGlob,
       kMatchAndMoveBranchTrue,
@@ -200,11 +209,19 @@
       kMoveGlob,
       kMatchAndMoveBranchFalse,
       kMoveGlob,
-      {kMatchAndMoveStoreIndexed, &store_into_array},
+      {kMatchAndMoveStoreIndexed, &store_into_array_after_loop},
   }));
 
   EXPECT(store_into_c->ShouldEmitStoreBarrier() == false);
-  EXPECT(store_into_array->ShouldEmitStoreBarrier() == true);
+  EXPECT(store_into_array_before_loop->ShouldEmitStoreBarrier() == false);
+  EXPECT(store_into_array_after_loop->ShouldEmitStoreBarrier() ==
+         (length > Array::kMaxLengthForWriteBarrierElimination));
+}
+
+ISOLATE_UNIT_TEST_CASE(IRTest_WriteBarrierElimination_Arrays) {
+  TestWBEForArrays(1);
+  TestWBEForArrays(Array::kMaxLengthForWriteBarrierElimination);
+  TestWBEForArrays(Array::kMaxLengthForWriteBarrierElimination + 1);
 }
 
 ISOLATE_UNIT_TEST_CASE(IRTest_WriteBarrierElimination_Regress43786) {
diff --git a/runtime/vm/object.h b/runtime/vm/object.h
index b8d8a8a..8497df0 100644
--- a/runtime/vm/object.h
+++ b/runtime/vm/object.h
@@ -668,7 +668,7 @@
                             Heap::Space space,
                             bool compressed);
 
-  static intptr_t RoundedAllocationSize(intptr_t size) {
+  static constexpr intptr_t RoundedAllocationSize(intptr_t size) {
     return Utils::RoundUp(size, kObjectAlignment);
   }
 
@@ -10228,10 +10228,17 @@
 class Array : public Instance {
  public:
   // Returns `true` if we use card marking for arrays of length [array_length].
-  static bool UseCardMarkingForAllocation(const intptr_t array_length) {
+  static constexpr bool UseCardMarkingForAllocation(
+      const intptr_t array_length) {
     return Array::InstanceSize(array_length) > Heap::kNewAllocatableSize;
   }
 
+  // WB invariant restoration code only applies to arrives which have at most
+  // this many elements. Consequently WB elimination code should not eliminate
+  // WB on arrays of larger lengths across instructions that can cause GC.
+  // Note: we also can't restore WB invariant for arrays which use card marking.
+  static constexpr intptr_t kMaxLengthForWriteBarrierElimination = 8;
+
   intptr_t Length() const { return LengthOf(ptr()); }
   static intptr_t LengthOf(const ArrayPtr array) {
     return Smi::Value(array->untag()->length());
@@ -10327,7 +10334,7 @@
     return OFFSET_OF(UntaggedArray, type_arguments_);
   }
 
-  static bool IsValidLength(intptr_t len) {
+  static constexpr bool IsValidLength(intptr_t len) {
     return 0 <= len && len <= kMaxElements;
   }
 
@@ -10337,7 +10344,7 @@
     return 0;
   }
 
-  static intptr_t InstanceSize(intptr_t len) {
+  static constexpr intptr_t InstanceSize(intptr_t len) {
     // Ensure that variable length data is not adding to the object length.
     ASSERT(sizeof(UntaggedArray) ==
            (sizeof(UntaggedInstance) + (2 * kBytesPerElement)));
diff --git a/runtime/vm/thread.cc b/runtime/vm/thread.cc
index 584cb9c..e21a66c 100644
--- a/runtime/vm/thread.cc
+++ b/runtime/vm/thread.cc
@@ -657,10 +657,15 @@
       // Stores into new-space objects don't need a write barrier.
       if (obj->IsSmiOrNewObject()) continue;
 
-      // To avoid adding too much work into the remembered set, skip
+      // To avoid adding too much work into the remembered set, skip large
       // arrays. Write barrier elimination will not remove the barrier
       // if we can trigger GC between array allocation and store.
-      if (obj->GetClassId() == kArrayCid) continue;
+      if (obj->GetClassId() == kArrayCid) {
+        const auto length = Smi::Value(Array::RawCast(obj)->untag()->length());
+        if (length > Array::kMaxLengthForWriteBarrierElimination) {
+          continue;
+        }
+      }
 
       // Dart code won't store into VM-internal objects except Contexts and
       // UnhandledExceptions. This assumption is checked by an assertion in