[vm] Weaken CAS to RMW when accessing the remembered and mark bits.
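
The core change is replacing the software CAS retry loop on the header tag
word with a single relaxed fetch-or / fetch-and. A minimal before/after
sketch using the GCC/Clang builtins that the new AtomicOperations wrappers
delegate to (the bit position and function names here are illustrative,
not the VM's):

  #include <stdint.h>

  static const uint32_t kBitMask = 1u << 1;  // stand-in for the remembered/mark bit

  // Before: optimistic CAS loop, retried in software until the update lands.
  uint32_t SetBitWithCas(uint32_t* tags) {
    uint32_t seen = *tags;
    uint32_t old_tags;
    do {
      old_tags = seen;
      seen = __sync_val_compare_and_swap(tags, old_tags, old_tags | kBitMask);
    } while (seen != old_tags);
    return old_tags;
  }

  // After: one relaxed atomic read-modify-write; the bit flip needs atomicity
  // but no ordering, so __ATOMIC_RELAXED is enough.
  uint32_t SetBitWithRmw(uint32_t* tags) {
    return __atomic_fetch_or(tags, kBitMask, __ATOMIC_RELAXED);
  }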

Tighten some sequences in the write barrier stub.
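
In the stubs this is mostly reordering: the remembered-bit test and the
fast-path return now happen before any scratch registers are spilled, and
the bit is set with one locked OR (or an ldrex/strex / ldxr/stxr loop on
ARM) instead of a cmpxchg retry loop. Roughly, in C++ terms (the header
layout and slow-path helper below are placeholders, not real VM entry
points):

  #include <stdint.h>

  struct FakeHeader { uint32_t tags_; };       // placeholder object header
  void AddToStoreBufferSlowPath(FakeHeader*);  // placeholder runtime call

  void UpdateStoreBuffer(FakeHeader* obj) {
    static const uint32_t kRememberedMask = 1u << 1;  // illustrative bit position
    // Fast path: a plain load and test; nothing saved, nothing atomic.
    if ((obj->tags_ & kRememberedMask) != 0) return;
    // Slow path only: mark the object remembered with a single atomic OR,
    // then record it in the thread's store buffer block.
    __atomic_fetch_or(&obj->tags_, kRememberedMask, __ATOMIC_RELAXED);
    AddToStoreBufferSlowPath(obj);
  }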

Change-Id: Ib3657b9b582082137d17e86135200444172f428a
Reviewed-on: https://dart-review.googlesource.com/60820
Commit-Queue: Ryan Macnak <rmacnak@google.com>
Reviewed-by: Siva Annamalai <asiva@google.com>
diff --git a/runtime/platform/atomic.h b/runtime/platform/atomic.h
index 4a468e5..37836e0 100644
--- a/runtime/platform/atomic.h
+++ b/runtime/platform/atomic.h
@@ -30,6 +30,10 @@
   // Atomically decrement the value at p by 'value'.
   static void DecrementBy(intptr_t* p, intptr_t value);
 
+  // Atomically perform { tmp = *ptr; *ptr = (tmp OP value); return tmp; }.
+  static uint32_t FetchOrRelaxedUint32(uint32_t* ptr, uint32_t value);
+  static uint32_t FetchAndRelaxedUint32(uint32_t* ptr, uint32_t value);
+
   // Atomically compare *ptr to old_value, and if equal, store new_value.
   // Returns the original value at ptr.
   static uword CompareAndSwapWord(uword* ptr, uword old_value, uword new_value);
diff --git a/runtime/platform/atomic_android.h b/runtime/platform/atomic_android.h
index 42dda56..f95ba02 100644
--- a/runtime/platform/atomic_android.h
+++ b/runtime/platform/atomic_android.h
@@ -46,6 +46,16 @@
   __sync_fetch_and_sub(p, value);
 }
 
+inline uint32_t AtomicOperations::FetchOrRelaxedUint32(uint32_t* ptr,
+                                                       uint32_t value) {
+  return __atomic_fetch_or(ptr, value, __ATOMIC_RELAXED);
+}
+
+inline uint32_t AtomicOperations::FetchAndRelaxedUint32(uint32_t* ptr,
+                                                        uint32_t value) {
+  return __atomic_fetch_and(ptr, value, __ATOMIC_RELAXED);
+}
+
 inline uword AtomicOperations::CompareAndSwapWord(uword* ptr,
                                                   uword old_value,
                                                   uword new_value) {
diff --git a/runtime/platform/atomic_fuchsia.h b/runtime/platform/atomic_fuchsia.h
index 5434fb7..2883e1e 100644
--- a/runtime/platform/atomic_fuchsia.h
+++ b/runtime/platform/atomic_fuchsia.h
@@ -43,6 +43,16 @@
   __sync_fetch_and_sub(p, value);
 }
 
+inline uint32_t AtomicOperations::FetchOrRelaxedUint32(uint32_t* ptr,
+                                                       uint32_t value) {
+  return __atomic_fetch_or(ptr, value, __ATOMIC_RELAXED);
+}
+
+inline uint32_t AtomicOperations::FetchAndRelaxedUint32(uint32_t* ptr,
+                                                        uint32_t value) {
+  return __atomic_fetch_and(ptr, value, __ATOMIC_RELAXED);
+}
+
 inline uword AtomicOperations::CompareAndSwapWord(uword* ptr,
                                                   uword old_value,
                                                   uword new_value) {
diff --git a/runtime/platform/atomic_linux.h b/runtime/platform/atomic_linux.h
index 3db8d73..fd1773f 100644
--- a/runtime/platform/atomic_linux.h
+++ b/runtime/platform/atomic_linux.h
@@ -46,6 +46,16 @@
   __sync_fetch_and_sub(p, value);
 }
 
+inline uint32_t AtomicOperations::FetchOrRelaxedUint32(uint32_t* ptr,
+                                                       uint32_t value) {
+  return __atomic_fetch_or(ptr, value, __ATOMIC_RELAXED);
+}
+
+inline uint32_t AtomicOperations::FetchAndRelaxedUint32(uint32_t* ptr,
+                                                        uint32_t value) {
+  return __atomic_fetch_and(ptr, value, __ATOMIC_RELAXED);
+}
+
 inline uword AtomicOperations::CompareAndSwapWord(uword* ptr,
                                                   uword old_value,
                                                   uword new_value) {
diff --git a/runtime/platform/atomic_macos.h b/runtime/platform/atomic_macos.h
index b08ba4f..b0bd31f 100644
--- a/runtime/platform/atomic_macos.h
+++ b/runtime/platform/atomic_macos.h
@@ -46,6 +46,16 @@
   __sync_fetch_and_sub(p, value);
 }
 
+inline uint32_t AtomicOperations::FetchOrRelaxedUint32(uint32_t* ptr,
+                                                       uint32_t value) {
+  return __atomic_fetch_or(ptr, value, __ATOMIC_RELAXED);
+}
+
+inline uint32_t AtomicOperations::FetchAndRelaxedUint32(uint32_t* ptr,
+                                                        uint32_t value) {
+  return __atomic_fetch_and(ptr, value, __ATOMIC_RELAXED);
+}
+
 inline uword AtomicOperations::CompareAndSwapWord(uword* ptr,
                                                   uword old_value,
                                                   uword new_value) {
diff --git a/runtime/platform/atomic_win.h b/runtime/platform/atomic_win.h
index f7fc322..5e6db1b 100644
--- a/runtime/platform/atomic_win.h
+++ b/runtime/platform/atomic_win.h
@@ -102,6 +102,18 @@
 #endif
 }
 
+inline uint32_t AtomicOperations::FetchOrRelaxedUint32(uint32_t* ptr,
+                                                       uint32_t value) {
+  return static_cast<uint32_t>(InterlockedOrNoFence(
+      reinterpret_cast<LONG*>(ptr), static_cast<LONG>(value)));
+}
+
+inline uint32_t AtomicOperations::FetchAndRelaxedUint32(uint32_t* ptr,
+                                                        uint32_t value) {
+  return static_cast<uint32_t>(InterlockedAndNoFence(
+      reinterpret_cast<LONG*>(ptr), static_cast<LONG>(value)));
+}
+
 inline uword AtomicOperations::CompareAndSwapWord(uword* ptr,
                                                   uword old_value,
                                                   uword new_value) {
diff --git a/runtime/vm/atomic_test.cc b/runtime/vm/atomic_test.cc
index c9afc97..5f1d565 100644
--- a/runtime/vm/atomic_test.cc
+++ b/runtime/vm/atomic_test.cc
@@ -50,6 +50,20 @@
   EXPECT_EQ(static_cast<intptr_t>(1), v);
 }
 
+VM_UNIT_TEST_CASE(FetchOrRelaxed) {
+  uint32_t v = 42;
+  uint32_t previous = AtomicOperations::FetchOrRelaxedUint32(&v, 3);
+  EXPECT_EQ(static_cast<uint32_t>(42), previous);
+  EXPECT_EQ(static_cast<uint32_t>(43), v);
+}
+
+VM_UNIT_TEST_CASE(FetchAndRelaxed) {
+  uint32_t v = 42;
+  uint32_t previous = AtomicOperations::FetchAndRelaxedUint32(&v, 3);
+  EXPECT_EQ(static_cast<uint32_t>(42), previous);
+  EXPECT_EQ(static_cast<uint32_t>(2), v);
+}
+
 VM_UNIT_TEST_CASE(LoadRelaxed) {
   uword v = 42;
   EXPECT_EQ(static_cast<uword>(42), AtomicOperations::LoadRelaxed(&v));
diff --git a/runtime/vm/raw_object.h b/runtime/vm/raw_object.h
index d8b7048..bf7d68f 100644
--- a/runtime/vm/raw_object.h
+++ b/runtime/vm/raw_object.h
@@ -623,28 +623,20 @@
 
   template <class TagBitField>
   void UpdateTagBit(bool value) {
-    uint32_t tags = ptr()->tags_;
-    uint32_t old_tags;
-    do {
-      old_tags = tags;
-      uint32_t new_tags = TagBitField::update(value, old_tags);
-      tags = AtomicOperations::CompareAndSwapUint32(&ptr()->tags_, old_tags,
-                                                    new_tags);
-    } while (tags != old_tags);
+    if (value) {
+      AtomicOperations::FetchOrRelaxedUint32(&ptr()->tags_,
+                                             TagBitField::encode(true));
+    } else {
+      AtomicOperations::FetchAndRelaxedUint32(&ptr()->tags_,
+                                              ~TagBitField::encode(true));
+    }
   }
 
   template <class TagBitField>
   bool TryAcquireTagBit() {
-    uint32_t tags = ptr()->tags_;
-    uint32_t old_tags;
-    do {
-      old_tags = tags;
-      if (TagBitField::decode(tags)) return false;
-      uint32_t new_tags = TagBitField::update(true, old_tags);
-      tags = AtomicOperations::CompareAndSwapUint32(&ptr()->tags_, old_tags,
-                                                    new_tags);
-    } while (tags != old_tags);
-    return true;
+    uint32_t old_tags = AtomicOperations::FetchOrRelaxedUint32(
+        &ptr()->tags_, TagBitField::encode(true));
+    return !TagBitField::decode(old_tags);
   }
 
   // All writes to heap objects should ultimately pass through one of the
diff --git a/runtime/vm/stub_code_arm.cc b/runtime/vm/stub_code_arm.cc
index 7b43afa..493329f 100644
--- a/runtime/vm/stub_code_arm.cc
+++ b/runtime/vm/stub_code_arm.cc
@@ -1030,21 +1030,21 @@
 // Input parameters:
 //   R0: address (i.e. object) being stored into.
 void StubCode::GenerateUpdateStoreBufferStub(Assembler* assembler) {
-  // Save values being destroyed.
-  __ PushList((1 << R1) | (1 << R2) | (1 << R3));
-
   Label add_to_buffer;
   // Check whether this object has already been remembered. Skip adding to the
   // store buffer if the object is in the store buffer already.
   // Spilled: R1, R2, R3
   // R0: Address being stored
-  __ ldr(R2, FieldAddress(R0, Object::tags_offset()));
-  __ tst(R2, Operand(1 << RawObject::kRememberedBit));
+  __ ldr(TMP, FieldAddress(R0, Object::tags_offset()));
+  __ tst(TMP, Operand(1 << RawObject::kRememberedBit));
   __ b(&add_to_buffer, EQ);
-  __ PopList((1 << R1) | (1 << R2) | (1 << R3));
   __ Ret();
 
   __ Bind(&add_to_buffer);
+
+  // Save values being destroyed.
+  __ PushList((1 << R1) | (1 << R2) | (1 << R3));
+
   // R2: Header word.
   if (TargetCPUFeatures::arm_version() == ARMv5TE) {
 // TODO(21263): Implement 'swp' and use it below.
@@ -1077,17 +1077,17 @@
   // Increment top_ and check for overflow.
   // R2: top_.
   // R1: StoreBufferBlock.
-  Label L;
+  Label overflow;
   __ add(R2, R2, Operand(1));
   __ str(R2, Address(R1, StoreBufferBlock::top_offset()));
   __ CompareImmediate(R2, StoreBufferBlock::kSize);
   // Restore values.
   __ PopList((1 << R1) | (1 << R2) | (1 << R3));
-  __ b(&L, EQ);
+  __ b(&overflow, EQ);
   __ Ret();
 
   // Handle overflow: Call the runtime leaf function.
-  __ Bind(&L);
+  __ Bind(&overflow);
   // Setup frame, push callee-saved registers.
 
   __ Push(CODE_REG);
diff --git a/runtime/vm/stub_code_arm64.cc b/runtime/vm/stub_code_arm64.cc
index 97a107b..c38de9a 100644
--- a/runtime/vm/stub_code_arm64.cc
+++ b/runtime/vm/stub_code_arm64.cc
@@ -1211,8 +1211,7 @@
   // Check whether this object has already been remembered. Skip adding to the
   // store buffer if the object is in the store buffer already.
   __ LoadFieldFromOffset(TMP, R0, Object::tags_offset(), kWord);
-  __ tsti(TMP, Immediate(1 << RawObject::kRememberedBit));
-  __ b(&add_to_buffer, EQ);
+  __ tbz(&add_to_buffer, TMP, RawObject::kRememberedBit);
   __ ret();
 
   __ Bind(&add_to_buffer);
@@ -1232,8 +1231,7 @@
   __ ldxr(R2, R3, kWord);
   __ orri(R2, R2, Immediate(1 << RawObject::kRememberedBit));
   __ stxr(R1, R2, R3, kWord);
-  __ cmp(R1, Operand(1));
-  __ b(&retry, EQ);
+  __ cbnz(&retry, R1);
 
   // Load the StoreBuffer block out of the thread. Then load top_ out of the
   // StoreBufferBlock and add the address to the pointers_.
@@ -1245,7 +1243,7 @@
   // Increment top_ and check for overflow.
   // R2: top_.
   // R1: StoreBufferBlock.
-  Label L;
+  Label overflow;
   __ add(R2, R2, Operand(1));
   __ StoreToOffset(R2, R1, StoreBufferBlock::top_offset(), kUnsignedWord);
   __ CompareImmediate(R2, StoreBufferBlock::kSize);
@@ -1253,11 +1251,11 @@
   __ Pop(R3);
   __ Pop(R2);
   __ Pop(R1);
-  __ b(&L, EQ);
+  __ b(&overflow, EQ);
   __ ret();
 
   // Handle overflow: Call the runtime leaf function.
-  __ Bind(&L);
+  __ Bind(&overflow);
   // Setup frame, push callee-saved registers.
 
   __ Push(CODE_REG);
diff --git a/runtime/vm/stub_code_ia32.cc b/runtime/vm/stub_code_ia32.cc
index bdc494b..60a107b 100644
--- a/runtime/vm/stub_code_ia32.cc
+++ b/runtime/vm/stub_code_ia32.cc
@@ -961,8 +961,6 @@
   // store buffer if the object is in the store buffer already.
   // Spilled: EAX, ECX
   // EDX: Address being stored
-  Label reload;
-  __ Bind(&reload);
   __ movl(EAX, FieldAddress(EDX, Object::tags_offset()));
   __ testl(EAX, Immediate(1 << RawObject::kRememberedBit));
   __ j(EQUAL, &add_to_buffer, Assembler::kNearJump);
@@ -974,11 +972,10 @@
   // EDX: Address being stored
   // EAX: Current tag value
   __ Bind(&add_to_buffer);
-  __ movl(ECX, EAX);
-  __ orl(ECX, Immediate(1 << RawObject::kRememberedBit));
-  // Compare the tag word with EAX, update to ECX if unchanged.
-  __ LockCmpxchgl(FieldAddress(EDX, Object::tags_offset()), ECX);
-  __ j(NOT_EQUAL, &reload);
+  // lock+orl is an atomic read-modify-write.
+  __ lock();
+  __ orl(FieldAddress(EDX, Object::tags_offset()),
+         Immediate(1 << RawObject::kRememberedBit));
 
   // Load the StoreBuffer block out of the thread. Then load top_ out of the
   // StoreBufferBlock and add the address to the pointers_.
@@ -992,7 +989,7 @@
   // Spilled: EAX, ECX
   // ECX: top_
   // EAX: StoreBufferBlock
-  Label L;
+  Label overflow;
   __ incl(ECX);
   __ movl(Address(EAX, StoreBufferBlock::top_offset()), ECX);
   __ cmpl(ECX, Immediate(StoreBufferBlock::kSize));
@@ -1000,11 +997,11 @@
   // Spilled: EAX, ECX
   __ popl(ECX);
   __ popl(EAX);
-  __ j(EQUAL, &L, Assembler::kNearJump);
+  __ j(EQUAL, &overflow, Assembler::kNearJump);
   __ ret();
 
   // Handle overflow: Call the runtime leaf function.
-  __ Bind(&L);
+  __ Bind(&overflow);
   // Setup frame, push callee-saved registers.
 
   __ EnterCallRuntimeFrame(1 * kWordSize);
diff --git a/runtime/vm/stub_code_x64.cc b/runtime/vm/stub_code_x64.cc
index 6b0b539..22da3fe 100644
--- a/runtime/vm/stub_code_x64.cc
+++ b/runtime/vm/stub_code_x64.cc
@@ -1194,22 +1194,13 @@
 // Input parameters:
 //   RDX: Address being stored
 void StubCode::GenerateUpdateStoreBufferStub(Assembler* assembler) {
-  // Save registers being destroyed.
-  __ pushq(RAX);
-  __ pushq(RCX);
-
   Label add_to_buffer;
   // Check whether this object has already been remembered. Skip adding to the
   // store buffer if the object is in the store buffer already.
-  // Spilled: RAX, RCX
   // RDX: Address being stored
-  Label reload;
-  __ Bind(&reload);
-  __ movl(RAX, FieldAddress(RDX, Object::tags_offset()));
-  __ testl(RAX, Immediate(1 << RawObject::kRememberedBit));
+  __ movl(TMP, FieldAddress(RDX, Object::tags_offset()));
+  __ testl(TMP, Immediate(1 << RawObject::kRememberedBit));
   __ j(EQUAL, &add_to_buffer, Assembler::kNearJump);
-  __ popq(RCX);
-  __ popq(RAX);
   __ ret();
 
   // Update the tags that this object has been remembered.
@@ -1218,11 +1209,14 @@
   // RDX: Address being stored
   // RAX: Current tag value
   __ Bind(&add_to_buffer);
-  __ movl(RCX, RAX);
-  __ orl(RCX, Immediate(1 << RawObject::kRememberedBit));
-  // Compare the tag word with RAX, update to RCX if unchanged.
-  __ LockCmpxchgl(FieldAddress(RDX, Object::tags_offset()), RCX);
-  __ j(NOT_EQUAL, &reload);
+  // lock+orl is an atomic read-modify-write.
+  __ lock();
+  __ orl(FieldAddress(RDX, Object::tags_offset()),
+         Immediate(1 << RawObject::kRememberedBit));
+
+  // Save registers being destroyed.
+  __ pushq(RAX);
+  __ pushq(RCX);
 
   // Load the StoreBuffer block out of the thread. Then load top_ out of the
   // StoreBufferBlock and add the address to the pointers_.
@@ -1234,18 +1228,18 @@
   // Increment top_ and check for overflow.
   // RCX: top_
   // RAX: StoreBufferBlock
-  Label L;
+  Label overflow;
   __ incq(RCX);
   __ movl(Address(RAX, StoreBufferBlock::top_offset()), RCX);
   __ cmpl(RCX, Immediate(StoreBufferBlock::kSize));
   // Restore values.
   __ popq(RCX);
   __ popq(RAX);
-  __ j(EQUAL, &L, Assembler::kNearJump);
+  __ j(EQUAL, &overflow, Assembler::kNearJump);
   __ ret();
 
   // Handle overflow: Call the runtime leaf function.
-  __ Bind(&L);
+  __ Bind(&overflow);
   // Setup frame, push callee-saved registers.
   __ pushq(CODE_REG);
   __ movq(CODE_REG, Address(THR, Thread::update_store_buffer_code_offset()));