Tweak some X64 assembler instructions to save bytes.
Changes `xorpd(r, r)` and `xorq(r, r)` to `xorps(r, r)` and `xorl(r, r)`, which saves one prefix byte (`xorps` drops the 0x66 operand-size prefix, `xorl` the REX.W prefix) and still clears the register. (At least if the register is among the first eight; otherwise a REX byte is needed just to name the register. Still better to let the compiler worry about that than to hardwire a 64-bit `q` or `pd` operation.)
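As an illustration (not part of the CL), a C++ sketch of the zero-extension rule that makes the 32-bit form safe: writing a 32-bit register on x64 zero-extends the result into the full 64-bit register, so `xorl eax, eax` clears all of RAX.

    #include <cstdint>

    // Hypothetical model of `xorl eax, eax` (2 bytes: 31 C0): the
    // 32-bit result is implicitly zero-extended to all 64 bits.
    uint64_t clear_via_xorl(uint64_t rax) {
      uint32_t eax = static_cast<uint32_t>(rax);
      eax ^= eax;  // xorl eax, eax
      return eax;  // zero-extension: the whole register is now 0
    }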
Changes `andq` to `AndImmediate`, which recognizes that a 32-bit or smaller immediate only needs an `andl` (again potentially saving a REX prefix).
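For the non-negative masks used here the two forms agree; a sketch of that equivalence (names ours, not the CL's):

    #include <cstdint>

    // andq sign-extends a positive imm32 (upper bits zero); andl
    // zero-extends its 32-bit result. Same 64-bit outcome either way.
    uint64_t and_imm_q(uint64_t r, uint32_t mask) {
      return r & static_cast<uint64_t>(mask);  // andq r, imm32
    }
    uint64_t and_imm_l(uint64_t r, uint32_t mask) {
      return static_cast<uint32_t>(r) & mask;  // andl r, imm32
    }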
Simplifies the code for clamping a Uint8Clamped value to have fewer branches. (If we know the value is not in the 0..255 range, two instructions can convert all negative values to zero, and all values above 255 to a value with the lower 8 bits set: `(~v)>>63`.)
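A C++ sketch of that clamp (assuming the in-range case has already branched to the store):

    #include <cstdint>

    // v is known to be outside 0..255 here.
    //   v < 0   : ~v >= 0, so (~v) >> 63 == 0   -> byte 0x00
    //   v > 255 : ~v < 0,  so (~v) >> 63 == -1  -> byte 0xFF
    uint8_t clamp_out_of_range(int64_t v) {
      return static_cast<uint8_t>((~v) >> 63);  // notq; sarq 63
    }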
Simplifies the code for "is infinite" to use one fewer 64-bit immediate (by multiplying by 2 to shift out the sign bit, rather than masking it out using a 63-bit mask).
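A sketch of the trick: doubling the raw bits (what `addq temp, temp` does) shifts out the sign bit, so +infinity and -infinity collapse to the same pattern and a single 64-bit immediate suffices.

    #include <cstdint>
    #include <cstring>

    bool is_infinite(double d) {
      uint64_t bits;
      std::memcpy(&bits, &d, sizeof bits);
      // 0x7FF0000000000000 << 1. NaNs have a nonzero mantissa and
      // finite values a smaller exponent, so neither can match.
      return (bits << 1) == 0xFFE0000000000000ULL;
    }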
Tested: No new behavior, only optimization. Covered by existing tests.
Change-Id: I94e66c2ff39f0a207649f657e4da1ed43e4e819e
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/385741
Commit-Queue: Lasse Nielsen <lrn@google.com>
Reviewed-by: Martin Kustermann <kustermann@google.com>
diff --git a/runtime/vm/compiler/assembler/assembler_x64.h b/runtime/vm/compiler/assembler/assembler_x64.h
index 22f0ddf..3ed5af8 100644
--- a/runtime/vm/compiler/assembler/assembler_x64.h
+++ b/runtime/vm/compiler/assembler/assembler_x64.h
@@ -724,6 +724,11 @@
void jmp(const ExternalLabel* label);
void jmp(const Code& code);
+ // Moves an XMM register's content to a 64-bit register.
+ void MoveFpuRegisterToRegister(Register dst, FpuRegister src) {
+ movq(dst, src);
+ }
+
// Issue memory to memory move through a TMP register.
// TODO(koda): Assert that these are not used for heap objects.
void MoveMemoryToMemory(const Address& dst, const Address& src) {
@@ -835,6 +840,14 @@
void LoadDImmediate(FpuRegister dst, double immediate);
void LoadQImmediate(FpuRegister dst, simd128_value_t immediate);
+ // Sets register to zero.
+ // Affects flags (sets the zero and parity flags, clears sign, carry and overflow).
+ void ClearRegister(Register reg) { xorl(reg, reg); }
+
+ // Sets XMM register to zero.
+ // Does not affect flags.
+ void ClearFpuRegister(FpuRegister reg) { xorps(reg, reg); }
+
void LoadIsolate(Register dst);
void LoadIsolateGroup(Register dst);
void LoadDispatchTable(Register dst);
diff --git a/runtime/vm/compiler/backend/il_x64.cc b/runtime/vm/compiler/backend/il_x64.cc
index 889dd8e..1cdf3a2 100644
--- a/runtime/vm/compiler/backend/il_x64.cc
+++ b/runtime/vm/compiler/backend/il_x64.cc
@@ -544,7 +544,7 @@
// Clear upper part of the out register. We are going to use setcc on it
// which is a byte move.
- __ xorq(RDX, RDX);
+ __ ClearRegister(RDX);
// Emit comparison code. This must not overwrite the result register.
// IfThenElseInstr::Supports() should prevent EmitConditionCode from using
@@ -650,7 +650,7 @@
if (RepresentationUtils::IsUnboxedInteger(representation())) {
const int64_t value = Integer::Cast(value_).Value();
if (value == 0) {
- __ xorl(destination.reg(), destination.reg());
+ __ ClearRegister(destination.reg());
} else {
__ movq(destination.reg(), compiler::Immediate(value));
}
@@ -1506,11 +1506,11 @@
compiler::Address(
THR, compiler::target::Thread::global_object_pool_offset()));
} else {
- __ xorq(PP, PP); // GC-safe value into PP.
+ __ ClearRegister(PP); // GC-safe value into PP.
}
// Load a GC-safe value for arguments descriptor (unused but tagged).
- __ xorq(ARGS_DESC_REG, ARGS_DESC_REG);
+ __ ClearRegister(ARGS_DESC_REG);
// Push a dummy return address which suggests that we are inside of
// InvokeDartCodeStub. This is how the stack walker detects an entry frame.
@@ -1657,8 +1657,8 @@
__ leaq(bytes_end_minus_16_reg, compiler::Address(bytes_end_reg, -16));
// Initialize size and flags.
- __ xorq(size_reg, size_reg);
- __ xorq(flags_reg, flags_reg);
+ __ ClearRegister(size_reg);
+ __ ClearRegister(flags_reg);
__ jmp(&scan_ascii, compiler::Assembler::kNearJump);
@@ -1700,7 +1700,7 @@
table_reg, temp_reg, TIMES_1,
compiler::target::OneByteString::data_offset()));
__ orq(flags_reg, temp_reg);
- __ andq(temp_reg, compiler::Immediate(kSizeMask));
+ __ AndImmediate(temp_reg, compiler::Immediate(kSizeMask));
__ addq(size_reg, temp_reg);
// Stop if end is reached.
@@ -1735,7 +1735,7 @@
table_reg, temp_reg, TIMES_1,
compiler::target::OneByteString::data_offset()));
__ orq(flags_reg, temp_reg);
- __ andq(temp_reg, compiler::Immediate(kSizeMask));
+ __ AndImmediate(temp_reg, compiler::Immediate(kSizeMask));
__ addq(size_reg, temp_reg);
// Stop if end is reached.
@@ -1745,7 +1745,7 @@
__ Bind(&done);
// Write flags to field.
- __ andq(flags_reg, compiler::Immediate(kFlagsMask));
+ __ AndImmediate(flags_reg, compiler::Immediate(kFlagsMask));
if (!IsScanFlagsUnboxed()) {
__ SmiTag(flags_reg);
}
@@ -2042,15 +2042,12 @@
__ movb(element_address, compiler::Immediate(static_cast<int8_t>(value)));
} else {
const Register value = locs()->in(2).reg();
- compiler::Label store_value, store_0xff;
+ compiler::Label store_value;
__ CompareImmediate(value, compiler::Immediate(0xFF));
- __ j(BELOW_EQUAL, &store_value, compiler::Assembler::kNearJump);
+ __ j(UNSIGNED_LESS_EQUAL, &store_value, compiler::Assembler::kNearJump);
// Clamp to 0x0 or 0xFF respectively.
- __ j(GREATER, &store_0xff);
- __ xorq(value, value);
- __ jmp(&store_value, compiler::Assembler::kNearJump);
- __ Bind(&store_0xff);
- __ LoadImmediate(value, compiler::Immediate(0xFF));
+ __ notq(value);
+ __ sarq(value, compiler::Immediate(63));
__ Bind(&store_value);
__ movb(element_address, ByteRegisterOf(value));
}
@@ -2971,7 +2968,7 @@
compiler::Label done, is_not_zero;
__ CompareObject(right, Smi::ZoneHandle(Smi::New(Smi::kBits)));
__ j(BELOW, &is_not_zero, compiler::Assembler::kNearJump);
- __ xorq(left, left);
+ __ ClearRegister(left);
__ jmp(&done, compiler::Assembler::kNearJump);
__ Bind(&is_not_zero);
__ SmiUntag(right);
@@ -3505,7 +3502,7 @@
compiler::kObjectBytes);
compiler::Label count_ok;
__ j(LESS_EQUAL, &count_ok, compiler::Assembler::kNearJump);
- __ xorq(left, left);
+ __ ClearRegister(left);
__ jmp(&done, compiler::Assembler::kNearJump);
__ Bind(&count_ok);
}
@@ -3704,7 +3701,7 @@
// bits intact. This creates false dependency and causes performance
// problems for subsequent uses of the XMM register. To break the
// dependency XORPS is recommended.
- __ xorps(result, result);
+ __ ClearFpuRegister(result);
__ OBJ(cvtsi2sd)(result, box);
break;
}
@@ -4046,21 +4043,18 @@
}
case MethodRecognizer::kDouble_getIsInfinite: {
const Register temp = locs()->temp(0).reg();
- __ AddImmediate(RSP, compiler::Immediate(-kDoubleSize));
- __ movsd(compiler::Address(RSP, 0), value);
- __ movq(temp, compiler::Address(RSP, 0));
- __ AddImmediate(RSP, compiler::Immediate(kDoubleSize));
- // Mask off the sign.
- __ AndImmediate(temp, compiler::Immediate(0x7FFFFFFFFFFFFFFFLL));
- // Compare with +infinity.
- __ CompareImmediate(temp, compiler::Immediate(0x7FF0000000000000LL));
+ __ MoveFpuRegisterToRegister(temp, value);
+ // Shift out the sign.
+ __ addq(temp, temp);
+ // Compare with +/-infinity << 1.
+ __ CompareImmediate(temp, compiler::Immediate(0xFFE0000000000000LL));
return is_negated ? NOT_EQUAL : EQUAL;
}
case MethodRecognizer::kDouble_getIsNegative: {
const Register temp = locs()->temp(0).reg();
const FpuRegister temp_fpu = locs()->temp(1).fpu_reg();
compiler::Label not_zero;
- __ xorpd(temp_fpu, temp_fpu);
+ __ ClearFpuRegister(temp_fpu);
__ comisd(value, temp_fpu);
// If it's NaN, it's not negative.
__ j(PARITY_EVEN, is_negated ? labels.true_label : labels.false_label);
@@ -4272,11 +4266,11 @@
}
DEFINE_EMIT(Float32x4Zero, (XmmRegister value)) {
- __ xorps(value, value);
+ __ ClearFpuRegister(value);
}
DEFINE_EMIT(Float64x2Zero, (XmmRegister value)) {
- __ xorpd(value, value);
+ __ ClearFpuRegister(value);
}
DEFINE_EMIT(Float32x4Clamp,
@@ -4319,7 +4313,7 @@
__ SubImmediate(RSP, compiler::Immediate(kSimd128Size));
for (intptr_t i = 0; i < 4; i++) {
compiler::Label done, load_false;
- __ xorq(temp, temp);
+ __ ClearRegister(temp);
__ CompareObject(instr->locs()->in(i).reg(), Bool::True());
__ setcc(EQUAL, ByteRegisterOf(temp));
__ negl(temp); // temp = input ? -1 : 0
@@ -4341,7 +4335,7 @@
DEFINE_EMIT(Int32x4GetFlagZorW,
(Register out, XmmRegister value, Temp<XmmRegister> temp)) {
__ movhlps(temp, value); // extract upper half.
- __ movq(out, temp);
+ __ MoveFpuRegisterToRegister(out, temp);
if (instr->kind() == SimdOpInstr::kInt32x4GetFlagW) {
__ shrq(out, compiler::Immediate(32)); // extract upper 32bits.
}
@@ -4349,7 +4343,7 @@
}
DEFINE_EMIT(Int32x4GetFlagXorY, (Register out, XmmRegister value)) {
- __ movq(out, value);
+ __ MoveFpuRegisterToRegister(out, value);
if (instr->kind() == SimdOpInstr::kInt32x4GetFlagY) {
__ shrq(out, compiler::Immediate(32)); // extract upper 32bits.
}
@@ -4370,7 +4364,7 @@
__ movups(compiler::Address(RSP, 0), mask);
// temp = flag == true ? -1 : 0
- __ xorq(temp, temp);
+ __ ClearRegister(temp);
__ CompareObject(flag, Bool::True());
__ setcc(EQUAL, ByteRegisterOf(temp));
__ negl(temp);
@@ -4699,7 +4693,7 @@
// bits intact. This creates false dependency and causes performance
// problems for subsequent uses of the XMM register. To break the
// dependency XORPS is recommended.
- __ xorps(result, result);
+ __ ClearFpuRegister(result);
__ cvtsi2sdl(result, value);
}
@@ -4722,7 +4716,7 @@
// bits intact. This creates false dependency and causes performance
// problems for subsequent uses of the XMM register. To break the
// dependency XORPS is recommended.
- __ xorps(result, result);
+ __ ClearFpuRegister(result);
__ OBJ(cvtsi2sd)(result, value);
}
@@ -4731,7 +4725,7 @@
// bits intact. This creates false dependency and causes performance
// problems for subsequent uses of the XMM register. To break the
// dependency XORPS is recommended.
- __ xorps(result, result);
+ __ ClearFpuRegister(result);
__ cvtsi2sdq(result, value);
}
@@ -4776,7 +4770,7 @@
compiler::Immediate(0));
__ j(EQUAL, slow_path->entry_label());
- __ xorps(FpuTMP, FpuTMP);
+ __ ClearFpuRegister(FpuTMP);
switch (recognized_kind()) {
case MethodRecognizer::kDoubleFloorToInt:
__ roundsd(FpuTMP, value_double, compiler::Assembler::kRoundDown);
@@ -4934,7 +4928,7 @@
XmmRegister zero_temp =
locs->temp(InvokeMathCFunctionInstr::kDoubleTempIndex).fpu_reg();
- __ xorps(zero_temp, zero_temp);
+ __ ClearFpuRegister(zero_temp);
__ LoadDImmediate(result, 1.0);
compiler::Label check_base, skip_call;
@@ -5242,7 +5236,7 @@
__ movq(RDX, RAX);
__ shrq(RDX, compiler::Immediate(32));
__ xorq(RAX, RDX);
- __ andq(RAX, compiler::Immediate(0x3fffffff));
+ __ AndImmediate(RAX, compiler::Immediate(0x3fffffff));
}
LocationSummary* HashDoubleOpInstr::MakeLocationSummary(Zone* zone,
@@ -5269,8 +5263,8 @@
// cvtsi2sd only writes to the lower part of the register and leaves upper
// bits intact. This creates false dependency and causes performance
// problems for subsequent uses of the XMM register. To break the
- // dependency XORPS is recommended.
- __ xorps(temp_fpu_reg, temp_fpu_reg);
+ // dependency XORPS is recommended (which is what ClearFpuRegister does).
+ __ ClearFpuRegister(temp_fpu_reg);
__ cvttsd2siq(RAX, value);
__ cvtsi2sdq(temp_fpu_reg, RAX);
__ comisd(value, temp_fpu_reg);
@@ -5285,11 +5279,11 @@
__ Bind(&hash_double);
// Convert the double bits to a hash code that fits in a Smi.
- __ movq(RAX, value);
+ __ MoveFpuRegisterToRegister(RAX, value);
__ movq(RDX, RAX);
__ shrq(RDX, compiler::Immediate(32));
__ xorq(RAX, RDX);
- __ andq(RAX, compiler::Immediate(compiler::target::kSmiMax));
+ __ AndImmediate(RAX, compiler::Immediate(compiler::target::kSmiMax));
__ Bind(&done);
}
@@ -5572,7 +5566,7 @@
if (has_divide_by_minus_one()) {
__ Bind(div_by_minus_one_label());
if (is_mod_) {
- __ xorq(RDX, RDX); // x % -1 = 0
+ __ ClearRegister(RDX); // x % -1 = 0
} else {
__ negq(RAX); // x / -1 = -x
}
@@ -5691,7 +5685,6 @@
__ imulq(RDX, TMP);
__ subq(RAX, RDX);
// Compensate for Dart's Euclidean view of MOD.
- __ testq(RAX, RAX);
__ j(GREATER_EQUAL, &pos);
if (divisor > 0) {
__ addq(RAX, TMP);
@@ -5968,7 +5961,7 @@
}
if (shift >= 32) {
- __ xorl(left, left);
+ __ ClearRegister(left);
} else {
switch (op_kind) {
case Token::kSHR:
@@ -6026,7 +6019,7 @@
break;
case Token::kUSHR:
case Token::kSHL:
- __ xorq(out, out);
+ __ ClearRegister(out);
break;
default:
UNREACHABLE();
@@ -6096,7 +6089,7 @@
compiler::Label done;
__ cmpl(RCX, compiler::Immediate(kUint32ShiftCountLimit));
__ j(UNSIGNED_LESS_EQUAL, &done);
- __ xorl(out, out);
+ __ ClearRegister(out);
__ Bind(&done);
}
}
@@ -6416,7 +6409,7 @@
// RCX: instructions entry point.
if (!FLAG_precompiled_mode) {
// RBX: Smi 0 (no IC data; the lazy-compile stub expects a GC-safe value).
- __ xorq(IC_DATA_REG, IC_DATA_REG);
+ __ ClearRegister(IC_DATA_REG);
}
__ call(RCX);
compiler->EmitCallsiteMetadata(source(), deopt_id(),