author    Moonchild <moonchild@palemoon.org>  2023-09-12 23:28:49 +0200
committer Moonchild <moonchild@palemoon.org>  2023-09-12 23:28:49 +0200
commit    1a7f79ef9acde005dd78984aeb5917af525960d6 (patch)
tree      56845f1dfffc062d2a22719c464e25535c5f864c /js
parent    281497201e52d95b1592e28ba59431ad4ae3bfeb (diff)
download  uxp-1a7f79ef9acde005dd78984aeb5917af525960d6.tar.gz
Issue #2307 - Part 2: Move SIMD code generation to masm methods
Diffstat (limited to 'js')
-rw-r--r--  js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp       | 1185
-rw-r--r--  js/src/jit/x86-shared/CodeGenerator-x86-shared.h         |    6
-rw-r--r--  js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp | 1227
-rw-r--r--  js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h    |    6
-rw-r--r--  js/src/jit/x86-shared/MacroAssembler-x86-shared.h        |  222
-rw-r--r--  js/src/moz.build                                         |    1
6 files changed, 1570 insertions(+), 1077 deletions(-)
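
The pattern applied throughout this patch: CodeGenerator visit methods stop emitting raw SSE/AVX instruction sequences and instead call named MacroAssembler helpers, which now live in the new MacroAssembler-x86-shared-SIMD.cpp. A condensed before/after sketch, taken from the visitSimdSplatX8 hunk further down (context trimmed, signatures simplified; not the exact UXP declarations):

    // Before: the visit method emitted the instruction sequence inline.
    void CodeGeneratorX86Shared::visitSimdSplatX8(LSimdSplatX8* ins)
    {
        Register input = ToRegister(ins->getOperand(0));
        FloatRegister output = ToFloatRegister(ins->output());
        masm.vmovd(input, output);        // move the scalar into the low lane
        masm.vpshuflw(0, output, output); // splat across the low four 16-bit lanes
        masm.vpshufd(0, output, output);  // splat the low 64 bits across the vector
    }

    // After: the visit method delegates to a named masm helper wrapping
    // the same sequence, so other callers can reuse the SIMD lowering.
    void CodeGeneratorX86Shared::visitSimdSplatX8(LSimdSplatX8* ins)
    {
        Register input = ToRegister(ins->getOperand(0));
        FloatRegister output = ToFloatRegister(ins->output());
        masm.splatX8(input, output);
    }
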
diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
index 5ec00da849..9858836e7d 100644
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -27,6 +27,7 @@ using mozilla::BitwiseCast;
using mozilla::DebugOnly;
using mozilla::FloatingPoint;
using mozilla::FloorLog2;
+using mozilla::Maybe;
using mozilla::NegativeInfinity;
using mozilla::SpecificNaN;
@@ -2458,51 +2459,18 @@ CodeGeneratorX86Shared::visitFloat32x4ToInt32x4(LFloat32x4ToInt32x4* ins)
FloatRegister out = ToFloatRegister(ins->output());
Register temp = ToRegister(ins->temp());
- masm.convertFloat32x4ToInt32x4(in, out);
-
auto* ool = new(alloc()) OutOfLineSimdFloatToIntCheck(temp, in, ins, ins->mir()->trapOffset());
addOutOfLineCode(ool, ins->mir());
- static const SimdConstant InvalidResult = SimdConstant::SplatX4(int32_t(-2147483648));
-
- ScratchSimd128Scope scratch(masm);
- masm.loadConstantSimd128Int(InvalidResult, scratch);
- masm.packedEqualInt32x4(Operand(out), scratch);
- // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of
- // the two following instructions.
- masm.vmovmskps(scratch, temp);
- masm.cmp32(temp, Imm32(0));
- masm.j(Assembler::NotEqual, ool->entry());
-
- masm.bind(ool->rejoin());
+ masm.checkedConvertFloat32x4ToInt32x4(in, out, temp, ool->entry(), ool->rejoin());
}
void
-CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIntCheck *ool)
+CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIntCheck* ool)
{
- static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f);
- static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f);
-
Label onConversionError;
- FloatRegister input = ool->input();
- Register temp = ool->temp();
-
- ScratchSimd128Scope scratch(masm);
- masm.loadConstantSimd128Float(Int32MinX4, scratch);
- masm.vcmpleps(Operand(input), scratch, scratch);
- masm.vmovmskps(scratch, temp);
- masm.cmp32(temp, Imm32(15));
- masm.j(Assembler::NotEqual, &onConversionError);
-
- masm.loadConstantSimd128Float(Int32MaxX4, scratch);
- masm.vcmpleps(Operand(input), scratch, scratch);
- masm.vmovmskps(scratch, temp);
- masm.cmp32(temp, Imm32(0));
- masm.j(Assembler::NotEqual, &onConversionError);
-
- masm.jump(ool->rejoin());
-
+ masm.oolConvertFloat32x4ToInt32x4(ool->input(), ool->temp(), ool->rejoin(), &onConversionError);
if (gen->compilingWasm()) {
masm.bindLater(&onConversionError, trap(ool, wasm::Trap::ImpreciseSimdConversion));
} else {
@@ -2512,105 +2480,39 @@ CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIn
}
// Convert Float32x4 to Uint32x4.
-//
// If any input lane value is out of range or NaN, bail out.
void
CodeGeneratorX86Shared::visitFloat32x4ToUint32x4(LFloat32x4ToUint32x4* ins)
{
- const MSimdConvert* mir = ins->mir();
FloatRegister in = ToFloatRegister(ins->input());
FloatRegister out = ToFloatRegister(ins->output());
Register temp = ToRegister(ins->tempR());
FloatRegister tempF = ToFloatRegister(ins->tempF());
- // Classify lane values into 4 disjoint classes:
- //
- // N-lanes: in <= -1.0
- // A-lanes: -1.0 < in <= 0x0.ffffffp31
- // B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32
- // V-lanes: 0x1.0p32 <= in, or isnan(in)
- //
- // We need to bail out to throw a RangeError if we see any N-lanes or
- // V-lanes.
- //
- // For A-lanes and B-lanes, we make two float -> int32 conversions:
- //
- // A = cvttps2dq(in)
- // B = cvttps2dq(in - 0x1.0p31f)
- //
- // Note that the subtraction for the B computation is exact for B-lanes.
- // There is no rounding, so B is the low 31 bits of the correctly converted
- // result.
- //
- // The cvttps2dq instruction produces 0x80000000 when the input is NaN or
- // out of range for a signed int32_t. This conveniently provides the missing
- // high bit for B, so the desired result is A for A-lanes and A|B for
- // B-lanes.
-
- ScratchSimd128Scope scratch(masm);
-
- // TODO: If the majority of lanes are A-lanes, it could be faster to compute
- // A first, use vmovmskps to check for any non-A-lanes and handle them in
- // ool code. OTOH, we we're wrong about the lane distribution, that would be
- // slower.
-
- // Compute B in |scratch|.
- static const float Adjust = 0x80000000; // 0x1.0p31f for the benefit of MSVC.
- static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust);
- masm.loadConstantSimd128Float(Bias, scratch);
- masm.packedAddFloat32(Operand(in), scratch);
- masm.convertFloat32x4ToInt32x4(scratch, scratch);
-
- // Compute A in |out|. This is the last time we use |in| and the first time
- // we use |out|, so we can tolerate if they are the same register.
- masm.convertFloat32x4ToInt32x4(in, out);
-
- // We can identify A-lanes by the sign bits in A: Any A-lanes will be
- // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
- // mask of non-A-lanes into |tempF|.
- masm.zeroSimd128Float(tempF);
- masm.packedGreaterThanInt32x4(Operand(out), tempF);
-
- // Clear the A-lanes in B.
- masm.bitwiseAndSimd128(Operand(tempF), scratch);
-
- // Compute the final result: A for A-lanes, A|B for B-lanes.
- masm.bitwiseOrSimd128(Operand(scratch), out);
-
- // We still need to filter out the V-lanes. They would show up as 0x80000000
- // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
- // the remaining negative lanes in B.
- masm.vmovmskps(scratch, temp);
- masm.cmp32(temp, Imm32(0));
+ Label failed;
+ masm.checkedConvertFloat32x4ToUint32x4(in, out, temp, tempF, &failed);
+ Label ok;
+ masm.jump(&ok);
+ masm.bind(&failed);
if (gen->compilingWasm())
- masm.j(Assembler::NotEqual, trap(mir, wasm::Trap::ImpreciseSimdConversion));
+ masm.j(Assembler::NotEqual, trap(ins->mir(), wasm::Trap::ImpreciseSimdConversion));
else
- bailoutIf(Assembler::NotEqual, ins->snapshot());
+// bailoutIf(Assembler::NotEqual, ins->snapshot());
+ bailout(ins->snapshot());
+ masm.bind(&ok);
}
void
CodeGeneratorX86Shared::visitSimdValueInt32x4(LSimdValueInt32x4* ins)
{
MOZ_ASSERT(ins->mir()->type() == MIRType::Int32x4 || ins->mir()->type() == MIRType::Bool32x4);
-
- FloatRegister output = ToFloatRegister(ins->output());
- if (AssemblerX86Shared::HasSSE41()) {
- masm.vmovd(ToRegister(ins->getOperand(0)), output);
- for (size_t i = 1; i < 4; ++i) {
- Register r = ToRegister(ins->getOperand(i));
- masm.vpinsrd(i, r, output, output);
- }
- return;
- }
-
- masm.reserveStack(Simd128DataSize);
- for (size_t i = 0; i < 4; ++i) {
- Register r = ToRegister(ins->getOperand(i));
- masm.store32(r, Address(StackPointer, i * sizeof(int32_t)));
- }
- masm.loadAlignedSimd128Int(Address(StackPointer, 0), output);
- masm.freeStack(Simd128DataSize);
+ masm.createInt32x4(ToRegister(ins->getOperand(0)),
+ ToRegister(ins->getOperand(1)),
+ ToRegister(ins->getOperand(2)),
+ ToRegister(ins->getOperand(3)),
+ ToFloatRegister(ins->output())
+ );
}
void
@@ -2625,12 +2527,7 @@ CodeGeneratorX86Shared::visitSimdValueFloat32x4(LSimdValueFloat32x4* ins)
FloatRegister tmp = ToFloatRegister(ins->getTemp(0));
FloatRegister output = ToFloatRegister(ins->output());
- FloatRegister r0Copy = masm.reusedInputFloat32x4(r0, output);
- FloatRegister r1Copy = masm.reusedInputFloat32x4(r1, tmp);
-
- masm.vunpcklps(r3, r1Copy, tmp);
- masm.vunpcklps(r2, r0Copy, output);
- masm.vunpcklps(tmp, output, output);
+ masm.createFloat32x4(r0, r1, r2, r3, tmp, output);
}
void
@@ -2639,20 +2536,7 @@ CodeGeneratorX86Shared::visitSimdSplatX16(LSimdSplatX16* ins)
MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 16);
Register input = ToRegister(ins->getOperand(0));
FloatRegister output = ToFloatRegister(ins->output());
- masm.vmovd(input, output);
- if (AssemblerX86Shared::HasSSSE3()) {
- masm.zeroSimd128Int(ScratchSimd128Reg);
- masm.vpshufb(ScratchSimd128Reg, output, output);
- } else {
- // Use two shifts to duplicate the low 8 bits into the low 16 bits.
- masm.vpsllw(Imm32(8), output, output);
- masm.vmovdqa(output, ScratchSimd128Reg);
- masm.vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg);
- masm.vpor(ScratchSimd128Reg, output, output);
- // Then do an X8 splat.
- masm.vpshuflw(0, output, output);
- masm.vpshufd(0, output, output);
- }
+ masm.splatX16(input, output);
}
void
@@ -2661,9 +2545,7 @@ CodeGeneratorX86Shared::visitSimdSplatX8(LSimdSplatX8* ins)
MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 8);
Register input = ToRegister(ins->getOperand(0));
FloatRegister output = ToFloatRegister(ins->output());
- masm.vmovd(input, output);
- masm.vpshuflw(0, output, output);
- masm.vpshufd(0, output, output);
+ masm.splatX8(input, output);
}
void
@@ -2675,15 +2557,10 @@ CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4* ins)
MOZ_ASSERT(IsSimdType(mir->type()));
JS_STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
- if (mir->type() == MIRType::Float32x4) {
- FloatRegister r = ToFloatRegister(ins->getOperand(0));
- FloatRegister rCopy = masm.reusedInputFloat32x4(r, output);
- masm.vshufps(0, rCopy, rCopy, output);
- } else {
- Register r = ToRegister(ins->getOperand(0));
- masm.vmovd(r, output);
- masm.vpshufd(0, output, output);
- }
+ if (mir->type() == MIRType::Float32x4)
+ masm.splatX4(ToFloatRegister(ins->getOperand(0)), output);
+ else
+ masm.splatX4(ToRegister(ins->getOperand(0)), output);
}
void
@@ -2691,83 +2568,8 @@ CodeGeneratorX86Shared::visitSimdReinterpretCast(LSimdReinterpretCast* ins)
{
FloatRegister input = ToFloatRegister(ins->input());
FloatRegister output = ToFloatRegister(ins->output());
-
- if (input.aliases(output))
- return;
-
- if (IsIntegerSimdType(ins->mir()->type()))
- masm.vmovdqa(input, output);
- else
- masm.vmovaps(input, output);
-}
-
-// Extract an integer lane from the 32x4 vector register |input| and place it in
-// |output|.
-void
-CodeGeneratorX86Shared::emitSimdExtractLane32x4(FloatRegister input, Register output, unsigned lane)
-{
- if (lane == 0) {
- // The value we want to extract is in the low double-word
- masm.moveLowInt32(input, output);
- } else if (AssemblerX86Shared::HasSSE41()) {
- masm.vpextrd(lane, input, output);
- } else {
- uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
- masm.shuffleInt32(mask, input, ScratchSimd128Reg);
- masm.moveLowInt32(ScratchSimd128Reg, output);
- }
-}
-
-// Extract an integer lane from the 16x8 vector register |input|, sign- or
-// zero-extend to 32 bits and place the result in |output|.
-void
-CodeGeneratorX86Shared::emitSimdExtractLane16x8(FloatRegister input, Register output,
- unsigned lane, SimdSign signedness)
-{
- // Unlike pextrd and pextrb, this is available in SSE2.
- masm.vpextrw(lane, input, output);
-
- if (signedness == SimdSign::Signed)
- masm.movswl(output, output);
-}
-
-// Extract an integer lane from the 8x16 vector register |input|, sign- or
-// zero-extend to 32 bits and place the result in |output|.
-void
-CodeGeneratorX86Shared::emitSimdExtractLane8x16(FloatRegister input, Register output,
- unsigned lane, SimdSign signedness)
-{
- if (AssemblerX86Shared::HasSSE41()) {
- masm.vpextrb(lane, input, output);
- // vpextrb clears the high bits, so no further extension required.
- if (signedness == SimdSign::Unsigned)
- signedness = SimdSign::NotApplicable;
- } else {
- // Extract the relevant 16 bits containing our lane, then shift the
- // right 8 bits into place.
- emitSimdExtractLane16x8(input, output, lane / 2, SimdSign::Unsigned);
- if (lane % 2) {
- masm.shrl(Imm32(8), output);
- // The shrl handles the zero-extension. Don't repeat it.
- if (signedness == SimdSign::Unsigned)
- signedness = SimdSign::NotApplicable;
- }
- }
-
- // We have the right low 8 bits in |output|, but we may need to fix the high
- // bits. Note that this requires |output| to be one of the %eax-%edx
- // registers.
- switch (signedness) {
- case SimdSign::Signed:
- masm.movsbl(output, output);
- break;
- case SimdSign::Unsigned:
- masm.movzbl(output, output);
- break;
- case SimdSign::NotApplicable:
- // No adjustment needed.
- break;
- }
+ bool isIntLaneType = IsIntegerSimdType(ins->mir()->type());
+ masm.reinterpretSimd(isIntLaneType, input, output);
}
void
@@ -2776,25 +2578,8 @@ CodeGeneratorX86Shared::visitSimdExtractElementB(LSimdExtractElementB* ins)
FloatRegister input = ToFloatRegister(ins->input());
Register output = ToRegister(ins->output());
MSimdExtractElement* mir = ins->mir();
- unsigned length = SimdTypeToLength(mir->specialization());
-
- switch (length) {
- case 4:
- emitSimdExtractLane32x4(input, output, mir->lane());
- break;
- case 8:
- // Get a lane, don't bother fixing the high bits since we'll mask below.
- emitSimdExtractLane16x8(input, output, mir->lane(), SimdSign::NotApplicable);
- break;
- case 16:
- emitSimdExtractLane8x16(input, output, mir->lane(), SimdSign::NotApplicable);
- break;
- default:
- MOZ_CRASH("Unhandled SIMD length");
- }
-
- // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits.
- masm.and32(Imm32(1), output);
+ unsigned numLanes = SimdTypeToLength(mir->specialization());
+ masm.extractLaneSimdBool(input, output, numLanes, mir->lane());
}
void
@@ -2803,17 +2588,16 @@ CodeGeneratorX86Shared::visitSimdExtractElementI(LSimdExtractElementI* ins)
FloatRegister input = ToFloatRegister(ins->input());
Register output = ToRegister(ins->output());
MSimdExtractElement* mir = ins->mir();
- unsigned length = SimdTypeToLength(mir->specialization());
-
- switch (length) {
+ unsigned numLanes = SimdTypeToLength(mir->specialization());
+ switch (numLanes) {
case 4:
- emitSimdExtractLane32x4(input, output, mir->lane());
+ masm.extractLaneInt32x4(input, output, mir->lane());
break;
case 8:
- emitSimdExtractLane16x8(input, output, mir->lane(), mir->signedness());
+ masm.extractLaneInt16x8(input, output, mir->lane(), mir->signedness());
break;
case 16:
- emitSimdExtractLane8x16(input, output, mir->lane(), mir->signedness());
+ masm.extractLaneInt8x16(input, output, mir->lane(), mir->signedness());
break;
default:
MOZ_CRASH("Unhandled SIMD length");
@@ -2828,7 +2612,7 @@ CodeGeneratorX86Shared::visitSimdExtractElementU2D(LSimdExtractElementU2D* ins)
Register temp = ToRegister(ins->temp());
MSimdExtractElement* mir = ins->mir();
MOZ_ASSERT(mir->specialization() == MIRType::Int32x4);
- emitSimdExtractLane32x4(input, temp, mir->lane());
+ masm.extractLaneInt32x4(input, temp, mir->lane());
masm.convertUInt32ToDouble(temp, output);
}
@@ -2839,102 +2623,31 @@ CodeGeneratorX86Shared::visitSimdExtractElementF(LSimdExtractElementF* ins)
FloatRegister output = ToFloatRegister(ins->output());
unsigned lane = ins->mir()->lane();
- if (lane == 0) {
- // The value we want to extract is in the low double-word
- if (input != output)
- masm.moveFloat32(input, output);
- } else if (lane == 2) {
- masm.moveHighPairToLowPairFloat32(input, output);
- } else {
- uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
- masm.shuffleFloat32(mask, input, output);
- }
- // NaNs contained within SIMD values are not enforced to be canonical, so
- // when we extract an element into a "regular" scalar JS value, we have to
- // canonicalize. In wasm code, we can skip this, as wasm only has to
- // canonicalize NaNs at FFI boundaries.
- if (!gen->compilingWasm())
- masm.canonicalizeFloat(output);
+ bool canonicalize = !gen->compilingWasm();
+ masm.extractLaneFloat32x4(input, output, lane, canonicalize);
}
void
CodeGeneratorX86Shared::visitSimdInsertElementI(LSimdInsertElementI* ins)
{
- FloatRegister vector = ToFloatRegister(ins->vector());
+ FloatRegister input = ToFloatRegister(ins->vector());
Register value = ToRegister(ins->value());
FloatRegister output = ToFloatRegister(ins->output());
- MOZ_ASSERT(vector == output); // defineReuseInput(0)
-
+ MOZ_ASSERT(input == output); // defineReuseInput(0)
unsigned lane = ins->lane();
unsigned length = ins->length();
- if (length == 8) {
- // Available in SSE 2.
- masm.vpinsrw(lane, value, vector, output);
- return;
- }
-
- // Note that, contrarily to float32x4, we cannot use vmovd if the inserted
- // value goes into the first component, as vmovd clears out the higher lanes
- // of the output.
- if (AssemblerX86Shared::HasSSE41()) {
- // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX.
- switch (length) {
- case 4:
- masm.vpinsrd(lane, value, vector, output);
- return;
- case 16:
- masm.vpinsrb(lane, value, vector, output);
- return;
- }
- }
-
- masm.reserveStack(Simd128DataSize);
- masm.storeAlignedSimd128Int(vector, Address(StackPointer, 0));
- switch (length) {
- case 4:
- masm.store32(value, Address(StackPointer, lane * sizeof(int32_t)));
- break;
- case 16:
- // Note that this requires `value` to be in one the registers where the
- // low 8 bits are addressible (%eax - %edx on x86, all of them on x86-64).
- masm.store8(value, Address(StackPointer, lane * sizeof(int8_t)));
- break;
- default:
- MOZ_CRASH("Unsupported SIMD length");
- }
- masm.loadAlignedSimd128Int(Address(StackPointer, 0), output);
- masm.freeStack(Simd128DataSize);
+ masm.insertLaneSimdInt(input, value, output, lane, length);
}
void
CodeGeneratorX86Shared::visitSimdInsertElementF(LSimdInsertElementF* ins)
{
- FloatRegister vector = ToFloatRegister(ins->vector());
+ FloatRegister input = ToFloatRegister(ins->vector());
FloatRegister value = ToFloatRegister(ins->value());
FloatRegister output = ToFloatRegister(ins->output());
- MOZ_ASSERT(vector == output); // defineReuseInput(0)
-
- if (ins->lane() == 0) {
- // As both operands are registers, vmovss doesn't modify the upper bits
- // of the destination operand.
- if (value != output)
- masm.vmovss(value, vector, output);
- return;
- }
-
- if (AssemblerX86Shared::HasSSE41()) {
- // The input value is in the low float32 of the 'value' FloatRegister.
- masm.vinsertps(masm.vinsertpsMask(0, ins->lane()), value, output, output);
- return;
- }
-
- unsigned component = unsigned(ins->lane());
- masm.reserveStack(Simd128DataSize);
- masm.storeAlignedSimd128Float(vector, Address(StackPointer, 0));
- masm.storeFloat32(value, Address(StackPointer, component * sizeof(int32_t)));
- masm.loadAlignedSimd128Float(Address(StackPointer, 0), output);
- masm.freeStack(Simd128DataSize);
+ MOZ_ASSERT(input == output); // defineReuseInput(0)
+ masm.insertLaneFloat32x4(input, value, output, ins->lane());
}
void
@@ -2943,9 +2656,7 @@ CodeGeneratorX86Shared::visitSimdAllTrue(LSimdAllTrue* ins)
FloatRegister input = ToFloatRegister(ins->input());
Register output = ToRegister(ins->output());
- masm.vmovmskps(input, output);
- masm.cmp32(output, Imm32(0xf));
- masm.emitSet(Assembler::Zero, output);
+ masm.allTrueSimdBool(input, output);
}
void
@@ -2954,11 +2665,10 @@ CodeGeneratorX86Shared::visitSimdAnyTrue(LSimdAnyTrue* ins)
FloatRegister input = ToFloatRegister(ins->input());
Register output = ToRegister(ins->output());
- masm.vmovmskps(input, output);
- masm.cmp32(output, Imm32(0x0));
- masm.emitSet(Assembler::NonZero, output);
+ masm.anyTrueSimdBool(input, output);
}
+// XXX note for reviewer: this is SIMD.js only, no need to keep it for wasm.
template <class T, class Reg> void
CodeGeneratorX86Shared::visitSimdGeneralShuffle(LSimdGeneralShuffleBase* ins, Reg tempRegister)
{
@@ -3017,6 +2727,7 @@ CodeGeneratorX86Shared::visitSimdGeneralShuffle(LSimdGeneralShuffleBase* ins, Re
masm.freeStack(stackSpace);
}
+// XXX SIMD.js only
void
CodeGeneratorX86Shared::visitSimdGeneralShuffleI(LSimdGeneralShuffleI* ins)
{
@@ -3047,13 +2758,10 @@ CodeGeneratorX86Shared::visitSimdSwizzleI(LSimdSwizzleI* ins)
switch (numLanes) {
case 4: {
- uint32_t x = ins->lane(0);
- uint32_t y = ins->lane(1);
- uint32_t z = ins->lane(2);
- uint32_t w = ins->lane(3);
-
- uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
- masm.shuffleInt32(mask, input, output);
+ unsigned lanes[4];
+ for (unsigned i = 0; i < 4; i++)
+ lanes[i] = ins->lane(i);
+ masm.swizzleInt32x4(input, output, lanes);
return;
}
}
@@ -3061,31 +2769,18 @@ CodeGeneratorX86Shared::visitSimdSwizzleI(LSimdSwizzleI* ins)
// In the general case, use pshufb if it is available. Convert to a
// byte-wise swizzle.
const unsigned bytesPerLane = 16 / numLanes;
- int8_t bLane[16];
+ int8_t lanes[16];
for (unsigned i = 0; i < numLanes; i++) {
for (unsigned b = 0; b < bytesPerLane; b++) {
- bLane[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b;
+ lanes[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b;
}
}
- if (AssemblerX86Shared::HasSSSE3()) {
- ScratchSimd128Scope scratch(masm);
- masm.loadConstantSimd128Int(SimdConstant::CreateX16(bLane), scratch);
- FloatRegister inputCopy = masm.reusedInputInt32x4(input, output);
- masm.vpshufb(scratch, inputCopy, output);
- return;
- }
+ Maybe<Register> maybeTemp;
+ if (!ins->getTemp(0)->isBogusTemp())
+ maybeTemp.emplace(ToRegister(ins->getTemp(0)));
- // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
- Register temp = ToRegister(ins->getTemp(0));
- masm.reserveStack(2 * Simd128DataSize);
- masm.storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize));
- for (unsigned i = 0; i < 16; i++) {
- masm.load8ZeroExtend(Address(StackPointer, Simd128DataSize + bLane[i]), temp);
- masm.store8(temp, Address(StackPointer, i));
- }
- masm.loadAlignedSimd128Int(Address(StackPointer, 0), output);
- masm.freeStack(2 * Simd128DataSize);
+ masm.swizzleInt8x16(input, output, maybeTemp, lanes);
}
void
@@ -3095,54 +2790,10 @@ CodeGeneratorX86Shared::visitSimdSwizzleF(LSimdSwizzleF* ins)
FloatRegister output = ToFloatRegister(ins->output());
MOZ_ASSERT(ins->numLanes() == 4);
- uint32_t x = ins->lane(0);
- uint32_t y = ins->lane(1);
- uint32_t z = ins->lane(2);
- uint32_t w = ins->lane(3);
-
- if (AssemblerX86Shared::HasSSE3()) {
- if (ins->lanesMatch(0, 0, 2, 2)) {
- masm.vmovsldup(input, output);
- return;
- }
- if (ins->lanesMatch(1, 1, 3, 3)) {
- masm.vmovshdup(input, output);
- return;
- }
- }
-
- // TODO Here and below, arch specific lowering could identify this pattern
- // and use defineReuseInput to avoid this move (bug 1084404)
- if (ins->lanesMatch(2, 3, 2, 3)) {
- FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
- masm.vmovhlps(input, inputCopy, output);
- return;
- }
-
- if (ins->lanesMatch(0, 1, 0, 1)) {
- if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
- masm.vmovddup(input, output);
- return;
- }
- FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
- masm.vmovlhps(input, inputCopy, output);
- return;
- }
-
- if (ins->lanesMatch(0, 0, 1, 1)) {
- FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
- masm.vunpcklps(input, inputCopy, output);
- return;
- }
-
- if (ins->lanesMatch(2, 2, 3, 3)) {
- FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
- masm.vunpckhps(input, inputCopy, output);
- return;
- }
-
- uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
- masm.shuffleFloat32(mask, input, output);
+ unsigned lanes[4];
+ for (unsigned i = 0; i < 4; i++)
+ lanes[i] = ins->lane(i);
+ masm.swizzleFloat32x4(input, output, lanes);
}
void
@@ -3155,52 +2806,21 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle* ins)
const unsigned bytesPerLane = 16 / numLanes;
// Convert the shuffle to a byte-wise shuffle.
- uint8_t bLane[16];
+ uint8_t lanes[16];
for (unsigned i = 0; i < numLanes; i++) {
for (unsigned b = 0; b < bytesPerLane; b++) {
- bLane[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b;
+ lanes[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b;
}
}
- // Use pshufb if it is available.
- if (AssemblerX86Shared::HasSSSE3()) {
- FloatRegister scratch1 = ToFloatRegister(ins->temp());
- ScratchSimd128Scope scratch2(masm);
-
- // Use pshufb instructions to gather the lanes from each source vector.
- // A negative index creates a zero lane, so the two vectors can be combined.
-
- // Set scratch2 = lanes from lhs.
- int8_t idx[16];
- for (unsigned i = 0; i < 16; i++)
- idx[i] = bLane[i] < 16 ? bLane[i] : -1;
- masm.loadConstantSimd128Int(SimdConstant::CreateX16(idx), scratch1);
- FloatRegister lhsCopy = masm.reusedInputInt32x4(lhs, scratch2);
- masm.vpshufb(scratch1, lhsCopy, scratch2);
-
- // Set output = lanes from rhs.
- for (unsigned i = 0; i < 16; i++)
- idx[i] = bLane[i] >= 16 ? bLane[i] - 16 : -1;
- masm.loadConstantSimd128Int(SimdConstant::CreateX16(idx), scratch1);
- FloatRegister rhsCopy = masm.reusedInputInt32x4(rhs, output);
- masm.vpshufb(scratch1, rhsCopy, output);
-
- // Combine.
- masm.vpor(scratch2, output, output);
- return;
- }
+ Maybe<FloatRegister> maybeFloatTemp;
+ Maybe<Register> maybeTemp;
+ if (AssemblerX86Shared::HasSSSE3())
+ maybeFloatTemp.emplace(ToFloatRegister(ins->temp()));
+ else
+ maybeTemp.emplace(ToRegister(ins->temp()));
- // Worst-case fallback for pre-SSE3 machines. Bounce through memory.
- Register temp = ToRegister(ins->getTemp(0));
- masm.reserveStack(3 * Simd128DataSize);
- masm.storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize));
- masm.storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize));
- for (unsigned i = 0; i < 16; i++) {
- masm.load8ZeroExtend(Address(StackPointer, Simd128DataSize + bLane[i]), temp);
- masm.store8(temp, Address(StackPointer, i));
- }
- masm.loadAlignedSimd128Int(Address(StackPointer, 0), output);
- masm.freeStack(3 * Simd128DataSize);
+ masm.shuffleInt8x16(lhs, rhs, output, maybeFloatTemp, maybeTemp, lanes);
}
void
@@ -3210,409 +2830,60 @@ CodeGeneratorX86Shared::visitSimdShuffleX4(LSimdShuffleX4* ins)
Operand rhs = ToOperand(ins->rhs());
FloatRegister out = ToFloatRegister(ins->output());
- uint32_t x = ins->lane(0);
- uint32_t y = ins->lane(1);
- uint32_t z = ins->lane(2);
- uint32_t w = ins->lane(3);
-
- // Check that lanes come from LHS in majority:
- unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
- MOZ_ASSERT(numLanesFromLHS >= 2);
-
- // When reading this method, remember that vshufps takes the two first
- // inputs of the destination operand (right operand) and the two last
- // inputs of the source operand (left operand).
- //
- // Legend for explanations:
- // - L: LHS
- // - R: RHS
- // - T: temporary
-
- uint32_t mask;
-
- // If all lanes came from a single vector, we should have constructed a
- // MSimdSwizzle instead.
- MOZ_ASSERT(numLanesFromLHS < 4);
-
- // If all values stay in their lane, this is a blend.
- if (AssemblerX86Shared::HasSSE41()) {
- if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
- masm.vblendps(masm.blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out);
- return;
- }
- }
-
- // One element of the second, all other elements of the first
- if (numLanesFromLHS == 3) {
- unsigned firstMask = -1, secondMask = -1;
-
- // register-register vmovss preserves the high lanes.
- if (ins->lanesMatch(4, 1, 2, 3) && rhs.kind() == Operand::FPREG) {
- masm.vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out);
- return;
- }
-
- // SSE4.1 vinsertps can handle any single element.
- unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
- if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
- unsigned srcLane;
- unsigned dstLane;
- if (x >= 4) {
- srcLane = x - 4;
- dstLane = 0;
- } else if (y >= 4) {
- srcLane = y - 4;
- dstLane = 1;
- } else if (z >= 4) {
- srcLane = z - 4;
- dstLane = 2;
- } else {
- MOZ_ASSERT(w >= 4);
- srcLane = w - 4;
- dstLane = 3;
- }
- masm.vinsertps(masm.vinsertpsMask(srcLane, dstLane), rhs, lhs, out);
- return;
- }
-
- FloatRegister rhsCopy = ToFloatRegister(ins->temp());
-
- if (x < 4 && y < 4) {
- if (w >= 4) {
- w %= 4;
- // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy)
- firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
- // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out)
- secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0);
- } else {
- MOZ_ASSERT(z >= 4);
- z %= 4;
- // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy)
- firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
- // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out)
- secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2);
- }
-
- masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy);
- masm.vshufps(secondMask, rhsCopy, lhs, out);
- return;
- }
-
- MOZ_ASSERT(z < 4 && w < 4);
-
- if (y >= 4) {
- y %= 4;
- // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy)
- firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
- // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out)
- secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w);
- } else {
- MOZ_ASSERT(x >= 4);
- x %= 4;
- // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy)
- firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
- // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out)
- secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w);
- }
-
- masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy);
- if (AssemblerX86Shared::HasAVX()) {
- masm.vshufps(secondMask, lhs, rhsCopy, out);
- } else {
- masm.vshufps(secondMask, lhs, rhsCopy, rhsCopy);
- masm.moveSimd128Float(rhsCopy, out);
- }
- return;
- }
-
- // Two elements from one vector, two other elements from the other
- MOZ_ASSERT(numLanesFromLHS == 2);
-
- // TODO Here and below, symmetric case would be more handy to avoid a move,
- // but can't be reached because operands would get swapped (bug 1084404).
- if (ins->lanesMatch(2, 3, 6, 7)) {
- ScratchSimd128Scope scratch(masm);
- if (AssemblerX86Shared::HasAVX()) {
- FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch);
- masm.vmovhlps(lhs, rhsCopy, out);
- } else {
- masm.loadAlignedSimd128Float(rhs, scratch);
- masm.vmovhlps(lhs, scratch, scratch);
- masm.moveSimd128Float(scratch, out);
- }
- return;
- }
-
- if (ins->lanesMatch(0, 1, 4, 5)) {
- FloatRegister rhsCopy;
- ScratchSimd128Scope scratch(masm);
- if (rhs.kind() == Operand::FPREG) {
- // No need to make an actual copy, since the operand is already
- // in a register, and it won't be clobbered by the vmovlhps.
- rhsCopy = FloatRegister::FromCode(rhs.fpu());
- } else {
- masm.loadAlignedSimd128Float(rhs, scratch);
- rhsCopy = scratch;
- }
- masm.vmovlhps(rhsCopy, lhs, out);
- return;
- }
-
- if (ins->lanesMatch(0, 4, 1, 5)) {
- masm.vunpcklps(rhs, lhs, out);
- return;
- }
-
- // TODO swapped case would be better (bug 1084404)
- if (ins->lanesMatch(4, 0, 5, 1)) {
- ScratchSimd128Scope scratch(masm);
- if (AssemblerX86Shared::HasAVX()) {
- FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch);
- masm.vunpcklps(lhs, rhsCopy, out);
- } else {
- masm.loadAlignedSimd128Float(rhs, scratch);
- masm.vunpcklps(lhs, scratch, scratch);
- masm.moveSimd128Float(scratch, out);
- }
- return;
- }
-
- if (ins->lanesMatch(2, 6, 3, 7)) {
- masm.vunpckhps(rhs, lhs, out);
- return;
- }
-
- // TODO swapped case would be better (bug 1084404)
- if (ins->lanesMatch(6, 2, 7, 3)) {
- ScratchSimd128Scope scratch(masm);
- if (AssemblerX86Shared::HasAVX()) {
- FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch);
- masm.vunpckhps(lhs, rhsCopy, out);
- } else {
- masm.loadAlignedSimd128Float(rhs, scratch);
- masm.vunpckhps(lhs, scratch, scratch);
- masm.moveSimd128Float(scratch, out);
- }
- return;
- }
-
- // In one vshufps
- if (x < 4 && y < 4) {
- mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
- masm.vshufps(mask, rhs, lhs, out);
- return;
- }
-
- // At creation, we should have explicitly swapped in this case.
- MOZ_ASSERT(!(z >= 4 && w >= 4));
-
- // In two vshufps, for the most generic case:
- uint32_t firstMask[4], secondMask[4];
- unsigned i = 0, j = 2, k = 0;
+ unsigned lanes[4];
+ for (unsigned i = 0; i < 4; i++)
+ lanes[i] = ins->lane(i);
+ Maybe<FloatRegister> maybeTemp;
+ if (!ins->temp()->isBogusTemp())
+ maybeTemp.emplace(ToFloatRegister(ins->temp()));
+ masm.shuffleX4(lhs, rhs, out, maybeTemp, lanes);
+}
-#define COMPUTE_MASK(lane) \
- if (lane >= 4) { \
- firstMask[j] = lane % 4; \
- secondMask[k++] = j++; \
- } else { \
- firstMask[i] = lane; \
- secondMask[k++] = i++; \
+static inline Assembler::Condition
+ToCondition(MSimdBinaryComp::Operation op)
+{
+ switch (op) {
+ case MSimdBinaryComp::greaterThan: return Assembler::GreaterThan;
+ case MSimdBinaryComp::equal: return Assembler::Equal;
+ case MSimdBinaryComp::lessThan: return Assembler::LessThan;
+ case MSimdBinaryComp::notEqual: return Assembler::NotEqual;
+ case MSimdBinaryComp::greaterThanOrEqual: return Assembler::GreaterThanOrEqual;
+ case MSimdBinaryComp::lessThanOrEqual: return Assembler::LessThanOrEqual;
}
- COMPUTE_MASK(x)
- COMPUTE_MASK(y)
- COMPUTE_MASK(z)
- COMPUTE_MASK(w)
-#undef COMPUTE_MASK
-
- MOZ_ASSERT(i == 2 && j == 4 && k == 4);
-
- mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
- firstMask[2], firstMask[3]);
- masm.vshufps(mask, rhs, lhs, lhs);
-
- mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
- secondMask[2], secondMask[3]);
- masm.vshufps(mask, lhs, lhs, lhs);
+ MOZ_CRASH("unexpected cond");
}
void
CodeGeneratorX86Shared::visitSimdBinaryCompIx16(LSimdBinaryCompIx16* ins)
{
- static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
-
FloatRegister lhs = ToFloatRegister(ins->lhs());
Operand rhs = ToOperand(ins->rhs());
FloatRegister output = ToFloatRegister(ins->output());
MOZ_ASSERT_IF(!Assembler::HasAVX(), output == lhs);
- ScratchSimd128Scope scratch(masm);
-
- MSimdBinaryComp::Operation op = ins->operation();
- switch (op) {
- case MSimdBinaryComp::greaterThan:
- masm.vpcmpgtb(rhs, lhs, output);
- return;
- case MSimdBinaryComp::equal:
- masm.vpcmpeqb(rhs, lhs, output);
- return;
- case MSimdBinaryComp::lessThan:
- // src := rhs
- if (rhs.kind() == Operand::FPREG)
- masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
- else
- masm.loadAlignedSimd128Int(rhs, scratch);
-
- // src := src > lhs (i.e. lhs < rhs)
- // Improve by doing custom lowering (rhs is tied to the output register)
- masm.vpcmpgtb(ToOperand(ins->lhs()), scratch, scratch);
- masm.moveSimd128Int(scratch, output);
- return;
- case MSimdBinaryComp::notEqual:
- // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
- // should invert the comparison by, e.g. swapping the arms of a select
- // if that's what it's used in.
- masm.loadConstantSimd128Int(allOnes, scratch);
- masm.vpcmpeqb(rhs, lhs, output);
- masm.bitwiseXorSimd128(Operand(scratch), output);
- return;
- case MSimdBinaryComp::greaterThanOrEqual:
- // src := rhs
- if (rhs.kind() == Operand::FPREG)
- masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
- else
- masm.loadAlignedSimd128Int(rhs, scratch);
- masm.vpcmpgtb(ToOperand(ins->lhs()), scratch, scratch);
- masm.loadConstantSimd128Int(allOnes, output);
- masm.bitwiseXorSimd128(Operand(scratch), output);
- return;
- case MSimdBinaryComp::lessThanOrEqual:
- // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
- masm.loadConstantSimd128Int(allOnes, scratch);
- masm.vpcmpgtb(rhs, lhs, output);
- masm.bitwiseXorSimd128(Operand(scratch), output);
- return;
- }
- MOZ_CRASH("unexpected SIMD op");
+ masm.compareInt8x16(lhs, rhs, ToCondition(ins->operation()), output);
}
void
CodeGeneratorX86Shared::visitSimdBinaryCompIx8(LSimdBinaryCompIx8* ins)
{
- static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
-
FloatRegister lhs = ToFloatRegister(ins->lhs());
Operand rhs = ToOperand(ins->rhs());
FloatRegister output = ToFloatRegister(ins->output());
MOZ_ASSERT_IF(!Assembler::HasAVX(), output == lhs);
- ScratchSimd128Scope scratch(masm);
-
- MSimdBinaryComp::Operation op = ins->operation();
- switch (op) {
- case MSimdBinaryComp::greaterThan:
- masm.vpcmpgtw(rhs, lhs, output);
- return;
- case MSimdBinaryComp::equal:
- masm.vpcmpeqw(rhs, lhs, output);
- return;
- case MSimdBinaryComp::lessThan:
- // src := rhs
- if (rhs.kind() == Operand::FPREG)
- masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
- else
- masm.loadAlignedSimd128Int(rhs, scratch);
-
- // src := src > lhs (i.e. lhs < rhs)
- // Improve by doing custom lowering (rhs is tied to the output register)
- masm.vpcmpgtw(ToOperand(ins->lhs()), scratch, scratch);
- masm.moveSimd128Int(scratch, output);
- return;
- case MSimdBinaryComp::notEqual:
- // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
- // should invert the comparison by, e.g. swapping the arms of a select
- // if that's what it's used in.
- masm.loadConstantSimd128Int(allOnes, scratch);
- masm.vpcmpeqw(rhs, lhs, output);
- masm.bitwiseXorSimd128(Operand(scratch), output);
- return;
- case MSimdBinaryComp::greaterThanOrEqual:
- // src := rhs
- if (rhs.kind() == Operand::FPREG)
- masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
- else
- masm.loadAlignedSimd128Int(rhs, scratch);
- masm.vpcmpgtw(ToOperand(ins->lhs()), scratch, scratch);
- masm.loadConstantSimd128Int(allOnes, output);
- masm.bitwiseXorSimd128(Operand(scratch), output);
- return;
- case MSimdBinaryComp::lessThanOrEqual:
- // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
- masm.loadConstantSimd128Int(allOnes, scratch);
- masm.vpcmpgtw(rhs, lhs, output);
- masm.bitwiseXorSimd128(Operand(scratch), output);
- return;
- }
- MOZ_CRASH("unexpected SIMD op");
+ masm.compareInt16x8(lhs, rhs, ToCondition(ins->operation()), output);
}
void
CodeGeneratorX86Shared::visitSimdBinaryCompIx4(LSimdBinaryCompIx4* ins)
{
- static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
-
FloatRegister lhs = ToFloatRegister(ins->lhs());
Operand rhs = ToOperand(ins->rhs());
MOZ_ASSERT(ToFloatRegister(ins->output()) == lhs);
- ScratchSimd128Scope scratch(masm);
-
- MSimdBinaryComp::Operation op = ins->operation();
- switch (op) {
- case MSimdBinaryComp::greaterThan:
- masm.packedGreaterThanInt32x4(rhs, lhs);
- return;
- case MSimdBinaryComp::equal:
- masm.packedEqualInt32x4(rhs, lhs);
- return;
- case MSimdBinaryComp::lessThan:
- // src := rhs
- if (rhs.kind() == Operand::FPREG)
- masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
- else
- masm.loadAlignedSimd128Int(rhs, scratch);
-
- // src := src > lhs (i.e. lhs < rhs)
- // Improve by doing custom lowering (rhs is tied to the output register)
- masm.packedGreaterThanInt32x4(ToOperand(ins->lhs()), scratch);
- masm.moveSimd128Int(scratch, lhs);
- return;
- case MSimdBinaryComp::notEqual:
- // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
- // should invert the comparison by, e.g. swapping the arms of a select
- // if that's what it's used in.
- masm.loadConstantSimd128Int(allOnes, scratch);
- masm.packedEqualInt32x4(rhs, lhs);
- masm.bitwiseXorSimd128(Operand(scratch), lhs);
- return;
- case MSimdBinaryComp::greaterThanOrEqual:
- // src := rhs
- if (rhs.kind() == Operand::FPREG)
- masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
- else
- masm.loadAlignedSimd128Int(rhs, scratch);
- masm.packedGreaterThanInt32x4(ToOperand(ins->lhs()), scratch);
- masm.loadConstantSimd128Int(allOnes, lhs);
- masm.bitwiseXorSimd128(Operand(scratch), lhs);
- return;
- case MSimdBinaryComp::lessThanOrEqual:
- // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
- masm.loadConstantSimd128Int(allOnes, scratch);
- masm.packedGreaterThanInt32x4(rhs, lhs);
- masm.bitwiseXorSimd128(Operand(scratch), lhs);
- return;
- }
- MOZ_CRASH("unexpected SIMD op");
+ masm.compareInt32x4(lhs, rhs, ToCondition(ins->operation()), lhs);
}
void
@@ -3622,27 +2893,7 @@ CodeGeneratorX86Shared::visitSimdBinaryCompFx4(LSimdBinaryCompFx4* ins)
Operand rhs = ToOperand(ins->rhs());
FloatRegister output = ToFloatRegister(ins->output());
- MSimdBinaryComp::Operation op = ins->operation();
- switch (op) {
- case MSimdBinaryComp::equal:
- masm.vcmpeqps(rhs, lhs, output);
- return;
- case MSimdBinaryComp::lessThan:
- masm.vcmpltps(rhs, lhs, output);
- return;
- case MSimdBinaryComp::lessThanOrEqual:
- masm.vcmpleps(rhs, lhs, output);
- return;
- case MSimdBinaryComp::notEqual:
- masm.vcmpneqps(rhs, lhs, output);
- return;
- case MSimdBinaryComp::greaterThanOrEqual:
- case MSimdBinaryComp::greaterThan:
- // We reverse these before register allocation so that we don't have to
- // copy into and out of temporaries after codegen.
- MOZ_CRASH("lowering should have reversed this");
- }
- MOZ_CRASH("unexpected SIMD op");
+ masm.compareFloat32x4(lhs, rhs, ToCondition(ins->operation()), output);
}
void
@@ -3655,10 +2906,10 @@ CodeGeneratorX86Shared::visitSimdBinaryArithIx16(LSimdBinaryArithIx16* ins)
MSimdBinaryArith::Operation op = ins->operation();
switch (op) {
case MSimdBinaryArith::Op_add:
- masm.vpaddb(rhs, lhs, output);
+ masm.addInt8x16(lhs, rhs, output);
return;
case MSimdBinaryArith::Op_sub:
- masm.vpsubb(rhs, lhs, output);
+ masm.subInt8x16(lhs, rhs, output);
return;
case MSimdBinaryArith::Op_mul:
// 8x16 mul is a valid operation, but not supported in SSE or AVX.
@@ -3685,13 +2936,13 @@ CodeGeneratorX86Shared::visitSimdBinaryArithIx8(LSimdBinaryArithIx8* ins)
MSimdBinaryArith::Operation op = ins->operation();
switch (op) {
case MSimdBinaryArith::Op_add:
- masm.vpaddw(rhs, lhs, output);
+ masm.addInt16x8(lhs, rhs, output);
return;
case MSimdBinaryArith::Op_sub:
- masm.vpsubw(rhs, lhs, output);
+ masm.subInt16x8(lhs, rhs, output);
return;
case MSimdBinaryArith::Op_mul:
- masm.vpmullw(rhs, lhs, output);
+ masm.mulInt16x8(lhs, rhs, output);
return;
case MSimdBinaryArith::Op_div:
case MSimdBinaryArith::Op_max:
@@ -3710,35 +2961,19 @@ CodeGeneratorX86Shared::visitSimdBinaryArithIx4(LSimdBinaryArithIx4* ins)
Operand rhs = ToOperand(ins->rhs());
FloatRegister output = ToFloatRegister(ins->output());
- ScratchSimd128Scope scratch(masm);
-
MSimdBinaryArith::Operation op = ins->operation();
switch (op) {
case MSimdBinaryArith::Op_add:
- masm.vpaddd(rhs, lhs, output);
+ masm.addInt32x4(lhs, rhs, output);
return;
case MSimdBinaryArith::Op_sub:
- masm.vpsubd(rhs, lhs, output);
+ masm.subInt32x4(lhs, rhs, output);
return;
case MSimdBinaryArith::Op_mul: {
- if (AssemblerX86Shared::HasSSE41()) {
- masm.vpmulld(rhs, lhs, output);
- return;
- }
-
- masm.loadAlignedSimd128Int(rhs, scratch);
- masm.vpmuludq(lhs, scratch, scratch);
- // scratch contains (Rx, _, Rz, _) where R is the resulting vector.
-
- FloatRegister temp = ToFloatRegister(ins->temp());
- masm.vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), lhs, lhs);
- masm.vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), rhs, temp);
- masm.vpmuludq(temp, lhs, lhs);
- // lhs contains (Ry, _, Rw, _) where R is the resulting vector.
-
- masm.vshufps(MacroAssembler::ComputeShuffleMask(0, 2, 0, 2), scratch, lhs, lhs);
- // lhs contains (Ry, Rw, Rx, Rz)
- masm.vshufps(MacroAssembler::ComputeShuffleMask(2, 0, 3, 1), lhs, lhs, lhs);
+ Maybe<FloatRegister> maybeTemp;
+ if (!AssemblerX86Shared::HasSSE41())
+ maybeTemp.emplace(ToFloatRegister(ins->getTemp(0)));
+ masm.mulInt32x4(lhs, rhs, maybeTemp, output);
return;
}
case MSimdBinaryArith::Op_div:
@@ -3766,104 +3001,34 @@ CodeGeneratorX86Shared::visitSimdBinaryArithFx4(LSimdBinaryArithFx4* ins)
Operand rhs = ToOperand(ins->rhs());
FloatRegister output = ToFloatRegister(ins->output());
- ScratchSimd128Scope scratch(masm);
-
MSimdBinaryArith::Operation op = ins->operation();
switch (op) {
case MSimdBinaryArith::Op_add:
- masm.vaddps(rhs, lhs, output);
+ masm.addFloat32x4(lhs, rhs, output);
return;
case MSimdBinaryArith::Op_sub:
- masm.vsubps(rhs, lhs, output);
+ masm.subFloat32x4(lhs, rhs, output);
return;
case MSimdBinaryArith::Op_mul:
- masm.vmulps(rhs, lhs, output);
+ masm.mulFloat32x4(lhs, rhs, output);
return;
case MSimdBinaryArith::Op_div:
- masm.vdivps(rhs, lhs, output);
+ masm.divFloat32x4(lhs, rhs, output);
return;
case MSimdBinaryArith::Op_max: {
- FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, scratch);
- masm.vcmpunordps(rhs, lhsCopy, scratch);
-
- FloatRegister tmp = ToFloatRegister(ins->temp());
- FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, tmp);
- masm.vmaxps(Operand(lhs), rhsCopy, tmp);
- masm.vmaxps(rhs, lhs, output);
-
- masm.vandps(tmp, output, output);
- masm.vorps(scratch, output, output); // or in the all-ones NaNs
+ masm.maxFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output);
return;
}
case MSimdBinaryArith::Op_min: {
- FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch);
- masm.vminps(Operand(lhs), rhsCopy, scratch);
- masm.vminps(rhs, lhs, output);
- masm.vorps(scratch, output, output); // NaN or'd with arbitrary bits is NaN
+ masm.minFloat32x4(lhs, rhs, output);
return;
}
case MSimdBinaryArith::Op_minNum: {
- FloatRegister tmp = ToFloatRegister(ins->temp());
- masm.loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), tmp);
-
- FloatRegister mask = scratch;
- FloatRegister tmpCopy = masm.reusedInputFloat32x4(tmp, scratch);
- masm.vpcmpeqd(Operand(lhs), tmpCopy, mask);
- masm.vandps(tmp, mask, mask);
-
- FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp);
- masm.vminps(rhs, lhsCopy, tmp);
- masm.vorps(mask, tmp, tmp);
-
- FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask);
- masm.vcmpneqps(rhs, rhsCopy, mask);
-
- if (AssemblerX86Shared::HasAVX()) {
- masm.vblendvps(mask, lhs, tmp, output);
- } else {
- // Emulate vblendvps.
- // With SSE.4.1 we could use blendvps, however it's awkward since
- // it requires the mask to be in xmm0.
- if (lhs != output)
- masm.moveSimd128Float(lhs, output);
- masm.vandps(Operand(mask), output, output);
- masm.vandnps(Operand(tmp), mask, mask);
- masm.vorps(Operand(mask), output, output);
- }
+ masm.minNumFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output);
return;
}
case MSimdBinaryArith::Op_maxNum: {
- FloatRegister mask = scratch;
- masm.loadConstantSimd128Int(SimdConstant::SplatX4(0), mask);
- masm.vpcmpeqd(Operand(lhs), mask, mask);
-
- FloatRegister tmp = ToFloatRegister(ins->temp());
- masm.loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), tmp);
- masm.vandps(tmp, mask, mask);
-
- FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp);
- masm.vmaxps(rhs, lhsCopy, tmp);
- masm.vandnps(Operand(tmp), mask, mask);
-
- // Ensure tmp always contains the temporary result
- mask = tmp;
- tmp = scratch;
-
- FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask);
- masm.vcmpneqps(rhs, rhsCopy, mask);
-
- if (AssemblerX86Shared::HasAVX()) {
- masm.vblendvps(mask, lhs, tmp, output);
- } else {
- // Emulate vblendvps.
- // With SSE.4.1 we could use blendvps, however it's awkward since
- // it requires the mask to be in xmm0.
- if (lhs != output)
- masm.moveSimd128Float(lhs, output);
- masm.vandps(Operand(mask), output, output);
- masm.vandnps(Operand(tmp), mask, mask);
- masm.vorps(Operand(mask), output, output);
- }
+ masm.maxNumFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output);
return;
}
}
@@ -3884,16 +3049,10 @@ CodeGeneratorX86Shared::visitSimdBinarySaturating(LSimdBinarySaturating* ins)
case MIRType::Int8x16:
switch (ins->operation()) {
case MSimdBinarySaturating::add:
- if (sign == SimdSign::Signed)
- masm.vpaddsb(rhs, lhs, output);
- else
- masm.vpaddusb(rhs, lhs, output);
+ masm.addSatInt8x16(lhs, rhs, sign, output);
return;
case MSimdBinarySaturating::sub:
- if (sign == SimdSign::Signed)
- masm.vpsubsb(rhs, lhs, output);
- else
- masm.vpsubusb(rhs, lhs, output);
+ masm.subSatInt8x16(lhs, rhs, sign, output);
return;
}
break;
@@ -3901,16 +3060,10 @@ CodeGeneratorX86Shared::visitSimdBinarySaturating(LSimdBinarySaturating* ins)
case MIRType::Int16x8:
switch (ins->operation()) {
case MSimdBinarySaturating::add:
- if (sign == SimdSign::Signed)
- masm.vpaddsw(rhs, lhs, output);
- else
- masm.vpaddusw(rhs, lhs, output);
+ masm.addSatInt16x8(lhs, rhs, sign, output);
return;
case MSimdBinarySaturating::sub:
- if (sign == SimdSign::Signed)
- masm.vpsubsw(rhs, lhs, output);
- else
- masm.vpsubusw(rhs, lhs, output);
+ masm.subSatInt16x8(lhs, rhs, sign, output);
return;
}
break;
@@ -3927,16 +3080,12 @@ CodeGeneratorX86Shared::visitSimdUnaryArithIx16(LSimdUnaryArithIx16* ins)
Operand in = ToOperand(ins->input());
FloatRegister out = ToFloatRegister(ins->output());
- static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
-
switch (ins->operation()) {
case MSimdUnaryArith::neg:
- masm.zeroSimd128Int(out);
- masm.packedSubInt8(in, out);
+ masm.negInt8x16(in, out);
return;
case MSimdUnaryArith::not_:
- masm.loadConstantSimd128Int(allOnes, out);
- masm.bitwiseXorSimd128(in, out);
+ masm.notInt8x16(in, out);
return;
case MSimdUnaryArith::abs:
case MSimdUnaryArith::reciprocalApproximation:
@@ -3953,16 +3102,12 @@ CodeGeneratorX86Shared::visitSimdUnaryArithIx8(LSimdUnaryArithIx8* ins)
Operand in = ToOperand(ins->input());
FloatRegister out = ToFloatRegister(ins->output());
- static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
-
switch (ins->operation()) {
case MSimdUnaryArith::neg:
- masm.zeroSimd128Int(out);
- masm.packedSubInt16(in, out);
+ masm.negInt16x8(in, out);
return;
case MSimdUnaryArith::not_:
- masm.loadConstantSimd128Int(allOnes, out);
- masm.bitwiseXorSimd128(in, out);
+ masm.notInt16x8(in, out);
return;
case MSimdUnaryArith::abs:
case MSimdUnaryArith::reciprocalApproximation:
@@ -3979,16 +3124,12 @@ CodeGeneratorX86Shared::visitSimdUnaryArithIx4(LSimdUnaryArithIx4* ins)
Operand in = ToOperand(ins->input());
FloatRegister out = ToFloatRegister(ins->output());
- static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
-
switch (ins->operation()) {
case MSimdUnaryArith::neg:
- masm.zeroSimd128Int(out);
- masm.packedSubInt32(in, out);
+ masm.negInt32x4(in, out);
return;
case MSimdUnaryArith::not_:
- masm.loadConstantSimd128Int(allOnes, out);
- masm.bitwiseXorSimd128(in, out);
+ masm.notInt32x4(in, out);
return;
case MSimdUnaryArith::abs:
case MSimdUnaryArith::reciprocalApproximation:
@@ -4005,29 +3146,15 @@ CodeGeneratorX86Shared::visitSimdUnaryArithFx4(LSimdUnaryArithFx4* ins)
Operand in = ToOperand(ins->input());
FloatRegister out = ToFloatRegister(ins->output());
- // All ones but the sign bit
- float signMask = SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits);
- static const SimdConstant signMasks = SimdConstant::SplatX4(signMask);
-
- // All ones including the sign bit
- float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits);
- static const SimdConstant allOnes = SimdConstant::SplatX4(ones);
-
- // All zeros but the sign bit
- static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f);
-
switch (ins->operation()) {
case MSimdUnaryArith::abs:
- masm.loadConstantSimd128Float(signMasks, out);
- masm.bitwiseAndSimd128(in, out);
+ masm.absFloat32x4(in, out);
return;
case MSimdUnaryArith::neg:
- masm.loadConstantSimd128Float(minusZero, out);
- masm.bitwiseXorSimd128(in, out);
+ masm.negFloat32x4(in, out);
return;
case MSimdUnaryArith::not_:
- masm.loadConstantSimd128Float(allOnes, out);
- masm.bitwiseXorSimd128(in, out);
+ masm.notFloat32x4(in, out);
return;
case MSimdUnaryArith::reciprocalApproximation:
masm.packedRcpApproximationFloat32x4(in, out);
@@ -4053,21 +3180,21 @@ CodeGeneratorX86Shared::visitSimdBinaryBitwise(LSimdBinaryBitwise* ins)
switch (op) {
case MSimdBinaryBitwise::and_:
if (ins->type() == MIRType::Float32x4)
- masm.vandps(rhs, lhs, output);
+ masm.bitwiseAndFloat32x4(lhs, rhs, output);
else
- masm.vpand(rhs, lhs, output);
+ masm.bitwiseAndSimdInt(lhs, rhs, output);
return;
case MSimdBinaryBitwise::or_:
if (ins->type() == MIRType::Float32x4)
- masm.vorps(rhs, lhs, output);
+ masm.bitwiseOrFloat32x4(lhs, rhs, output);
else
- masm.vpor(rhs, lhs, output);
+ masm.bitwiseOrSimdInt(lhs, rhs, output);
return;
case MSimdBinaryBitwise::xor_:
if (ins->type() == MIRType::Float32x4)
- masm.vxorps(rhs, lhs, output);
+ masm.bitwiseXorFloat32x4(lhs, rhs, output);
else
- masm.vpxor(rhs, lhs, output);
+ masm.bitwiseXorSimdInt(lhs, rhs, output);
return;
}
MOZ_CRASH("unexpected SIMD bitwise op");
@@ -4079,15 +3206,12 @@ CodeGeneratorX86Shared::visitSimdShift(LSimdShift* ins)
FloatRegister out = ToFloatRegister(ins->output());
MOZ_ASSERT(ToFloatRegister(ins->vector()) == out); // defineReuseInput(0);
- // The shift amount is masked to the number of bits in a lane.
- uint32_t shiftmask = (128u / SimdTypeToLength(ins->type())) - 1;
-
// Note that SSE doesn't have instructions for shifting 8x16 vectors.
// These shifts are synthesized by the MSimdShift::AddLegalized() function.
const LAllocation* val = ins->value();
if (val->isConstant()) {
MOZ_ASSERT(ins->temp()->isBogusTemp());
- Imm32 count(uint32_t(ToInt32(val)) & shiftmask);
+ Imm32 count(uint32_t(ToInt32(val)));
switch (ins->type()) {
case MIRType::Int16x8:
switch (ins->operation()) {
@@ -4121,38 +3245,33 @@ CodeGeneratorX86Shared::visitSimdShift(LSimdShift* ins)
MOZ_CRASH("unexpected SIMD bitwise op");
}
- // Truncate val to 5 bits. We should have a temp register for that.
- MOZ_ASSERT(val->isRegister());
- Register count = ToRegister(ins->temp());
- masm.mov(ToRegister(val), count);
- masm.andl(Imm32(shiftmask), count);
- ScratchFloat32Scope scratch(masm);
- masm.vmovd(count, scratch);
+ Register temp = ToRegister(ins->temp());
+ Register count = ToRegister(val);
switch (ins->type()) {
case MIRType::Int16x8:
switch (ins->operation()) {
case MSimdShift::lsh:
- masm.packedLeftShiftByScalarInt16x8(scratch, out);
+ masm.packedLeftShiftByScalarInt16x8(out, count, temp, out);
return;
case MSimdShift::rsh:
- masm.packedRightShiftByScalarInt16x8(scratch, out);
+ masm.packedRightShiftByScalarInt16x8(out, count, temp, out);
return;
case MSimdShift::ursh:
- masm.packedUnsignedRightShiftByScalarInt16x8(scratch, out);
+ masm.packedUnsignedRightShiftByScalarInt16x8(out, count, temp, out);
return;
}
break;
case MIRType::Int32x4:
switch (ins->operation()) {
case MSimdShift::lsh:
- masm.packedLeftShiftByScalarInt32x4(scratch, out);
+ masm.packedLeftShiftByScalarInt32x4(out, count, temp, out);
return;
case MSimdShift::rsh:
- masm.packedRightShiftByScalarInt32x4(scratch, out);
+ masm.packedRightShiftByScalarInt32x4(out, count, temp, out);
return;
case MSimdShift::ursh:
- masm.packedUnsignedRightShiftByScalarInt32x4(scratch, out);
+ masm.packedUnsignedRightShiftByScalarInt32x4(out, count, temp, out);
return;
}
break;
@@ -4171,26 +3290,12 @@ CodeGeneratorX86Shared::visitSimdSelect(LSimdSelect* ins)
FloatRegister output = ToFloatRegister(ins->output());
FloatRegister temp = ToFloatRegister(ins->temp());
- if (onTrue != output)
- masm.vmovaps(onTrue, output);
- if (mask != temp)
- masm.vmovaps(mask, temp);
-
MSimdSelect* mir = ins->mir();
unsigned lanes = SimdTypeToLength(mir->type());
-
- if (AssemblerX86Shared::HasAVX() && lanes == 4) {
- // TBD: Use vpblendvb for lanes > 4, HasAVX.
- masm.vblendvps(mask, onTrue, onFalse, output);
- return;
- }
-
- // SSE4.1 has plain blendvps which can do this, but it is awkward
- // to use because it requires the mask to be in xmm0.
-
- masm.bitwiseAndSimd128(Operand(temp), output);
- masm.bitwiseAndNotSimd128(Operand(onFalse), temp);
- masm.bitwiseOrSimd128(Operand(temp), output);
+ if (lanes == 4)
+ masm.selectX4(mask, onTrue, onFalse, temp, output);
+ else
+ masm.selectSimd128(mask, onTrue, onFalse, temp, output);
}
void
diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.h b/js/src/jit/x86-shared/CodeGenerator-x86-shared.h
index 0b4961dddd..4b0664fb63 100644
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.h
@@ -173,12 +173,6 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared
void emitTableSwitchDispatch(MTableSwitch* mir, Register index, Register base);
- void emitSimdExtractLane8x16(FloatRegister input, Register output, unsigned lane,
- SimdSign signedness);
- void emitSimdExtractLane16x8(FloatRegister input, Register output, unsigned lane,
- SimdSign signedness);
- void emitSimdExtractLane32x4(FloatRegister input, Register output, unsigned lane);
-
public:
CodeGeneratorX86Shared(MIRGenerator* gen, LIRGraph* graph, MacroAssembler* masm);
diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
new file mode 100644
index 0000000000..0ebf30de1a
--- /dev/null
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
@@ -0,0 +1,1227 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "jit/MacroAssembler.h"
+#include "jit/x86-shared/MacroAssembler-x86-shared.h"
+
+#include "jit/MacroAssembler-inl.h"
+
+using namespace js;
+using namespace js::jit;
+
+using mozilla::DebugOnly;
+using mozilla::FloatingPoint;
+using mozilla::Maybe;
+using mozilla::SpecificNaN;
+
+void
+MacroAssemblerX86Shared::checkedConvertFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest,
+ Register temp, Label* oolEntry,
+ Label* rejoin)
+{
+ // Does the conversion and jumps to the OOL entry if the result value
+ // is the undefined integer pattern.
+ static const SimdConstant InvalidResult = SimdConstant::SplatX4(int32_t(-2147483648));
+ convertFloat32x4ToInt32x4(src, dest);
+
+ ScratchSimd128Scope scratch(asMasm());
+ asMasm().loadConstantSimd128Int(InvalidResult, scratch);
+ packedEqualInt32x4(Operand(dest), scratch);
+ // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of
+ // the two following instructions.
+ vmovmskps(scratch, temp);
+ cmp32(temp, Imm32(0));
+ j(Assembler::NotEqual, oolEntry);
+ bind(rejoin);
+}
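
As a reading aid, not part of the patch: a minimal scalar sketch of the check above, assuming the usual cvttps2dq behaviour (NaN and out-of-range lanes produce the 0x80000000 pattern); the helper names are invented for illustration.

    #include <cmath>
    #include <cstdint>

    // cvttps2dq per lane: NaN or out-of-int32-range input yields the "integer
    // indefinite" pattern 0x80000000; in-range input truncates toward zero.
    static int32_t cvttss2siLane(float x)
    {
        if (std::isnan(x) || x >= 2147483648.0f || x < -2147483648.0f)
            return INT32_MIN;
        return int32_t(x);
    }

    // Same decision as the packedEqualInt32x4 + vmovmskps sequence: take the OOL
    // path whenever a lane produced the invalid pattern. A lane that legitimately
    // converts to INT32_MIN also goes OOL and is accepted there by the range check.
    static bool conversionNeedsOolCheck(const float in[4], int32_t out[4])
    {
        bool sawInvalidPattern = false;
        for (int i = 0; i < 4; i++) {
            out[i] = cvttss2siLane(in[i]);
            if (out[i] == INT32_MIN)
                sawInvalidPattern = true;
        }
        return sawInvalidPattern;
    }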
+
+void
+MacroAssemblerX86Shared::oolConvertFloat32x4ToInt32x4(FloatRegister src, Register temp,
+ Label* rejoin, Label* onConversionError)
+{
+ static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f);
+ static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f);
+
+ ScratchSimd128Scope scratch(asMasm());
+ asMasm().loadConstantSimd128Float(Int32MinX4, scratch);
+ vcmpleps(Operand(src), scratch, scratch);
+ vmovmskps(scratch, temp);
+ cmp32(temp, Imm32(15));
+ j(Assembler::NotEqual, onConversionError);
+
+ asMasm().loadConstantSimd128Float(Int32MaxX4, scratch);
+ vcmpleps(Operand(src), scratch, scratch);
+ vmovmskps(scratch, temp);
+ cmp32(temp, Imm32(0));
+ j(Assembler::NotEqual, onConversionError);
+
+ jump(rejoin);
+}
+
+void
+MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(FloatRegister in, FloatRegister out,
+ Register temp, FloatRegister tempF,
+ Label* failed)
+{
+ // Classify lane values into 4 disjoint classes:
+ //
+ // N-lanes: in <= -1.0
+ // A-lanes: -1.0 < in <= 0x0.ffffffp31
+ // B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32
+ // V-lanes: 0x1.0p32 <= in, or isnan(in)
+ //
+ // We need to bail out to throw a RangeError if we see any N-lanes or
+ // V-lanes.
+ //
+ // For A-lanes and B-lanes, we make two float -> int32 conversions:
+ //
+ // A = cvttps2dq(in)
+ // B = cvttps2dq(in - 0x1.0p31f)
+ //
+ // Note that the subtraction for the B computation is exact for B-lanes.
+ // There is no rounding, so B is the low 31 bits of the correctly converted
+ // result.
+ //
+ // The cvttps2dq instruction produces 0x80000000 when the input is NaN or
+ // out of range for a signed int32_t. This conveniently provides the missing
+ // high bit for B, so the desired result is A for A-lanes and A|B for
+ // B-lanes.
+
+ ScratchSimd128Scope scratch(asMasm());
+
+ // TODO: If the majority of lanes are A-lanes, it could be faster to compute
+ // A first, use vmovmskps to check for any non-A-lanes and handle them in
+ // ool code. OTOH, if we're wrong about the lane distribution, that would be
+ // slower.
+
+ // Compute B in |scratch|.
+ static const float Adjust = 0x80000000; // 0x1.0p31f for the benefit of MSVC.
+ static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust);
+ asMasm().loadConstantSimd128Float(Bias, scratch);
+ packedAddFloat32(Operand(in), scratch);
+ convertFloat32x4ToInt32x4(scratch, scratch);
+
+ // Compute A in |out|. This is the last time we use |in| and the first time
+ // we use |out|, so we can tolerate if they are the same register.
+ convertFloat32x4ToInt32x4(in, out);
+
+ // We can identify A-lanes by the sign bits in A: Any A-lanes will be
+ // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
+ // mask of non-A-lanes into |tempF|.
+ zeroSimd128Float(tempF);
+ packedGreaterThanInt32x4(Operand(out), tempF);
+
+ // Clear the A-lanes in B.
+ bitwiseAndSimdInt(scratch, Operand(tempF), scratch);
+
+ // Compute the final result: A for A-lanes, A|B for B-lanes.
+ bitwiseOrSimdInt(out, Operand(scratch), out);
+
+ // We still need to filter out the V-lanes. They would show up as 0x80000000
+ // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
+ // the remaining negative lanes in B.
+ vmovmskps(scratch, temp);
+ cmp32(temp, Imm32(0));
+ j(Assembler::NotEqual, failed);
+}
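
The lane classification above is easier to check in scalar form; here is the same A/B trick per lane, not part of the patch, again assuming cvttps2dq returns 0x80000000 for NaN or out-of-range inputs.

    #include <cmath>
    #include <cstdint>

    static int32_t cvt(float x)
    {
        if (std::isnan(x) || x >= 2147483648.0f || x < -2147483648.0f)
            return INT32_MIN;   // integer indefinite pattern 0x80000000
        return int32_t(x);
    }

    // Returns false for N- and V-lanes, where the vector code jumps to |failed|.
    static bool convertLaneToUint32(float in, uint32_t* out)
    {
        int32_t A = cvt(in);                  // A = cvttps2dq(in)
        int32_t B = cvt(in - 2147483648.0f);  // B = cvttps2dq(in - 0x1.0p31f), exact for B-lanes
        if (A >= 0) {                         // A-lane: sign bit of A is clear
            *out = uint32_t(A);
            return true;
        }
        if (B < 0)                            // remaining negative B lanes are N- or V-lanes
            return false;
        *out = uint32_t(A) | uint32_t(B);     // A is 0x80000000 here: the missing high bit
        return true;
    }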
+
+void
+MacroAssemblerX86Shared::createInt32x4(Register lane0, Register lane1, Register lane2,
+ Register lane3, FloatRegister dest)
+{
+ if (AssemblerX86Shared::HasSSE41()) {
+ vmovd(lane0, dest);
+ vpinsrd(1, lane1, dest, dest);
+ vpinsrd(2, lane2, dest, dest);
+ vpinsrd(3, lane3, dest, dest);
+ return;
+ }
+
+ asMasm().reserveStack(Simd128DataSize);
+ store32(lane0, Address(StackPointer, 0 * sizeof(int32_t)));
+ store32(lane1, Address(StackPointer, 1 * sizeof(int32_t)));
+ store32(lane2, Address(StackPointer, 2 * sizeof(int32_t)));
+ store32(lane3, Address(StackPointer, 3 * sizeof(int32_t)));
+ loadAlignedSimd128Int(Address(StackPointer, 0), dest);
+ asMasm().freeStack(Simd128DataSize);
+}
+
+void
+MacroAssemblerX86Shared::createFloat32x4(FloatRegister lane0, FloatRegister lane1,
+ FloatRegister lane2, FloatRegister lane3,
+ FloatRegister temp, FloatRegister output)
+{
+ FloatRegister lane0Copy = reusedInputFloat32x4(lane0, output);
+ FloatRegister lane1Copy = reusedInputFloat32x4(lane1, temp);
+ vunpcklps(lane3, lane1Copy, temp);
+ vunpcklps(lane2, lane0Copy, output);
+ vunpcklps(temp, output, output);
+}
+
+void
+MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output)
+{
+ vmovd(input, output);
+ if (AssemblerX86Shared::HasSSSE3()) {
+ zeroSimd128Int(ScratchSimd128Reg);
+ vpshufb(ScratchSimd128Reg, output, output);
+ } else {
+ // Use two shifts to duplicate the low 8 bits into the low 16 bits.
+ vpsllw(Imm32(8), output, output);
+ vmovdqa(output, ScratchSimd128Reg);
+ vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg);
+ vpor(ScratchSimd128Reg, output, output);
+ // Then do an X8 splat.
+ vpshuflw(0, output, output);
+ vpshufd(0, output, output);
+ }
+}
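
Not part of the patch: the pre-SSSE3 path modeled on ordinary integers, to show how the two shifts plus the OR duplicate the low byte before the X8 splat; names are illustrative.

    #include <cstdint>

    static uint16_t duplicateLowByte(uint16_t w)
    {
        uint16_t hi = uint16_t(w << 8);   // vpsllw $8: low byte moved to the high byte
        uint16_t lo = uint16_t(hi >> 8);  // vpsrlw $8 on a copy: high byte cleared
        return uint16_t(hi | lo);         // vpor: both bytes now equal the original low byte
    }

    static void splatX16Reference(uint8_t value, uint8_t out[16])
    {
        uint16_t word = duplicateLowByte(value);  // low 16 bits after vmovd
        for (int i = 0; i < 8; i++) {             // vpshuflw(0) + vpshufd(0): the X8 splat
            out[2 * i] = uint8_t(word & 0xff);
            out[2 * i + 1] = uint8_t(word >> 8);
        }
    }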
+
+void
+MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output)
+{
+ vmovd(input, output);
+ vpshuflw(0, output, output);
+ vpshufd(0, output, output);
+}
+
+void
+MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output)
+{
+ vmovd(input, output);
+ vpshufd(0, output, output);
+}
+
+void
+MacroAssemblerX86Shared::splatX4(FloatRegister input, FloatRegister output)
+{
+ FloatRegister inputCopy = reusedInputFloat32x4(input, output);
+ vshufps(0, inputCopy, inputCopy, output);
+}
+
+void
+MacroAssemblerX86Shared::reinterpretSimd(bool isIntegerLaneType, FloatRegister input,
+ FloatRegister output)
+{
+ if (input.aliases(output))
+ return;
+ if (isIntegerLaneType)
+ vmovdqa(input, output);
+ else
+ vmovaps(input, output);
+}
+
+void
+MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input, Register output, unsigned lane)
+{
+ if (lane == 0) {
+ // The value we want to extract is in the low double-word
+ moveLowInt32(input, output);
+ } else if (AssemblerX86Shared::HasSSE41()) {
+ vpextrd(lane, input, output);
+ } else {
+ uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
+ shuffleInt32(mask, input, ScratchSimd128Reg);
+ moveLowInt32(ScratchSimd128Reg, output);
+ }
+}
+
+void
+MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input, FloatRegister output,
+ unsigned lane, bool canonicalize)
+{
+ if (lane == 0) {
+ // The value we want to extract is in the low double-word
+ if (input != output)
+ moveFloat32(input, output);
+ } else if (lane == 2) {
+ moveHighPairToLowPairFloat32(input, output);
+ } else {
+ uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
+ shuffleFloat32(mask, input, output);
+ }
+ // NaNs contained within SIMD values are not enforced to be canonical, so
+ // when we extract an element into a "regular" scalar JS value, we have to
+ // canonicalize. In wasm code, we can skip this, as wasm only has to
+ // canonicalize NaNs at FFI boundaries.
+ if (canonicalize)
+ asMasm().canonicalizeFloat(output);
+}
+
+void
+MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input, Register output, unsigned lane,
+ SimdSign sign)
+{
+ // Unlike pextrd and pextrb, this is available in SSE2.
+ vpextrw(lane, input, output);
+ if (sign == SimdSign::Signed)
+ movswl(output, output);
+}
+
+void
+MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input, Register output, unsigned lane,
+ SimdSign sign)
+{
+ if (AssemblerX86Shared::HasSSE41()) {
+ vpextrb(lane, input, output);
+ // vpextrb clears the high bits, so no further extension required.
+ if (sign == SimdSign::Unsigned)
+ sign = SimdSign::NotApplicable;
+ } else {
+ // Extract the 16-bit word containing our lane, then, for odd lanes,
+ // shift its high byte into place.
+ extractLaneInt16x8(input, output, lane / 2, SimdSign::Unsigned);
+ if (lane % 2) {
+ shrl(Imm32(8), output);
+ // The shrl handles the zero-extension. Don't repeat it.
+ if (sign == SimdSign::Unsigned)
+ sign = SimdSign::NotApplicable;
+ }
+ }
+
+ // We have the right low 8 bits in |output|, but we may need to fix the high
+ // bits. Note that this requires |output| to be one of the %eax-%edx
+ // registers.
+ switch (sign) {
+ case SimdSign::Signed:
+ movsbl(output, output);
+ break;
+ case SimdSign::Unsigned:
+ movzbl(output, output);
+ break;
+ case SimdSign::NotApplicable:
+ // No adjustment needed.
+ break;
+ }
+}
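
For reference, a scalar sketch (not in the patch) of the SSE2 fallback: fetch the containing 16-bit word, shift for odd lanes, then sign- or zero-extend the byte.

    #include <cstdint>

    enum class Sign { Signed, Unsigned };

    static int32_t extractLaneInt8x16Reference(const uint8_t lanes[16], unsigned lane, Sign sign)
    {
        // vpextrw: the word holding our byte, little endian within the vector.
        uint32_t word = uint32_t(lanes[lane & ~1u]) | (uint32_t(lanes[lane | 1u]) << 8);
        if (lane % 2)
            word >>= 8;                    // shrl $8: move the high byte down
        uint8_t byte = uint8_t(word);
        return sign == Sign::Signed ? int32_t(int8_t(byte))   // movsbl
                                    : int32_t(byte);          // movzbl
    }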
+
+void
+MacroAssemblerX86Shared::extractLaneSimdBool(FloatRegister input, Register output, unsigned numLanes,
+ unsigned lane)
+{
+ switch (numLanes) {
+ case 4:
+ extractLaneInt32x4(input, output, lane);
+ break;
+ case 8:
+ // Get a lane, don't bother fixing the high bits since we'll mask below.
+ extractLaneInt16x8(input, output, lane, SimdSign::NotApplicable);
+ break;
+ case 16:
+ extractLaneInt8x16(input, output, lane, SimdSign::NotApplicable);
+ break;
+ default:
+ MOZ_CRASH("Unhandled SIMD number of lanes");
+ }
+ // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits.
+ asMasm().and32(Imm32(1), output);
+}
+
+void
+MacroAssemblerX86Shared::insertLaneSimdInt(FloatRegister input, Register value, FloatRegister output,
+ unsigned lane, unsigned numLanes)
+{
+ if (numLanes == 8) {
+ // Available in SSE2.
+ vpinsrw(lane, value, input, output);
+ return;
+ }
+
+ // Note that, unlike float32x4, we cannot use vmovd if the inserted
+ // value goes into the first component, as vmovd clears out the higher lanes
+ // of the output.
+ if (AssemblerX86Shared::HasSSE41()) {
+ // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX.
+ switch (numLanes) {
+ case 4:
+ vpinsrd(lane, value, input, output);
+ return;
+ case 16:
+ vpinsrb(lane, value, input, output);
+ return;
+ }
+ }
+
+ asMasm().reserveStack(Simd128DataSize);
+ storeAlignedSimd128Int(input, Address(StackPointer, 0));
+ switch (numLanes) {
+ case 4:
+ store32(value, Address(StackPointer, lane * sizeof(int32_t)));
+ break;
+ case 16:
+ // Note that this requires `value` to be in one of the registers where the
+ // low 8 bits are addressable (%eax - %edx on x86, all of them on x86-64).
+ store8(value, Address(StackPointer, lane * sizeof(int8_t)));
+ break;
+ default:
+ MOZ_CRASH("Unsupported SIMD numLanes");
+ }
+ loadAlignedSimd128Int(Address(StackPointer, 0), output);
+ asMasm().freeStack(Simd128DataSize);
+}
+
+void
+MacroAssemblerX86Shared::insertLaneFloat32x4(FloatRegister input, FloatRegister value,
+ FloatRegister output, unsigned lane)
+{
+ if (lane == 0) {
+ // As both operands are registers, vmovss doesn't modify the upper bits
+ // of the destination operand.
+ if (value != output)
+ vmovss(value, input, output);
+ return;
+ }
+
+ if (AssemblerX86Shared::HasSSE41()) {
+ // The input value is in the low float32 of the 'value' FloatRegister.
+ vinsertps(vinsertpsMask(0, lane), value, output, output);
+ return;
+ }
+
+ asMasm().reserveStack(Simd128DataSize);
+ storeAlignedSimd128Float(input, Address(StackPointer, 0));
+ asMasm().storeFloat32(value, Address(StackPointer, lane * sizeof(int32_t)));
+ loadAlignedSimd128Float(Address(StackPointer, 0), output);
+ asMasm().freeStack(Simd128DataSize);
+}
+
+void
+MacroAssemblerX86Shared::allTrueSimdBool(FloatRegister input, Register output)
+{
+ // We know that the input lanes are boolean, so they are either 0 or -1.
+ // The all-true vector has all 128 bits set, no matter the lane geometry.
+ vpmovmskb(input, output);
+ cmp32(output, Imm32(0xffff));
+ emitSet(Assembler::Zero, output);
+}
+
+void
+MacroAssemblerX86Shared::anyTrueSimdBool(FloatRegister input, Register output)
+{
+ vpmovmskb(input, output);
+ cmp32(output, Imm32(0x0));
+ emitSet(Assembler::NonZero, output);
+}
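
A scalar sketch (not in the patch) of how both predicates fall out of vpmovmskb on 0/-1 boolean lanes.

    #include <cstdint>

    static uint32_t movmskb(const uint8_t bytes[16])
    {
        uint32_t mask = 0;
        for (int i = 0; i < 16; i++)
            mask |= uint32_t(bytes[i] >> 7) << i;   // one sign bit per byte
        return mask;
    }

    static bool allTrue(const uint8_t bytes[16]) { return movmskb(bytes) == 0xffff; }
    static bool anyTrue(const uint8_t bytes[16]) { return movmskb(bytes) != 0; }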
+
+void
+MacroAssemblerX86Shared::swizzleInt32x4(FloatRegister input, FloatRegister output,
+ unsigned lanes[4])
+{
+ uint32_t mask = MacroAssembler::ComputeShuffleMask(lanes[0], lanes[1], lanes[2], lanes[3]);
+ shuffleInt32(mask, input, output);
+}
+
+void
+MacroAssemblerX86Shared::swizzleInt8x16(FloatRegister input, FloatRegister output,
+ const Maybe<Register>& temp, int8_t lanes[16])
+{
+ if (AssemblerX86Shared::HasSSSE3()) {
+ ScratchSimd128Scope scratch(asMasm());
+ asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(lanes), scratch);
+ FloatRegister inputCopy = reusedInputInt32x4(input, output);
+ vpshufb(scratch, inputCopy, output);
+ return;
+ }
+
+ // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
+ MOZ_ASSERT(!!temp, "needs a temp for the memory fallback");
+ asMasm().reserveStack(2 * Simd128DataSize);
+ storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize));
+ for (unsigned i = 0; i < 16; i++) {
+ load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *temp);
+ store8(*temp, Address(StackPointer, i));
+ }
+ loadAlignedSimd128Int(Address(StackPointer, 0), output);
+ asMasm().freeStack(2 * Simd128DataSize);
+}
+
+static inline bool
+LanesMatch(unsigned lanes[4], unsigned x, unsigned y, unsigned z, unsigned w)
+{
+ return lanes[0] == x && lanes[1] == y && lanes[2] == z && lanes[3] == w;
+}
+
+void
+MacroAssemblerX86Shared::swizzleFloat32x4(FloatRegister input, FloatRegister output,
+ unsigned lanes[4])
+{
+ if (AssemblerX86Shared::HasSSE3()) {
+ if (LanesMatch(lanes, 0, 0, 2, 2)) {
+ vmovsldup(input, output);
+ return;
+ }
+ if (LanesMatch(lanes, 1, 1, 3, 3)) {
+ vmovshdup(input, output);
+ return;
+ }
+ }
+
+ // TODO Here and below, arch-specific lowering could identify this pattern
+ // and use defineReuseInput to avoid this move (bug 1084404)
+ if (LanesMatch(lanes, 2, 3, 2, 3)) {
+ FloatRegister inputCopy = reusedInputFloat32x4(input, output);
+ vmovhlps(input, inputCopy, output);
+ return;
+ }
+
+ if (LanesMatch(lanes, 0, 1, 0, 1)) {
+ if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
+ vmovddup(input, output);
+ return;
+ }
+ FloatRegister inputCopy = reusedInputFloat32x4(input, output);
+ vmovlhps(input, inputCopy, output);
+ return;
+ }
+
+ if (LanesMatch(lanes, 0, 0, 1, 1)) {
+ FloatRegister inputCopy = reusedInputFloat32x4(input, output);
+ vunpcklps(input, inputCopy, output);
+ return;
+ }
+
+ if (LanesMatch(lanes, 2, 2, 3, 3)) {
+ FloatRegister inputCopy = reusedInputFloat32x4(input, output);
+ vunpckhps(input, inputCopy, output);
+ return;
+ }
+
+ uint32_t x = lanes[0];
+ uint32_t y = lanes[1];
+ uint32_t z = lanes[2];
+ uint32_t w = lanes[3];
+
+ uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
+ shuffleFloat32(mask, input, output);
+}
+
+void
+MacroAssemblerX86Shared::shuffleInt8x16(FloatRegister lhs, FloatRegister rhs, FloatRegister output,
+ const Maybe<FloatRegister>& maybeFloatTemp,
+ const Maybe<Register>& maybeTemp, uint8_t lanes[16])
+{
+ DebugOnly<bool> hasSSSE3 = AssemblerX86Shared::HasSSSE3();
+ MOZ_ASSERT(hasSSSE3 == !!maybeFloatTemp);
+ MOZ_ASSERT(!hasSSSE3 == !!maybeTemp);
+
+ // Use pshufb if it is available.
+ if (AssemblerX86Shared::HasSSSE3()) {
+ ScratchSimd128Scope scratch(asMasm());
+
+ // Use pshufb instructions to gather the lanes from each source vector.
+ // A negative index creates a zero lane, so the two vectors can be combined.
+
+ // Set scratch = lanes from lhs.
+ int8_t idx[16];
+ for (unsigned i = 0; i < 16; i++)
+ idx[i] = lanes[i] < 16 ? lanes[i] : -1;
+ asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx), *maybeFloatTemp);
+ FloatRegister lhsCopy = reusedInputInt32x4(lhs, scratch);
+ vpshufb(*maybeFloatTemp, lhsCopy, scratch);
+
+ // Set output = lanes from rhs.
+ for (unsigned i = 0; i < 16; i++)
+ idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
+ asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx), *maybeFloatTemp);
+ FloatRegister rhsCopy = reusedInputInt32x4(rhs, output);
+ vpshufb(*maybeFloatTemp, rhsCopy, output);
+
+ // Combine.
+ vpor(scratch, output, output);
+ return;
+ }
+
+ // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
+ asMasm().reserveStack(3 * Simd128DataSize);
+ storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize));
+ storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize));
+ for (unsigned i = 0; i < 16; i++) {
+ load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *maybeTemp);
+ store8(*maybeTemp, Address(StackPointer, i));
+ }
+ loadAlignedSimd128Int(Address(StackPointer, 0), output);
+ asMasm().freeStack(3 * Simd128DataSize);
+}
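
Not part of the patch: the two-pshufb combine written out on byte arrays, which may make the trick easier to see, since a control byte with its high bit set zeroes the lane and the two halves can simply be OR'd.

    #include <cstdint>

    static void pshufb(const uint8_t src[16], const int8_t ctrl[16], uint8_t out[16])
    {
        for (int i = 0; i < 16; i++)
            out[i] = ctrl[i] < 0 ? 0 : src[ctrl[i] & 15];
    }

    static void shuffleInt8x16Reference(const uint8_t lhs[16], const uint8_t rhs[16],
                                        const uint8_t lanes[16], uint8_t out[16])
    {
        int8_t fromLhs[16], fromRhs[16];
        for (int i = 0; i < 16; i++) {
            fromLhs[i] = lanes[i] < 16 ? int8_t(lanes[i]) : int8_t(-1);
            fromRhs[i] = lanes[i] >= 16 ? int8_t(lanes[i] - 16) : int8_t(-1);
        }
        uint8_t a[16], b[16];
        pshufb(lhs, fromLhs, a);   // lanes taken from lhs, zeros elsewhere
        pshufb(rhs, fromRhs, b);   // lanes taken from rhs, zeros elsewhere
        for (int i = 0; i < 16; i++)
            out[i] = a[i] | b[i];  // vpor: combine the two halves
    }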
+
+void
+MacroAssemblerX86Shared::shuffleX4(FloatRegister lhs, Operand rhs, FloatRegister out,
+ const Maybe<FloatRegister>& maybeTemp, unsigned lanes[4])
+{
+ uint32_t x = lanes[0];
+ uint32_t y = lanes[1];
+ uint32_t z = lanes[2];
+ uint32_t w = lanes[3];
+
+ // Check that lanes come from LHS in majority:
+ unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
+ MOZ_ASSERT(numLanesFromLHS >= 2);
+
+ // When reading this method, remember that vshufps takes the two first
+ // inputs of the destination operand (right operand) and the two last
+ // inputs of the source operand (left operand).
+ //
+ // Legend for explanations:
+ // - L: LHS
+ // - R: RHS
+ // - T: temporary
+
+ uint32_t mask;
+
+ // If all lanes came from a single vector, we should use swizzle instead.
+ MOZ_ASSERT(numLanesFromLHS < 4);
+
+ // If all values stay in their lane, this is a blend.
+ if (AssemblerX86Shared::HasSSE41()) {
+ if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
+ vblendps(blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out);
+ return;
+ }
+ }
+
+ // One element of the second, all other elements of the first
+ if (numLanesFromLHS == 3) {
+ unsigned firstMask = -1, secondMask = -1;
+
+ // register-register vmovss preserves the high lanes.
+ if (LanesMatch(lanes, 4, 1, 2, 3) && rhs.kind() == Operand::FPREG) {
+ vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out);
+ return;
+ }
+
+ // SSE4.1 vinsertps can handle any single element.
+ unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
+ if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
+ unsigned srcLane;
+ unsigned dstLane;
+ if (x >= 4) {
+ srcLane = x - 4;
+ dstLane = 0;
+ } else if (y >= 4) {
+ srcLane = y - 4;
+ dstLane = 1;
+ } else if (z >= 4) {
+ srcLane = z - 4;
+ dstLane = 2;
+ } else {
+ MOZ_ASSERT(w >= 4);
+ srcLane = w - 4;
+ dstLane = 3;
+ }
+ vinsertps(vinsertpsMask(srcLane, dstLane), rhs, lhs, out);
+ return;
+ }
+
+ MOZ_ASSERT(!!maybeTemp);
+ FloatRegister rhsCopy = *maybeTemp;
+ loadAlignedSimd128Float(rhs, rhsCopy);
+
+ if (x < 4 && y < 4) {
+ if (w >= 4) {
+ w %= 4;
+ // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy)
+ firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
+ // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out)
+ secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0);
+ } else {
+ MOZ_ASSERT(z >= 4);
+ z %= 4;
+ // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy)
+ firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
+ // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out)
+ secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2);
+ }
+
+ vshufps(firstMask, lhs, rhsCopy, rhsCopy);
+ vshufps(secondMask, rhsCopy, lhs, out);
+ return;
+ }
+
+ MOZ_ASSERT(z < 4 && w < 4);
+
+ if (y >= 4) {
+ y %= 4;
+ // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy)
+ firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
+ // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out)
+ secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w);
+ } else {
+ MOZ_ASSERT(x >= 4);
+ x %= 4;
+ // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy)
+ firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
+ // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out)
+ secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w);
+ }
+
+ vshufps(firstMask, lhs, rhsCopy, rhsCopy);
+ if (AssemblerX86Shared::HasAVX()) {
+ vshufps(secondMask, lhs, rhsCopy, out);
+ } else {
+ vshufps(secondMask, lhs, rhsCopy, rhsCopy);
+ moveSimd128Float(rhsCopy, out);
+ }
+ return;
+ }
+
+ // Two elements from one vector, two other elements from the other
+ MOZ_ASSERT(numLanesFromLHS == 2);
+
+ // TODO Here and below, the symmetric case would be handier since it avoids
+ // a move, but it can't be reached because operands would get swapped (bug 1084404).
+ if (LanesMatch(lanes, 2, 3, 6, 7)) {
+ ScratchSimd128Scope scratch(asMasm());
+ if (AssemblerX86Shared::HasAVX()) {
+ FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch);
+ vmovhlps(lhs, rhsCopy, out);
+ } else {
+ loadAlignedSimd128Float(rhs, scratch);
+ vmovhlps(lhs, scratch, scratch);
+ moveSimd128Float(scratch, out);
+ }
+ return;
+ }
+
+ if (LanesMatch(lanes, 0, 1, 4, 5)) {
+ FloatRegister rhsCopy;
+ ScratchSimd128Scope scratch(asMasm());
+ if (rhs.kind() == Operand::FPREG) {
+ // No need to make an actual copy, since the operand is already
+ // in a register, and it won't be clobbered by the vmovlhps.
+ rhsCopy = FloatRegister::FromCode(rhs.fpu());
+ } else {
+ loadAlignedSimd128Float(rhs, scratch);
+ rhsCopy = scratch;
+ }
+ vmovlhps(rhsCopy, lhs, out);
+ return;
+ }
+
+ if (LanesMatch(lanes, 0, 4, 1, 5)) {
+ vunpcklps(rhs, lhs, out);
+ return;
+ }
+
+ // TODO swapped case would be better (bug 1084404)
+ if (LanesMatch(lanes, 4, 0, 5, 1)) {
+ ScratchSimd128Scope scratch(asMasm());
+ if (AssemblerX86Shared::HasAVX()) {
+ FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch);
+ vunpcklps(lhs, rhsCopy, out);
+ } else {
+ loadAlignedSimd128Float(rhs, scratch);
+ vunpcklps(lhs, scratch, scratch);
+ moveSimd128Float(scratch, out);
+ }
+ return;
+ }
+
+ if (LanesMatch(lanes, 2, 6, 3, 7)) {
+ vunpckhps(rhs, lhs, out);
+ return;
+ }
+
+ // TODO swapped case would be better (bug 1084404)
+ if (LanesMatch(lanes, 6, 2, 7, 3)) {
+ ScratchSimd128Scope scratch(asMasm());
+ if (AssemblerX86Shared::HasAVX()) {
+ FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch);
+ vunpckhps(lhs, rhsCopy, out);
+ } else {
+ loadAlignedSimd128Float(rhs, scratch);
+ vunpckhps(lhs, scratch, scratch);
+ moveSimd128Float(scratch, out);
+ }
+ return;
+ }
+
+ // In one vshufps
+ if (x < 4 && y < 4) {
+ mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
+ vshufps(mask, rhs, lhs, out);
+ return;
+ }
+
+ // At creation, we should have explicitly swapped in this case.
+ MOZ_ASSERT(!(z >= 4 && w >= 4));
+
+ // In two vshufps, for the most generic case:
+ uint32_t firstMask[4], secondMask[4];
+ unsigned i = 0, j = 2, k = 0;
+
+#define COMPUTE_MASK(lane) \
+ if (lane >= 4) { \
+ firstMask[j] = lane % 4; \
+ secondMask[k++] = j++; \
+ } else { \
+ firstMask[i] = lane; \
+ secondMask[k++] = i++; \
+ }
+
+ COMPUTE_MASK(x)
+ COMPUTE_MASK(y)
+ COMPUTE_MASK(z)
+ COMPUTE_MASK(w)
+#undef COMPUTE_MASK
+
+ MOZ_ASSERT(i == 2 && j == 4 && k == 4);
+
+ mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
+ firstMask[2], firstMask[3]);
+ vshufps(mask, rhs, lhs, lhs);
+
+ mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
+ secondMask[2], secondMask[3]);
+ vshufps(mask, lhs, lhs, lhs);
+}
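
Since the comments above lean on the vshufps selection rule, here is that rule as a scalar sketch (not in the patch), with the arguments in the same order as the masm call vshufps(mask, src, dest, out); the mask layout assumes ComputeShuffleMask packs x into the low two bits.

    #include <cstdint>

    static uint32_t shuffleMask(unsigned x, unsigned y, unsigned z, unsigned w)
    {
        return (w << 6) | (z << 4) | (y << 2) | x;
    }

    static void vshufpsReference(uint32_t mask, const float src[4], const float dest[4],
                                 float out[4])
    {
        out[0] = dest[(mask >> 0) & 3];   // low two result lanes come from |dest|
        out[1] = dest[(mask >> 2) & 3];
        out[2] = src[(mask >> 4) & 3];    // high two result lanes come from |src|
        out[3] = src[(mask >> 6) & 3];
    }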
+
+static inline FloatRegister
+ToSimdFloatRegister(const Operand& op)
+{
+ return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128);
+}
+
+void
+MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+ FloatRegister output)
+{
+ static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
+ ScratchSimd128Scope scratch(asMasm());
+ switch (cond) {
+ case Assembler::Condition::GreaterThan:
+ vpcmpgtb(rhs, lhs, output);
+ break;
+ case Assembler::Condition::Equal:
+ vpcmpeqb(rhs, lhs, output);
+ break;
+ case Assembler::Condition::LessThan:
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG)
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ else
+ loadAlignedSimd128Int(rhs, scratch);
+
+ // src := src > lhs (i.e. lhs < rhs)
+ // Improve by doing custom lowering (rhs is tied to the output register)
+ vpcmpgtb(Operand(lhs), scratch, scratch);
+ moveSimd128Int(scratch, output);
+ break;
+ case Assembler::Condition::NotEqual:
+ // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
+ // should invert the comparison by, e.g. swapping the arms of a select
+ // if that's what it's used in.
+ asMasm().loadConstantSimd128Int(allOnes, scratch);
+ vpcmpeqb(rhs, lhs, output);
+ bitwiseXorSimdInt(output, Operand(scratch), output);
+ break;
+ case Assembler::Condition::GreaterThanOrEqual:
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG)
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ else
+ loadAlignedSimd128Int(rhs, scratch);
+ vpcmpgtb(Operand(lhs), scratch, scratch);
+ asMasm().loadConstantSimd128Int(allOnes, output);
+ bitwiseXorSimdInt(output, Operand(scratch), output);
+ break;
+ case Assembler::Condition::LessThanOrEqual:
+ // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
+ asMasm().loadConstantSimd128Int(allOnes, scratch);
+ vpcmpgtb(rhs, lhs, output);
+ bitwiseXorSimdInt(output, Operand(scratch), output);
+ break;
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+}
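
A per-lane sketch (not in the patch) of how all six conditions are built from the two available instructions, pcmpgt and pcmpeq, plus an xor with all-ones; the condition encoding here is made up for illustration.

    #include <cstdint>

    // One lane of the comparisons above, with -1 as "true" and 0 as "false".
    static int8_t compareLane(int8_t lhs, int8_t rhs, char cond)
    {
        int8_t gt = lhs > rhs ? -1 : 0;   // vpcmpgtb(rhs, lhs, ...)
        int8_t lt = rhs > lhs ? -1 : 0;   // vpcmpgtb with the operands swapped
        int8_t eq = lhs == rhs ? -1 : 0;  // vpcmpeqb(rhs, lhs, ...)
        switch (cond) {
          case '>': return gt;
          case '<': return lt;
          case '=': return eq;
          case '!': return int8_t(~eq);   // xor with all-ones inverts the lane
          case 'G': return int8_t(~lt);   // lhs >= rhs  ==  !(lhs < rhs)
          case 'L': return int8_t(~gt);   // lhs <= rhs  ==  !(lhs > rhs)
          default:  return 0;
        }
    }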
+
+void
+MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+ FloatRegister output)
+{
+ static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
+
+ ScratchSimd128Scope scratch(asMasm());
+ switch (cond) {
+ case Assembler::Condition::GreaterThan:
+ vpcmpgtw(rhs, lhs, output);
+ break;
+ case Assembler::Condition::Equal:
+ vpcmpeqw(rhs, lhs, output);
+ break;
+ case Assembler::Condition::LessThan:
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG)
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ else
+ loadAlignedSimd128Int(rhs, scratch);
+
+ // src := src > lhs (i.e. lhs < rhs)
+ // Improve by doing custom lowering (rhs is tied to the output register)
+ vpcmpgtw(Operand(lhs), scratch, scratch);
+ moveSimd128Int(scratch, output);
+ break;
+ case Assembler::Condition::NotEqual:
+ // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
+ // should invert the comparison by, e.g. swapping the arms of a select
+ // if that's what it's used in.
+ asMasm().loadConstantSimd128Int(allOnes, scratch);
+ vpcmpeqw(rhs, lhs, output);
+ bitwiseXorSimdInt(output, Operand(scratch), output);
+ break;
+ case Assembler::Condition::GreaterThanOrEqual:
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG)
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ else
+ loadAlignedSimd128Int(rhs, scratch);
+ vpcmpgtw(Operand(lhs), scratch, scratch);
+ asMasm().loadConstantSimd128Int(allOnes, output);
+ bitwiseXorSimdInt(output, Operand(scratch), output);
+ break;
+ case Assembler::Condition::LessThanOrEqual:
+ // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
+ asMasm().loadConstantSimd128Int(allOnes, scratch);
+ vpcmpgtw(rhs, lhs, output);
+ bitwiseXorSimdInt(output, Operand(scratch), output);
+ break;
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+}
+
+void
+MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+ FloatRegister output)
+{
+ static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
+ ScratchSimd128Scope scratch(asMasm());
+ switch (cond) {
+ case Assembler::Condition::GreaterThan:
+ packedGreaterThanInt32x4(rhs, lhs);
+ break;
+ case Assembler::Condition::Equal:
+ packedEqualInt32x4(rhs, lhs);
+ break;
+ case Assembler::Condition::LessThan:
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG)
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ else
+ loadAlignedSimd128Int(rhs, scratch);
+
+ // src := src > lhs (i.e. lhs < rhs)
+ // Improve by doing custom lowering (rhs is tied to the output register)
+ packedGreaterThanInt32x4(Operand(lhs), scratch);
+ moveSimd128Int(scratch, lhs);
+ break;
+ case Assembler::Condition::NotEqual:
+ // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
+ // should invert the comparison by, e.g. swapping the arms of a select
+ // if that's what it's used in.
+ asMasm().loadConstantSimd128Int(allOnes, scratch);
+ packedEqualInt32x4(rhs, lhs);
+ bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
+ break;
+ case Assembler::Condition::GreaterThanOrEqual:
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG)
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ else
+ loadAlignedSimd128Int(rhs, scratch);
+ packedGreaterThanInt32x4(Operand(lhs), scratch);
+ asMasm().loadConstantSimd128Int(allOnes, lhs);
+ bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
+ break;
+ case Assembler::Condition::LessThanOrEqual:
+ // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
+ asMasm().loadConstantSimd128Int(allOnes, scratch);
+ packedGreaterThanInt32x4(rhs, lhs);
+ bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
+ break;
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+}
+
+void
+MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+ FloatRegister output)
+{
+ switch (cond) {
+ case Assembler::Condition::Equal:
+ vcmpeqps(rhs, lhs, output);
+ break;
+ case Assembler::Condition::LessThan:
+ vcmpltps(rhs, lhs, output);
+ break;
+ case Assembler::Condition::LessThanOrEqual:
+ vcmpleps(rhs, lhs, output);
+ break;
+ case Assembler::Condition::NotEqual:
+ vcmpneqps(rhs, lhs, output);
+ break;
+ case Assembler::Condition::GreaterThanOrEqual:
+ case Assembler::Condition::GreaterThan:
+ // We reverse these before register allocation so that we don't have to
+ // copy into and out of temporaries after codegen.
+ MOZ_CRASH("should have reversed this");
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+}
+
+void
+MacroAssemblerX86Shared::mulInt32x4(FloatRegister lhs, Operand rhs,
+ const Maybe<FloatRegister>& temp, FloatRegister output)
+{
+ if (AssemblerX86Shared::HasSSE41()) {
+ vpmulld(rhs, lhs, output);
+ return;
+ }
+
+ ScratchSimd128Scope scratch(asMasm());
+ loadAlignedSimd128Int(rhs, scratch);
+ vpmuludq(lhs, scratch, scratch);
+ // scratch contains (Rx, _, Rz, _) where R is the resulting vector.
+
+ MOZ_ASSERT(!!temp);
+ vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), lhs, lhs);
+ vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), rhs, *temp);
+ vpmuludq(*temp, lhs, lhs);
+ // lhs contains (Ry, _, Rw, _) where R is the resulting vector.
+
+ vshufps(MacroAssembler::ComputeShuffleMask(0, 2, 0, 2), scratch, lhs, lhs);
+ // lhs contains (Ry, Rw, Rx, Rz)
+ vshufps(MacroAssembler::ComputeShuffleMask(2, 0, 3, 1), lhs, lhs, lhs);
+}
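
Not part of the patch: the same pre-SSE4.1 multiply written per lane, to show which products the two vpmuludq steps contribute; the lane result is the low 32 bits of each product, i.e. multiplication modulo 2^32.

    #include <cstdint>

    static void mulInt32x4Reference(const uint32_t lhs[4], const uint32_t rhs[4], uint32_t out[4])
    {
        // vpmuludq on the original vectors: 64-bit products of lanes 0 and 2.
        uint64_t p0 = uint64_t(lhs[0]) * rhs[0];
        uint64_t p2 = uint64_t(lhs[2]) * rhs[2];
        // vpshufd(1, 1, 3, 3) moves lanes 1 and 3 into the even slots, then a
        // second vpmuludq gives their products.
        uint64_t p1 = uint64_t(lhs[1]) * rhs[1];
        uint64_t p3 = uint64_t(lhs[3]) * rhs[3];
        // The two trailing vshufps just put the four low halves back in lane order.
        out[0] = uint32_t(p0);
        out[1] = uint32_t(p1);
        out[2] = uint32_t(p2);
        out[3] = uint32_t(p3);
    }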
+
+void
+MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output)
+{
+ ScratchSimd128Scope scratch(asMasm());
+ FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch);
+ vminps(Operand(lhs), rhsCopy, scratch);
+ vminps(rhs, lhs, output);
+ vorps(scratch, output, output); // NaN or'd with arbitrary bits is NaN
+}
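
A one-lane sketch (not in the patch) of the NaN-propagation trick above, assuming the masm convention that vminps(src1, src0, dest) computes dest = minps(src0, src1), and that MINPS returns its second operand whenever the comparison fails (including NaN).

    #include <cstdint>
    #include <cstring>

    static float minpsLane(float dst, float src)
    {
        return dst < src ? dst : src;   // on NaN (or equal zeros) this returns |src|
    }

    static float minWithNaNPropagation(float lhs, float rhs)
    {
        float a = minpsLane(rhs, lhs);  // vminps(Operand(lhs), rhsCopy, scratch)
        float b = minpsLane(lhs, rhs);  // vminps(rhs, lhs, output)
        uint32_t ab, bb, rb;
        std::memcpy(&ab, &a, 4);
        std::memcpy(&bb, &b, 4);
        rb = ab | bb;                   // vorps: a NaN pattern OR'd with anything stays NaN
        float r;
        std::memcpy(&r, &rb, 4);
        return r;
    }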
+
+void
+MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp,
+ FloatRegister output)
+{
+ ScratchSimd128Scope scratch(asMasm());
+ FloatRegister lhsCopy = reusedInputFloat32x4(lhs, scratch);
+ vcmpunordps(rhs, lhsCopy, scratch);
+
+ FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, temp);
+ vmaxps(Operand(lhs), rhsCopy, temp);
+ vmaxps(rhs, lhs, output);
+
+ vandps(temp, output, output);
+ vorps(scratch, output, output); // or in the all-ones NaNs
+}
+
+void
+MacroAssemblerX86Shared::minNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp,
+ FloatRegister output)
+{
+ ScratchSimd128Scope scratch(asMasm());
+ asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), temp);
+
+ FloatRegister mask = scratch;
+ FloatRegister tmpCopy = reusedInputFloat32x4(temp, scratch);
+ vpcmpeqd(Operand(lhs), tmpCopy, mask);
+ vandps(temp, mask, mask);
+
+ FloatRegister lhsCopy = reusedInputFloat32x4(lhs, temp);
+ vminps(rhs, lhsCopy, temp);
+ vorps(mask, temp, temp);
+
+ FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, mask);
+ vcmpneqps(rhs, rhsCopy, mask);
+
+ if (AssemblerX86Shared::HasAVX()) {
+ vblendvps(mask, lhs, temp, output);
+ } else {
+ // Emulate vblendvps.
+ // With SSE4.1 we could use blendvps; however, it's awkward since
+ // it requires the mask to be in xmm0.
+ if (lhs != output)
+ moveSimd128Float(lhs, output);
+ vandps(Operand(mask), output, output);
+ vandnps(Operand(temp), mask, mask);
+ vorps(Operand(mask), output, output);
+ }
+}
+
+void
+MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp,
+ FloatRegister output)
+{
+ ScratchSimd128Scope scratch(asMasm());
+ FloatRegister mask = scratch;
+
+ asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(0), mask);
+ vpcmpeqd(Operand(lhs), mask, mask);
+
+ asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), temp);
+ vandps(temp, mask, mask);
+
+ FloatRegister lhsCopy = reusedInputFloat32x4(lhs, temp);
+ vmaxps(rhs, lhsCopy, temp);
+ vandnps(Operand(temp), mask, mask);
+
+ // Ensure temp always contains the temporary result
+ mask = temp;
+ temp = scratch;
+
+ FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, mask);
+ vcmpneqps(rhs, rhsCopy, mask);
+
+ if (AssemblerX86Shared::HasAVX()) {
+ vblendvps(mask, lhs, temp, output);
+ } else {
+ // Emulate vblendvps.
+ // With SSE4.1 we could use blendvps; however, it's awkward since
+ // it requires the mask to be in xmm0.
+ if (lhs != output)
+ moveSimd128Float(lhs, output);
+ vandps(Operand(mask), output, output);
+ vandnps(Operand(temp), mask, mask);
+ vorps(Operand(mask), output, output);
+ }
+}
+
+void
+MacroAssemblerX86Shared::negFloat32x4(Operand in, FloatRegister out)
+{
+ // All zeros but the sign bit
+ static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f);
+ asMasm().loadConstantSimd128Float(minusZero, out);
+ bitwiseXorFloat32x4(out, in, out);
+}
+
+void
+MacroAssemblerX86Shared::notInt8x16(Operand in, FloatRegister out)
+{
+ static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
+ asMasm().loadConstantSimd128Int(allOnes, out);
+ bitwiseXorSimdInt(out, in, out);
+}
+
+void
+MacroAssemblerX86Shared::notInt16x8(Operand in, FloatRegister out)
+{
+ static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
+ asMasm().loadConstantSimd128Int(allOnes, out);
+ bitwiseXorSimdInt(out, in, out);
+}
+
+void
+MacroAssemblerX86Shared::notInt32x4(Operand in, FloatRegister out)
+{
+ static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
+ asMasm().loadConstantSimd128Int(allOnes, out);
+ bitwiseXorSimdInt(out, in, out);
+}
+
+void
+MacroAssemblerX86Shared::notFloat32x4(Operand in, FloatRegister out)
+{
+ float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits);
+ static const SimdConstant allOnes = SimdConstant::SplatX4(ones);
+ asMasm().loadConstantSimd128Float(allOnes, out);
+ bitwiseXorFloat32x4(out, in, out);
+}
+
+void
+MacroAssemblerX86Shared::absFloat32x4(Operand in, FloatRegister out)
+{
+ // All ones but the sign bit
+ float signMask = SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits);
+ static const SimdConstant signMasks = SimdConstant::SplatX4(signMask);
+ asMasm().loadConstantSimd128Float(signMasks, out);
+ bitwiseAndFloat32x4(out, in, out);
+}
+
+static inline void
+MaskSimdShiftCount(MacroAssembler& masm, unsigned shiftmask, Register count, Register temp,
+ FloatRegister dest)
+{
+ masm.mov(count, temp);
+ masm.andl(Imm32(shiftmask), temp);
+ masm.vmovd(temp, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(FloatRegister in, Register count,
+ Register temp, FloatRegister dest)
+{
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
+ vpsllw(scratch, in, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(FloatRegister in, Register count,
+ Register temp, FloatRegister dest)
+{
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
+ vpsraw(scratch, in, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(FloatRegister in, Register count,
+ Register temp, FloatRegister dest)
+{
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
+ vpsrlw(scratch, in, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(FloatRegister in, Register count,
+ Register temp, FloatRegister dest)
+{
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
+ vpslld(scratch, in, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(FloatRegister in, Register count,
+ Register temp, FloatRegister dest)
+{
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
+ vpsrad(scratch, in, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(FloatRegister in, Register count,
+ Register temp, FloatRegister dest)
+{
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
+ vpsrld(scratch, in, dest);
+}
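
Not part of the patch: the variable-count shifts modeled per 16-bit lane, showing the count masking that gives shift-counts-modulo-lane-width semantics; without the mask, a hardware shift count of 16 or more does not wrap.

    #include <cstdint>

    static void packedLeftShiftByScalarInt16x8Reference(const uint16_t in[8], uint32_t count,
                                                        uint16_t out[8])
    {
        count &= 15;                              // MaskSimdShiftCount(masm, 15, ...)
        for (int i = 0; i < 8; i++)
            out[i] = uint16_t(in[i] << count);    // vpsllw with the count in an xmm register
    }

    static void packedRightShiftByScalarInt16x8Reference(const int16_t in[8], uint32_t count,
                                                         int16_t out[8])
    {
        count &= 15;
        for (int i = 0; i < 8; i++)
            out[i] = int16_t(in[i] >> count);     // vpsraw: arithmetic shift keeps the sign
    }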
+
+void
+MacroAssemblerX86Shared::selectSimd128(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse,
+ FloatRegister temp, FloatRegister output)
+{
+ if (onTrue != output)
+ vmovaps(onTrue, output);
+ if (mask != temp)
+ vmovaps(mask, temp);
+
+ // SSE4.1 has plain blendvps which can do this, but it is awkward
+ // to use because it requires the mask to be in xmm0.
+
+ bitwiseAndSimdInt(output, Operand(temp), output);
+ bitwiseAndNotSimdInt(temp, Operand(onFalse), temp);
+ bitwiseOrSimdInt(output, Operand(temp), output);
+}
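
And the bitwise select that selectSimd128 emulates when blendvps is unavailable, reduced to one 32-bit lane (not part of the patch).

    #include <cstdint>

    static uint32_t selectLane(uint32_t mask, uint32_t onTrue, uint32_t onFalse)
    {
        uint32_t t = onTrue & mask;    // bitwiseAndSimdInt(output, Operand(temp), output)
        uint32_t f = onFalse & ~mask;  // bitwiseAndNotSimdInt(temp, Operand(onFalse), temp)
        return t | f;                  // bitwiseOrSimdInt(output, Operand(temp), output)
    }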
diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
index 36f3a008a9..f308e41fd8 100644
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
@@ -1123,9 +1123,9 @@ MacroAssembler::canonicalizeFloat32x4(FloatRegister reg, FloatRegister scratch)
float nanf = float(JS::GenericNaN());
loadConstantSimd128Float(SimdConstant::SplatX4(nanf), ifFalse);
- bitwiseAndSimd128(Operand(mask), reg);
- bitwiseAndNotSimd128(Operand(ifFalse), mask);
- bitwiseOrSimd128(Operand(mask), reg);
+ bitwiseAndFloat32x4(reg, Operand(mask), reg);
+ bitwiseAndNotFloat32x4(mask, Operand(ifFalse), mask);
+ bitwiseOrFloat32x4(reg, Operand(mask), reg);
}
// ========================================================================
diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
index e7783736b2..25b3b846da 100644
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
@@ -820,20 +820,179 @@ class MacroAssemblerX86Shared : public Assembler
vcvtdq2ps(src, dest);
}
- void bitwiseAndSimd128(const Operand& src, FloatRegister dest) {
- // TODO Using the "ps" variant for all types incurs a domain crossing
- // penalty for integer types and double.
- vandps(src, dest, dest);
+ // SIMD methods, defined in MacroAssembler-x86-shared-SIMD.cpp.
+ void checkedConvertFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest, Register temp,
+ Label* oolCheck, Label* rejoin);
+ void oolConvertFloat32x4ToInt32x4(FloatRegister src, Register temp, Label* rejoin,
+ Label* onConversionError);
+ void checkedConvertFloat32x4ToUint32x4(FloatRegister src, FloatRegister dest, Register temp,
+ FloatRegister tempF, Label* failed);
+
+ void createInt32x4(Register lane0, Register lane1, Register lane2, Register lane3,
+ FloatRegister dest);
+ void createFloat32x4(FloatRegister lane0, FloatRegister lane1, FloatRegister lane2,
+ FloatRegister lane3, FloatRegister temp, FloatRegister output);
+
+ void splatX16(Register input, FloatRegister output);
+ void splatX8(Register input, FloatRegister output);
+ void splatX4(Register input, FloatRegister output);
+ void splatX4(FloatRegister input, FloatRegister output);
+
+ void reinterpretSimd(bool isIntegerLaneType, FloatRegister input, FloatRegister output);
+
+ void extractLaneInt32x4(FloatRegister input, Register output, unsigned lane);
+ void extractLaneFloat32x4(FloatRegister input, FloatRegister output, unsigned lane,
+ bool canonicalize);
+ void extractLaneInt16x8(FloatRegister input, Register output, unsigned lane, SimdSign sign);
+ void extractLaneInt8x16(FloatRegister input, Register output, unsigned lane, SimdSign sign);
+ void extractLaneSimdBool(FloatRegister input, Register output, unsigned numLanes, unsigned lane);
+
+ void insertLaneSimdInt(FloatRegister input, Register value, FloatRegister output,
+ unsigned lane, unsigned numLanes);
+ void insertLaneFloat32x4(FloatRegister input, FloatRegister value, FloatRegister output,
+ unsigned lane);
+
+ void allTrueSimdBool(FloatRegister input, Register output);
+ void anyTrueSimdBool(FloatRegister input, Register output);
+
+ void swizzleInt32x4(FloatRegister input, FloatRegister output, unsigned lanes[4]);
+ void swizzleFloat32x4(FloatRegister input, FloatRegister output, unsigned lanes[4]);
+ void swizzleInt8x16(FloatRegister input, FloatRegister output,
+ const mozilla::Maybe<Register>& temp, int8_t lanes[16]);
+
+ void shuffleX4(FloatRegister lhs, Operand rhs, FloatRegister out,
+ const mozilla::Maybe<FloatRegister>& maybeTemp, unsigned lanes[4]);
+ void shuffleInt8x16(FloatRegister lhs, FloatRegister rhs, FloatRegister output,
+ const mozilla::Maybe<FloatRegister>& maybeFloatTemp,
+ const mozilla::Maybe<Register>& maybeTemp, uint8_t lanes[16]);
+
+ void compareInt8x16(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+ FloatRegister output);
+ void compareInt16x8(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+ FloatRegister output);
+ void compareInt32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+ FloatRegister output);
+ void compareFloat32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+ FloatRegister output);
+
+ void addInt8x16(FloatRegister lhs, Operand rhs, FloatRegister output) {
+ vpaddb(rhs, lhs, output);
+ }
+ void addInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) {
+ vpaddw(rhs, lhs, output);
+ }
+ void addInt32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+ vpaddd(rhs, lhs, output);
+ }
+ void addFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+ vaddps(rhs, lhs, output);
+ }
+
+ void addSatInt8x16(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) {
+ if (sign == SimdSign::Signed)
+ vpaddsb(rhs, lhs, output);
+ else
+ vpaddusb(rhs, lhs, output);
+ }
+ void addSatInt16x8(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) {
+ if (sign == SimdSign::Signed)
+ vpaddsw(rhs, lhs, output);
+ else
+ vpaddusw(rhs, lhs, output);
+ }
+
+ void subInt8x16(FloatRegister lhs, Operand rhs, FloatRegister output) {
+ vpsubb(rhs, lhs, output);
+ }
+ void subInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) {
+ vpsubw(rhs, lhs, output);
+ }
+ void subInt32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+ vpsubd(rhs, lhs, output);
+ }
+ void subFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+ vsubps(rhs, lhs, output);
+ }
+
+ void subSatInt8x16(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) {
+ if (sign == SimdSign::Signed)
+ vpsubsb(rhs, lhs, output);
+ else
+ vpsubusb(rhs, lhs, output);
+ }
+ void subSatInt16x8(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) {
+ if (sign == SimdSign::Signed)
+ vpsubsw(rhs, lhs, output);
+ else
+ vpsubusw(rhs, lhs, output);
+ }
+
+ void mulInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) {
+ vpmullw(rhs, lhs, output);
+ }
+ void mulInt32x4(FloatRegister lhs, Operand rhs, const mozilla::Maybe<FloatRegister>& temp,
+ FloatRegister output);
+ void mulFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+ vmulps(rhs, lhs, output);
+ }
+
+ void negInt8x16(Operand in, FloatRegister out) {
+ zeroSimd128Int(out);
+ packedSubInt8(in, out);
+ }
+ void negInt16x8(Operand in, FloatRegister out) {
+ zeroSimd128Int(out);
+ packedSubInt16(in, out);
+ }
+ void negInt32x4(Operand in, FloatRegister out) {
+ zeroSimd128Int(out);
+ packedSubInt32(in, out);
+ }
+ void negFloat32x4(Operand in, FloatRegister out);
+
+ void notInt8x16(Operand in, FloatRegister out);
+ void notInt16x8(Operand in, FloatRegister out);
+ void notInt32x4(Operand in, FloatRegister out);
+ void notFloat32x4(Operand in, FloatRegister out);
+
+ void divFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+ vdivps(rhs, lhs, output);
}
- void bitwiseAndNotSimd128(const Operand& src, FloatRegister dest) {
- vandnps(src, dest, dest);
+ void minFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output);
+ void maxFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output);
+ void minNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output);
+ void maxNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output);
+
+ void absFloat32x4(Operand in, FloatRegister out);
+
+ void bitwiseAndFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+ vandps(rhs, lhs, dest);
+ }
+ void bitwiseAndSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+ vpand(rhs, lhs, dest);
+ }
+
+ void bitwiseOrFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+ vorps(rhs, lhs, dest);
+ }
+ void bitwiseOrSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+ vpor(rhs, lhs, dest);
}
- void bitwiseOrSimd128(const Operand& src, FloatRegister dest) {
- vorps(src, dest, dest);
+
+ void bitwiseXorFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+ vxorps(rhs, lhs, dest);
+ }
+ void bitwiseXorSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+ vpxor(rhs, lhs, dest);
+ }
+
+ void bitwiseAndNotFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+ vandnps(rhs, lhs, dest);
}
- void bitwiseXorSimd128(const Operand& src, FloatRegister dest) {
- vxorps(src, dest, dest);
+ void bitwiseAndNotSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+ vpandn(rhs, lhs, dest);
}
+
void zeroSimd128Float(FloatRegister dest) {
vxorps(dest, dest, dest);
}
@@ -841,6 +1000,16 @@ class MacroAssemblerX86Shared : public Assembler
vpxor(dest, dest, dest);
}
+ void selectSimd128(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse,
+ FloatRegister temp, FloatRegister output);
+ void selectX4(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse,
+ FloatRegister temp, FloatRegister output) {
+ if (AssemblerX86Shared::HasAVX())
+ vblendvps(mask, onTrue, onFalse, output);
+ else
+ selectSimd128(mask, onTrue, onFalse, temp, output);
+ }
+
template <class T, class Reg> inline void loadScalar(const Operand& src, Reg dest);
template <class T, class Reg> inline void storeScalar(Reg src, const Address& dest);
template <class T> inline void loadAlignedVector(const Address& src, FloatRegister dest);
@@ -987,41 +1156,38 @@ class MacroAssemblerX86Shared : public Assembler
vsqrtps(src, dest);
}
- void packedLeftShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) {
- vpsllw(src, dest, dest);
- }
+ public:
+ void packedLeftShiftByScalarInt16x8(FloatRegister in, Register count, Register temp, FloatRegister dest);
+ void packedRightShiftByScalarInt16x8(FloatRegister in, Register count, Register temp, FloatRegister dest);
+ void packedUnsignedRightShiftByScalarInt16x8(FloatRegister in, Register count, Register temp, FloatRegister dest);
+
void packedLeftShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
+ count.value &= 15;
vpsllw(count, dest, dest);
}
- void packedRightShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) {
- vpsraw(src, dest, dest);
- }
void packedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
+ count.value &= 15;
vpsraw(count, dest, dest);
}
- void packedUnsignedRightShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) {
- vpsrlw(src, dest, dest);
- }
void packedUnsignedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
+ count.value &= 15;
vpsrlw(count, dest, dest);
}
- void packedLeftShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) {
- vpslld(src, dest, dest);
- }
+ void packedLeftShiftByScalarInt32x4(FloatRegister in, Register count, Register temp, FloatRegister dest);
+ void packedRightShiftByScalarInt32x4(FloatRegister in, Register count, Register temp, FloatRegister dest);
+ void packedUnsignedRightShiftByScalarInt32x4(FloatRegister in, Register count, Register temp, FloatRegister dest);
+
void packedLeftShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
+ count.value &= 31;
vpslld(count, dest, dest);
}
- void packedRightShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) {
- vpsrad(src, dest, dest);
- }
void packedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
+ count.value &= 31;
vpsrad(count, dest, dest);
}
- void packedUnsignedRightShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) {
- vpsrld(src, dest, dest);
- }
void packedUnsignedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
+ count.value &= 31;
vpsrld(count, dest, dest);
}
diff --git a/js/src/moz.build b/js/src/moz.build
index 8e14de6e85..59feedf22d 100644
--- a/js/src/moz.build
+++ b/js/src/moz.build
@@ -431,6 +431,7 @@ elif CONFIG['JS_CODEGEN_X86'] or CONFIG['JS_CODEGEN_X64']:
'jit/x86-shared/CodeGenerator-x86-shared.cpp',
'jit/x86-shared/Disassembler-x86-shared.cpp', # using namespace js::jit::X86Encoding;
'jit/x86-shared/Lowering-x86-shared.cpp',
+ 'jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp',
'jit/x86-shared/MacroAssembler-x86-shared.cpp',
'jit/x86-shared/MoveEmitter-x86-shared.cpp',
]