author | Moonchild <moonchild@palemoon.org> | 2023-09-12 23:28:49 +0200
committer | Moonchild <moonchild@palemoon.org> | 2023-09-12 23:28:49 +0200
commit | 1a7f79ef9acde005dd78984aeb5917af525960d6 (patch)
tree | 56845f1dfffc062d2a22719c464e25535c5f864c /js
parent | 281497201e52d95b1592e28ba59431ad4ae3bfeb (diff)
download | uxp-1a7f79ef9acde005dd78984aeb5917af525960d6.tar.gz
Issue #2307 - Part 2: Move SIMD code generation to masm methods
Diffstat (limited to 'js')
-rw-r--r-- | js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp | 1185
-rw-r--r-- | js/src/jit/x86-shared/CodeGenerator-x86-shared.h | 6
-rw-r--r-- | js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp | 1227
-rw-r--r-- | js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h | 6
-rw-r--r-- | js/src/jit/x86-shared/MacroAssembler-x86-shared.h | 222
-rw-r--r-- | js/src/moz.build | 1
6 files changed, 1570 insertions, 1077 deletions
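The whole patch follows a single mechanical pattern: instruction selection that used to be emitted inline in CodeGeneratorX86Shared visitor methods moves into named MacroAssemblerX86Shared methods, and each visitor shrinks to one masm call. Below is a minimal sketch of that shape, condensed from the splatX8 hunks in the diff that follows; it is an illustration rather than a verbatim excerpt, and it assumes the usual includes and class declarations from the surrounding tree.

// Sketch of the pattern applied throughout this commit, condensed from the
// splatX8 hunks below (declarations and includes from the tree are assumed).

// CodeGenerator-x86-shared.cpp: the visitor no longer emits instructions
// itself; it unpacks its LIR operands and calls one MacroAssembler method.
void
CodeGeneratorX86Shared::visitSimdSplatX8(LSimdSplatX8* ins)
{
    MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 8);
    Register input = ToRegister(ins->getOperand(0));
    FloatRegister output = ToFloatRegister(ins->output());
    // Previously emitted here: vmovd + vpshuflw(0) + vpshufd(0).
    masm.splatX8(input, output);
}

// MacroAssembler-x86-shared-SIMD.cpp: the emission sequence moves, unchanged,
// into a reusable masm method (the new file added by this commit).
void
MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output)
{
    vmovd(input, output);        // scalar into the low 32-bit lane
    vpshuflw(0, output, output); // broadcast the low 16-bit word across the low half
    vpshufd(0, output, output);  // broadcast the low 32 bits across all four dwords
}

The more involved cases (checkedConvertFloat32x4ToInt32x4, the swizzles/shuffles, the comparisons) follow the same shape, with the code generator passing through labels for the out-of-line paths and Maybe<Register>/Maybe<FloatRegister> temps where the pre-SSSE3/SSE4.1 fallbacks need them.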
diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp index 5ec00da849..9858836e7d 100644 --- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp +++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp @@ -27,6 +27,7 @@ using mozilla::BitwiseCast; using mozilla::DebugOnly; using mozilla::FloatingPoint; using mozilla::FloorLog2; +using mozilla::Maybe; using mozilla::NegativeInfinity; using mozilla::SpecificNaN; @@ -2458,51 +2459,18 @@ CodeGeneratorX86Shared::visitFloat32x4ToInt32x4(LFloat32x4ToInt32x4* ins) FloatRegister out = ToFloatRegister(ins->output()); Register temp = ToRegister(ins->temp()); - masm.convertFloat32x4ToInt32x4(in, out); - auto* ool = new(alloc()) OutOfLineSimdFloatToIntCheck(temp, in, ins, ins->mir()->trapOffset()); addOutOfLineCode(ool, ins->mir()); - static const SimdConstant InvalidResult = SimdConstant::SplatX4(int32_t(-2147483648)); - - ScratchSimd128Scope scratch(masm); - masm.loadConstantSimd128Int(InvalidResult, scratch); - masm.packedEqualInt32x4(Operand(out), scratch); - // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of - // the two following instructions. - masm.vmovmskps(scratch, temp); - masm.cmp32(temp, Imm32(0)); - masm.j(Assembler::NotEqual, ool->entry()); - - masm.bind(ool->rejoin()); + masm.checkedConvertFloat32x4ToInt32x4(in, out, temp, ool->entry(), ool->rejoin()); } void -CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIntCheck *ool) +CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIntCheck* ool) { - static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f); - static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f); - Label onConversionError; - FloatRegister input = ool->input(); - Register temp = ool->temp(); - - ScratchSimd128Scope scratch(masm); - masm.loadConstantSimd128Float(Int32MinX4, scratch); - masm.vcmpleps(Operand(input), scratch, scratch); - masm.vmovmskps(scratch, temp); - masm.cmp32(temp, Imm32(15)); - masm.j(Assembler::NotEqual, &onConversionError); - - masm.loadConstantSimd128Float(Int32MaxX4, scratch); - masm.vcmpleps(Operand(input), scratch, scratch); - masm.vmovmskps(scratch, temp); - masm.cmp32(temp, Imm32(0)); - masm.j(Assembler::NotEqual, &onConversionError); - - masm.jump(ool->rejoin()); - + masm.oolConvertFloat32x4ToInt32x4(ool->input(), ool->temp(), ool->rejoin(), &onConversionError); if (gen->compilingWasm()) { masm.bindLater(&onConversionError, trap(ool, wasm::Trap::ImpreciseSimdConversion)); } else { @@ -2512,105 +2480,39 @@ CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIn } // Convert Float32x4 to Uint32x4. -// // If any input lane value is out of range or NaN, bail out. void CodeGeneratorX86Shared::visitFloat32x4ToUint32x4(LFloat32x4ToUint32x4* ins) { - const MSimdConvert* mir = ins->mir(); FloatRegister in = ToFloatRegister(ins->input()); FloatRegister out = ToFloatRegister(ins->output()); Register temp = ToRegister(ins->tempR()); FloatRegister tempF = ToFloatRegister(ins->tempF()); - // Classify lane values into 4 disjoint classes: - // - // N-lanes: in <= -1.0 - // A-lanes: -1.0 < in <= 0x0.ffffffp31 - // B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32 - // V-lanes: 0x1.0p32 <= in, or isnan(in) - // - // We need to bail out to throw a RangeError if we see any N-lanes or - // V-lanes. 
- // - // For A-lanes and B-lanes, we make two float -> int32 conversions: - // - // A = cvttps2dq(in) - // B = cvttps2dq(in - 0x1.0p31f) - // - // Note that the subtraction for the B computation is exact for B-lanes. - // There is no rounding, so B is the low 31 bits of the correctly converted - // result. - // - // The cvttps2dq instruction produces 0x80000000 when the input is NaN or - // out of range for a signed int32_t. This conveniently provides the missing - // high bit for B, so the desired result is A for A-lanes and A|B for - // B-lanes. - - ScratchSimd128Scope scratch(masm); - - // TODO: If the majority of lanes are A-lanes, it could be faster to compute - // A first, use vmovmskps to check for any non-A-lanes and handle them in - // ool code. OTOH, we we're wrong about the lane distribution, that would be - // slower. - - // Compute B in |scratch|. - static const float Adjust = 0x80000000; // 0x1.0p31f for the benefit of MSVC. - static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust); - masm.loadConstantSimd128Float(Bias, scratch); - masm.packedAddFloat32(Operand(in), scratch); - masm.convertFloat32x4ToInt32x4(scratch, scratch); - - // Compute A in |out|. This is the last time we use |in| and the first time - // we use |out|, so we can tolerate if they are the same register. - masm.convertFloat32x4ToInt32x4(in, out); - - // We can identify A-lanes by the sign bits in A: Any A-lanes will be - // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a - // mask of non-A-lanes into |tempF|. - masm.zeroSimd128Float(tempF); - masm.packedGreaterThanInt32x4(Operand(out), tempF); - - // Clear the A-lanes in B. - masm.bitwiseAndSimd128(Operand(tempF), scratch); - - // Compute the final result: A for A-lanes, A|B for B-lanes. - masm.bitwiseOrSimd128(Operand(scratch), out); - - // We still need to filter out the V-lanes. They would show up as 0x80000000 - // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are - // the remaining negative lanes in B. 
- masm.vmovmskps(scratch, temp); - masm.cmp32(temp, Imm32(0)); + Label failed; + masm.checkedConvertFloat32x4ToUint32x4(in, out, temp, tempF, &failed); + Label ok; + masm.jump(&ok); + masm.bind(&failed); if (gen->compilingWasm()) - masm.j(Assembler::NotEqual, trap(mir, wasm::Trap::ImpreciseSimdConversion)); + masm.j(Assembler::NotEqual, trap(ins->mir(), wasm::Trap::ImpreciseSimdConversion)); else - bailoutIf(Assembler::NotEqual, ins->snapshot()); +// bailoutIf(Assembler::NotEqual, ins->snapshot()); + bailout(ins->snapshot()); + masm.bind(&ok); } void CodeGeneratorX86Shared::visitSimdValueInt32x4(LSimdValueInt32x4* ins) { MOZ_ASSERT(ins->mir()->type() == MIRType::Int32x4 || ins->mir()->type() == MIRType::Bool32x4); - - FloatRegister output = ToFloatRegister(ins->output()); - if (AssemblerX86Shared::HasSSE41()) { - masm.vmovd(ToRegister(ins->getOperand(0)), output); - for (size_t i = 1; i < 4; ++i) { - Register r = ToRegister(ins->getOperand(i)); - masm.vpinsrd(i, r, output, output); - } - return; - } - - masm.reserveStack(Simd128DataSize); - for (size_t i = 0; i < 4; ++i) { - Register r = ToRegister(ins->getOperand(i)); - masm.store32(r, Address(StackPointer, i * sizeof(int32_t))); - } - masm.loadAlignedSimd128Int(Address(StackPointer, 0), output); - masm.freeStack(Simd128DataSize); + masm.createInt32x4(ToRegister(ins->getOperand(0)), + ToRegister(ins->getOperand(1)), + ToRegister(ins->getOperand(2)), + ToRegister(ins->getOperand(3)), + ToFloatRegister(ins->output()) + ); } void @@ -2625,12 +2527,7 @@ CodeGeneratorX86Shared::visitSimdValueFloat32x4(LSimdValueFloat32x4* ins) FloatRegister tmp = ToFloatRegister(ins->getTemp(0)); FloatRegister output = ToFloatRegister(ins->output()); - FloatRegister r0Copy = masm.reusedInputFloat32x4(r0, output); - FloatRegister r1Copy = masm.reusedInputFloat32x4(r1, tmp); - - masm.vunpcklps(r3, r1Copy, tmp); - masm.vunpcklps(r2, r0Copy, output); - masm.vunpcklps(tmp, output, output); + masm.createFloat32x4(r0, r1, r2, r3, tmp, output); } void @@ -2639,20 +2536,7 @@ CodeGeneratorX86Shared::visitSimdSplatX16(LSimdSplatX16* ins) MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 16); Register input = ToRegister(ins->getOperand(0)); FloatRegister output = ToFloatRegister(ins->output()); - masm.vmovd(input, output); - if (AssemblerX86Shared::HasSSSE3()) { - masm.zeroSimd128Int(ScratchSimd128Reg); - masm.vpshufb(ScratchSimd128Reg, output, output); - } else { - // Use two shifts to duplicate the low 8 bits into the low 16 bits. - masm.vpsllw(Imm32(8), output, output); - masm.vmovdqa(output, ScratchSimd128Reg); - masm.vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg); - masm.vpor(ScratchSimd128Reg, output, output); - // Then do an X8 splat. 
- masm.vpshuflw(0, output, output); - masm.vpshufd(0, output, output); - } + masm.splatX16(input, output); } void @@ -2661,9 +2545,7 @@ CodeGeneratorX86Shared::visitSimdSplatX8(LSimdSplatX8* ins) MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 8); Register input = ToRegister(ins->getOperand(0)); FloatRegister output = ToFloatRegister(ins->output()); - masm.vmovd(input, output); - masm.vpshuflw(0, output, output); - masm.vpshufd(0, output, output); + masm.splatX8(input, output); } void @@ -2675,15 +2557,10 @@ CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4* ins) MOZ_ASSERT(IsSimdType(mir->type())); JS_STATIC_ASSERT(sizeof(float) == sizeof(int32_t)); - if (mir->type() == MIRType::Float32x4) { - FloatRegister r = ToFloatRegister(ins->getOperand(0)); - FloatRegister rCopy = masm.reusedInputFloat32x4(r, output); - masm.vshufps(0, rCopy, rCopy, output); - } else { - Register r = ToRegister(ins->getOperand(0)); - masm.vmovd(r, output); - masm.vpshufd(0, output, output); - } + if (mir->type() == MIRType::Float32x4) + masm.splatX4(ToFloatRegister(ins->getOperand(0)), output); + else + masm.splatX4(ToRegister(ins->getOperand(0)), output); } void @@ -2691,83 +2568,8 @@ CodeGeneratorX86Shared::visitSimdReinterpretCast(LSimdReinterpretCast* ins) { FloatRegister input = ToFloatRegister(ins->input()); FloatRegister output = ToFloatRegister(ins->output()); - - if (input.aliases(output)) - return; - - if (IsIntegerSimdType(ins->mir()->type())) - masm.vmovdqa(input, output); - else - masm.vmovaps(input, output); -} - -// Extract an integer lane from the 32x4 vector register |input| and place it in -// |output|. -void -CodeGeneratorX86Shared::emitSimdExtractLane32x4(FloatRegister input, Register output, unsigned lane) -{ - if (lane == 0) { - // The value we want to extract is in the low double-word - masm.moveLowInt32(input, output); - } else if (AssemblerX86Shared::HasSSE41()) { - masm.vpextrd(lane, input, output); - } else { - uint32_t mask = MacroAssembler::ComputeShuffleMask(lane); - masm.shuffleInt32(mask, input, ScratchSimd128Reg); - masm.moveLowInt32(ScratchSimd128Reg, output); - } -} - -// Extract an integer lane from the 16x8 vector register |input|, sign- or -// zero-extend to 32 bits and place the result in |output|. -void -CodeGeneratorX86Shared::emitSimdExtractLane16x8(FloatRegister input, Register output, - unsigned lane, SimdSign signedness) -{ - // Unlike pextrd and pextrb, this is available in SSE2. - masm.vpextrw(lane, input, output); - - if (signedness == SimdSign::Signed) - masm.movswl(output, output); -} - -// Extract an integer lane from the 8x16 vector register |input|, sign- or -// zero-extend to 32 bits and place the result in |output|. -void -CodeGeneratorX86Shared::emitSimdExtractLane8x16(FloatRegister input, Register output, - unsigned lane, SimdSign signedness) -{ - if (AssemblerX86Shared::HasSSE41()) { - masm.vpextrb(lane, input, output); - // vpextrb clears the high bits, so no further extension required. - if (signedness == SimdSign::Unsigned) - signedness = SimdSign::NotApplicable; - } else { - // Extract the relevant 16 bits containing our lane, then shift the - // right 8 bits into place. - emitSimdExtractLane16x8(input, output, lane / 2, SimdSign::Unsigned); - if (lane % 2) { - masm.shrl(Imm32(8), output); - // The shrl handles the zero-extension. Don't repeat it. - if (signedness == SimdSign::Unsigned) - signedness = SimdSign::NotApplicable; - } - } - - // We have the right low 8 bits in |output|, but we may need to fix the high - // bits. 
Note that this requires |output| to be one of the %eax-%edx - // registers. - switch (signedness) { - case SimdSign::Signed: - masm.movsbl(output, output); - break; - case SimdSign::Unsigned: - masm.movzbl(output, output); - break; - case SimdSign::NotApplicable: - // No adjustment needed. - break; - } + bool isIntLaneType = IsIntegerSimdType(ins->mir()->type()); + masm.reinterpretSimd(isIntLaneType, input, output); } void @@ -2776,25 +2578,8 @@ CodeGeneratorX86Shared::visitSimdExtractElementB(LSimdExtractElementB* ins) FloatRegister input = ToFloatRegister(ins->input()); Register output = ToRegister(ins->output()); MSimdExtractElement* mir = ins->mir(); - unsigned length = SimdTypeToLength(mir->specialization()); - - switch (length) { - case 4: - emitSimdExtractLane32x4(input, output, mir->lane()); - break; - case 8: - // Get a lane, don't bother fixing the high bits since we'll mask below. - emitSimdExtractLane16x8(input, output, mir->lane(), SimdSign::NotApplicable); - break; - case 16: - emitSimdExtractLane8x16(input, output, mir->lane(), SimdSign::NotApplicable); - break; - default: - MOZ_CRASH("Unhandled SIMD length"); - } - - // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits. - masm.and32(Imm32(1), output); + unsigned numLanes = SimdTypeToLength(mir->specialization()); + masm.extractLaneSimdBool(input, output, numLanes, mir->lane()); } void @@ -2803,17 +2588,16 @@ CodeGeneratorX86Shared::visitSimdExtractElementI(LSimdExtractElementI* ins) FloatRegister input = ToFloatRegister(ins->input()); Register output = ToRegister(ins->output()); MSimdExtractElement* mir = ins->mir(); - unsigned length = SimdTypeToLength(mir->specialization()); - - switch (length) { + unsigned numLanes = SimdTypeToLength(mir->specialization()); + switch (numLanes) { case 4: - emitSimdExtractLane32x4(input, output, mir->lane()); + masm.extractLaneInt32x4(input, output, mir->lane()); break; case 8: - emitSimdExtractLane16x8(input, output, mir->lane(), mir->signedness()); + masm.extractLaneInt16x8(input, output, mir->lane(), mir->signedness()); break; case 16: - emitSimdExtractLane8x16(input, output, mir->lane(), mir->signedness()); + masm.extractLaneInt8x16(input, output, mir->lane(), mir->signedness()); break; default: MOZ_CRASH("Unhandled SIMD length"); @@ -2828,7 +2612,7 @@ CodeGeneratorX86Shared::visitSimdExtractElementU2D(LSimdExtractElementU2D* ins) Register temp = ToRegister(ins->temp()); MSimdExtractElement* mir = ins->mir(); MOZ_ASSERT(mir->specialization() == MIRType::Int32x4); - emitSimdExtractLane32x4(input, temp, mir->lane()); + masm.extractLaneInt32x4(input, temp, mir->lane()); masm.convertUInt32ToDouble(temp, output); } @@ -2839,102 +2623,31 @@ CodeGeneratorX86Shared::visitSimdExtractElementF(LSimdExtractElementF* ins) FloatRegister output = ToFloatRegister(ins->output()); unsigned lane = ins->mir()->lane(); - if (lane == 0) { - // The value we want to extract is in the low double-word - if (input != output) - masm.moveFloat32(input, output); - } else if (lane == 2) { - masm.moveHighPairToLowPairFloat32(input, output); - } else { - uint32_t mask = MacroAssembler::ComputeShuffleMask(lane); - masm.shuffleFloat32(mask, input, output); - } - // NaNs contained within SIMD values are not enforced to be canonical, so - // when we extract an element into a "regular" scalar JS value, we have to - // canonicalize. In wasm code, we can skip this, as wasm only has to - // canonicalize NaNs at FFI boundaries. 
- if (!gen->compilingWasm()) - masm.canonicalizeFloat(output); + bool canonicalize = !gen->compilingWasm(); + masm.extractLaneFloat32x4(input, output, lane, canonicalize); } void CodeGeneratorX86Shared::visitSimdInsertElementI(LSimdInsertElementI* ins) { - FloatRegister vector = ToFloatRegister(ins->vector()); + FloatRegister input = ToFloatRegister(ins->vector()); Register value = ToRegister(ins->value()); FloatRegister output = ToFloatRegister(ins->output()); - MOZ_ASSERT(vector == output); // defineReuseInput(0) - + MOZ_ASSERT(input == output); // defineReuseInput(0) unsigned lane = ins->lane(); unsigned length = ins->length(); - if (length == 8) { - // Available in SSE 2. - masm.vpinsrw(lane, value, vector, output); - return; - } - - // Note that, contrarily to float32x4, we cannot use vmovd if the inserted - // value goes into the first component, as vmovd clears out the higher lanes - // of the output. - if (AssemblerX86Shared::HasSSE41()) { - // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX. - switch (length) { - case 4: - masm.vpinsrd(lane, value, vector, output); - return; - case 16: - masm.vpinsrb(lane, value, vector, output); - return; - } - } - - masm.reserveStack(Simd128DataSize); - masm.storeAlignedSimd128Int(vector, Address(StackPointer, 0)); - switch (length) { - case 4: - masm.store32(value, Address(StackPointer, lane * sizeof(int32_t))); - break; - case 16: - // Note that this requires `value` to be in one the registers where the - // low 8 bits are addressible (%eax - %edx on x86, all of them on x86-64). - masm.store8(value, Address(StackPointer, lane * sizeof(int8_t))); - break; - default: - MOZ_CRASH("Unsupported SIMD length"); - } - masm.loadAlignedSimd128Int(Address(StackPointer, 0), output); - masm.freeStack(Simd128DataSize); + masm.insertLaneSimdInt(input, value, output, lane, length); } void CodeGeneratorX86Shared::visitSimdInsertElementF(LSimdInsertElementF* ins) { - FloatRegister vector = ToFloatRegister(ins->vector()); + FloatRegister input = ToFloatRegister(ins->vector()); FloatRegister value = ToFloatRegister(ins->value()); FloatRegister output = ToFloatRegister(ins->output()); - MOZ_ASSERT(vector == output); // defineReuseInput(0) - - if (ins->lane() == 0) { - // As both operands are registers, vmovss doesn't modify the upper bits - // of the destination operand. - if (value != output) - masm.vmovss(value, vector, output); - return; - } - - if (AssemblerX86Shared::HasSSE41()) { - // The input value is in the low float32 of the 'value' FloatRegister. 
- masm.vinsertps(masm.vinsertpsMask(0, ins->lane()), value, output, output); - return; - } - - unsigned component = unsigned(ins->lane()); - masm.reserveStack(Simd128DataSize); - masm.storeAlignedSimd128Float(vector, Address(StackPointer, 0)); - masm.storeFloat32(value, Address(StackPointer, component * sizeof(int32_t))); - masm.loadAlignedSimd128Float(Address(StackPointer, 0), output); - masm.freeStack(Simd128DataSize); + MOZ_ASSERT(input == output); // defineReuseInput(0) + masm.insertLaneFloat32x4(input, value, output, ins->lane()); } void @@ -2943,9 +2656,7 @@ CodeGeneratorX86Shared::visitSimdAllTrue(LSimdAllTrue* ins) FloatRegister input = ToFloatRegister(ins->input()); Register output = ToRegister(ins->output()); - masm.vmovmskps(input, output); - masm.cmp32(output, Imm32(0xf)); - masm.emitSet(Assembler::Zero, output); + masm.allTrueSimdBool(input, output); } void @@ -2954,11 +2665,10 @@ CodeGeneratorX86Shared::visitSimdAnyTrue(LSimdAnyTrue* ins) FloatRegister input = ToFloatRegister(ins->input()); Register output = ToRegister(ins->output()); - masm.vmovmskps(input, output); - masm.cmp32(output, Imm32(0x0)); - masm.emitSet(Assembler::NonZero, output); + masm.anyTrueSimdBool(input, output); } +// XXX note for reviewer: this is SIMD.js only, no need to keep it for wasm. template <class T, class Reg> void CodeGeneratorX86Shared::visitSimdGeneralShuffle(LSimdGeneralShuffleBase* ins, Reg tempRegister) { @@ -3017,6 +2727,7 @@ CodeGeneratorX86Shared::visitSimdGeneralShuffle(LSimdGeneralShuffleBase* ins, Re masm.freeStack(stackSpace); } +// XXX SIMD.js only void CodeGeneratorX86Shared::visitSimdGeneralShuffleI(LSimdGeneralShuffleI* ins) { @@ -3047,13 +2758,10 @@ CodeGeneratorX86Shared::visitSimdSwizzleI(LSimdSwizzleI* ins) switch (numLanes) { case 4: { - uint32_t x = ins->lane(0); - uint32_t y = ins->lane(1); - uint32_t z = ins->lane(2); - uint32_t w = ins->lane(3); - - uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w); - masm.shuffleInt32(mask, input, output); + unsigned lanes[4]; + for (unsigned i = 0; i < 4; i++) + lanes[i] = ins->lane(i); + masm.swizzleInt32x4(input, output, lanes); return; } } @@ -3061,31 +2769,18 @@ CodeGeneratorX86Shared::visitSimdSwizzleI(LSimdSwizzleI* ins) // In the general case, use pshufb if it is available. Convert to a // byte-wise swizzle. const unsigned bytesPerLane = 16 / numLanes; - int8_t bLane[16]; + int8_t lanes[16]; for (unsigned i = 0; i < numLanes; i++) { for (unsigned b = 0; b < bytesPerLane; b++) { - bLane[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b; + lanes[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b; } } - if (AssemblerX86Shared::HasSSSE3()) { - ScratchSimd128Scope scratch(masm); - masm.loadConstantSimd128Int(SimdConstant::CreateX16(bLane), scratch); - FloatRegister inputCopy = masm.reusedInputInt32x4(input, output); - masm.vpshufb(scratch, inputCopy, output); - return; - } + Maybe<Register> maybeTemp; + if (!ins->getTemp(0)->isBogusTemp()) + maybeTemp.emplace(ToRegister(ins->getTemp(0))); - // Worst-case fallback for pre-SSSE3 machines. Bounce through memory. 
- Register temp = ToRegister(ins->getTemp(0)); - masm.reserveStack(2 * Simd128DataSize); - masm.storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize)); - for (unsigned i = 0; i < 16; i++) { - masm.load8ZeroExtend(Address(StackPointer, Simd128DataSize + bLane[i]), temp); - masm.store8(temp, Address(StackPointer, i)); - } - masm.loadAlignedSimd128Int(Address(StackPointer, 0), output); - masm.freeStack(2 * Simd128DataSize); + masm.swizzleInt8x16(input, output, maybeTemp, lanes); } void @@ -3095,54 +2790,10 @@ CodeGeneratorX86Shared::visitSimdSwizzleF(LSimdSwizzleF* ins) FloatRegister output = ToFloatRegister(ins->output()); MOZ_ASSERT(ins->numLanes() == 4); - uint32_t x = ins->lane(0); - uint32_t y = ins->lane(1); - uint32_t z = ins->lane(2); - uint32_t w = ins->lane(3); - - if (AssemblerX86Shared::HasSSE3()) { - if (ins->lanesMatch(0, 0, 2, 2)) { - masm.vmovsldup(input, output); - return; - } - if (ins->lanesMatch(1, 1, 3, 3)) { - masm.vmovshdup(input, output); - return; - } - } - - // TODO Here and below, arch specific lowering could identify this pattern - // and use defineReuseInput to avoid this move (bug 1084404) - if (ins->lanesMatch(2, 3, 2, 3)) { - FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output); - masm.vmovhlps(input, inputCopy, output); - return; - } - - if (ins->lanesMatch(0, 1, 0, 1)) { - if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) { - masm.vmovddup(input, output); - return; - } - FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output); - masm.vmovlhps(input, inputCopy, output); - return; - } - - if (ins->lanesMatch(0, 0, 1, 1)) { - FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output); - masm.vunpcklps(input, inputCopy, output); - return; - } - - if (ins->lanesMatch(2, 2, 3, 3)) { - FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output); - masm.vunpckhps(input, inputCopy, output); - return; - } - - uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w); - masm.shuffleFloat32(mask, input, output); + unsigned lanes[4]; + for (unsigned i = 0; i < 4; i++) + lanes[i] = ins->lane(i); + masm.swizzleFloat32x4(input, output, lanes); } void @@ -3155,52 +2806,21 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle* ins) const unsigned bytesPerLane = 16 / numLanes; // Convert the shuffle to a byte-wise shuffle. - uint8_t bLane[16]; + uint8_t lanes[16]; for (unsigned i = 0; i < numLanes; i++) { for (unsigned b = 0; b < bytesPerLane; b++) { - bLane[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b; + lanes[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b; } } - // Use pshufb if it is available. - if (AssemblerX86Shared::HasSSSE3()) { - FloatRegister scratch1 = ToFloatRegister(ins->temp()); - ScratchSimd128Scope scratch2(masm); - - // Use pshufb instructions to gather the lanes from each source vector. - // A negative index creates a zero lane, so the two vectors can be combined. - - // Set scratch2 = lanes from lhs. - int8_t idx[16]; - for (unsigned i = 0; i < 16; i++) - idx[i] = bLane[i] < 16 ? bLane[i] : -1; - masm.loadConstantSimd128Int(SimdConstant::CreateX16(idx), scratch1); - FloatRegister lhsCopy = masm.reusedInputInt32x4(lhs, scratch2); - masm.vpshufb(scratch1, lhsCopy, scratch2); - - // Set output = lanes from rhs. - for (unsigned i = 0; i < 16; i++) - idx[i] = bLane[i] >= 16 ? 
bLane[i] - 16 : -1; - masm.loadConstantSimd128Int(SimdConstant::CreateX16(idx), scratch1); - FloatRegister rhsCopy = masm.reusedInputInt32x4(rhs, output); - masm.vpshufb(scratch1, rhsCopy, output); - - // Combine. - masm.vpor(scratch2, output, output); - return; - } + Maybe<FloatRegister> maybeFloatTemp; + Maybe<Register> maybeTemp; + if (AssemblerX86Shared::HasSSSE3()) + maybeFloatTemp.emplace(ToFloatRegister(ins->temp())); + else + maybeTemp.emplace(ToRegister(ins->temp())); - // Worst-case fallback for pre-SSE3 machines. Bounce through memory. - Register temp = ToRegister(ins->getTemp(0)); - masm.reserveStack(3 * Simd128DataSize); - masm.storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize)); - masm.storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize)); - for (unsigned i = 0; i < 16; i++) { - masm.load8ZeroExtend(Address(StackPointer, Simd128DataSize + bLane[i]), temp); - masm.store8(temp, Address(StackPointer, i)); - } - masm.loadAlignedSimd128Int(Address(StackPointer, 0), output); - masm.freeStack(3 * Simd128DataSize); + masm.shuffleInt8x16(lhs, rhs, output, maybeFloatTemp, maybeTemp, lanes); } void @@ -3210,409 +2830,60 @@ CodeGeneratorX86Shared::visitSimdShuffleX4(LSimdShuffleX4* ins) Operand rhs = ToOperand(ins->rhs()); FloatRegister out = ToFloatRegister(ins->output()); - uint32_t x = ins->lane(0); - uint32_t y = ins->lane(1); - uint32_t z = ins->lane(2); - uint32_t w = ins->lane(3); - - // Check that lanes come from LHS in majority: - unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4); - MOZ_ASSERT(numLanesFromLHS >= 2); - - // When reading this method, remember that vshufps takes the two first - // inputs of the destination operand (right operand) and the two last - // inputs of the source operand (left operand). - // - // Legend for explanations: - // - L: LHS - // - R: RHS - // - T: temporary - - uint32_t mask; - - // If all lanes came from a single vector, we should have constructed a - // MSimdSwizzle instead. - MOZ_ASSERT(numLanesFromLHS < 4); - - // If all values stay in their lane, this is a blend. - if (AssemblerX86Shared::HasSSE41()) { - if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) { - masm.vblendps(masm.blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out); - return; - } - } - - // One element of the second, all other elements of the first - if (numLanesFromLHS == 3) { - unsigned firstMask = -1, secondMask = -1; - - // register-register vmovss preserves the high lanes. - if (ins->lanesMatch(4, 1, 2, 3) && rhs.kind() == Operand::FPREG) { - masm.vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out); - return; - } - - // SSE4.1 vinsertps can handle any single element. 
- unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3); - if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) { - unsigned srcLane; - unsigned dstLane; - if (x >= 4) { - srcLane = x - 4; - dstLane = 0; - } else if (y >= 4) { - srcLane = y - 4; - dstLane = 1; - } else if (z >= 4) { - srcLane = z - 4; - dstLane = 2; - } else { - MOZ_ASSERT(w >= 4); - srcLane = w - 4; - dstLane = 3; - } - masm.vinsertps(masm.vinsertpsMask(srcLane, dstLane), rhs, lhs, out); - return; - } - - FloatRegister rhsCopy = ToFloatRegister(ins->temp()); - - if (x < 4 && y < 4) { - if (w >= 4) { - w %= 4; - // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy) - firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z); - // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out) - secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0); - } else { - MOZ_ASSERT(z >= 4); - z %= 4; - // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy) - firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w); - // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out) - secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2); - } - - masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy); - masm.vshufps(secondMask, rhsCopy, lhs, out); - return; - } - - MOZ_ASSERT(z < 4 && w < 4); - - if (y >= 4) { - y %= 4; - // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy) - firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x); - // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out) - secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w); - } else { - MOZ_ASSERT(x >= 4); - x %= 4; - // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy) - firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y); - // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out) - secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w); - } - - masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy); - if (AssemblerX86Shared::HasAVX()) { - masm.vshufps(secondMask, lhs, rhsCopy, out); - } else { - masm.vshufps(secondMask, lhs, rhsCopy, rhsCopy); - masm.moveSimd128Float(rhsCopy, out); - } - return; - } - - // Two elements from one vector, two other elements from the other - MOZ_ASSERT(numLanesFromLHS == 2); - - // TODO Here and below, symmetric case would be more handy to avoid a move, - // but can't be reached because operands would get swapped (bug 1084404). - if (ins->lanesMatch(2, 3, 6, 7)) { - ScratchSimd128Scope scratch(masm); - if (AssemblerX86Shared::HasAVX()) { - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch); - masm.vmovhlps(lhs, rhsCopy, out); - } else { - masm.loadAlignedSimd128Float(rhs, scratch); - masm.vmovhlps(lhs, scratch, scratch); - masm.moveSimd128Float(scratch, out); - } - return; - } - - if (ins->lanesMatch(0, 1, 4, 5)) { - FloatRegister rhsCopy; - ScratchSimd128Scope scratch(masm); - if (rhs.kind() == Operand::FPREG) { - // No need to make an actual copy, since the operand is already - // in a register, and it won't be clobbered by the vmovlhps. 
- rhsCopy = FloatRegister::FromCode(rhs.fpu()); - } else { - masm.loadAlignedSimd128Float(rhs, scratch); - rhsCopy = scratch; - } - masm.vmovlhps(rhsCopy, lhs, out); - return; - } - - if (ins->lanesMatch(0, 4, 1, 5)) { - masm.vunpcklps(rhs, lhs, out); - return; - } - - // TODO swapped case would be better (bug 1084404) - if (ins->lanesMatch(4, 0, 5, 1)) { - ScratchSimd128Scope scratch(masm); - if (AssemblerX86Shared::HasAVX()) { - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch); - masm.vunpcklps(lhs, rhsCopy, out); - } else { - masm.loadAlignedSimd128Float(rhs, scratch); - masm.vunpcklps(lhs, scratch, scratch); - masm.moveSimd128Float(scratch, out); - } - return; - } - - if (ins->lanesMatch(2, 6, 3, 7)) { - masm.vunpckhps(rhs, lhs, out); - return; - } - - // TODO swapped case would be better (bug 1084404) - if (ins->lanesMatch(6, 2, 7, 3)) { - ScratchSimd128Scope scratch(masm); - if (AssemblerX86Shared::HasAVX()) { - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch); - masm.vunpckhps(lhs, rhsCopy, out); - } else { - masm.loadAlignedSimd128Float(rhs, scratch); - masm.vunpckhps(lhs, scratch, scratch); - masm.moveSimd128Float(scratch, out); - } - return; - } - - // In one vshufps - if (x < 4 && y < 4) { - mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4); - masm.vshufps(mask, rhs, lhs, out); - return; - } - - // At creation, we should have explicitly swapped in this case. - MOZ_ASSERT(!(z >= 4 && w >= 4)); - - // In two vshufps, for the most generic case: - uint32_t firstMask[4], secondMask[4]; - unsigned i = 0, j = 2, k = 0; + unsigned lanes[4]; + for (unsigned i = 0; i < 4; i++) + lanes[i] = ins->lane(i); + Maybe<FloatRegister> maybeTemp; + if (!ins->temp()->isBogusTemp()) + maybeTemp.emplace(ToFloatRegister(ins->temp())); + masm.shuffleX4(lhs, rhs, out, maybeTemp, lanes); +} -#define COMPUTE_MASK(lane) \ - if (lane >= 4) { \ - firstMask[j] = lane % 4; \ - secondMask[k++] = j++; \ - } else { \ - firstMask[i] = lane; \ - secondMask[k++] = i++; \ +static inline Assembler::Condition +ToCondition(MSimdBinaryComp::Operation op) +{ + switch (op) { + case MSimdBinaryComp::greaterThan: return Assembler::GreaterThan; + case MSimdBinaryComp::equal: return Assembler::Equal; + case MSimdBinaryComp::lessThan: return Assembler::LessThan; + case MSimdBinaryComp::notEqual: return Assembler::NotEqual; + case MSimdBinaryComp::greaterThanOrEqual: return Assembler::GreaterThanOrEqual; + case MSimdBinaryComp::lessThanOrEqual: return Assembler::LessThanOrEqual; } - COMPUTE_MASK(x) - COMPUTE_MASK(y) - COMPUTE_MASK(z) - COMPUTE_MASK(w) -#undef COMPUTE_MASK - - MOZ_ASSERT(i == 2 && j == 4 && k == 4); - - mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1], - firstMask[2], firstMask[3]); - masm.vshufps(mask, rhs, lhs, lhs); - - mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1], - secondMask[2], secondMask[3]); - masm.vshufps(mask, lhs, lhs, lhs); + MOZ_CRASH("unexpected cond"); } void CodeGeneratorX86Shared::visitSimdBinaryCompIx16(LSimdBinaryCompIx16* ins) { - static const SimdConstant allOnes = SimdConstant::SplatX16(-1); - FloatRegister lhs = ToFloatRegister(ins->lhs()); Operand rhs = ToOperand(ins->rhs()); FloatRegister output = ToFloatRegister(ins->output()); MOZ_ASSERT_IF(!Assembler::HasAVX(), output == lhs); - ScratchSimd128Scope scratch(masm); - - MSimdBinaryComp::Operation op = ins->operation(); - switch (op) { - case MSimdBinaryComp::greaterThan: - masm.vpcmpgtb(rhs, lhs, output); - return; - case 
MSimdBinaryComp::equal: - masm.vpcmpeqb(rhs, lhs, output); - return; - case MSimdBinaryComp::lessThan: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - - // src := src > lhs (i.e. lhs < rhs) - // Improve by doing custom lowering (rhs is tied to the output register) - masm.vpcmpgtb(ToOperand(ins->lhs()), scratch, scratch); - masm.moveSimd128Int(scratch, output); - return; - case MSimdBinaryComp::notEqual: - // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we - // should invert the comparison by, e.g. swapping the arms of a select - // if that's what it's used in. - masm.loadConstantSimd128Int(allOnes, scratch); - masm.vpcmpeqb(rhs, lhs, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - case MSimdBinaryComp::greaterThanOrEqual: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - masm.vpcmpgtb(ToOperand(ins->lhs()), scratch, scratch); - masm.loadConstantSimd128Int(allOnes, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - case MSimdBinaryComp::lessThanOrEqual: - // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. - masm.loadConstantSimd128Int(allOnes, scratch); - masm.vpcmpgtb(rhs, lhs, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - } - MOZ_CRASH("unexpected SIMD op"); + masm.compareInt8x16(lhs, rhs, ToCondition(ins->operation()), output); } void CodeGeneratorX86Shared::visitSimdBinaryCompIx8(LSimdBinaryCompIx8* ins) { - static const SimdConstant allOnes = SimdConstant::SplatX8(-1); - FloatRegister lhs = ToFloatRegister(ins->lhs()); Operand rhs = ToOperand(ins->rhs()); FloatRegister output = ToFloatRegister(ins->output()); MOZ_ASSERT_IF(!Assembler::HasAVX(), output == lhs); - ScratchSimd128Scope scratch(masm); - - MSimdBinaryComp::Operation op = ins->operation(); - switch (op) { - case MSimdBinaryComp::greaterThan: - masm.vpcmpgtw(rhs, lhs, output); - return; - case MSimdBinaryComp::equal: - masm.vpcmpeqw(rhs, lhs, output); - return; - case MSimdBinaryComp::lessThan: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - - // src := src > lhs (i.e. lhs < rhs) - // Improve by doing custom lowering (rhs is tied to the output register) - masm.vpcmpgtw(ToOperand(ins->lhs()), scratch, scratch); - masm.moveSimd128Int(scratch, output); - return; - case MSimdBinaryComp::notEqual: - // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we - // should invert the comparison by, e.g. swapping the arms of a select - // if that's what it's used in. - masm.loadConstantSimd128Int(allOnes, scratch); - masm.vpcmpeqw(rhs, lhs, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - case MSimdBinaryComp::greaterThanOrEqual: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - masm.vpcmpgtw(ToOperand(ins->lhs()), scratch, scratch); - masm.loadConstantSimd128Int(allOnes, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - case MSimdBinaryComp::lessThanOrEqual: - // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. 
- masm.loadConstantSimd128Int(allOnes, scratch); - masm.vpcmpgtw(rhs, lhs, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - } - MOZ_CRASH("unexpected SIMD op"); + masm.compareInt16x8(lhs, rhs, ToCondition(ins->operation()), output); } void CodeGeneratorX86Shared::visitSimdBinaryCompIx4(LSimdBinaryCompIx4* ins) { - static const SimdConstant allOnes = SimdConstant::SplatX4(-1); - FloatRegister lhs = ToFloatRegister(ins->lhs()); Operand rhs = ToOperand(ins->rhs()); MOZ_ASSERT(ToFloatRegister(ins->output()) == lhs); - ScratchSimd128Scope scratch(masm); - - MSimdBinaryComp::Operation op = ins->operation(); - switch (op) { - case MSimdBinaryComp::greaterThan: - masm.packedGreaterThanInt32x4(rhs, lhs); - return; - case MSimdBinaryComp::equal: - masm.packedEqualInt32x4(rhs, lhs); - return; - case MSimdBinaryComp::lessThan: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - - // src := src > lhs (i.e. lhs < rhs) - // Improve by doing custom lowering (rhs is tied to the output register) - masm.packedGreaterThanInt32x4(ToOperand(ins->lhs()), scratch); - masm.moveSimd128Int(scratch, lhs); - return; - case MSimdBinaryComp::notEqual: - // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we - // should invert the comparison by, e.g. swapping the arms of a select - // if that's what it's used in. - masm.loadConstantSimd128Int(allOnes, scratch); - masm.packedEqualInt32x4(rhs, lhs); - masm.bitwiseXorSimd128(Operand(scratch), lhs); - return; - case MSimdBinaryComp::greaterThanOrEqual: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - masm.packedGreaterThanInt32x4(ToOperand(ins->lhs()), scratch); - masm.loadConstantSimd128Int(allOnes, lhs); - masm.bitwiseXorSimd128(Operand(scratch), lhs); - return; - case MSimdBinaryComp::lessThanOrEqual: - // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. - masm.loadConstantSimd128Int(allOnes, scratch); - masm.packedGreaterThanInt32x4(rhs, lhs); - masm.bitwiseXorSimd128(Operand(scratch), lhs); - return; - } - MOZ_CRASH("unexpected SIMD op"); + masm.compareInt32x4(lhs, rhs, ToCondition(ins->operation()), lhs); } void @@ -3622,27 +2893,7 @@ CodeGeneratorX86Shared::visitSimdBinaryCompFx4(LSimdBinaryCompFx4* ins) Operand rhs = ToOperand(ins->rhs()); FloatRegister output = ToFloatRegister(ins->output()); - MSimdBinaryComp::Operation op = ins->operation(); - switch (op) { - case MSimdBinaryComp::equal: - masm.vcmpeqps(rhs, lhs, output); - return; - case MSimdBinaryComp::lessThan: - masm.vcmpltps(rhs, lhs, output); - return; - case MSimdBinaryComp::lessThanOrEqual: - masm.vcmpleps(rhs, lhs, output); - return; - case MSimdBinaryComp::notEqual: - masm.vcmpneqps(rhs, lhs, output); - return; - case MSimdBinaryComp::greaterThanOrEqual: - case MSimdBinaryComp::greaterThan: - // We reverse these before register allocation so that we don't have to - // copy into and out of temporaries after codegen. 
- MOZ_CRASH("lowering should have reversed this"); - } - MOZ_CRASH("unexpected SIMD op"); + masm.compareFloat32x4(lhs, rhs, ToCondition(ins->operation()), output); } void @@ -3655,10 +2906,10 @@ CodeGeneratorX86Shared::visitSimdBinaryArithIx16(LSimdBinaryArithIx16* ins) MSimdBinaryArith::Operation op = ins->operation(); switch (op) { case MSimdBinaryArith::Op_add: - masm.vpaddb(rhs, lhs, output); + masm.addInt8x16(lhs, rhs, output); return; case MSimdBinaryArith::Op_sub: - masm.vpsubb(rhs, lhs, output); + masm.subInt8x16(lhs, rhs, output); return; case MSimdBinaryArith::Op_mul: // 8x16 mul is a valid operation, but not supported in SSE or AVX. @@ -3685,13 +2936,13 @@ CodeGeneratorX86Shared::visitSimdBinaryArithIx8(LSimdBinaryArithIx8* ins) MSimdBinaryArith::Operation op = ins->operation(); switch (op) { case MSimdBinaryArith::Op_add: - masm.vpaddw(rhs, lhs, output); + masm.addInt16x8(lhs, rhs, output); return; case MSimdBinaryArith::Op_sub: - masm.vpsubw(rhs, lhs, output); + masm.subInt16x8(lhs, rhs, output); return; case MSimdBinaryArith::Op_mul: - masm.vpmullw(rhs, lhs, output); + masm.mulInt16x8(lhs, rhs, output); return; case MSimdBinaryArith::Op_div: case MSimdBinaryArith::Op_max: @@ -3710,35 +2961,19 @@ CodeGeneratorX86Shared::visitSimdBinaryArithIx4(LSimdBinaryArithIx4* ins) Operand rhs = ToOperand(ins->rhs()); FloatRegister output = ToFloatRegister(ins->output()); - ScratchSimd128Scope scratch(masm); - MSimdBinaryArith::Operation op = ins->operation(); switch (op) { case MSimdBinaryArith::Op_add: - masm.vpaddd(rhs, lhs, output); + masm.addInt32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_sub: - masm.vpsubd(rhs, lhs, output); + masm.subInt32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_mul: { - if (AssemblerX86Shared::HasSSE41()) { - masm.vpmulld(rhs, lhs, output); - return; - } - - masm.loadAlignedSimd128Int(rhs, scratch); - masm.vpmuludq(lhs, scratch, scratch); - // scratch contains (Rx, _, Rz, _) where R is the resulting vector. - - FloatRegister temp = ToFloatRegister(ins->temp()); - masm.vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), lhs, lhs); - masm.vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), rhs, temp); - masm.vpmuludq(temp, lhs, lhs); - // lhs contains (Ry, _, Rw, _) where R is the resulting vector. 
- - masm.vshufps(MacroAssembler::ComputeShuffleMask(0, 2, 0, 2), scratch, lhs, lhs); - // lhs contains (Ry, Rw, Rx, Rz) - masm.vshufps(MacroAssembler::ComputeShuffleMask(2, 0, 3, 1), lhs, lhs, lhs); + Maybe<FloatRegister> maybeTemp; + if (!AssemblerX86Shared::HasSSE41()) + maybeTemp.emplace(ToFloatRegister(ins->getTemp(0))); + masm.mulInt32x4(lhs, rhs, maybeTemp, output); return; } case MSimdBinaryArith::Op_div: @@ -3766,104 +3001,34 @@ CodeGeneratorX86Shared::visitSimdBinaryArithFx4(LSimdBinaryArithFx4* ins) Operand rhs = ToOperand(ins->rhs()); FloatRegister output = ToFloatRegister(ins->output()); - ScratchSimd128Scope scratch(masm); - MSimdBinaryArith::Operation op = ins->operation(); switch (op) { case MSimdBinaryArith::Op_add: - masm.vaddps(rhs, lhs, output); + masm.addFloat32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_sub: - masm.vsubps(rhs, lhs, output); + masm.subFloat32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_mul: - masm.vmulps(rhs, lhs, output); + masm.mulFloat32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_div: - masm.vdivps(rhs, lhs, output); + masm.divFloat32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_max: { - FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, scratch); - masm.vcmpunordps(rhs, lhsCopy, scratch); - - FloatRegister tmp = ToFloatRegister(ins->temp()); - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, tmp); - masm.vmaxps(Operand(lhs), rhsCopy, tmp); - masm.vmaxps(rhs, lhs, output); - - masm.vandps(tmp, output, output); - masm.vorps(scratch, output, output); // or in the all-ones NaNs + masm.maxFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output); return; } case MSimdBinaryArith::Op_min: { - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch); - masm.vminps(Operand(lhs), rhsCopy, scratch); - masm.vminps(rhs, lhs, output); - masm.vorps(scratch, output, output); // NaN or'd with arbitrary bits is NaN + masm.minFloat32x4(lhs, rhs, output); return; } case MSimdBinaryArith::Op_minNum: { - FloatRegister tmp = ToFloatRegister(ins->temp()); - masm.loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), tmp); - - FloatRegister mask = scratch; - FloatRegister tmpCopy = masm.reusedInputFloat32x4(tmp, scratch); - masm.vpcmpeqd(Operand(lhs), tmpCopy, mask); - masm.vandps(tmp, mask, mask); - - FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp); - masm.vminps(rhs, lhsCopy, tmp); - masm.vorps(mask, tmp, tmp); - - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask); - masm.vcmpneqps(rhs, rhsCopy, mask); - - if (AssemblerX86Shared::HasAVX()) { - masm.vblendvps(mask, lhs, tmp, output); - } else { - // Emulate vblendvps. - // With SSE.4.1 we could use blendvps, however it's awkward since - // it requires the mask to be in xmm0. 
- if (lhs != output) - masm.moveSimd128Float(lhs, output); - masm.vandps(Operand(mask), output, output); - masm.vandnps(Operand(tmp), mask, mask); - masm.vorps(Operand(mask), output, output); - } + masm.minNumFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output); return; } case MSimdBinaryArith::Op_maxNum: { - FloatRegister mask = scratch; - masm.loadConstantSimd128Int(SimdConstant::SplatX4(0), mask); - masm.vpcmpeqd(Operand(lhs), mask, mask); - - FloatRegister tmp = ToFloatRegister(ins->temp()); - masm.loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), tmp); - masm.vandps(tmp, mask, mask); - - FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp); - masm.vmaxps(rhs, lhsCopy, tmp); - masm.vandnps(Operand(tmp), mask, mask); - - // Ensure tmp always contains the temporary result - mask = tmp; - tmp = scratch; - - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask); - masm.vcmpneqps(rhs, rhsCopy, mask); - - if (AssemblerX86Shared::HasAVX()) { - masm.vblendvps(mask, lhs, tmp, output); - } else { - // Emulate vblendvps. - // With SSE.4.1 we could use blendvps, however it's awkward since - // it requires the mask to be in xmm0. - if (lhs != output) - masm.moveSimd128Float(lhs, output); - masm.vandps(Operand(mask), output, output); - masm.vandnps(Operand(tmp), mask, mask); - masm.vorps(Operand(mask), output, output); - } + masm.maxNumFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output); return; } } @@ -3884,16 +3049,10 @@ CodeGeneratorX86Shared::visitSimdBinarySaturating(LSimdBinarySaturating* ins) case MIRType::Int8x16: switch (ins->operation()) { case MSimdBinarySaturating::add: - if (sign == SimdSign::Signed) - masm.vpaddsb(rhs, lhs, output); - else - masm.vpaddusb(rhs, lhs, output); + masm.addSatInt8x16(lhs, rhs, sign, output); return; case MSimdBinarySaturating::sub: - if (sign == SimdSign::Signed) - masm.vpsubsb(rhs, lhs, output); - else - masm.vpsubusb(rhs, lhs, output); + masm.subSatInt8x16(lhs, rhs, sign, output); return; } break; @@ -3901,16 +3060,10 @@ CodeGeneratorX86Shared::visitSimdBinarySaturating(LSimdBinarySaturating* ins) case MIRType::Int16x8: switch (ins->operation()) { case MSimdBinarySaturating::add: - if (sign == SimdSign::Signed) - masm.vpaddsw(rhs, lhs, output); - else - masm.vpaddusw(rhs, lhs, output); + masm.addSatInt16x8(lhs, rhs, sign, output); return; case MSimdBinarySaturating::sub: - if (sign == SimdSign::Signed) - masm.vpsubsw(rhs, lhs, output); - else - masm.vpsubusw(rhs, lhs, output); + masm.subSatInt16x8(lhs, rhs, sign, output); return; } break; @@ -3927,16 +3080,12 @@ CodeGeneratorX86Shared::visitSimdUnaryArithIx16(LSimdUnaryArithIx16* ins) Operand in = ToOperand(ins->input()); FloatRegister out = ToFloatRegister(ins->output()); - static const SimdConstant allOnes = SimdConstant::SplatX16(-1); - switch (ins->operation()) { case MSimdUnaryArith::neg: - masm.zeroSimd128Int(out); - masm.packedSubInt8(in, out); + masm.negInt8x16(in, out); return; case MSimdUnaryArith::not_: - masm.loadConstantSimd128Int(allOnes, out); - masm.bitwiseXorSimd128(in, out); + masm.notInt8x16(in, out);; return; case MSimdUnaryArith::abs: case MSimdUnaryArith::reciprocalApproximation: @@ -3953,16 +3102,12 @@ CodeGeneratorX86Shared::visitSimdUnaryArithIx8(LSimdUnaryArithIx8* ins) Operand in = ToOperand(ins->input()); FloatRegister out = ToFloatRegister(ins->output()); - static const SimdConstant allOnes = SimdConstant::SplatX8(-1); - switch (ins->operation()) { case MSimdUnaryArith::neg: - masm.zeroSimd128Int(out); - masm.packedSubInt16(in, 
out); + masm.negInt16x8(in, out); return; case MSimdUnaryArith::not_: - masm.loadConstantSimd128Int(allOnes, out); - masm.bitwiseXorSimd128(in, out); + masm.notInt16x8(in, out); return; case MSimdUnaryArith::abs: case MSimdUnaryArith::reciprocalApproximation: @@ -3979,16 +3124,12 @@ CodeGeneratorX86Shared::visitSimdUnaryArithIx4(LSimdUnaryArithIx4* ins) Operand in = ToOperand(ins->input()); FloatRegister out = ToFloatRegister(ins->output()); - static const SimdConstant allOnes = SimdConstant::SplatX4(-1); - switch (ins->operation()) { case MSimdUnaryArith::neg: - masm.zeroSimd128Int(out); - masm.packedSubInt32(in, out); + masm.negInt32x4(in, out); return; case MSimdUnaryArith::not_: - masm.loadConstantSimd128Int(allOnes, out); - masm.bitwiseXorSimd128(in, out); + masm.notInt32x4(in, out); return; case MSimdUnaryArith::abs: case MSimdUnaryArith::reciprocalApproximation: @@ -4005,29 +3146,15 @@ CodeGeneratorX86Shared::visitSimdUnaryArithFx4(LSimdUnaryArithFx4* ins) Operand in = ToOperand(ins->input()); FloatRegister out = ToFloatRegister(ins->output()); - // All ones but the sign bit - float signMask = SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits); - static const SimdConstant signMasks = SimdConstant::SplatX4(signMask); - - // All ones including the sign bit - float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits); - static const SimdConstant allOnes = SimdConstant::SplatX4(ones); - - // All zeros but the sign bit - static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f); - switch (ins->operation()) { case MSimdUnaryArith::abs: - masm.loadConstantSimd128Float(signMasks, out); - masm.bitwiseAndSimd128(in, out); + masm.absFloat32x4(in, out); return; case MSimdUnaryArith::neg: - masm.loadConstantSimd128Float(minusZero, out); - masm.bitwiseXorSimd128(in, out); + masm.negFloat32x4(in, out); return; case MSimdUnaryArith::not_: - masm.loadConstantSimd128Float(allOnes, out); - masm.bitwiseXorSimd128(in, out); + masm.notFloat32x4(in, out); return; case MSimdUnaryArith::reciprocalApproximation: masm.packedRcpApproximationFloat32x4(in, out); @@ -4053,21 +3180,21 @@ CodeGeneratorX86Shared::visitSimdBinaryBitwise(LSimdBinaryBitwise* ins) switch (op) { case MSimdBinaryBitwise::and_: if (ins->type() == MIRType::Float32x4) - masm.vandps(rhs, lhs, output); + masm.bitwiseAndFloat32x4(lhs, rhs, output); else - masm.vpand(rhs, lhs, output); + masm.bitwiseAndSimdInt(lhs, rhs, output); return; case MSimdBinaryBitwise::or_: if (ins->type() == MIRType::Float32x4) - masm.vorps(rhs, lhs, output); + masm.bitwiseOrFloat32x4(lhs, rhs, output); else - masm.vpor(rhs, lhs, output); + masm.bitwiseOrSimdInt(lhs, rhs, output); return; case MSimdBinaryBitwise::xor_: if (ins->type() == MIRType::Float32x4) - masm.vxorps(rhs, lhs, output); + masm.bitwiseXorFloat32x4(lhs, rhs, output); else - masm.vpxor(rhs, lhs, output); + masm.bitwiseXorSimdInt(lhs, rhs, output); return; } MOZ_CRASH("unexpected SIMD bitwise op"); @@ -4079,15 +3206,12 @@ CodeGeneratorX86Shared::visitSimdShift(LSimdShift* ins) FloatRegister out = ToFloatRegister(ins->output()); MOZ_ASSERT(ToFloatRegister(ins->vector()) == out); // defineReuseInput(0); - // The shift amount is masked to the number of bits in a lane. - uint32_t shiftmask = (128u / SimdTypeToLength(ins->type())) - 1; - // Note that SSE doesn't have instructions for shifting 8x16 vectors. // These shifts are synthesized by the MSimdShift::AddLegalized() function. 
const LAllocation* val = ins->value(); if (val->isConstant()) { MOZ_ASSERT(ins->temp()->isBogusTemp()); - Imm32 count(uint32_t(ToInt32(val)) & shiftmask); + Imm32 count(uint32_t(ToInt32(val))); switch (ins->type()) { case MIRType::Int16x8: switch (ins->operation()) { @@ -4121,38 +3245,33 @@ CodeGeneratorX86Shared::visitSimdShift(LSimdShift* ins) MOZ_CRASH("unexpected SIMD bitwise op"); } - // Truncate val to 5 bits. We should have a temp register for that. - MOZ_ASSERT(val->isRegister()); - Register count = ToRegister(ins->temp()); - masm.mov(ToRegister(val), count); - masm.andl(Imm32(shiftmask), count); - ScratchFloat32Scope scratch(masm); - masm.vmovd(count, scratch); + Register temp = ToRegister(ins->temp()); + Register count = ToRegister(val); switch (ins->type()) { case MIRType::Int16x8: switch (ins->operation()) { case MSimdShift::lsh: - masm.packedLeftShiftByScalarInt16x8(scratch, out); + masm.packedLeftShiftByScalarInt16x8(out, count, temp, out); return; case MSimdShift::rsh: - masm.packedRightShiftByScalarInt16x8(scratch, out); + masm.packedRightShiftByScalarInt16x8(out, count, temp, out); return; case MSimdShift::ursh: - masm.packedUnsignedRightShiftByScalarInt16x8(scratch, out); + masm.packedUnsignedRightShiftByScalarInt16x8(out, count, temp, out); return; } break; case MIRType::Int32x4: switch (ins->operation()) { case MSimdShift::lsh: - masm.packedLeftShiftByScalarInt32x4(scratch, out); + masm.packedLeftShiftByScalarInt32x4(out, count, temp, out); return; case MSimdShift::rsh: - masm.packedRightShiftByScalarInt32x4(scratch, out); + masm.packedRightShiftByScalarInt32x4(out, count, temp, out); return; case MSimdShift::ursh: - masm.packedUnsignedRightShiftByScalarInt32x4(scratch, out); + masm.packedUnsignedRightShiftByScalarInt32x4(out, count, temp, out); return; } break; @@ -4171,26 +3290,12 @@ CodeGeneratorX86Shared::visitSimdSelect(LSimdSelect* ins) FloatRegister output = ToFloatRegister(ins->output()); FloatRegister temp = ToFloatRegister(ins->temp()); - if (onTrue != output) - masm.vmovaps(onTrue, output); - if (mask != temp) - masm.vmovaps(mask, temp); - MSimdSelect* mir = ins->mir(); unsigned lanes = SimdTypeToLength(mir->type()); - - if (AssemblerX86Shared::HasAVX() && lanes == 4) { - // TBD: Use vpblendvb for lanes > 4, HasAVX. - masm.vblendvps(mask, onTrue, onFalse, output); - return; - } - - // SSE4.1 has plain blendvps which can do this, but it is awkward - // to use because it requires the mask to be in xmm0. 
- - masm.bitwiseAndSimd128(Operand(temp), output); - masm.bitwiseAndNotSimd128(Operand(onFalse), temp); - masm.bitwiseOrSimd128(Operand(temp), output); + if (lanes == 4) + masm.selectX4(mask, onTrue, onFalse, temp, output); + else + masm.selectSimd128(mask, onTrue, onFalse, temp, output); } void diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.h b/js/src/jit/x86-shared/CodeGenerator-x86-shared.h index 0b4961dddd..4b0664fb63 100644 --- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.h +++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.h @@ -173,12 +173,6 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared void emitTableSwitchDispatch(MTableSwitch* mir, Register index, Register base); - void emitSimdExtractLane8x16(FloatRegister input, Register output, unsigned lane, - SimdSign signedness); - void emitSimdExtractLane16x8(FloatRegister input, Register output, unsigned lane, - SimdSign signedness); - void emitSimdExtractLane32x4(FloatRegister input, Register output, unsigned lane); - public: CodeGeneratorX86Shared(MIRGenerator* gen, LIRGraph* graph, MacroAssembler* masm); diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp new file mode 100644 index 0000000000..0ebf30de1a --- /dev/null +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp @@ -0,0 +1,1227 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "jit/MacroAssembler.h" +#include "jit/x86-shared/MacroAssembler-x86-shared.h" + +#include "jit/MacroAssembler-inl.h" + +using namespace js; +using namespace js::jit; + +using mozilla::DebugOnly; +using mozilla::FloatingPoint; +using mozilla::Maybe; +using mozilla::SpecificNaN; + +void +MacroAssemblerX86Shared::checkedConvertFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest, + Register temp, Label* oolEntry, + Label* rejoin) +{ + // Does the conversion and jumps to the OOL entry if the result value + // is the undefined integer pattern. + static const SimdConstant InvalidResult = SimdConstant::SplatX4(int32_t(-2147483648)); + convertFloat32x4ToInt32x4(src, dest); + + ScratchSimd128Scope scratch(asMasm()); + asMasm().loadConstantSimd128Int(InvalidResult, scratch); + packedEqualInt32x4(Operand(dest), scratch); + // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of + // the two following instructions. 
+ vmovmskps(scratch, temp); + cmp32(temp, Imm32(0)); + j(Assembler::NotEqual, oolEntry); + bind(rejoin); +} + +void +MacroAssemblerX86Shared::oolConvertFloat32x4ToInt32x4(FloatRegister src, Register temp, + Label* rejoin, Label* onConversionError) +{ + static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f); + static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f); + + ScratchSimd128Scope scratch(asMasm()); + asMasm().loadConstantSimd128Float(Int32MinX4, scratch); + vcmpleps(Operand(src), scratch, scratch); + vmovmskps(scratch, temp); + cmp32(temp, Imm32(15)); + j(Assembler::NotEqual, onConversionError); + + asMasm().loadConstantSimd128Float(Int32MaxX4, scratch); + vcmpleps(Operand(src), scratch, scratch); + vmovmskps(scratch, temp); + cmp32(temp, Imm32(0)); + j(Assembler::NotEqual, onConversionError); + + jump(rejoin); +} + +void +MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(FloatRegister in, FloatRegister out, + Register temp, FloatRegister tempF, + Label* failed) +{ + // Classify lane values into 4 disjoint classes: + // + // N-lanes: in <= -1.0 + // A-lanes: -1.0 < in <= 0x0.ffffffp31 + // B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32 + // V-lanes: 0x1.0p32 <= in, or isnan(in) + // + // We need to bail out to throw a RangeError if we see any N-lanes or + // V-lanes. + // + // For A-lanes and B-lanes, we make two float -> int32 conversions: + // + // A = cvttps2dq(in) + // B = cvttps2dq(in - 0x1.0p31f) + // + // Note that the subtraction for the B computation is exact for B-lanes. + // There is no rounding, so B is the low 31 bits of the correctly converted + // result. + // + // The cvttps2dq instruction produces 0x80000000 when the input is NaN or + // out of range for a signed int32_t. This conveniently provides the missing + // high bit for B, so the desired result is A for A-lanes and A|B for + // B-lanes. + + ScratchSimd128Scope scratch(asMasm()); + + // TODO: If the majority of lanes are A-lanes, it could be faster to compute + // A first, use vmovmskps to check for any non-A-lanes and handle them in + // ool code. OTOH, we we're wrong about the lane distribution, that would be + // slower. + + // Compute B in |scratch|. + static const float Adjust = 0x80000000; // 0x1.0p31f for the benefit of MSVC. + static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust); + asMasm().loadConstantSimd128Float(Bias, scratch); + packedAddFloat32(Operand(in), scratch); + convertFloat32x4ToInt32x4(scratch, scratch); + + // Compute A in |out|. This is the last time we use |in| and the first time + // we use |out|, so we can tolerate if they are the same register. + convertFloat32x4ToInt32x4(in, out); + + // We can identify A-lanes by the sign bits in A: Any A-lanes will be + // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a + // mask of non-A-lanes into |tempF|. + zeroSimd128Float(tempF); + packedGreaterThanInt32x4(Operand(out), tempF); + + // Clear the A-lanes in B. + bitwiseAndSimdInt(scratch, Operand(tempF), scratch); + + // Compute the final result: A for A-lanes, A|B for B-lanes. + bitwiseOrSimdInt(out, Operand(scratch), out); + + // We still need to filter out the V-lanes. They would show up as 0x80000000 + // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are + // the remaining negative lanes in B. 
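// (Worked example, for illustration: take one in-range B-lane,
// in = 3221225472.0f = 0x1.8p31.
//   A = cvttps2dq(in)        -> 0x80000000  (out of signed int32 range)
//   B = cvttps2dq(in - 2^31) -> 0x40000000  (exact: the low 31 bits)
//   A | B                    -> 0xC0000000, i.e. 3221225472 as a uint32.
// A V-lane, by contrast, is 0x80000000 in both A and B, survives the masking
// above, and is caught by the sign-bit test below.)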
+ vmovmskps(scratch, temp); + cmp32(temp, Imm32(0)); + j(Assembler::NotEqual, failed); +} + +void +MacroAssemblerX86Shared::createInt32x4(Register lane0, Register lane1, Register lane2, + Register lane3, FloatRegister dest) +{ + if (AssemblerX86Shared::HasSSE41()) { + vmovd(lane0, dest); + vpinsrd(1, lane1, dest, dest); + vpinsrd(2, lane2, dest, dest); + vpinsrd(3, lane3, dest, dest); + return; + } + + asMasm().reserveStack(Simd128DataSize); + store32(lane0, Address(StackPointer, 0 * sizeof(int32_t))); + store32(lane1, Address(StackPointer, 1 * sizeof(int32_t))); + store32(lane2, Address(StackPointer, 2 * sizeof(int32_t))); + store32(lane3, Address(StackPointer, 3 * sizeof(int32_t))); + loadAlignedSimd128Int(Address(StackPointer, 0), dest); + asMasm().freeStack(Simd128DataSize); +} + +void +MacroAssemblerX86Shared::createFloat32x4(FloatRegister lane0, FloatRegister lane1, + FloatRegister lane2, FloatRegister lane3, + FloatRegister temp, FloatRegister output) +{ + FloatRegister lane0Copy = reusedInputFloat32x4(lane0, output); + FloatRegister lane1Copy = reusedInputFloat32x4(lane1, temp); + vunpcklps(lane3, lane1Copy, temp); + vunpcklps(lane2, lane0Copy, output); + vunpcklps(temp, output, output); +} + +void +MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) +{ + vmovd(input, output); + if (AssemblerX86Shared::HasSSSE3()) { + zeroSimd128Int(ScratchSimd128Reg); + vpshufb(ScratchSimd128Reg, output, output); + } else { + // Use two shifts to duplicate the low 8 bits into the low 16 bits. + vpsllw(Imm32(8), output, output); + vmovdqa(output, ScratchSimd128Reg); + vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg); + vpor(ScratchSimd128Reg, output, output); + // Then do an X8 splat. + vpshuflw(0, output, output); + vpshufd(0, output, output); + } +} + +void +MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) +{ + vmovd(input, output); + vpshuflw(0, output, output); + vpshufd(0, output, output); +} + +void +MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) +{ + vmovd(input, output); + vpshufd(0, output, output); +} + +void +MacroAssemblerX86Shared::splatX4(FloatRegister input, FloatRegister output) +{ + FloatRegister inputCopy = reusedInputFloat32x4(input, output); + vshufps(0, inputCopy, inputCopy, output); +} + +void +MacroAssemblerX86Shared::reinterpretSimd(bool isIntegerLaneType, FloatRegister input, + FloatRegister output) +{ + if (input.aliases(output)) + return; + if (isIntegerLaneType) + vmovdqa(input, output); + else + vmovaps(input, output); +} + +void +MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input, Register output, unsigned lane) +{ + if (lane == 0) { + // The value we want to extract is in the low double-word + moveLowInt32(input, output); + } else if (AssemblerX86Shared::HasSSE41()) { + vpextrd(lane, input, output); + } else { + uint32_t mask = MacroAssembler::ComputeShuffleMask(lane); + shuffleInt32(mask, input, ScratchSimd128Reg); + moveLowInt32(ScratchSimd128Reg, output); + } +} + +void +MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input, FloatRegister output, + unsigned lane, bool canonicalize) +{ + if (lane == 0) { + // The value we want to extract is in the low double-word + if (input != output) + moveFloat32(input, output); + } else if (lane == 2) { + moveHighPairToLowPairFloat32(input, output); + } else { + uint32_t mask = MacroAssembler::ComputeShuffleMask(lane); + shuffleFloat32(mask, input, output); + } + // NaNs contained within SIMD values are not enforced to be canonical, 
so + // when we extract an element into a "regular" scalar JS value, we have to + // canonicalize. In wasm code, we can skip this, as wasm only has to + // canonicalize NaNs at FFI boundaries. + if (canonicalize) + asMasm().canonicalizeFloat(output); +} + +void +MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input, Register output, unsigned lane, + SimdSign sign) +{ + // Unlike pextrd and pextrb, this is available in SSE2. + vpextrw(lane, input, output); + if (sign == SimdSign::Signed) + movswl(output, output); +} + +void +MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input, Register output, unsigned lane, + SimdSign sign) +{ + if (AssemblerX86Shared::HasSSE41()) { + vpextrb(lane, input, output); + // vpextrb clears the high bits, so no further extension required. + if (sign == SimdSign::Unsigned) + sign = SimdSign::NotApplicable; + } else { + // Extract the relevant 16 bits containing our lane, then shift the + // right 8 bits into place. + extractLaneInt16x8(input, output, lane / 2, SimdSign::Unsigned); + if (lane % 2) { + shrl(Imm32(8), output); + // The shrl handles the zero-extension. Don't repeat it. + if (sign == SimdSign::Unsigned) + sign = SimdSign::NotApplicable; + } + } + + // We have the right low 8 bits in |output|, but we may need to fix the high + // bits. Note that this requires |output| to be one of the %eax-%edx + // registers. + switch (sign) { + case SimdSign::Signed: + movsbl(output, output); + break; + case SimdSign::Unsigned: + movzbl(output, output); + break; + case SimdSign::NotApplicable: + // No adjustment needed. + break; + } +} + +void +MacroAssemblerX86Shared::extractLaneSimdBool(FloatRegister input, Register output, unsigned numLanes, + unsigned lane) +{ + switch (numLanes) { + case 4: + extractLaneInt32x4(input, output, lane); + break; + case 8: + // Get a lane, don't bother fixing the high bits since we'll mask below. + extractLaneInt16x8(input, output, lane, SimdSign::NotApplicable); + break; + case 16: + extractLaneInt8x16(input, output, lane, SimdSign::NotApplicable); + break; + default: + MOZ_CRASH("Unhandled SIMD number of lanes"); + } + // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits. + asMasm().and32(Imm32(1), output); +} + +void +MacroAssemblerX86Shared::insertLaneSimdInt(FloatRegister input, Register value, FloatRegister output, + unsigned lane, unsigned numLanes) +{ + if (numLanes == 8) { + // Available in SSE 2. + vpinsrw(lane, value, input, output); + return; + } + + // Note that, contrarily to float32x4, we cannot use vmovd if the inserted + // value goes into the first component, as vmovd clears out the higher lanes + // of the output. + if (AssemblerX86Shared::HasSSE41()) { + // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX. + switch (numLanes) { + case 4: + vpinsrd(lane, value, input, output); + return; + case 16: + vpinsrb(lane, value, input, output); + return; + } + } + + asMasm().reserveStack(Simd128DataSize); + storeAlignedSimd128Int(input, Address(StackPointer, 0)); + switch (numLanes) { + case 4: + store32(value, Address(StackPointer, lane * sizeof(int32_t))); + break; + case 16: + // Note that this requires `value` to be in one the registers where the + // low 8 bits are addressible (%eax - %edx on x86, all of them on x86-64). 
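// (Aside: on 32-bit x86 only %al, %bl, %cl and %dl exist as byte registers,
// which is why store8 needs |value| allocated in %eax-%edx there; on x86-64 a
// REX prefix makes the low byte of any GPR addressable.)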
+ store8(value, Address(StackPointer, lane * sizeof(int8_t))); + break; + default: + MOZ_CRASH("Unsupported SIMD numLanes"); + } + loadAlignedSimd128Int(Address(StackPointer, 0), output); + asMasm().freeStack(Simd128DataSize); +} + +void +MacroAssemblerX86Shared::insertLaneFloat32x4(FloatRegister input, FloatRegister value, + FloatRegister output, unsigned lane) +{ + if (lane == 0) { + // As both operands are registers, vmovss doesn't modify the upper bits + // of the destination operand. + if (value != output) + vmovss(value, input, output); + return; + } + + if (AssemblerX86Shared::HasSSE41()) { + // The input value is in the low float32 of the 'value' FloatRegister. + vinsertps(vinsertpsMask(0, lane), value, output, output); + return; + } + + asMasm().reserveStack(Simd128DataSize); + storeAlignedSimd128Float(input, Address(StackPointer, 0)); + asMasm().storeFloat32(value, Address(StackPointer, lane * sizeof(int32_t))); + loadAlignedSimd128Float(Address(StackPointer, 0), output); + asMasm().freeStack(Simd128DataSize); +} + +void +MacroAssemblerX86Shared::allTrueSimdBool(FloatRegister input, Register output) +{ + // We know that the input lanes are boolean, so they are either 0 or -1. + // The all-true vector has all 128 bits set, no matter the lane geometry. + vpmovmskb(input, output); + cmp32(output, Imm32(0xffff)); + emitSet(Assembler::Zero, output); +} + +void +MacroAssemblerX86Shared::anyTrueSimdBool(FloatRegister input, Register output) +{ + vpmovmskb(input, output); + cmp32(output, Imm32(0x0)); + emitSet(Assembler::NonZero, output); +} + +void +MacroAssemblerX86Shared::swizzleInt32x4(FloatRegister input, FloatRegister output, + unsigned lanes[4]) +{ + uint32_t mask = MacroAssembler::ComputeShuffleMask(lanes[0], lanes[1], lanes[2], lanes[3]); + shuffleInt32(mask, input, output); +} + +void +MacroAssemblerX86Shared::swizzleInt8x16(FloatRegister input, FloatRegister output, + const Maybe<Register>& temp, int8_t lanes[16]) +{ + if (AssemblerX86Shared::HasSSSE3()) { + ScratchSimd128Scope scratch(asMasm()); + asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(lanes), scratch); + FloatRegister inputCopy = reusedInputInt32x4(input, output); + vpshufb(scratch, inputCopy, output); + return; + } + + // Worst-case fallback for pre-SSSE3 machines. Bounce through memory. 
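// (Background on the vpshufb path above: each output byte is selected by the
// low four bits of its control byte, and a control byte with the high bit set
// produces zero; the same property lets shuffleInt8x16 below use a negative
// index to blank a lane.)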
+ MOZ_ASSERT(!!temp, "needs a temp for the memory fallback"); + asMasm().reserveStack(2 * Simd128DataSize); + storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize)); + for (unsigned i = 0; i < 16; i++) { + load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *temp); + store8(*temp, Address(StackPointer, i)); + } + loadAlignedSimd128Int(Address(StackPointer, 0), output); + asMasm().freeStack(2 * Simd128DataSize); +} + +static inline bool +LanesMatch(unsigned lanes[4], unsigned x, unsigned y, unsigned z, unsigned w) +{ + return lanes[0] == x && lanes[1] == y && lanes[2] == z && lanes[3] == w; +} + +void +MacroAssemblerX86Shared::swizzleFloat32x4(FloatRegister input, FloatRegister output, + unsigned lanes[4]) +{ + if (AssemblerX86Shared::HasSSE3()) { + if (LanesMatch(lanes, 0, 0, 2, 2)) { + vmovsldup(input, output); + return; + } + if (LanesMatch(lanes, 1, 1, 3, 3)) { + vmovshdup(input, output); + return; + } + } + + // TODO Here and below, arch specific lowering could identify this pattern + // and use defineReuseInput to avoid this move (bug 1084404) + if (LanesMatch(lanes, 2, 3, 2, 3)) { + FloatRegister inputCopy = reusedInputFloat32x4(input, output); + vmovhlps(input, inputCopy, output); + return; + } + + if (LanesMatch(lanes, 0, 1, 0, 1)) { + if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) { + vmovddup(input, output); + return; + } + FloatRegister inputCopy = reusedInputFloat32x4(input, output); + vmovlhps(input, inputCopy, output); + return; + } + + if (LanesMatch(lanes, 0, 0, 1, 1)) { + FloatRegister inputCopy = reusedInputFloat32x4(input, output); + vunpcklps(input, inputCopy, output); + return; + } + + if (LanesMatch(lanes, 2, 2, 3, 3)) { + FloatRegister inputCopy = reusedInputFloat32x4(input, output); + vunpckhps(input, inputCopy, output); + return; + } + + uint32_t x = lanes[0]; + uint32_t y = lanes[1]; + uint32_t z = lanes[2]; + uint32_t w = lanes[3]; + + uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w); + shuffleFloat32(mask, input, output); +} + +void +MacroAssemblerX86Shared::shuffleInt8x16(FloatRegister lhs, FloatRegister rhs, FloatRegister output, + const Maybe<FloatRegister>& maybeFloatTemp, + const Maybe<Register>& maybeTemp, uint8_t lanes[16]) +{ + DebugOnly<bool> hasSSSE3 = AssemblerX86Shared::HasSSSE3(); + MOZ_ASSERT(hasSSSE3 == !!maybeFloatTemp); + MOZ_ASSERT(!hasSSSE3 == !!maybeTemp); + + // Use pshufb if it is available. + if (AssemblerX86Shared::HasSSSE3()) { + ScratchSimd128Scope scratch(asMasm()); + + // Use pshufb instructions to gather the lanes from each source vector. + // A negative index creates a zero lane, so the two vectors can be combined. + + // Set scratch = lanes from lhs. + int8_t idx[16]; + for (unsigned i = 0; i < 16; i++) + idx[i] = lanes[i] < 16 ? lanes[i] : -1; + asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx), *maybeFloatTemp); + FloatRegister lhsCopy = reusedInputInt32x4(lhs, scratch); + vpshufb(*maybeFloatTemp, lhsCopy, scratch); + + // Set output = lanes from rhs. + for (unsigned i = 0; i < 16; i++) + idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1; + asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx), *maybeFloatTemp); + FloatRegister rhsCopy = reusedInputInt32x4(rhs, output); + vpshufb(*maybeFloatTemp, rhsCopy, output); + + // Combine. + vpor(scratch, output, output); + return; + } + + // Worst-case fallback for pre-SSE3 machines. Bounce through memory. 
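// (Illustration of the layout used below: lhs is spilled at [sp + 16] and rhs
// at [sp + 32], so a lane index in 0..15 reads a byte of lhs while an index in
// 16..31 falls straight through into the rhs copy; one load handles both
// sources.)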
+ asMasm().reserveStack(3 * Simd128DataSize); + storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize)); + storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize)); + for (unsigned i = 0; i < 16; i++) { + load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *maybeTemp); + store8(*maybeTemp, Address(StackPointer, i)); + } + loadAlignedSimd128Int(Address(StackPointer, 0), output); + asMasm().freeStack(3 * Simd128DataSize); +} + +void +MacroAssemblerX86Shared::shuffleX4(FloatRegister lhs, Operand rhs, FloatRegister out, + const Maybe<FloatRegister>& maybeTemp, unsigned lanes[4]) +{ + uint32_t x = lanes[0]; + uint32_t y = lanes[1]; + uint32_t z = lanes[2]; + uint32_t w = lanes[3]; + + // Check that lanes come from LHS in majority: + unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4); + MOZ_ASSERT(numLanesFromLHS >= 2); + + // When reading this method, remember that vshufps takes the two first + // inputs of the destination operand (right operand) and the two last + // inputs of the source operand (left operand). + // + // Legend for explanations: + // - L: LHS + // - R: RHS + // - T: temporary + + uint32_t mask; + + // If all lanes came from a single vector, we should use swizzle instead. + MOZ_ASSERT(numLanesFromLHS < 4); + + // If all values stay in their lane, this is a blend. + if (AssemblerX86Shared::HasSSE41()) { + if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) { + vblendps(blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out); + return; + } + } + + // One element of the second, all other elements of the first + if (numLanesFromLHS == 3) { + unsigned firstMask = -1, secondMask = -1; + + // register-register vmovss preserves the high lanes. + if (LanesMatch(lanes, 4, 1, 2, 3) && rhs.kind() == Operand::FPREG) { + vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out); + return; + } + + // SSE4.1 vinsertps can handle any single element. 
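// (Worked example, assuming SSE4.1: lanes = {0, 1, 6, 3} leaves three lanes of
// lhs in place, so numLanesUnchanged below is 3; z = 6 >= 4 gives srcLane = 2
// and dstLane = 2, and a single vinsertps copies rhs lane 2 into output lane 2
// on top of lhs.)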
+ unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3); + if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) { + unsigned srcLane; + unsigned dstLane; + if (x >= 4) { + srcLane = x - 4; + dstLane = 0; + } else if (y >= 4) { + srcLane = y - 4; + dstLane = 1; + } else if (z >= 4) { + srcLane = z - 4; + dstLane = 2; + } else { + MOZ_ASSERT(w >= 4); + srcLane = w - 4; + dstLane = 3; + } + vinsertps(vinsertpsMask(srcLane, dstLane), rhs, lhs, out); + return; + } + + MOZ_ASSERT(!!maybeTemp); + FloatRegister rhsCopy = *maybeTemp; + loadAlignedSimd128Float(rhs, rhsCopy); + + if (x < 4 && y < 4) { + if (w >= 4) { + w %= 4; + // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy) + firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z); + // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out) + secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0); + } else { + MOZ_ASSERT(z >= 4); + z %= 4; + // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy) + firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w); + // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out) + secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2); + } + + vshufps(firstMask, lhs, rhsCopy, rhsCopy); + vshufps(secondMask, rhsCopy, lhs, out); + return; + } + + MOZ_ASSERT(z < 4 && w < 4); + + if (y >= 4) { + y %= 4; + // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy) + firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x); + // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out) + secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w); + } else { + MOZ_ASSERT(x >= 4); + x %= 4; + // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy) + firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y); + // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out) + secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w); + } + + vshufps(firstMask, lhs, rhsCopy, rhsCopy); + if (AssemblerX86Shared::HasAVX()) { + vshufps(secondMask, lhs, rhsCopy, out); + } else { + vshufps(secondMask, lhs, rhsCopy, rhsCopy); + moveSimd128Float(rhsCopy, out); + } + return; + } + + // Two elements from one vector, two other elements from the other + MOZ_ASSERT(numLanesFromLHS == 2); + + // TODO Here and below, symmetric case would be more handy to avoid a move, + // but can't be reached because operands would get swapped (bug 1084404). + if (LanesMatch(lanes, 2, 3, 6, 7)) { + ScratchSimd128Scope scratch(asMasm()); + if (AssemblerX86Shared::HasAVX()) { + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch); + vmovhlps(lhs, rhsCopy, out); + } else { + loadAlignedSimd128Float(rhs, scratch); + vmovhlps(lhs, scratch, scratch); + moveSimd128Float(scratch, out); + } + return; + } + + if (LanesMatch(lanes, 0, 1, 4, 5)) { + FloatRegister rhsCopy; + ScratchSimd128Scope scratch(asMasm()); + if (rhs.kind() == Operand::FPREG) { + // No need to make an actual copy, since the operand is already + // in a register, and it won't be clobbered by the vmovlhps. 
+ rhsCopy = FloatRegister::FromCode(rhs.fpu()); + } else { + loadAlignedSimd128Float(rhs, scratch); + rhsCopy = scratch; + } + vmovlhps(rhsCopy, lhs, out); + return; + } + + if (LanesMatch(lanes, 0, 4, 1, 5)) { + vunpcklps(rhs, lhs, out); + return; + } + + // TODO swapped case would be better (bug 1084404) + if (LanesMatch(lanes, 4, 0, 5, 1)) { + ScratchSimd128Scope scratch(asMasm()); + if (AssemblerX86Shared::HasAVX()) { + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch); + vunpcklps(lhs, rhsCopy, out); + } else { + loadAlignedSimd128Float(rhs, scratch); + vunpcklps(lhs, scratch, scratch); + moveSimd128Float(scratch, out); + } + return; + } + + if (LanesMatch(lanes, 2, 6, 3, 7)) { + vunpckhps(rhs, lhs, out); + return; + } + + // TODO swapped case would be better (bug 1084404) + if (LanesMatch(lanes, 6, 2, 7, 3)) { + ScratchSimd128Scope scratch(asMasm()); + if (AssemblerX86Shared::HasAVX()) { + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch); + vunpckhps(lhs, rhsCopy, out); + } else { + loadAlignedSimd128Float(rhs, scratch); + vunpckhps(lhs, scratch, scratch); + moveSimd128Float(scratch, out); + } + return; + } + + // In one vshufps + if (x < 4 && y < 4) { + mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4); + vshufps(mask, rhs, lhs, out); + return; + } + + // At creation, we should have explicitly swapped in this case. + MOZ_ASSERT(!(z >= 4 && w >= 4)); + + // In two vshufps, for the most generic case: + uint32_t firstMask[4], secondMask[4]; + unsigned i = 0, j = 2, k = 0; + +#define COMPUTE_MASK(lane) \ + if (lane >= 4) { \ + firstMask[j] = lane % 4; \ + secondMask[k++] = j++; \ + } else { \ + firstMask[i] = lane; \ + secondMask[k++] = i++; \ + } + + COMPUTE_MASK(x) + COMPUTE_MASK(y) + COMPUTE_MASK(z) + COMPUTE_MASK(w) +#undef COMPUTE_MASK + + MOZ_ASSERT(i == 2 && j == 4 && k == 4); + + mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1], + firstMask[2], firstMask[3]); + vshufps(mask, rhs, lhs, lhs); + + mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1], + secondMask[2], secondMask[3]); + vshufps(mask, lhs, lhs, lhs); +} + +static inline FloatRegister +ToSimdFloatRegister(const Operand& op) +{ + return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128); +} + +void +MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output) +{ + static const SimdConstant allOnes = SimdConstant::SplatX16(-1); + ScratchSimd128Scope scratch(asMasm()); + switch (cond) { + case Assembler::Condition::GreaterThan: + vpcmpgtb(rhs, lhs, output); + break; + case Assembler::Condition::Equal: + vpcmpeqb(rhs, lhs, output); + break; + case Assembler::Condition::LessThan: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + + // src := src > lhs (i.e. lhs < rhs) + // Improve by doing custom lowering (rhs is tied to the output register) + vpcmpgtb(Operand(lhs), scratch, scratch); + moveSimd128Int(scratch, output); + break; + case Assembler::Condition::NotEqual: + // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we + // should invert the comparison by, e.g. swapping the arms of a select + // if that's what it's used in. 
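// (For reference: SSE2 has no packed "not equal" or "greater or equal" byte
// compare, so NotEqual below is computed as pcmpeqb followed by an XOR with
// all-ones, and the two ordered cases are likewise derived from pcmpgtb plus a
// complement.)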
+ asMasm().loadConstantSimd128Int(allOnes, scratch); + vpcmpeqb(rhs, lhs, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + case Assembler::Condition::GreaterThanOrEqual: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + vpcmpgtb(Operand(lhs), scratch, scratch); + asMasm().loadConstantSimd128Int(allOnes, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + case Assembler::Condition::LessThanOrEqual: + // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. + asMasm().loadConstantSimd128Int(allOnes, scratch); + vpcmpgtb(rhs, lhs, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + default: + MOZ_CRASH("unexpected condition op"); + } +} + +void +MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output) +{ + static const SimdConstant allOnes = SimdConstant::SplatX8(-1); + + ScratchSimd128Scope scratch(asMasm()); + switch (cond) { + case Assembler::Condition::GreaterThan: + vpcmpgtw(rhs, lhs, output); + break; + case Assembler::Condition::Equal: + vpcmpeqw(rhs, lhs, output); + break; + case Assembler::Condition::LessThan: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + + // src := src > lhs (i.e. lhs < rhs) + // Improve by doing custom lowering (rhs is tied to the output register) + vpcmpgtw(Operand(lhs), scratch, scratch); + moveSimd128Int(scratch, output); + break; + case Assembler::Condition::NotEqual: + // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we + // should invert the comparison by, e.g. swapping the arms of a select + // if that's what it's used in. + asMasm().loadConstantSimd128Int(allOnes, scratch); + vpcmpeqw(rhs, lhs, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + case Assembler::Condition::GreaterThanOrEqual: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + vpcmpgtw(Operand(lhs), scratch, scratch); + asMasm().loadConstantSimd128Int(allOnes, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + case Assembler::Condition::LessThanOrEqual: + // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. + asMasm().loadConstantSimd128Int(allOnes, scratch); + vpcmpgtw(rhs, lhs, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + default: + MOZ_CRASH("unexpected condition op"); + } +} + +void +MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output) +{ + static const SimdConstant allOnes = SimdConstant::SplatX4(-1); + ScratchSimd128Scope scratch(asMasm()); + switch (cond) { + case Assembler::Condition::GreaterThan: + packedGreaterThanInt32x4(rhs, lhs); + break; + case Assembler::Condition::Equal: + packedEqualInt32x4(rhs, lhs); + break; + case Assembler::Condition::LessThan: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + + // src := src > lhs (i.e. 
lhs < rhs) + // Improve by doing custom lowering (rhs is tied to the output register) + packedGreaterThanInt32x4(Operand(lhs), scratch); + moveSimd128Int(scratch, lhs); + break; + case Assembler::Condition::NotEqual: + // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we + // should invert the comparison by, e.g. swapping the arms of a select + // if that's what it's used in. + asMasm().loadConstantSimd128Int(allOnes, scratch); + packedEqualInt32x4(rhs, lhs); + bitwiseXorSimdInt(lhs, Operand(scratch), lhs); + break; + case Assembler::Condition::GreaterThanOrEqual: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + packedGreaterThanInt32x4(Operand(lhs), scratch); + asMasm().loadConstantSimd128Int(allOnes, lhs); + bitwiseXorSimdInt(lhs, Operand(scratch), lhs); + break; + case Assembler::Condition::LessThanOrEqual: + // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. + asMasm().loadConstantSimd128Int(allOnes, scratch); + packedGreaterThanInt32x4(rhs, lhs); + bitwiseXorSimdInt(lhs, Operand(scratch), lhs); + break; + default: + MOZ_CRASH("unexpected condition op"); + } +} + +void +MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output) +{ + switch (cond) { + case Assembler::Condition::Equal: + vcmpeqps(rhs, lhs, output); + break; + case Assembler::Condition::LessThan: + vcmpltps(rhs, lhs, output); + break; + case Assembler::Condition::LessThanOrEqual: + vcmpleps(rhs, lhs, output); + break; + case Assembler::Condition::NotEqual: + vcmpneqps(rhs, lhs, output); + break; + case Assembler::Condition::GreaterThanOrEqual: + case Assembler::Condition::GreaterThan: + // We reverse these before register allocation so that we don't have to + // copy into and out of temporaries after codegen. + MOZ_CRASH("should have reversed this"); + default: + MOZ_CRASH("unexpected condition op"); + } +} + +void +MacroAssemblerX86Shared::mulInt32x4(FloatRegister lhs, Operand rhs, + const Maybe<FloatRegister>& temp, FloatRegister output) +{ + if (AssemblerX86Shared::HasSSE41()) { + vpmulld(rhs, lhs, output); + return; + } + + ScratchSimd128Scope scratch(asMasm()); + loadAlignedSimd128Int(rhs, scratch); + vpmuludq(lhs, scratch, scratch); + // scratch contains (Rx, _, Rz, _) where R is the resulting vector. + + MOZ_ASSERT(!!temp); + vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), lhs, lhs); + vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), rhs, *temp); + vpmuludq(*temp, lhs, lhs); + // lhs contains (Ry, _, Rw, _) where R is the resulting vector. 
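// (For reference: vpmuludq only multiplies the even 32-bit lanes (0 and 2),
// producing two 64-bit products whose low halves sit in lanes 0 and 2; the odd
// lanes are shuffled down and multiplied the same way, and the two vshufps
// below interleave the four results back into (Rx, Ry, Rz, Rw) order.)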
+ + vshufps(MacroAssembler::ComputeShuffleMask(0, 2, 0, 2), scratch, lhs, lhs); + // lhs contains (Ry, Rw, Rx, Rz) + vshufps(MacroAssembler::ComputeShuffleMask(2, 0, 3, 1), lhs, lhs, lhs); +} + +void +MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) +{ + ScratchSimd128Scope scratch(asMasm()); + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch); + vminps(Operand(lhs), rhsCopy, scratch); + vminps(rhs, lhs, output); + vorps(scratch, output, output); // NaN or'd with arbitrary bits is NaN +} + +void +MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, + FloatRegister output) +{ + ScratchSimd128Scope scratch(asMasm()); + FloatRegister lhsCopy = reusedInputFloat32x4(lhs, scratch); + vcmpunordps(rhs, lhsCopy, scratch); + + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, temp); + vmaxps(Operand(lhs), rhsCopy, temp); + vmaxps(rhs, lhs, output); + + vandps(temp, output, output); + vorps(scratch, output, output); // or in the all-ones NaNs +} + +void +MacroAssemblerX86Shared::minNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, + FloatRegister output) +{ + ScratchSimd128Scope scratch(asMasm()); + asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), temp); + + FloatRegister mask = scratch; + FloatRegister tmpCopy = reusedInputFloat32x4(temp, scratch); + vpcmpeqd(Operand(lhs), tmpCopy, mask); + vandps(temp, mask, mask); + + FloatRegister lhsCopy = reusedInputFloat32x4(lhs, temp); + vminps(rhs, lhsCopy, temp); + vorps(mask, temp, temp); + + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, mask); + vcmpneqps(rhs, rhsCopy, mask); + + if (AssemblerX86Shared::HasAVX()) { + vblendvps(mask, lhs, temp, output); + } else { + // Emulate vblendvps. + // With SSE.4.1 we could use blendvps, however it's awkward since + // it requires the mask to be in xmm0. + if (lhs != output) + moveSimd128Float(lhs, output); + vandps(Operand(mask), output, output); + vandnps(Operand(temp), mask, mask); + vorps(Operand(mask), output, output); + } +} + +void +MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, + FloatRegister output) +{ + ScratchSimd128Scope scratch(asMasm()); + FloatRegister mask = scratch; + + asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(0), mask); + vpcmpeqd(Operand(lhs), mask, mask); + + asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), temp); + vandps(temp, mask, mask); + + FloatRegister lhsCopy = reusedInputFloat32x4(lhs, temp); + vmaxps(rhs, lhsCopy, temp); + vandnps(Operand(temp), mask, mask); + + // Ensure temp always contains the temporary result + mask = temp; + temp = scratch; + + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, mask); + vcmpneqps(rhs, rhsCopy, mask); + + if (AssemblerX86Shared::HasAVX()) { + vblendvps(mask, lhs, temp, output); + } else { + // Emulate vblendvps. + // With SSE.4.1 we could use blendvps, however it's awkward since + // it requires the mask to be in xmm0. 
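// (For reference, the emulation below computes
//     output = (lhs & mask) | (temp & ~mask)
// i.e. a bitwise select that keeps lhs wherever the NaN mask is set and takes
// the computed max from temp elsewhere; this matches what vblendvps produces
// on the AVX path.)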
+ if (lhs != output) + moveSimd128Float(lhs, output); + vandps(Operand(mask), output, output); + vandnps(Operand(temp), mask, mask); + vorps(Operand(mask), output, output); + } +} + +void +MacroAssemblerX86Shared::negFloat32x4(Operand in, FloatRegister out) +{ + // All zeros but the sign bit + static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f); + asMasm().loadConstantSimd128Float(minusZero, out); + bitwiseXorFloat32x4(out, in, out); +} + +void +MacroAssemblerX86Shared::notInt8x16(Operand in, FloatRegister out) +{ + static const SimdConstant allOnes = SimdConstant::SplatX16(-1); + asMasm().loadConstantSimd128Int(allOnes, out); + bitwiseXorSimdInt(out, in, out); +} + +void +MacroAssemblerX86Shared::notInt16x8(Operand in, FloatRegister out) +{ + static const SimdConstant allOnes = SimdConstant::SplatX8(-1); + asMasm().loadConstantSimd128Int(allOnes, out); + bitwiseXorSimdInt(out, in, out); +} + +void +MacroAssemblerX86Shared::notInt32x4(Operand in, FloatRegister out) +{ + static const SimdConstant allOnes = SimdConstant::SplatX4(-1); + asMasm().loadConstantSimd128Int(allOnes, out); + bitwiseXorSimdInt(out, in, out); +} + +void +MacroAssemblerX86Shared::notFloat32x4(Operand in, FloatRegister out) +{ + float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits); + static const SimdConstant allOnes = SimdConstant::SplatX4(ones); + asMasm().loadConstantSimd128Float(allOnes, out); + bitwiseXorFloat32x4(out, in, out); +} + +void +MacroAssemblerX86Shared::absFloat32x4(Operand in, FloatRegister out) +{ + // All ones but the sign bit + float signMask = SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits); + static const SimdConstant signMasks = SimdConstant::SplatX4(signMask); + asMasm().loadConstantSimd128Float(signMasks, out); + bitwiseAndFloat32x4(out, in, out); +} + +static inline void +MaskSimdShiftCount(MacroAssembler& masm, unsigned shiftmask, Register count, Register temp, + FloatRegister dest) +{ + masm.mov(count, temp); + masm.andl(Imm32(shiftmask), temp); + masm.vmovd(temp, dest); +} + +void +MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(FloatRegister in, Register count, + Register temp, FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 15, count, temp, scratch); + vpsllw(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(FloatRegister in, Register count, + Register temp, FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 15, count, temp, scratch); + vpsraw(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(FloatRegister in, Register count, + Register temp, FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 15, count, temp, scratch); + vpsrlw(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(FloatRegister in, Register count, + Register temp, FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 31, count, temp, scratch); + vpslld(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(FloatRegister in, Register count, + Register temp, FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 31, count, temp, scratch); + vpsrad(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(FloatRegister in, Register count, + Register temp, 
FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 31, count, temp, scratch); + vpsrld(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::selectSimd128(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse, + FloatRegister temp, FloatRegister output) +{ + if (onTrue != output) + vmovaps(onTrue, output); + if (mask != temp) + vmovaps(mask, temp); + + // SSE4.1 has plain blendvps which can do this, but it is awkward + // to use because it requires the mask to be in xmm0. + + bitwiseAndSimdInt(output, Operand(temp), output); + bitwiseAndNotSimdInt(temp, Operand(onFalse), temp); + bitwiseOrSimdInt(output, Operand(temp), output); +} diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h index 36f3a008a9..f308e41fd8 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h @@ -1123,9 +1123,9 @@ MacroAssembler::canonicalizeFloat32x4(FloatRegister reg, FloatRegister scratch) float nanf = float(JS::GenericNaN()); loadConstantSimd128Float(SimdConstant::SplatX4(nanf), ifFalse); - bitwiseAndSimd128(Operand(mask), reg); - bitwiseAndNotSimd128(Operand(ifFalse), mask); - bitwiseOrSimd128(Operand(mask), reg); + bitwiseAndFloat32x4(reg, Operand(mask), reg); + bitwiseAndNotFloat32x4(mask, Operand(ifFalse), mask); + bitwiseOrFloat32x4(reg, Operand(mask), reg); } // ======================================================================== diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h index e7783736b2..25b3b846da 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h @@ -820,20 +820,179 @@ class MacroAssemblerX86Shared : public Assembler vcvtdq2ps(src, dest); } - void bitwiseAndSimd128(const Operand& src, FloatRegister dest) { - // TODO Using the "ps" variant for all types incurs a domain crossing - // penalty for integer types and double. - vandps(src, dest, dest); + // SIMD methods, defined in MacroAssembler-x86-shared-SIMD.cpp. 
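// (For reference: these declarations replace the former type-agnostic
// bitwiseAndSimd128 and friends, which always used the "ps" forms; the
// Float32x4 and SimdInt variants below let integer data use vpand/vpor/vpxor
// and avoid the domain-crossing penalty the old TODO mentioned.)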
+ void checkedConvertFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest, Register temp, + Label* oolCheck, Label* rejoin); + void oolConvertFloat32x4ToInt32x4(FloatRegister src, Register temp, Label* rejoin, + Label* onConversionError); + void checkedConvertFloat32x4ToUint32x4(FloatRegister src, FloatRegister dest, Register temp, + FloatRegister tempF, Label* failed); + + void createInt32x4(Register lane0, Register lane1, Register lane2, Register lane3, + FloatRegister dest); + void createFloat32x4(FloatRegister lane0, FloatRegister lane1, FloatRegister lane2, + FloatRegister lane3, FloatRegister temp, FloatRegister output); + + void splatX16(Register input, FloatRegister output); + void splatX8(Register input, FloatRegister output); + void splatX4(Register input, FloatRegister output); + void splatX4(FloatRegister input, FloatRegister output); + + void reinterpretSimd(bool isIntegerLaneType, FloatRegister input, FloatRegister output); + + void extractLaneInt32x4(FloatRegister input, Register output, unsigned lane); + void extractLaneFloat32x4(FloatRegister input, FloatRegister output, unsigned lane, + bool canonicalize); + void extractLaneInt16x8(FloatRegister input, Register output, unsigned lane, SimdSign sign); + void extractLaneInt8x16(FloatRegister input, Register output, unsigned lane, SimdSign sign); + void extractLaneSimdBool(FloatRegister input, Register output, unsigned numLanes, unsigned lane); + + void insertLaneSimdInt(FloatRegister input, Register value, FloatRegister output, + unsigned lane, unsigned numLanes); + void insertLaneFloat32x4(FloatRegister input, FloatRegister value, FloatRegister output, + unsigned lane); + + void allTrueSimdBool(FloatRegister input, Register output); + void anyTrueSimdBool(FloatRegister input, Register output); + + void swizzleInt32x4(FloatRegister input, FloatRegister output, unsigned lanes[4]); + void swizzleFloat32x4(FloatRegister input, FloatRegister output, unsigned lanes[4]); + void swizzleInt8x16(FloatRegister input, FloatRegister output, + const mozilla::Maybe<Register>& temp, int8_t lanes[16]); + + void shuffleX4(FloatRegister lhs, Operand rhs, FloatRegister out, + const mozilla::Maybe<FloatRegister>& maybeTemp, unsigned lanes[4]); + void shuffleInt8x16(FloatRegister lhs, FloatRegister rhs, FloatRegister output, + const mozilla::Maybe<FloatRegister>& maybeFloatTemp, + const mozilla::Maybe<Register>& maybeTemp, uint8_t lanes[16]); + + void compareInt8x16(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output); + void compareInt16x8(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output); + void compareInt32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output); + void compareFloat32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output); + + void addInt8x16(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpaddb(rhs, lhs, output); + } + void addInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpaddw(rhs, lhs, output); + } + void addInt32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpaddd(rhs, lhs, output); + } + void addFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vaddps(rhs, lhs, output); + } + + void addSatInt8x16(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) { + if (sign == SimdSign::Signed) + vpaddsb(rhs, lhs, output); + else + vpaddusb(rhs, lhs, output); + } + void addSatInt16x8(FloatRegister lhs, Operand rhs, SimdSign sign, 
FloatRegister output) { + if (sign == SimdSign::Signed) + vpaddsw(rhs, lhs, output); + else + vpaddusw(rhs, lhs, output); + } + + void subInt8x16(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpsubb(rhs, lhs, output); + } + void subInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpsubw(rhs, lhs, output); + } + void subInt32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpsubd(rhs, lhs, output); + } + void subFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vsubps(rhs, lhs, output); + } + + void subSatInt8x16(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) { + if (sign == SimdSign::Signed) + vpsubsb(rhs, lhs, output); + else + vpsubusb(rhs, lhs, output); + } + void subSatInt16x8(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) { + if (sign == SimdSign::Signed) + vpsubsw(rhs, lhs, output); + else + vpsubusw(rhs, lhs, output); + } + + void mulInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpmullw(rhs, lhs, output); + } + void mulInt32x4(FloatRegister lhs, Operand rhs, const mozilla::Maybe<FloatRegister>& temp, + FloatRegister output); + void mulFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vmulps(rhs, lhs, output); + } + + void negInt8x16(Operand in, FloatRegister out) { + zeroSimd128Int(out); + packedSubInt8(in, out); + } + void negInt16x8(Operand in, FloatRegister out) { + zeroSimd128Int(out); + packedSubInt16(in, out); + } + void negInt32x4(Operand in, FloatRegister out) { + zeroSimd128Int(out); + packedSubInt32(in, out); + } + void negFloat32x4(Operand in, FloatRegister out); + + void notInt8x16(Operand in, FloatRegister out); + void notInt16x8(Operand in, FloatRegister out); + void notInt32x4(Operand in, FloatRegister out); + void notFloat32x4(Operand in, FloatRegister out); + + void divFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vdivps(rhs, lhs, output); } - void bitwiseAndNotSimd128(const Operand& src, FloatRegister dest) { - vandnps(src, dest, dest); + void minFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output); + void maxFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output); + void minNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output); + void maxNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output); + + void absFloat32x4(Operand in, FloatRegister out); + + void bitwiseAndFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vandps(rhs, lhs, dest); + } + void bitwiseAndSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vpand(rhs, lhs, dest); + } + + void bitwiseOrFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vorps(rhs, lhs, dest); + } + void bitwiseOrSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vpor(rhs, lhs, dest); } - void bitwiseOrSimd128(const Operand& src, FloatRegister dest) { - vorps(src, dest, dest); + + void bitwiseXorFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vxorps(rhs, lhs, dest); + } + void bitwiseXorSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vpxor(rhs, lhs, dest); + } + + void bitwiseAndNotFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vandnps(rhs, lhs, dest); } - void bitwiseXorSimd128(const Operand& src, FloatRegister dest) { - vxorps(src, dest, dest); + void bitwiseAndNotSimdInt(FloatRegister lhs, const Operand& rhs, 
FloatRegister dest) { + vpandn(rhs, lhs, dest); } + void zeroSimd128Float(FloatRegister dest) { vxorps(dest, dest, dest); } @@ -841,6 +1000,16 @@ class MacroAssemblerX86Shared : public Assembler vpxor(dest, dest, dest); } + void selectSimd128(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse, + FloatRegister temp, FloatRegister output); + void selectX4(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse, + FloatRegister temp, FloatRegister output) { + if (AssemblerX86Shared::HasAVX()) + vblendvps(mask, onTrue, onFalse, output); + else + selectSimd128(mask, onTrue, onFalse, temp, output); + } + template <class T, class Reg> inline void loadScalar(const Operand& src, Reg dest); template <class T, class Reg> inline void storeScalar(Reg src, const Address& dest); template <class T> inline void loadAlignedVector(const Address& src, FloatRegister dest); @@ -987,41 +1156,38 @@ class MacroAssemblerX86Shared : public Assembler vsqrtps(src, dest); } - void packedLeftShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) { - vpsllw(src, dest, dest); - } + public: + void packedLeftShiftByScalarInt16x8(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedRightShiftByScalarInt16x8(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedUnsignedRightShiftByScalarInt16x8(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedLeftShiftByScalarInt16x8(Imm32 count, FloatRegister dest) { + count.value &= 15; vpsllw(count, dest, dest); } - void packedRightShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) { - vpsraw(src, dest, dest); - } void packedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) { + count.value &= 15; vpsraw(count, dest, dest); } - void packedUnsignedRightShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) { - vpsrlw(src, dest, dest); - } void packedUnsignedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) { + count.value &= 15; vpsrlw(count, dest, dest); } - void packedLeftShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) { - vpslld(src, dest, dest); - } + void packedLeftShiftByScalarInt32x4(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedRightShiftByScalarInt32x4(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedUnsignedRightShiftByScalarInt32x4(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedLeftShiftByScalarInt32x4(Imm32 count, FloatRegister dest) { + count.value &= 31; vpslld(count, dest, dest); } - void packedRightShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) { - vpsrad(src, dest, dest); - } void packedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) { + count.value &= 31; vpsrad(count, dest, dest); } - void packedUnsignedRightShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) { - vpsrld(src, dest, dest); - } void packedUnsignedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) { + count.value &= 31; vpsrld(count, dest, dest); } diff --git a/js/src/moz.build b/js/src/moz.build index 8e14de6e85..59feedf22d 100644 --- a/js/src/moz.build +++ b/js/src/moz.build @@ -431,6 +431,7 @@ elif CONFIG['JS_CODEGEN_X86'] or CONFIG['JS_CODEGEN_X64']: 'jit/x86-shared/CodeGenerator-x86-shared.cpp', 'jit/x86-shared/Disassembler-x86-shared.cpp', # using namespace js::jit::X86Encoding; 'jit/x86-shared/Lowering-x86-shared.cpp', + 'jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp', 
'jit/x86-shared/MacroAssembler-x86-shared.cpp', 'jit/x86-shared/MoveEmitter-x86-shared.cpp', ]
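Taken together with the code-generator change above that drops the explicit "& shiftmask" on constant shift counts, the new helpers make every SIMD shift count wrap modulo the lane width: the Imm32 overloads mask with 15 or 31 directly, and the register forms go through MaskSimdShiftCount. A minimal standalone C++ sketch of that behaviour (illustrative only, not part of the patch):

#include <cstdint>
#include <cstdio>

// Scalar model of the shift-count handling: counts are reduced modulo the
// lane width (mask 15 for 16-bit lanes, 31 for 32-bit lanes), as the Imm32
// overloads and MaskSimdShiftCount now do.
static uint32_t LaneShiftLeft32(uint32_t lane, uint32_t count) {
    return lane << (count & 31);
}

int main() {
    printf("%u\n", LaneShiftLeft32(4, 1));   // 8
    printf("%u\n", LaneShiftLeft32(4, 33));  // also 8: 33 & 31 == 1
    return 0;
}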