author | Moonchild <moonchild@palemoon.org> | 2023-09-12 23:28:49 +0200
committer | Moonchild <moonchild@palemoon.org> | 2023-09-12 23:28:49 +0200
commit | 1a7f79ef9acde005dd78984aeb5917af525960d6 (patch)
tree | 56845f1dfffc062d2a22719c464e25535c5f864c /js
parent | 281497201e52d95b1592e28ba59431ad4ae3bfeb (diff)
download | uxp-1a7f79ef9acde005dd78984aeb5917af525960d6.tar.gz
Issue #2307 - Part 2: Move SIMD code generation to masm methods
Diffstat (limited to 'js')
-rw-r--r-- | js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp | 1185
-rw-r--r-- | js/src/jit/x86-shared/CodeGenerator-x86-shared.h | 6
-rw-r--r-- | js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp | 1227
-rw-r--r-- | js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h | 6
-rw-r--r-- | js/src/jit/x86-shared/MacroAssembler-x86-shared.h | 222
-rw-r--r-- | js/src/moz.build | 1
6 files changed, 1570 insertions, 1077 deletions
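The whole patch follows a single mechanical pattern: instruction selection that used to be emitted inline in CodeGeneratorX86Shared visitor methods moves into named MacroAssemblerX86Shared methods, and each visitor shrinks to one masm call. Below is a minimal sketch of that shape, condensed from the splatX8 hunks in the diff that follows; it is an illustration rather than a verbatim excerpt, and it assumes the usual includes and class declarations from the surrounding tree.

// Sketch of the pattern applied throughout this commit, condensed from the
// splatX8 hunks below (declarations and includes from the tree are assumed).

// CodeGenerator-x86-shared.cpp: the visitor no longer emits instructions
// itself; it unpacks its LIR operands and calls one MacroAssembler method.
void
CodeGeneratorX86Shared::visitSimdSplatX8(LSimdSplatX8* ins)
{
    MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 8);
    Register input = ToRegister(ins->getOperand(0));
    FloatRegister output = ToFloatRegister(ins->output());
    // Previously emitted here: vmovd + vpshuflw(0) + vpshufd(0).
    masm.splatX8(input, output);
}

// MacroAssembler-x86-shared-SIMD.cpp: the emission sequence moves, unchanged,
// into a reusable masm method (the new file added by this commit).
void
MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output)
{
    vmovd(input, output);        // scalar into the low 32-bit lane
    vpshuflw(0, output, output); // broadcast the low 16-bit word across the low half
    vpshufd(0, output, output);  // broadcast the low 32 bits across all four dwords
}

The more involved cases (checkedConvertFloat32x4ToInt32x4, the swizzles/shuffles, the comparisons) follow the same shape, with the code generator passing through labels for the out-of-line paths and Maybe<Register>/Maybe<FloatRegister> temps where the pre-SSSE3/SSE4.1 fallbacks need them.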
diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp index 5ec00da849..9858836e7d 100644 --- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp +++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp @@ -27,6 +27,7 @@ using mozilla::BitwiseCast; using mozilla::DebugOnly; using mozilla::FloatingPoint; using mozilla::FloorLog2; +using mozilla::Maybe; using mozilla::NegativeInfinity; using mozilla::SpecificNaN; @@ -2458,51 +2459,18 @@ CodeGeneratorX86Shared::visitFloat32x4ToInt32x4(LFloat32x4ToInt32x4* ins) FloatRegister out = ToFloatRegister(ins->output()); Register temp = ToRegister(ins->temp()); - masm.convertFloat32x4ToInt32x4(in, out); - auto* ool = new(alloc()) OutOfLineSimdFloatToIntCheck(temp, in, ins, ins->mir()->trapOffset()); addOutOfLineCode(ool, ins->mir()); - static const SimdConstant InvalidResult = SimdConstant::SplatX4(int32_t(-2147483648)); - - ScratchSimd128Scope scratch(masm); - masm.loadConstantSimd128Int(InvalidResult, scratch); - masm.packedEqualInt32x4(Operand(out), scratch); - // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of - // the two following instructions. - masm.vmovmskps(scratch, temp); - masm.cmp32(temp, Imm32(0)); - masm.j(Assembler::NotEqual, ool->entry()); - - masm.bind(ool->rejoin()); + masm.checkedConvertFloat32x4ToInt32x4(in, out, temp, ool->entry(), ool->rejoin()); } void -CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIntCheck *ool) +CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIntCheck* ool) { - static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f); - static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f); - Label onConversionError; - FloatRegister input = ool->input(); - Register temp = ool->temp(); - - ScratchSimd128Scope scratch(masm); - masm.loadConstantSimd128Float(Int32MinX4, scratch); - masm.vcmpleps(Operand(input), scratch, scratch); - masm.vmovmskps(scratch, temp); - masm.cmp32(temp, Imm32(15)); - masm.j(Assembler::NotEqual, &onConversionError); - - masm.loadConstantSimd128Float(Int32MaxX4, scratch); - masm.vcmpleps(Operand(input), scratch, scratch); - masm.vmovmskps(scratch, temp); - masm.cmp32(temp, Imm32(0)); - masm.j(Assembler::NotEqual, &onConversionError); - - masm.jump(ool->rejoin()); - + masm.oolConvertFloat32x4ToInt32x4(ool->input(), ool->temp(), ool->rejoin(), &onConversionError); if (gen->compilingWasm()) { masm.bindLater(&onConversionError, trap(ool, wasm::Trap::ImpreciseSimdConversion)); } else { @@ -2512,105 +2480,39 @@ CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIn } // Convert Float32x4 to Uint32x4. -// // If any input lane value is out of range or NaN, bail out. void CodeGeneratorX86Shared::visitFloat32x4ToUint32x4(LFloat32x4ToUint32x4* ins) { - const MSimdConvert* mir = ins->mir(); FloatRegister in = ToFloatRegister(ins->input()); FloatRegister out = ToFloatRegister(ins->output()); Register temp = ToRegister(ins->tempR()); FloatRegister tempF = ToFloatRegister(ins->tempF()); - // Classify lane values into 4 disjoint classes: - // - // N-lanes: in <= -1.0 - // A-lanes: -1.0 < in <= 0x0.ffffffp31 - // B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32 - // V-lanes: 0x1.0p32 <= in, or isnan(in) - // - // We need to bail out to throw a RangeError if we see any N-lanes or - // V-lanes. 
- // - // For A-lanes and B-lanes, we make two float -> int32 conversions: - // - // A = cvttps2dq(in) - // B = cvttps2dq(in - 0x1.0p31f) - // - // Note that the subtraction for the B computation is exact for B-lanes. - // There is no rounding, so B is the low 31 bits of the correctly converted - // result. - // - // The cvttps2dq instruction produces 0x80000000 when the input is NaN or - // out of range for a signed int32_t. This conveniently provides the missing - // high bit for B, so the desired result is A for A-lanes and A|B for - // B-lanes. - - ScratchSimd128Scope scratch(masm); - - // TODO: If the majority of lanes are A-lanes, it could be faster to compute - // A first, use vmovmskps to check for any non-A-lanes and handle them in - // ool code. OTOH, we we're wrong about the lane distribution, that would be - // slower. - - // Compute B in |scratch|. - static const float Adjust = 0x80000000; // 0x1.0p31f for the benefit of MSVC. - static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust); - masm.loadConstantSimd128Float(Bias, scratch); - masm.packedAddFloat32(Operand(in), scratch); - masm.convertFloat32x4ToInt32x4(scratch, scratch); - - // Compute A in |out|. This is the last time we use |in| and the first time - // we use |out|, so we can tolerate if they are the same register. - masm.convertFloat32x4ToInt32x4(in, out); - - // We can identify A-lanes by the sign bits in A: Any A-lanes will be - // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a - // mask of non-A-lanes into |tempF|. - masm.zeroSimd128Float(tempF); - masm.packedGreaterThanInt32x4(Operand(out), tempF); - - // Clear the A-lanes in B. - masm.bitwiseAndSimd128(Operand(tempF), scratch); - - // Compute the final result: A for A-lanes, A|B for B-lanes. - masm.bitwiseOrSimd128(Operand(scratch), out); - - // We still need to filter out the V-lanes. They would show up as 0x80000000 - // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are - // the remaining negative lanes in B. 
- masm.vmovmskps(scratch, temp); - masm.cmp32(temp, Imm32(0)); + Label failed; + masm.checkedConvertFloat32x4ToUint32x4(in, out, temp, tempF, &failed); + Label ok; + masm.jump(&ok); + masm.bind(&failed); if (gen->compilingWasm()) - masm.j(Assembler::NotEqual, trap(mir, wasm::Trap::ImpreciseSimdConversion)); + masm.j(Assembler::NotEqual, trap(ins->mir(), wasm::Trap::ImpreciseSimdConversion)); else - bailoutIf(Assembler::NotEqual, ins->snapshot()); +// bailoutIf(Assembler::NotEqual, ins->snapshot()); + bailout(ins->snapshot()); + masm.bind(&ok); } void CodeGeneratorX86Shared::visitSimdValueInt32x4(LSimdValueInt32x4* ins) { MOZ_ASSERT(ins->mir()->type() == MIRType::Int32x4 || ins->mir()->type() == MIRType::Bool32x4); - - FloatRegister output = ToFloatRegister(ins->output()); - if (AssemblerX86Shared::HasSSE41()) { - masm.vmovd(ToRegister(ins->getOperand(0)), output); - for (size_t i = 1; i < 4; ++i) { - Register r = ToRegister(ins->getOperand(i)); - masm.vpinsrd(i, r, output, output); - } - return; - } - - masm.reserveStack(Simd128DataSize); - for (size_t i = 0; i < 4; ++i) { - Register r = ToRegister(ins->getOperand(i)); - masm.store32(r, Address(StackPointer, i * sizeof(int32_t))); - } - masm.loadAlignedSimd128Int(Address(StackPointer, 0), output); - masm.freeStack(Simd128DataSize); + masm.createInt32x4(ToRegister(ins->getOperand(0)), + ToRegister(ins->getOperand(1)), + ToRegister(ins->getOperand(2)), + ToRegister(ins->getOperand(3)), + ToFloatRegister(ins->output()) + ); } void @@ -2625,12 +2527,7 @@ CodeGeneratorX86Shared::visitSimdValueFloat32x4(LSimdValueFloat32x4* ins) FloatRegister tmp = ToFloatRegister(ins->getTemp(0)); FloatRegister output = ToFloatRegister(ins->output()); - FloatRegister r0Copy = masm.reusedInputFloat32x4(r0, output); - FloatRegister r1Copy = masm.reusedInputFloat32x4(r1, tmp); - - masm.vunpcklps(r3, r1Copy, tmp); - masm.vunpcklps(r2, r0Copy, output); - masm.vunpcklps(tmp, output, output); + masm.createFloat32x4(r0, r1, r2, r3, tmp, output); } void @@ -2639,20 +2536,7 @@ CodeGeneratorX86Shared::visitSimdSplatX16(LSimdSplatX16* ins) MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 16); Register input = ToRegister(ins->getOperand(0)); FloatRegister output = ToFloatRegister(ins->output()); - masm.vmovd(input, output); - if (AssemblerX86Shared::HasSSSE3()) { - masm.zeroSimd128Int(ScratchSimd128Reg); - masm.vpshufb(ScratchSimd128Reg, output, output); - } else { - // Use two shifts to duplicate the low 8 bits into the low 16 bits. - masm.vpsllw(Imm32(8), output, output); - masm.vmovdqa(output, ScratchSimd128Reg); - masm.vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg); - masm.vpor(ScratchSimd128Reg, output, output); - // Then do an X8 splat. 
- masm.vpshuflw(0, output, output); - masm.vpshufd(0, output, output); - } + masm.splatX16(input, output); } void @@ -2661,9 +2545,7 @@ CodeGeneratorX86Shared::visitSimdSplatX8(LSimdSplatX8* ins) MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 8); Register input = ToRegister(ins->getOperand(0)); FloatRegister output = ToFloatRegister(ins->output()); - masm.vmovd(input, output); - masm.vpshuflw(0, output, output); - masm.vpshufd(0, output, output); + masm.splatX8(input, output); } void @@ -2675,15 +2557,10 @@ CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4* ins) MOZ_ASSERT(IsSimdType(mir->type())); JS_STATIC_ASSERT(sizeof(float) == sizeof(int32_t)); - if (mir->type() == MIRType::Float32x4) { - FloatRegister r = ToFloatRegister(ins->getOperand(0)); - FloatRegister rCopy = masm.reusedInputFloat32x4(r, output); - masm.vshufps(0, rCopy, rCopy, output); - } else { - Register r = ToRegister(ins->getOperand(0)); - masm.vmovd(r, output); - masm.vpshufd(0, output, output); - } + if (mir->type() == MIRType::Float32x4) + masm.splatX4(ToFloatRegister(ins->getOperand(0)), output); + else + masm.splatX4(ToRegister(ins->getOperand(0)), output); } void @@ -2691,83 +2568,8 @@ CodeGeneratorX86Shared::visitSimdReinterpretCast(LSimdReinterpretCast* ins) { FloatRegister input = ToFloatRegister(ins->input()); FloatRegister output = ToFloatRegister(ins->output()); - - if (input.aliases(output)) - return; - - if (IsIntegerSimdType(ins->mir()->type())) - masm.vmovdqa(input, output); - else - masm.vmovaps(input, output); -} - -// Extract an integer lane from the 32x4 vector register |input| and place it in -// |output|. -void -CodeGeneratorX86Shared::emitSimdExtractLane32x4(FloatRegister input, Register output, unsigned lane) -{ - if (lane == 0) { - // The value we want to extract is in the low double-word - masm.moveLowInt32(input, output); - } else if (AssemblerX86Shared::HasSSE41()) { - masm.vpextrd(lane, input, output); - } else { - uint32_t mask = MacroAssembler::ComputeShuffleMask(lane); - masm.shuffleInt32(mask, input, ScratchSimd128Reg); - masm.moveLowInt32(ScratchSimd128Reg, output); - } -} - -// Extract an integer lane from the 16x8 vector register |input|, sign- or -// zero-extend to 32 bits and place the result in |output|. -void -CodeGeneratorX86Shared::emitSimdExtractLane16x8(FloatRegister input, Register output, - unsigned lane, SimdSign signedness) -{ - // Unlike pextrd and pextrb, this is available in SSE2. - masm.vpextrw(lane, input, output); - - if (signedness == SimdSign::Signed) - masm.movswl(output, output); -} - -// Extract an integer lane from the 8x16 vector register |input|, sign- or -// zero-extend to 32 bits and place the result in |output|. -void -CodeGeneratorX86Shared::emitSimdExtractLane8x16(FloatRegister input, Register output, - unsigned lane, SimdSign signedness) -{ - if (AssemblerX86Shared::HasSSE41()) { - masm.vpextrb(lane, input, output); - // vpextrb clears the high bits, so no further extension required. - if (signedness == SimdSign::Unsigned) - signedness = SimdSign::NotApplicable; - } else { - // Extract the relevant 16 bits containing our lane, then shift the - // right 8 bits into place. - emitSimdExtractLane16x8(input, output, lane / 2, SimdSign::Unsigned); - if (lane % 2) { - masm.shrl(Imm32(8), output); - // The shrl handles the zero-extension. Don't repeat it. - if (signedness == SimdSign::Unsigned) - signedness = SimdSign::NotApplicable; - } - } - - // We have the right low 8 bits in |output|, but we may need to fix the high - // bits. 
Note that this requires |output| to be one of the %eax-%edx - // registers. - switch (signedness) { - case SimdSign::Signed: - masm.movsbl(output, output); - break; - case SimdSign::Unsigned: - masm.movzbl(output, output); - break; - case SimdSign::NotApplicable: - // No adjustment needed. - break; - } + bool isIntLaneType = IsIntegerSimdType(ins->mir()->type()); + masm.reinterpretSimd(isIntLaneType, input, output); } void @@ -2776,25 +2578,8 @@ CodeGeneratorX86Shared::visitSimdExtractElementB(LSimdExtractElementB* ins) FloatRegister input = ToFloatRegister(ins->input()); Register output = ToRegister(ins->output()); MSimdExtractElement* mir = ins->mir(); - unsigned length = SimdTypeToLength(mir->specialization()); - - switch (length) { - case 4: - emitSimdExtractLane32x4(input, output, mir->lane()); - break; - case 8: - // Get a lane, don't bother fixing the high bits since we'll mask below. - emitSimdExtractLane16x8(input, output, mir->lane(), SimdSign::NotApplicable); - break; - case 16: - emitSimdExtractLane8x16(input, output, mir->lane(), SimdSign::NotApplicable); - break; - default: - MOZ_CRASH("Unhandled SIMD length"); - } - - // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits. - masm.and32(Imm32(1), output); + unsigned numLanes = SimdTypeToLength(mir->specialization()); + masm.extractLaneSimdBool(input, output, numLanes, mir->lane()); } void @@ -2803,17 +2588,16 @@ CodeGeneratorX86Shared::visitSimdExtractElementI(LSimdExtractElementI* ins) FloatRegister input = ToFloatRegister(ins->input()); Register output = ToRegister(ins->output()); MSimdExtractElement* mir = ins->mir(); - unsigned length = SimdTypeToLength(mir->specialization()); - - switch (length) { + unsigned numLanes = SimdTypeToLength(mir->specialization()); + switch (numLanes) { case 4: - emitSimdExtractLane32x4(input, output, mir->lane()); + masm.extractLaneInt32x4(input, output, mir->lane()); break; case 8: - emitSimdExtractLane16x8(input, output, mir->lane(), mir->signedness()); + masm.extractLaneInt16x8(input, output, mir->lane(), mir->signedness()); break; case 16: - emitSimdExtractLane8x16(input, output, mir->lane(), mir->signedness()); + masm.extractLaneInt8x16(input, output, mir->lane(), mir->signedness()); break; default: MOZ_CRASH("Unhandled SIMD length"); @@ -2828,7 +2612,7 @@ CodeGeneratorX86Shared::visitSimdExtractElementU2D(LSimdExtractElementU2D* ins) Register temp = ToRegister(ins->temp()); MSimdExtractElement* mir = ins->mir(); MOZ_ASSERT(mir->specialization() == MIRType::Int32x4); - emitSimdExtractLane32x4(input, temp, mir->lane()); + masm.extractLaneInt32x4(input, temp, mir->lane()); masm.convertUInt32ToDouble(temp, output); } @@ -2839,102 +2623,31 @@ CodeGeneratorX86Shared::visitSimdExtractElementF(LSimdExtractElementF* ins) FloatRegister output = ToFloatRegister(ins->output()); unsigned lane = ins->mir()->lane(); - if (lane == 0) { - // The value we want to extract is in the low double-word - if (input != output) - masm.moveFloat32(input, output); - } else if (lane == 2) { - masm.moveHighPairToLowPairFloat32(input, output); - } else { - uint32_t mask = MacroAssembler::ComputeShuffleMask(lane); - masm.shuffleFloat32(mask, input, output); - } - // NaNs contained within SIMD values are not enforced to be canonical, so - // when we extract an element into a "regular" scalar JS value, we have to - // canonicalize. In wasm code, we can skip this, as wasm only has to - // canonicalize NaNs at FFI boundaries. 
- if (!gen->compilingWasm()) - masm.canonicalizeFloat(output); + bool canonicalize = !gen->compilingWasm(); + masm.extractLaneFloat32x4(input, output, lane, canonicalize); } void CodeGeneratorX86Shared::visitSimdInsertElementI(LSimdInsertElementI* ins) { - FloatRegister vector = ToFloatRegister(ins->vector()); + FloatRegister input = ToFloatRegister(ins->vector()); Register value = ToRegister(ins->value()); FloatRegister output = ToFloatRegister(ins->output()); - MOZ_ASSERT(vector == output); // defineReuseInput(0) - + MOZ_ASSERT(input == output); // defineReuseInput(0) unsigned lane = ins->lane(); unsigned length = ins->length(); - if (length == 8) { - // Available in SSE 2. - masm.vpinsrw(lane, value, vector, output); - return; - } - - // Note that, contrarily to float32x4, we cannot use vmovd if the inserted - // value goes into the first component, as vmovd clears out the higher lanes - // of the output. - if (AssemblerX86Shared::HasSSE41()) { - // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX. - switch (length) { - case 4: - masm.vpinsrd(lane, value, vector, output); - return; - case 16: - masm.vpinsrb(lane, value, vector, output); - return; - } - } - - masm.reserveStack(Simd128DataSize); - masm.storeAlignedSimd128Int(vector, Address(StackPointer, 0)); - switch (length) { - case 4: - masm.store32(value, Address(StackPointer, lane * sizeof(int32_t))); - break; - case 16: - // Note that this requires `value` to be in one the registers where the - // low 8 bits are addressible (%eax - %edx on x86, all of them on x86-64). - masm.store8(value, Address(StackPointer, lane * sizeof(int8_t))); - break; - default: - MOZ_CRASH("Unsupported SIMD length"); - } - masm.loadAlignedSimd128Int(Address(StackPointer, 0), output); - masm.freeStack(Simd128DataSize); + masm.insertLaneSimdInt(input, value, output, lane, length); } void CodeGeneratorX86Shared::visitSimdInsertElementF(LSimdInsertElementF* ins) { - FloatRegister vector = ToFloatRegister(ins->vector()); + FloatRegister input = ToFloatRegister(ins->vector()); FloatRegister value = ToFloatRegister(ins->value()); FloatRegister output = ToFloatRegister(ins->output()); - MOZ_ASSERT(vector == output); // defineReuseInput(0) - - if (ins->lane() == 0) { - // As both operands are registers, vmovss doesn't modify the upper bits - // of the destination operand. - if (value != output) - masm.vmovss(value, vector, output); - return; - } - - if (AssemblerX86Shared::HasSSE41()) { - // The input value is in the low float32 of the 'value' FloatRegister. 
- masm.vinsertps(masm.vinsertpsMask(0, ins->lane()), value, output, output); - return; - } - - unsigned component = unsigned(ins->lane()); - masm.reserveStack(Simd128DataSize); - masm.storeAlignedSimd128Float(vector, Address(StackPointer, 0)); - masm.storeFloat32(value, Address(StackPointer, component * sizeof(int32_t))); - masm.loadAlignedSimd128Float(Address(StackPointer, 0), output); - masm.freeStack(Simd128DataSize); + MOZ_ASSERT(input == output); // defineReuseInput(0) + masm.insertLaneFloat32x4(input, value, output, ins->lane()); } void @@ -2943,9 +2656,7 @@ CodeGeneratorX86Shared::visitSimdAllTrue(LSimdAllTrue* ins) FloatRegister input = ToFloatRegister(ins->input()); Register output = ToRegister(ins->output()); - masm.vmovmskps(input, output); - masm.cmp32(output, Imm32(0xf)); - masm.emitSet(Assembler::Zero, output); + masm.allTrueSimdBool(input, output); } void @@ -2954,11 +2665,10 @@ CodeGeneratorX86Shared::visitSimdAnyTrue(LSimdAnyTrue* ins) FloatRegister input = ToFloatRegister(ins->input()); Register output = ToRegister(ins->output()); - masm.vmovmskps(input, output); - masm.cmp32(output, Imm32(0x0)); - masm.emitSet(Assembler::NonZero, output); + masm.anyTrueSimdBool(input, output); } +// XXX note for reviewer: this is SIMD.js only, no need to keep it for wasm. template <class T, class Reg> void CodeGeneratorX86Shared::visitSimdGeneralShuffle(LSimdGeneralShuffleBase* ins, Reg tempRegister) { @@ -3017,6 +2727,7 @@ CodeGeneratorX86Shared::visitSimdGeneralShuffle(LSimdGeneralShuffleBase* ins, Re masm.freeStack(stackSpace); } +// XXX SIMD.js only void CodeGeneratorX86Shared::visitSimdGeneralShuffleI(LSimdGeneralShuffleI* ins) { @@ -3047,13 +2758,10 @@ CodeGeneratorX86Shared::visitSimdSwizzleI(LSimdSwizzleI* ins) switch (numLanes) { case 4: { - uint32_t x = ins->lane(0); - uint32_t y = ins->lane(1); - uint32_t z = ins->lane(2); - uint32_t w = ins->lane(3); - - uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w); - masm.shuffleInt32(mask, input, output); + unsigned lanes[4]; + for (unsigned i = 0; i < 4; i++) + lanes[i] = ins->lane(i); + masm.swizzleInt32x4(input, output, lanes); return; } } @@ -3061,31 +2769,18 @@ CodeGeneratorX86Shared::visitSimdSwizzleI(LSimdSwizzleI* ins) // In the general case, use pshufb if it is available. Convert to a // byte-wise swizzle. const unsigned bytesPerLane = 16 / numLanes; - int8_t bLane[16]; + int8_t lanes[16]; for (unsigned i = 0; i < numLanes; i++) { for (unsigned b = 0; b < bytesPerLane; b++) { - bLane[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b; + lanes[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b; } } - if (AssemblerX86Shared::HasSSSE3()) { - ScratchSimd128Scope scratch(masm); - masm.loadConstantSimd128Int(SimdConstant::CreateX16(bLane), scratch); - FloatRegister inputCopy = masm.reusedInputInt32x4(input, output); - masm.vpshufb(scratch, inputCopy, output); - return; - } + Maybe<Register> maybeTemp; + if (!ins->getTemp(0)->isBogusTemp()) + maybeTemp.emplace(ToRegister(ins->getTemp(0))); - // Worst-case fallback for pre-SSSE3 machines. Bounce through memory. 
- Register temp = ToRegister(ins->getTemp(0)); - masm.reserveStack(2 * Simd128DataSize); - masm.storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize)); - for (unsigned i = 0; i < 16; i++) { - masm.load8ZeroExtend(Address(StackPointer, Simd128DataSize + bLane[i]), temp); - masm.store8(temp, Address(StackPointer, i)); - } - masm.loadAlignedSimd128Int(Address(StackPointer, 0), output); - masm.freeStack(2 * Simd128DataSize); + masm.swizzleInt8x16(input, output, maybeTemp, lanes); } void @@ -3095,54 +2790,10 @@ CodeGeneratorX86Shared::visitSimdSwizzleF(LSimdSwizzleF* ins) FloatRegister output = ToFloatRegister(ins->output()); MOZ_ASSERT(ins->numLanes() == 4); - uint32_t x = ins->lane(0); - uint32_t y = ins->lane(1); - uint32_t z = ins->lane(2); - uint32_t w = ins->lane(3); - - if (AssemblerX86Shared::HasSSE3()) { - if (ins->lanesMatch(0, 0, 2, 2)) { - masm.vmovsldup(input, output); - return; - } - if (ins->lanesMatch(1, 1, 3, 3)) { - masm.vmovshdup(input, output); - return; - } - } - - // TODO Here and below, arch specific lowering could identify this pattern - // and use defineReuseInput to avoid this move (bug 1084404) - if (ins->lanesMatch(2, 3, 2, 3)) { - FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output); - masm.vmovhlps(input, inputCopy, output); - return; - } - - if (ins->lanesMatch(0, 1, 0, 1)) { - if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) { - masm.vmovddup(input, output); - return; - } - FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output); - masm.vmovlhps(input, inputCopy, output); - return; - } - - if (ins->lanesMatch(0, 0, 1, 1)) { - FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output); - masm.vunpcklps(input, inputCopy, output); - return; - } - - if (ins->lanesMatch(2, 2, 3, 3)) { - FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output); - masm.vunpckhps(input, inputCopy, output); - return; - } - - uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w); - masm.shuffleFloat32(mask, input, output); + unsigned lanes[4]; + for (unsigned i = 0; i < 4; i++) + lanes[i] = ins->lane(i); + masm.swizzleFloat32x4(input, output, lanes); } void @@ -3155,52 +2806,21 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle* ins) const unsigned bytesPerLane = 16 / numLanes; // Convert the shuffle to a byte-wise shuffle. - uint8_t bLane[16]; + uint8_t lanes[16]; for (unsigned i = 0; i < numLanes; i++) { for (unsigned b = 0; b < bytesPerLane; b++) { - bLane[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b; + lanes[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b; } } - // Use pshufb if it is available. - if (AssemblerX86Shared::HasSSSE3()) { - FloatRegister scratch1 = ToFloatRegister(ins->temp()); - ScratchSimd128Scope scratch2(masm); - - // Use pshufb instructions to gather the lanes from each source vector. - // A negative index creates a zero lane, so the two vectors can be combined. - - // Set scratch2 = lanes from lhs. - int8_t idx[16]; - for (unsigned i = 0; i < 16; i++) - idx[i] = bLane[i] < 16 ? bLane[i] : -1; - masm.loadConstantSimd128Int(SimdConstant::CreateX16(idx), scratch1); - FloatRegister lhsCopy = masm.reusedInputInt32x4(lhs, scratch2); - masm.vpshufb(scratch1, lhsCopy, scratch2); - - // Set output = lanes from rhs. - for (unsigned i = 0; i < 16; i++) - idx[i] = bLane[i] >= 16 ? 
bLane[i] - 16 : -1; - masm.loadConstantSimd128Int(SimdConstant::CreateX16(idx), scratch1); - FloatRegister rhsCopy = masm.reusedInputInt32x4(rhs, output); - masm.vpshufb(scratch1, rhsCopy, output); - - // Combine. - masm.vpor(scratch2, output, output); - return; - } + Maybe<FloatRegister> maybeFloatTemp; + Maybe<Register> maybeTemp; + if (AssemblerX86Shared::HasSSSE3()) + maybeFloatTemp.emplace(ToFloatRegister(ins->temp())); + else + maybeTemp.emplace(ToRegister(ins->temp())); - // Worst-case fallback for pre-SSE3 machines. Bounce through memory. - Register temp = ToRegister(ins->getTemp(0)); - masm.reserveStack(3 * Simd128DataSize); - masm.storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize)); - masm.storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize)); - for (unsigned i = 0; i < 16; i++) { - masm.load8ZeroExtend(Address(StackPointer, Simd128DataSize + bLane[i]), temp); - masm.store8(temp, Address(StackPointer, i)); - } - masm.loadAlignedSimd128Int(Address(StackPointer, 0), output); - masm.freeStack(3 * Simd128DataSize); + masm.shuffleInt8x16(lhs, rhs, output, maybeFloatTemp, maybeTemp, lanes); } void @@ -3210,409 +2830,60 @@ CodeGeneratorX86Shared::visitSimdShuffleX4(LSimdShuffleX4* ins) Operand rhs = ToOperand(ins->rhs()); FloatRegister out = ToFloatRegister(ins->output()); - uint32_t x = ins->lane(0); - uint32_t y = ins->lane(1); - uint32_t z = ins->lane(2); - uint32_t w = ins->lane(3); - - // Check that lanes come from LHS in majority: - unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4); - MOZ_ASSERT(numLanesFromLHS >= 2); - - // When reading this method, remember that vshufps takes the two first - // inputs of the destination operand (right operand) and the two last - // inputs of the source operand (left operand). - // - // Legend for explanations: - // - L: LHS - // - R: RHS - // - T: temporary - - uint32_t mask; - - // If all lanes came from a single vector, we should have constructed a - // MSimdSwizzle instead. - MOZ_ASSERT(numLanesFromLHS < 4); - - // If all values stay in their lane, this is a blend. - if (AssemblerX86Shared::HasSSE41()) { - if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) { - masm.vblendps(masm.blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out); - return; - } - } - - // One element of the second, all other elements of the first - if (numLanesFromLHS == 3) { - unsigned firstMask = -1, secondMask = -1; - - // register-register vmovss preserves the high lanes. - if (ins->lanesMatch(4, 1, 2, 3) && rhs.kind() == Operand::FPREG) { - masm.vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out); - return; - } - - // SSE4.1 vinsertps can handle any single element. 
- unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3); - if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) { - unsigned srcLane; - unsigned dstLane; - if (x >= 4) { - srcLane = x - 4; - dstLane = 0; - } else if (y >= 4) { - srcLane = y - 4; - dstLane = 1; - } else if (z >= 4) { - srcLane = z - 4; - dstLane = 2; - } else { - MOZ_ASSERT(w >= 4); - srcLane = w - 4; - dstLane = 3; - } - masm.vinsertps(masm.vinsertpsMask(srcLane, dstLane), rhs, lhs, out); - return; - } - - FloatRegister rhsCopy = ToFloatRegister(ins->temp()); - - if (x < 4 && y < 4) { - if (w >= 4) { - w %= 4; - // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy) - firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z); - // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out) - secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0); - } else { - MOZ_ASSERT(z >= 4); - z %= 4; - // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy) - firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w); - // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out) - secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2); - } - - masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy); - masm.vshufps(secondMask, rhsCopy, lhs, out); - return; - } - - MOZ_ASSERT(z < 4 && w < 4); - - if (y >= 4) { - y %= 4; - // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy) - firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x); - // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out) - secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w); - } else { - MOZ_ASSERT(x >= 4); - x %= 4; - // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy) - firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y); - // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out) - secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w); - } - - masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy); - if (AssemblerX86Shared::HasAVX()) { - masm.vshufps(secondMask, lhs, rhsCopy, out); - } else { - masm.vshufps(secondMask, lhs, rhsCopy, rhsCopy); - masm.moveSimd128Float(rhsCopy, out); - } - return; - } - - // Two elements from one vector, two other elements from the other - MOZ_ASSERT(numLanesFromLHS == 2); - - // TODO Here and below, symmetric case would be more handy to avoid a move, - // but can't be reached because operands would get swapped (bug 1084404). - if (ins->lanesMatch(2, 3, 6, 7)) { - ScratchSimd128Scope scratch(masm); - if (AssemblerX86Shared::HasAVX()) { - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch); - masm.vmovhlps(lhs, rhsCopy, out); - } else { - masm.loadAlignedSimd128Float(rhs, scratch); - masm.vmovhlps(lhs, scratch, scratch); - masm.moveSimd128Float(scratch, out); - } - return; - } - - if (ins->lanesMatch(0, 1, 4, 5)) { - FloatRegister rhsCopy; - ScratchSimd128Scope scratch(masm); - if (rhs.kind() == Operand::FPREG) { - // No need to make an actual copy, since the operand is already - // in a register, and it won't be clobbered by the vmovlhps. 
- rhsCopy = FloatRegister::FromCode(rhs.fpu()); - } else { - masm.loadAlignedSimd128Float(rhs, scratch); - rhsCopy = scratch; - } - masm.vmovlhps(rhsCopy, lhs, out); - return; - } - - if (ins->lanesMatch(0, 4, 1, 5)) { - masm.vunpcklps(rhs, lhs, out); - return; - } - - // TODO swapped case would be better (bug 1084404) - if (ins->lanesMatch(4, 0, 5, 1)) { - ScratchSimd128Scope scratch(masm); - if (AssemblerX86Shared::HasAVX()) { - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch); - masm.vunpcklps(lhs, rhsCopy, out); - } else { - masm.loadAlignedSimd128Float(rhs, scratch); - masm.vunpcklps(lhs, scratch, scratch); - masm.moveSimd128Float(scratch, out); - } - return; - } - - if (ins->lanesMatch(2, 6, 3, 7)) { - masm.vunpckhps(rhs, lhs, out); - return; - } - - // TODO swapped case would be better (bug 1084404) - if (ins->lanesMatch(6, 2, 7, 3)) { - ScratchSimd128Scope scratch(masm); - if (AssemblerX86Shared::HasAVX()) { - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch); - masm.vunpckhps(lhs, rhsCopy, out); - } else { - masm.loadAlignedSimd128Float(rhs, scratch); - masm.vunpckhps(lhs, scratch, scratch); - masm.moveSimd128Float(scratch, out); - } - return; - } - - // In one vshufps - if (x < 4 && y < 4) { - mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4); - masm.vshufps(mask, rhs, lhs, out); - return; - } - - // At creation, we should have explicitly swapped in this case. - MOZ_ASSERT(!(z >= 4 && w >= 4)); - - // In two vshufps, for the most generic case: - uint32_t firstMask[4], secondMask[4]; - unsigned i = 0, j = 2, k = 0; + unsigned lanes[4]; + for (unsigned i = 0; i < 4; i++) + lanes[i] = ins->lane(i); + Maybe<FloatRegister> maybeTemp; + if (!ins->temp()->isBogusTemp()) + maybeTemp.emplace(ToFloatRegister(ins->temp())); + masm.shuffleX4(lhs, rhs, out, maybeTemp, lanes); +} -#define COMPUTE_MASK(lane) \ - if (lane >= 4) { \ - firstMask[j] = lane % 4; \ - secondMask[k++] = j++; \ - } else { \ - firstMask[i] = lane; \ - secondMask[k++] = i++; \ +static inline Assembler::Condition +ToCondition(MSimdBinaryComp::Operation op) +{ + switch (op) { + case MSimdBinaryComp::greaterThan: return Assembler::GreaterThan; + case MSimdBinaryComp::equal: return Assembler::Equal; + case MSimdBinaryComp::lessThan: return Assembler::LessThan; + case MSimdBinaryComp::notEqual: return Assembler::NotEqual; + case MSimdBinaryComp::greaterThanOrEqual: return Assembler::GreaterThanOrEqual; + case MSimdBinaryComp::lessThanOrEqual: return Assembler::LessThanOrEqual; } - COMPUTE_MASK(x) - COMPUTE_MASK(y) - COMPUTE_MASK(z) - COMPUTE_MASK(w) -#undef COMPUTE_MASK - - MOZ_ASSERT(i == 2 && j == 4 && k == 4); - - mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1], - firstMask[2], firstMask[3]); - masm.vshufps(mask, rhs, lhs, lhs); - - mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1], - secondMask[2], secondMask[3]); - masm.vshufps(mask, lhs, lhs, lhs); + MOZ_CRASH("unexpected cond"); } void CodeGeneratorX86Shared::visitSimdBinaryCompIx16(LSimdBinaryCompIx16* ins) { - static const SimdConstant allOnes = SimdConstant::SplatX16(-1); - FloatRegister lhs = ToFloatRegister(ins->lhs()); Operand rhs = ToOperand(ins->rhs()); FloatRegister output = ToFloatRegister(ins->output()); MOZ_ASSERT_IF(!Assembler::HasAVX(), output == lhs); - ScratchSimd128Scope scratch(masm); - - MSimdBinaryComp::Operation op = ins->operation(); - switch (op) { - case MSimdBinaryComp::greaterThan: - masm.vpcmpgtb(rhs, lhs, output); - return; - case 
MSimdBinaryComp::equal: - masm.vpcmpeqb(rhs, lhs, output); - return; - case MSimdBinaryComp::lessThan: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - - // src := src > lhs (i.e. lhs < rhs) - // Improve by doing custom lowering (rhs is tied to the output register) - masm.vpcmpgtb(ToOperand(ins->lhs()), scratch, scratch); - masm.moveSimd128Int(scratch, output); - return; - case MSimdBinaryComp::notEqual: - // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we - // should invert the comparison by, e.g. swapping the arms of a select - // if that's what it's used in. - masm.loadConstantSimd128Int(allOnes, scratch); - masm.vpcmpeqb(rhs, lhs, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - case MSimdBinaryComp::greaterThanOrEqual: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - masm.vpcmpgtb(ToOperand(ins->lhs()), scratch, scratch); - masm.loadConstantSimd128Int(allOnes, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - case MSimdBinaryComp::lessThanOrEqual: - // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. - masm.loadConstantSimd128Int(allOnes, scratch); - masm.vpcmpgtb(rhs, lhs, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - } - MOZ_CRASH("unexpected SIMD op"); + masm.compareInt8x16(lhs, rhs, ToCondition(ins->operation()), output); } void CodeGeneratorX86Shared::visitSimdBinaryCompIx8(LSimdBinaryCompIx8* ins) { - static const SimdConstant allOnes = SimdConstant::SplatX8(-1); - FloatRegister lhs = ToFloatRegister(ins->lhs()); Operand rhs = ToOperand(ins->rhs()); FloatRegister output = ToFloatRegister(ins->output()); MOZ_ASSERT_IF(!Assembler::HasAVX(), output == lhs); - ScratchSimd128Scope scratch(masm); - - MSimdBinaryComp::Operation op = ins->operation(); - switch (op) { - case MSimdBinaryComp::greaterThan: - masm.vpcmpgtw(rhs, lhs, output); - return; - case MSimdBinaryComp::equal: - masm.vpcmpeqw(rhs, lhs, output); - return; - case MSimdBinaryComp::lessThan: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - - // src := src > lhs (i.e. lhs < rhs) - // Improve by doing custom lowering (rhs is tied to the output register) - masm.vpcmpgtw(ToOperand(ins->lhs()), scratch, scratch); - masm.moveSimd128Int(scratch, output); - return; - case MSimdBinaryComp::notEqual: - // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we - // should invert the comparison by, e.g. swapping the arms of a select - // if that's what it's used in. - masm.loadConstantSimd128Int(allOnes, scratch); - masm.vpcmpeqw(rhs, lhs, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - case MSimdBinaryComp::greaterThanOrEqual: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - masm.vpcmpgtw(ToOperand(ins->lhs()), scratch, scratch); - masm.loadConstantSimd128Int(allOnes, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - case MSimdBinaryComp::lessThanOrEqual: - // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. 
- masm.loadConstantSimd128Int(allOnes, scratch); - masm.vpcmpgtw(rhs, lhs, output); - masm.bitwiseXorSimd128(Operand(scratch), output); - return; - } - MOZ_CRASH("unexpected SIMD op"); + masm.compareInt16x8(lhs, rhs, ToCondition(ins->operation()), output); } void CodeGeneratorX86Shared::visitSimdBinaryCompIx4(LSimdBinaryCompIx4* ins) { - static const SimdConstant allOnes = SimdConstant::SplatX4(-1); - FloatRegister lhs = ToFloatRegister(ins->lhs()); Operand rhs = ToOperand(ins->rhs()); MOZ_ASSERT(ToFloatRegister(ins->output()) == lhs); - ScratchSimd128Scope scratch(masm); - - MSimdBinaryComp::Operation op = ins->operation(); - switch (op) { - case MSimdBinaryComp::greaterThan: - masm.packedGreaterThanInt32x4(rhs, lhs); - return; - case MSimdBinaryComp::equal: - masm.packedEqualInt32x4(rhs, lhs); - return; - case MSimdBinaryComp::lessThan: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - - // src := src > lhs (i.e. lhs < rhs) - // Improve by doing custom lowering (rhs is tied to the output register) - masm.packedGreaterThanInt32x4(ToOperand(ins->lhs()), scratch); - masm.moveSimd128Int(scratch, lhs); - return; - case MSimdBinaryComp::notEqual: - // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we - // should invert the comparison by, e.g. swapping the arms of a select - // if that's what it's used in. - masm.loadConstantSimd128Int(allOnes, scratch); - masm.packedEqualInt32x4(rhs, lhs); - masm.bitwiseXorSimd128(Operand(scratch), lhs); - return; - case MSimdBinaryComp::greaterThanOrEqual: - // src := rhs - if (rhs.kind() == Operand::FPREG) - masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch); - else - masm.loadAlignedSimd128Int(rhs, scratch); - masm.packedGreaterThanInt32x4(ToOperand(ins->lhs()), scratch); - masm.loadConstantSimd128Int(allOnes, lhs); - masm.bitwiseXorSimd128(Operand(scratch), lhs); - return; - case MSimdBinaryComp::lessThanOrEqual: - // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. - masm.loadConstantSimd128Int(allOnes, scratch); - masm.packedGreaterThanInt32x4(rhs, lhs); - masm.bitwiseXorSimd128(Operand(scratch), lhs); - return; - } - MOZ_CRASH("unexpected SIMD op"); + masm.compareInt32x4(lhs, rhs, ToCondition(ins->operation()), lhs); } void @@ -3622,27 +2893,7 @@ CodeGeneratorX86Shared::visitSimdBinaryCompFx4(LSimdBinaryCompFx4* ins) Operand rhs = ToOperand(ins->rhs()); FloatRegister output = ToFloatRegister(ins->output()); - MSimdBinaryComp::Operation op = ins->operation(); - switch (op) { - case MSimdBinaryComp::equal: - masm.vcmpeqps(rhs, lhs, output); - return; - case MSimdBinaryComp::lessThan: - masm.vcmpltps(rhs, lhs, output); - return; - case MSimdBinaryComp::lessThanOrEqual: - masm.vcmpleps(rhs, lhs, output); - return; - case MSimdBinaryComp::notEqual: - masm.vcmpneqps(rhs, lhs, output); - return; - case MSimdBinaryComp::greaterThanOrEqual: - case MSimdBinaryComp::greaterThan: - // We reverse these before register allocation so that we don't have to - // copy into and out of temporaries after codegen. 
- MOZ_CRASH("lowering should have reversed this"); - } - MOZ_CRASH("unexpected SIMD op"); + masm.compareFloat32x4(lhs, rhs, ToCondition(ins->operation()), output); } void @@ -3655,10 +2906,10 @@ CodeGeneratorX86Shared::visitSimdBinaryArithIx16(LSimdBinaryArithIx16* ins) MSimdBinaryArith::Operation op = ins->operation(); switch (op) { case MSimdBinaryArith::Op_add: - masm.vpaddb(rhs, lhs, output); + masm.addInt8x16(lhs, rhs, output); return; case MSimdBinaryArith::Op_sub: - masm.vpsubb(rhs, lhs, output); + masm.subInt8x16(lhs, rhs, output); return; case MSimdBinaryArith::Op_mul: // 8x16 mul is a valid operation, but not supported in SSE or AVX. @@ -3685,13 +2936,13 @@ CodeGeneratorX86Shared::visitSimdBinaryArithIx8(LSimdBinaryArithIx8* ins) MSimdBinaryArith::Operation op = ins->operation(); switch (op) { case MSimdBinaryArith::Op_add: - masm.vpaddw(rhs, lhs, output); + masm.addInt16x8(lhs, rhs, output); return; case MSimdBinaryArith::Op_sub: - masm.vpsubw(rhs, lhs, output); + masm.subInt16x8(lhs, rhs, output); return; case MSimdBinaryArith::Op_mul: - masm.vpmullw(rhs, lhs, output); + masm.mulInt16x8(lhs, rhs, output); return; case MSimdBinaryArith::Op_div: case MSimdBinaryArith::Op_max: @@ -3710,35 +2961,19 @@ CodeGeneratorX86Shared::visitSimdBinaryArithIx4(LSimdBinaryArithIx4* ins) Operand rhs = ToOperand(ins->rhs()); FloatRegister output = ToFloatRegister(ins->output()); - ScratchSimd128Scope scratch(masm); - MSimdBinaryArith::Operation op = ins->operation(); switch (op) { case MSimdBinaryArith::Op_add: - masm.vpaddd(rhs, lhs, output); + masm.addInt32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_sub: - masm.vpsubd(rhs, lhs, output); + masm.subInt32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_mul: { - if (AssemblerX86Shared::HasSSE41()) { - masm.vpmulld(rhs, lhs, output); - return; - } - - masm.loadAlignedSimd128Int(rhs, scratch); - masm.vpmuludq(lhs, scratch, scratch); - // scratch contains (Rx, _, Rz, _) where R is the resulting vector. - - FloatRegister temp = ToFloatRegister(ins->temp()); - masm.vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), lhs, lhs); - masm.vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), rhs, temp); - masm.vpmuludq(temp, lhs, lhs); - // lhs contains (Ry, _, Rw, _) where R is the resulting vector. 
- - masm.vshufps(MacroAssembler::ComputeShuffleMask(0, 2, 0, 2), scratch, lhs, lhs); - // lhs contains (Ry, Rw, Rx, Rz) - masm.vshufps(MacroAssembler::ComputeShuffleMask(2, 0, 3, 1), lhs, lhs, lhs); + Maybe<FloatRegister> maybeTemp; + if (!AssemblerX86Shared::HasSSE41()) + maybeTemp.emplace(ToFloatRegister(ins->getTemp(0))); + masm.mulInt32x4(lhs, rhs, maybeTemp, output); return; } case MSimdBinaryArith::Op_div: @@ -3766,104 +3001,34 @@ CodeGeneratorX86Shared::visitSimdBinaryArithFx4(LSimdBinaryArithFx4* ins) Operand rhs = ToOperand(ins->rhs()); FloatRegister output = ToFloatRegister(ins->output()); - ScratchSimd128Scope scratch(masm); - MSimdBinaryArith::Operation op = ins->operation(); switch (op) { case MSimdBinaryArith::Op_add: - masm.vaddps(rhs, lhs, output); + masm.addFloat32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_sub: - masm.vsubps(rhs, lhs, output); + masm.subFloat32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_mul: - masm.vmulps(rhs, lhs, output); + masm.mulFloat32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_div: - masm.vdivps(rhs, lhs, output); + masm.divFloat32x4(lhs, rhs, output); return; case MSimdBinaryArith::Op_max: { - FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, scratch); - masm.vcmpunordps(rhs, lhsCopy, scratch); - - FloatRegister tmp = ToFloatRegister(ins->temp()); - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, tmp); - masm.vmaxps(Operand(lhs), rhsCopy, tmp); - masm.vmaxps(rhs, lhs, output); - - masm.vandps(tmp, output, output); - masm.vorps(scratch, output, output); // or in the all-ones NaNs + masm.maxFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output); return; } case MSimdBinaryArith::Op_min: { - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch); - masm.vminps(Operand(lhs), rhsCopy, scratch); - masm.vminps(rhs, lhs, output); - masm.vorps(scratch, output, output); // NaN or'd with arbitrary bits is NaN + masm.minFloat32x4(lhs, rhs, output); return; } case MSimdBinaryArith::Op_minNum: { - FloatRegister tmp = ToFloatRegister(ins->temp()); - masm.loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), tmp); - - FloatRegister mask = scratch; - FloatRegister tmpCopy = masm.reusedInputFloat32x4(tmp, scratch); - masm.vpcmpeqd(Operand(lhs), tmpCopy, mask); - masm.vandps(tmp, mask, mask); - - FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp); - masm.vminps(rhs, lhsCopy, tmp); - masm.vorps(mask, tmp, tmp); - - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask); - masm.vcmpneqps(rhs, rhsCopy, mask); - - if (AssemblerX86Shared::HasAVX()) { - masm.vblendvps(mask, lhs, tmp, output); - } else { - // Emulate vblendvps. - // With SSE.4.1 we could use blendvps, however it's awkward since - // it requires the mask to be in xmm0. 
- if (lhs != output) - masm.moveSimd128Float(lhs, output); - masm.vandps(Operand(mask), output, output); - masm.vandnps(Operand(tmp), mask, mask); - masm.vorps(Operand(mask), output, output); - } + masm.minNumFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output); return; } case MSimdBinaryArith::Op_maxNum: { - FloatRegister mask = scratch; - masm.loadConstantSimd128Int(SimdConstant::SplatX4(0), mask); - masm.vpcmpeqd(Operand(lhs), mask, mask); - - FloatRegister tmp = ToFloatRegister(ins->temp()); - masm.loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), tmp); - masm.vandps(tmp, mask, mask); - - FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp); - masm.vmaxps(rhs, lhsCopy, tmp); - masm.vandnps(Operand(tmp), mask, mask); - - // Ensure tmp always contains the temporary result - mask = tmp; - tmp = scratch; - - FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask); - masm.vcmpneqps(rhs, rhsCopy, mask); - - if (AssemblerX86Shared::HasAVX()) { - masm.vblendvps(mask, lhs, tmp, output); - } else { - // Emulate vblendvps. - // With SSE.4.1 we could use blendvps, however it's awkward since - // it requires the mask to be in xmm0. - if (lhs != output) - masm.moveSimd128Float(lhs, output); - masm.vandps(Operand(mask), output, output); - masm.vandnps(Operand(tmp), mask, mask); - masm.vorps(Operand(mask), output, output); - } + masm.maxNumFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output); return; } } @@ -3884,16 +3049,10 @@ CodeGeneratorX86Shared::visitSimdBinarySaturating(LSimdBinarySaturating* ins) case MIRType::Int8x16: switch (ins->operation()) { case MSimdBinarySaturating::add: - if (sign == SimdSign::Signed) - masm.vpaddsb(rhs, lhs, output); - else - masm.vpaddusb(rhs, lhs, output); + masm.addSatInt8x16(lhs, rhs, sign, output); return; case MSimdBinarySaturating::sub: - if (sign == SimdSign::Signed) - masm.vpsubsb(rhs, lhs, output); - else - masm.vpsubusb(rhs, lhs, output); + masm.subSatInt8x16(lhs, rhs, sign, output); return; } break; @@ -3901,16 +3060,10 @@ CodeGeneratorX86Shared::visitSimdBinarySaturating(LSimdBinarySaturating* ins) case MIRType::Int16x8: switch (ins->operation()) { case MSimdBinarySaturating::add: - if (sign == SimdSign::Signed) - masm.vpaddsw(rhs, lhs, output); - else - masm.vpaddusw(rhs, lhs, output); + masm.addSatInt16x8(lhs, rhs, sign, output); return; case MSimdBinarySaturating::sub: - if (sign == SimdSign::Signed) - masm.vpsubsw(rhs, lhs, output); - else - masm.vpsubusw(rhs, lhs, output); + masm.subSatInt16x8(lhs, rhs, sign, output); return; } break; @@ -3927,16 +3080,12 @@ CodeGeneratorX86Shared::visitSimdUnaryArithIx16(LSimdUnaryArithIx16* ins) Operand in = ToOperand(ins->input()); FloatRegister out = ToFloatRegister(ins->output()); - static const SimdConstant allOnes = SimdConstant::SplatX16(-1); - switch (ins->operation()) { case MSimdUnaryArith::neg: - masm.zeroSimd128Int(out); - masm.packedSubInt8(in, out); + masm.negInt8x16(in, out); return; case MSimdUnaryArith::not_: - masm.loadConstantSimd128Int(allOnes, out); - masm.bitwiseXorSimd128(in, out); + masm.notInt8x16(in, out);; return; case MSimdUnaryArith::abs: case MSimdUnaryArith::reciprocalApproximation: @@ -3953,16 +3102,12 @@ CodeGeneratorX86Shared::visitSimdUnaryArithIx8(LSimdUnaryArithIx8* ins) Operand in = ToOperand(ins->input()); FloatRegister out = ToFloatRegister(ins->output()); - static const SimdConstant allOnes = SimdConstant::SplatX8(-1); - switch (ins->operation()) { case MSimdUnaryArith::neg: - masm.zeroSimd128Int(out); - masm.packedSubInt16(in, 
out); + masm.negInt16x8(in, out); return; case MSimdUnaryArith::not_: - masm.loadConstantSimd128Int(allOnes, out); - masm.bitwiseXorSimd128(in, out); + masm.notInt16x8(in, out); return; case MSimdUnaryArith::abs: case MSimdUnaryArith::reciprocalApproximation: @@ -3979,16 +3124,12 @@ CodeGeneratorX86Shared::visitSimdUnaryArithIx4(LSimdUnaryArithIx4* ins) Operand in = ToOperand(ins->input()); FloatRegister out = ToFloatRegister(ins->output()); - static const SimdConstant allOnes = SimdConstant::SplatX4(-1); - switch (ins->operation()) { case MSimdUnaryArith::neg: - masm.zeroSimd128Int(out); - masm.packedSubInt32(in, out); + masm.negInt32x4(in, out); return; case MSimdUnaryArith::not_: - masm.loadConstantSimd128Int(allOnes, out); - masm.bitwiseXorSimd128(in, out); + masm.notInt32x4(in, out); return; case MSimdUnaryArith::abs: case MSimdUnaryArith::reciprocalApproximation: @@ -4005,29 +3146,15 @@ CodeGeneratorX86Shared::visitSimdUnaryArithFx4(LSimdUnaryArithFx4* ins) Operand in = ToOperand(ins->input()); FloatRegister out = ToFloatRegister(ins->output()); - // All ones but the sign bit - float signMask = SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits); - static const SimdConstant signMasks = SimdConstant::SplatX4(signMask); - - // All ones including the sign bit - float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits); - static const SimdConstant allOnes = SimdConstant::SplatX4(ones); - - // All zeros but the sign bit - static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f); - switch (ins->operation()) { case MSimdUnaryArith::abs: - masm.loadConstantSimd128Float(signMasks, out); - masm.bitwiseAndSimd128(in, out); + masm.absFloat32x4(in, out); return; case MSimdUnaryArith::neg: - masm.loadConstantSimd128Float(minusZero, out); - masm.bitwiseXorSimd128(in, out); + masm.negFloat32x4(in, out); return; case MSimdUnaryArith::not_: - masm.loadConstantSimd128Float(allOnes, out); - masm.bitwiseXorSimd128(in, out); + masm.notFloat32x4(in, out); return; case MSimdUnaryArith::reciprocalApproximation: masm.packedRcpApproximationFloat32x4(in, out); @@ -4053,21 +3180,21 @@ CodeGeneratorX86Shared::visitSimdBinaryBitwise(LSimdBinaryBitwise* ins) switch (op) { case MSimdBinaryBitwise::and_: if (ins->type() == MIRType::Float32x4) - masm.vandps(rhs, lhs, output); + masm.bitwiseAndFloat32x4(lhs, rhs, output); else - masm.vpand(rhs, lhs, output); + masm.bitwiseAndSimdInt(lhs, rhs, output); return; case MSimdBinaryBitwise::or_: if (ins->type() == MIRType::Float32x4) - masm.vorps(rhs, lhs, output); + masm.bitwiseOrFloat32x4(lhs, rhs, output); else - masm.vpor(rhs, lhs, output); + masm.bitwiseOrSimdInt(lhs, rhs, output); return; case MSimdBinaryBitwise::xor_: if (ins->type() == MIRType::Float32x4) - masm.vxorps(rhs, lhs, output); + masm.bitwiseXorFloat32x4(lhs, rhs, output); else - masm.vpxor(rhs, lhs, output); + masm.bitwiseXorSimdInt(lhs, rhs, output); return; } MOZ_CRASH("unexpected SIMD bitwise op"); @@ -4079,15 +3206,12 @@ CodeGeneratorX86Shared::visitSimdShift(LSimdShift* ins) FloatRegister out = ToFloatRegister(ins->output()); MOZ_ASSERT(ToFloatRegister(ins->vector()) == out); // defineReuseInput(0); - // The shift amount is masked to the number of bits in a lane. - uint32_t shiftmask = (128u / SimdTypeToLength(ins->type())) - 1; - // Note that SSE doesn't have instructions for shifting 8x16 vectors. // These shifts are synthesized by the MSimdShift::AddLegalized() function. 
const LAllocation* val = ins->value(); if (val->isConstant()) { MOZ_ASSERT(ins->temp()->isBogusTemp()); - Imm32 count(uint32_t(ToInt32(val)) & shiftmask); + Imm32 count(uint32_t(ToInt32(val))); switch (ins->type()) { case MIRType::Int16x8: switch (ins->operation()) { @@ -4121,38 +3245,33 @@ CodeGeneratorX86Shared::visitSimdShift(LSimdShift* ins) MOZ_CRASH("unexpected SIMD bitwise op"); } - // Truncate val to 5 bits. We should have a temp register for that. - MOZ_ASSERT(val->isRegister()); - Register count = ToRegister(ins->temp()); - masm.mov(ToRegister(val), count); - masm.andl(Imm32(shiftmask), count); - ScratchFloat32Scope scratch(masm); - masm.vmovd(count, scratch); + Register temp = ToRegister(ins->temp()); + Register count = ToRegister(val); switch (ins->type()) { case MIRType::Int16x8: switch (ins->operation()) { case MSimdShift::lsh: - masm.packedLeftShiftByScalarInt16x8(scratch, out); + masm.packedLeftShiftByScalarInt16x8(out, count, temp, out); return; case MSimdShift::rsh: - masm.packedRightShiftByScalarInt16x8(scratch, out); + masm.packedRightShiftByScalarInt16x8(out, count, temp, out); return; case MSimdShift::ursh: - masm.packedUnsignedRightShiftByScalarInt16x8(scratch, out); + masm.packedUnsignedRightShiftByScalarInt16x8(out, count, temp, out); return; } break; case MIRType::Int32x4: switch (ins->operation()) { case MSimdShift::lsh: - masm.packedLeftShiftByScalarInt32x4(scratch, out); + masm.packedLeftShiftByScalarInt32x4(out, count, temp, out); return; case MSimdShift::rsh: - masm.packedRightShiftByScalarInt32x4(scratch, out); + masm.packedRightShiftByScalarInt32x4(out, count, temp, out); return; case MSimdShift::ursh: - masm.packedUnsignedRightShiftByScalarInt32x4(scratch, out); + masm.packedUnsignedRightShiftByScalarInt32x4(out, count, temp, out); return; } break; @@ -4171,26 +3290,12 @@ CodeGeneratorX86Shared::visitSimdSelect(LSimdSelect* ins) FloatRegister output = ToFloatRegister(ins->output()); FloatRegister temp = ToFloatRegister(ins->temp()); - if (onTrue != output) - masm.vmovaps(onTrue, output); - if (mask != temp) - masm.vmovaps(mask, temp); - MSimdSelect* mir = ins->mir(); unsigned lanes = SimdTypeToLength(mir->type()); - - if (AssemblerX86Shared::HasAVX() && lanes == 4) { - // TBD: Use vpblendvb for lanes > 4, HasAVX. - masm.vblendvps(mask, onTrue, onFalse, output); - return; - } - - // SSE4.1 has plain blendvps which can do this, but it is awkward - // to use because it requires the mask to be in xmm0. 
- - masm.bitwiseAndSimd128(Operand(temp), output); - masm.bitwiseAndNotSimd128(Operand(onFalse), temp); - masm.bitwiseOrSimd128(Operand(temp), output); + if (lanes == 4) + masm.selectX4(mask, onTrue, onFalse, temp, output); + else + masm.selectSimd128(mask, onTrue, onFalse, temp, output); } void diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.h b/js/src/jit/x86-shared/CodeGenerator-x86-shared.h index 0b4961dddd..4b0664fb63 100644 --- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.h +++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.h @@ -173,12 +173,6 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared void emitTableSwitchDispatch(MTableSwitch* mir, Register index, Register base); - void emitSimdExtractLane8x16(FloatRegister input, Register output, unsigned lane, - SimdSign signedness); - void emitSimdExtractLane16x8(FloatRegister input, Register output, unsigned lane, - SimdSign signedness); - void emitSimdExtractLane32x4(FloatRegister input, Register output, unsigned lane); - public: CodeGeneratorX86Shared(MIRGenerator* gen, LIRGraph* graph, MacroAssembler* masm); diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp new file mode 100644 index 0000000000..0ebf30de1a --- /dev/null +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp @@ -0,0 +1,1227 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "jit/MacroAssembler.h" +#include "jit/x86-shared/MacroAssembler-x86-shared.h" + +#include "jit/MacroAssembler-inl.h" + +using namespace js; +using namespace js::jit; + +using mozilla::DebugOnly; +using mozilla::FloatingPoint; +using mozilla::Maybe; +using mozilla::SpecificNaN; + +void +MacroAssemblerX86Shared::checkedConvertFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest, + Register temp, Label* oolEntry, + Label* rejoin) +{ + // Does the conversion and jumps to the OOL entry if the result value + // is the undefined integer pattern. + static const SimdConstant InvalidResult = SimdConstant::SplatX4(int32_t(-2147483648)); + convertFloat32x4ToInt32x4(src, dest); + + ScratchSimd128Scope scratch(asMasm()); + asMasm().loadConstantSimd128Int(InvalidResult, scratch); + packedEqualInt32x4(Operand(dest), scratch); + // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of + // the two following instructions. 
+ vmovmskps(scratch, temp); + cmp32(temp, Imm32(0)); + j(Assembler::NotEqual, oolEntry); + bind(rejoin); +} + +void +MacroAssemblerX86Shared::oolConvertFloat32x4ToInt32x4(FloatRegister src, Register temp, + Label* rejoin, Label* onConversionError) +{ + static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f); + static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f); + + ScratchSimd128Scope scratch(asMasm()); + asMasm().loadConstantSimd128Float(Int32MinX4, scratch); + vcmpleps(Operand(src), scratch, scratch); + vmovmskps(scratch, temp); + cmp32(temp, Imm32(15)); + j(Assembler::NotEqual, onConversionError); + + asMasm().loadConstantSimd128Float(Int32MaxX4, scratch); + vcmpleps(Operand(src), scratch, scratch); + vmovmskps(scratch, temp); + cmp32(temp, Imm32(0)); + j(Assembler::NotEqual, onConversionError); + + jump(rejoin); +} + +void +MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(FloatRegister in, FloatRegister out, + Register temp, FloatRegister tempF, + Label* failed) +{ + // Classify lane values into 4 disjoint classes: + // + // N-lanes: in <= -1.0 + // A-lanes: -1.0 < in <= 0x0.ffffffp31 + // B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32 + // V-lanes: 0x1.0p32 <= in, or isnan(in) + // + // We need to bail out to throw a RangeError if we see any N-lanes or + // V-lanes. + // + // For A-lanes and B-lanes, we make two float -> int32 conversions: + // + // A = cvttps2dq(in) + // B = cvttps2dq(in - 0x1.0p31f) + // + // Note that the subtraction for the B computation is exact for B-lanes. + // There is no rounding, so B is the low 31 bits of the correctly converted + // result. + // + // The cvttps2dq instruction produces 0x80000000 when the input is NaN or + // out of range for a signed int32_t. This conveniently provides the missing + // high bit for B, so the desired result is A for A-lanes and A|B for + // B-lanes. + + ScratchSimd128Scope scratch(asMasm()); + + // TODO: If the majority of lanes are A-lanes, it could be faster to compute + // A first, use vmovmskps to check for any non-A-lanes and handle them in + // ool code. OTOH, we we're wrong about the lane distribution, that would be + // slower. + + // Compute B in |scratch|. + static const float Adjust = 0x80000000; // 0x1.0p31f for the benefit of MSVC. + static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust); + asMasm().loadConstantSimd128Float(Bias, scratch); + packedAddFloat32(Operand(in), scratch); + convertFloat32x4ToInt32x4(scratch, scratch); + + // Compute A in |out|. This is the last time we use |in| and the first time + // we use |out|, so we can tolerate if they are the same register. + convertFloat32x4ToInt32x4(in, out); + + // We can identify A-lanes by the sign bits in A: Any A-lanes will be + // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a + // mask of non-A-lanes into |tempF|. + zeroSimd128Float(tempF); + packedGreaterThanInt32x4(Operand(out), tempF); + + // Clear the A-lanes in B. + bitwiseAndSimdInt(scratch, Operand(tempF), scratch); + + // Compute the final result: A for A-lanes, A|B for B-lanes. + bitwiseOrSimdInt(out, Operand(scratch), out); + + // We still need to filter out the V-lanes. They would show up as 0x80000000 + // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are + // the remaining negative lanes in B. 
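// (Worked example, for illustration: take one in-range B-lane,
// in = 3221225472.0f = 0x1.8p31.
//   A = cvttps2dq(in)        -> 0x80000000  (out of signed int32 range)
//   B = cvttps2dq(in - 2^31) -> 0x40000000  (exact: the low 31 bits)
//   A | B                    -> 0xC0000000, i.e. 3221225472 as a uint32.
// A V-lane, by contrast, is 0x80000000 in both A and B, survives the masking
// above, and is caught by the sign-bit test below.)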
+ vmovmskps(scratch, temp); + cmp32(temp, Imm32(0)); + j(Assembler::NotEqual, failed); +} + +void +MacroAssemblerX86Shared::createInt32x4(Register lane0, Register lane1, Register lane2, + Register lane3, FloatRegister dest) +{ + if (AssemblerX86Shared::HasSSE41()) { + vmovd(lane0, dest); + vpinsrd(1, lane1, dest, dest); + vpinsrd(2, lane2, dest, dest); + vpinsrd(3, lane3, dest, dest); + return; + } + + asMasm().reserveStack(Simd128DataSize); + store32(lane0, Address(StackPointer, 0 * sizeof(int32_t))); + store32(lane1, Address(StackPointer, 1 * sizeof(int32_t))); + store32(lane2, Address(StackPointer, 2 * sizeof(int32_t))); + store32(lane3, Address(StackPointer, 3 * sizeof(int32_t))); + loadAlignedSimd128Int(Address(StackPointer, 0), dest); + asMasm().freeStack(Simd128DataSize); +} + +void +MacroAssemblerX86Shared::createFloat32x4(FloatRegister lane0, FloatRegister lane1, + FloatRegister lane2, FloatRegister lane3, + FloatRegister temp, FloatRegister output) +{ + FloatRegister lane0Copy = reusedInputFloat32x4(lane0, output); + FloatRegister lane1Copy = reusedInputFloat32x4(lane1, temp); + vunpcklps(lane3, lane1Copy, temp); + vunpcklps(lane2, lane0Copy, output); + vunpcklps(temp, output, output); +} + +void +MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) +{ + vmovd(input, output); + if (AssemblerX86Shared::HasSSSE3()) { + zeroSimd128Int(ScratchSimd128Reg); + vpshufb(ScratchSimd128Reg, output, output); + } else { + // Use two shifts to duplicate the low 8 bits into the low 16 bits. + vpsllw(Imm32(8), output, output); + vmovdqa(output, ScratchSimd128Reg); + vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg); + vpor(ScratchSimd128Reg, output, output); + // Then do an X8 splat. + vpshuflw(0, output, output); + vpshufd(0, output, output); + } +} + +void +MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) +{ + vmovd(input, output); + vpshuflw(0, output, output); + vpshufd(0, output, output); +} + +void +MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) +{ + vmovd(input, output); + vpshufd(0, output, output); +} + +void +MacroAssemblerX86Shared::splatX4(FloatRegister input, FloatRegister output) +{ + FloatRegister inputCopy = reusedInputFloat32x4(input, output); + vshufps(0, inputCopy, inputCopy, output); +} + +void +MacroAssemblerX86Shared::reinterpretSimd(bool isIntegerLaneType, FloatRegister input, + FloatRegister output) +{ + if (input.aliases(output)) + return; + if (isIntegerLaneType) + vmovdqa(input, output); + else + vmovaps(input, output); +} + +void +MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input, Register output, unsigned lane) +{ + if (lane == 0) { + // The value we want to extract is in the low double-word + moveLowInt32(input, output); + } else if (AssemblerX86Shared::HasSSE41()) { + vpextrd(lane, input, output); + } else { + uint32_t mask = MacroAssembler::ComputeShuffleMask(lane); + shuffleInt32(mask, input, ScratchSimd128Reg); + moveLowInt32(ScratchSimd128Reg, output); + } +} + +void +MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input, FloatRegister output, + unsigned lane, bool canonicalize) +{ + if (lane == 0) { + // The value we want to extract is in the low double-word + if (input != output) + moveFloat32(input, output); + } else if (lane == 2) { + moveHighPairToLowPairFloat32(input, output); + } else { + uint32_t mask = MacroAssembler::ComputeShuffleMask(lane); + shuffleFloat32(mask, input, output); + } + // NaNs contained within SIMD values are not enforced to be canonical, 
so + // when we extract an element into a "regular" scalar JS value, we have to + // canonicalize. In wasm code, we can skip this, as wasm only has to + // canonicalize NaNs at FFI boundaries. + if (canonicalize) + asMasm().canonicalizeFloat(output); +} + +void +MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input, Register output, unsigned lane, + SimdSign sign) +{ + // Unlike pextrd and pextrb, this is available in SSE2. + vpextrw(lane, input, output); + if (sign == SimdSign::Signed) + movswl(output, output); +} + +void +MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input, Register output, unsigned lane, + SimdSign sign) +{ + if (AssemblerX86Shared::HasSSE41()) { + vpextrb(lane, input, output); + // vpextrb clears the high bits, so no further extension required. + if (sign == SimdSign::Unsigned) + sign = SimdSign::NotApplicable; + } else { + // Extract the relevant 16 bits containing our lane, then shift the + // right 8 bits into place. + extractLaneInt16x8(input, output, lane / 2, SimdSign::Unsigned); + if (lane % 2) { + shrl(Imm32(8), output); + // The shrl handles the zero-extension. Don't repeat it. + if (sign == SimdSign::Unsigned) + sign = SimdSign::NotApplicable; + } + } + + // We have the right low 8 bits in |output|, but we may need to fix the high + // bits. Note that this requires |output| to be one of the %eax-%edx + // registers. + switch (sign) { + case SimdSign::Signed: + movsbl(output, output); + break; + case SimdSign::Unsigned: + movzbl(output, output); + break; + case SimdSign::NotApplicable: + // No adjustment needed. + break; + } +} + +void +MacroAssemblerX86Shared::extractLaneSimdBool(FloatRegister input, Register output, unsigned numLanes, + unsigned lane) +{ + switch (numLanes) { + case 4: + extractLaneInt32x4(input, output, lane); + break; + case 8: + // Get a lane, don't bother fixing the high bits since we'll mask below. + extractLaneInt16x8(input, output, lane, SimdSign::NotApplicable); + break; + case 16: + extractLaneInt8x16(input, output, lane, SimdSign::NotApplicable); + break; + default: + MOZ_CRASH("Unhandled SIMD number of lanes"); + } + // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits. + asMasm().and32(Imm32(1), output); +} + +void +MacroAssemblerX86Shared::insertLaneSimdInt(FloatRegister input, Register value, FloatRegister output, + unsigned lane, unsigned numLanes) +{ + if (numLanes == 8) { + // Available in SSE 2. + vpinsrw(lane, value, input, output); + return; + } + + // Note that, contrarily to float32x4, we cannot use vmovd if the inserted + // value goes into the first component, as vmovd clears out the higher lanes + // of the output. + if (AssemblerX86Shared::HasSSE41()) { + // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX. + switch (numLanes) { + case 4: + vpinsrd(lane, value, input, output); + return; + case 16: + vpinsrb(lane, value, input, output); + return; + } + } + + asMasm().reserveStack(Simd128DataSize); + storeAlignedSimd128Int(input, Address(StackPointer, 0)); + switch (numLanes) { + case 4: + store32(value, Address(StackPointer, lane * sizeof(int32_t))); + break; + case 16: + // Note that this requires `value` to be in one the registers where the + // low 8 bits are addressible (%eax - %edx on x86, all of them on x86-64). 
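// (Aside: on 32-bit x86 only %al, %bl, %cl and %dl exist as byte registers,
// which is why store8 needs |value| allocated in %eax-%edx there; on x86-64 a
// REX prefix makes the low byte of any GPR addressable.)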
+ store8(value, Address(StackPointer, lane * sizeof(int8_t))); + break; + default: + MOZ_CRASH("Unsupported SIMD numLanes"); + } + loadAlignedSimd128Int(Address(StackPointer, 0), output); + asMasm().freeStack(Simd128DataSize); +} + +void +MacroAssemblerX86Shared::insertLaneFloat32x4(FloatRegister input, FloatRegister value, + FloatRegister output, unsigned lane) +{ + if (lane == 0) { + // As both operands are registers, vmovss doesn't modify the upper bits + // of the destination operand. + if (value != output) + vmovss(value, input, output); + return; + } + + if (AssemblerX86Shared::HasSSE41()) { + // The input value is in the low float32 of the 'value' FloatRegister. + vinsertps(vinsertpsMask(0, lane), value, output, output); + return; + } + + asMasm().reserveStack(Simd128DataSize); + storeAlignedSimd128Float(input, Address(StackPointer, 0)); + asMasm().storeFloat32(value, Address(StackPointer, lane * sizeof(int32_t))); + loadAlignedSimd128Float(Address(StackPointer, 0), output); + asMasm().freeStack(Simd128DataSize); +} + +void +MacroAssemblerX86Shared::allTrueSimdBool(FloatRegister input, Register output) +{ + // We know that the input lanes are boolean, so they are either 0 or -1. + // The all-true vector has all 128 bits set, no matter the lane geometry. + vpmovmskb(input, output); + cmp32(output, Imm32(0xffff)); + emitSet(Assembler::Zero, output); +} + +void +MacroAssemblerX86Shared::anyTrueSimdBool(FloatRegister input, Register output) +{ + vpmovmskb(input, output); + cmp32(output, Imm32(0x0)); + emitSet(Assembler::NonZero, output); +} + +void +MacroAssemblerX86Shared::swizzleInt32x4(FloatRegister input, FloatRegister output, + unsigned lanes[4]) +{ + uint32_t mask = MacroAssembler::ComputeShuffleMask(lanes[0], lanes[1], lanes[2], lanes[3]); + shuffleInt32(mask, input, output); +} + +void +MacroAssemblerX86Shared::swizzleInt8x16(FloatRegister input, FloatRegister output, + const Maybe<Register>& temp, int8_t lanes[16]) +{ + if (AssemblerX86Shared::HasSSSE3()) { + ScratchSimd128Scope scratch(asMasm()); + asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(lanes), scratch); + FloatRegister inputCopy = reusedInputInt32x4(input, output); + vpshufb(scratch, inputCopy, output); + return; + } + + // Worst-case fallback for pre-SSSE3 machines. Bounce through memory. 
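// (Background on the vpshufb path above: each output byte is selected by the
// low four bits of its control byte, and a control byte with the high bit set
// produces zero; the same property lets shuffleInt8x16 below use a negative
// index to blank a lane.)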
+ MOZ_ASSERT(!!temp, "needs a temp for the memory fallback"); + asMasm().reserveStack(2 * Simd128DataSize); + storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize)); + for (unsigned i = 0; i < 16; i++) { + load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *temp); + store8(*temp, Address(StackPointer, i)); + } + loadAlignedSimd128Int(Address(StackPointer, 0), output); + asMasm().freeStack(2 * Simd128DataSize); +} + +static inline bool +LanesMatch(unsigned lanes[4], unsigned x, unsigned y, unsigned z, unsigned w) +{ + return lanes[0] == x && lanes[1] == y && lanes[2] == z && lanes[3] == w; +} + +void +MacroAssemblerX86Shared::swizzleFloat32x4(FloatRegister input, FloatRegister output, + unsigned lanes[4]) +{ + if (AssemblerX86Shared::HasSSE3()) { + if (LanesMatch(lanes, 0, 0, 2, 2)) { + vmovsldup(input, output); + return; + } + if (LanesMatch(lanes, 1, 1, 3, 3)) { + vmovshdup(input, output); + return; + } + } + + // TODO Here and below, arch specific lowering could identify this pattern + // and use defineReuseInput to avoid this move (bug 1084404) + if (LanesMatch(lanes, 2, 3, 2, 3)) { + FloatRegister inputCopy = reusedInputFloat32x4(input, output); + vmovhlps(input, inputCopy, output); + return; + } + + if (LanesMatch(lanes, 0, 1, 0, 1)) { + if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) { + vmovddup(input, output); + return; + } + FloatRegister inputCopy = reusedInputFloat32x4(input, output); + vmovlhps(input, inputCopy, output); + return; + } + + if (LanesMatch(lanes, 0, 0, 1, 1)) { + FloatRegister inputCopy = reusedInputFloat32x4(input, output); + vunpcklps(input, inputCopy, output); + return; + } + + if (LanesMatch(lanes, 2, 2, 3, 3)) { + FloatRegister inputCopy = reusedInputFloat32x4(input, output); + vunpckhps(input, inputCopy, output); + return; + } + + uint32_t x = lanes[0]; + uint32_t y = lanes[1]; + uint32_t z = lanes[2]; + uint32_t w = lanes[3]; + + uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w); + shuffleFloat32(mask, input, output); +} + +void +MacroAssemblerX86Shared::shuffleInt8x16(FloatRegister lhs, FloatRegister rhs, FloatRegister output, + const Maybe<FloatRegister>& maybeFloatTemp, + const Maybe<Register>& maybeTemp, uint8_t lanes[16]) +{ + DebugOnly<bool> hasSSSE3 = AssemblerX86Shared::HasSSSE3(); + MOZ_ASSERT(hasSSSE3 == !!maybeFloatTemp); + MOZ_ASSERT(!hasSSSE3 == !!maybeTemp); + + // Use pshufb if it is available. + if (AssemblerX86Shared::HasSSSE3()) { + ScratchSimd128Scope scratch(asMasm()); + + // Use pshufb instructions to gather the lanes from each source vector. + // A negative index creates a zero lane, so the two vectors can be combined. + + // Set scratch = lanes from lhs. + int8_t idx[16]; + for (unsigned i = 0; i < 16; i++) + idx[i] = lanes[i] < 16 ? lanes[i] : -1; + asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx), *maybeFloatTemp); + FloatRegister lhsCopy = reusedInputInt32x4(lhs, scratch); + vpshufb(*maybeFloatTemp, lhsCopy, scratch); + + // Set output = lanes from rhs. + for (unsigned i = 0; i < 16; i++) + idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1; + asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx), *maybeFloatTemp); + FloatRegister rhsCopy = reusedInputInt32x4(rhs, output); + vpshufb(*maybeFloatTemp, rhsCopy, output); + + // Combine. + vpor(scratch, output, output); + return; + } + + // Worst-case fallback for pre-SSE3 machines. Bounce through memory. 
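// (Illustration of the layout used below: lhs is spilled at [sp + 16] and rhs
// at [sp + 32], so a lane index in 0..15 reads a byte of lhs while an index in
// 16..31 falls straight through into the rhs copy; one load handles both
// sources.)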
+ asMasm().reserveStack(3 * Simd128DataSize); + storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize)); + storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize)); + for (unsigned i = 0; i < 16; i++) { + load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *maybeTemp); + store8(*maybeTemp, Address(StackPointer, i)); + } + loadAlignedSimd128Int(Address(StackPointer, 0), output); + asMasm().freeStack(3 * Simd128DataSize); +} + +void +MacroAssemblerX86Shared::shuffleX4(FloatRegister lhs, Operand rhs, FloatRegister out, + const Maybe<FloatRegister>& maybeTemp, unsigned lanes[4]) +{ + uint32_t x = lanes[0]; + uint32_t y = lanes[1]; + uint32_t z = lanes[2]; + uint32_t w = lanes[3]; + + // Check that lanes come from LHS in majority: + unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4); + MOZ_ASSERT(numLanesFromLHS >= 2); + + // When reading this method, remember that vshufps takes the two first + // inputs of the destination operand (right operand) and the two last + // inputs of the source operand (left operand). + // + // Legend for explanations: + // - L: LHS + // - R: RHS + // - T: temporary + + uint32_t mask; + + // If all lanes came from a single vector, we should use swizzle instead. + MOZ_ASSERT(numLanesFromLHS < 4); + + // If all values stay in their lane, this is a blend. + if (AssemblerX86Shared::HasSSE41()) { + if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) { + vblendps(blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out); + return; + } + } + + // One element of the second, all other elements of the first + if (numLanesFromLHS == 3) { + unsigned firstMask = -1, secondMask = -1; + + // register-register vmovss preserves the high lanes. + if (LanesMatch(lanes, 4, 1, 2, 3) && rhs.kind() == Operand::FPREG) { + vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out); + return; + } + + // SSE4.1 vinsertps can handle any single element. 
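// (Worked example, assuming SSE4.1: lanes = {0, 1, 6, 3} leaves three lanes of
// lhs in place, so numLanesUnchanged below is 3; z = 6 >= 4 gives srcLane = 2
// and dstLane = 2, and a single vinsertps copies rhs lane 2 into output lane 2
// on top of lhs.)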
+ unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3); + if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) { + unsigned srcLane; + unsigned dstLane; + if (x >= 4) { + srcLane = x - 4; + dstLane = 0; + } else if (y >= 4) { + srcLane = y - 4; + dstLane = 1; + } else if (z >= 4) { + srcLane = z - 4; + dstLane = 2; + } else { + MOZ_ASSERT(w >= 4); + srcLane = w - 4; + dstLane = 3; + } + vinsertps(vinsertpsMask(srcLane, dstLane), rhs, lhs, out); + return; + } + + MOZ_ASSERT(!!maybeTemp); + FloatRegister rhsCopy = *maybeTemp; + loadAlignedSimd128Float(rhs, rhsCopy); + + if (x < 4 && y < 4) { + if (w >= 4) { + w %= 4; + // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy) + firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z); + // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out) + secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0); + } else { + MOZ_ASSERT(z >= 4); + z %= 4; + // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy) + firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w); + // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out) + secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2); + } + + vshufps(firstMask, lhs, rhsCopy, rhsCopy); + vshufps(secondMask, rhsCopy, lhs, out); + return; + } + + MOZ_ASSERT(z < 4 && w < 4); + + if (y >= 4) { + y %= 4; + // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy) + firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x); + // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out) + secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w); + } else { + MOZ_ASSERT(x >= 4); + x %= 4; + // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy) + firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y); + // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out) + secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w); + } + + vshufps(firstMask, lhs, rhsCopy, rhsCopy); + if (AssemblerX86Shared::HasAVX()) { + vshufps(secondMask, lhs, rhsCopy, out); + } else { + vshufps(secondMask, lhs, rhsCopy, rhsCopy); + moveSimd128Float(rhsCopy, out); + } + return; + } + + // Two elements from one vector, two other elements from the other + MOZ_ASSERT(numLanesFromLHS == 2); + + // TODO Here and below, symmetric case would be more handy to avoid a move, + // but can't be reached because operands would get swapped (bug 1084404). + if (LanesMatch(lanes, 2, 3, 6, 7)) { + ScratchSimd128Scope scratch(asMasm()); + if (AssemblerX86Shared::HasAVX()) { + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch); + vmovhlps(lhs, rhsCopy, out); + } else { + loadAlignedSimd128Float(rhs, scratch); + vmovhlps(lhs, scratch, scratch); + moveSimd128Float(scratch, out); + } + return; + } + + if (LanesMatch(lanes, 0, 1, 4, 5)) { + FloatRegister rhsCopy; + ScratchSimd128Scope scratch(asMasm()); + if (rhs.kind() == Operand::FPREG) { + // No need to make an actual copy, since the operand is already + // in a register, and it won't be clobbered by the vmovlhps. 
+ rhsCopy = FloatRegister::FromCode(rhs.fpu()); + } else { + loadAlignedSimd128Float(rhs, scratch); + rhsCopy = scratch; + } + vmovlhps(rhsCopy, lhs, out); + return; + } + + if (LanesMatch(lanes, 0, 4, 1, 5)) { + vunpcklps(rhs, lhs, out); + return; + } + + // TODO swapped case would be better (bug 1084404) + if (LanesMatch(lanes, 4, 0, 5, 1)) { + ScratchSimd128Scope scratch(asMasm()); + if (AssemblerX86Shared::HasAVX()) { + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch); + vunpcklps(lhs, rhsCopy, out); + } else { + loadAlignedSimd128Float(rhs, scratch); + vunpcklps(lhs, scratch, scratch); + moveSimd128Float(scratch, out); + } + return; + } + + if (LanesMatch(lanes, 2, 6, 3, 7)) { + vunpckhps(rhs, lhs, out); + return; + } + + // TODO swapped case would be better (bug 1084404) + if (LanesMatch(lanes, 6, 2, 7, 3)) { + ScratchSimd128Scope scratch(asMasm()); + if (AssemblerX86Shared::HasAVX()) { + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch); + vunpckhps(lhs, rhsCopy, out); + } else { + loadAlignedSimd128Float(rhs, scratch); + vunpckhps(lhs, scratch, scratch); + moveSimd128Float(scratch, out); + } + return; + } + + // In one vshufps + if (x < 4 && y < 4) { + mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4); + vshufps(mask, rhs, lhs, out); + return; + } + + // At creation, we should have explicitly swapped in this case. + MOZ_ASSERT(!(z >= 4 && w >= 4)); + + // In two vshufps, for the most generic case: + uint32_t firstMask[4], secondMask[4]; + unsigned i = 0, j = 2, k = 0; + +#define COMPUTE_MASK(lane) \ + if (lane >= 4) { \ + firstMask[j] = lane % 4; \ + secondMask[k++] = j++; \ + } else { \ + firstMask[i] = lane; \ + secondMask[k++] = i++; \ + } + + COMPUTE_MASK(x) + COMPUTE_MASK(y) + COMPUTE_MASK(z) + COMPUTE_MASK(w) +#undef COMPUTE_MASK + + MOZ_ASSERT(i == 2 && j == 4 && k == 4); + + mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1], + firstMask[2], firstMask[3]); + vshufps(mask, rhs, lhs, lhs); + + mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1], + secondMask[2], secondMask[3]); + vshufps(mask, lhs, lhs, lhs); +} + +static inline FloatRegister +ToSimdFloatRegister(const Operand& op) +{ + return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128); +} + +void +MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output) +{ + static const SimdConstant allOnes = SimdConstant::SplatX16(-1); + ScratchSimd128Scope scratch(asMasm()); + switch (cond) { + case Assembler::Condition::GreaterThan: + vpcmpgtb(rhs, lhs, output); + break; + case Assembler::Condition::Equal: + vpcmpeqb(rhs, lhs, output); + break; + case Assembler::Condition::LessThan: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + + // src := src > lhs (i.e. lhs < rhs) + // Improve by doing custom lowering (rhs is tied to the output register) + vpcmpgtb(Operand(lhs), scratch, scratch); + moveSimd128Int(scratch, output); + break; + case Assembler::Condition::NotEqual: + // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we + // should invert the comparison by, e.g. swapping the arms of a select + // if that's what it's used in. 
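// (For reference: SSE2 has no packed "not equal" or "greater or equal" byte
// compare, so NotEqual below is computed as pcmpeqb followed by an XOR with
// all-ones, and the two ordered cases are likewise derived from pcmpgtb plus a
// complement.)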
+ asMasm().loadConstantSimd128Int(allOnes, scratch); + vpcmpeqb(rhs, lhs, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + case Assembler::Condition::GreaterThanOrEqual: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + vpcmpgtb(Operand(lhs), scratch, scratch); + asMasm().loadConstantSimd128Int(allOnes, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + case Assembler::Condition::LessThanOrEqual: + // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. + asMasm().loadConstantSimd128Int(allOnes, scratch); + vpcmpgtb(rhs, lhs, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + default: + MOZ_CRASH("unexpected condition op"); + } +} + +void +MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output) +{ + static const SimdConstant allOnes = SimdConstant::SplatX8(-1); + + ScratchSimd128Scope scratch(asMasm()); + switch (cond) { + case Assembler::Condition::GreaterThan: + vpcmpgtw(rhs, lhs, output); + break; + case Assembler::Condition::Equal: + vpcmpeqw(rhs, lhs, output); + break; + case Assembler::Condition::LessThan: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + + // src := src > lhs (i.e. lhs < rhs) + // Improve by doing custom lowering (rhs is tied to the output register) + vpcmpgtw(Operand(lhs), scratch, scratch); + moveSimd128Int(scratch, output); + break; + case Assembler::Condition::NotEqual: + // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we + // should invert the comparison by, e.g. swapping the arms of a select + // if that's what it's used in. + asMasm().loadConstantSimd128Int(allOnes, scratch); + vpcmpeqw(rhs, lhs, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + case Assembler::Condition::GreaterThanOrEqual: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + vpcmpgtw(Operand(lhs), scratch, scratch); + asMasm().loadConstantSimd128Int(allOnes, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + case Assembler::Condition::LessThanOrEqual: + // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. + asMasm().loadConstantSimd128Int(allOnes, scratch); + vpcmpgtw(rhs, lhs, output); + bitwiseXorSimdInt(output, Operand(scratch), output); + break; + default: + MOZ_CRASH("unexpected condition op"); + } +} + +void +MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output) +{ + static const SimdConstant allOnes = SimdConstant::SplatX4(-1); + ScratchSimd128Scope scratch(asMasm()); + switch (cond) { + case Assembler::Condition::GreaterThan: + packedGreaterThanInt32x4(rhs, lhs); + break; + case Assembler::Condition::Equal: + packedEqualInt32x4(rhs, lhs); + break; + case Assembler::Condition::LessThan: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + + // src := src > lhs (i.e. 
lhs < rhs) + // Improve by doing custom lowering (rhs is tied to the output register) + packedGreaterThanInt32x4(Operand(lhs), scratch); + moveSimd128Int(scratch, lhs); + break; + case Assembler::Condition::NotEqual: + // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we + // should invert the comparison by, e.g. swapping the arms of a select + // if that's what it's used in. + asMasm().loadConstantSimd128Int(allOnes, scratch); + packedEqualInt32x4(rhs, lhs); + bitwiseXorSimdInt(lhs, Operand(scratch), lhs); + break; + case Assembler::Condition::GreaterThanOrEqual: + // src := rhs + if (rhs.kind() == Operand::FPREG) + moveSimd128Int(ToSimdFloatRegister(rhs), scratch); + else + loadAlignedSimd128Int(rhs, scratch); + packedGreaterThanInt32x4(Operand(lhs), scratch); + asMasm().loadConstantSimd128Int(allOnes, lhs); + bitwiseXorSimdInt(lhs, Operand(scratch), lhs); + break; + case Assembler::Condition::LessThanOrEqual: + // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here. + asMasm().loadConstantSimd128Int(allOnes, scratch); + packedGreaterThanInt32x4(rhs, lhs); + bitwiseXorSimdInt(lhs, Operand(scratch), lhs); + break; + default: + MOZ_CRASH("unexpected condition op"); + } +} + +void +MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output) +{ + switch (cond) { + case Assembler::Condition::Equal: + vcmpeqps(rhs, lhs, output); + break; + case Assembler::Condition::LessThan: + vcmpltps(rhs, lhs, output); + break; + case Assembler::Condition::LessThanOrEqual: + vcmpleps(rhs, lhs, output); + break; + case Assembler::Condition::NotEqual: + vcmpneqps(rhs, lhs, output); + break; + case Assembler::Condition::GreaterThanOrEqual: + case Assembler::Condition::GreaterThan: + // We reverse these before register allocation so that we don't have to + // copy into and out of temporaries after codegen. + MOZ_CRASH("should have reversed this"); + default: + MOZ_CRASH("unexpected condition op"); + } +} + +void +MacroAssemblerX86Shared::mulInt32x4(FloatRegister lhs, Operand rhs, + const Maybe<FloatRegister>& temp, FloatRegister output) +{ + if (AssemblerX86Shared::HasSSE41()) { + vpmulld(rhs, lhs, output); + return; + } + + ScratchSimd128Scope scratch(asMasm()); + loadAlignedSimd128Int(rhs, scratch); + vpmuludq(lhs, scratch, scratch); + // scratch contains (Rx, _, Rz, _) where R is the resulting vector. + + MOZ_ASSERT(!!temp); + vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), lhs, lhs); + vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), rhs, *temp); + vpmuludq(*temp, lhs, lhs); + // lhs contains (Ry, _, Rw, _) where R is the resulting vector. 
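// (For reference: vpmuludq only multiplies the even 32-bit lanes (0 and 2),
// producing two 64-bit products whose low halves sit in lanes 0 and 2; the odd
// lanes are shuffled down and multiplied the same way, and the two vshufps
// below interleave the four results back into (Rx, Ry, Rz, Rw) order.)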
+ + vshufps(MacroAssembler::ComputeShuffleMask(0, 2, 0, 2), scratch, lhs, lhs); + // lhs contains (Ry, Rw, Rx, Rz) + vshufps(MacroAssembler::ComputeShuffleMask(2, 0, 3, 1), lhs, lhs, lhs); +} + +void +MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) +{ + ScratchSimd128Scope scratch(asMasm()); + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch); + vminps(Operand(lhs), rhsCopy, scratch); + vminps(rhs, lhs, output); + vorps(scratch, output, output); // NaN or'd with arbitrary bits is NaN +} + +void +MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, + FloatRegister output) +{ + ScratchSimd128Scope scratch(asMasm()); + FloatRegister lhsCopy = reusedInputFloat32x4(lhs, scratch); + vcmpunordps(rhs, lhsCopy, scratch); + + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, temp); + vmaxps(Operand(lhs), rhsCopy, temp); + vmaxps(rhs, lhs, output); + + vandps(temp, output, output); + vorps(scratch, output, output); // or in the all-ones NaNs +} + +void +MacroAssemblerX86Shared::minNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, + FloatRegister output) +{ + ScratchSimd128Scope scratch(asMasm()); + asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), temp); + + FloatRegister mask = scratch; + FloatRegister tmpCopy = reusedInputFloat32x4(temp, scratch); + vpcmpeqd(Operand(lhs), tmpCopy, mask); + vandps(temp, mask, mask); + + FloatRegister lhsCopy = reusedInputFloat32x4(lhs, temp); + vminps(rhs, lhsCopy, temp); + vorps(mask, temp, temp); + + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, mask); + vcmpneqps(rhs, rhsCopy, mask); + + if (AssemblerX86Shared::HasAVX()) { + vblendvps(mask, lhs, temp, output); + } else { + // Emulate vblendvps. + // With SSE.4.1 we could use blendvps, however it's awkward since + // it requires the mask to be in xmm0. + if (lhs != output) + moveSimd128Float(lhs, output); + vandps(Operand(mask), output, output); + vandnps(Operand(temp), mask, mask); + vorps(Operand(mask), output, output); + } +} + +void +MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, + FloatRegister output) +{ + ScratchSimd128Scope scratch(asMasm()); + FloatRegister mask = scratch; + + asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(0), mask); + vpcmpeqd(Operand(lhs), mask, mask); + + asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), temp); + vandps(temp, mask, mask); + + FloatRegister lhsCopy = reusedInputFloat32x4(lhs, temp); + vmaxps(rhs, lhsCopy, temp); + vandnps(Operand(temp), mask, mask); + + // Ensure temp always contains the temporary result + mask = temp; + temp = scratch; + + FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, mask); + vcmpneqps(rhs, rhsCopy, mask); + + if (AssemblerX86Shared::HasAVX()) { + vblendvps(mask, lhs, temp, output); + } else { + // Emulate vblendvps. + // With SSE.4.1 we could use blendvps, however it's awkward since + // it requires the mask to be in xmm0. 
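// (For reference, the emulation below computes
//     output = (lhs & mask) | (temp & ~mask)
// i.e. a bitwise select that keeps lhs wherever the NaN mask is set and takes
// the computed max from temp elsewhere; this matches what vblendvps produces
// on the AVX path.)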
+ if (lhs != output) + moveSimd128Float(lhs, output); + vandps(Operand(mask), output, output); + vandnps(Operand(temp), mask, mask); + vorps(Operand(mask), output, output); + } +} + +void +MacroAssemblerX86Shared::negFloat32x4(Operand in, FloatRegister out) +{ + // All zeros but the sign bit + static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f); + asMasm().loadConstantSimd128Float(minusZero, out); + bitwiseXorFloat32x4(out, in, out); +} + +void +MacroAssemblerX86Shared::notInt8x16(Operand in, FloatRegister out) +{ + static const SimdConstant allOnes = SimdConstant::SplatX16(-1); + asMasm().loadConstantSimd128Int(allOnes, out); + bitwiseXorSimdInt(out, in, out); +} + +void +MacroAssemblerX86Shared::notInt16x8(Operand in, FloatRegister out) +{ + static const SimdConstant allOnes = SimdConstant::SplatX8(-1); + asMasm().loadConstantSimd128Int(allOnes, out); + bitwiseXorSimdInt(out, in, out); +} + +void +MacroAssemblerX86Shared::notInt32x4(Operand in, FloatRegister out) +{ + static const SimdConstant allOnes = SimdConstant::SplatX4(-1); + asMasm().loadConstantSimd128Int(allOnes, out); + bitwiseXorSimdInt(out, in, out); +} + +void +MacroAssemblerX86Shared::notFloat32x4(Operand in, FloatRegister out) +{ + float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits); + static const SimdConstant allOnes = SimdConstant::SplatX4(ones); + asMasm().loadConstantSimd128Float(allOnes, out); + bitwiseXorFloat32x4(out, in, out); +} + +void +MacroAssemblerX86Shared::absFloat32x4(Operand in, FloatRegister out) +{ + // All ones but the sign bit + float signMask = SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits); + static const SimdConstant signMasks = SimdConstant::SplatX4(signMask); + asMasm().loadConstantSimd128Float(signMasks, out); + bitwiseAndFloat32x4(out, in, out); +} + +static inline void +MaskSimdShiftCount(MacroAssembler& masm, unsigned shiftmask, Register count, Register temp, + FloatRegister dest) +{ + masm.mov(count, temp); + masm.andl(Imm32(shiftmask), temp); + masm.vmovd(temp, dest); +} + +void +MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(FloatRegister in, Register count, + Register temp, FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 15, count, temp, scratch); + vpsllw(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(FloatRegister in, Register count, + Register temp, FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 15, count, temp, scratch); + vpsraw(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(FloatRegister in, Register count, + Register temp, FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 15, count, temp, scratch); + vpsrlw(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(FloatRegister in, Register count, + Register temp, FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 31, count, temp, scratch); + vpslld(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(FloatRegister in, Register count, + Register temp, FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 31, count, temp, scratch); + vpsrad(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(FloatRegister in, Register count, + Register temp, 
FloatRegister dest) +{ + ScratchSimd128Scope scratch(asMasm()); + MaskSimdShiftCount(asMasm(), 31, count, temp, scratch); + vpsrld(scratch, in, dest); +} + +void +MacroAssemblerX86Shared::selectSimd128(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse, + FloatRegister temp, FloatRegister output) +{ + if (onTrue != output) + vmovaps(onTrue, output); + if (mask != temp) + vmovaps(mask, temp); + + // SSE4.1 has plain blendvps which can do this, but it is awkward + // to use because it requires the mask to be in xmm0. + + bitwiseAndSimdInt(output, Operand(temp), output); + bitwiseAndNotSimdInt(temp, Operand(onFalse), temp); + bitwiseOrSimdInt(output, Operand(temp), output); +} diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h index 36f3a008a9..f308e41fd8 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h @@ -1123,9 +1123,9 @@ MacroAssembler::canonicalizeFloat32x4(FloatRegister reg, FloatRegister scratch) float nanf = float(JS::GenericNaN()); loadConstantSimd128Float(SimdConstant::SplatX4(nanf), ifFalse); - bitwiseAndSimd128(Operand(mask), reg); - bitwiseAndNotSimd128(Operand(ifFalse), mask); - bitwiseOrSimd128(Operand(mask), reg); + bitwiseAndFloat32x4(reg, Operand(mask), reg); + bitwiseAndNotFloat32x4(mask, Operand(ifFalse), mask); + bitwiseOrFloat32x4(reg, Operand(mask), reg); } // ======================================================================== diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h index e7783736b2..25b3b846da 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h @@ -820,20 +820,179 @@ class MacroAssemblerX86Shared : public Assembler vcvtdq2ps(src, dest); } - void bitwiseAndSimd128(const Operand& src, FloatRegister dest) { - // TODO Using the "ps" variant for all types incurs a domain crossing - // penalty for integer types and double. - vandps(src, dest, dest); + // SIMD methods, defined in MacroAssembler-x86-shared-SIMD.cpp. 
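// (For reference: these declarations replace the former type-agnostic
// bitwiseAndSimd128 and friends, which always used the "ps" forms; the
// Float32x4 and SimdInt variants below let integer data use vpand/vpor/vpxor
// and avoid the domain-crossing penalty the old TODO mentioned.)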
+ void checkedConvertFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest, Register temp, + Label* oolCheck, Label* rejoin); + void oolConvertFloat32x4ToInt32x4(FloatRegister src, Register temp, Label* rejoin, + Label* onConversionError); + void checkedConvertFloat32x4ToUint32x4(FloatRegister src, FloatRegister dest, Register temp, + FloatRegister tempF, Label* failed); + + void createInt32x4(Register lane0, Register lane1, Register lane2, Register lane3, + FloatRegister dest); + void createFloat32x4(FloatRegister lane0, FloatRegister lane1, FloatRegister lane2, + FloatRegister lane3, FloatRegister temp, FloatRegister output); + + void splatX16(Register input, FloatRegister output); + void splatX8(Register input, FloatRegister output); + void splatX4(Register input, FloatRegister output); + void splatX4(FloatRegister input, FloatRegister output); + + void reinterpretSimd(bool isIntegerLaneType, FloatRegister input, FloatRegister output); + + void extractLaneInt32x4(FloatRegister input, Register output, unsigned lane); + void extractLaneFloat32x4(FloatRegister input, FloatRegister output, unsigned lane, + bool canonicalize); + void extractLaneInt16x8(FloatRegister input, Register output, unsigned lane, SimdSign sign); + void extractLaneInt8x16(FloatRegister input, Register output, unsigned lane, SimdSign sign); + void extractLaneSimdBool(FloatRegister input, Register output, unsigned numLanes, unsigned lane); + + void insertLaneSimdInt(FloatRegister input, Register value, FloatRegister output, + unsigned lane, unsigned numLanes); + void insertLaneFloat32x4(FloatRegister input, FloatRegister value, FloatRegister output, + unsigned lane); + + void allTrueSimdBool(FloatRegister input, Register output); + void anyTrueSimdBool(FloatRegister input, Register output); + + void swizzleInt32x4(FloatRegister input, FloatRegister output, unsigned lanes[4]); + void swizzleFloat32x4(FloatRegister input, FloatRegister output, unsigned lanes[4]); + void swizzleInt8x16(FloatRegister input, FloatRegister output, + const mozilla::Maybe<Register>& temp, int8_t lanes[16]); + + void shuffleX4(FloatRegister lhs, Operand rhs, FloatRegister out, + const mozilla::Maybe<FloatRegister>& maybeTemp, unsigned lanes[4]); + void shuffleInt8x16(FloatRegister lhs, FloatRegister rhs, FloatRegister output, + const mozilla::Maybe<FloatRegister>& maybeFloatTemp, + const mozilla::Maybe<Register>& maybeTemp, uint8_t lanes[16]); + + void compareInt8x16(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output); + void compareInt16x8(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output); + void compareInt32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output); + void compareFloat32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond, + FloatRegister output); + + void addInt8x16(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpaddb(rhs, lhs, output); + } + void addInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpaddw(rhs, lhs, output); + } + void addInt32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpaddd(rhs, lhs, output); + } + void addFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vaddps(rhs, lhs, output); + } + + void addSatInt8x16(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) { + if (sign == SimdSign::Signed) + vpaddsb(rhs, lhs, output); + else + vpaddusb(rhs, lhs, output); + } + void addSatInt16x8(FloatRegister lhs, Operand rhs, SimdSign sign, 
FloatRegister output) { + if (sign == SimdSign::Signed) + vpaddsw(rhs, lhs, output); + else + vpaddusw(rhs, lhs, output); + } + + void subInt8x16(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpsubb(rhs, lhs, output); + } + void subInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpsubw(rhs, lhs, output); + } + void subInt32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpsubd(rhs, lhs, output); + } + void subFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vsubps(rhs, lhs, output); + } + + void subSatInt8x16(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) { + if (sign == SimdSign::Signed) + vpsubsb(rhs, lhs, output); + else + vpsubusb(rhs, lhs, output); + } + void subSatInt16x8(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) { + if (sign == SimdSign::Signed) + vpsubsw(rhs, lhs, output); + else + vpsubusw(rhs, lhs, output); + } + + void mulInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) { + vpmullw(rhs, lhs, output); + } + void mulInt32x4(FloatRegister lhs, Operand rhs, const mozilla::Maybe<FloatRegister>& temp, + FloatRegister output); + void mulFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vmulps(rhs, lhs, output); + } + + void negInt8x16(Operand in, FloatRegister out) { + zeroSimd128Int(out); + packedSubInt8(in, out); + } + void negInt16x8(Operand in, FloatRegister out) { + zeroSimd128Int(out); + packedSubInt16(in, out); + } + void negInt32x4(Operand in, FloatRegister out) { + zeroSimd128Int(out); + packedSubInt32(in, out); + } + void negFloat32x4(Operand in, FloatRegister out); + + void notInt8x16(Operand in, FloatRegister out); + void notInt16x8(Operand in, FloatRegister out); + void notInt32x4(Operand in, FloatRegister out); + void notFloat32x4(Operand in, FloatRegister out); + + void divFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) { + vdivps(rhs, lhs, output); } - void bitwiseAndNotSimd128(const Operand& src, FloatRegister dest) { - vandnps(src, dest, dest); + void minFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output); + void maxFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output); + void minNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output); + void maxNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output); + + void absFloat32x4(Operand in, FloatRegister out); + + void bitwiseAndFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vandps(rhs, lhs, dest); + } + void bitwiseAndSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vpand(rhs, lhs, dest); + } + + void bitwiseOrFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vorps(rhs, lhs, dest); + } + void bitwiseOrSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vpor(rhs, lhs, dest); } - void bitwiseOrSimd128(const Operand& src, FloatRegister dest) { - vorps(src, dest, dest); + + void bitwiseXorFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vxorps(rhs, lhs, dest); + } + void bitwiseXorSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vpxor(rhs, lhs, dest); + } + + void bitwiseAndNotFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) { + vandnps(rhs, lhs, dest); } - void bitwiseXorSimd128(const Operand& src, FloatRegister dest) { - vxorps(src, dest, dest); + void bitwiseAndNotSimdInt(FloatRegister lhs, const Operand& rhs, 
FloatRegister dest) { + vpandn(rhs, lhs, dest); } + void zeroSimd128Float(FloatRegister dest) { vxorps(dest, dest, dest); } @@ -841,6 +1000,16 @@ class MacroAssemblerX86Shared : public Assembler vpxor(dest, dest, dest); } + void selectSimd128(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse, + FloatRegister temp, FloatRegister output); + void selectX4(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse, + FloatRegister temp, FloatRegister output) { + if (AssemblerX86Shared::HasAVX()) + vblendvps(mask, onTrue, onFalse, output); + else + selectSimd128(mask, onTrue, onFalse, temp, output); + } + template <class T, class Reg> inline void loadScalar(const Operand& src, Reg dest); template <class T, class Reg> inline void storeScalar(Reg src, const Address& dest); template <class T> inline void loadAlignedVector(const Address& src, FloatRegister dest); @@ -987,41 +1156,38 @@ class MacroAssemblerX86Shared : public Assembler vsqrtps(src, dest); } - void packedLeftShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) { - vpsllw(src, dest, dest); - } + public: + void packedLeftShiftByScalarInt16x8(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedRightShiftByScalarInt16x8(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedUnsignedRightShiftByScalarInt16x8(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedLeftShiftByScalarInt16x8(Imm32 count, FloatRegister dest) { + count.value &= 15; vpsllw(count, dest, dest); } - void packedRightShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) { - vpsraw(src, dest, dest); - } void packedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) { + count.value &= 15; vpsraw(count, dest, dest); } - void packedUnsignedRightShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) { - vpsrlw(src, dest, dest); - } void packedUnsignedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) { + count.value &= 15; vpsrlw(count, dest, dest); } - void packedLeftShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) { - vpslld(src, dest, dest); - } + void packedLeftShiftByScalarInt32x4(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedRightShiftByScalarInt32x4(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedUnsignedRightShiftByScalarInt32x4(FloatRegister in, Register count, Register temp, FloatRegister dest); + void packedLeftShiftByScalarInt32x4(Imm32 count, FloatRegister dest) { + count.value &= 31; vpslld(count, dest, dest); } - void packedRightShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) { - vpsrad(src, dest, dest); - } void packedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) { + count.value &= 31; vpsrad(count, dest, dest); } - void packedUnsignedRightShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) { - vpsrld(src, dest, dest); - } void packedUnsignedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) { + count.value &= 31; vpsrld(count, dest, dest); } diff --git a/js/src/moz.build b/js/src/moz.build index 8e14de6e85..59feedf22d 100644 --- a/js/src/moz.build +++ b/js/src/moz.build @@ -431,6 +431,7 @@ elif CONFIG['JS_CODEGEN_X86'] or CONFIG['JS_CODEGEN_X64']: 'jit/x86-shared/CodeGenerator-x86-shared.cpp', 'jit/x86-shared/Disassembler-x86-shared.cpp', # using namespace js::jit::X86Encoding; 'jit/x86-shared/Lowering-x86-shared.cpp', + 'jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp', 
'jit/x86-shared/MacroAssembler-x86-shared.cpp', 'jit/x86-shared/MoveEmitter-x86-shared.cpp', ]
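Taken together with the code-generator change above that drops the explicit "& shiftmask" on constant shift counts, the new helpers make every SIMD shift count wrap modulo the lane width: the Imm32 overloads mask with 15 or 31 directly, and the register forms go through MaskSimdShiftCount. A minimal standalone C++ sketch of that behaviour (illustrative only, not part of the patch):

#include <cstdint>
#include <cstdio>

// Scalar model of the shift-count handling: counts are reduced modulo the
// lane width (mask 15 for 16-bit lanes, 31 for 32-bit lanes), as the Imm32
// overloads and MaskSimdShiftCount now do.
static uint32_t LaneShiftLeft32(uint32_t lane, uint32_t count) {
    return lane << (count & 31);
}

int main() {
    printf("%u\n", LaneShiftLeft32(4, 1));   // 8
    printf("%u\n", LaneShiftLeft32(4, 33));  // also 8: 33 & 31 == 1
    return 0;
}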