author     Serban Udrea <S.Udrea@gsi.de>                2010-04-08 23:25:08 -0500
committer  Robby Workman <rworkman@slackbuilds.org>     2010-05-15 10:25:39 +0200
commit     bc9d95f65940e1bc59515f368898724c3a2ca9b9 (patch)
tree       d6f4fa780e855eff6fa1fffb7d62fa869b2e1c18 /libraries/atlas/atlas.patch
parent     875ae3ca0352790a2d4ee699f4c6870e8e12f501 (diff)
download   slackbuilds-bc9d95f65940e1bc59515f368898724c3a2ca9b9.tar.gz
libraries/atlas: Added (BLAS implementation)
Diffstat (limited to 'libraries/atlas/atlas.patch')
-rw-r--r--   libraries/atlas/atlas.patch   5072
1 file changed, 5072 insertions, 0 deletions
diff --git a/libraries/atlas/atlas.patch b/libraries/atlas/atlas.patch
new file mode 100644
index 0000000000..dea4dcc0b2
--- /dev/null
+++ b/libraries/atlas/atlas.patch
@@ -0,0 +1,5072 @@
+diff -rupN ATLAS/CONFIG/src/backend/archinfo_x86.c atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c
+--- ATLAS/CONFIG/src/backend/archinfo_x86.c 2009-02-18 19:47:37.000000000 +0100
++++ atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c 2009-11-12 13:47:23.777451677 +0100
+@@ -320,7 +320,7 @@ enum MACHTYPE Chip2Mach(enum CHIP chip,
+ iret = IntP4;
+ break;
+ case 3:
+- case 4:
++ case 4: ; case 6:
+ iret = IntP4E;
+ break;
+ default:
+diff -rupN ATLAS/include/atlas_lvl3.h atlas-3.8.3/include/atlas_lvl3.h
+--- ATLAS/include/atlas_lvl3.h 2009-02-18 19:47:35.000000000 +0100
++++ atlas-3.8.3/include/atlas_lvl3.h 2009-11-12 13:52:49.308496090 +0100
+@@ -126,7 +126,7 @@
+ #define CPAT Mjoin(C_ATL_, PRE);
+
+ #ifndef ATL_MaxMalloc
+- #define ATL_MaxMalloc 67108864
++ #define ATL_MaxMalloc XXX_MaxMalloc_XXX
+ #endif
+
+ typedef void (*MAT2BLK)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+diff -rupN ATLAS/src/blas/gemm/ATL_cmmJITcp.c atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c
+--- ATLAS/src/blas/gemm/ATL_cmmJITcp.c 2009-02-18 19:47:44.000000000 +0100
++++ atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c 2009-11-12 12:44:34.816529051 +0100
+@@ -268,7 +268,8 @@ static void Mjoin(PATL,mmK)
+ {
+ NBmm0 = NBmm1 = NBmmX = Mjoin(PATLU,pKBmm);
+ if (SCALAR_IS_ZERO(beta))
+- Mjoin(PATL,gezero)(M, N, C, ldc);
++ /* Mjoin(PATL,gezero)(M, N, C, ldc); */
++ { Mjoin(PATLU,gezero)(M, N, pC, ldpc); Mjoin(PATLU,gezero)(M, N, pC+ipc, ldpc); }
+ }
+ if (nblk)
+ {
+diff -rupN ATLAS/src/blas/gemm/ATL_gereal2cplx.c atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c
+--- ATLAS/src/blas/gemm/ATL_gereal2cplx.c 2009-02-18 19:47:44.000000000 +0100
++++ atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c 2009-11-12 12:49:49.331651677 +0100
+@@ -43,7 +43,53 @@ void Mjoin(PATL,gereal2cplx)
+ const int ldc2 = (ldc-M)<<1;
+ int i, j;
+
+- if (ialp == ATL_rzero && ibet == ATL_rzero)
++/*
++ * Cannot read C if BETA is 0
++ */
++ if (rbet == ATL_rzero && ibet == ATL_rzero)
++ {
++ if (ialp == ATL_rzero) /* alpha is a real number */
++ {
++ if (ralp == ATL_rone) /* alpha = 1.0 */
++ {
++ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
++ {
++ for (i=0; i < M; i++, C += 2)
++ {
++ *C = R[i];
++ C[1] = I[i];
++ }
++ }
++ }
++ else
++ {
++ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
++ {
++ for (i=0; i < M; i++, C += 2)
++ {
++ *C = ralp * R[i];
++ C[1] = ralp * I[i];
++ }
++ }
++ }
++ }
++ else /* alpha is a complex number */
++ {
++ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
++ {
++ for (i=0; i < M; i++, C += 2)
++ {
++ ra = R[i]; ia = I[i];
++ C[0] = ralp * ra - ialp * ia;
++ C[1] = ralp * ia + ialp * ra;
++ }
++ }
++ }
++ }
++/*
++ * If alpha and beta are both real numbers
++ */
++ else if (ialp == ATL_rzero && ibet == ATL_rzero)
+ {
+ if (ralp == ATL_rone && rbet == ATL_rone)
+ {
+diff -rupN ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c
+--- ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-02-18 19:48:26.000000000 +0100
++++ atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-11-12 12:35:50.453038827 +0100
+@@ -27,6 +27,13 @@
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
++#if KB > 84
++ #error "KB cannot exceed 84!"
++#endif
++#if (KB/4)*4 != KB
++ #error "KB must be a multiple of 4!"
++#endif
++
+ #ifndef ATL_GAS_x8664
+ #error "This kernel requires x86-64 assembly!"
+ #endif
+@@ -58,25 +65,25 @@
+ * Integer register usage shown be these defines
+ */
+ #define pA %rcx
+-#define pA10 %rbx
+-#define ldab %rbp
+-#define mldab %rdx
++#define pA10 %rbx
++#define ldab %rbp
++#define mldab %rdx
+ #define mldab5 %rax
+ #define pB %rdi
+ #define pC %rsi
+ #define incCn %r10
+ #define stM %r9
+ #define stN %r11
+-#define pfA %r8
+-#define pA5 pA
+-#define pB0 pB
++#define pfA %r8
++#define pA5 pA
++#define pB0 pB
+ #if MB == 0
+- #define stM0 %r12
+- #define incAm %r13
++ #define stM0 %r12
++ #define incAm %r13
+ #endif
+ /* rax used in 32/64 conversion */
+
+-#define NBso (KB*4)
++#define NBso (KB*4)
+ #define MBKBso (MB*KB*4)
+ #define NB2so (NBso+NBso)
+ #define NB3so (NBso+NBso+NBso)
+@@ -95,22 +102,22 @@
+ /*
+ * SSE2 register usage shown be these defines
+ */
+-#define rA0 %xmm0
+-#define rB0 %xmm1
+-#define rC0 %xmm2
+-#define rC1 %xmm3
+-#define rC2 %xmm4
+-#define rC3 %xmm5
+-#define rC4 %xmm6
+-#define rC5 %xmm7
+-#define rC6 %xmm8
+-#define rC7 %xmm9
+-#define rC8 %xmm10
+-#define rC9 %xmm11
+-#define rC10 %xmm12
+-#define rC11 %xmm13
+-#define rC12 %xmm14
+-#define rC13 %xmm15
++#define rA0 %xmm0
++#define rB0 %xmm1
++#define rC0 %xmm2
++#define rC1 %xmm3
++#define rC2 %xmm4
++#define rC3 %xmm5
++#define rC4 %xmm6
++#define rC5 %xmm7
++#define rC6 %xmm8
++#define rC7 %xmm9
++#define rC8 %xmm10
++#define rC9 %xmm11
++#define rC10 %xmm12
++#define rC11 %xmm13
++#define rC12 %xmm14
++#define rC13 %xmm15
+ /*
+ * Prefetch defines
+ */
+@@ -127,99 +134,99 @@
+ #if MB != 0
+ #define incAm $MBKBso-NB14so+176
+ #endif
+- .text
++ .text
+ .global ATL_asmdecor(ATL_USERMM)
+ ATL_asmdecor(ATL_USERMM):
+ /*
+ * Save callee-saved iregs
+ */
+- movq %rbp, -8(%rsp)
+- movq %rbx, -16(%rsp)
++ movq %rbp, -8(%rsp)
++ movq %rbx, -16(%rsp)
+ #if MB == 0
+- movq %r12, -32(%rsp)
+- movq %r13, -40(%rsp)
++ movq %r12, -32(%rsp)
++ movq %r13, -40(%rsp)
+ #endif
+ #ifdef BETAX
+ #define BOF -56
+- movss %xmm1, BOF(%rsp)
+- movss %xmm1, BOF+4(%rsp)
+- movss %xmm1, BOF+8(%rsp)
+- movss %xmm1, BOF+12(%rsp)
++ movss %xmm1, BOF(%rsp)
++ movss %xmm1, BOF+4(%rsp)
++ movss %xmm1, BOF+8(%rsp)
++ movss %xmm1, BOF+12(%rsp)
+ #endif
+ /*
+ * pA already comes in right reg
+ * Initialize pB = B; pC = C; NBso = NB * sizeof;
+ */
+- movq %rsi, stN
+- movq %rdi, %rax
+- movq 16(%rsp), pC
+- prefC((pC))
+- prefC(64(pC))
+- movq %r9, pB
+- prefB((pB))
+- prefB(64(pB))
+- movq %rax, stM
++ movq %rsi, stN
++ movq %rdi, %rax
++ movq 16(%rsp), pC
++ prefC((pC))
++ prefC(64(pC))
++ movq %r9, pB
++ prefB((pB))
++ prefB(64(pB))
++ movq %rax, stM
+ /*
+ * stM = pA + NBNBso; stN = pB + NBNBso;
+ */
+ #if MB == 0
+- movq stM, pfA
+- imulq $NBso, pfA
+- prefB(128(pB))
+- movq pfA, incAm
+- addq pA5, pfA
+- addq $176-NB14so, incAm
++ movq stM, pfA
++ imulq $NBso, pfA
++ prefB(128(pB))
++ movq pfA, incAm
++ addq pA5, pfA
++ addq $176-NB14so, incAm
+ #else
+- movq $MBKBso, pfA
+- addq pA5, pfA
+- prefB(128(pB))
++ movq $MBKBso, pfA
++ addq pA5, pfA
++ prefB(128(pB))
+ #endif
+ /*
+ * convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof
+ */
+- movl 24(%rsp), %eax
+- cltq
+- movq %rax, incCn
+- subq stM, incCn
+- addq $14, incCn
++ movl 24(%rsp), %eax
++ cltq
++ movq %rax, incCn
++ subq stM, incCn
++ addq $14, incCn
+ #ifdef SREAL
+- shl $2, incCn
++ shl $2, incCn
+ #else
+- shl $3, incCn
+- prefC(128(pC))
+- prefC(192(pC))
++ shl $3, incCn
++ prefC(128(pC))
++ prefC(192(pC))
+ #endif
+ /*
+ * Find M/14 if MB is not set
+ */
+ #if MB == 0
+- cmp $84, stM
+- jne MB_LT84
+-/* movq $84/14, stM */
+- movq $6, stM
++ cmp $84, stM
++ jne MB_LT84
++/* movq $84/14, stM */
++ movq $6, stM
+ MBFOUND:
+- subq $1, stM
+- movq stM, stM0
++ subq $1, stM
++ movq stM, stM0
+ #endif
+- addq $120, pA5
+- addq $120, pB0
+- movq $KB*4, ldab
+- movq $-KB*5*4, mldab5
+- movq $-KB*4, mldab
+- subq mldab5, pA5
+- lea KB*4(pA5, ldab,4), pA10
+-/* movq $NB, stN */
++ addq $120, pA5
++ addq $120, pB0
++ movq $KB*4, ldab
++ movq $-KB*5*4, mldab5
++ movq $-KB*4, mldab
++ subq mldab5, pA5
++ lea KB*4(pA5, ldab,4), pA10
++/* movq $NB, stN */
+
+ UNLOOP:
+ #if MB == 0
+- movq stM0, stM
+- cmp $0, stM
+- je MLAST
++ movq stM0, stM
++ cmp $0, stM
++ je MLAST
+ #else
+ #ifdef ATL_DivAns
+- movq $ATL_DivAns-1, stM
++ movq $ATL_DivAns-1, stM
+ #else
+- movq $MB/14-1, stM
++ movq $MB/14-1, stM
+ #endif
+ #endif
+ #if MB == 0 || MB > 14
+@@ -227,992 +234,992 @@ UMLOOP:
+ /*
+ * rC[0-13] = pC[0-13] * beta
+ */
+- ALIGN16
++ ALIGN16
+ /*UKLOOP: */
+ #ifdef BETA1
+- movaps 0-120(pA10,mldab5,2), rC0
+- movaps 0-120(pB0), rB0
+- mulps rB0, rC0
+- addss (pC), rC0
+- movaps 0-120(pA5, mldab,4), rC1
+- mulps rB0, rC1
+- addss CMUL(4)(pC), rC1
+- movaps 0-120(pA10, mldab,8), rC2
+- mulps rB0, rC2
+- addss CMUL(8)(pC), rC2
+- movaps 0-120(pA5, mldab,2), rC3
+- mulps rB0, rC3
+- addss CMUL(12)(pC), rC3
+- movaps 0-120(pA5, mldab), rC4
+- mulps rB0, rC4
+- addss CMUL(16)(pC), rC4
+- movaps 0-120(pA5), rC5
+- mulps rB0, rC5
+- addss CMUL(20)(pC), rC5
+- movaps 0-120(pA5, ldab), rC6
+- mulps rB0, rC6
+- addss CMUL(24)(pC), rC6
+- movaps 0-120(pA5, ldab,2), rC7
+- mulps rB0, rC7
+- addss CMUL(28)(pC), rC7
+- movaps 0-120(pA10, mldab,2), rC8
+- mulps rB0, rC8
+- addss CMUL(32)(pC), rC8
+- movaps 0-120(pA5,ldab,4), rC9
+- mulps rB0, rC9
+- addss CMUL(36)(pC), rC9
+- movaps 0-120(pA10), rC10
+- mulps rB0, rC10
+- addss CMUL(40)(pC), rC10
+- movaps 0-120(pA10,ldab), rC11
+- mulps rB0, rC11
+- addss CMUL(44)(pC), rC11
+- movaps 0-120(pA10,ldab,2), rC12
+- mulps rB0, rC12
+- addss CMUL(48)(pC), rC12
+- movaps 0-120(pA5,ldab,8), rC13
+- mulps rB0, rC13
+- addss CMUL(52)(pC), rC13
++ movaps 0-120(pA10,mldab5,2), rC0
++ movaps 0-120(pB0), rB0
++ mulps rB0, rC0
++ addss (pC), rC0
++ movaps 0-120(pA5, mldab,4), rC1
++ mulps rB0, rC1
++ addss CMUL(4)(pC), rC1
++ movaps 0-120(pA10, mldab,8), rC2
++ mulps rB0, rC2
++ addss CMUL(8)(pC), rC2
++ movaps 0-120(pA5, mldab,2), rC3
++ mulps rB0, rC3
++ addss CMUL(12)(pC), rC3
++ movaps 0-120(pA5, mldab), rC4
++ mulps rB0, rC4
++ addss CMUL(16)(pC), rC4
++ movaps 0-120(pA5), rC5
++ mulps rB0, rC5
++ addss CMUL(20)(pC), rC5
++ movaps 0-120(pA5, ldab), rC6
++ mulps rB0, rC6
++ addss CMUL(24)(pC), rC6
++ movaps 0-120(pA5, ldab,2), rC7
++ mulps rB0, rC7
++ addss CMUL(28)(pC), rC7
++ movaps 0-120(pA10, mldab,2), rC8
++ mulps rB0, rC8
++ addss CMUL(32)(pC), rC8
++ movaps 0-120(pA5,ldab,4), rC9
++ mulps rB0, rC9
++ addss CMUL(36)(pC), rC9
++ movaps 0-120(pA10), rC10
++ mulps rB0, rC10
++ addss CMUL(40)(pC), rC10
++ movaps 0-120(pA10,ldab), rC11
++ mulps rB0, rC11
++ addss CMUL(44)(pC), rC11
++ movaps 0-120(pA10,ldab,2), rC12
++ mulps rB0, rC12
++ addss CMUL(48)(pC), rC12
++ movaps 0-120(pA5,ldab,8), rC13
++ mulps rB0, rC13
++ addss CMUL(52)(pC), rC13
+ #else
+- movaps 0-120(pA10,mldab5,2), rC0
+- movaps 0-120(pB0), rC13
+- mulps rC13, rC0
+- movaps 0-120(pA5, mldab,4), rC1
+- mulps rC13, rC1
+- movaps 0-120(pA10, mldab,8), rC2
+- mulps rC13, rC2
+- movaps 0-120(pA5, mldab,2), rC3
+- mulps rC13, rC3
+- movaps 0-120(pA5, mldab), rC4
+- mulps rC13, rC4
+- movaps 0-120(pA5), rC5
+- mulps rC13, rC5
+- movaps 0-120(pA5, ldab), rC6
+- mulps rC13, rC6
+- movaps 0-120(pA5, ldab,2), rC7
+- mulps rC13, rC7
+- movaps 0-120(pA10, mldab,2), rC8
+- mulps rC13, rC8
+- movaps 0-120(pA5,ldab,4), rC9
+- mulps rC13, rC9
+- movaps 0-120(pA10), rC10
+- mulps rC13, rC10
+- movaps 0-120(pA10,ldab), rC11
+- mulps rC13, rC11
+- movaps 0-120(pA10,ldab,2), rC12
+- mulps rC13, rC12
+- mulps 0-120(pA5,ldab,8), rC13
++ movaps 0-120(pA10,mldab5,2), rC0
++ movaps 0-120(pB0), rC13
++ mulps rC13, rC0
++ movaps 0-120(pA5, mldab,4), rC1
++ mulps rC13, rC1
++ movaps 0-120(pA10, mldab,8), rC2
++ mulps rC13, rC2
++ movaps 0-120(pA5, mldab,2), rC3
++ mulps rC13, rC3
++ movaps 0-120(pA5, mldab), rC4
++ mulps rC13, rC4
++ movaps 0-120(pA5), rC5
++ mulps rC13, rC5
++ movaps 0-120(pA5, ldab), rC6
++ mulps rC13, rC6
++ movaps 0-120(pA5, ldab,2), rC7
++ mulps rC13, rC7
++ movaps 0-120(pA10, mldab,2), rC8
++ mulps rC13, rC8
++ movaps 0-120(pA5,ldab,4), rC9
++ mulps rC13, rC9
++ movaps 0-120(pA10), rC10
++ mulps rC13, rC10
++ movaps 0-120(pA10,ldab), rC11
++ mulps rC13, rC11
++ movaps 0-120(pA10,ldab,2), rC12
++ mulps rC13, rC12
++ mulps 0-120(pA5,ldab,8), rC13
+ #endif
+
+ #if KB > 4
+- movaps 16-120(pA10,mldab5,2), rA0
+- movaps 16-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 16-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 16-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 16-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 16-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 16-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 16-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 16-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 16-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 16-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 16-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 16-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 16-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 16-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 16-120(pA10,mldab5,2), rA0
++ movaps 16-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 16-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 16-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 16-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 16-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 16-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 16-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 16-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 16-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 16-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 16-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 16-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 16-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 16-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 8
+- movaps 32-120(pA10,mldab5,2), rA0
+- movaps 32-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 32-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 32-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 32-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 32-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 32-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 32-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 32-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 32-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 32-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 32-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 32-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 32-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 32-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 32-120(pA10,mldab5,2), rA0
++ movaps 32-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 32-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 32-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 32-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 32-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 32-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 32-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 32-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 32-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 32-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 32-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 32-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 32-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 32-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 12
+- movaps 48-120(pA10,mldab5,2), rA0
+- movaps 48-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 48-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 48-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 48-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 48-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 48-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 48-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 48-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 48-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 48-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 48-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 48-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 48-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 48-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 48-120(pA10,mldab5,2), rA0
++ movaps 48-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 48-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 48-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 48-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 48-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 48-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 48-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 48-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 48-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 48-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 48-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 48-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 48-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 48-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 16
+- movaps 64-120(pA10,mldab5,2), rA0
+- movaps 64-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 64-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 64-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 64-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 64-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 64-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 64-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 64-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 64-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 64-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 64-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 64-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 64-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 64-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 64-120(pA10,mldab5,2), rA0
++ movaps 64-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 64-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 64-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 64-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 64-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 64-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 64-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 64-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 64-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 64-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 64-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 64-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 64-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 64-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 20
+- movaps 80-120(pA10,mldab5,2), rA0
+- movaps 80-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 80-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 80-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 80-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 80-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 80-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 80-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 80-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 80-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 80-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 80-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 80-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 80-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 80-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 80-120(pA10,mldab5,2), rA0
++ movaps 80-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 80-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 80-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 80-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 80-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 80-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 80-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 80-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 80-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 80-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 80-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 80-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 80-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 80-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 24
+- movaps 96-120(pA10,mldab5,2), rA0
+- movaps 96-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 96-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 96-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 96-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 96-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 96-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 96-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 96-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 96-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 96-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 96-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 96-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 96-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 96-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 96-120(pA10,mldab5,2), rA0
++ movaps 96-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 96-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 96-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 96-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 96-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 96-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 96-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 96-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 96-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 96-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 96-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 96-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 96-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 96-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 28
+- movaps 112-120(pA10,mldab5,2), rA0
+- movaps 112-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 112-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 112-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 112-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 112-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 112-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 112-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 112-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 112-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 112-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 112-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 112-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 112-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 112-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 112-120(pA10,mldab5,2), rA0
++ movaps 112-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 112-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 112-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 112-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 112-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 112-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 112-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 112-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 112-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 112-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 112-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 112-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 112-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 112-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+ #ifndef SREAL
+- pref2((pfA))
+- pref2(64(pfA))
++ pref2((pfA))
++ pref2(64(pfA))
+ #endif
+
+ #if KB > 32
+- movaps 128-120(pA10,mldab5,2), rA0
+- movaps 128-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 128-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 128-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 128-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 128-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 128-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 128-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 128-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 128-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 128-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 128-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 128-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 128-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 128-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 128-120(pA10,mldab5,2), rA0
++ movaps 128-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 128-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 128-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 128-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 128-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 128-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 128-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 128-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 128-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 128-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 128-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 128-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 128-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 128-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 36
+- movaps 144-120(pA10,mldab5,2), rA0
+- movaps 144-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 144-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 144-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 144-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 144-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 144-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 144-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 144-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 144-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 144-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 144-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 144-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 144-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 144-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 144-120(pA10,mldab5,2), rA0
++ movaps 144-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 144-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 144-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 144-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 144-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 144-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 144-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 144-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 144-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 144-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 144-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 144-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 144-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 144-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 40
+- movaps 160-120(pA10,mldab5,2), rA0
+- movaps 160-120(pB0), rB0
+- mulps rB0, rA0
+- addq $176, pB0
+- addps rA0, rC0
+- movaps 160-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 160-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 160-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 160-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 160-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 160-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 160-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 160-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 160-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 160-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 160-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 160-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addq $176, pA10
+- addps rA0, rC12
+- mulps 160-120(pA5,ldab,8), rB0
+- addps rB0, rC13
+- addq $176, pA5
++ movaps 160-120(pA10,mldab5,2), rA0
++ movaps 160-120(pB0), rB0
++ mulps rB0, rA0
++ addq $176, pB0
++ addps rA0, rC0
++ movaps 160-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 160-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 160-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 160-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 160-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 160-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 160-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 160-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 160-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 160-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 160-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 160-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addq $176, pA10
++ addps rA0, rC12
++ mulps 160-120(pA5,ldab,8), rB0
++ addps rB0, rC13
++ addq $176, pA5
+ #else
+- addq $176, pB0
+- addq $176, pA10
+- addq $176, pA5
++ addq $176, pB0
++ addq $176, pA10
++ addq $176, pA5
+ #endif
+
+ #if KB > 44
+- movaps 0-120(pA10,mldab5,2), rA0
+- movaps 0-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 0-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 0-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 0-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 0-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 0-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 0-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 0-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 0-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 0-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 0-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 0-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 0-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 0-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 0-120(pA10,mldab5,2), rA0
++ movaps 0-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 0-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 0-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 0-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 0-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 0-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 0-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 0-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 0-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 0-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 0-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 0-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 0-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 0-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 48
+- movaps 16-120(pA10,mldab5,2), rA0
+- movaps 16-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 16-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 16-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 16-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 16-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 16-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 16-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 16-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 16-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 16-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 16-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 16-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 16-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 16-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 16-120(pA10,mldab5,2), rA0
++ movaps 16-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 16-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 16-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 16-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 16-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 16-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 16-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 16-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 16-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 16-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 16-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 16-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 16-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 16-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 52
+- movaps 32-120(pA10,mldab5,2), rA0
+- movaps 32-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 32-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 32-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 32-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 32-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 32-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 32-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 32-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 32-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 32-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 32-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 32-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 32-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 32-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 32-120(pA10,mldab5,2), rA0
++ movaps 32-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 32-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 32-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 32-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 32-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 32-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 32-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 32-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 32-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 32-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 32-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 32-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 32-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 32-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 56
+- movaps 48-120(pA10,mldab5,2), rA0
+- movaps 48-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 48-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 48-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 48-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 48-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 48-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 48-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 48-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 48-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 48-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 48-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 48-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 48-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 48-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 48-120(pA10,mldab5,2), rA0
++ movaps 48-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 48-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 48-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 48-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 48-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 48-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 48-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 48-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 48-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 48-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 48-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 48-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 48-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 48-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 60
+- movaps 64-120(pA10,mldab5,2), rA0
+- movaps 64-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 64-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 64-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 64-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 64-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 64-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 64-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 64-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 64-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 64-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 64-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 64-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 64-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 64-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 64-120(pA10,mldab5,2), rA0
++ movaps 64-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 64-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 64-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 64-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 64-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 64-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 64-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 64-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 64-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 64-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 64-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 64-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 64-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 64-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 64
+- movaps 80-120(pA10,mldab5,2), rA0
+- movaps 80-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 80-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 80-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 80-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 80-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 80-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 80-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 80-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 80-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 80-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 80-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 80-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 80-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 80-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 80-120(pA10,mldab5,2), rA0
++ movaps 80-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 80-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 80-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 80-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 80-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 80-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 80-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 80-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 80-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 80-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 80-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 80-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 80-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 80-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 68
+- movaps 96-120(pA10,mldab5,2), rA0
+- movaps 96-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 96-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 96-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 96-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 96-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 96-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 96-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 96-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 96-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 96-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 96-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 96-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 96-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 96-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 96-120(pA10,mldab5,2), rA0
++ movaps 96-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 96-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 96-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 96-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 96-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 96-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 96-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 96-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 96-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 96-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 96-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 96-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 96-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 96-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 72
+- movaps 112-120(pA10,mldab5,2), rA0
+- movaps 112-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 112-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 112-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 112-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 112-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 112-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 112-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 112-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 112-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 112-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 112-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 112-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 112-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 112-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 112-120(pA10,mldab5,2), rA0
++ movaps 112-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 112-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 112-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 112-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 112-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 112-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 112-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 112-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 112-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 112-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 112-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 112-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 112-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 112-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 76
+- movaps 128-120(pA10,mldab5,2), rA0
+- movaps 128-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 128-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 128-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 128-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 128-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 128-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 128-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 128-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 128-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 128-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 128-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 128-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 128-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 128-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 128-120(pA10,mldab5,2), rA0
++ movaps 128-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 128-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 128-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 128-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 128-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 128-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 128-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 128-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 128-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 128-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 128-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 128-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 128-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 128-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 80
+- movaps 144-120(pA10,mldab5,2), rA0
+- movaps 144-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 144-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 144-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 144-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 144-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 144-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 144-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 144-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 144-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 144-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 144-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 144-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 144-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 144-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 144-120(pA10,mldab5,2), rA0
++ movaps 144-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 144-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 144-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 144-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 144-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 144-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 144-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 144-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 144-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 144-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 144-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 144-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 144-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 144-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ /*UKLOOP */
+@@ -1220,234 +1227,234 @@ UMLOOP:
+ * Get these bastard things summed up correctly
+ */
+
+- /* rC0 = c0a c0b c0c c0d */
+- /* rC1 = c1a c1b c1c c1d */
+- /* rC2 = c2a c2b c2c c2d */
+- /* rC3 = c3a c3b c3c c3d */
++ /* rC0 = c0a c0b c0c c0d */
++ /* rC1 = c1a c1b c1c c1d */
++ /* rC2 = c2a c2b c2c c2d */
++ /* rC3 = c3a c3b c3c c3d */
+ /* */
+- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
+- prefC((pC))
+- prefC(64(pC))
+- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
+- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
+- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
+- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
+- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
+- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
+- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
+- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
+- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
+- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
+- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
+- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
+- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
+- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
+- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
+- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
+-
+-
+- /* rC4 = c4a c4b c4c c4d */
+- /* rC5 = c5a c5b c5c c5d */
+- /* rC6 = c6a c6b c6c c6d */
+- /* rC7 = c7a c7b c7c c7d */
+- /* rC8 = c08a c08b c08c c08d */
+- /* rC9 = c09a c09b c09c c09d */
+- /* rC10 = c10a c10b c10c c10d */
+- /* rC11 = c11a c11b c11c c11d */
+- /* rC12 = c12a c12b c12c c12d */
+- /* rC13 = c13a c13b c13c c13d */
++ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
++ prefC((pC))
++ prefC(64(pC))
++ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
++ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
++ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
++ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
++ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
++ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
++ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
++ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
++ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
++ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
++ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
++ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
++ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
++ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
++ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
++ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
++
++
++ /* rC4 = c4a c4b c4c c4d */
++ /* rC5 = c5a c5b c5c c5d */
++ /* rC6 = c6a c6b c6c c6d */
++ /* rC7 = c7a c7b c7c c7d */
++ /* rC8 = c08a c08b c08c c08d */
++ /* rC9 = c09a c09b c09c c09d */
++ /* rC10 = c10a c10b c10c c10d */
++ /* rC11 = c11a c11b c11c c11d */
++ /* rC12 = c12a c12b c12c c12d */
++ /* rC13 = c13a c13b c13c c13d */
+ /* */
+- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
+- prefC(128(pC))
++ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
++ prefC(128(pC))
+ #ifdef SREAL
+- pref2((pfA))
++ pref2((pfA))
+ #else
+- prefC(192(pC))
++ prefC(192(pC))
+ #endif
+- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
+- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
+- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
+- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
+- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
+- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
+- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
+- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
+- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
+- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
+- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
+- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
+- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
+- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
+- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
++ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
++ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
++ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
++ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
++ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
++ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
++ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
++ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
++ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
++ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
++ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
++ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
++ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
++ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
++ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
+ #ifdef BETAX
+ #ifdef SREAL
+- movups (pC), rA0
+- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+- movups 16(pC), rC4
+- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+- movups 32(pC), rC5
+- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+- movlps 48(pC), rC1
+- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+- pref2(64(pfA))
+- mulps BOF(%rsp), rA0
+- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+- mulps BOF(%rsp), rC4
+- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+- mulps BOF(%rsp), rC5
+- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+- mulps BOF(%rsp), rC1
++ movups (pC), rA0
++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
++ movups 16(pC), rC4
++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
++ movups 32(pC), rC5
++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
++ movlps 48(pC), rC1
++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
++ pref2(64(pfA))
++ mulps BOF(%rsp), rA0
++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
++ mulps BOF(%rsp), rC4
++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
++ mulps BOF(%rsp), rC5
++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
++ mulps BOF(%rsp), rC1
+
+ /* */
+
+- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+- addps rA0, rC3
+- addq $68, pfA
+- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+- addps rC4, rC7
+- addps rC5, rC11
+- addps rC1, rC12
++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
++ addps rA0, rC3
++ addq $68, pfA
++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
++ addps rC4, rC7
++ addps rC5, rC11
++ addps rC1, rC12
+ #else /* BETA = X, complex type */
+- movups (pC), rA0
+- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+- movups 16(pC), rC4
+- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
+- movups 32(pC), rC4 /* rC4 = c4 X c5 X */
+- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+- movups 48(pC), rC5 /* rC5 = c6 X c7 X */
+- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
+- movups 64(pC), rC5 /* rC5 = c8 X c9 X */
+- movups 80(pC), rC1 /* rC1 = c10 X c11 X */
+- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
+- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+- movss 96(pC), rC1
+- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+- movss 104(pC), rB0
+- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+- unpcklps rB0, rC1
+- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+- prefC(256(pC))
+- mulps BOF(%rsp), rA0
+- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+- mulps BOF(%rsp), rC4
+- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+- mulps BOF(%rsp), rC5
+- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+- mulps BOF(%rsp), rC1
++ movups (pC), rA0
++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
++ movups 16(pC), rC4
++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
++ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
++ movups 32(pC), rC4 /* rC4 = c4 X c5 X */
++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
++ movups 48(pC), rC5 /* rC5 = c6 X c7 X */
++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
++ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
++ movups 64(pC), rC5 /* rC5 = c8 X c9 X */
++ movups 80(pC), rC1 /* rC1 = c10 X c11 X */
++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
++ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
++ movss 96(pC), rC1
++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
++ movss 104(pC), rB0
++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
++ unpcklps rB0, rC1
++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
++ prefC(256(pC))
++ mulps BOF(%rsp), rA0
++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
++ mulps BOF(%rsp), rC4
++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
++ mulps BOF(%rsp), rC5
++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
++ mulps BOF(%rsp), rC1
+
+ /* */
+
+- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+- addps rA0, rC3
+- prefC(192(pC))
+- addq $68, pfA
+- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+- addps rC4, rC7
+- addps rC5, rC11
+- addps rC1, rC12
++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
++ addps rA0, rC3
++ prefC(192(pC))
++ addq $68, pfA
++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
++ addps rC4, rC7
++ addps rC5, rC11
++ addps rC1, rC12
+ #endif
+
+ #else
+- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+ #ifdef SREAL
+- pref2(64(pfA))
++ pref2(64(pfA))
+ #else
+- prefC(256(pC))
++ prefC(256(pC))
+ #endif
+- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+
+ /* */
+
+- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+ #ifndef SREAL
+- prefC(192(pC))
++ prefC(192(pC))
+ #endif
+- addq $68, pfA
+- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
++ addq $68, pfA
++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+
+ #endif
+ /*
+ * Write results back to C; pC += 14;
+ */
+ #ifdef SREAL
+- movups rC3, (pC)
+- movups rC7, 16(pC)
+- movups rC11, 32(pC)
+- movlps rC12, 48(pC)
+- addq $56, pC
++ movups rC3, (pC)
++ movups rC7, 16(pC)
++ movups rC11, 32(pC)
++ movlps rC12, 48(pC)
++ addq $56, pC
+ #else
+- movss rC3, (pC)
+- movss rC7, 32(pC)
+- movhlps rC3, rC0
+- movhlps rC7, rC6
+- movss rC0, 16(pC)
+- movss rC6, 48(pC)
+- shufps $0x55, rC3, rC3
+- shufps $0x55, rC7, rC7
+- movss rC3, 8(pC)
+- movss rC7, 40(pC)
+- shufps $0x55, rC0, rC0
+- shufps $0x55, rC6, rC6
+- movss rC0, 24(pC)
+- movss rC6, 56(pC)
+-
+- movss rC11, 64(pC)
+- movhlps rC11, rC2
+- movss rC12, 96(pC)
+- movss rC2, 80(pC)
+- shufps $0x55, rC11, rC11
+- shufps $0x55, rC12, rC12
+- movss rC11, 72(pC)
+- shufps $0x55, rC2, rC2
+- movss rC12, 104(pC)
+- movss rC2, 88(pC)
++ movss rC3, (pC)
++ movss rC7, 32(pC)
++ movhlps rC3, rC0
++ movhlps rC7, rC6
++ movss rC0, 16(pC)
++ movss rC6, 48(pC)
++ shufps $0x55, rC3, rC3
++ shufps $0x55, rC7, rC7
++ movss rC3, 8(pC)
++ movss rC7, 40(pC)
++ shufps $0x55, rC0, rC0
++ shufps $0x55, rC6, rC6
++ movss rC0, 24(pC)
++ movss rC6, 56(pC)
++
++ movss rC11, 64(pC)
++ movhlps rC11, rC2
++ movss rC12, 96(pC)
++ movss rC2, 80(pC)
++ shufps $0x55, rC11, rC11
++ shufps $0x55, rC12, rC12
++ movss rC11, 72(pC)
++ shufps $0x55, rC2, rC2
++ movss rC12, 104(pC)
++ movss rC2, 88(pC)
+
+- addq $112, pC
++ addq $112, pC
+ #endif
+ /*
+ * Write results back to C
+ */
+- addq $NB14so-176, pA5
+- addq $NB14so-176, pA10
+- subq $176, pB0
++ addq $NB14so-176, pA5
++ addq $NB14so-176, pA10
++ subq $176, pB0
+ /*
+ * pC += 14; pA += 14*NB; pB -= NB;
+ */
+ /*
+ * while (pA != stM);
+ */
+- subq $1, stM
+- jne UMLOOP
++ subq $1, stM
++ jne UMLOOP
+ #endif
+
+ /*
+@@ -1459,994 +1466,994 @@ MLAST:
+ #endif
+ /*UKLOOP: */
+ #ifdef BETA1
+- movaps 0-120(pA10,mldab5,2), rC0
+- movaps 0-120(pB0), rB0
+- mulps rB0, rC0
+- addss (pC), rC0
+- movaps 0-120(pA5, mldab,4), rC1
+- mulps rB0, rC1
+- addss CMUL(4)(pC), rC1
+- movaps 0-120(pA10, mldab,8), rC2
+- mulps rB0, rC2
+- addss CMUL(8)(pC), rC2
+- movaps 0-120(pA5, mldab,2), rC3
+- mulps rB0, rC3
+- addss CMUL(12)(pC), rC3
+- movaps 0-120(pA5, mldab), rC4
+- mulps rB0, rC4
+- addss CMUL(16)(pC), rC4
+- movaps 0-120(pA5), rC5
+- mulps rB0, rC5
+- addss CMUL(20)(pC), rC5
+- movaps 0-120(pA5, ldab), rC6
+- mulps rB0, rC6
+- addss CMUL(24)(pC), rC6
+- movaps 0-120(pA5, ldab,2), rC7
+- mulps rB0, rC7
+- addss CMUL(28)(pC), rC7
+- movaps 0-120(pA10, mldab,2), rC8
+- mulps rB0, rC8
+- addss CMUL(32)(pC), rC8
+- movaps 0-120(pA5,ldab,4), rC9
+- mulps rB0, rC9
+- addss CMUL(36)(pC), rC9
+- movaps 0-120(pA10), rC10
+- mulps rB0, rC10
+- addss CMUL(40)(pC), rC10
+- movaps 0-120(pA10,ldab), rC11
+- mulps rB0, rC11
+- addss CMUL(44)(pC), rC11
+- movaps 0-120(pA10,ldab,2), rC12
+- mulps rB0, rC12
+- addss CMUL(48)(pC), rC12
+- movaps 0-120(pA5,ldab,8), rC13
+- mulps rB0, rC13
+- addss CMUL(52)(pC), rC13
++ movaps 0-120(pA10,mldab5,2), rC0
++ movaps 0-120(pB0), rB0
++ mulps rB0, rC0
++ addss (pC), rC0
++ movaps 0-120(pA5, mldab,4), rC1
++ mulps rB0, rC1
++ addss CMUL(4)(pC), rC1
++ movaps 0-120(pA10, mldab,8), rC2
++ mulps rB0, rC2
++ addss CMUL(8)(pC), rC2
++ movaps 0-120(pA5, mldab,2), rC3
++ mulps rB0, rC3
++ addss CMUL(12)(pC), rC3
++ movaps 0-120(pA5, mldab), rC4
++ mulps rB0, rC4
++ addss CMUL(16)(pC), rC4
++ movaps 0-120(pA5), rC5
++ mulps rB0, rC5
++ addss CMUL(20)(pC), rC5
++ movaps 0-120(pA5, ldab), rC6
++ mulps rB0, rC6
++ addss CMUL(24)(pC), rC6
++ movaps 0-120(pA5, ldab,2), rC7
++ mulps rB0, rC7
++ addss CMUL(28)(pC), rC7
++ movaps 0-120(pA10, mldab,2), rC8
++ mulps rB0, rC8
++ addss CMUL(32)(pC), rC8
++ movaps 0-120(pA5,ldab,4), rC9
++ mulps rB0, rC9
++ addss CMUL(36)(pC), rC9
++ movaps 0-120(pA10), rC10
++ mulps rB0, rC10
++ addss CMUL(40)(pC), rC10
++ movaps 0-120(pA10,ldab), rC11
++ mulps rB0, rC11
++ addss CMUL(44)(pC), rC11
++ movaps 0-120(pA10,ldab,2), rC12
++ mulps rB0, rC12
++ addss CMUL(48)(pC), rC12
++ movaps 0-120(pA5,ldab,8), rC13
++ mulps rB0, rC13
++ addss CMUL(52)(pC), rC13
+ #else
+- movaps 0-120(pA10,mldab5,2), rC0
+- movaps 0-120(pB0), rC13
+- mulps rC13, rC0
+- movaps 0-120(pA5, mldab,4), rC1
+- mulps rC13, rC1
+- movaps 0-120(pA10, mldab,8), rC2
+- mulps rC13, rC2
+- movaps 0-120(pA5, mldab,2), rC3
+- mulps rC13, rC3
+- movaps 0-120(pA5, mldab), rC4
+- mulps rC13, rC4
+- movaps 0-120(pA5), rC5
+- mulps rC13, rC5
+- movaps 0-120(pA5, ldab), rC6
+- mulps rC13, rC6
+- movaps 0-120(pA5, ldab,2), rC7
+- mulps rC13, rC7
+- movaps 0-120(pA10, mldab,2), rC8
+- mulps rC13, rC8
+- movaps 0-120(pA5,ldab,4), rC9
+- mulps rC13, rC9
+- movaps 0-120(pA10), rC10
+- mulps rC13, rC10
+- movaps 0-120(pA10,ldab), rC11
+- mulps rC13, rC11
+- movaps 0-120(pA10,ldab,2), rC12
+- mulps rC13, rC12
+- mulps 0-120(pA5,ldab,8), rC13
++ movaps 0-120(pA10,mldab5,2), rC0
++ movaps 0-120(pB0), rC13
++ mulps rC13, rC0
++ movaps 0-120(pA5, mldab,4), rC1
++ mulps rC13, rC1
++ movaps 0-120(pA10, mldab,8), rC2
++ mulps rC13, rC2
++ movaps 0-120(pA5, mldab,2), rC3
++ mulps rC13, rC3
++ movaps 0-120(pA5, mldab), rC4
++ mulps rC13, rC4
++ movaps 0-120(pA5), rC5
++ mulps rC13, rC5
++ movaps 0-120(pA5, ldab), rC6
++ mulps rC13, rC6
++ movaps 0-120(pA5, ldab,2), rC7
++ mulps rC13, rC7
++ movaps 0-120(pA10, mldab,2), rC8
++ mulps rC13, rC8
++ movaps 0-120(pA5,ldab,4), rC9
++ mulps rC13, rC9
++ movaps 0-120(pA10), rC10
++ mulps rC13, rC10
++ movaps 0-120(pA10,ldab), rC11
++ mulps rC13, rC11
++ movaps 0-120(pA10,ldab,2), rC12
++ mulps rC13, rC12
++ mulps 0-120(pA5,ldab,8), rC13
+ #endif
+
+ #if KB > 4
+- movaps 16-120(pA10,mldab5,2), rA0
+- movaps 16-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 16-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 16-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 16-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 16-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 16-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 16-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 16-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 16-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 16-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 16-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 16-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 16-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 16-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 16-120(pA10,mldab5,2), rA0
++ movaps 16-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 16-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 16-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 16-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 16-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 16-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 16-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 16-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 16-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 16-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 16-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 16-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 16-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 16-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 8
+- movaps 32-120(pA10,mldab5,2), rA0
+- movaps 32-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 32-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 32-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 32-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 32-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 32-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 32-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 32-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 32-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 32-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 32-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 32-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 32-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 32-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 32-120(pA10,mldab5,2), rA0
++ movaps 32-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 32-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 32-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 32-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 32-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 32-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 32-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 32-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 32-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 32-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 32-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 32-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 32-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 32-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 12
+- movaps 48-120(pA10,mldab5,2), rA0
+- movaps 48-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 48-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 48-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 48-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 48-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 48-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 48-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 48-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 48-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 48-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 48-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 48-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 48-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 48-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 48-120(pA10,mldab5,2), rA0
++ movaps 48-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 48-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 48-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 48-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 48-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 48-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 48-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 48-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 48-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 48-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 48-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 48-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 48-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 48-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 16
+- movaps 64-120(pA10,mldab5,2), rA0
+- movaps 64-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 64-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 64-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 64-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 64-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 64-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 64-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 64-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 64-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 64-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 64-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 64-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 64-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 64-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 64-120(pA10,mldab5,2), rA0
++ movaps 64-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 64-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 64-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 64-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 64-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 64-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 64-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 64-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 64-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 64-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 64-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 64-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 64-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 64-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 20
+- movaps 80-120(pA10,mldab5,2), rA0
+- movaps 80-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 80-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 80-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 80-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 80-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 80-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 80-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 80-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 80-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 80-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 80-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 80-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 80-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 80-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 80-120(pA10,mldab5,2), rA0
++ movaps 80-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 80-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 80-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 80-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 80-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 80-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 80-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 80-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 80-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 80-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 80-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 80-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 80-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 80-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 24
+- movaps 96-120(pA10,mldab5,2), rA0
+- movaps 96-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 96-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 96-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 96-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 96-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 96-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 96-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 96-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 96-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 96-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 96-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 96-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 96-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 96-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 96-120(pA10,mldab5,2), rA0
++ movaps 96-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 96-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 96-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 96-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 96-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 96-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 96-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 96-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 96-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 96-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 96-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 96-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 96-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 96-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 28
+- movaps 112-120(pA10,mldab5,2), rA0
+- movaps 112-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 112-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 112-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 112-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 112-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 112-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 112-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 112-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 112-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 112-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 112-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 112-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 112-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 112-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 112-120(pA10,mldab5,2), rA0
++ movaps 112-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 112-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 112-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 112-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 112-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 112-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 112-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 112-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 112-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 112-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 112-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 112-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 112-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 112-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 32
+- movaps 128-120(pA10,mldab5,2), rA0
+- movaps 128-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 128-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 128-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 128-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 128-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 128-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 128-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 128-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 128-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 128-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 128-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 128-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 128-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 128-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 128-120(pA10,mldab5,2), rA0
++ movaps 128-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 128-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 128-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 128-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 128-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 128-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 128-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 128-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 128-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 128-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 128-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 128-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 128-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 128-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 36
+- movaps 144-120(pA10,mldab5,2), rA0
+- movaps 144-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 144-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 144-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 144-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 144-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 144-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 144-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 144-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 144-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 144-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 144-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 144-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 144-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 144-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 144-120(pA10,mldab5,2), rA0
++ movaps 144-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 144-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 144-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 144-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 144-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 144-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 144-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 144-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 144-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 144-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 144-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 144-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 144-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 144-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+- prefB((pB,ldab))
+- prefB(64(pB,ldab))
++ prefB((pB,ldab))
++ prefB(64(pB,ldab))
+
+ #if KB > 40
+- movaps 160-120(pA10,mldab5,2), rA0
+- movaps 160-120(pB0), rB0
+- mulps rB0, rA0
+- addq $176, pB0
+- addps rA0, rC0
+- movaps 160-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 160-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 160-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 160-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 160-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 160-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 160-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 160-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 160-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 160-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 160-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 160-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addq $176, pA10
+- addps rA0, rC12
+- mulps 160-120(pA5,ldab,8), rB0
+- addps rB0, rC13
+- addq $176, pA5
++ movaps 160-120(pA10,mldab5,2), rA0
++ movaps 160-120(pB0), rB0
++ mulps rB0, rA0
++ addq $176, pB0
++ addps rA0, rC0
++ movaps 160-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 160-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 160-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 160-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 160-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 160-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 160-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 160-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 160-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 160-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 160-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 160-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addq $176, pA10
++ addps rA0, rC12
++ mulps 160-120(pA5,ldab,8), rB0
++ addps rB0, rC13
++ addq $176, pA5
+ #else
+- addq $176, pB0
+- addq $176, pA10
+- addq $176, pA5
++ addq $176, pB0
++ addq $176, pA10
++ addq $176, pA5
+ #endif
+
+ #if KB > 44
+- movaps 0-120(pA10,mldab5,2), rA0
+- movaps 0-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 0-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 0-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 0-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 0-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 0-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 0-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 0-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 0-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 0-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 0-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 0-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 0-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 0-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 0-120(pA10,mldab5,2), rA0
++ movaps 0-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 0-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 0-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 0-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 0-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 0-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 0-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 0-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 0-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 0-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 0-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 0-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 0-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 0-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 48
+- movaps 16-120(pA10,mldab5,2), rA0
+- movaps 16-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 16-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 16-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 16-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 16-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 16-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 16-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 16-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 16-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 16-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 16-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 16-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 16-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 16-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 16-120(pA10,mldab5,2), rA0
++ movaps 16-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 16-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 16-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 16-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 16-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 16-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 16-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 16-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 16-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 16-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 16-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 16-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 16-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 16-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 52
+- movaps 32-120(pA10,mldab5,2), rA0
+- movaps 32-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 32-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 32-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 32-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 32-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 32-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 32-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 32-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 32-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 32-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 32-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 32-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 32-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 32-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 32-120(pA10,mldab5,2), rA0
++ movaps 32-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 32-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 32-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 32-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 32-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 32-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 32-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 32-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 32-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 32-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 32-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 32-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 32-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 32-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 56
+- movaps 48-120(pA10,mldab5,2), rA0
+- movaps 48-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 48-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 48-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 48-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 48-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 48-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 48-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 48-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 48-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 48-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 48-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 48-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 48-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 48-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 48-120(pA10,mldab5,2), rA0
++ movaps 48-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 48-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 48-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 48-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 48-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 48-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 48-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 48-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 48-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 48-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 48-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 48-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 48-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 48-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 60
+- movaps 64-120(pA10,mldab5,2), rA0
+- movaps 64-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 64-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 64-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 64-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 64-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 64-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 64-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 64-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 64-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 64-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 64-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 64-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 64-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 64-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 64-120(pA10,mldab5,2), rA0
++ movaps 64-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 64-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 64-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 64-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 64-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 64-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 64-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 64-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 64-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 64-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 64-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 64-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 64-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 64-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+- prefB(128-176(pB,ldab))
+- prefB(192-176(pB,ldab))
++ prefB(128-176(pB,ldab))
++ prefB(192-176(pB,ldab))
+
+ #if KB > 64
+- movaps 80-120(pA10,mldab5,2), rA0
+- movaps 80-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 80-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 80-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 80-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 80-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 80-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 80-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 80-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 80-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 80-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 80-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 80-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 80-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 80-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 80-120(pA10,mldab5,2), rA0
++ movaps 80-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 80-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 80-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 80-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 80-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 80-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 80-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 80-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 80-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 80-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 80-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 80-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 80-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 80-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 68
+- movaps 96-120(pA10,mldab5,2), rA0
+- movaps 96-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 96-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 96-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 96-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 96-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 96-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 96-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 96-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 96-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 96-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 96-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 96-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 96-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 96-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 96-120(pA10,mldab5,2), rA0
++ movaps 96-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 96-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 96-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 96-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 96-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 96-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 96-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 96-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 96-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 96-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 96-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 96-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 96-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 96-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 72
+- movaps 112-120(pA10,mldab5,2), rA0
+- movaps 112-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 112-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 112-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 112-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 112-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 112-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 112-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 112-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 112-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 112-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 112-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 112-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 112-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 112-120(pA5,ldab,8), rB0
+- prefC((pC))
+- prefC((pC,incCn))
+- addps rB0, rC13
++ movaps 112-120(pA10,mldab5,2), rA0
++ movaps 112-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 112-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 112-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 112-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 112-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 112-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 112-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 112-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 112-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 112-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 112-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 112-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 112-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 112-120(pA5,ldab,8), rB0
++ prefC((pC))
++ prefC((pC,incCn))
++ addps rB0, rC13
+ #else
+- prefC((pC))
+- prefC((pC,incCn))
++ prefC((pC))
++ prefC((pC,incCn))
+ #endif
+
+ #if KB > 76
+- movaps 128-120(pA10,mldab5,2), rA0
+- movaps 128-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 128-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 128-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 128-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 128-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 128-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 128-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 128-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 128-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 128-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 128-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 128-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 128-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 128-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 128-120(pA10,mldab5,2), rA0
++ movaps 128-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 128-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 128-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 128-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 128-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 128-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 128-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 128-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 128-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 128-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 128-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 128-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 128-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 128-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ #if KB > 80
+- movaps 144-120(pA10,mldab5,2), rA0
+- movaps 144-120(pB0), rB0
+- mulps rB0, rA0
+- addps rA0, rC0
+- movaps 144-120(pA5, mldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC1
+- movaps 144-120(pA10, mldab,8), rA0
+- mulps rB0, rA0
+- addps rA0, rC2
+- movaps 144-120(pA5, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC3
+- movaps 144-120(pA5, mldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC4
+- movaps 144-120(pA5), rA0
+- mulps rB0, rA0
+- addps rA0, rC5
+- movaps 144-120(pA5, ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC6
+- movaps 144-120(pA5, ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC7
+- movaps 144-120(pA10, mldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC8
+- movaps 144-120(pA5,ldab,4), rA0
+- mulps rB0, rA0
+- addps rA0, rC9
+- movaps 144-120(pA10), rA0
+- mulps rB0, rA0
+- addps rA0, rC10
+- movaps 144-120(pA10,ldab), rA0
+- mulps rB0, rA0
+- addps rA0, rC11
+- movaps 144-120(pA10,ldab,2), rA0
+- mulps rB0, rA0
+- addps rA0, rC12
+- mulps 144-120(pA5,ldab,8), rB0
+- addps rB0, rC13
++ movaps 144-120(pA10,mldab5,2), rA0
++ movaps 144-120(pB0), rB0
++ mulps rB0, rA0
++ addps rA0, rC0
++ movaps 144-120(pA5, mldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC1
++ movaps 144-120(pA10, mldab,8), rA0
++ mulps rB0, rA0
++ addps rA0, rC2
++ movaps 144-120(pA5, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC3
++ movaps 144-120(pA5, mldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC4
++ movaps 144-120(pA5), rA0
++ mulps rB0, rA0
++ addps rA0, rC5
++ movaps 144-120(pA5, ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC6
++ movaps 144-120(pA5, ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC7
++ movaps 144-120(pA10, mldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC8
++ movaps 144-120(pA5,ldab,4), rA0
++ mulps rB0, rA0
++ addps rA0, rC9
++ movaps 144-120(pA10), rA0
++ mulps rB0, rA0
++ addps rA0, rC10
++ movaps 144-120(pA10,ldab), rA0
++ mulps rB0, rA0
++ addps rA0, rC11
++ movaps 144-120(pA10,ldab,2), rA0
++ mulps rB0, rA0
++ addps rA0, rC12
++ mulps 144-120(pA5,ldab,8), rB0
++ addps rB0, rC13
+ #endif
+
+ /*UKLOOP */
+@@ -2454,202 +2461,202 @@ MLAST:
+ * Get these bastard things summed up correctly
+ */
+
+- /* rC0 = c0a c0b c0c c0d */
+- /* rC1 = c1a c1b c1c c1d */
+- /* rC2 = c2a c2b c2c c2d */
+- /* rC3 = c3a c3b c3c c3d */
++ /* rC0 = c0a c0b c0c c0d */
++ /* rC1 = c1a c1b c1c c1d */
++ /* rC2 = c2a c2b c2c c2d */
++ /* rC3 = c3a c3b c3c c3d */
+ /* */
+- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
+- prefC(64(pC,incCn))
+- prefB(256-176(pB,ldab))
+- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
+- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
+- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
+- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
+- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
+- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
+- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
+- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
+- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
+- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
+- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
+- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
+- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
+- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
+- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
+- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
+-
+-
+- /* rC4 = c4a c4b c4c c4d */
+- /* rC5 = c5a c5b c5c c5d */
+- /* rC6 = c6a c6b c6c c6d */
+- /* rC7 = c7a c7b c7c c7d */
+- /* rC8 = c08a c08b c08c c08d */
+- /* rC9 = c09a c09b c09c c09d */
+- /* rC10 = c10a c10b c10c c10d */
+- /* rC11 = c11a c11b c11c c11d */
+- /* rC12 = c12a c12b c12c c12d */
+- /* rC13 = c13a c13b c13c c13d */
++ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
++ prefC(64(pC,incCn))
++ prefB(256-176(pB,ldab))
++ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
++ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
++ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
++ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
++ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
++ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
++ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
++ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
++ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
++ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
++ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
++ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
++ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
++ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
++ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
++ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
++
++
++ /* rC4 = c4a c4b c4c c4d */
++ /* rC5 = c5a c5b c5c c5d */
++ /* rC6 = c6a c6b c6c c6d */
++ /* rC7 = c7a c7b c7c c7d */
++ /* rC8 = c08a c08b c08c c08d */
++ /* rC9 = c09a c09b c09c c09d */
++ /* rC10 = c10a c10b c10c c10d */
++ /* rC11 = c11a c11b c11c c11d */
++ /* rC12 = c12a c12b c12c c12d */
++ /* rC13 = c13a c13b c13c c13d */
+ /* */
+- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
+- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
+- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
+- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
+- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
+- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
+- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
+- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
+- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
+- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
+- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
+- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
+- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
+- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
+- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
+- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
++ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
++ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
++ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
++ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
++ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
++ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
++ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
++ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
++ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
++ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
++ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
++ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
++ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
++ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
++ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
++ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
+ #ifdef BETAX
+ #ifdef SREAL
+- movups (pC), rA0
+- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+- movups 16(pC), rC4
+- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+- movups 32(pC), rC5
+- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+- movlps 48(pC), rC1
+- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+- mulps BOF(%rsp), rA0
+- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+- mulps BOF(%rsp), rC4
+- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+- mulps BOF(%rsp), rC5
+- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+- mulps BOF(%rsp), rC1
++ movups (pC), rA0
++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
++ movups 16(pC), rC4
++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
++ movups 32(pC), rC5
++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
++ movlps 48(pC), rC1
++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
++ mulps BOF(%rsp), rA0
++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
++ mulps BOF(%rsp), rC4
++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
++ mulps BOF(%rsp), rC5
++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
++ mulps BOF(%rsp), rC1
+
+ /* */
+
+- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+- addps rA0, rC3
+- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+- addps rC4, rC7
+- addps rC5, rC11
+- prefB(320-176(pB,ldab))
+- addps rC1, rC12
++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
++ addps rA0, rC3
++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
++ addps rC4, rC7
++ addps rC5, rC11
++ prefB(320-176(pB,ldab))
++ addps rC1, rC12
+ #else /* BETA = X, complex type */
+- movups (pC), rA0
+- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+- movups 16(pC), rC4
+- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
+- movups 32(pC), rC4 /* rC4 = c4 X c5 X */
+- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+- movups 48(pC), rC5 /* rC5 = c6 X c7 X */
+- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
+- movups 64(pC), rC5 /* rC5 = c8 X c9 X */
+- movups 80(pC), rC1 /* rC1 = c10 X c11 X */
+- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
+- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+- movss 96(pC), rC1
+- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+- movss 104(pC), rB0
+- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+- unpcklps rB0, rC1
+- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+- mulps BOF(%rsp), rA0
+- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+- mulps BOF(%rsp), rC4
+- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+- mulps BOF(%rsp), rC5
+- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+- mulps BOF(%rsp), rC1
++ movups (pC), rA0
++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
++ movups 16(pC), rC4
++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
++ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
++ movups 32(pC), rC4 /* rC4 = c4 X c5 X */
++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
++ movups 48(pC), rC5 /* rC5 = c6 X c7 X */
++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
++ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
++ movups 64(pC), rC5 /* rC5 = c8 X c9 X */
++ movups 80(pC), rC1 /* rC1 = c10 X c11 X */
++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
++ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
++ movss 96(pC), rC1
++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
++ movss 104(pC), rB0
++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
++ unpcklps rB0, rC1
++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
++ mulps BOF(%rsp), rA0
++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
++ mulps BOF(%rsp), rC4
++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
++ mulps BOF(%rsp), rC5
++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
++ mulps BOF(%rsp), rC1
+
+ /* */
+
+- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+- addps rA0, rC3
+- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+- addps rC4, rC7
+- addps rC5, rC11
+- prefB(320-176(pB,ldab))
+- addps rC1, rC12
++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
++ addps rA0, rC3
++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
++ addps rC4, rC7
++ addps rC5, rC11
++ prefB(320-176(pB,ldab))
++ addps rC1, rC12
+ #endif
+
+ #else
+- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+
+ /* */
+
+- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+- prefB(320-176(pB,ldab))
+- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
++ prefB(320-176(pB,ldab))
++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+
+ #endif
+ /*
+ * Write results back to C; pC += 14;
+ */
+ #ifdef SREAL
+- movups rC3, (pC)
+- movups rC7, 16(pC)
+- movups rC11, 32(pC)
+- movlps rC12, 48(pC)
+-/* addq $56, pC */
++ movups rC3, (pC)
++ movups rC7, 16(pC)
++ movups rC11, 32(pC)
++ movlps rC12, 48(pC)
++/* addq $56, pC */
+ #else
+- movss rC3, (pC)
+- movss rC7, 32(pC)
+- movhlps rC3, rC0
+- movhlps rC7, rC6
+- movss rC0, 16(pC)
+- movss rC6, 48(pC)
+- shufps $0x55, rC3, rC3
+- shufps $0x55, rC7, rC7
+- movss rC3, 8(pC)
+- movss rC7, 40(pC)
+- shufps $0x55, rC0, rC0
+- shufps $0x55, rC6, rC6
+- movss rC0, 24(pC)
+- movss rC6, 56(pC)
+-
+- movss rC11, 64(pC)
+- movhlps rC11, rC2
+- movss rC12, 96(pC)
+- movss rC2, 80(pC)
+- shufps $0x55, rC11, rC11
+- shufps $0x55, rC12, rC12
+- movss rC11, 72(pC)
+- shufps $0x55, rC2, rC2
+- movss rC12, 104(pC)
+- movss rC2, 88(pC)
++ movss rC3, (pC)
++ movss rC7, 32(pC)
++ movhlps rC3, rC0
++ movhlps rC7, rC6
++ movss rC0, 16(pC)
++ movss rC6, 48(pC)
++ shufps $0x55, rC3, rC3
++ shufps $0x55, rC7, rC7
++ movss rC3, 8(pC)
++ movss rC7, 40(pC)
++ shufps $0x55, rC0, rC0
++ shufps $0x55, rC6, rC6
++ movss rC0, 24(pC)
++ movss rC6, 56(pC)
++
++ movss rC11, 64(pC)
++ movhlps rC11, rC2
++ movss rC12, 96(pC)
++ movss rC2, 80(pC)
++ shufps $0x55, rC11, rC11
++ shufps $0x55, rC12, rC12
++ movss rC11, 72(pC)
++ shufps $0x55, rC2, rC2
++ movss rC12, 104(pC)
++ movss rC2, 88(pC)
+
+-/* addq $112, pC */
++/* addq $112, pC */
+ #endif
+ /*
+ * Write results back to C
+@@ -2660,55 +2667,55 @@ MLAST:
+ /*
+ * while (pA != stM);
+ */
+-/* subq $1, stM */
+-/* jne UMLOOP */
++/* subq $1, stM */
++/* jne UMLOOP */
+ /*
+ * pC += 14; pA += 14*NB; pB -= NB;
+ */
+-/* subq $MBKBso-NB14so+176, pA5 */
+-/* subq $MBKBso-NB14so+176, pA10 */
+- subq incAm, pA5
+- subq incAm, pA10
+- addq $NBso-176, pB0
++/* subq $MBKBso-NB14so+176, pA5 */
++/* subq $MBKBso-NB14so+176, pA10 */
++ subq incAm, pA5
++ subq incAm, pA10
++ addq $NBso-176, pB0
+ /*
+ * while (pA != stM);
+ */
+-/* subq $1, stM */
+-/* jne UMLOOP */
++/* subq $1, stM */
++/* jne UMLOOP */
+ /*
+ * pC += incCn; pA -= NBNB; pB += NB;
+ */
+- addq incCn, pC
++ addq incCn, pC
+ /*
+ * while (pB != stN);
+ */
+- sub $1, stN
+- jne UNLOOP
++ sub $1, stN
++ jne UNLOOP
+
+ /*
+ * Restore callee-saved iregs
+ */
+ DONE:
+- movq -8(%rsp), %rbp
+- movq -16(%rsp), %rbx
++ movq -8(%rsp), %rbp
++ movq -16(%rsp), %rbx
+ #if MB == 0
+- movq -32(%rsp), %r12
+- movq -40(%rsp), %r13
++ movq -32(%rsp), %r12
++ movq -40(%rsp), %r13
+ #endif
+- ret
++ ret
+ #if MB == 0
+ MB_LT84:
+- cmp $70, stM
+- jne MB_LT70
+-/* movq $70/14, stM */
+- movq $5, stM
+- jmp MBFOUND
++ cmp $70, stM
++ jne MB_LT70
++/* movq $70/14, stM */
++ movq $5, stM
++ jmp MBFOUND
+ MB_LT70:
+- cmp $56, stM
+- jne MB_LT56
+-/* movq $56/14, stM */
+- movq $4, stM
+- jmp MBFOUND
++ cmp $56, stM
++ jne MB_LT56
++/* movq $56/14, stM */
++ movq $4, stM
++ jmp MBFOUND
+ MB_LT56:
+ cmp $42, stM
+ jne MB_LT42
+diff -rupN ATLAS/tune/blas/level1/scalsrch.c atlas-3.8.3/tune/blas/level1/scalsrch.c
+--- ATLAS/tune/blas/level1/scalsrch.c 2009-02-18 19:48:25.000000000 +0100
++++ atlas-3.8.3/tune/blas/level1/scalsrch.c 2009-11-12 13:45:48.141174024 +0100
+@@ -747,7 +747,7 @@ void GenMainRout(char pre, int n, int *i
+ /*
+ * Handle all special alpha cases
+ */
+- fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc);
++ /* fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc);
+ fprintf(fpout, "%s{\n", spc);
+ if (pre == 'c' || pre == 'z')
+ {
+@@ -756,7 +756,7 @@ void GenMainRout(char pre, int n, int *i
+ }
+ else fprintf(fpout, "%s Mjoin(PATL,set)(N, ATL_rzero, X, incx);\n", spc);
+ fprintf(fpout, "%s return;\n", spc);
+- fprintf(fpout, "%s}\n", spc);
++ fprintf(fpout, "%s}\n", spc); */
+ GenAlphCase(pre, spc, fpout, 1, n, ix, iy, ia, ib);
+ GenAlphCase(pre, spc, fpout, -1, n, ix, iy, ia, ib);
+ if (pre == 'c' || pre == 'z')