summaryrefslogtreecommitdiff
path: root/media/libjpeg/simd/jquantf-sse2.asm
blob: 1cbc26740095789d8afbfbd14ebc6e9c94d9ffe3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
;
; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                            FAST_FLOAT *workspace);
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; FAST_FLOAT *workspace

; Arguments are read from the stack relative to ebp (offsets above).
;
; Register roles:
;   esi  - cursor into sample_data; steps over two row pointers per pass
;   eax  - start_col, constant for the whole routine
;   edi  - write cursor into workspace (FAST_FLOAT *)
;   ecx  - row pairs still to convert, starts at DCTSIZE/2
;   ebx  - pointer to the even row of the current pair
;   edx  - pointer to the odd row of the current pair
;   xmm7 - 0x80 replicated into every byte (PB_CENTERJSAMPLE)
;
; Every pass reads eight sample bytes from each of two rows (movq) and
; writes four XMM words of converted floats.  The stores use movaps, so
; workspace has to be 16-byte aligned.

        align   16
        global  EXTN(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
        push    ebp
        mov     ebp, esp
        push    ebx
        push    esi
        push    edi
        ; ecx and edx are used below without being saved: they need not
        ; be preserved for the caller.

        ; Build the centering constant without touching memory:
        ; all-ones words -> 0xFF80 words -> signed-saturating pack -> 0x80 bytes.
        pcmpeqw  xmm7, xmm7
        psllw    xmm7, 7
        packsswb xmm7, xmm7             ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

        mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
        mov     eax, JDIMENSION [start_col]
        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     ecx, DCTSIZE/2
        alignx  16,7
.rowpair:
        ; ---- fetch both rows of this pair ------------------------------
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; even row (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; odd row  (JSAMPLE *)
        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

        ; ---- even row: samples 0..7 ------------------------------------
        psubb     xmm0, xmm7                    ; unsigned -> signed bytes (01234567)
        punpcklbw xmm0, xmm0                    ; each byte doubled into a word
        ; Only the upper word of every dword matters from here on; the
        ; lower words taken from the old xmm2 are shifted out by psrad.
        punpcklwd xmm2, xmm0                    ; xmm2=(***0***1***2***3)
        punpckhwd xmm0, xmm0                    ; xmm0=(***4***5***6***7)
        psrad     xmm2, (DWORD_BIT-BYTE_BIT)    ; sign-extended ints (0123)
        psrad     xmm0, (DWORD_BIT-BYTE_BIT)    ; sign-extended ints (4567)
        cvtdq2ps  xmm2, xmm2                    ; floats (0123)
        cvtdq2ps  xmm0, xmm0                    ; floats (4567)

        ; ---- odd row: samples 8..F -------------------------------------
        psubb     xmm1, xmm7                    ; unsigned -> signed bytes (89ABCDEF)
        punpcklbw xmm1, xmm1                    ; each byte doubled into a word
        ; Same trick as above: stale low words of xmm3 are discarded.
        punpcklwd xmm3, xmm1                    ; xmm3=(***8***9***A***B)
        punpckhwd xmm1, xmm1                    ; xmm1=(***C***D***E***F)
        psrad     xmm3, (DWORD_BIT-BYTE_BIT)    ; sign-extended ints (89AB)
        psrad     xmm1, (DWORD_BIT-BYTE_BIT)    ; sign-extended ints (CDEF)
        cvtdq2ps  xmm3, xmm3                    ; floats (89AB)
        cvtdq2ps  xmm1, xmm1                    ; floats (CDEF)

        ; ---- write out the two converted rows --------------------------
        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

        ; ---- advance to the next pair of rows --------------------------
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        add     esi, byte 2*SIZEOF_JSAMPROW
        dec     ecx
        jnz     short .rowpair

        ; Restore saved registers in reverse push order.
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret


; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                            FAST_FLOAT *workspace);
;

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; FAST_FLOAT *divisors
%define workspace       ebp+16          ; FAST_FLOAT *workspace

; Arguments are read from the stack relative to ebp (offsets above).
;
; Register roles:
;   esi - read cursor into workspace (FAST_FLOAT *)
;   edx - read cursor into divisors  (FAST_FLOAT *)
;   edi - write cursor into coef_block (JCOEF *)
;   eax - passes remaining, starts at DCTSIZE2/16
;
; Each pass handles 16 values: workspace[i] * divisors[i], converted to
; 32-bit integers and then packed to 16-bit with signed saturation.
; NOTE(review): the table is multiplied, not divided -- presumably it
; already holds reciprocals of the quantization values; confirm against
; the code that fills it.
; All three buffers are accessed with aligned instructions (movaps, mulps
; with a memory operand, movdqa), so each has to be 16-byte aligned.

        align   16
        global  EXTN(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi
        ; ebx and ecx are not used here; edx is used but need not be
        ; preserved for the caller.

        mov     edi, JCOEFPTR [coef_block]
        mov     edx, POINTER [divisors]
        mov     esi, POINTER [workspace]
        mov     eax, DCTSIZE2/16
        alignx  16,7
.next16:
        ; Four independent chains: load, scale, convert.  cvtps2dq rounds
        ; according to the current MXCSR rounding mode.
        movaps   xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        mulps    xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        cvtps2dq xmm0, xmm0                     ; values  0.. 3 as dwords

        movaps   xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
        mulps    xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        cvtps2dq xmm1, xmm1                     ; values  4.. 7 as dwords

        movaps   xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        mulps    xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        cvtps2dq xmm2, xmm2                     ; values  8..11 as dwords

        movaps   xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
        mulps    xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
        cvtps2dq xmm3, xmm3                     ; values 12..15 as dwords

        ; Narrow dwords to words with signed saturation, eight per register.
        packssdw xmm0, xmm1                     ; values 0.. 7
        packssdw xmm2, xmm3                     ; values 8..15

        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2

        ; Step all three cursors past the 16 values just processed.
        add     edi, byte 16*SIZEOF_JCOEF
        add     edx, byte 16*SIZEOF_FAST_FLOAT
        add     esi, byte 16*SIZEOF_FAST_FLOAT
        dec     eax
        jnz     short .next16

        ; Restore saved registers in reverse push order.
        pop     edi
        pop     esi
        pop     ebp
        ret

; Pad the end of this section out to a 16-byte boundary.
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16