diff options
Diffstat (limited to 'media/libjpeg/simd/jcsample-sse2-64.asm')
-rw-r--r-- | media/libjpeg/simd/jcsample-sse2-64.asm | 329 |
1 files changed, 0 insertions, 329 deletions
;
; jcsample.asm - downsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; What the code below does:
;   1. output_cols = width_blocks * 8.  Returns immediately if that is 0.
;   2. If output_cols * 2 > image_width, each of the first max_v_samp_factor
;      rows of input_data is padded on the right: the sample at
;      row[image_width - 1] is replicated into row[image_width ..
;      output_cols * 2 - 1] with rep stosb.
;   3. For each of v_samp_factor rows:
;         out[i] = (in[2*i] + in[2*i + 1] + bias) >> 1
;      where bias alternates 0, 1, 0, 1, ... across output columns.
;
; Requirements visible in the code:
;   - Row pointers are accessed with movdqa, which faults on addresses that
;     are not 16-byte aligned.
;   - When 8 output columns remain, a full 16-byte vector is still stored;
;     the extra 8 output bytes are written as zero.
;
; Registers written in this routine: rax, rcx, rdx, rsi, rdi, xmm0-xmm3,
; xmm6, xmm7, flags (DF is cleared when padding runs).
; NOTE(review): any ABI-mandated save/restore of these is presumably done by
; collect_args / uncollect_args in jsimdext.inc -- not visible here, verify.
;

; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       16
    global      EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; snapshot of rsp taken before collect_args;
                                        ; presumably consumed by that macro -- keep
    mov         rbp, rsp
    collect_args

    mov         ecx, r13d               ; 32-bit read: zero-extends width_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero-width output

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the downsample pass
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding samples per row
    jle         short .expand_end       ; input already wide enough

    ; max_v_samp_factor is a 32-bit int.  The calling conventions leave the
    ; upper half of the 64-bit argument register unspecified, so sign-extend
    ; the low 32 bits instead of trusting all of r11 as the row counter.
    movsxd      rax, r11d               ; rax = row counter for padding
    test        rax, rax
    jle         short .expand_end

    cld                                 ; rep stosb must fill upwards
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value below
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi -> first sample past image_width
    mov         al, JSAMPLE [rdi-1]     ; last valid sample of this row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern (only edx is consumed)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is consumed per row
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than 16 output columns remain: read one input vector only and
    ; treat the second as zero.  rcx is forced to 16 so the subtraction
    ; below leaves exactly 0 and the column loop terminates.
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-indexed samples as words
    psrlw       xmm2, BYTE_BIT          ; odd-indexed samples as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; horizontal pair sums
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; / 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; 16 words -> 16 output samples

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD        ; outcol
    add         rsi, byte 2*SIZEOF_XMMWORD      ; inptr
    add         rdi, byte 1*SIZEOF_XMMWORD      ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         short .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW       ; input_data
    add         rdi, byte SIZEOF_JSAMPROW       ; output_data
    dec         rax                             ; rowctr
    jg          near .rowloop

.return:
    uncollect_args
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; What the code below does:
;   1. output_cols = width_blocks * 8.  Returns immediately if that is 0.
;   2. If output_cols * 2 > image_width, each of the first max_v_samp_factor
;      rows of input_data is padded on the right by replicating the sample
;      at row[image_width - 1] (same scheme as the h2v1 routine).
;   3. For each of v_samp_factor output rows, two consecutive input rows
;      (in0, in1) are consumed:
;         out[i] = (in0[2*i] + in0[2*i + 1] +
;                   in1[2*i] + in1[2*i + 1] + bias) >> 2
;      where bias alternates 1, 2, 1, 2, ... across output columns.
;
; Requirements visible in the code:
;   - Row pointers are accessed with movdqa, which faults on addresses that
;     are not 16-byte aligned.
;   - When 8 output columns remain, a full 16-byte vector is still stored;
;     the extra 8 output bytes are written as zero.
;
; Registers written in this routine: rax, rcx, rdx, rsi, rdi, xmm0-xmm7,
; flags (DF is cleared when padding runs).
; NOTE(review): any ABI-mandated save/restore of these is presumably done by
; collect_args / uncollect_args in jsimdext.inc -- not visible here, verify.
;

; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       16
    global      EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; snapshot of rsp taken before collect_args;
                                        ; presumably consumed by that macro -- keep
    mov         rbp, rsp
    collect_args

    mov         ecx, r13d               ; 32-bit read: zero-extends width_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero-width output

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the downsample pass
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding samples per row
    jle         short .expand_end       ; input already wide enough

    ; max_v_samp_factor is a 32-bit int.  The calling conventions leave the
    ; upper half of the 64-bit argument register unspecified, so sign-extend
    ; the low 32 bits instead of trusting all of r11 as the row counter.
    movsxd      rax, r11d               ; rax = row counter for padding
    test        rax, rax
    jle         short .expand_end

    cld                                 ; rep stosb must fill upwards
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value below
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi -> first sample past image_width
    mov         al, JSAMPLE [rdi-1]     ; last valid sample of this row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        rax, rax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern (only edx is consumed)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is consumed per row
    push        rdi
    push        rsi

    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
    mov         rdi, JSAMPROW [rdi]                     ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than 16 output columns remain: read one vector from each input
    ; row and treat the second pair as zero.  rcx is forced to 16 so the
    ; subtraction below leaves exactly 0 and the column loop terminates.
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    ; First 16 input samples of each row: horizontal pair sums as words.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; even-indexed samples as words
    psrlw       xmm4, BYTE_BIT          ; odd-indexed samples as words
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Second 16 input samples of each row, same treatment.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; add the two rows (2x2 sums)
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; / 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; 16 words -> 16 output samples

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD        ; outcol
    add         rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
    add         rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
    add         rdi, byte 1*SIZEOF_XMMWORD      ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
    dec         rax                             ; rowctr
    jg          near .rowloop

.return:
    uncollect_args
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       16