Diffstat (limited to 'media/libjpeg/simd/jsimd_arm_neon.S')
-rw-r--r-- | media/libjpeg/simd/jsimd_arm_neon.S | 2878
1 file changed, 0 insertions, 2878 deletions
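The file removed below implements libjpeg-turbo's accurate (ISLOW) and fast (IFAST) DCT paths, colorspace conversion, and upsampling with ARMv7 NEON. The ISLOW routines keep every multiplier in 13-bit fixed point: each FIX_* constant is a real coefficient pre-scaled by 2^13 and rounded, and intermediate results are brought back down with rounding right-shifts (the vrshrn.s32 ... #11 steps). A minimal C sketch of that encoding, using libjpeg's conventional FIX/DESCALE macro names, which are assumptions here since this file spells the constants out by hand:

    #include <stdio.h>

    #define CONST_BITS 13
    /* Encode a real multiplier as a Q13 fixed-point integer. */
    #define FIX(x)  ((int)((x) * (1 << CONST_BITS) + 0.5))
    /* Rounding right-shift by n bits, the job vrshrn.s32 does below. */
    #define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))

    int main(void) {
      printf("%d\n", FIX(0.541196100)); /* 4433 == FIX_0_541196100 below */
      printf("%d\n", FIX(1.175875602)); /* 9633 == FIX_1_175875602 below */
      return 0;
    }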
diff --git a/media/libjpeg/simd/jsimd_arm_neon.S b/media/libjpeg/simd/jsimd_arm_neon.S deleted file mode 100644 index cd2612724a..0000000000 --- a/media/libjpeg/simd/jsimd_arm_neon.S +++ /dev/null @@ -1,2878 +0,0 @@ -/* - * ARMv7 NEON optimizations for libjpeg-turbo - * - * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). - * All Rights Reserved. - * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> - * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved. - * Copyright (C) 2014, Linaro Limited. All Rights Reserved. - * Copyright (C) 2015, D. R. Commander. All Rights Reserved. - * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ -#endif - -.text -.fpu neon -.arch armv7a -.object_arch armv4 -.arm -.syntax unified - - -#define RESPECT_STRICT_ALIGNMENT 1 - - -/*****************************************************************************/ - -/* Supplementary macro for setting function attributes */ -.macro asm_function fname -#ifdef __APPLE__ - .globl _\fname -_\fname: -#else - .global \fname -#ifdef __ELF__ - .hidden \fname - .type \fname, %function -#endif -\fname: -#endif -.endm - -/* Transpose a block of 4x4 coefficients in four 64-bit registers */ -.macro transpose_4x4 x0, x1, x2, x3 - vtrn.16 \x0, \x1 - vtrn.16 \x2, \x3 - vtrn.32 \x0, \x2 - vtrn.32 \x1, \x3 -.endm - - -#define CENTERJSAMPLE 128 - -/*****************************************************************************/ - -/* - * Perform dequantization and inverse DCT on one block of coefficients. 
- * - * GLOBAL(void) - * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block, - * JSAMPARRAY output_buf, JDIMENSION output_col) - */ - -#define FIX_0_298631336 (2446) -#define FIX_0_390180644 (3196) -#define FIX_0_541196100 (4433) -#define FIX_0_765366865 (6270) -#define FIX_0_899976223 (7373) -#define FIX_1_175875602 (9633) -#define FIX_1_501321110 (12299) -#define FIX_1_847759065 (15137) -#define FIX_1_961570560 (16069) -#define FIX_2_053119869 (16819) -#define FIX_2_562915447 (20995) -#define FIX_3_072711026 (25172) - -#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) -#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) -#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) -#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) -#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) -#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) -#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) -#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) - -/* - * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. - * Uses some ideas from the comments in 'simd/jiss2int-64.asm' - */ -#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ -{ \ - DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ - JLONG q1, q2, q3, q4, q5, q6, q7; \ - JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \ - \ - /* 1-D iDCT input data */ \ - row0 = xrow0; \ - row1 = xrow1; \ - row2 = xrow2; \ - row3 = xrow3; \ - row4 = xrow4; \ - row5 = xrow5; \ - row6 = xrow6; \ - row7 = xrow7; \ - \ - q5 = row7 + row3; \ - q4 = row5 + row1; \ - q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ - MULTIPLY(q4, FIX_1_175875602); \ - q7 = MULTIPLY(q5, FIX_1_175875602) + \ - MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ - q2 = MULTIPLY(row2, FIX_0_541196100) + \ - MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ - q4 = q6; \ - q3 = ((JLONG) row0 - (JLONG) row4) << 13; \ - q6 += MULTIPLY(row5, -FIX_2_562915447) + \ - MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ - /* now we can use q1 (reloadable constants have been used up) */ \ - q1 = q3 + q2; \ - q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ - MULTIPLY(row1, -FIX_0_899976223); \ - q5 = q7; \ - q1 = q1 + q6; \ - q7 += MULTIPLY(row7, -FIX_0_899976223) + \ - MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ - \ - /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ - tmp11_plus_tmp2 = q1; \ - row1 = 0; \ - \ - q1 = q1 - q6; \ - q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ - MULTIPLY(row3, -FIX_2_562915447); \ - q1 = q1 - q6; \ - q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ - MULTIPLY(row6, FIX_0_541196100); \ - q3 = q3 - q2; \ - \ - /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ - tmp11_minus_tmp2 = q1; \ - \ - q1 = ((JLONG) row0 + (JLONG) row4) << 13; \ - q2 = q1 + q6; \ - q1 = q1 - q6; \ - \ - /* pick up the results */ \ - tmp0 = q4; \ - tmp1 = q5; \ - tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ - tmp3 = q7; \ - tmp10 = q2; \ - tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ - tmp12 = q3; \ - tmp13 = q1; \ -} - -#define XFIX_0_899976223 d0[0] -#define XFIX_0_541196100 d0[1] -#define XFIX_2_562915447 d0[2] -#define XFIX_0_298631336_MINUS_0_899976223 d0[3] -#define XFIX_1_501321110_MINUS_0_899976223 d1[0] -#define XFIX_2_053119869_MINUS_2_562915447 
d1[1] -#define XFIX_0_541196100_PLUS_0_765366865 d1[2] -#define XFIX_1_175875602 d1[3] -#define XFIX_1_175875602_MINUS_0_390180644 d2[0] -#define XFIX_0_541196100_MINUS_1_847759065 d2[1] -#define XFIX_3_072711026_MINUS_2_562915447 d2[2] -#define XFIX_1_175875602_MINUS_1_961570560 d2[3] - -.balign 16 -jsimd_idct_islow_neon_consts: - .short FIX_0_899976223 /* d0[0] */ - .short FIX_0_541196100 /* d0[1] */ - .short FIX_2_562915447 /* d0[2] */ - .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ - .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ - .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ - .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ - .short FIX_1_175875602 /* d1[3] */ - /* reloadable constants */ - .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ - .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ - .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ - .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ - -asm_function jsimd_idct_islow_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req r1 - TMP3 .req r2 - TMP4 .req ip - - ROW0L .req d16 - ROW0R .req d17 - ROW1L .req d18 - ROW1R .req d19 - ROW2L .req d20 - ROW2R .req d21 - ROW3L .req d22 - ROW3R .req d23 - ROW4L .req d24 - ROW4R .req d25 - ROW5L .req d26 - ROW5R .req d27 - ROW6L .req d28 - ROW6R .req d29 - ROW7L .req d30 - ROW7R .req d31 - - /* Load and dequantize coefficients into NEON registers - * with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 ( q8 ) - * 1 | d18 | d19 ( q9 ) - * 2 | d20 | d21 ( q10 ) - * 3 | d22 | d23 ( q11 ) - * 4 | d24 | d25 ( q12 ) - * 5 | d26 | d27 ( q13 ) - * 6 | d28 | d29 ( q14 ) - * 7 | d30 | d31 ( q15 ) - */ - adr ip, jsimd_idct_islow_neon_consts - vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! - vmul.s16 q8, q8, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! - vmul.s16 q9, q9, q1 - vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! - vmul.s16 q10, q10, q2 - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vmul.s16 q11, q11, q3 - vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] - vmul.s16 q12, q12, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! 
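/* [annotation] Coefficient loads from COEF_BLOCK are interleaved with the quantization-table loads and the vmul dequantization multiplies so that each multiply's operands are already in flight; rows 0-7 end up dequantized in q8-q15, matching the allocation table above. */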
- vmul.s16 q14, q14, q2 - vmul.s16 q13, q13, q1 - vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ - add ip, ip, #16 - vmul.s16 q15, q15, q3 - vpush {d8-d15} /* save NEON registers */ - /* 1-D IDCT, pass 1, left 4x8 half */ - vadd.s16 d4, ROW7L, ROW3L - vadd.s16 d5, ROW5L, ROW1L - vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 - vmlal.s16 q6, d5, XFIX_1_175875602 - vmull.s16 q7, d4, XFIX_1_175875602 - /* Check for the zero coefficients in the right 4x8 half */ - push {r4, r5} - vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 - vsubl.s16 q3, ROW0L, ROW4L - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] - vmull.s16 q2, ROW2L, XFIX_0_541196100 - vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 - orr r0, r4, r5 - vmov q4, q6 - vmlsl.s16 q6, ROW5L, XFIX_2_562915447 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] - vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 - vshl.s32 q3, q3, #13 - orr r0, r0, r4 - vmlsl.s16 q4, ROW1L, XFIX_0_899976223 - orr r0, r0, r5 - vadd.s32 q1, q3, q2 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] - vmov q5, q7 - vadd.s32 q1, q1, q6 - orr r0, r0, r4 - vmlsl.s16 q7, ROW7L, XFIX_0_899976223 - orr r0, r0, r5 - vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 - vrshrn.s32 ROW1L, q1, #11 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] - vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 - orr r0, r0, r4 - vmlsl.s16 q5, ROW3L, XFIX_2_562915447 - orr r0, r0, r5 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] - vmlal.s16 q6, ROW6L, XFIX_0_541196100 - vsub.s32 q3, q3, q2 - orr r0, r0, r4 - vrshrn.s32 ROW6L, q1, #11 - orr r0, r0, r5 - vadd.s32 q1, q3, q5 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW0L, ROW4L - orr r0, r0, r4 - vrshrn.s32 ROW2L, q1, #11 - orr r0, r0, r5 - vrshrn.s32 ROW5L, q3, #11 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 - orr r0, r0, r4 - vadd.s32 q2, q5, q6 - orrs r0, r0, r5 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - orr r0, r4, r5 - vsub.s32 q3, q1, q4 - pop {r4, r5} - vrshrn.s32 ROW7L, q2, #11 - vrshrn.s32 ROW3L, q5, #11 - vrshrn.s32 ROW0L, q6, #11 - vrshrn.s32 ROW4L, q3, #11 - - beq 3f /* Go to do some special handling for the sparse - right 4x8 half */ - - /* 1-D IDCT, pass 1, right 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vadd.s16 d10, ROW7R, ROW3R - vadd.s16 d8, ROW5R, ROW1R - /* Transpose left 4x8 half */ - vtrn.16 ROW6L, ROW7L - vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 - vmlal.s16 q6, d8, XFIX_1_175875602 - vtrn.16 ROW2L, ROW3L - vmull.s16 q7, d10, XFIX_1_175875602 - vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 - vtrn.16 ROW0L, ROW1L - vsubl.s16 q3, ROW0R, ROW4R - vmull.s16 q2, ROW2R, XFIX_0_541196100 - vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 - vtrn.16 ROW4L, ROW5L - vmov q4, q6 - vmlsl.s16 q6, ROW5R, XFIX_2_562915447 - vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 - vtrn.32 ROW1L, ROW3L - vshl.s32 q3, q3, #13 - vmlsl.s16 q4, ROW1R, XFIX_0_899976223 - vtrn.32 ROW4L, ROW6L - vadd.s32 q1, q3, q2 - vmov q5, q7 - vadd.s32 q1, q1, q6 - vtrn.32 ROW0L, ROW2L - vmlsl.s16 q7, ROW7R, XFIX_0_899976223 - vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 - vrshrn.s32 ROW1R, q1, #11 - vtrn.32 ROW5L, ROW7L - 
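/* [annotation] Pass-1 arithmetic for the right 4x8 half is interleaved with the vtrn steps that transpose the already-computed left half; mixing the two independent instruction streams helps hide result latencies. */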
vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 - vmlsl.s16 q5, ROW3R, XFIX_2_562915447 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 - vmlal.s16 q6, ROW6R, XFIX_0_541196100 - vsub.s32 q3, q3, q2 - vrshrn.s32 ROW6R, q1, #11 - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW0R, ROW4R - vrshrn.s32 ROW2R, q1, #11 - vrshrn.s32 ROW5R, q3, #11 - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vrshrn.s32 ROW7R, q2, #11 - vrshrn.s32 ROW3R, q5, #11 - vrshrn.s32 ROW0R, q6, #11 - vrshrn.s32 ROW4R, q3, #11 - /* Transpose right 4x8 half */ - vtrn.16 ROW6R, ROW7R - vtrn.16 ROW2R, ROW3R - vtrn.16 ROW0R, ROW1R - vtrn.16 ROW4R, ROW5R - vtrn.32 ROW1R, ROW3R - vtrn.32 ROW4R, ROW6R - vtrn.32 ROW0R, ROW2R - vtrn.32 ROW5R, ROW7R - -1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ - vmlal.s16 q6, ROW1L, XFIX_1_175875602 - vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ - vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 - vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ - vmlal.s16 q7, ROW3L, XFIX_1_175875602 - vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ - vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 - vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ - vmull.s16 q2, ROW2L, XFIX_0_541196100 - vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ - vmov q4, q6 - vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ - vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 - vshl.s32 q3, q3, #13 - vmlsl.s16 q4, ROW1L, XFIX_0_899976223 - vadd.s32 q1, q3, q2 - vmov q5, q7 - vadd.s32 q1, q1, q6 - vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ - vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 - vshrn.s32 ROW1L, q1, #16 - vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ - vmlsl.s16 q5, ROW3L, XFIX_2_562915447 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 - vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ - vsub.s32 q3, q3, q2 - vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ - vshrn.s32 ROW2L, q1, #16 - vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW3L, q5, #16 - vshrn.s32 ROW0L, q6, #16 - vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ - /* 1-D IDCT, pass 2, right 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW5R, XFIX_1_175875602 - vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ - vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 - vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ - vmull.s16 q7, ROW7R, XFIX_1_175875602 - vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ - vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 - vmlal.s16 q7, ROW5L, 
XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ - vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ - vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ - vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 - vmov q4, q6 - vmlsl.s16 q6, ROW5R, XFIX_2_562915447 - vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ - vshl.s32 q3, q3, #13 - vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ - vadd.s32 q1, q3, q2 - vmov q5, q7 - vadd.s32 q1, q1, q6 - vmlsl.s16 q7, ROW7R, XFIX_0_899976223 - vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ - vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ - vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 - vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ - vmlal.s16 q6, ROW6R, XFIX_0_541196100 - vsub.s32 q3, q3, q2 - vshrn.s32 ROW6R, q1, #16 - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ - vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ - vshrn.s32 ROW5R, q3, #16 - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW7R, q2, #16 - vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ - vshrn.s32 ROW4R, q3, #16 - -2: /* Descale to 8-bit and range limit */ - vqrshrn.s16 d16, q8, #2 - vqrshrn.s16 d17, q9, #2 - vqrshrn.s16 d18, q10, #2 - vqrshrn.s16 d19, q11, #2 - vpop {d8-d15} /* restore NEON registers */ - vqrshrn.s16 d20, q12, #2 - /* Transpose the final 8-bit samples and do signed->unsigned conversion */ - vtrn.16 q8, q9 - vqrshrn.s16 d21, q13, #2 - vqrshrn.s16 d22, q14, #2 - vmov.u8 q0, #(CENTERJSAMPLE) - vqrshrn.s16 d23, q15, #2 - vtrn.8 d16, d17 - vtrn.8 d18, d19 - vadd.u8 q8, q8, q0 - vadd.u8 q9, q9, q0 - vtrn.16 q10, q11 - /* Store results to the output buffer */ - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d16}, [TMP1] - vtrn.8 d20, d21 - vst1.8 {d17}, [TMP2] - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d18}, [TMP1] - vadd.u8 q10, q10, q0 - vst1.8 {d19}, [TMP2] - ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - add TMP3, TMP3, OUTPUT_COL - add TMP4, TMP4, OUTPUT_COL - vtrn.8 d22, d23 - vst1.8 {d20}, [TMP1] - vadd.u8 q11, q11, q0 - vst1.8 {d21}, [TMP2] - vst1.8 {d22}, [TMP3] - vst1.8 {d23}, [TMP4] - bx lr - -3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ - - /* Transpose left 4x8 half */ - vtrn.16 ROW6L, ROW7L - vtrn.16 ROW2L, ROW3L - vtrn.16 ROW0L, ROW1L - vtrn.16 ROW4L, ROW5L - vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ - vtrn.32 ROW1L, ROW3L - vtrn.32 ROW4L, ROW6L - vtrn.32 ROW0L, ROW2L - vtrn.32 ROW5L, ROW7L - - cmp r0, #0 - beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second - pass */ - - /* Only row 0 is non-zero for the right 4x8 half */ - vdup.s16 ROW1R, ROW0R[1] - vdup.s16 ROW2R, ROW0R[2] - vdup.s16 ROW3R, ROW0R[3] - vdup.s16 ROW4R, ROW0R[0] - vdup.s16 ROW5R, ROW0R[1] - vdup.s16 ROW6R, ROW0R[2] - vdup.s16 ROW7R, ROW0R[3] - vdup.s16 ROW0R, ROW0R[0] - b 1b /* Go to 'normal' second pass */ - -4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ - 
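/* [annotation] This path is taken when the ldrd/orr checks in pass 1 found the entire right 4x8 half zero: a single vshll #13 replaces the row0 +/- row4 widening add/subtract, and every product against the all-zero rows 4-7 disappears, roughly halving the work in pass 2. */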
vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW1L, XFIX_1_175875602 - vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 - vmull.s16 q7, ROW3L, XFIX_1_175875602 - vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 - vmull.s16 q2, ROW2L, XFIX_0_541196100 - vshll.s16 q3, ROW0L, #13 - vmov q4, q6 - vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 - vmlsl.s16 q4, ROW1L, XFIX_0_899976223 - vadd.s32 q1, q3, q2 - vmov q5, q7 - vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 - vadd.s32 q1, q1, q6 - vadd.s32 q6, q6, q6 - vmlsl.s16 q5, ROW3L, XFIX_2_562915447 - vshrn.s32 ROW1L, q1, #16 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 - vsub.s32 q3, q3, q2 - vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vshll.s16 q5, ROW0L, #13 - vshrn.s32 ROW2L, q1, #16 - vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW3L, q5, #16 - vshrn.s32 ROW0L, q6, #16 - vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ - /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW5L, XFIX_1_175875602 - vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 - vmull.s16 q7, ROW7L, XFIX_1_175875602 - vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 - vmull.s16 q2, ROW6L, XFIX_0_541196100 - vshll.s16 q3, ROW4L, #13 - vmov q4, q6 - vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 - vmlsl.s16 q4, ROW5L, XFIX_0_899976223 - vadd.s32 q1, q3, q2 - vmov q5, q7 - vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 - vadd.s32 q1, q1, q6 - vadd.s32 q6, q6, q6 - vmlsl.s16 q5, ROW7L, XFIX_2_562915447 - vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 - vsub.s32 q3, q3, q2 - vshrn.s32 ROW6R, q1, #16 - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vshll.s16 q5, ROW4L, #13 - vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ - vshrn.s32 ROW5R, q3, #16 - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW7R, q2, #16 - vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ - vshrn.s32 ROW4R, q3, #16 - b 2b /* Go to epilogue */ - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - - .unreq ROW0L - .unreq ROW0R - .unreq ROW1L - .unreq ROW1R - .unreq ROW2L - .unreq ROW2R - .unreq ROW3L - .unreq ROW3R - .unreq ROW4L - .unreq ROW4R - .unreq ROW5L - .unreq ROW5R - .unreq ROW6L - .unreq ROW6R - .unreq ROW7L - .unreq ROW7R - - -/*****************************************************************************/ - -/* - * jsimd_idct_ifast_neon - * - * This function contains a fast, not so accurate integer implementation of - * the inverse DCT (Discrete Cosine Transform). It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' - * function from jidctfst.c - * - * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. - * But in ARM NEON case some extra additions are required because VQDMULH - * instruction can't handle the constants larger than 1. 
So the expressions - * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", - * which introduces an extra addition. Overall, there are 6 extra additions - * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. - */ - -#define XFIX_1_082392200 d0[0] -#define XFIX_1_414213562 d0[1] -#define XFIX_1_847759065 d0[2] -#define XFIX_2_613125930 d0[3] - -.balign 16 -jsimd_idct_ifast_neon_consts: - .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ - .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ - .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ - .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ - -asm_function jsimd_idct_ifast_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req r1 - TMP3 .req r2 - TMP4 .req ip - - /* Load and dequantize coefficients into NEON registers - * with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 ( q8 ) - * 1 | d18 | d19 ( q9 ) - * 2 | d20 | d21 ( q10 ) - * 3 | d22 | d23 ( q11 ) - * 4 | d24 | d25 ( q12 ) - * 5 | d26 | d27 ( q13 ) - * 6 | d28 | d29 ( q14 ) - * 7 | d30 | d31 ( q15 ) - */ - adr ip, jsimd_idct_ifast_neon_consts - vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! - vmul.s16 q8, q8, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! - vmul.s16 q9, q9, q1 - vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! - vmul.s16 q10, q10, q2 - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vmul.s16 q11, q11, q3 - vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] - vmul.s16 q12, q12, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! - vmul.s16 q14, q14, q2 - vmul.s16 q13, q13, q1 - vld1.16 {d0}, [ip, :64] /* load constants */ - vmul.s16 q15, q15, q3 - vpush {d8-d13} /* save NEON registers */ - /* 1-D IDCT, pass 1 */ - vsub.s16 q2, q10, q14 - vadd.s16 q14, q10, q14 - vsub.s16 q1, q11, q13 - vadd.s16 q13, q11, q13 - vsub.s16 q5, q9, q15 - vadd.s16 q15, q9, q15 - vqdmulh.s16 q4, q2, XFIX_1_414213562 - vqdmulh.s16 q6, q1, XFIX_2_613125930 - vadd.s16 q3, q1, q1 - vsub.s16 q1, q5, q1 - vadd.s16 q10, q2, q4 - vqdmulh.s16 q4, q1, XFIX_1_847759065 - vsub.s16 q2, q15, q13 - vadd.s16 q3, q3, q6 - vqdmulh.s16 q6, q2, XFIX_1_414213562 - vadd.s16 q1, q1, q4 - vqdmulh.s16 q4, q5, XFIX_1_082392200 - vsub.s16 q10, q10, q14 - vadd.s16 q2, q2, q6 - vsub.s16 q6, q8, q12 - vadd.s16 q12, q8, q12 - vadd.s16 q9, q5, q4 - vadd.s16 q5, q6, q10 - vsub.s16 q10, q6, q10 - vadd.s16 q6, q15, q13 - vadd.s16 q8, q12, q14 - vsub.s16 q3, q6, q3 - vsub.s16 q12, q12, q14 - vsub.s16 q3, q3, q1 - vsub.s16 q1, q9, q1 - vadd.s16 q2, q3, q2 - vsub.s16 q15, q8, q6 - vadd.s16 q1, q1, q2 - vadd.s16 q8, q8, q6 - vadd.s16 q14, q5, q3 - vsub.s16 q9, q5, q3 - vsub.s16 q13, q10, q2 - vadd.s16 q10, q10, q2 - /* Transpose */ - vtrn.16 q8, q9 - vsub.s16 q11, q12, q1 - vtrn.16 q14, q15 - vadd.s16 q12, q12, q1 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q8, q10 - vtrn.32 q13, q15 - vswp d28, d21 - vswp d26, d19 - /* 1-D IDCT, pass 2 */ - vsub.s16 q2, q10, q14 - vswp d30, d23 - vadd.s16 q14, q10, q14 - vswp d24, d17 - vsub.s16 q1, q11, q13 - vadd.s16 q13, q11, q13 - vsub.s16 q5, q9, q15 - vadd.s16 q15, q9, q15 - vqdmulh.s16 q4, q2, XFIX_1_414213562 - vqdmulh.s16 q6, q1, XFIX_2_613125930 - vadd.s16 q3, q1, q1 - vsub.s16 q1, q5, q1 - vadd.s16 q10, q2, q4 - vqdmulh.s16 q4, q1, XFIX_1_847759065 - vsub.s16 q2, q15, q13 - vadd.s16 q3, 
q3, q6 - vqdmulh.s16 q6, q2, XFIX_1_414213562 - vadd.s16 q1, q1, q4 - vqdmulh.s16 q4, q5, XFIX_1_082392200 - vsub.s16 q10, q10, q14 - vadd.s16 q2, q2, q6 - vsub.s16 q6, q8, q12 - vadd.s16 q12, q8, q12 - vadd.s16 q9, q5, q4 - vadd.s16 q5, q6, q10 - vsub.s16 q10, q6, q10 - vadd.s16 q6, q15, q13 - vadd.s16 q8, q12, q14 - vsub.s16 q3, q6, q3 - vsub.s16 q12, q12, q14 - vsub.s16 q3, q3, q1 - vsub.s16 q1, q9, q1 - vadd.s16 q2, q3, q2 - vsub.s16 q15, q8, q6 - vadd.s16 q1, q1, q2 - vadd.s16 q8, q8, q6 - vadd.s16 q14, q5, q3 - vsub.s16 q9, q5, q3 - vsub.s16 q13, q10, q2 - vpop {d8-d13} /* restore NEON registers */ - vadd.s16 q10, q10, q2 - vsub.s16 q11, q12, q1 - vadd.s16 q12, q12, q1 - /* Descale to 8-bit and range limit */ - vmov.u8 q0, #0x80 - vqshrn.s16 d16, q8, #5 - vqshrn.s16 d17, q9, #5 - vqshrn.s16 d18, q10, #5 - vqshrn.s16 d19, q11, #5 - vqshrn.s16 d20, q12, #5 - vqshrn.s16 d21, q13, #5 - vqshrn.s16 d22, q14, #5 - vqshrn.s16 d23, q15, #5 - vadd.u8 q8, q8, q0 - vadd.u8 q9, q9, q0 - vadd.u8 q10, q10, q0 - vadd.u8 q11, q11, q0 - /* Transpose the final 8-bit samples */ - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.8 d16, d17 - vtrn.8 d18, d19 - /* Store results to the output buffer */ - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d16}, [TMP1] - vst1.8 {d17}, [TMP2] - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d18}, [TMP1] - vtrn.8 d20, d21 - vst1.8 {d19}, [TMP2] - ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - add TMP3, TMP3, OUTPUT_COL - add TMP4, TMP4, OUTPUT_COL - vst1.8 {d20}, [TMP1] - vtrn.8 d22, d23 - vst1.8 {d21}, [TMP2] - vst1.8 {d22}, [TMP3] - vst1.8 {d23}, [TMP4] - bx lr - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - - -/*****************************************************************************/ - -/* - * jsimd_idct_4x4_neon - * - * This function contains inverse-DCT code for getting reduced-size - * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' - * function from jpeg-6b (jidctred.c). - * - * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which - * requires much less arithmetic operations and hence should be faster. - * The primary purpose of this particular NEON optimized function is - * bit exact compatibility with jpeg-6b. - * - * TODO: a bit better instructions scheduling can be achieved by expanding - * idct_helper/transpose_4x4 macros and reordering instructions, - * but readability will suffer somewhat. 
- */ - -#define CONST_BITS 13 - -#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ -#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ -#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ -#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ -#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ -#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ -#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ -#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ -#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ -#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ -#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ -#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ -#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ -#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ - -.balign 16 -jsimd_idct_4x4_neon_consts: - .short FIX_1_847759065 /* d0[0] */ - .short -FIX_0_765366865 /* d0[1] */ - .short -FIX_0_211164243 /* d0[2] */ - .short FIX_1_451774981 /* d0[3] */ - .short -FIX_2_172734803 /* d1[0] */ - .short FIX_1_061594337 /* d1[1] */ - .short -FIX_0_509795579 /* d1[2] */ - .short -FIX_0_601344887 /* d1[3] */ - .short FIX_0_899976223 /* d2[0] */ - .short FIX_2_562915447 /* d2[1] */ - .short 1 << (CONST_BITS+1) /* d2[2] */ - .short 0 /* d2[3] */ - -.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 - vmull.s16 q14, \x4, d2[2] - vmlal.s16 q14, \x8, d0[0] - vmlal.s16 q14, \x14, d0[1] - - vmull.s16 q13, \x16, d1[2] - vmlal.s16 q13, \x12, d1[3] - vmlal.s16 q13, \x10, d2[0] - vmlal.s16 q13, \x6, d2[1] - - vmull.s16 q15, \x4, d2[2] - vmlsl.s16 q15, \x8, d0[0] - vmlsl.s16 q15, \x14, d0[1] - - vmull.s16 q12, \x16, d0[2] - vmlal.s16 q12, \x12, d0[3] - vmlal.s16 q12, \x10, d1[0] - vmlal.s16 q12, \x6, d1[1] - - vadd.s32 q10, q14, q13 - vsub.s32 q14, q14, q13 - - .if \shift > 16 - vrshr.s32 q10, q10, #\shift - vrshr.s32 q14, q14, #\shift - vmovn.s32 \y26, q10 - vmovn.s32 \y29, q14 - .else - vrshrn.s32 \y26, q10, #\shift - vrshrn.s32 \y29, q14, #\shift - .endif - - vadd.s32 q10, q15, q12 - vsub.s32 q15, q15, q12 - - .if \shift > 16 - vrshr.s32 q10, q10, #\shift - vrshr.s32 q15, q15, #\shift - vmovn.s32 \y27, q10 - vmovn.s32 \y28, q15 - .else - vrshrn.s32 \y27, q10, #\shift - vrshrn.s32 \y28, q15, #\shift - .endif -.endm - -asm_function jsimd_idct_4x4_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req r1 - TMP3 .req r2 - TMP4 .req ip - - vpush {d8-d15} - - /* Load constants (d3 is just used for padding) */ - adr TMP4, jsimd_idct_4x4_neon_consts - vld1.16 {d0, d1, d2, d3}, [TMP4, :128] - - /* Load all COEF_BLOCK into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d4 | d5 - * 1 | d6 | d7 - * 2 | d8 | d9 - * 3 | d10 | d11 - * 4 | - | - - * 5 | d12 | d13 - * 6 | d14 | d15 - * 7 | d16 | d17 - */ - vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! - vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]! - vld1.16 {d16, d17}, [COEF_BLOCK, :128]! - /* dequantize */ - vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! - vmul.s16 q2, q2, q9 - vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]! - vmul.s16 q3, q3, q10 - vmul.s16 q4, q4, q11 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]! - vmul.s16 q5, q5, q12 - vmul.s16 q6, q6, q13 - vld1.16 {d30, d31}, [DCT_TABLE, :128]! 
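/* [annotation] Coefficient row 4 is never loaded (note the 'add COEF_BLOCK, #16' and the gap in the allocation table above): jpeg_idct_4x4 discards it, and DCT_TABLE is stepped past the matching quantizer row the same way. */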
- vmul.s16 q7, q7, q14 - vmul.s16 q8, q8, q15 - - /* Pass 1 */ - idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10 - transpose_4x4 d4, d6, d8, d10 - idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11 - transpose_4x4 d5, d7, d9, d11 - - /* Pass 2 */ - idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29 - transpose_4x4 d26, d27, d28, d29 - - /* Range limit */ - vmov.u16 q15, #0x80 - vadd.s16 q13, q13, q15 - vadd.s16 q14, q14, q15 - vqmovun.s16 d26, q13 - vqmovun.s16 d27, q14 - - /* Store results to the output buffer */ - ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - add TMP3, TMP3, OUTPUT_COL - add TMP4, TMP4, OUTPUT_COL - -#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT - /* We can use much less instructions on little endian systems if the - * OS kernel is not configured to trap unaligned memory accesses - */ - vst1.32 {d26[0]}, [TMP1]! - vst1.32 {d27[0]}, [TMP3]! - vst1.32 {d26[1]}, [TMP2]! - vst1.32 {d27[1]}, [TMP4]! -#else - vst1.8 {d26[0]}, [TMP1]! - vst1.8 {d27[0]}, [TMP3]! - vst1.8 {d26[1]}, [TMP1]! - vst1.8 {d27[1]}, [TMP3]! - vst1.8 {d26[2]}, [TMP1]! - vst1.8 {d27[2]}, [TMP3]! - vst1.8 {d26[3]}, [TMP1]! - vst1.8 {d27[3]}, [TMP3]! - - vst1.8 {d26[4]}, [TMP2]! - vst1.8 {d27[4]}, [TMP4]! - vst1.8 {d26[5]}, [TMP2]! - vst1.8 {d27[5]}, [TMP4]! - vst1.8 {d26[6]}, [TMP2]! - vst1.8 {d27[6]}, [TMP4]! - vst1.8 {d26[7]}, [TMP2]! - vst1.8 {d27[7]}, [TMP4]! -#endif - - vpop {d8-d15} - bx lr - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - -.purgem idct_helper - - -/*****************************************************************************/ - -/* - * jsimd_idct_2x2_neon - * - * This function contains inverse-DCT code for getting reduced-size - * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' - * function from jpeg-6b (jidctred.c). - * - * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which - * requires much less arithmetic operations and hence should be faster. - * The primary purpose of this particular NEON optimized function is - * bit exact compatibility with jpeg-6b. - */ - -.balign 8 -jsimd_idct_2x2_neon_consts: - .short -FIX_0_720959822 /* d0[0] */ - .short FIX_0_850430095 /* d0[1] */ - .short -FIX_1_272758580 /* d0[2] */ - .short FIX_3_624509785 /* d0[3] */ - -.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 - vshll.s16 q14, \x4, #15 - vmull.s16 q13, \x6, d0[3] - vmlal.s16 q13, \x10, d0[2] - vmlal.s16 q13, \x12, d0[1] - vmlal.s16 q13, \x16, d0[0] - - vadd.s32 q10, q14, q13 - vsub.s32 q14, q14, q13 - - .if \shift > 16 - vrshr.s32 q10, q10, #\shift - vrshr.s32 q14, q14, #\shift - vmovn.s32 \y26, q10 - vmovn.s32 \y27, q14 - .else - vrshrn.s32 \y26, q10, #\shift - vrshrn.s32 \y27, q14, #\shift - .endif -.endm - -asm_function jsimd_idct_2x2_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req ip - - vpush {d8-d15} - - /* Load constants */ - adr TMP2, jsimd_idct_2x2_neon_consts - vld1.16 {d0}, [TMP2, :64] - - /* Load all COEF_BLOCK into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d4 | d5 - * 1 | d6 | d7 - * 2 | - | - - * 3 | d10 | d11 - * 4 | - | - - * 5 | d12 | d13 - * 6 | - | - - * 7 | d16 | d17 - */ - vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! 
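/* [annotation] Only coefficient rows 0, 1, 3, 5 and 7 are used by jpeg_idct_2x2 (see the allocation table above); each 'add COEF_BLOCK, #16' below steps over one of the unused even rows 2, 4 and 6, as do the matching DCT_TABLE adjustments. */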
- add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d10, d11}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d12, d13}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d16, d17}, [COEF_BLOCK, :128]! - /* Dequantize */ - vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! - vmul.s16 q2, q2, q9 - vmul.s16 q3, q3, q10 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d24, d25}, [DCT_TABLE, :128]! - vmul.s16 q5, q5, q12 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d26, d27}, [DCT_TABLE, :128]! - vmul.s16 q6, q6, q13 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d30, d31}, [DCT_TABLE, :128]! - vmul.s16 q8, q8, q15 - - /* Pass 1 */ -#if 0 - idct_helper d4, d6, d10, d12, d16, 13, d4, d6 - transpose_4x4 d4, d6, d8, d10 - idct_helper d5, d7, d11, d13, d17, 13, d5, d7 - transpose_4x4 d5, d7, d9, d11 -#else - vmull.s16 q13, d6, d0[3] - vmlal.s16 q13, d10, d0[2] - vmlal.s16 q13, d12, d0[1] - vmlal.s16 q13, d16, d0[0] - vmull.s16 q12, d7, d0[3] - vmlal.s16 q12, d11, d0[2] - vmlal.s16 q12, d13, d0[1] - vmlal.s16 q12, d17, d0[0] - vshll.s16 q14, d4, #15 - vshll.s16 q15, d5, #15 - vadd.s32 q10, q14, q13 - vsub.s32 q14, q14, q13 - vrshrn.s32 d4, q10, #13 - vrshrn.s32 d6, q14, #13 - vadd.s32 q10, q15, q12 - vsub.s32 q14, q15, q12 - vrshrn.s32 d5, q10, #13 - vrshrn.s32 d7, q14, #13 - vtrn.16 q2, q3 - vtrn.32 q3, q5 -#endif - - /* Pass 2 */ - idct_helper d4, d6, d10, d7, d11, 20, d26, d27 - - /* Range limit */ - vmov.u16 q15, #0x80 - vadd.s16 q13, q13, q15 - vqmovun.s16 d26, q13 - vqmovun.s16 d27, q13 - - /* Store results to the output buffer */ - ldmia OUTPUT_BUF, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - - vst1.8 {d26[0]}, [TMP1]! - vst1.8 {d27[4]}, [TMP1]! - vst1.8 {d26[1]}, [TMP2]! - vst1.8 {d27[5]}, [TMP2]! - - vpop {d8-d15} - bx lr - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - -.purgem idct_helper - - -/*****************************************************************************/ - -/* - * jsimd_ycc_extrgb_convert_neon - * jsimd_ycc_extbgr_convert_neon - * jsimd_ycc_extrgbx_convert_neon - * jsimd_ycc_extbgrx_convert_neon - * jsimd_ycc_extxbgr_convert_neon - * jsimd_ycc_extxrgb_convert_neon - * - * Colorspace conversion YCbCr -> RGB - */ - - -.macro do_load size - .if \size == 8 - vld1.8 {d4}, [U, :64]! - vld1.8 {d5}, [V, :64]! - vld1.8 {d0}, [Y, :64]! - pld [U, #64] - pld [V, #64] - pld [Y, #64] - .elseif \size == 4 - vld1.8 {d4[0]}, [U]! - vld1.8 {d4[1]}, [U]! - vld1.8 {d4[2]}, [U]! - vld1.8 {d4[3]}, [U]! - vld1.8 {d5[0]}, [V]! - vld1.8 {d5[1]}, [V]! - vld1.8 {d5[2]}, [V]! - vld1.8 {d5[3]}, [V]! - vld1.8 {d0[0]}, [Y]! - vld1.8 {d0[1]}, [Y]! - vld1.8 {d0[2]}, [Y]! - vld1.8 {d0[3]}, [Y]! - .elseif \size == 2 - vld1.8 {d4[4]}, [U]! - vld1.8 {d4[5]}, [U]! - vld1.8 {d5[4]}, [V]! - vld1.8 {d5[5]}, [V]! - vld1.8 {d0[4]}, [Y]! - vld1.8 {d0[5]}, [Y]! - .elseif \size == 1 - vld1.8 {d4[6]}, [U]! - vld1.8 {d5[6]}, [V]! - vld1.8 {d0[6]}, [Y]! - .else - .error unsupported macroblock size - .endif -.endm - -.macro do_store bpp, size - .if \bpp == 24 - .if \size == 8 - vst3.8 {d10, d11, d12}, [RGB]! - .elseif \size == 4 - vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! - vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! - vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! - vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! - .elseif \size == 2 - vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! - vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! - .elseif \size == 1 - vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! 
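/* [annotation] The per-lane vst3 forms above let the tail of a row be written 4, 2 or 1 pixel at a time without storing past the buffer end; lanes 0-3, 4-5 and 6 of d10-d12 carry the partial pixels. */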
- .else - .error unsupported macroblock size - .endif - .elseif \bpp == 32 - .if \size == 8 - vst4.8 {d10, d11, d12, d13}, [RGB]! - .elseif \size == 4 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! - .elseif \size == 2 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! - .elseif \size == 1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! - .else - .error unsupported macroblock size - .endif - .elseif \bpp == 16 - .if \size == 8 - vst1.16 {q15}, [RGB]! - .elseif \size == 4 - vst1.16 {d30}, [RGB]! - .elseif \size == 2 - vst1.16 {d31[0]}, [RGB]! - vst1.16 {d31[1]}, [RGB]! - .elseif \size == 1 - vst1.16 {d31[2]}, [RGB]! - .else - .error unsupported macroblock size - .endif - .else - .error unsupported bpp - .endif -.endm - -.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs - -/* - * 2-stage pipelined YCbCr->RGB conversion - */ - -.macro do_yuv_to_rgb_stage1 - vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ - vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ - vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ - vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ - vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ - vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ - vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ - vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ - vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ - vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ -.endm - -.macro do_yuv_to_rgb_stage2 - vrshrn.s32 d20, q10, #15 - vrshrn.s32 d21, q11, #15 - vrshrn.s32 d24, q12, #14 - vrshrn.s32 d25, q13, #14 - vrshrn.s32 d28, q14, #14 - vrshrn.s32 d29, q15, #14 - vaddw.u8 q11, q10, d0 - vaddw.u8 q12, q12, d0 - vaddw.u8 q14, q14, d0 - .if \bpp != 16 - vqmovun.s16 d1\g_offs, q11 - vqmovun.s16 d1\r_offs, q12 - vqmovun.s16 d1\b_offs, q14 - .else /* rgb565 */ - vqshlu.s16 q13, q11, #8 - vqshlu.s16 q15, q12, #8 - vqshlu.s16 q14, q14, #8 - vsri.u16 q15, q13, #5 - vsri.u16 q15, q14, #11 - .endif -.endm - -.macro do_yuv_to_rgb_stage2_store_load_stage1 - /* "do_yuv_to_rgb_stage2" and "store" */ - vrshrn.s32 d20, q10, #15 - /* "load" and "do_yuv_to_rgb_stage1" */ - pld [U, #64] - vrshrn.s32 d21, q11, #15 - pld [V, #64] - vrshrn.s32 d24, q12, #14 - vrshrn.s32 d25, q13, #14 - vld1.8 {d4}, [U, :64]! - vrshrn.s32 d28, q14, #14 - vld1.8 {d5}, [V, :64]! - vrshrn.s32 d29, q15, #14 - vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ - vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ - vaddw.u8 q11, q10, d0 - vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ - vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ - vaddw.u8 q12, q12, d0 - vaddw.u8 q14, q14, d0 - .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ - vqmovun.s16 d1\g_offs, q11 - pld [Y, #64] - vqmovun.s16 d1\r_offs, q12 - vld1.8 {d0}, [Y, :64]! - vqmovun.s16 d1\b_offs, q14 - vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ - vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ - do_store \bpp, 8 - vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ - vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ - vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ - vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ - .else /**************************** rgb565 ********************************/ - vqshlu.s16 q13, q11, #8 - pld [Y, #64] - vqshlu.s16 q15, q12, #8 - vqshlu.s16 q14, q14, #8 - vld1.8 {d0}, [Y, :64]! 
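/* [annotation] RGB565 packing: the vqshlu #8 shifts saturate each channel into the top 8 bits of a halfword; the two vsri shift-right-insert ops then place green at bit 5 and blue at bit 0, leaving a packed r:5 g:6 b:5 pixel in q15. */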
- vmull.s16 q11, d7, d1[1] - vmlal.s16 q11, d9, d1[2] - vsri.u16 q15, q13, #5 - vmull.s16 q12, d8, d1[0] - vsri.u16 q15, q14, #11 - vmull.s16 q13, d9, d1[0] - vmull.s16 q14, d6, d1[3] - do_store \bpp, 8 - vmull.s16 q15, d7, d1[3] - .endif -.endm - -.macro do_yuv_to_rgb - do_yuv_to_rgb_stage1 - do_yuv_to_rgb_stage2 -.endm - -/* Apple gas crashes on adrl, work around that by using adr. - * But this requires a copy of these constants for each function. - */ - -.balign 16 -jsimd_ycc_\colorid\()_neon_consts: - .short 0, 0, 0, 0 - .short 22971, -11277, -23401, 29033 - .short -128, -128, -128, -128 - .short -128, -128, -128, -128 - -asm_function jsimd_ycc_\colorid\()_convert_neon - OUTPUT_WIDTH .req r0 - INPUT_BUF .req r1 - INPUT_ROW .req r2 - OUTPUT_BUF .req r3 - NUM_ROWS .req r4 - - INPUT_BUF0 .req r5 - INPUT_BUF1 .req r6 - INPUT_BUF2 .req INPUT_BUF - - RGB .req r7 - Y .req r8 - U .req r9 - V .req r10 - N .req ip - - /* Load constants to d1, d2, d3 (d0 is just used for padding) */ - adr ip, jsimd_ycc_\colorid\()_neon_consts - vld1.16 {d0, d1, d2, d3}, [ip, :128] - - /* Save ARM registers and handle input arguments */ - push {r4, r5, r6, r7, r8, r9, r10, lr} - ldr NUM_ROWS, [sp, #(4 * 8)] - ldr INPUT_BUF0, [INPUT_BUF] - ldr INPUT_BUF1, [INPUT_BUF, #4] - ldr INPUT_BUF2, [INPUT_BUF, #8] - .unreq INPUT_BUF - - /* Save NEON registers */ - vpush {d8-d15} - - /* Initially set d10, d11, d12, d13 to 0xFF */ - vmov.u8 q5, #255 - vmov.u8 q6, #255 - - /* Outer loop over scanlines */ - cmp NUM_ROWS, #1 - blt 9f -0: - ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] - ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] - mov N, OUTPUT_WIDTH - ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] - add INPUT_ROW, INPUT_ROW, #1 - ldr RGB, [OUTPUT_BUF], #4 - - /* Inner loop over pixels */ - subs N, N, #8 - blt 3f - do_load 8 - do_yuv_to_rgb_stage1 - subs N, N, #8 - blt 2f -1: - do_yuv_to_rgb_stage2_store_load_stage1 - subs N, N, #8 - bge 1b -2: - do_yuv_to_rgb_stage2 - do_store \bpp, 8 - tst N, #7 - beq 8f -3: - tst N, #4 - beq 3f - do_load 4 -3: - tst N, #2 - beq 4f - do_load 2 -4: - tst N, #1 - beq 5f - do_load 1 -5: - do_yuv_to_rgb - tst N, #4 - beq 6f - do_store \bpp, 4 -6: - tst N, #2 - beq 7f - do_store \bpp, 2 -7: - tst N, #1 - beq 8f - do_store \bpp, 1 -8: - subs NUM_ROWS, NUM_ROWS, #1 - bgt 0b -9: - /* Restore all registers and return */ - vpop {d8-d15} - pop {r4, r5, r6, r7, r8, r9, r10, pc} - - .unreq OUTPUT_WIDTH - .unreq INPUT_ROW - .unreq OUTPUT_BUF - .unreq NUM_ROWS - .unreq INPUT_BUF0 - .unreq INPUT_BUF1 - .unreq INPUT_BUF2 - .unreq RGB - .unreq Y - .unreq U - .unreq V - .unreq N - -.purgem do_yuv_to_rgb -.purgem do_yuv_to_rgb_stage1 -.purgem do_yuv_to_rgb_stage2 -.purgem do_yuv_to_rgb_stage2_store_load_stage1 - -.endm - -/*--------------------------------- id ----- bpp R G B */ -generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 -generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 -generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 -generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 -generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 -generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 -generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0 - -.purgem do_load -.purgem do_store - - -/*****************************************************************************/ - -/* - * jsimd_extrgb_ycc_convert_neon - * jsimd_extbgr_ycc_convert_neon - * jsimd_extrgbx_ycc_convert_neon - * jsimd_extbgrx_ycc_convert_neon - * jsimd_extxbgr_ycc_convert_neon - * jsimd_extxrgb_ycc_convert_neon - * - * Colorspace conversion 
RGB -> YCbCr - */ - -.macro do_store size - .if \size == 8 - vst1.8 {d20}, [Y]! - vst1.8 {d21}, [U]! - vst1.8 {d22}, [V]! - .elseif \size == 4 - vst1.8 {d20[0]}, [Y]! - vst1.8 {d20[1]}, [Y]! - vst1.8 {d20[2]}, [Y]! - vst1.8 {d20[3]}, [Y]! - vst1.8 {d21[0]}, [U]! - vst1.8 {d21[1]}, [U]! - vst1.8 {d21[2]}, [U]! - vst1.8 {d21[3]}, [U]! - vst1.8 {d22[0]}, [V]! - vst1.8 {d22[1]}, [V]! - vst1.8 {d22[2]}, [V]! - vst1.8 {d22[3]}, [V]! - .elseif \size == 2 - vst1.8 {d20[4]}, [Y]! - vst1.8 {d20[5]}, [Y]! - vst1.8 {d21[4]}, [U]! - vst1.8 {d21[5]}, [U]! - vst1.8 {d22[4]}, [V]! - vst1.8 {d22[5]}, [V]! - .elseif \size == 1 - vst1.8 {d20[6]}, [Y]! - vst1.8 {d21[6]}, [U]! - vst1.8 {d22[6]}, [V]! - .else - .error unsupported macroblock size - .endif -.endm - -.macro do_load bpp, size - .if \bpp == 24 - .if \size == 8 - vld3.8 {d10, d11, d12}, [RGB]! - pld [RGB, #128] - .elseif \size == 4 - vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! - vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! - vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! - vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! - .elseif \size == 2 - vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! - vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! - .elseif \size == 1 - vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! - .else - .error unsupported macroblock size - .endif - .elseif \bpp == 32 - .if \size == 8 - vld4.8 {d10, d11, d12, d13}, [RGB]! - pld [RGB, #128] - .elseif \size == 4 - vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! - vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! - vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! - vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! - .elseif \size == 2 - vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! - vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! - .elseif \size == 1 - vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! 
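/* [annotation] As on the store side, the per-lane vld3/vld4 forms above handle 4-, 2- and 1-pixel row tails without reading past the end of the row; lane indices 0-6 keep the partial pixels packed in d10-d13. */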
- .else - .error unsupported macroblock size - .endif - .else - .error unsupported bpp - .endif -.endm - -.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs - -/* - * 2-stage pipelined RGB->YCbCr conversion - */ - -.macro do_rgb_to_yuv_stage1 - vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ - vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ - vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ - vmull.u16 q7, d4, d0[0] - vmlal.u16 q7, d6, d0[1] - vmlal.u16 q7, d8, d0[2] - vmull.u16 q8, d5, d0[0] - vmlal.u16 q8, d7, d0[1] - vmlal.u16 q8, d9, d0[2] - vrev64.32 q9, q1 - vrev64.32 q13, q1 - vmlsl.u16 q9, d4, d0[3] - vmlsl.u16 q9, d6, d1[0] - vmlal.u16 q9, d8, d1[1] - vmlsl.u16 q13, d5, d0[3] - vmlsl.u16 q13, d7, d1[0] - vmlal.u16 q13, d9, d1[1] - vrev64.32 q14, q1 - vrev64.32 q15, q1 - vmlal.u16 q14, d4, d1[1] - vmlsl.u16 q14, d6, d1[2] - vmlsl.u16 q14, d8, d1[3] - vmlal.u16 q15, d5, d1[1] - vmlsl.u16 q15, d7, d1[2] - vmlsl.u16 q15, d9, d1[3] -.endm - -.macro do_rgb_to_yuv_stage2 - vrshrn.u32 d20, q7, #16 - vrshrn.u32 d21, q8, #16 - vshrn.u32 d22, q9, #16 - vshrn.u32 d23, q13, #16 - vshrn.u32 d24, q14, #16 - vshrn.u32 d25, q15, #16 - vmovn.u16 d20, q10 /* d20 = y */ - vmovn.u16 d21, q11 /* d21 = u */ - vmovn.u16 d22, q12 /* d22 = v */ -.endm - -.macro do_rgb_to_yuv - do_rgb_to_yuv_stage1 - do_rgb_to_yuv_stage2 -.endm - -.macro do_rgb_to_yuv_stage2_store_load_stage1 - vrshrn.u32 d20, q7, #16 - vrshrn.u32 d21, q8, #16 - vshrn.u32 d22, q9, #16 - vrev64.32 q9, q1 - vshrn.u32 d23, q13, #16 - vrev64.32 q13, q1 - vshrn.u32 d24, q14, #16 - vshrn.u32 d25, q15, #16 - do_load \bpp, 8 - vmovn.u16 d20, q10 /* d20 = y */ - vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ - vmovn.u16 d21, q11 /* d21 = u */ - vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ - vmovn.u16 d22, q12 /* d22 = v */ - vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ - vmull.u16 q7, d4, d0[0] - vmlal.u16 q7, d6, d0[1] - vmlal.u16 q7, d8, d0[2] - vst1.8 {d20}, [Y]! - vmull.u16 q8, d5, d0[0] - vmlal.u16 q8, d7, d0[1] - vmlal.u16 q8, d9, d0[2] - vmlsl.u16 q9, d4, d0[3] - vmlsl.u16 q9, d6, d1[0] - vmlal.u16 q9, d8, d1[1] - vst1.8 {d21}, [U]! - vmlsl.u16 q13, d5, d0[3] - vmlsl.u16 q13, d7, d1[0] - vmlal.u16 q13, d9, d1[1] - vrev64.32 q14, q1 - vrev64.32 q15, q1 - vmlal.u16 q14, d4, d1[1] - vmlsl.u16 q14, d6, d1[2] - vmlsl.u16 q14, d8, d1[3] - vst1.8 {d22}, [V]! 
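/* [annotation] Two-stage software pipeline: the vshrn/vmovn narrowing and the Y/U/V stores for the previous eight pixels are interleaved with the widening multiplies for the next eight, so store latency and multiply latency overlap. */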
- vmlal.u16 q15, d5, d1[1] - vmlsl.u16 q15, d7, d1[2] - vmlsl.u16 q15, d9, d1[3] -.endm - -.balign 16 -jsimd_\colorid\()_ycc_neon_consts: - .short 19595, 38470, 7471, 11059 - .short 21709, 32768, 27439, 5329 - .short 32767, 128, 32767, 128 - .short 32767, 128, 32767, 128 - -asm_function jsimd_\colorid\()_ycc_convert_neon - OUTPUT_WIDTH .req r0 - INPUT_BUF .req r1 - OUTPUT_BUF .req r2 - OUTPUT_ROW .req r3 - NUM_ROWS .req r4 - - OUTPUT_BUF0 .req r5 - OUTPUT_BUF1 .req r6 - OUTPUT_BUF2 .req OUTPUT_BUF - - RGB .req r7 - Y .req r8 - U .req r9 - V .req r10 - N .req ip - - /* Load constants to d0, d1, d2, d3 */ - adr ip, jsimd_\colorid\()_ycc_neon_consts - vld1.16 {d0, d1, d2, d3}, [ip, :128] - - /* Save ARM registers and handle input arguments */ - push {r4, r5, r6, r7, r8, r9, r10, lr} - ldr NUM_ROWS, [sp, #(4 * 8)] - ldr OUTPUT_BUF0, [OUTPUT_BUF] - ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] - ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] - .unreq OUTPUT_BUF - - /* Save NEON registers */ - vpush {d8-d15} - - /* Outer loop over scanlines */ - cmp NUM_ROWS, #1 - blt 9f -0: - ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] - ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] - mov N, OUTPUT_WIDTH - ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] - add OUTPUT_ROW, OUTPUT_ROW, #1 - ldr RGB, [INPUT_BUF], #4 - - /* Inner loop over pixels */ - subs N, N, #8 - blt 3f - do_load \bpp, 8 - do_rgb_to_yuv_stage1 - subs N, N, #8 - blt 2f -1: - do_rgb_to_yuv_stage2_store_load_stage1 - subs N, N, #8 - bge 1b -2: - do_rgb_to_yuv_stage2 - do_store 8 - tst N, #7 - beq 8f -3: - tst N, #4 - beq 3f - do_load \bpp, 4 -3: - tst N, #2 - beq 4f - do_load \bpp, 2 -4: - tst N, #1 - beq 5f - do_load \bpp, 1 -5: - do_rgb_to_yuv - tst N, #4 - beq 6f - do_store 4 -6: - tst N, #2 - beq 7f - do_store 2 -7: - tst N, #1 - beq 8f - do_store 1 -8: - subs NUM_ROWS, NUM_ROWS, #1 - bgt 0b -9: - /* Restore all registers and return */ - vpop {d8-d15} - pop {r4, r5, r6, r7, r8, r9, r10, pc} - - .unreq OUTPUT_WIDTH - .unreq OUTPUT_ROW - .unreq INPUT_BUF - .unreq NUM_ROWS - .unreq OUTPUT_BUF0 - .unreq OUTPUT_BUF1 - .unreq OUTPUT_BUF2 - .unreq RGB - .unreq Y - .unreq U - .unreq V - .unreq N - -.purgem do_rgb_to_yuv -.purgem do_rgb_to_yuv_stage1 -.purgem do_rgb_to_yuv_stage2 -.purgem do_rgb_to_yuv_stage2_store_load_stage1 - -.endm - -/*--------------------------------- id ----- bpp R G B */ -generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 -generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 -generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 -generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 -generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 -generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 - -.purgem do_load -.purgem do_store - - -/*****************************************************************************/ - -/* - * Load data into workspace, applying unsigned->signed conversion - * - * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get - * rid of VST1.16 instructions - */ - -asm_function jsimd_convsamp_neon - SAMPLE_DATA .req r0 - START_COL .req r1 - WORKSPACE .req r2 - TMP1 .req r3 - TMP2 .req r4 - TMP3 .req r5 - TMP4 .req ip - - push {r4, r5} - vmov.u8 d0, #128 - - ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, START_COL - add TMP2, TMP2, START_COL - add TMP3, TMP3, START_COL - add TMP4, TMP4, START_COL - vld1.8 {d16}, [TMP1] - vsubl.u8 q8, d16, d0 - vld1.8 {d18}, [TMP2] - vsubl.u8 q9, d18, d0 - vld1.8 {d20}, [TMP3] - vsubl.u8 q10, d20, d0 - vld1.8 {d22}, [TMP4] - ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} - vsubl.u8 
q11, d22, d0 - vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]! - add TMP1, TMP1, START_COL - add TMP2, TMP2, START_COL - vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! - add TMP3, TMP3, START_COL - add TMP4, TMP4, START_COL - vld1.8 {d24}, [TMP1] - vsubl.u8 q12, d24, d0 - vld1.8 {d26}, [TMP2] - vsubl.u8 q13, d26, d0 - vld1.8 {d28}, [TMP3] - vsubl.u8 q14, d28, d0 - vld1.8 {d30}, [TMP4] - vsubl.u8 q15, d30, d0 - vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! - vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! - pop {r4, r5} - bx lr - - .unreq SAMPLE_DATA - .unreq START_COL - .unreq WORKSPACE - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - - -/*****************************************************************************/ - -/* - * jsimd_fdct_ifast_neon - * - * This function contains a fast, not so accurate integer implementation of - * the forward DCT (Discrete Cosine Transform). It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' - * function from jfdctfst.c - * - * TODO: can be combined with 'jsimd_convsamp_neon' to get - * rid of a bunch of VLD1.16 instructions - */ - -#define XFIX_0_382683433 d0[0] -#define XFIX_0_541196100 d0[1] -#define XFIX_0_707106781 d0[2] -#define XFIX_1_306562965 d0[3] - -.balign 16 -jsimd_fdct_ifast_neon_consts: - .short (98 * 128) /* XFIX_0_382683433 */ - .short (139 * 128) /* XFIX_0_541196100 */ - .short (181 * 128) /* XFIX_0_707106781 */ - .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ - -asm_function jsimd_fdct_ifast_neon - - DATA .req r0 - TMP .req ip - - vpush {d8-d15} - - /* Load constants */ - adr TMP, jsimd_fdct_ifast_neon_consts - vld1.16 {d0}, [TMP, :64] - - /* Load all DATA into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 | q8 - * 1 | d18 | d19 | q9 - * 2 | d20 | d21 | q10 - * 3 | d22 | d23 | q11 - * 4 | d24 | d25 | q12 - * 5 | d26 | d27 | q13 - * 6 | d28 | d29 | q14 - * 7 | d30 | d31 | q15 - */ - - vld1.16 {d16, d17, d18, d19}, [DATA, :128]! - vld1.16 {d20, d21, d22, d23}, [DATA, :128]! - vld1.16 {d24, d25, d26, d27}, [DATA, :128]! - vld1.16 {d28, d29, d30, d31}, [DATA, :128] - sub DATA, DATA, #(128 - 32) - - mov TMP, #2 -1: - /* Transpose */ - vtrn.16 q12, q13 - vtrn.16 q10, q11 - vtrn.16 q8, q9 - vtrn.16 q14, q15 - vtrn.32 q9, q11 - vtrn.32 q13, q15 - vtrn.32 q8, q10 - vtrn.32 q12, q14 - vswp d30, d23 - vswp d24, d17 - vswp d26, d19 - /* 1-D FDCT */ - vadd.s16 q2, q11, q12 - vswp d28, d21 - vsub.s16 q12, q11, q12 - vsub.s16 q6, q10, q13 - vadd.s16 q10, q10, q13 - vsub.s16 q7, q9, q14 - vadd.s16 q9, q9, q14 - vsub.s16 q1, q8, q15 - vadd.s16 q8, q8, q15 - vsub.s16 q4, q9, q10 - vsub.s16 q5, q8, q2 - vadd.s16 q3, q9, q10 - vadd.s16 q4, q4, q5 - vadd.s16 q2, q8, q2 - vqdmulh.s16 q4, q4, XFIX_0_707106781 - vadd.s16 q11, q12, q6 - vadd.s16 q8, q2, q3 - vsub.s16 q12, q2, q3 - vadd.s16 q3, q6, q7 - vadd.s16 q7, q7, q1 - vqdmulh.s16 q3, q3, XFIX_0_707106781 - vsub.s16 q6, q11, q7 - vadd.s16 q10, q5, q4 - vqdmulh.s16 q6, q6, XFIX_0_382683433 - vsub.s16 q14, q5, q4 - vqdmulh.s16 q11, q11, XFIX_0_541196100 - vqdmulh.s16 q5, q7, XFIX_1_306562965 - vadd.s16 q4, q1, q3 - vsub.s16 q3, q1, q3 - vadd.s16 q7, q7, q6 - vadd.s16 q11, q11, q6 - vadd.s16 q7, q7, q5 - vadd.s16 q13, q3, q11 - vsub.s16 q11, q3, q11 - vadd.s16 q9, q4, q7 - vsub.s16 q15, q4, q7 - subs TMP, TMP, #1 - bne 1b - - /* store results */ - vst1.16 {d16, d17, d18, d19}, [DATA, :128]! - vst1.16 {d20, d21, d22, d23}, [DATA, :128]! 
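/* [annotation] The loop above executes twice (mov TMP, #2): each iteration transposes the 8x8 block and applies one 1-D AAN FDCT pass, so these stores write the completed 2-D transform back to DATA in place. */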
- vst1.16 {d24, d25, d26, d27}, [DATA, :128]! - vst1.16 {d28, d29, d30, d31}, [DATA, :128] - - vpop {d8-d15} - bx lr - - .unreq DATA - .unreq TMP - - -/*****************************************************************************/ - -/* - * GLOBAL(void) - * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors, - * DCTELEM *workspace); - * - * Note: the code uses 2-stage pipelining in order to improve instruction - * scheduling and eliminate stalls (this provides ~15% better - * performance for this function on both ARM Cortex-A8 and - * ARM Cortex-A9 when compared to the non-pipelined variant). - * The instructions which belong to the second stage use different - * indentation for better readability. - * (A reference C sketch of this quantization arithmetic appears at the - * end of this file.) - */ -asm_function jsimd_quantize_neon - - COEF_BLOCK .req r0 - DIVISORS .req r1 - WORKSPACE .req r2 - - RECIPROCAL .req DIVISORS - CORRECTION .req r3 - SHIFT .req ip - LOOP_COUNT .req r4 - - vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! - vabs.s16 q12, q0 - add CORRECTION, DIVISORS, #(64 * 2) - add SHIFT, DIVISORS, #(64 * 6) - vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! - vabs.s16 q13, q1 - vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! - vadd.u16 q12, q12, q10 /* add correction */ - vadd.u16 q13, q13, q11 - vmull.u16 q10, d24, d16 /* multiply by reciprocal */ - vmull.u16 q11, d25, d17 - vmull.u16 q8, d26, d18 - vmull.u16 q9, d27, d19 - vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! - vshrn.u32 d20, q10, #16 - vshrn.u32 d21, q11, #16 - vshrn.u32 d22, q8, #16 - vshrn.u32 d23, q9, #16 - vneg.s16 q12, q12 - vneg.s16 q13, q13 - vshr.s16 q2, q0, #15 /* extract sign */ - vshr.s16 q3, q1, #15 - vshl.u16 q14, q10, q12 /* shift */ - vshl.u16 q15, q11, q13 - - push {r4, r5} - mov LOOP_COUNT, #3 -1: - vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! - veor.u16 q14, q14, q2 /* restore sign */ - vabs.s16 q12, q0 - vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! - vabs.s16 q13, q1 - veor.u16 q15, q15, q3 - vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! - vadd.u16 q12, q12, q10 /* add correction */ - vadd.u16 q13, q13, q11 - vmull.u16 q10, d24, d16 /* multiply by reciprocal */ - vmull.u16 q11, d25, d17 - vmull.u16 q8, d26, d18 - vmull.u16 q9, d27, d19 - vsub.u16 q14, q14, q2 - vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! - vsub.u16 q15, q15, q3 - vshrn.u32 d20, q10, #16 - vshrn.u32 d21, q11, #16 - vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! - vshrn.u32 d22, q8, #16 - vshrn.u32 d23, q9, #16 - vneg.s16 q12, q12 - vneg.s16 q13, q13 - vshr.s16 q2, q0, #15 /* extract sign */ - vshr.s16 q3, q1, #15 - vshl.u16 q14, q10, q12 /* shift */ - vshl.u16 q15, q11, q13 - subs LOOP_COUNT, LOOP_COUNT, #1 - bne 1b - pop {r4, r5} - - veor.u16 q14, q14, q2 /* restore sign */ - veor.u16 q15, q15, q3 - vsub.u16 q14, q14, q2 - vsub.u16 q15, q15, q3 - vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! - - bx lr /* return */ - - .unreq COEF_BLOCK - .unreq DIVISORS - .unreq WORKSPACE - .unreq RECIPROCAL - .unreq CORRECTION - .unreq SHIFT - .unreq LOOP_COUNT - - -/*****************************************************************************/ - -/* - * GLOBAL(void) - * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, - * JDIMENSION downsampled_width, - * JSAMPARRAY input_data, - * JSAMPARRAY *output_data_ptr); - * - * Note: the use of unaligned writes is the main remaining bottleneck in - * this code; addressing it could potentially yield up to tens of percent - * better performance on Cortex-A8/Cortex-A9. - * (A reference C sketch of this upsampling filter appears at the end of - * this file.) - */ - -/* - * Upsample 16 source pixels to 32 destination pixels. 
The new 16 source - * pixels are loaded into q0. The previous 16 source pixels are in q1. The - * shifted-by-one source pixels are constructed in q2 by using q0 and q1. - * Register d28 is used for multiplication by 3. Register q15 is used - * for adding the +1 bias. - */ -.macro upsample16 OUTPTR, INPTR - vld1.8 {q0}, [\INPTR]! - vmovl.u8 q8, d0 - vext.8 q2, q1, q0, #15 - vmovl.u8 q9, d1 - vaddw.u8 q10, q15, d4 - vaddw.u8 q11, q15, d5 - vmlal.u8 q8, d4, d28 - vmlal.u8 q9, d5, d28 - vmlal.u8 q10, d0, d28 - vmlal.u8 q11, d1, d28 - vmov q1, q0 /* backup source pixels to q1 */ - vrshrn.u16 d6, q8, #2 - vrshrn.u16 d7, q9, #2 - vshrn.u16 d8, q10, #2 - vshrn.u16 d9, q11, #2 - vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! -.endm - -/* - * Upsample 32 source pixels to 64 destination pixels. Compared to the - * 'upsample16' macro, the roles of the q0 and q1 registers are reversed - * for even and odd groups of 16 pixels, which is why no "vmov q1, q0" - * instructions are needed. This unrolling also allows loads and stores to - * be reordered to hide multiplication latency and reduce stalls. - */ -.macro upsample32 OUTPTR, INPTR - /* even 16 pixels group */ - vld1.8 {q0}, [\INPTR]! - vmovl.u8 q8, d0 - vext.8 q2, q1, q0, #15 - vmovl.u8 q9, d1 - vaddw.u8 q10, q15, d4 - vaddw.u8 q11, q15, d5 - vmlal.u8 q8, d4, d28 - vmlal.u8 q9, d5, d28 - vmlal.u8 q10, d0, d28 - vmlal.u8 q11, d1, d28 - /* odd 16 pixels group */ - vld1.8 {q1}, [\INPTR]! - vrshrn.u16 d6, q8, #2 - vrshrn.u16 d7, q9, #2 - vshrn.u16 d8, q10, #2 - vshrn.u16 d9, q11, #2 - vmovl.u8 q8, d2 - vext.8 q2, q0, q1, #15 - vmovl.u8 q9, d3 - vaddw.u8 q10, q15, d4 - vaddw.u8 q11, q15, d5 - vmlal.u8 q8, d4, d28 - vmlal.u8 q9, d5, d28 - vmlal.u8 q10, d2, d28 - vmlal.u8 q11, d3, d28 - vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! - vrshrn.u16 d6, q8, #2 - vrshrn.u16 d7, q9, #2 - vshrn.u16 d8, q10, #2 - vshrn.u16 d9, q11, #2 - vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! -.endm - -/* - * Upsample a row of WIDTH pixels from INPTR to OUTPTR. 
- */ -.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 - /* special case for the first and last pixels */ - sub \WIDTH, \WIDTH, #1 - add \OUTPTR, \OUTPTR, #1 - ldrb \TMP1, [\INPTR, \WIDTH] - strb \TMP1, [\OUTPTR, \WIDTH, asl #1] - ldrb \TMP1, [\INPTR], #1 - strb \TMP1, [\OUTPTR, #-1] - vmov.8 d3[7], \TMP1 - - subs \WIDTH, \WIDTH, #32 - blt 5f -0: /* process 32 pixels per iteration */ - upsample32 \OUTPTR, \INPTR - subs \WIDTH, \WIDTH, #32 - bge 0b -5: - adds \WIDTH, \WIDTH, #16 - blt 1f -0: /* process 16 pixels if needed */ - upsample16 \OUTPTR, \INPTR - subs \WIDTH, \WIDTH, #16 -1: - adds \WIDTH, \WIDTH, #16 - beq 9f - - /* load the remaining 1-15 pixels */ - add \INPTR, \INPTR, \WIDTH - tst \WIDTH, #1 - beq 2f - sub \INPTR, \INPTR, #1 - vld1.8 {d0[0]}, [\INPTR] -2: - tst \WIDTH, #2 - beq 2f - vext.8 d0, d0, d0, #6 - sub \INPTR, \INPTR, #1 - vld1.8 {d0[1]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[0]}, [\INPTR] -2: - tst \WIDTH, #4 - beq 2f - vrev64.32 d0, d0 - sub \INPTR, \INPTR, #1 - vld1.8 {d0[3]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[2]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[1]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[0]}, [\INPTR] -2: - tst \WIDTH, #8 - beq 2f - vmov d1, d0 - sub \INPTR, \INPTR, #8 - vld1.8 {d0}, [\INPTR] -2: /* upsample the remaining pixels */ - vmovl.u8 q8, d0 - vext.8 q2, q1, q0, #15 - vmovl.u8 q9, d1 - vaddw.u8 q10, q15, d4 - vaddw.u8 q11, q15, d5 - vmlal.u8 q8, d4, d28 - vmlal.u8 q9, d5, d28 - vmlal.u8 q10, d0, d28 - vmlal.u8 q11, d1, d28 - vrshrn.u16 d10, q8, #2 - vrshrn.u16 d12, q9, #2 - vshrn.u16 d11, q10, #2 - vshrn.u16 d13, q11, #2 - vzip.8 d10, d11 - vzip.8 d12, d13 - /* store the remaining pixels */ - tst \WIDTH, #8 - beq 2f - vst1.8 {d10, d11}, [\OUTPTR]! - vmov q5, q6 -2: - tst \WIDTH, #4 - beq 2f - vst1.8 {d10}, [\OUTPTR]! - vmov d10, d11 -2: - tst \WIDTH, #2 - beq 2f - vst1.8 {d10[0]}, [\OUTPTR]! - vst1.8 {d10[1]}, [\OUTPTR]! - vst1.8 {d10[2]}, [\OUTPTR]! - vst1.8 {d10[3]}, [\OUTPTR]! - vext.8 d10, d10, d10, #4 -2: - tst \WIDTH, #1 - beq 2f - vst1.8 {d10[0]}, [\OUTPTR]! - vst1.8 {d10[1]}, [\OUTPTR]! -2: -9: -.endm - -asm_function jsimd_h2v1_fancy_upsample_neon - - MAX_V_SAMP_FACTOR .req r0 - DOWNSAMPLED_WIDTH .req r1 - INPUT_DATA .req r2 - OUTPUT_DATA_PTR .req r3 - OUTPUT_DATA .req OUTPUT_DATA_PTR - - OUTPTR .req r4 - INPTR .req r5 - WIDTH .req ip - TMP .req lr - - push {r4, r5, r6, lr} - vpush {d8-d15} - - ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] - cmp MAX_V_SAMP_FACTOR, #0 - ble 99f - - /* initialize constants */ - vmov.u8 d28, #3 - vmov.u16 q15, #1 -11: - ldr INPTR, [INPUT_DATA], #4 - ldr OUTPTR, [OUTPUT_DATA], #4 - mov WIDTH, DOWNSAMPLED_WIDTH - upsample_row OUTPTR, INPTR, WIDTH, TMP - subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1 - bgt 11b - -99: - vpop {d8-d15} - pop {r4, r5, r6, pc} - - .unreq MAX_V_SAMP_FACTOR - .unreq DOWNSAMPLED_WIDTH - .unreq INPUT_DATA - .unreq OUTPUT_DATA_PTR - .unreq OUTPUT_DATA - - .unreq OUTPTR - .unreq INPTR - .unreq WIDTH - .unreq TMP - -.purgem upsample16 -.purgem upsample32 -.purgem upsample_row - - -/*****************************************************************************/ - -/* - * GLOBAL(JOCTET*) - * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer, - * JCOEFPTR block, int last_dc_val, - * c_derived_tbl *dctbl, c_derived_tbl *actbl) - * - */ - -.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP - sub \PUT_BITS, \PUT_BITS, #0x8 - lsr \TMP, \PUT_BUFFER, \PUT_BITS - uxtb \TMP, \TMP - strb \TMP, [\BUFFER, #1]! 
- cmp \TMP, #0xff - /*it eq*/ - strbeq \ZERO, [\BUFFER, #1]! -.endm - -.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE - /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/ - add \PUT_BITS, \SIZE - /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/ - orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE -.endm - -.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP - cmp \PUT_BITS, #0x10 - blt 15f - eor \ZERO, \ZERO, \ZERO - emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP - emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP -15: -.endm - -.balign 16 -jsimd_huff_encode_one_block_neon_consts: - .byte 0x01 - .byte 0x02 - .byte 0x04 - .byte 0x08 - .byte 0x10 - .byte 0x20 - .byte 0x40 - .byte 0x80 - -asm_function jsimd_huff_encode_one_block_neon - push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - add r7, sp, #0x1c - sub r4, sp, #0x40 - bfc r4, #0, #5 - mov sp, r4 /* align sp on 32 bytes */ - vst1.64 {d8, d9, d10, d11}, [r4, :128]! - vst1.64 {d12, d13, d14, d15}, [r4, :128] - sub sp, #0x140 /* reserve 320 bytes */ - str r0, [sp, #0x18] /* working state -> sp + 0x18 */ - add r4, sp, #0x20 /* r4 = t1 */ - ldr lr, [r7, #0x8] /* lr = dctbl */ - sub r10, r1, #0x1 /* r10 = buffer - 1 */ - ldrsh r1, [r2] - mov r9, #0x10 - mov r8, #0x1 - adr r5, jsimd_huff_encode_one_block_neon_consts - /* prepare data */ - vld1.8 {d26}, [r5, :64] - veor q8, q8, q8 - veor q9, q9, q9 - vdup.16 q14, r9 - vdup.16 q15, r8 - veor q10, q10, q10 - veor q11, q11, q11 - sub r1, r1, r3 - add r9, r2, #0x22 - add r8, r2, #0x18 - add r3, r2, #0x36 - vmov.16 d0[0], r1 - vld1.16 {d2[0]}, [r9, :16] - vld1.16 {d4[0]}, [r8, :16] - vld1.16 {d6[0]}, [r3, :16] - add r1, r2, #0x2 - add r9, r2, #0x30 - add r8, r2, #0x26 - add r3, r2, #0x28 - vld1.16 {d0[1]}, [r1, :16] - vld1.16 {d2[1]}, [r9, :16] - vld1.16 {d4[1]}, [r8, :16] - vld1.16 {d6[1]}, [r3, :16] - add r1, r2, #0x10 - add r9, r2, #0x40 - add r8, r2, #0x34 - add r3, r2, #0x1a - vld1.16 {d0[2]}, [r1, :16] - vld1.16 {d2[2]}, [r9, :16] - vld1.16 {d4[2]}, [r8, :16] - vld1.16 {d6[2]}, [r3, :16] - add r1, r2, #0x20 - add r9, r2, #0x32 - add r8, r2, #0x42 - add r3, r2, #0xc - vld1.16 {d0[3]}, [r1, :16] - vld1.16 {d2[3]}, [r9, :16] - vld1.16 {d4[3]}, [r8, :16] - vld1.16 {d6[3]}, [r3, :16] - add r1, r2, #0x12 - add r9, r2, #0x24 - add r8, r2, #0x50 - add r3, r2, #0xe - vld1.16 {d1[0]}, [r1, :16] - vld1.16 {d3[0]}, [r9, :16] - vld1.16 {d5[0]}, [r8, :16] - vld1.16 {d7[0]}, [r3, :16] - add r1, r2, #0x4 - add r9, r2, #0x16 - add r8, r2, #0x60 - add r3, r2, #0x1c - vld1.16 {d1[1]}, [r1, :16] - vld1.16 {d3[1]}, [r9, :16] - vld1.16 {d5[1]}, [r8, :16] - vld1.16 {d7[1]}, [r3, :16] - add r1, r2, #0x6 - add r9, r2, #0x8 - add r8, r2, #0x52 - add r3, r2, #0x2a - vld1.16 {d1[2]}, [r1, :16] - vld1.16 {d3[2]}, [r9, :16] - vld1.16 {d5[2]}, [r8, :16] - vld1.16 {d7[2]}, [r3, :16] - add r1, r2, #0x14 - add r9, r2, #0xa - add r8, r2, #0x44 - add r3, r2, #0x38 - vld1.16 {d1[3]}, [r1, :16] - vld1.16 {d3[3]}, [r9, :16] - vld1.16 {d5[3]}, [r8, :16] - vld1.16 {d7[3]}, [r3, :16] - vcgt.s16 q8, q8, q0 - vcgt.s16 q9, q9, q1 - vcgt.s16 q10, q10, q2 - vcgt.s16 q11, q11, q3 - vabs.s16 q0, q0 - vabs.s16 q1, q1 - vabs.s16 q2, q2 - vabs.s16 q3, q3 - veor q8, q8, q0 - veor q9, q9, q1 - veor q10, q10, q2 - veor q11, q11, q3 - add r9, r4, #0x20 - add r8, r4, #0x80 - add r3, r4, #0xa0 - vclz.i16 q0, q0 - vclz.i16 q1, q1 - vclz.i16 q2, q2 - vclz.i16 q3, q3 - vsub.i16 q0, q14, q0 - vsub.i16 q1, q14, q1 - vsub.i16 q2, q14, q2 - vsub.i16 q3, q14, q3 - vst1.16 {d0, d1, d2, d3}, [r4, :256] - vst1.16 {d4, d5, d6, d7}, [r9, :256] - vshl.s16 q0, q15, q0 - 
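-/*
- * The vcgt/vabs/veor/vclz/vshl/vsub/vand pattern used throughout this
- * function computes, for every coefficient, the magnitude category and
- * the extra bits exactly as the scalar encoder in jchuff.c does (a
- * negative coefficient is emitted as coefficient - 1, truncated to nbits
- * bits).  Below is a per-coefficient sketch in plain C; the helper names
- * are illustrative, not from the original sources.
- *
- *   #include <stdint.h>
- *   #include <stdlib.h>
- *
- *   static int clz16(uint16_t x)        // count leading zeros in 16 bits
- *   {
- *     int n = 16;
- *     while (x) { n--; x >>= 1; }
- *     return n;
- *   }
- *
- *   static void prep_coef(int16_t coef, uint16_t *nbits, uint16_t *code)
- *   {
- *     uint16_t sign = (coef < 0) ? 0xFFFF : 0;  // vcgt.s16 (0 > coef)
- *     uint16_t absv = (uint16_t)abs(coef);      // vabs.s16
- *     uint16_t diff = absv ^ sign;              // veor: ~|x| == x - 1 when x < 0
- *     *nbits = (uint16_t)(16 - clz16(absv));    // vclz.i16, then vsub from 16
- *     *code  = diff & (uint16_t)((1u << *nbits) - 1);  // vshl/vsub/vand mask
- *   }
- */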
vshl.s16 q1, q15, q1 - vshl.s16 q2, q15, q2 - vshl.s16 q3, q15, q3 - vsub.i16 q0, q0, q15 - vsub.i16 q1, q1, q15 - vsub.i16 q2, q2, q15 - vsub.i16 q3, q3, q15 - vand q8, q8, q0 - vand q9, q9, q1 - vand q10, q10, q2 - vand q11, q11, q3 - vst1.16 {d16, d17, d18, d19}, [r8, :256] - vst1.16 {d20, d21, d22, d23}, [r3, :256] - add r1, r2, #0x46 - add r9, r2, #0x3a - add r8, r2, #0x74 - add r3, r2, #0x6a - vld1.16 {d8[0]}, [r1, :16] - vld1.16 {d10[0]}, [r9, :16] - vld1.16 {d12[0]}, [r8, :16] - vld1.16 {d14[0]}, [r3, :16] - veor q8, q8, q8 - veor q9, q9, q9 - veor q10, q10, q10 - veor q11, q11, q11 - add r1, r2, #0x54 - add r9, r2, #0x2c - add r8, r2, #0x76 - add r3, r2, #0x78 - vld1.16 {d8[1]}, [r1, :16] - vld1.16 {d10[1]}, [r9, :16] - vld1.16 {d12[1]}, [r8, :16] - vld1.16 {d14[1]}, [r3, :16] - add r1, r2, #0x62 - add r9, r2, #0x1e - add r8, r2, #0x68 - add r3, r2, #0x7a - vld1.16 {d8[2]}, [r1, :16] - vld1.16 {d10[2]}, [r9, :16] - vld1.16 {d12[2]}, [r8, :16] - vld1.16 {d14[2]}, [r3, :16] - add r1, r2, #0x70 - add r9, r2, #0x2e - add r8, r2, #0x5a - add r3, r2, #0x6c - vld1.16 {d8[3]}, [r1, :16] - vld1.16 {d10[3]}, [r9, :16] - vld1.16 {d12[3]}, [r8, :16] - vld1.16 {d14[3]}, [r3, :16] - add r1, r2, #0x72 - add r9, r2, #0x3c - add r8, r2, #0x4c - add r3, r2, #0x5e - vld1.16 {d9[0]}, [r1, :16] - vld1.16 {d11[0]}, [r9, :16] - vld1.16 {d13[0]}, [r8, :16] - vld1.16 {d15[0]}, [r3, :16] - add r1, r2, #0x64 - add r9, r2, #0x4a - add r8, r2, #0x3e - add r3, r2, #0x6e - vld1.16 {d9[1]}, [r1, :16] - vld1.16 {d11[1]}, [r9, :16] - vld1.16 {d13[1]}, [r8, :16] - vld1.16 {d15[1]}, [r3, :16] - add r1, r2, #0x56 - add r9, r2, #0x58 - add r8, r2, #0x4e - add r3, r2, #0x7c - vld1.16 {d9[2]}, [r1, :16] - vld1.16 {d11[2]}, [r9, :16] - vld1.16 {d13[2]}, [r8, :16] - vld1.16 {d15[2]}, [r3, :16] - add r1, r2, #0x48 - add r9, r2, #0x66 - add r8, r2, #0x5c - add r3, r2, #0x7e - vld1.16 {d9[3]}, [r1, :16] - vld1.16 {d11[3]}, [r9, :16] - vld1.16 {d13[3]}, [r8, :16] - vld1.16 {d15[3]}, [r3, :16] - vcgt.s16 q8, q8, q4 - vcgt.s16 q9, q9, q5 - vcgt.s16 q10, q10, q6 - vcgt.s16 q11, q11, q7 - vabs.s16 q4, q4 - vabs.s16 q5, q5 - vabs.s16 q6, q6 - vabs.s16 q7, q7 - veor q8, q8, q4 - veor q9, q9, q5 - veor q10, q10, q6 - veor q11, q11, q7 - add r1, r4, #0x40 - add r9, r4, #0x60 - add r8, r4, #0xc0 - add r3, r4, #0xe0 - vclz.i16 q4, q4 - vclz.i16 q5, q5 - vclz.i16 q6, q6 - vclz.i16 q7, q7 - vsub.i16 q4, q14, q4 - vsub.i16 q5, q14, q5 - vsub.i16 q6, q14, q6 - vsub.i16 q7, q14, q7 - vst1.16 {d8, d9, d10, d11}, [r1, :256] - vst1.16 {d12, d13, d14, d15}, [r9, :256] - vshl.s16 q4, q15, q4 - vshl.s16 q5, q15, q5 - vshl.s16 q6, q15, q6 - vshl.s16 q7, q15, q7 - vsub.i16 q4, q4, q15 - vsub.i16 q5, q5, q15 - vsub.i16 q6, q6, q15 - vsub.i16 q7, q7, q15 - vand q8, q8, q4 - vand q9, q9, q5 - vand q10, q10, q6 - vand q11, q11, q7 - vst1.16 {d16, d17, d18, d19}, [r8, :256] - vst1.16 {d20, d21, d22, d23}, [r3, :256] - ldr r12, [r7, #0xc] /* r12 = actbl */ - add r1, lr, #0x400 /* r1 = dctbl->ehufsi */ - mov r9, r12 /* r9 = actbl */ - add r6, r4, #0x80 /* r6 = t2 */ - ldr r11, [r0, #0x8] /* r11 = put_buffer */ - ldr r4, [r0, #0xc] /* r4 = put_bits */ - ldrh r2, [r6, #-128] /* r2 = nbits */ - ldrh r3, [r6] /* r3 = temp2 & (((JLONG) 1)<<nbits) - 1; */ - ldr r0, [lr, r2, lsl #2] - ldrb r5, [r1, r2] - put_bits r11, r4, r0, r5 - checkbuf15 r10, r11, r4, r5, r0 - put_bits r11, r4, r3, r2 - checkbuf15 r10, r11, r4, r5, r0 - mov lr, r6 /* lr = t2 */ - add r5, r9, #0x400 /* r5 = actbl->ehufsi */ - ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */ - veor q8, q8, 
q8 - vceq.i16 q0, q0, q8 - vceq.i16 q1, q1, q8 - vceq.i16 q2, q2, q8 - vceq.i16 q3, q3, q8 - vceq.i16 q4, q4, q8 - vceq.i16 q5, q5, q8 - vceq.i16 q6, q6, q8 - vceq.i16 q7, q7, q8 - vmovn.i16 d0, q0 - vmovn.i16 d2, q1 - vmovn.i16 d4, q2 - vmovn.i16 d6, q3 - vmovn.i16 d8, q4 - vmovn.i16 d10, q5 - vmovn.i16 d12, q6 - vmovn.i16 d14, q7 - vand d0, d0, d26 - vand d2, d2, d26 - vand d4, d4, d26 - vand d6, d6, d26 - vand d8, d8, d26 - vand d10, d10, d26 - vand d12, d12, d26 - vand d14, d14, d26 - vpadd.i8 d0, d0, d2 - vpadd.i8 d4, d4, d6 - vpadd.i8 d8, d8, d10 - vpadd.i8 d12, d12, d14 - vpadd.i8 d0, d0, d4 - vpadd.i8 d8, d8, d12 - vpadd.i8 d0, d0, d8 - vmov.32 r1, d0[1] - vmov.32 r8, d0[0] - mvn r1, r1 - mvn r8, r8 - lsrs r1, r1, #0x1 - rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */ - rbit r1, r1 /* r1 = index1 */ - rbit r8, r8 /* r8 = index0 */ - ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */ - str r1, [sp, #0x14] /* index1 -> sp + 0x14 */ - cmp r8, #0x0 - beq 6f -1: - clz r2, r8 - add lr, lr, r2, lsl #1 - lsl r8, r8, r2 - ldrh r1, [lr, #-126] -2: - cmp r2, #0x10 - blt 3f - sub r2, r2, #0x10 - put_bits r11, r4, r0, r6 - cmp r4, #0x10 - blt 2b - eor r3, r3, r3 - emit_byte r10, r11, r4, r3, r12 - emit_byte r10, r11, r4, r3, r12 - b 2b -3: - add r2, r1, r2, lsl #4 - ldrh r3, [lr, #2]! - ldr r12, [r9, r2, lsl #2] - ldrb r2, [r5, r2] - put_bits r11, r4, r12, r2 - checkbuf15 r10, r11, r4, r2, r12 - put_bits r11, r4, r3, r1 - checkbuf15 r10, r11, r4, r2, r12 - lsls r8, r8, #0x1 - bne 1b -6: - add r12, sp, #0x20 /* r12 = t1 */ - ldr r8, [sp, #0x14] /* r8 = index1 */ - adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */ - cmp r8, #0x0 - beq 6f - clz r2, r8 - sub r12, r12, lr - lsl r8, r8, r2 - add r2, r2, r12, lsr #1 - add lr, lr, r2, lsl #1 - b 7f -1: - clz r2, r8 - add lr, lr, r2, lsl #1 - lsl r8, r8, r2 -7: - ldrh r1, [lr, #-126] -2: - cmp r2, #0x10 - blt 3f - sub r2, r2, #0x10 - put_bits r11, r4, r0, r6 - cmp r4, #0x10 - blt 2b - eor r3, r3, r3 - emit_byte r10, r11, r4, r3, r12 - emit_byte r10, r11, r4, r3, r12 - b 2b -3: - add r2, r1, r2, lsl #4 - ldrh r3, [lr, #2]! - ldr r12, [r9, r2, lsl #2] - ldrb r2, [r5, r2] - put_bits r11, r4, r12, r2 - checkbuf15 r10, r11, r4, r2, r12 - put_bits r11, r4, r3, r1 - checkbuf15 r10, r11, r4, r2, r12 - lsls r8, r8, #0x1 - bne 1b -6: - add r0, sp, #0x20 - add r0, #0xfe - cmp lr, r0 - bhs 1f - ldr r1, [r9] - ldrb r0, [r5] - put_bits r11, r4, r1, r0 - checkbuf15 r10, r11, r4, r0, r1 -1: - ldr r12, [sp, #0x18] - str r11, [r12, #0x8] - str r4, [r12, #0xc] - add r0, r10, #0x1 - add r4, sp, #0x140 - vld1.64 {d8, d9, d10, d11}, [r4, :128]! - vld1.64 {d12, d13, d14, d15}, [r4, :128] - sub r4, r7, #0x1c - mov sp, r4 - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - -.purgem emit_byte -.purgem put_bits -.purgem checkbuf15
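-/*
- * Reference sketch for the constant table used by the
- * jsimd_*_ycc_convert_neon functions earlier in this file: the standard
- * JFIF RGB -> YCbCr equations in 16-bit fixed point, with
- * FIX(x) = x * 65536 rounded, as in jccolor.c.  Here 19595/38470/7471 are
- * FIX(0.29900)/FIX(0.58700)/FIX(0.11400), 11059/21709/32768 are
- * FIX(0.16874)/FIX(0.33126)/FIX(0.50000), and 27439/5329 are
- * FIX(0.41869)/FIX(0.08131).  The function name is illustrative.
- *
- *   #include <stdint.h>
- *
- *   #define SCALEBITS    16
- *   #define ONE_HALF     ((int32_t)1 << (SCALEBITS - 1))
- *   #define CBCR_OFFSET  ((int32_t)128 << SCALEBITS)   // CENTERJSAMPLE
- *
- *   static void rgb_to_ycc_pixel(int r, int g, int b,   // 0..255 inputs
- *                                uint8_t *y, uint8_t *cb, uint8_t *cr)
- *   {
- *     *y  = (uint8_t)((19595 * r + 38470 * g + 7471 * b
- *                      + ONE_HALF) >> SCALEBITS);
- *     *cb = (uint8_t)((-11059 * r - 21709 * g + 32768 * b
- *                      + CBCR_OFFSET + ONE_HALF - 1) >> SCALEBITS);
- *     *cr = (uint8_t)((32768 * r - 27439 * g - 5329 * b
- *                      + CBCR_OFFSET + ONE_HALF - 1) >> SCALEBITS);
- *   }
- */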
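-/*
- * Reference C sketch of the per-coefficient arithmetic in
- * jsimd_quantize_neon (referenced from the note above that function).
- * The divisors layout assumed here, with the reciprocal, correction and
- * shift planes 64 elements apart, is read off the address arithmetic in
- * the assembly; function and variable names are illustrative.
- *
- *   #include <stdint.h>
- *
- *   static void quantize_c(int16_t *coef_block, const uint16_t *divisors,
- *                          const int16_t *workspace)
- *   {
- *     const uint16_t *recip = divisors;        // RECIPROCAL
- *     const uint16_t *corr  = divisors + 64;   // CORRECTION, #(64 * 2) bytes
- *     const uint16_t *shift = divisors + 192;  // SHIFT, #(64 * 6) bytes
- *     for (int i = 0; i < 64; i++) {
- *       int16_t  x    = workspace[i];
- *       uint16_t sign = (uint16_t)(x >> 15);          // vshr.s16 #15
- *       uint16_t absx = (uint16_t)(x < 0 ? -x : x);   // vabs.s16
- *       absx = (uint16_t)(absx + corr[i]);            // add correction
- *       uint16_t q = (uint16_t)(((uint32_t)absx * recip[i]) >> 16);
- *       q = (uint16_t)(q >> shift[i]);                // vshl by negated count
- *       coef_block[i] = (int16_t)((q ^ sign) - sign); // restore sign
- *     }
- *   }
- */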
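-/*
- * Reference C sketch of the h2v1 "fancy" (triangle filter) upsampling
- * implemented by the NEON code above (referenced from the note before
- * jsimd_h2v1_fancy_upsample_neon), following the scalar logic of
- * h2v1_fancy_upsample in jdsample.c: each output pixel is a 3:1 weighted
- * average of its two nearest source pixels, with +1/+2 rounding biases
- * for the two phases.  The function name is illustrative.
- *
- *   #include <stdint.h>
- *
- *   static void h2v1_fancy_upsample_row(const uint8_t *in, uint8_t *out,
- *                                       int w)   // w = downsampled width
- *   {
- *     out[0] = in[0];                 // first output replicates the edge
- *     out[2 * w - 1] = in[w - 1];     // so does the last one
- *     for (int i = 0; i < w; i++) {
- *       if (i > 0)                    // even phase leans on the left neighbor
- *         out[2 * i] = (uint8_t)((3 * in[i] + in[i - 1] + 1) >> 2);
- *       if (i < w - 1)                // odd phase leans on the right neighbor
- *         out[2 * i + 1] = (uint8_t)((3 * in[i] + in[i + 1] + 2) >> 2);
- *     }
- *   }
- */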
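-/*
- * Reference C sketch of the bit-level output implemented by the
- * emit_byte, put_bits and checkbuf15 macros above: Huffman codes are
- * shifted into a word accumulator and flushed a byte at a time, with a
- * 0x00 stuff byte after every emitted 0xFF as the JPEG format requires.
- * The struct and function names are illustrative.
- *
- *   #include <stdint.h>
- *
- *   typedef struct {
- *     uint32_t put_buffer;  // bit accumulator; codes enter at the LSB end
- *     int put_bits;         // number of valid bits in put_buffer
- *     uint8_t *next_byte;   // output position
- *   } bitwriter;
- *
- *   static void put_bits_c(bitwriter *bw, uint32_t code, int size)
- *   {
- *     // put_bits: shift the older bits up, OR the new code in at the bottom
- *     bw->put_buffer = (bw->put_buffer << size) | code;
- *     bw->put_bits  += size;
- *   }
- *
- *   static void checkbuf15_c(bitwriter *bw)
- *   {
- *     // checkbuf15: once 16 or more bits are pending, flush two bytes;
- *     // each iteration below matches one emit_byte expansion
- *     while (bw->put_bits >= 16) {
- *       bw->put_bits -= 8;
- *       uint8_t b = (uint8_t)(bw->put_buffer >> bw->put_bits);
- *       *bw->next_byte++ = b;
- *       if (b == 0xFF)
- *         *bw->next_byte++ = 0x00;   // JPEG marker byte stuffing
- *     }
- *   }
- */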