diff options
Diffstat (limited to 'media/libtheora/lib/arm/armidct.s')
-rw-r--r-- | media/libtheora/lib/arm/armidct.s | 177 |
1 files changed, 58 insertions, 119 deletions
diff --git a/media/libtheora/lib/arm/armidct.s b/media/libtheora/lib/arm/armidct.s index babd846ecd..68530c7140 100644 --- a/media/libtheora/lib/arm/armidct.s +++ b/media/libtheora/lib/arm/armidct.s @@ -11,17 +11,11 @@ ;******************************************************************** ; Original implementation: ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd -; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $ +; last mod: $Id$ ;******************************************************************** AREA |.text|, CODE, READONLY - ; Explicitly specifying alignment here because some versions of - ; gas don't align code correctly. See - ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html - ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 - ALIGN - GET armopts.s EXPORT oc_idct8x8_1_arm @@ -70,11 +64,8 @@ oc_idct8x8_slow_arm BL idct8core_arm BL idct8core_arm LDR r0, [r13], #4 ; Write to the final destination. - ; Clear input data for next block (decoder only). SUB r2, r1, #8*16 - CMP r0, r2 - MOV r1, r13 ; And read from temp storage. - BEQ oc_idct8x8_slow_arm_cols + ; Clear input data for next block. MOV r4, #0 MOV r5, #0 MOV r6, #0 @@ -87,7 +78,7 @@ oc_idct8x8_slow_arm STMIA r2!,{r4,r5,r6,r7} STMIA r2!,{r4,r5,r6,r7} STMIA r2!,{r4,r5,r6,r7} -oc_idct8x8_slow_arm_cols + MOV r1, r13 ; And read from temp storage. ; Column transforms BL idct8core_down_arm BL idct8core_down_arm @@ -111,18 +102,15 @@ oc_idct8x8_10_arm PROC BL idct3core_arm BL idct2core_arm BL idct1core_arm - ; Clear input data for next block (decoder only). - SUB r0, r1, #4*16 - CMP r0, r2 - MOV r1, r13 ; Read from temp storage. - BEQ oc_idct8x8_10_arm_cols + ; Clear input data for next block. MOV r4, #0 - STR r4, [r0] - STR r4, [r0,#4] - STR r4, [r0,#16] - STR r4, [r0,#20] - STR r4, [r0,#32] - STR r4, [r0,#48] + STR r4, [r1,#-4*16]! + STR r4, [r1,#4] + STR r4, [r1,#16] + STR r4, [r1,#20] + STR r4, [r1,#32] + STR r4, [r1,#48] + MOV r1, r13 ; Read from temp storage. MOV r0, r2 ; Write to the final destination oc_idct8x8_10_arm_cols ; Column transforms @@ -147,18 +135,14 @@ oc_idct8x8_6_arm PROC BL idct3core_arm BL idct2core_arm BL idct1core_arm - ; Clear input data for next block (decoder only). - SUB r0, r1, #3*16 - CMP r0, r2 - MOV r1, r13 ; Read from temp storage. - BEQ oc_idct8x8_6_arm_cols + ; Clear input data for next block. MOV r4, #0 - STR r4, [r0] - STR r4, [r0,#4] - STR r4, [r0,#16] - STR r4, [r0,#32] + STR r4, [r1,#-3*16]! + STR r4, [r1,#4] + STR r4, [r1,#16] + STR r4, [r1,#32] + MOV r1, r13 ; Read from temp storage. MOV r0, r2 ; Write to the final destination -oc_idct8x8_6_arm_cols ; Column transforms BL idct3core_down_arm BL idct3core_down_arm @@ -180,14 +164,12 @@ oc_idct8x8_3_arm PROC MOV r0, r13 ; Write to temp storage. BL idct2core_arm BL idct1core_arm - ; Clear input data for next block (decoder only). - SUB r0, r1, #2*16 - CMP r0, r2 + ; Clear input data for next block. + MOV r4, #0 + STR r4, [r1,#-2*16]! + STR r4, [r1,#16] MOV r1, r13 ; Read from temp storage. - MOVNE r4, #0 - STRNE r4, [r0] - STRNE r4, [r0,#16] - MOVNE r0, r2 ; Write to the final destination + MOV r0, r2 ; Write to the final destination ; Column transforms BL idct2core_down_arm BL idct2core_down_arm @@ -805,30 +787,26 @@ oc_idct8x8_slow_v6 BL idct8_8core_v6 BL idct8_8core_v6 LDR r0, [r13], #4 ; Write to the final destination. - ; Clear input data for next block (decoder only). - SUB r2, r1, #8*16 - CMP r0, r2 - MOV r1, r13 ; And read from temp storage. - BEQ oc_idct8x8_slow_v6_cols + ; Clear input data for next block. MOV r4, #0 MOV r5, #0 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 - STRD r4, [r2], #8 -oc_idct8x8_slow_v6_cols + STRD r4, [r1,#-8*16]! + STRD r4, [r1,#8] + STRD r4, [r1,#16] + STRD r4, [r1,#24] + STRD r4, [r1,#32] + STRD r4, [r1,#40] + STRD r4, [r1,#48] + STRD r4, [r1,#56] + STRD r4, [r1,#64] + STRD r4, [r1,#72] + STRD r4, [r1,#80] + STRD r4, [r1,#88] + STRD r4, [r1,#96] + STRD r4, [r1,#104] + STRD r4, [r1,#112] + STRD r4, [r1,#120] + MOV r1, r13 ; And read from temp storage. ; Column transforms BL idct8_8core_down_v6 BL idct8_8core_down_v6 @@ -849,20 +827,16 @@ oc_idct8x8_10_v6 PROC BL idct4_3core_v6 BL idct2_1core_v6 LDR r0, [r13], #4 ; Write to the final destination. - ; Clear input data for next block (decoder only). - SUB r2, r1, #4*16 - CMP r0, r2 - AND r1, r13,#4 ; Align the stack. - BEQ oc_idct8x8_10_v6_cols + ; Clear input data for next block. MOV r4, #0 MOV r5, #0 - STRD r4, [r2] - STRD r4, [r2,#16] - STR r4, [r2,#32] - STR r4, [r2,#48] -oc_idct8x8_10_v6_cols -; Column transforms + STRD r4, [r1,#-4*16]! + STRD r4, [r1,#16] + STR r4, [r1,#32] + STR r4, [r1,#48] + AND r1, r13,#4 ; Align the stack. ADD r1, r1, r13 ; And read from temp storage. +; Column transforms BL idct4_4core_down_v6 BL idct4_4core_down_v6 BL idct4_4core_down_v6 @@ -878,14 +852,12 @@ oc_idct8x8_3_v6 PROC MOV r8, r0 MOV r0, r13 ; Write to temp storage. BL idct2_1core_v6 - ; Clear input data for next block (decoder only). - SUB r0, r1, #2*16 - CMP r0, r8 + ; Clear input data for next block. + MOV r4, #0 + STR r4, [r1,#-2*16]! + STR r4, [r1,#16] MOV r1, r13 ; Read from temp storage. - MOVNE r4, #0 - STRNE r4, [r0] - STRNE r4, [r0,#16] - MOVNE r0, r8 ; Write to the final destination. + MOV r0, r8 ; Write to the final destination. ; Column transforms BL idct2_2core_down_v6 BL idct2_2core_down_v6 @@ -1041,20 +1013,16 @@ oc_idct8x8_6_v6 PROC ADD r0, r0, r13 ; Write to temp storage. BL idct3_2core_v6 BL idct1core_v6 - ; Clear input data for next block (decoder only). - SUB r0, r1, #3*16 - CMP r0, r8 - AND r1, r13,#4 ; Align the stack. - BEQ oc_idct8x8_6_v6_cols + ; Clear input data for next block. MOV r4, #0 MOV r5, #0 - STRD r4, [r0] - STR r4, [r0,#16] - STR r4, [r0,#32] + STRD r4, [r1,#-3*16]! + STR r4, [r1,#16] + STR r4, [r1,#32] + AND r1, r13,#4 ; Align the stack. MOV r0, r8 ; Write to the final destination. -oc_idct8x8_6_v6_cols -; Column transforms ADD r1, r1, r13 ; And read from temp storage. +; Column transforms BL idct3_3core_down_v6 BL idct3_3core_down_v6 BL idct3_3core_down_v6 @@ -1596,7 +1564,6 @@ oc_idct8x8_slow_neon VSWP D23,D30 ; Column transforms BL oc_idct8x8_stage123_neon - CMP r0,r1 ; We have to put the return address back in the LR, or the branch ; predictor will not recognize the function return and mis-predict the ; entire call stack. @@ -1610,7 +1577,6 @@ oc_idct8x8_slow_neon VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]'' VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]' VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]' - BEQ oc_idct8x8_slow_neon_noclear VMOV.I8 Q2,#0 VPOP {D8-D15} VMOV.I8 Q3,#0 @@ -1628,19 +1594,6 @@ oc_idct8x8_slow_neon VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 VSTMIA r0, {D16-D31} MOV PC, r14 - -oc_idct8x8_slow_neon_noclear - VPOP {D8-D15} - VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 - VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 - VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 - VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 - VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 - VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 - VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 - VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 - VSTMIA r0, {D16-D31} - MOV PC, r14 ENDP oc_idct8x8_stage123_neon PROC @@ -1871,7 +1824,6 @@ oc_idct8x8_10_neon PROC VADD.S16 Q10,Q1, Q2 ; Q10= t[1]'=t[0]+t[2] VSUB.S16 Q2, Q1, Q2 ; Q2 = t[2]'=t[0]-t[2] ; Stage 4 - CMP r0, r1 VADD.S16 Q8, Q11,Q15 ; Q8 = y[0]=t[0]'+t[7]' VADD.S16 Q9, Q10,Q14 ; Q9 = y[1]=t[1]'+t[6]'' VSUB.S16 Q15,Q11,Q15 ; Q15 = y[7]=t[0]'-t[7]' @@ -1880,7 +1832,6 @@ oc_idct8x8_10_neon PROC VADD.S16 Q11,Q3, Q12 ; Q11 = y[3]=t[3]'+t[4]' VSUB.S16 Q12,Q3, Q12 ; Q12 = y[4]=t[3]'-t[4]' VSUB.S16 Q13,Q2, Q13 ; Q13 = y[5]=t[2]'-t[5]'' - BEQ oc_idct8x8_10_neon_noclear VMOV.I8 D2, #0 VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 VST1.64 {D2}, [r1@64], r12 @@ -1896,18 +1847,6 @@ oc_idct8x8_10_neon PROC VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 VSTMIA r0, {D16-D31} MOV PC, r14 - -oc_idct8x8_10_neon_noclear - VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 - VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 - VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 - VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 - VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 - VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 - VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 - VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 - VSTMIA r0, {D16-D31} - MOV PC, r14 ENDP ] |