diff options
Diffstat (limited to 'media/libtheora/lib/arm/armidct.s')
-rw-r--r-- | media/libtheora/lib/arm/armidct.s | 177 |
1 files changed, 119 insertions, 58 deletions
diff --git a/media/libtheora/lib/arm/armidct.s b/media/libtheora/lib/arm/armidct.s index 68530c7140..babd846ecd 100644 --- a/media/libtheora/lib/arm/armidct.s +++ b/media/libtheora/lib/arm/armidct.s @@ -11,11 +11,17 @@ ;******************************************************************** ; Original implementation: ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd -; last mod: $Id$ +; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $ ;******************************************************************** AREA |.text|, CODE, READONLY + ; Explicitly specifying alignment here because some versions of + ; gas don't align code correctly. See + ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html + ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 + ALIGN + GET armopts.s EXPORT oc_idct8x8_1_arm @@ -64,8 +70,11 @@ oc_idct8x8_slow_arm BL idct8core_arm BL idct8core_arm LDR r0, [r13], #4 ; Write to the final destination. + ; Clear input data for next block (decoder only). SUB r2, r1, #8*16 - ; Clear input data for next block. + CMP r0, r2 + MOV r1, r13 ; And read from temp storage. + BEQ oc_idct8x8_slow_arm_cols MOV r4, #0 MOV r5, #0 MOV r6, #0 @@ -78,7 +87,7 @@ oc_idct8x8_slow_arm STMIA r2!,{r4,r5,r6,r7} STMIA r2!,{r4,r5,r6,r7} STMIA r2!,{r4,r5,r6,r7} - MOV r1, r13 ; And read from temp storage. +oc_idct8x8_slow_arm_cols ; Column transforms BL idct8core_down_arm BL idct8core_down_arm @@ -102,15 +111,18 @@ oc_idct8x8_10_arm PROC BL idct3core_arm BL idct2core_arm BL idct1core_arm - ; Clear input data for next block. - MOV r4, #0 - STR r4, [r1,#-4*16]! - STR r4, [r1,#4] - STR r4, [r1,#16] - STR r4, [r1,#20] - STR r4, [r1,#32] - STR r4, [r1,#48] + ; Clear input data for next block (decoder only). + SUB r0, r1, #4*16 + CMP r0, r2 MOV r1, r13 ; Read from temp storage. + BEQ oc_idct8x8_10_arm_cols + MOV r4, #0 + STR r4, [r0] + STR r4, [r0,#4] + STR r4, [r0,#16] + STR r4, [r0,#20] + STR r4, [r0,#32] + STR r4, [r0,#48] MOV r0, r2 ; Write to the final destination oc_idct8x8_10_arm_cols ; Column transforms @@ -135,14 +147,18 @@ oc_idct8x8_6_arm PROC BL idct3core_arm BL idct2core_arm BL idct1core_arm - ; Clear input data for next block. - MOV r4, #0 - STR r4, [r1,#-3*16]! - STR r4, [r1,#4] - STR r4, [r1,#16] - STR r4, [r1,#32] + ; Clear input data for next block (decoder only). + SUB r0, r1, #3*16 + CMP r0, r2 MOV r1, r13 ; Read from temp storage. + BEQ oc_idct8x8_6_arm_cols + MOV r4, #0 + STR r4, [r0] + STR r4, [r0,#4] + STR r4, [r0,#16] + STR r4, [r0,#32] MOV r0, r2 ; Write to the final destination +oc_idct8x8_6_arm_cols ; Column transforms BL idct3core_down_arm BL idct3core_down_arm @@ -164,12 +180,14 @@ oc_idct8x8_3_arm PROC MOV r0, r13 ; Write to temp storage. BL idct2core_arm BL idct1core_arm - ; Clear input data for next block. - MOV r4, #0 - STR r4, [r1,#-2*16]! - STR r4, [r1,#16] + ; Clear input data for next block (decoder only). + SUB r0, r1, #2*16 + CMP r0, r2 MOV r1, r13 ; Read from temp storage. - MOV r0, r2 ; Write to the final destination + MOVNE r4, #0 + STRNE r4, [r0] + STRNE r4, [r0,#16] + MOVNE r0, r2 ; Write to the final destination ; Column transforms BL idct2core_down_arm BL idct2core_down_arm @@ -787,26 +805,30 @@ oc_idct8x8_slow_v6 BL idct8_8core_v6 BL idct8_8core_v6 LDR r0, [r13], #4 ; Write to the final destination. - ; Clear input data for next block. + ; Clear input data for next block (decoder only). + SUB r2, r1, #8*16 + CMP r0, r2 + MOV r1, r13 ; And read from temp storage. + BEQ oc_idct8x8_slow_v6_cols MOV r4, #0 MOV r5, #0 - STRD r4, [r1,#-8*16]! - STRD r4, [r1,#8] - STRD r4, [r1,#16] - STRD r4, [r1,#24] - STRD r4, [r1,#32] - STRD r4, [r1,#40] - STRD r4, [r1,#48] - STRD r4, [r1,#56] - STRD r4, [r1,#64] - STRD r4, [r1,#72] - STRD r4, [r1,#80] - STRD r4, [r1,#88] - STRD r4, [r1,#96] - STRD r4, [r1,#104] - STRD r4, [r1,#112] - STRD r4, [r1,#120] - MOV r1, r13 ; And read from temp storage. + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 +oc_idct8x8_slow_v6_cols ; Column transforms BL idct8_8core_down_v6 BL idct8_8core_down_v6 @@ -827,16 +849,20 @@ oc_idct8x8_10_v6 PROC BL idct4_3core_v6 BL idct2_1core_v6 LDR r0, [r13], #4 ; Write to the final destination. - ; Clear input data for next block. + ; Clear input data for next block (decoder only). + SUB r2, r1, #4*16 + CMP r0, r2 + AND r1, r13,#4 ; Align the stack. + BEQ oc_idct8x8_10_v6_cols MOV r4, #0 MOV r5, #0 - STRD r4, [r1,#-4*16]! - STRD r4, [r1,#16] - STR r4, [r1,#32] - STR r4, [r1,#48] - AND r1, r13,#4 ; Align the stack. - ADD r1, r1, r13 ; And read from temp storage. + STRD r4, [r2] + STRD r4, [r2,#16] + STR r4, [r2,#32] + STR r4, [r2,#48] +oc_idct8x8_10_v6_cols ; Column transforms + ADD r1, r1, r13 ; And read from temp storage. BL idct4_4core_down_v6 BL idct4_4core_down_v6 BL idct4_4core_down_v6 @@ -852,12 +878,14 @@ oc_idct8x8_3_v6 PROC MOV r8, r0 MOV r0, r13 ; Write to temp storage. BL idct2_1core_v6 - ; Clear input data for next block. - MOV r4, #0 - STR r4, [r1,#-2*16]! - STR r4, [r1,#16] + ; Clear input data for next block (decoder only). + SUB r0, r1, #2*16 + CMP r0, r8 MOV r1, r13 ; Read from temp storage. - MOV r0, r8 ; Write to the final destination. + MOVNE r4, #0 + STRNE r4, [r0] + STRNE r4, [r0,#16] + MOVNE r0, r8 ; Write to the final destination. ; Column transforms BL idct2_2core_down_v6 BL idct2_2core_down_v6 @@ -1013,16 +1041,20 @@ oc_idct8x8_6_v6 PROC ADD r0, r0, r13 ; Write to temp storage. BL idct3_2core_v6 BL idct1core_v6 - ; Clear input data for next block. + ; Clear input data for next block (decoder only). + SUB r0, r1, #3*16 + CMP r0, r8 + AND r1, r13,#4 ; Align the stack. + BEQ oc_idct8x8_6_v6_cols MOV r4, #0 MOV r5, #0 - STRD r4, [r1,#-3*16]! - STR r4, [r1,#16] - STR r4, [r1,#32] - AND r1, r13,#4 ; Align the stack. + STRD r4, [r0] + STR r4, [r0,#16] + STR r4, [r0,#32] MOV r0, r8 ; Write to the final destination. - ADD r1, r1, r13 ; And read from temp storage. +oc_idct8x8_6_v6_cols ; Column transforms + ADD r1, r1, r13 ; And read from temp storage. BL idct3_3core_down_v6 BL idct3_3core_down_v6 BL idct3_3core_down_v6 @@ -1564,6 +1596,7 @@ oc_idct8x8_slow_neon VSWP D23,D30 ; Column transforms BL oc_idct8x8_stage123_neon + CMP r0,r1 ; We have to put the return address back in the LR, or the branch ; predictor will not recognize the function return and mis-predict the ; entire call stack. @@ -1577,6 +1610,7 @@ oc_idct8x8_slow_neon VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]'' VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]' VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]' + BEQ oc_idct8x8_slow_neon_noclear VMOV.I8 Q2,#0 VPOP {D8-D15} VMOV.I8 Q3,#0 @@ -1594,6 +1628,19 @@ oc_idct8x8_slow_neon VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 VSTMIA r0, {D16-D31} MOV PC, r14 + +oc_idct8x8_slow_neon_noclear + VPOP {D8-D15} + VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 + VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 + VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 + VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 + VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 + VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 + VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 + VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 + VSTMIA r0, {D16-D31} + MOV PC, r14 ENDP oc_idct8x8_stage123_neon PROC @@ -1824,6 +1871,7 @@ oc_idct8x8_10_neon PROC VADD.S16 Q10,Q1, Q2 ; Q10= t[1]'=t[0]+t[2] VSUB.S16 Q2, Q1, Q2 ; Q2 = t[2]'=t[0]-t[2] ; Stage 4 + CMP r0, r1 VADD.S16 Q8, Q11,Q15 ; Q8 = y[0]=t[0]'+t[7]' VADD.S16 Q9, Q10,Q14 ; Q9 = y[1]=t[1]'+t[6]'' VSUB.S16 Q15,Q11,Q15 ; Q15 = y[7]=t[0]'-t[7]' @@ -1832,6 +1880,7 @@ oc_idct8x8_10_neon PROC VADD.S16 Q11,Q3, Q12 ; Q11 = y[3]=t[3]'+t[4]' VSUB.S16 Q12,Q3, Q12 ; Q12 = y[4]=t[3]'-t[4]' VSUB.S16 Q13,Q2, Q13 ; Q13 = y[5]=t[2]'-t[5]'' + BEQ oc_idct8x8_10_neon_noclear VMOV.I8 D2, #0 VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 VST1.64 {D2}, [r1@64], r12 @@ -1847,6 +1896,18 @@ oc_idct8x8_10_neon PROC VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 VSTMIA r0, {D16-D31} MOV PC, r14 + +oc_idct8x8_10_neon_noclear + VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 + VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 + VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 + VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 + VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 + VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 + VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 + VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 + VSTMIA r0, {D16-D31} + MOV PC, r14 ENDP ] |