summary | refs | log | tree | commit | diff
path: root/media/libtheora/lib/arm/armidct.s
diff options
context:
space:
mode:
Diffstat (limited to 'media/libtheora/lib/arm/armidct.s')
-rw-r--r-- media/libtheora/lib/arm/armidct.s 177
1 files changed, 119 insertions, 58 deletions
diff --git a/media/libtheora/lib/arm/armidct.s b/media/libtheora/lib/arm/armidct.s
index 68530c7140..babd846ecd 100644
--- a/media/libtheora/lib/arm/armidct.s
+++ b/media/libtheora/lib/arm/armidct.s
@@ -11,11 +11,17 @@
;********************************************************************
; Original implementation:
; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
-; last mod: $Id$
+; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $
;********************************************************************
AREA |.text|, CODE, READONLY
+ ; Explicitly specifying alignment here because some versions of
+ ; gas don't align code correctly. See
+ ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
+ ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
+ ALIGN
+
GET armopts.s
EXPORT oc_idct8x8_1_arm
@@ -64,8 +70,11 @@ oc_idct8x8_slow_arm
BL idct8core_arm
BL idct8core_arm
LDR r0, [r13], #4 ; Write to the final destination.
+ ; Clear input data for next block (decoder only).
SUB r2, r1, #8*16
- ; Clear input data for next block.
+ CMP r0, r2
+ MOV r1, r13 ; And read from temp storage.
+ BEQ oc_idct8x8_slow_arm_cols
MOV r4, #0
MOV r5, #0
MOV r6, #0
@@ -78,7 +87,7 @@ oc_idct8x8_slow_arm
STMIA r2!,{r4,r5,r6,r7}
STMIA r2!,{r4,r5,r6,r7}
STMIA r2!,{r4,r5,r6,r7}
- MOV r1, r13 ; And read from temp storage.
+oc_idct8x8_slow_arm_cols
; Column transforms
BL idct8core_down_arm
BL idct8core_down_arm
@@ -102,15 +111,18 @@ oc_idct8x8_10_arm PROC
BL idct3core_arm
BL idct2core_arm
BL idct1core_arm
- ; Clear input data for next block.
- MOV r4, #0
- STR r4, [r1,#-4*16]!
- STR r4, [r1,#4]
- STR r4, [r1,#16]
- STR r4, [r1,#20]
- STR r4, [r1,#32]
- STR r4, [r1,#48]
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #4*16
+ CMP r0, r2
MOV r1, r13 ; Read from temp storage.
+ BEQ oc_idct8x8_10_arm_cols
+ MOV r4, #0
+ STR r4, [r0]
+ STR r4, [r0,#4]
+ STR r4, [r0,#16]
+ STR r4, [r0,#20]
+ STR r4, [r0,#32]
+ STR r4, [r0,#48]
MOV r0, r2 ; Write to the final destination
oc_idct8x8_10_arm_cols
; Column transforms
@@ -135,14 +147,18 @@ oc_idct8x8_6_arm PROC
BL idct3core_arm
BL idct2core_arm
BL idct1core_arm
- ; Clear input data for next block.
- MOV r4, #0
- STR r4, [r1,#-3*16]!
- STR r4, [r1,#4]
- STR r4, [r1,#16]
- STR r4, [r1,#32]
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #3*16
+ CMP r0, r2
MOV r1, r13 ; Read from temp storage.
+ BEQ oc_idct8x8_6_arm_cols
+ MOV r4, #0
+ STR r4, [r0]
+ STR r4, [r0,#4]
+ STR r4, [r0,#16]
+ STR r4, [r0,#32]
MOV r0, r2 ; Write to the final destination
+oc_idct8x8_6_arm_cols
; Column transforms
BL idct3core_down_arm
BL idct3core_down_arm
@@ -164,12 +180,14 @@ oc_idct8x8_3_arm PROC
MOV r0, r13 ; Write to temp storage.
BL idct2core_arm
BL idct1core_arm
- ; Clear input data for next block.
- MOV r4, #0
- STR r4, [r1,#-2*16]!
- STR r4, [r1,#16]
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #2*16
+ CMP r0, r2
MOV r1, r13 ; Read from temp storage.
- MOV r0, r2 ; Write to the final destination
+ MOVNE r4, #0
+ STRNE r4, [r0]
+ STRNE r4, [r0,#16]
+ MOVNE r0, r2 ; Write to the final destination
; Column transforms
BL idct2core_down_arm
BL idct2core_down_arm
@@ -787,26 +805,30 @@ oc_idct8x8_slow_v6
BL idct8_8core_v6
BL idct8_8core_v6
LDR r0, [r13], #4 ; Write to the final destination.
- ; Clear input data for next block.
+ ; Clear input data for next block (decoder only).
+ SUB r2, r1, #8*16
+ CMP r0, r2
+ MOV r1, r13 ; And read from temp storage.
+ BEQ oc_idct8x8_slow_v6_cols
MOV r4, #0
MOV r5, #0
- STRD r4, [r1,#-8*16]!
- STRD r4, [r1,#8]
- STRD r4, [r1,#16]
- STRD r4, [r1,#24]
- STRD r4, [r1,#32]
- STRD r4, [r1,#40]
- STRD r4, [r1,#48]
- STRD r4, [r1,#56]
- STRD r4, [r1,#64]
- STRD r4, [r1,#72]
- STRD r4, [r1,#80]
- STRD r4, [r1,#88]
- STRD r4, [r1,#96]
- STRD r4, [r1,#104]
- STRD r4, [r1,#112]
- STRD r4, [r1,#120]
- MOV r1, r13 ; And read from temp storage.
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+oc_idct8x8_slow_v6_cols
; Column transforms
BL idct8_8core_down_v6
BL idct8_8core_down_v6
@@ -827,16 +849,20 @@ oc_idct8x8_10_v6 PROC
BL idct4_3core_v6
BL idct2_1core_v6
LDR r0, [r13], #4 ; Write to the final destination.
- ; Clear input data for next block.
+ ; Clear input data for next block (decoder only).
+ SUB r2, r1, #4*16
+ CMP r0, r2
+ AND r1, r13,#4 ; Align the stack.
+ BEQ oc_idct8x8_10_v6_cols
MOV r4, #0
MOV r5, #0
- STRD r4, [r1,#-4*16]!
- STRD r4, [r1,#16]
- STR r4, [r1,#32]
- STR r4, [r1,#48]
- AND r1, r13,#4 ; Align the stack.
- ADD r1, r1, r13 ; And read from temp storage.
+ STRD r4, [r2]
+ STRD r4, [r2,#16]
+ STR r4, [r2,#32]
+ STR r4, [r2,#48]
+oc_idct8x8_10_v6_cols
; Column transforms
+ ADD r1, r1, r13 ; And read from temp storage.
BL idct4_4core_down_v6
BL idct4_4core_down_v6
BL idct4_4core_down_v6
@@ -852,12 +878,14 @@ oc_idct8x8_3_v6 PROC
MOV r8, r0
MOV r0, r13 ; Write to temp storage.
BL idct2_1core_v6
- ; Clear input data for next block.
- MOV r4, #0
- STR r4, [r1,#-2*16]!
- STR r4, [r1,#16]
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #2*16
+ CMP r0, r8
MOV r1, r13 ; Read from temp storage.
- MOV r0, r8 ; Write to the final destination.
+ MOVNE r4, #0
+ STRNE r4, [r0]
+ STRNE r4, [r0,#16]
+ MOVNE r0, r8 ; Write to the final destination.
; Column transforms
BL idct2_2core_down_v6
BL idct2_2core_down_v6
@@ -1013,16 +1041,20 @@ oc_idct8x8_6_v6 PROC
ADD r0, r0, r13 ; Write to temp storage.
BL idct3_2core_v6
BL idct1core_v6
- ; Clear input data for next block.
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #3*16
+ CMP r0, r8
+ AND r1, r13,#4 ; Align the stack.
+ BEQ oc_idct8x8_6_v6_cols
MOV r4, #0
MOV r5, #0
- STRD r4, [r1,#-3*16]!
- STR r4, [r1,#16]
- STR r4, [r1,#32]
- AND r1, r13,#4 ; Align the stack.
+ STRD r4, [r0]
+ STR r4, [r0,#16]
+ STR r4, [r0,#32]
MOV r0, r8 ; Write to the final destination.
- ADD r1, r1, r13 ; And read from temp storage.
+oc_idct8x8_6_v6_cols
; Column transforms
+ ADD r1, r1, r13 ; And read from temp storage.
BL idct3_3core_down_v6
BL idct3_3core_down_v6
BL idct3_3core_down_v6
@@ -1564,6 +1596,7 @@ oc_idct8x8_slow_neon
VSWP D23,D30
; Column transforms
BL oc_idct8x8_stage123_neon
+ CMP r0,r1
; We have to put the return address back in the LR, or the branch
; predictor will not recognize the function return and mis-predict the
; entire call stack.
@@ -1577,6 +1610,7 @@ oc_idct8x8_slow_neon
VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]''
VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]'
VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]'
+ BEQ oc_idct8x8_slow_neon_noclear
VMOV.I8 Q2,#0
VPOP {D8-D15}
VMOV.I8 Q3,#0
@@ -1594,6 +1628,19 @@ oc_idct8x8_slow_neon
VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
VSTMIA r0, {D16-D31}
MOV PC, r14
+
+oc_idct8x8_slow_neon_noclear
+ VPOP {D8-D15}
+ VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
+ VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
+ VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
+ VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
+ VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
+ VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
+ VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
+ VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
+ VSTMIA r0, {D16-D31}
+ MOV PC, r14
ENDP
oc_idct8x8_stage123_neon PROC
@@ -1824,6 +1871,7 @@ oc_idct8x8_10_neon PROC
VADD.S16 Q10,Q1, Q2 ; Q10= t[1]'=t[0]+t[2]
VSUB.S16 Q2, Q1, Q2 ; Q2 = t[2]'=t[0]-t[2]
; Stage 4
+ CMP r0, r1
VADD.S16 Q8, Q11,Q15 ; Q8 = y[0]=t[0]'+t[7]'
VADD.S16 Q9, Q10,Q14 ; Q9 = y[1]=t[1]'+t[6]''
VSUB.S16 Q15,Q11,Q15 ; Q15 = y[7]=t[0]'-t[7]'
@@ -1832,6 +1880,7 @@ oc_idct8x8_10_neon PROC
VADD.S16 Q11,Q3, Q12 ; Q11 = y[3]=t[3]'+t[4]'
VSUB.S16 Q12,Q3, Q12 ; Q12 = y[4]=t[3]'-t[4]'
VSUB.S16 Q13,Q2, Q13 ; Q13 = y[5]=t[2]'-t[5]''
+ BEQ oc_idct8x8_10_neon_noclear
VMOV.I8 D2, #0
VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
VST1.64 {D2}, [r1@64], r12
@@ -1847,6 +1896,18 @@ oc_idct8x8_10_neon PROC
VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
VSTMIA r0, {D16-D31}
MOV PC, r14
+
+oc_idct8x8_10_neon_noclear
+ VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
+ VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
+ VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
+ VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
+ VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
+ VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
+ VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
+ VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
+ VSTMIA r0, {D16-D31}
+ MOV PC, r14
ENDP
]