1 files changed, 119 insertions, 58 deletions
diff --git a/media/libtheora/lib/arm/armidct.s b/media/libtheora/lib/arm/armidct.s
index 68530c7140..babd846ecd 100644
--- a/media/libtheora/lib/arm/armidct.s
+++ b/media/libtheora/lib/arm/armidct.s
@@ -11,11 +11,17 @@
 ;********************************************************************
 ; Original implementation:
 ;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
-; last mod: $Id$
+; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $
 ;********************************************************************
 
 	AREA	|.text|, CODE, READONLY
 
+	; Explicitly specifying alignment here because some versions of
+	; gas don't align code correctly. See
+	; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
+	; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
+	ALIGN
+
 	GET	armopts.s
 
 	EXPORT	oc_idct8x8_1_arm
@@ -64,8 +70,11 @@ oc_idct8x8_slow_arm
 	BL	idct8core_arm
 	BL	idct8core_arm
 	LDR	r0, [r13], #4	; Write to the final destination.
+	; Clear input data for next block (decoder only); skipped if r0 == r2.
 	SUB	r2, r1, #8*16
-	; Clear input data for next block.
+	CMP	r0, r2	; Is the destination (r0) the block about to be cleared (r2)?
+	MOV	r1, r13		; And read from temp storage.
+	BEQ	oc_idct8x8_slow_arm_cols	; Yes: skip the zeroing stores.
 	MOV	r4, #0
 	MOV	r5, #0
 	MOV	r6, #0
@@ -78,7 +87,7 @@ oc_idct8x8_slow_arm
 	STMIA	r2!,{r4,r5,r6,r7}
 	STMIA	r2!,{r4,r5,r6,r7}
 	STMIA	r2!,{r4,r5,r6,r7}
-	MOV	r1, r13		; And read from temp storage.
+oc_idct8x8_slow_arm_cols
 ; Column transforms
 	BL	idct8core_down_arm
 	BL	idct8core_down_arm
@@ -102,15 +111,18 @@ oc_idct8x8_10_arm PROC
 	BL	idct3core_arm
 	BL	idct2core_arm
 	BL	idct1core_arm
-	; Clear input data for next block.
-	MOV	r4, #0
-	STR	r4, [r1,#-4*16]!
-	STR	r4, [r1,#4]
-	STR	r4, [r1,#16]
-	STR	r4, [r1,#20]
-	STR	r4, [r1,#32]
-	STR	r4, [r1,#48]
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #4*16	; r0 = base of the input data to clear.
+	CMP	r0, r2	; Is it also the final destination (r2)?
 	MOV	r1, r13		; Read from temp storage.
+	BEQ	oc_idct8x8_10_arm_cols	; Yes: skip the clear; r0 already equals r2.
+	MOV	r4, #0
+	STR	r4, [r0]
+	STR	r4, [r0,#4]
+	STR	r4, [r0,#16]
+	STR	r4, [r0,#20]
+	STR	r4, [r0,#32]
+	STR	r4, [r0,#48]
 	MOV	r0, r2		; Write to the final destination
 oc_idct8x8_10_arm_cols
 ; Column transforms
@@ -135,14 +147,18 @@ oc_idct8x8_6_arm PROC
 	BL	idct3core_arm
 	BL	idct2core_arm
 	BL	idct1core_arm
-	; Clear input data for next block.
-	MOV	r4, #0
-	STR	r4, [r1,#-3*16]!
-	STR	r4, [r1,#4]
-	STR	r4, [r1,#16]
-	STR	r4, [r1,#32]
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #3*16	; r0 = base of the input data to clear.
+	CMP	r0, r2	; Is it also the final destination (r2)?
 	MOV	r1, r13		; Read from temp storage.
+	BEQ	oc_idct8x8_6_arm_cols	; Yes: skip the clear; r0 already equals r2.
+	MOV	r4, #0
+	STR	r4, [r0]
+	STR	r4, [r0,#4]
+	STR	r4, [r0,#16]
+	STR	r4, [r0,#32]
 	MOV	r0, r2		; Write to the final destination
+oc_idct8x8_6_arm_cols
 ; Column transforms
 	BL	idct3core_down_arm
 	BL	idct3core_down_arm
@@ -164,12 +180,14 @@ oc_idct8x8_3_arm PROC
 	MOV	r0, r13		; Write to temp storage.
 	BL	idct2core_arm
 	BL	idct1core_arm
-	; Clear input data for next block.
-	MOV	r4, #0
-	STR	r4, [r1,#-2*16]!
-	STR	r4, [r1,#16]
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #2*16	; r0 = base of the input data to clear.
+	CMP	r0, r2	; Is it also the final destination (r2)?
 	MOV	r1, r13		; Read from temp storage.
-	MOV	r0, r2		; Write to the final destination
+	MOVNE	r4, #0	; NE only: zero the input when it is not the destination.
+	STRNE	r4, [r0]
+	STRNE	r4, [r0,#16]
+	MOVNE	r0, r2		; Write to the final destination (on EQ, r0 == r2 already)
 ; Column transforms
 	BL	idct2core_down_arm
 	BL	idct2core_down_arm
@@ -787,26 +805,30 @@ oc_idct8x8_slow_v6
 	BL	idct8_8core_v6
 	BL	idct8_8core_v6
 	LDR	r0, [r13], #4	; Write to the final destination.
-	; Clear input data for next block.
+	; Clear input data for next block (decoder only).
+	SUB	r2, r1, #8*16	; r2 = base of the input data to clear.
+	CMP	r0, r2	; Is it also the final destination (r0)?
+	MOV	r1, r13		; And read from temp storage.
+	BEQ	oc_idct8x8_slow_v6_cols	; Yes: skip the zeroing stores.
 	MOV	r4, #0
 	MOV	r5, #0
-	STRD	r4, [r1,#-8*16]!
-	STRD	r4, [r1,#8]
-	STRD	r4, [r1,#16]
-	STRD	r4, [r1,#24]
-	STRD	r4, [r1,#32]
-	STRD	r4, [r1,#40]
-	STRD	r4, [r1,#48]
-	STRD	r4, [r1,#56]
-	STRD	r4, [r1,#64]
-	STRD	r4, [r1,#72]
-	STRD	r4, [r1,#80]
-	STRD	r4, [r1,#88]
-	STRD	r4, [r1,#96]
-	STRD	r4, [r1,#104]
-	STRD	r4, [r1,#112]
-	STRD	r4, [r1,#120]
-	MOV	r1, r13		; And read from temp storage.
+	STRD	r4, [r2], #8	; 16 post-indexed 8-byte stores of r4:r5 = 8*16 bytes.
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+oc_idct8x8_slow_v6_cols
 ; Column transforms
 	BL	idct8_8core_down_v6
 	BL	idct8_8core_down_v6
@@ -827,16 +849,20 @@ oc_idct8x8_10_v6 PROC
 	BL	idct4_3core_v6
 	BL	idct2_1core_v6
 	LDR	r0, [r13], #4	; Write to the final destination.
-	; Clear input data for next block.
+	; Clear input data for next block (decoder only).
+	SUB	r2, r1, #4*16	; r2 = base of the input data to clear.
+	CMP	r0, r2	; Is it also the final destination (r0)?
+	AND	r1, r13,#4	; Align the stack.
+	BEQ	oc_idct8x8_10_v6_cols	; Yes: skip the zeroing stores.
 	MOV	r4, #0
 	MOV	r5, #0
-	STRD	r4, [r1,#-4*16]!
-	STRD	r4, [r1,#16]
-	STR	r4, [r1,#32]
-	STR	r4, [r1,#48]
-	AND	r1, r13,#4	; Align the stack.
-	ADD	r1, r1, r13	; And read from temp storage.
+	STRD	r4, [r2]
+	STRD	r4, [r2,#16]
+	STR	r4, [r2,#32]
+	STR	r4, [r2,#48]
+oc_idct8x8_10_v6_cols
 ; Column transforms
+	ADD	r1, r1, r13	; And read from temp storage (runs on both paths).
 	BL	idct4_4core_down_v6
 	BL	idct4_4core_down_v6
 	BL	idct4_4core_down_v6
@@ -852,12 +878,14 @@ oc_idct8x8_3_v6 PROC
 	MOV	r8, r0
 	MOV	r0, r13		; Write to temp storage.
 	BL	idct2_1core_v6
-	; Clear input data for next block.
-	MOV	r4, #0
-	STR	r4, [r1,#-2*16]!
-	STR	r4, [r1,#16]
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #2*16	; r0 = base of the input data to clear.
+	CMP	r0, r8	; Is it also the final destination (saved in r8)?
 	MOV	r1, r13		; Read from temp storage.
-	MOV	r0, r8		; Write to the final destination.
+	MOVNE	r4, #0	; NE only: zero the input when it is not the destination.
+	STRNE	r4, [r0]
+	STRNE	r4, [r0,#16]
+	MOVNE	r0, r8		; Write to the final destination (on EQ, r0 == r8 already).
 ; Column transforms
 	BL	idct2_2core_down_v6
 	BL	idct2_2core_down_v6
@@ -1013,16 +1041,20 @@ oc_idct8x8_6_v6 PROC
 	ADD	r0, r0, r13	; Write to temp storage.
 	BL	idct3_2core_v6
 	BL	idct1core_v6
-	; Clear input data for next block.
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #3*16	; r0 = base of the input data to clear.
+	CMP	r0, r8	; Is it also the final destination (r8)?
+	AND	r1, r13,#4	; Align the stack.
+	BEQ	oc_idct8x8_6_v6_cols	; Yes: skip the clear; r0 already equals r8.
 	MOV	r4, #0
 	MOV	r5, #0
-	STRD	r4, [r1,#-3*16]!
-	STR	r4, [r1,#16]
-	STR	r4, [r1,#32]
-	AND	r1, r13,#4	; Align the stack.
+	STRD	r4, [r0]
+	STR	r4, [r0,#16]
+	STR	r4, [r0,#32]
 	MOV	r0, r8		; Write to the final destination.
-	ADD	r1, r1, r13	; And read from temp storage.
+oc_idct8x8_6_v6_cols
 ; Column transforms
+	ADD	r1, r1, r13	; And read from temp storage (runs on both paths).
 	BL	idct3_3core_down_v6
 	BL	idct3_3core_down_v6
 	BL	idct3_3core_down_v6
@@ -1564,6 +1596,7 @@ oc_idct8x8_slow_neon
 	VSWP		D23,D30
 	; Column transforms
 	BL	oc_idct8x8_stage123_neon
+	CMP	r0,r1	; For the BEQ below. NOTE(review): r1 presumably = input block; confirm.
 	; We have to put the return address back in the LR, or the branch
 	;  predictor will not recognize the function return and mis-predict the
 	;  entire call stack.
@@ -1577,6 +1610,7 @@ oc_idct8x8_slow_neon
 	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
 	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
 	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	BEQ		oc_idct8x8_slow_neon_noclear	; r0 == r1: just round and store.
 	VMOV.I8		Q2,#0
 	VPOP		{D8-D15}
 	VMOV.I8		Q3,#0
@@ -1594,6 +1628,19 @@ oc_idct8x8_slow_neon
 	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
 	VSTMIA		r0, {D16-D31}
 	MOV	PC, r14
+
+oc_idct8x8_slow_neon_noclear
+	VPOP		{D8-D15}	; Same pop as on the clearing path.
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}	; Store y[0..7] (Q8-Q15) at r0.
+	MOV	PC, r14	; Return.
 	ENDP
 
 oc_idct8x8_stage123_neon PROC
@@ -1824,6 +1871,7 @@ oc_idct8x8_10_neon PROC
 	VADD.S16	Q10,Q1, Q2	; Q10= t[1]'=t[0]+t[2]
 	VSUB.S16	Q2, Q1, Q2	; Q2 = t[2]'=t[0]-t[2]
 ; Stage 4
+	CMP	r0, r1	; Output (r0) same as the block zeroed via r1 below?
 	VADD.S16	Q8, Q11,Q15	; Q8  = y[0]=t[0]'+t[7]'
 	VADD.S16	Q9, Q10,Q14	; Q9  = y[1]=t[1]'+t[6]''
 	VSUB.S16	Q15,Q11,Q15	; Q15 = y[7]=t[0]'-t[7]'
@@ -1832,6 +1880,7 @@ oc_idct8x8_10_neon PROC
 	VADD.S16	Q11,Q3, Q12	; Q11 = y[3]=t[3]'+t[4]'
 	VSUB.S16	Q12,Q3, Q12	; Q12 = y[4]=t[3]'-t[4]'
 	VSUB.S16	Q13,Q2, Q13	; Q13 = y[5]=t[2]'-t[5]''
+	BEQ	oc_idct8x8_10_neon_noclear	; Yes: round and store without zeroing.
 	VMOV.I8		D2, #0
 	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
 	VST1.64		{D2}, [r1@64], r12
@@ -1847,6 +1896,18 @@ oc_idct8x8_10_neon PROC
 	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
 	VSTMIA		r0, {D16-D31}
 	MOV	PC, r14
+
+oc_idct8x8_10_neon_noclear
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}	; Store y[0..7] (Q8-Q15) at r0.
+	MOV	PC, r14	; Return.
 	ENDP
  ]