gfx/cairo/libpixman/src/pixman-mips-memcpy-asm.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382

/*
 * Copyright (c) 2012
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "pixman-mips-dspr2-asm.h"

/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * Endian-neutral names for the MIPS unaligned word access instructions
 * (lwl/lwr/swl/swr).  The routine below reads an unaligned word at
 * address p with the pair "LWHI reg, 0(p)" + "LWLO reg, 3(p)", and uses
 * a lone LWHI/SWHI at offset 0 to transfer only the bytes from p up to
 * the end of the aligned word that contains p.
 * EB selects the big-endian mapping; it is expected to be defined by the
 * build when targeting a big-endian CPU (not set anywhere in this file).
 * SWLO is defined for symmetry; the code in this file does not use it.
 */
#ifdef EB
#  define LWHI	lwl		/* high part is left in big-endian */
#  define SWHI	swl		/* high part is left in big-endian */
#  define LWLO	lwr		/* low part is right in big-endian */
#  define SWLO	swr		/* low part is right in big-endian */
#else
#  define LWHI	lwr		/* high part is right in little-endian */
#  define SWHI	swr		/* high part is right in little-endian */
#  define LWLO	lwl		/* low part is left in little-endian */
#  define SWLO	swl		/* low part is left in little-endian */
#endif

/*
 * pixman_mips_fast_memcpy
 *
 * Register usage, as established by the code below:
 *   a0  destination pointer (advanced as bytes are stored)
 *   a1  source pointer (advanced as bytes are loaded)
 *   a2  number of bytes still to copy
 *   v0  return value: the destination pointer as it was on entry
 *   AT, v1, a3, t0-t9 are used as scratch.
 *
 * Strategy: copies shorter than 8 bytes go straight to a byte loop.
 * Otherwise the dst is first advanced to a word boundary, then the bulk is
 * moved in 64-byte chunks (with prefetching), then at most one 32-byte
 * chunk, then single words, and finally the trailing bytes one at a time.
 * If src and dst have the same alignment within a word, plain lw/sw are
 * used; otherwise ($unaligned) every source word is assembled with an
 * LWHI/LWLO pair while the stores to the aligned dst remain plain sw.
 *
 * NOTE(review): the instruction that follows each branch or jump is
 * written as its delay slot and runs whether or not the branch is taken.
 * This relies on noreorder assembly, presumably selected by LEAF_MIPS32R2
 * - confirm in pixman-mips-dspr2-asm.h.
 */
LEAF_MIPS32R2(pixman_mips_fast_memcpy)

	slti	AT, a2, 8	/* AT = 1 when a2 < 8 (signed compare) */
	bne	AT, zero, $last8	/* short copy: use the byte loop only */
	move	v0, a0	/* (delay slot) memcpy returns the dst pointer */

/* Test if the src and dst are word-aligned, or can be made word-aligned */
	xor	t8, a1, a0
	andi	t8, t8, 0x3		/* t8 is a0/a1 word-displacement */

	bne	t8, zero, $unaligned	/* different alignment within a word */
	negu	a3, a0	/* (delay slot) a3 = -dst, consumed on both paths */

	andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */
	beq	a3, zero, $chk16w	/* when a3=0 then the dst (a0) is word-aligned */
	subu	a2, a2, a3	/* (delay slot) now a2 is the remaining bytes count */

	/*
	 * LWHI/SWHI at offset 0 transfer only the bytes from the address up
	 * to the end of its aligned word, i.e. exactly the a3 leading bytes.
	 */
	LWHI	t8, 0(a1)
	addu	a1, a1, a3
	SWHI	t8, 0(a0)
	addu	a0, a0, a3

/* Now the dst/src are mutually word-aligned with word-aligned addresses */
$chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
				/* t8 is the byte count after 64-byte chunks */

	beq	a2, t8, $chk8w	/* if a2==t8, no 64-byte chunks */
				/* There will be at most 1 32-byte chunk after it */
	subu	a3, a2, t8	/* (delay slot) subtract from a2 the remainder */
                                /* Here a3 counts bytes in 16w chunks */
	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */

	addu	t0, a0, a2	/* t0 is the "past the end" address */

/*
 * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
 * the "t0-32" address
 * This means: for x=128 the last "safe" a0 address is "t0-160"
 * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
 * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
 */
	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */

	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
	sgtu	v1, a0, t9	/* v1 = 1 when a0 is past the safe limit (unsigned) */
	bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
	nop
/* otherwise, start with using pref30 */
	pref	30, 64(a0)
/* Aligned main loop: 64 bytes per iteration, as two 8-word halves */
$loop16w:
	pref	0, 96(a1)
	lw	t0, 0(a1)
	bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */
	lw	t1, 4(a1)	/* (delay slot) runs on both paths */
	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
$skip_pref30_96:
	lw	t2, 8(a1)
	lw	t3, 12(a1)
	lw	t4, 16(a1)
	lw	t5, 20(a1)
	lw	t6, 24(a1)
	lw	t7, 28(a1)
        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)

	lw	t0, 32(a1)
	bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */
	lw	t1, 36(a1)	/* (delay slot) runs on both paths */
	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
$skip_pref30_128:
	lw	t2, 40(a1)
	lw	t3, 44(a1)
	lw	t4, 48(a1)
	lw	t5, 52(a1)
	lw	t6, 56(a1)
	lw	t7, 60(a1)
        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */

	sw	t0, 32(a0)
	sw	t1, 36(a0)
	sw	t2, 40(a0)
	sw	t3, 44(a0)
	sw	t4, 48(a0)
	sw	t5, 52(a0)
	sw	t6, 56(a0)
	sw	t7, 60(a0)

	addiu	a0, a0, 64	/* adding 64 to dest */
	sgtu	v1, a0, t9	/* re-evaluate whether "pref 30" is still safe */
	bne	a0, a3, $loop16w	/* loop until dst reaches the end of the chunks */
	addiu	a1, a1, 64	/* (delay slot) adding 64 to src */
	move	a2, t8	/* a2 = bytes left after the 64-byte chunks */

/* Here we have src and dest word-aligned but less than 64-bytes to go */

$chk8w:
	pref 0, 0x0(a1)
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
				/* the t8 is the remainder count past 32-bytes */
	beq	a2, t8, $chk1w	/* when a2=t8, no 32-byte chunk */
	 nop

	lw	t0, 0(a1)
	lw	t1, 4(a1)
	lw	t2, 8(a1)
	lw	t3, 12(a1)
	lw	t4, 16(a1)
	lw	t5, 20(a1)
	lw	t6, 24(a1)
	lw	t7, 28(a1)
	addiu	a1, a1, 32

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)
	addiu	a0, a0, 32

$chk1w:
	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
	beq	a2, t8, $last8	/* fewer than 4 bytes left: no whole word */
	subu	a3, t8, a2	/* (delay slot) a3 is count of bytes in 1w chunks */
	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$wordCopy_loop:
	lw	t3, 0(a1)	/* the first t3 may be equal t0 ... optimize? */
	addiu	a1, a1, 4
	addiu	a0, a0, 4
	bne	a0, a3, $wordCopy_loop
	sw	t3, -4(a0)	/* (delay slot) a0 already advanced, hence -4 */

/* For the last (<8) bytes */
$last8:
	blez	a2, leave	/* nothing left to copy */
	addu	a3, a0, a2	/* (delay slot) a3 is the last dst address */
$last8loop:
	lb	v1, 0(a1)
	addiu	a1, a1, 1
	addiu	a0, a0, 1
	bne	a0, a3, $last8loop
	sb	v1, -1(a0)	/* (delay slot) a0 already advanced, hence -1 */

/* Common exit; also the target of $ua_smallCopy when no bytes remain */
leave:	j	ra
	nop

/*
 * UNALIGNED case
 *
 * Reached when src and dst differ in their low two address bits.  The
 * dst is brought to a word boundary; the src then cannot be word-aligned,
 * so every source word is read with an LWHI/LWLO pair.
 */

$unaligned:
	/* got here with a3="negu a0" */
	andi	a3, a3, 0x3	/* test if the a0 is word aligned */
	beqz	a3, $ua_chk16w
	subu	a2, a2, a3	/* (delay slot) bytes left after initial a3 bytes */

	LWHI	v1, 0(a1)	/* v1 = unaligned word at src */
	LWLO	v1, 3(a1)
	addu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */
	SWHI	v1, 0(a0)	/* stores only the bytes up to the dst word end */
	addu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */

$ua_chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
				/* t8 is the byte count after 64-byte chunks */
	beq	a2, t8, $ua_chk8w	/* if a2==t8, no 64-byte chunks */
				/* There will be at most 1 32-byte chunk after it */
	subu	a3, a2, t8	/* (delay slot) subtract from a2 the remainder */
                                /* Here a3 counts bytes in 16w chunks */
	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */

	addu	t0, a0, a2	/* t0 is the "past the end" address */

	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */

	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
	sgtu	v1, a0, t9	/* v1 = 1 when a0 is past the safe limit (unsigned) */
	bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
	nop
/* otherwise,  start with using pref30 */
	pref	30, 64(a0)
/* Unaligned main loop: 64 bytes per iteration, as two 8-word halves */
$ua_loop16w:
	pref	0, 96(a1)
	LWHI	t0, 0(a1)
	LWLO	t0, 3(a1)
	LWHI	t1, 4(a1)
	bgtz	v1, $ua_skip_pref30_96	/* skip "pref 30, 96(a0)" */
	LWLO	t1, 7(a1)	/* (delay slot) runs on both paths */
	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
$ua_skip_pref30_96:
	LWHI	t2, 8(a1)
	LWLO	t2, 11(a1)
	LWHI	t3, 12(a1)
	LWLO	t3, 15(a1)
	LWHI	t4, 16(a1)
	LWLO	t4, 19(a1)
	LWHI	t5, 20(a1)
	LWLO	t5, 23(a1)
	LWHI	t6, 24(a1)
	LWLO	t6, 27(a1)
	LWHI	t7, 28(a1)
	LWLO	t7, 31(a1)
        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)

	LWHI	t0, 32(a1)
	LWLO	t0, 35(a1)
	LWHI	t1, 36(a1)
	bgtz	v1, $ua_skip_pref30_128	/* skip "pref 30, 128(a0)" */
	LWLO	t1, 39(a1)	/* (delay slot) runs on both paths */
	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
$ua_skip_pref30_128:
	LWHI	t2, 40(a1)
	LWLO	t2, 43(a1)
	LWHI	t3, 44(a1)
	LWLO	t3, 47(a1)
	LWHI	t4, 48(a1)
	LWLO	t4, 51(a1)
	LWHI	t5, 52(a1)
	LWLO	t5, 55(a1)
	LWHI	t6, 56(a1)
	LWLO	t6, 59(a1)
	LWHI	t7, 60(a1)
	LWLO	t7, 63(a1)
        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */

	sw	t0, 32(a0)
	sw	t1, 36(a0)
	sw	t2, 40(a0)
	sw	t3, 44(a0)
	sw	t4, 48(a0)
	sw	t5, 52(a0)
	sw	t6, 56(a0)
	sw	t7, 60(a0)

	addiu	a0, a0, 64	/* adding 64 to dest */
	sgtu	v1, a0, t9	/* re-evaluate whether "pref 30" is still safe */
	bne	a0, a3, $ua_loop16w	/* loop until dst reaches the end of the chunks */
	addiu	a1, a1, 64	/* (delay slot) adding 64 to src */
	move	a2, t8	/* a2 = bytes left after the 64-byte chunks */

/* Here the dst is word-aligned (the src is not) and less than 64 bytes remain */

$ua_chk8w:
	pref 0, 0x0(a1)
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
				/* the t8 is the remainder count */
	beq	a2, t8, $ua_chk1w	/* when a2=t8, no 32-byte chunk */

	/*
	 * No nop follows the beq above: the first LWHI below occupies its
	 * delay slot and therefore also executes when the branch is taken.
	 * t0 is not read again on the taken path.
	 */
	LWHI	t0, 0(a1)
	LWLO	t0, 3(a1)
	LWHI	t1, 4(a1)
	LWLO	t1, 7(a1)
	LWHI	t2, 8(a1)
	LWLO	t2, 11(a1)
	LWHI	t3, 12(a1)
	LWLO	t3, 15(a1)
	LWHI	t4, 16(a1)
	LWLO	t4, 19(a1)
	LWHI	t5, 20(a1)
	LWLO	t5, 23(a1)
	LWHI	t6, 24(a1)
	LWLO	t6, 27(a1)
	LWHI	t7, 28(a1)
	LWLO	t7, 31(a1)
	addiu	a1, a1, 32

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)
	addiu	a0, a0, 32

$ua_chk1w:
	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
	beq	a2, t8, $ua_smallCopy	/* fewer than 4 bytes left: no whole word */
	subu	a3, t8, a2	/* (delay slot) a3 is count of bytes in 1w chunks */
	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$ua_wordCopy_loop:
	LWHI	v1, 0(a1)
	LWLO	v1, 3(a1)
	addiu	a1, a1, 4
	addiu	a0, a0, 4		/* note: dst=a0 is word aligned here, see NOTE1 */
	bne	a0, a3, $ua_wordCopy_loop
	sw	v1, -4(a0)	/* (delay slot) a0 already advanced, hence -4 */

/* Now less than 4 bytes (value in a2) left to copy */
$ua_smallCopy:
	beqz	a2, leave	/* nothing left to copy */
	addu	a3, a0, a2	/* (delay slot) a3 is the last dst address */
$ua_smallCopy_loop:
	lb	v1, 0(a1)
	addiu	a1, a1, 1
	addiu	a0, a0, 1
	bne	a0, a3, $ua_smallCopy_loop
	sb	v1, -1(a0)	/* (delay slot) a0 already advanced, hence -1 */

	j	ra	/* return; v0 still holds the original dst */
	nop

END(pixman_mips_fast_memcpy)