xref: /openbmc/linux/tools/testing/selftests/powerpc/copyloops/copyuser_64.S (revision 023e41632e065d49bcbe31b3c4b336217f96a271)
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>

/*
 * SELFTEST_CASE picks one of the CPU feature combinations listed below.
 * The test_feature assignments inside __copy_tofrom_user compare against
 * it; how test_feature is consumed is decided by the feature-section
 * macros, which are defined outside this file.  Default to case 0 when
 * the build has not chosen one.
 */
#ifndef SELFTEST_CASE
/* 0 == most CPUs, 1 == POWER6, 2 == Cell */
#define SELFTEST_CASE	0
#endif

/*
 * Endian-neutral names for the doubleword shifts used when merging
 * misaligned source doublewords: the direction is expressed in terms of
 * memory addresses, and the actual instruction is chosen per endianness.
 */
#ifdef __BIG_ENDIAN__
#define sLd sld		/* Shift towards low-numbered address. */
#define sHd srd		/* Shift towards high-numbered address. */
#else
#define sLd srd		/* Shift towards low-numbered address. */
#define sHd sld		/* Shift towards high-numbered address. */
#endif

/*
 * These macros are used to generate exception table entries.
 * The exception handlers below use the original arguments
 * (stored on the stack) and the point where we're up to in
 * the destination buffer, i.e. the address of the first
 * unmodified byte.  Generally r3 points into the destination
 * buffer, but the first unmodified byte is at a variable
 * offset from r3.  In the code below, the symbol r3_offset
 * is set to indicate the current offset at each point in
 * the code.  This offset is then used as a negative offset
 * from the exception handler code, and those instructions
 * before the exception handlers are addi instructions that
 * adjust r3 to point to the correct place.
 *
 * Usage: write "lex;" or "stex;" immediately in front of the load or
 * store it protects.  The local label 100 then names that instruction
 * and is the address handed to EX_TABLE as the faulting location.
 * r3_offset is sampled where the macro is expanded, so it must be
 * reassigned after every instruction that changes how far the first
 * unmodified destination byte is from r3.
 */
	.macro	lex		/* exception handler for load */
100:	EX_TABLE(100b, .Lld_exc - r3_offset)
	.endm

	.macro	stex		/* exception handler for store */
100:	EX_TABLE(100b, .Lst_exc - r3_offset)
	.endm

/*
 * __copy_tofrom_user
 *
 * In:   r3 = destination, r4 = source, r5 = number of bytes
 * Out:  r3 = number of bytes NOT copied (0 when everything was copied)
 *
 * The three arguments are saved below the stack pointer at -24(r1),
 * -16(r1) and -8(r1) so that the exception handlers can work out how
 * far the copy got.
 *
 * Condition register fields used on the main paths:
 *   cr1 - byte count compared (unsigned) with 16
 *   cr7 - low four bits of the byte count; bits 0..3 of cr7 select the
 *         8-, 4-, 2- and 1-byte pieces of the tail
 */
	.align	7
_GLOBAL_TOC(__copy_tofrom_user)
#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Alternative feature section keyed on CPU_FTR_VMX_COPY: one variant is
 * a nop (fall through to the code below), the other branches to
 * __copy_tofrom_user_power7.  The selection is done by the
 * feature-section macros, which are defined outside this file.
 */
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	b	__copy_tofrom_user_power7
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#endif
_GLOBAL(__copy_tofrom_user_base)
	/* first check for a 4kB copy on a 4kB boundary */
	cmpldi	cr1,r5,16	/* cr1: count vs 16, used by later blt cr1 */
	cmpdi	cr6,r5,4096	/* cr6.eq = (count == 4096) */
	or	r0,r3,r4
	neg	r6,r3		/* LS 3 bits = # bytes to 8-byte dest bdry */
	andi.	r0,r0,4095	/* cr0.eq = both pointers 4kB aligned */
	std	r3,-24(r1)	/* save original destination */
	crand	cr0*4+2,cr0*4+2,cr6*4+2	/* cr0.eq &= cr6.eq */
	std	r4,-16(r1)	/* save original source */
	std	r5,-8(r1)	/* save original byte count */
	dcbt	0,r4
	beq	.Lcopy_page_4K
	andi.	r6,r6,7		/* r6 = bytes to 8-byte dest boundary, sets cr0 */
	PPC_MTOCRF(0x01,r5)	/* low 4 bits of the count into cr7 */
	blt	cr1,.Lshort_copy
/* Below we want to nop out the bne if we're on a CPU that has the
 * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
 * cleared.
 * At the time of writing the only CPU that has this combination of bits
 * set is Power6.
 */
test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	bne	.Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
		    CPU_FTR_UNALIGNED_LD_STD)
/*
 * Destination is 8-byte aligned (or the CPU does not care).  r3 is
 * biased by -16 here, so the first unmodified byte is at r3 + 16.
 * Also entered from .Ldst_unaligned and from .Labort.
 */
.Ldst_aligned:
	addi	r3,r3,-16
r3_offset = 16
test_feature = (SELFTEST_CASE == 0)
/*
 * Source alignment check, in a feature section conditional on
 * CPU_FTR_UNALIGNED_LD_STD being clear.  It leaves r0 = source & 7 for
 * .Lsrc_unaligned.
 */
BEGIN_FTR_SECTION
	andi.	r0,r4,7
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
	blt	cr1,.Ldo_tail		/* if < 16 bytes to copy */
	srdi	r0,r5,5			/* r0 = number of 32-byte chunks */
	cmpdi	cr1,r0,0		/* cr1.eq = no 32-byte chunk at all */
lex;	ld	r7,0(r4)
lex;	ld	r6,8(r4)
	addi	r4,r4,16
	mtctr	r0
	andi.	r0,r5,0x10		/* is there an odd 16-byte piece? */
	beq	22f			/* no: enter the loop at its midpoint */
	/*
	 * Odd 16-byte piece: remove the bias from r3, rewind r4 and carry
	 * the 16 bytes just loaded in r9/r8, which is where the top half
	 * of the loop (and label 72) expects them.
	 */
	addi	r3,r3,16
r3_offset = 0
	addi	r4,r4,-16
	mr	r9,r7
	mr	r8,r6
	beq	cr1,72f			/* fewer than 32 bytes: just store them */
	/*
	 * Main loop: 32 bytes per iteration, with the loads for the next
	 * 16 bytes issued before the stores of the previous 16.
	 */
21:
lex;	ld	r7,16(r4)
lex;	ld	r6,24(r4)
	addi	r4,r4,32
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
22:
lex;	ld	r9,0(r4)
lex;	ld	r8,8(r4)
stex;	std	r7,16(r3)
r3_offset = 24
stex;	std	r6,24(r3)
	addi	r3,r3,32
r3_offset = 0
	bdnz	21b
	/* store the 16 bytes still held in r9/r8 */
72:
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
	andi.	r5,r5,0xf		/* any tail of 1..15 bytes left? */
	beq+	3f
	addi	r4,r4,16
/*
 * Copy the final 0..15 bytes.  r3 is biased by -16 on entry; the
 * pieces to copy are selected by the count bits held in cr7.
 */
.Ldo_tail:
	addi	r3,r3,16
r3_offset = 0
	bf	cr7*4+0,246f		/* 8-byte piece */
lex;	ld	r9,0(r4)
	addi	r4,r4,8
stex;	std	r9,0(r3)
	addi	r3,r3,8
246:	bf	cr7*4+1,1f		/* 4-byte piece */
lex;	lwz	r9,0(r4)
	addi	r4,r4,4
stex;	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f		/* 2-byte piece */
lex;	lhz	r9,0(r4)
	addi	r4,r4,2
stex;	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f		/* 1-byte piece */
lex;	lbz	r9,0(r4)
stex;	stb	r9,0(r3)
3:	li	r3,0			/* everything copied */
	blr

/*
 * Destination is 8-byte aligned but the source is not.  The source is
 * read as aligned doublewords and each destination doubleword is built
 * by shifting and OR-ing two neighbouring source doublewords.
 *
 * On entry: r0 = source & 7 (set by the andi. in .Ldst_aligned),
 *           r3 is biased by -16, cr7 = low four bits of the count.
 * Set up here:
 *   r4  = source rounded down to an 8-byte boundary
 *   r10 = 8 * r0 (shift amount in bits), r11 = 64 - r10
 *   ctr = (count - 16) / 16
 *   r5  = ((count - 16) & 7) + r0
 *   cr6 = (count / 8) compared (unsigned) with 3
 *   cr0 = result of the andi. on (count - 16) & 7; nothing between here
 *         and the "bne 6f" below writes cr0 again
 */
.Lsrc_unaligned:
r3_offset = 16
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpldi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0
	bt	cr7*4+0,28f		/* count bit 3 set: other entry sequence */

lex;	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
lex;	ld	r0,8(r4)
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,79f
lex;	ld	r0,8(r4)
	b	2f

28:
lex;	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
lex;	ldu	r9,8(r4)
	sLd	r8,r0,r10
	addi	r3,r3,-8
r3_offset = 24
	blt	cr6,5f
lex;	ld	r0,8(r4)
	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	addi	r3,r3,16
r3_offset = 8
	beq	cr6,78f

	/* main loop: two merged doublewords stored per iteration */
1:	or	r7,r7,r6
lex;	ld	r0,8(r4)
stex;	std	r12,8(r3)
r3_offset = 16
2:	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
stex;	stdu	r7,16(r3)
r3_offset = 8
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	bdnz	1b

	/* drain the doublewords still in flight */
78:
stex;	std	r12,8(r3)
r3_offset = 16
	or	r7,r7,r6
79:
stex;	std	r7,16(r3)
r3_offset = 24
5:	sHd	r12,r9,r11
	or	r12,r8,r12
stex;	std	r12,24(r3)
r3_offset = 32
	bne	6f		/* cr0 from the andi. above: sub-doubleword tail? */
	li	r3,0
	blr
	/*
	 * 1..7 bytes remain.  Assemble them in r9; a further source
	 * doubleword is only loaded when r5 > 8, i.e. when the tail
	 * extends past the doubleword already held in r9.
	 */
6:	cmpwi	cr1,r5,8
	addi	r3,r3,32
r3_offset = 0
	sLd	r9,r9,r10
	ble	cr1,7f
lex;	ld	r0,8(r4)
	sHd	r7,r0,r11
	or	r9,r7,r9
	/*
	 * Store 4, 2 and 1 bytes as selected by cr7.  stw/sth/stb store
	 * the low-order part of r9, so on big-endian the next bytes are
	 * rotated down into place before each store, and on little-endian
	 * the bytes just stored are rotated away after it.
	 */
7:
	bf	cr7*4+1,1f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,32
#endif
stex;	stw	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,32
#endif
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,16
#endif
stex;	sth	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,16
#endif
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,8
#endif
stex;	stb	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,8
#endif
3:	li	r3,0
	blr

/*
 * Destination is not 8-byte aligned.  Copy the r6 (1..7) bytes needed
 * to reach an 8-byte destination boundary as 1-, 2- and 4-byte pieces,
 * then rejoin .Ldst_aligned with the pointers advanced and the count
 * reduced.
 *
 * r3 is not advanced while the pieces are copied; r7 is the running
 * byte offset instead.  The exception table entries therefore point at
 * the *_exc_r7 handlers, which add r7 to r3 before the common handler
 * runs.
 */
.Ldst_unaligned:
r3_offset = 0
	PPC_MTOCRF(0x01,r6)		/* put #bytes to 8B bdry into cr7 */
	subf	r5,r6,r5		/* count left after aligning */
	li	r7,0
	cmpldi	cr1,r5,16		/* redo the "< 16" test for the new count */
	bf	cr7*4+3,1f		/* 1-byte piece */
100:	EX_TABLE(100b, .Lld_exc_r7)
	lbz	r0,0(r4)
100:	EX_TABLE(100b, .Lst_exc_r7)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f		/* 2-byte piece */
100:	EX_TABLE(100b, .Lld_exc_r7)
	lhzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f		/* 4-byte piece */
100:	EX_TABLE(100b, .Lld_exc_r7)
	lwzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	stwx	r0,r7,r3
3:	PPC_MTOCRF(0x01,r5)		/* cr7 = low 4 bits of the new count */
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

/*
 * Copies of fewer than 16 bytes (taken on "blt cr1" at entry).  No
 * alignment handling is done: the 8-byte piece is moved as two words,
 * followed by 4-, 2- and 1-byte pieces, each selected by the count bits
 * held in cr7.  r3 always points at the first unmodified byte when a
 * piece starts, hence r3_offset = 0 throughout.
 */
.Lshort_copy:
r3_offset = 0
	bf	cr7*4+0,1f		/* 8-byte piece, as two words */
lex;	lwz	r0,0(r4)
lex;	lwz	r9,4(r4)
	addi	r4,r4,8
stex;	stw	r0,0(r3)
stex;	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f		/* 4-byte piece */
lex;	lwz	r0,0(r4)
	addi	r4,r4,4
stex;	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f		/* 2-byte piece */
lex;	lhz	r0,0(r4)
	addi	r4,r4,2
stex;	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f		/* 1-byte piece */
lex;	lbz	r0,0(r4)
stex;	stb	r0,0(r3)
4:	li	r3,0			/* everything copied */
	blr

/*
 * exception handlers follow
 * we have to return the number of bytes not copied
 * for an exception on a load, we retry the remainder one byte at a
 * time (see .Lld_exc below) and report whatever is still left; the
 * destination is not zero-filled here
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lld_exc - r3_offset as the handler address.
 */

/*
 * Load fault in .Ldst_unaligned: r7 is the offset of the first
 * unmodified byte from r3.
 */
.Lld_exc_r7:
	add	r3,r3,r7
	b	.Lld_exc

	/*
	 * Entry stubs reached as .Lld_exc - 24, - 16 and - 8.  Each
	 * instruction is 4 bytes, so every 8-byte step is an addi
	 * padded with a nop, and execution falls through to .Lld_exc.
	 */
	/* adjust by 24 */
	addi	r3,r3,8
	nop
	/* adjust by 16 */
	addi	r3,r3,8
	nop
	/* adjust by 8 */
	addi	r3,r3,8
	nop

/*
 * Here we have had a fault on a load and r3 points to the first
 * unmodified byte of the destination.  We use the original arguments
 * and r3 to work out how much wasn't copied.  Since we load some
 * distance ahead of the stores, we continue copying byte-by-byte until
 * we hit the load fault again in order to copy as much as possible.
 */
.Lld_exc:
	ld	r6,-24(r1)	/* original destination pointer */
	ld	r4,-16(r1)	/* original source pointer */
	ld	r5,-8(r1)	/* original number of bytes */
	subf	r6,r6,r3	/* r6 = #bytes already copied */
	add	r4,r4,r6	/* r4 = source address of the next byte */
	subf	r5,r6,r5	/* #bytes left to go */

/*
 * first see if we can copy any more bytes before hitting another exception
 */
	mtctr	r5
r3_offset = 0
	/* a fault on the lbz goes to .Ldone; a fault on the stb to .Lst_exc */
100:	EX_TABLE(100b, .Ldone)
43:	lbz	r0,0(r4)
	addi	r4,r4,1
stex;	stb	r0,0(r3)
	addi	r3,r3,1
	bdnz	43b
	li	r3,0		/* huh? all copied successfully this time? */
	blr

/*
 * here we have trapped again, amount remaining is in ctr.
 */
.Ldone:
	mfctr	r3
	blr

/*
 * exception handlers for stores: we need to work out how many bytes
 * weren't copied, and we may need to copy some more.
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lst_exc - r3_offset as the handler address.
 */

/*
 * Store fault in .Ldst_unaligned: r7 is the offset of the first
 * unmodified byte from r3.
 */
.Lst_exc_r7:
	add	r3,r3,r7
	b	.Lst_exc

	/*
	 * Entry stubs reached as .Lst_exc - 24, - 16, - 8 and - 4; all
	 * fall through to .Lst_exc.  The last 8 bytes are two "addi 4"
	 * instructions so that an adjustment of 4 is available as well.
	 */
	/* adjust by 24 */
	addi	r3,r3,8
	nop
	/* adjust by 16 */
	addi	r3,r3,8
	nop
	/* adjust by 8 */
	addi	r3,r3,4
	/* adjust by 4 */
	addi	r3,r3,4
/*
 * Common store-fault handler.  On entry r3 = address of the first
 * unmodified destination byte.  Returns r3 = number of bytes not copied.
 */
.Lst_exc:
	ld	r6,-24(r1)	/* original destination pointer */
	ld	r4,-16(r1)	/* original source pointer */
	ld	r5,-8(r1)	/* original number of bytes */
	add	r7,r6,r5	/* r7 = end of the destination buffer */
	/*
	 * If the destination pointer isn't 8-byte aligned,
	 * we may have got the exception as a result of a
	 * store that overlapped a page boundary, so we may be
	 * able to copy a few more bytes.
	 */
17:	andi.	r0,r3,7
	beq	19f		/* 8-byte aligned: nothing more to try */
	subf	r8,r6,r3	/* #bytes copied */
	/* a fault on either access simply ends the retry at 19 */
100:	EX_TABLE(100b,19f)
	lbzx	r0,r8,r4
100:	EX_TABLE(100b,19f)
	stb	r0,0(r3)
	addi	r3,r3,1
	cmpld	r3,r7
	blt	17b
19:	subf	r3,r3,r7	/* #bytes not copied in r3 */
	blr

/*
 * Routine to copy a whole page of data, optimized for POWER4.
 * On POWER4 it is more than 50% faster than the simple loop
 * above (following the .Ldst_aligned label).
 *
 * Reached from the entry code only when source and destination are
 * both 4kB aligned and the count is exactly 4096.
 *
 * r20-r31 are used as extra data registers; they are saved below the
 * stack pointer at -32(r1)..-120(r1) and restored at label 9 (and in
 * .Labort).  r5 counts the 32-byte units still to be copied, minus one.
 *
 * Any fault in here goes to .Labort via the exc macro.
 */
	.macro	exc
100:	EX_TABLE(100b, .Labort)
	.endm
.Lcopy_page_4K:
	std	r31,-32(1)
	std	r30,-40(1)
	std	r29,-48(1)
	std	r28,-56(1)
	std	r27,-64(1)
	std	r26,-72(1)
	std	r25,-80(1)
	std	r24,-88(1)
	std	r23,-96(1)
	std	r22,-104(1)
	std	r21,-112(1)
	std	r20,-120(1)
	li	r5,4096/32 - 1
	addi	r3,r3,-8	/* bias r3: stores below start at offset 8 */
	li	r0,5		/* inner loop trip count, reloaded into ctr */
	/*
	 * Outer loop: each pass copies six interleaved 128-byte streams
	 * (source offsets 0, 128, 256, 384, 512 and 640), i.e. 768 bytes
	 * or 24 units of 32 bytes.
	 */
0:	addi	r5,r5,-24
	mtctr	r0
exc;	ld	r22,640(4)
exc;	ld	r21,512(4)
exc;	ld	r20,384(4)
exc;	ld	r11,256(4)
exc;	ld	r9,128(4)
exc;	ld	r7,0(4)
exc;	ld	r25,648(4)
exc;	ld	r24,520(4)
exc;	ld	r23,392(4)
exc;	ld	r10,264(4)
exc;	ld	r8,136(4)
exc;	ldu	r6,8(4)
	cmpwi	r5,24		/* cr0 for the "bge 0b" below; loop body leaves cr0 alone */
	/* inner loop: 24 bytes of each stream per iteration */
1:
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
exc;	ld	r28,648(4)
exc;	ld	r27,520(4)
exc;	ld	r26,392(4)
exc;	ld	r31,264(4)
exc;	ld	r30,136(4)
exc;	ld	r29,8(4)
exc;	std	r25,656(3)
exc;	std	r24,528(3)
exc;	std	r23,400(3)
exc;	std	r10,272(3)
exc;	std	r8,144(3)
exc;	std	r6,16(3)
exc;	ld	r22,656(4)
exc;	ld	r21,528(4)
exc;	ld	r20,400(4)
exc;	ld	r11,272(4)
exc;	ld	r9,144(4)
exc;	ld	r7,16(4)
exc;	std	r28,664(3)
exc;	std	r27,536(3)
exc;	std	r26,408(3)
exc;	std	r31,280(3)
exc;	std	r30,152(3)
exc;	stdu	r29,24(3)
exc;	ld	r25,664(4)
exc;	ld	r24,536(4)
exc;	ld	r23,408(4)
exc;	ld	r10,280(4)
exc;	ld	r8,152(4)
exc;	ldu	r6,24(4)
	bdnz	1b
	/* store the last doubleword of each stream, then skip past them */
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
	addi	r4,r4,640
	addi	r3,r3,648
	bge	0b		/* another 768-byte pass while r5 >= 24 */
	/*
	 * Remainder: a simple 32-bytes-per-iteration loop run r5 times,
	 * followed by one final 32-byte group at label 4.
	 */
	mtctr	r5
exc;	ld	r7,0(4)
exc;	ld	r8,8(4)
exc;	ldu	r9,16(4)
3:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	ld	r7,16(4)
exc;	std	r8,16(3)
exc;	ld	r8,24(4)
exc;	std	r9,24(3)
exc;	ldu	r9,32(4)
exc;	stdu	r10,32(3)
	bdnz	3b
4:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	std	r8,16(3)
exc;	std	r9,24(3)
exc;	std	r10,32(3)
	/* restore the saved registers and report complete success */
9:	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	li	r3,0
	blr

/*
 * on an exception, reset to the beginning and jump back into the
 * standard __copy_tofrom_user
 *
 * r20-r31 are restored first, then the original destination and source
 * are reloaded and the count is set back to 4096, so the whole page is
 * redone by the ordinary loop, whose own exception handlers compute the
 * number of bytes not copied.  .Ldst_aligned tests cr1, which still
 * holds the entry-time comparison of the count with 16 because nothing
 * on the page-copy path writes cr1.
 */
.Labort:
	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	ld	r3,-24(r1)	/* original destination pointer */
	ld	r4,-16(r1)	/* original source pointer */
	li	r5,4096
	b	.Ldst_aligned
EXPORT_SYMBOL(__copy_tofrom_user)
