xref: /openbmc/linux/arch/alpha/lib/ev6-copy_page.S (revision 00fc0e0d)
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't.  Including store queue overflows.  It causes
	a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	This results in many nasty memory traps without care.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream. Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */

#include <asm/export.h>

/*
 * copy_page: copy 8192 bytes (128 cache lines of 64 bytes) from the
 * address in $17 to the address in $16.
 *
 * Register use, as seen in the code below:
 *	$16	destination pointer; advanced by 64 per line copied
 *	$17	source pointer; advanced by 64 per line copied
 *	$18	loop counter (118 for the main loop, then 10 for the tail)
 *	$19	write-hint pointer, kept 10 lines ahead of $16
 *	$0-$7	the eight quadwords of the line being copied
 *	$1, $2	also scratch for write-hint addresses in the preamble
 *
 * None of these is saved or restored, and no stack is used.  On return
 * $16 and $17 each point 8192 bytes past where they started.
 *
 * NOTE(review): presumably called from C as copy_page(to, from) with
 * both pages page-aligned -- confirm against the prototype.
 *
 * The instructions are laid out in groups of four, with nop/unop as
 * padding, so instruction order and count are deliberate: do not
 * reorder, add or remove instructions without re-measuring on hardware.
 */
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.
	   The loads into $31 are the read prefetches (source lines 0-4);
	   the wh64s cover destination lines 0-9.  Along the way $18 is
	   set to the main-loop trip count and $19 to destination line 10,
	   the first line the main loop will write-hint.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118			/* main loop copies lines 0-117 */
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)		/* $19 = destination line 10 */
	nop

	/* Main prefetching/write-hinting loop.
	   Each pass copies one 64-byte line, prefetches the source line
	   5 lines ahead (320 = 5*64) and write-hints the destination line
	   10 lines ahead (via $19).  With 118 passes the last write-hint
	   lands on line 127 and the last prefetch on line 122, so neither
	   runs past the end of the page.  The unops are fetch padding:
	   11 groups of four instructions per pass.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)		/* read prefetch, 5 lines ahead */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)			/* write hint, 10 lines ahead */
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18		/* one fewer line left in this loop */
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17		/* next source line */
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19		/* next line to write-hint */
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16		/* next destination line */
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.
	   $17 now points at line 118, so offsets 320..576 are lines
	   123-127.  $18 is reloaded with the tail-loop trip count.  */
	lda	$18,10
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  Everything it touches was already
	   prefetched or write-hinted above, and it carries no unop
	   padding.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	/* Return; the three no-ops fill out the last group of four.  */
	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)