xref: /openbmc/linux/arch/alpha/lib/ev6-copy_page.S (revision f3c78e94)
1b2441318SGreg Kroah-Hartman/* SPDX-License-Identifier: GPL-2.0 */
21da177e4SLinus Torvalds/*
31da177e4SLinus Torvalds * arch/alpha/lib/ev6-copy_page.S
41da177e4SLinus Torvalds *
51da177e4SLinus Torvalds * Copy an entire page.
61da177e4SLinus Torvalds */
71da177e4SLinus Torvalds
81da177e4SLinus Torvalds/* The following comparison of this routine vs the normal copy_page.S
91da177e4SLinus Torvalds   was written by an unnamed ev6 hardware designer and forwarded to me
101da177e4SLinus Torvalds   via Steven Hobbs <hobbs@steven.zko.dec.com>.
111da177e4SLinus Torvalds
121da177e4SLinus Torvalds   First Problem: STQ overflows.
131da177e4SLinus Torvalds   -----------------------------
141da177e4SLinus Torvalds
151da177e4SLinus Torvalds	It would be nice if EV6 handled every resource overflow efficiently,
161da177e4SLinus Torvalds	but for some it doesn't, store queue overflows among them.  Such an
171da177e4SLinus Torvalds	overflow causes a trap and a restart of the pipe.
181da177e4SLinus Torvalds
191da177e4SLinus Torvalds	To get around this we sometimes use (to borrow a term from a VSSAD
201da177e4SLinus Torvalds	researcher) "aeration".  The idea is to slow the rate at which the
211da177e4SLinus Torvalds	processor receives valid instructions by inserting nops in the fetch
221da177e4SLinus Torvalds	path.  In doing so, you can prevent the overflow and actually make
231da177e4SLinus Torvalds	the code run faster.  You can, of course, take advantage of the fact
241da177e4SLinus Torvalds	that the processor can fetch at most 4 aligned instructions per cycle.
251da177e4SLinus Torvalds
261da177e4SLinus Torvalds	I inserted enough nops to force it to take 10 cycles to fetch the
271da177e4SLinus Torvalds	loop code.  In theory, EV6 should be able to execute this loop in
281da177e4SLinus Torvalds	9 cycles but I was not able to get it to run that fast -- the initial
291da177e4SLinus Torvalds	conditions were such that I could not reach this optimum rate on
301da177e4SLinus Torvalds	(chaotic) EV6.  I wrote the code such that everything would issue
311da177e4SLinus Torvalds	in order.
321da177e4SLinus Torvalds
331da177e4SLinus Torvalds   Second Problem: Dcache index matches.
341da177e4SLinus Torvalds   -------------------------------------
351da177e4SLinus Torvalds
361da177e4SLinus Torvalds	If you are going to use this routine on randomly aligned pages, there
371da177e4SLinus Torvalds	is a 25% chance that the pages will be at the same dcache indices.
381da177e4SLinus Torvalds	Without care, this results in many nasty memory traps.
391da177e4SLinus Torvalds
401da177e4SLinus Torvalds	The solution is to schedule the prefetches to avoid the memory
411da177e4SLinus Torvalds	conflicts.  I schedule the wh64 prefetches farther ahead of the
421da177e4SLinus Torvalds	read prefetches to avoid this problem.
431da177e4SLinus Torvalds
441da177e4SLinus Torvalds   Third Problem: Needs more prefetching.
451da177e4SLinus Torvalds   --------------------------------------
461da177e4SLinus Torvalds
471da177e4SLinus Torvalds	In order to improve the code I added deeper prefetching to take the
481da177e4SLinus Torvalds	most advantage of EV6's bandwidth.
491da177e4SLinus Torvalds
501da177e4SLinus Torvalds	I also prefetched the read stream. Note that adding the read prefetch
511da177e4SLinus Torvalds	forced me to add another cycle to the inner-most kernel - up to 11
521da177e4SLinus Torvalds	from the original 8 cycles per iteration.  We could improve performance
531da177e4SLinus Torvalds	further by unrolling the loop and doing multiple prefetches per cycle.
541da177e4SLinus Torvalds
551da177e4SLinus Torvalds   I think that the code below will be very robust and fast code for the
561da177e4SLinus Torvalds   purposes of copying aligned pages.  It is slower when both source and
571da177e4SLinus Torvalds   destination pages are in the dcache, but it is my guess that this is
581da177e4SLinus Torvalds   less important than the dcache miss case.  */
591da177e4SLinus Torvalds
60*f3c78e94SMasahiro Yamada#include <linux/export.h>
611da177e4SLinus Torvalds	.text
621da177e4SLinus Torvalds	.align 4
	/*
	 * copy_page: copy 128 cache lines of 64 bytes each (8192 bytes in
	 * total) from the address in $17 to the address in $16.
	 *
	 * Register use, as seen in the code below:
	 *   $16     destination pointer, advanced by 64 per line copied
	 *   $17     source pointer, advanced by 64 per line copied
	 *   $18     loop counter (118 for the main loop, then 10 for the tail)
	 *   $19     wh64 target, kept 10 lines ahead of $16 in the main loop
	 *   $0-$7   the eight quadwords of the line currently being copied
	 *   $1, $2  additionally used as wh64 address temporaries up front
	 *
	 * All of the registers above are overwritten; nothing is saved or
	 * restored, and the stack is not touched.
	 *
	 * Loads whose destination is $31 serve as read prefetches and wh64 as
	 * a write hint, per the comments in this file.  Instructions are laid
	 * out in groups of four; presumably the .align above gives 2^4 = 16
	 * byte alignment so that every group is one aligned fetch block, which
	 * is what the unop/nop padding depends on -- TODO confirm.  Do not
	 * reorder instructions or drop padding without re-measuring.
	 */
631da177e4SLinus Torvalds	.global copy_page
641da177e4SLinus Torvalds	.ent copy_page
651da177e4SLinus Torvaldscopy_page:
661da177e4SLinus Torvalds	.prologue 0
671da177e4SLinus Torvalds
681da177e4SLinus Torvalds	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	/* Source offsets 0..256 are lines 0-4; the wh64s cover destination
	   lines 0-9.  Along the way $18 is loaded with the main-loop trip
	   count, and the last lda leaves $19 pointing at destination
	   line 10, the first line the main loop will write-hint.  */
691da177e4SLinus Torvalds	wh64	($16)
701da177e4SLinus Torvalds	ldl	$31,0($17)
711da177e4SLinus Torvalds	ldl	$31,64($17)
721da177e4SLinus Torvalds	lda	$1,1*64($16)
731da177e4SLinus Torvalds
741da177e4SLinus Torvalds	wh64	($1)
751da177e4SLinus Torvalds	ldl	$31,128($17)
761da177e4SLinus Torvalds	ldl	$31,192($17)
771da177e4SLinus Torvalds	lda	$1,2*64($16)
781da177e4SLinus Torvalds
791da177e4SLinus Torvalds	wh64	($1)
801da177e4SLinus Torvalds	ldl	$31,256($17)
811da177e4SLinus Torvalds	lda	$18,118
821da177e4SLinus Torvalds	lda	$1,3*64($16)
831da177e4SLinus Torvalds
841da177e4SLinus Torvalds	wh64	($1)
851da177e4SLinus Torvalds	nop
861da177e4SLinus Torvalds	lda	$1,4*64($16)
871da177e4SLinus Torvalds	lda	$2,5*64($16)
881da177e4SLinus Torvalds
891da177e4SLinus Torvalds	wh64	($1)
901da177e4SLinus Torvalds	wh64	($2)
911da177e4SLinus Torvalds	lda	$1,6*64($16)
921da177e4SLinus Torvalds	lda	$2,7*64($16)
931da177e4SLinus Torvalds
941da177e4SLinus Torvalds	wh64	($1)
951da177e4SLinus Torvalds	wh64	($2)
961da177e4SLinus Torvalds	lda	$1,8*64($16)
971da177e4SLinus Torvalds	lda	$2,9*64($16)
981da177e4SLinus Torvalds
991da177e4SLinus Torvalds	wh64	($1)
1001da177e4SLinus Torvalds	wh64	($2)
1011da177e4SLinus Torvalds	lda	$19,10*64($16)
1021da177e4SLinus Torvalds	nop
1031da177e4SLinus Torvalds
1041da177e4SLinus Torvalds	/* Main prefetching/write-hinting loop.  */
	/* Each pass copies one 64-byte line: eight ldq from $17, then eight
	   stq to $16.  It also prefetches the source line 5 ahead
	   (320 = 5*64) and write-hints the destination line 10 ahead ($19).
	   With 118 passes the last prefetch is source line 122 and the last
	   wh64 is destination line 127, the final line of the copy.  The
	   body is 11 groups of four instructions; the unops are the
	   "aeration" padding described in the comment at the top of the
	   file.  */
1051da177e4SLinus Torvalds1:	ldq	$0,0($17)
1061da177e4SLinus Torvalds	ldq	$1,8($17)
1071da177e4SLinus Torvalds	unop
1081da177e4SLinus Torvalds	unop
1091da177e4SLinus Torvalds
1101da177e4SLinus Torvalds	unop
1111da177e4SLinus Torvalds	unop
1121da177e4SLinus Torvalds	ldq	$2,16($17)
1131da177e4SLinus Torvalds	ldq	$3,24($17)
1141da177e4SLinus Torvalds
1151da177e4SLinus Torvalds	ldq	$4,32($17)
1161da177e4SLinus Torvalds	ldq	$5,40($17)
1171da177e4SLinus Torvalds	unop
1181da177e4SLinus Torvalds	unop
1191da177e4SLinus Torvalds
1201da177e4SLinus Torvalds	unop
1211da177e4SLinus Torvalds	unop
1221da177e4SLinus Torvalds	ldq	$6,48($17)
1231da177e4SLinus Torvalds	ldq	$7,56($17)
1241da177e4SLinus Torvalds
	/* Read prefetch of the source line 5 ahead of the current one.  */
1251da177e4SLinus Torvalds	ldl	$31,320($17)
1261da177e4SLinus Torvalds	unop
1271da177e4SLinus Torvalds	unop
1281da177e4SLinus Torvalds	unop
1291da177e4SLinus Torvalds
1301da177e4SLinus Torvalds	/* This gives the extra cycle of aeration above the minimum.  */
1311da177e4SLinus Torvalds	unop
1321da177e4SLinus Torvalds	unop
1331da177e4SLinus Torvalds	unop
1341da177e4SLinus Torvalds	unop
1351da177e4SLinus Torvalds
	/* Write hint for the destination line 10 ahead of the current one.  */
1361da177e4SLinus Torvalds	wh64	($19)
1371da177e4SLinus Torvalds	unop
1381da177e4SLinus Torvalds	unop
1391da177e4SLinus Torvalds	unop
1401da177e4SLinus Torvalds
	/* Store phase.  The counter decrement and the three pointer bumps
	   ($17, $19, $16) are interleaved with the stores; $16 is bumped
	   only after the last stq that uses it.  */
1411da177e4SLinus Torvalds	stq	$0,0($16)
1421da177e4SLinus Torvalds	subq	$18,1,$18
1431da177e4SLinus Torvalds	stq	$1,8($16)
1441da177e4SLinus Torvalds	unop
1451da177e4SLinus Torvalds
1461da177e4SLinus Torvalds	unop
1471da177e4SLinus Torvalds	stq	$2,16($16)
1481da177e4SLinus Torvalds	addq	$17,64,$17
1491da177e4SLinus Torvalds	stq	$3,24($16)
1501da177e4SLinus Torvalds
1511da177e4SLinus Torvalds	stq	$4,32($16)
1521da177e4SLinus Torvalds	stq	$5,40($16)
1531da177e4SLinus Torvalds	addq	$19,64,$19
1541da177e4SLinus Torvalds	unop
1551da177e4SLinus Torvalds
1561da177e4SLinus Torvalds	stq	$6,48($16)
1571da177e4SLinus Torvalds	stq	$7,56($16)
1581da177e4SLinus Torvalds	addq	$16,64,$16
1591da177e4SLinus Torvalds	bne	$18, 1b
1601da177e4SLinus Torvalds
1611da177e4SLinus Torvalds	/* Prefetch the final 5 cache lines of the read stream.  */
	/* $17 now points at source line 118, so offsets 320..576 are lines
	   123-127; lines 118-122 were already prefetched by the main loop.
	   $18 is reloaded with the trip count of the cleanup loop.  */
1621da177e4SLinus Torvalds	lda	$18,10
1631da177e4SLinus Torvalds	ldl	$31,320($17)
1641da177e4SLinus Torvalds	ldl	$31,384($17)
1651da177e4SLinus Torvalds	ldl	$31,448($17)
1661da177e4SLinus Torvalds
1671da177e4SLinus Torvalds	ldl	$31,512($17)
1681da177e4SLinus Torvalds	ldl	$31,576($17)
1691da177e4SLinus Torvalds	nop
1701da177e4SLinus Torvalds	nop
1711da177e4SLinus Torvalds
1721da177e4SLinus Torvalds	/* Non-prefetching, non-write-hinting cleanup loop for the
1731da177e4SLinus Torvalds	   final 10 cache lines.  */
	/* Destination lines 118-127 were write-hinted by the main loop and
	   the matching source lines have all been prefetched by now, so
	   this is a plain 8-load / 8-store copy with no padding.  $19 is no
	   longer used.  */
1741da177e4SLinus Torvalds2:	ldq	$0,0($17)
1751da177e4SLinus Torvalds	ldq	$1,8($17)
1761da177e4SLinus Torvalds	ldq	$2,16($17)
1771da177e4SLinus Torvalds	ldq	$3,24($17)
1781da177e4SLinus Torvalds
1791da177e4SLinus Torvalds	ldq	$4,32($17)
1801da177e4SLinus Torvalds	ldq	$5,40($17)
1811da177e4SLinus Torvalds	ldq	$6,48($17)
1821da177e4SLinus Torvalds	ldq	$7,56($17)
1831da177e4SLinus Torvalds
1841da177e4SLinus Torvalds	stq	$0,0($16)
1851da177e4SLinus Torvalds	subq	$18,1,$18
1861da177e4SLinus Torvalds	stq	$1,8($16)
1871da177e4SLinus Torvalds	addq	$17,64,$17
1881da177e4SLinus Torvalds
1891da177e4SLinus Torvalds	stq	$2,16($16)
1901da177e4SLinus Torvalds	stq	$3,24($16)
1911da177e4SLinus Torvalds	stq	$4,32($16)
1921da177e4SLinus Torvalds	stq	$5,40($16)
1931da177e4SLinus Torvalds
1941da177e4SLinus Torvalds	stq	$6,48($16)
1951da177e4SLinus Torvalds	stq	$7,56($16)
1961da177e4SLinus Torvalds	addq	$16,64,$16
1971da177e4SLinus Torvalds	bne	$18, 2b
1981da177e4SLinus Torvalds
	/* Return to the caller.  The three no-ops that follow appear to pad
	   the ret out to a full group of four -- TODO confirm.  */
1991da177e4SLinus Torvalds	ret
2001da177e4SLinus Torvalds	nop
2011da177e4SLinus Torvalds	unop
2021da177e4SLinus Torvalds	nop
2031da177e4SLinus Torvalds
2041da177e4SLinus Torvalds	.end copy_page
	/* Export copy_page; EXPORT_SYMBOL presumably comes from the
	   <linux/export.h> include near the top of this file.  */
20500fc0e0dSAl Viro	EXPORT_SYMBOL(copy_page)
206