/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't.  Including store queue overflows.  It causes
	a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	This results in many nasty memory traps without care.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.

	I think that the code below will be very robust and fast code for the
	purposes of copying aligned pages.  It is slower when both source and
	destination pages are in the dcache, but it is my guess that this is
	less important than the dcache miss case.
*/

#include <asm/export.h>

/*
 * copy_page
 *
 * Copies 128 cache lines of 64 bytes each (118 main-loop iterations plus
 * 10 cleanup iterations), i.e. 8192 bytes, from the address in $17 to the
 * address in $16.
 *
 * Registers:
 *	$16	destination pointer (all stq go through it); advanced by 64
 *		per line copied
 *	$17	source pointer (all ldq and read prefetches go through it);
 *		advanced by 64 per line copied
 *	$18	loop counter (118 for the main loop, then 10 for cleanup)
 *	$19	write-hint pointer, runs 10 cache lines ahead of $16
 *	$0-$7	the eight quadwords of the cache line being copied
 *	$1, $2	also used as scratch addresses for the initial write hints
 *
 * NOTE(review): $16/$17 are presumably the (to, from) arguments of the C
 * prototype for copy_page -- confirm against the declaration used by the
 * callers.
 *
 * No stack is used and no value is returned.  $0-$7, $16-$19 are modified.
 *
 * The instruction order and every nop/unop below are deliberate: the code
 * is laid out in groups of four so that the fetch rate is throttled as
 * described in the comment at the top of this file.  Do not reorder or
 * remove padding without re-measuring.
 */
	.text
	.align	4
	.global	copy_page
	.ent	copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	/* Loads into $31 discard their result and serve as the read
	   prefetches (source lines 0-4); wh64 is issued for destination
	   lines 0-9.  $18 is loaded with the main-loop trip count here.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	/* $19 = first destination line that has not been write-hinted.  */
	lda	$19,10*64($16)
	nop

	/* Main prefetching/write-hinting loop.  */
	/* Each iteration copies one 64-byte line, prefetches the source
	   line 5 lines ahead (offset 320) and write-hints the destination
	   line 10 lines ahead (via $19).  The body is 11 groups of four
	   instructions; the unops are the "aeration" padding.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	/* $17 may be advanced here: all eight loads of this line have
	   already been issued above.  */
	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	/* $16 is advanced only after the last store of the line.  */
	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	/* $17 now points at source line 118; the main loop's last prefetch
	   covered line 122, so offsets 320..576 cover lines 123..127.
	   $18 is reloaded with the cleanup trip count.  */
	lda	$18,10
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
	/* These lines were already write-hinted by the main loop and
	   prefetched above or by the main loop, so this is a plain
	   8-load / 8-store copy of one line per iteration.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	/* The three no-ops fill out the group of four containing ret.  */
	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)