/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't, including store queue overflows.  A store
	queue overflow causes a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	Without care, this results in many nasty memory traps.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.

	I think that the code below will be very robust and fast code for the
	purposes of copying aligned pages.  It is slower when both source and
	destination pages are in the dcache, but it is my guess that this is
	less important than the dcache miss case.
*/

#include <asm/export.h>

/*
 * copy_page - copy one page in 64-byte blocks.
 *
 * Register usage, as established by the code below:
 *   $16     destination pointer (every stq targets it); +64 per iteration
 *   $17     source pointer (every ldq/ldl reads through it); +64 per iteration
 *   $18     iteration counter: 118 for the main loop, then 10 for cleanup
 *   $19     write-hint pointer; starts 10*64 bytes ahead of $16
 *   $0-$7   the eight quadwords of the block currently being copied
 *   $1, $2  additionally used as scratch addresses for the initial wh64s
 *
 * 118 + 10 = 128 iterations of 64 bytes: 8192 bytes are copied in total.
 *
 * NOTE(review): $16/$17 are presumably the first two argument registers,
 * i.e. copy_page(to, from) -- confirm against the C prototype.
 *
 * Instructions are laid out in blank-line-separated groups of four; the
 * header comment of this file explains that the processor fetches at most
 * 4 aligned instructions per cycle and that the nop/unop padding
 * ("aeration") is deliberate.  Do not reorder, merge or drop the padding.
 */
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	/* Reads: offsets 0..256 from $17.  Hints: offsets 0*64..9*64 from $16.
	   $18 (main-loop count) is also set up here, in a spare slot.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)
	nop

	/* Main prefetching/write-hinting loop.  */
	/* Each pass copies one 64-byte block, read-prefetches the block
	   5 lines ahead (320 = 5*64) and write-hints the block 10 lines
	   ahead (via $19).  Eleven groups of four per iteration.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)
	unop
	unop
	unop

	/* Stores are interleaved with the counter and pointer updates.
	   $17 is bumped here, after all loads of this block are issued.  */
	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	/* $16 is advanced only after the last store of the block.  */
	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	/* $17 now points at block 118, so offsets 320..576 are blocks
	   123..127.  $18 is reloaded with the cleanup-loop count.  */
	lda	$18,10
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	/* Return; the three no-ops fill out the last group of four.  */
	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)