/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memcpy.S
 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Reasonably optimized memcpy() routine for the Alpha 21264
 *
 *	- bulk data is moved as aligned quadwords (ldq/stq)
 *	- when source and dest differ mod 8, the source is read with
 *	  ldq_u and the pieces are merged with extql/extqh so that the
 *	  stores to dest are still aligned quadwords
 *	- leading and trailing bytes are moved one at a time (ldbu/stb)
 *	- copies of 128 bytes or more with matching alignment go through
 *	  a loop unrolled to 64 bytes per trip, with wh64 write hints
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 *
 * Register usage:
 *	$16	- in: dest pointer (advanced while copying; reused as a
 *		  data temporary in the misaligned quad loop)
 *	$17	- in: source pointer (advanced while copying)
 *	$18	- in: byte count; tested as a signed value, so a count
 *		  that is <= 0 copies nothing
 *	$0	- out: the original dest pointer
 *	$26	- return address
 *	$1-$7	- scratch ($4 is the running dest pointer on the
 *		  misaligned path, $7 is the wh64 hint address)
 */
#include <linux/export.h>
	.set noreorder
	.set noat

	.align	4
	.globl memcpy
	.ent memcpy
memcpy:
	.frame $30,0,$26,0
	.prologue 0

	mov	$16, $0			# E : copy dest to return
	ble	$18, $nomoredata	# U : done with the copy?
	xor	$16, $17, $1		# E : are source and dest alignments the same?
	and	$1, 7, $1		# E : are they the same mod 8?

	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
	/* source and dest are same mod 8 address */
	and	$16, 7, $1		# E : Are both 0mod8?
	beq	$1, $both_0mod8		# U : Yes
	nop				# E :

	/*
	 * source and dest are same misalignment.  move a byte at a time
	 * until a 0mod8 alignment for both is reached.
	 * At least one byte more to move
	 */

$head_align:
	ldbu	$1, 0($17)		# L : grab a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	stb	$1, 0($16)		# L :
	addq	$16, 1, $16		# E : dest++
	and	$16, 7, $1		# E : Are we at 0mod8 yet?
	ble	$18, $nomoredata	# U : done with the copy?
	bne	$1, $head_align		# U :

	/*
	 * Both pointers are 0mod8 here.  Counts of 127 or less skip the
	 * unrolled loop entirely.
	 */
$both_0mod8:
	cmple	$18, 127, $1		# E : Can we unroll the loop?
	bne	$1, $no_unroll		# U :
	and	$16, 63, $1		# E : get mod64 alignment
	beq	$1, $do_unroll		# U : no single quads to fiddle

	/*
	 * Move single quads until dest is 0mod64.  At least 128 bytes
	 * remain on entry, and at most 56 are consumed here.
	 */
$single_head_quad:
	ldq	$1, 0($17)		# L : get 8 bytes
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store
	addq	$16, 8, $16		# E : dest += 8
	and	$16, 63, $1		# E : get mod64 alignment
	bne	$1, $single_head_quad	# U : still not fully aligned

$do_unroll:
	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
	bne	$1, $tail_quads		# U : Nope
	nop				# E :

	/*
	 * Each trip copies 64 bytes as two groups of four quads.  $7 runs
	 * one 64-byte block ahead of dest for the wh64 hint.  When fewer
	 * than 192 bytes remain at the start of a trip, $7 is replaced by
	 * the fallback (this trip's dest + 64), so that the hint issued on
	 * the following trip names the block that trip itself stores.
	 */
$unroll_body:
	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
					# ($7) are about to be over-written
	ldq	$6, 0($17)		# L0 : bytes 0..7
	nop				# E :
	nop				# E :

	ldq	$4, 8($17)		# L : bytes 8..15
	ldq	$5, 16($17)		# L : bytes 16..23
	addq	$7, 64, $7		# E : Update next wh64 address
	nop				# E :

	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 64, $1		# E : fallback value for wh64
	nop				# E :
	nop				# E :

	addq	$17, 32, $17		# E : src += 32 bytes
	stq	$6, 0($16)		# L : bytes 0..7
	nop				# E :
	nop				# E :

	stq	$4, 8($16)		# L : bytes 8..15
	stq	$5, 16($16)		# L : bytes 16..23
	subq	$18, 192, $2		# E : At least two more trips to go?
	nop				# E :

	stq	$3, 24($16)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32 bytes
	nop				# E :
	nop				# E :

	ldq	$6, 0($17)		# L : bytes 0..7
	ldq	$4, 8($17)		# L : bytes 8..15
	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
					# fallback wh64 address if < 2 more trips
	nop				# E :

	ldq	$5, 16($17)		# L : bytes 16..23
	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32
	subq	$18, 64, $18		# E : count -= 64

	addq	$17, 32, $17		# E : src += 32
	stq	$6, -32($16)		# L : bytes 0..7
	stq	$4, -24($16)		# L : bytes 8..15
	cmple	$18, 63, $1		# E : At least one more trip?

	stq	$5, -16($16)		# L : bytes 16..23
	stq	$3, -8($16)		# L : bytes 24..31
	nop				# E :
	beq	$1, $unroll_body

	/*
	 * Quad-at-a-time loop.  The count is biased by -8 while in here so
	 * that the sign of $18 answers whether another full quad remains.
	 */
$tail_quads:
$no_unroll:
	.align 4
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $less_than_8	# U : Nope
	nop				# E :
	nop				# E :

$move_a_quad:
	ldq	$1, 0($17)		# L : fetch 8
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store 8
	addq	$16, 8, $16		# E : dest += 8
	bge	$18, $move_a_quad	# U :
	nop				# E :

$less_than_8:
	.align 4
	addq	$18, 8, $18		# E : add back for trailing bytes
	ble	$18, $nomoredata	# U : All-done
	nop				# E :
	nop				# E :

	/* Trailing bytes */
$tail_bytes:
	subq	$18, 1, $18		# E : count--
	ldbu	$1, 0($17)		# L : fetch a byte
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($16)		# L : store a byte
	addq	$16, 1, $16		# E : dest++
	bgt	$18, $tail_bytes	# U : more to be done?
	nop				# E :

	/* branching to exit takes 3 extra cycles, so replicate exit here */
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	/*
	 * Source and dest differ mod 8.  $0 still holds the original dest,
	 * so it seeds $4, the running dest pointer for this path; $16 is
	 * free to be used as a data register below.
	 */
$misaligned:
	mov	$0, $4			# E : dest temp
	and	$0, 7, $1		# E : dest alignment mod8
	beq	$1, $dest_0mod8		# U : life doesnt totally suck
	nop

$aligndest:
	ble	$18, $nomoredata	# U :
	ldbu	$1, 0($17)		# L : fetch a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++

	stb	$1, 0($4)		# L : store it
	addq	$4, 1, $4		# E : dest++
	and	$4, 7, $1		# E : dest 0mod8 yet?
	bne	$1, $aligndest		# U : go until we are aligned.

	/* Source has unknown alignment, but dest is known to be 0mod8 */
$dest_0mod8:
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $misalign_tail	# U : Nope
	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
	nop				# E :

	/*
	 * $3 carries the quad loaded for 0($17); $16 receives the quad for
	 * 8($17).  extql/extqh take the wanted bytes from each and bis
	 * merges them into one quad, which is stored at the aligned dest.
	 * The newly loaded quad then becomes $3 for the next trip.
	 */
$mis_quad:
	ldq_u	$16, 8($17)		# L : Fetch next 8
	extql	$3, $17, $3		# U : masking
	extqh	$16, $17, $1		# U : masking
	bis	$3, $1, $1		# E : merged bytes to store

	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	stq	$1, 0($4)		# L : store 8 (aligned)
	mov	$16, $3			# E : "rotate" source data

	addq	$4, 8, $4		# E : dest += 8
	bge	$18, $mis_quad		# U : More quads to move
	nop
	nop

$misalign_tail:
	addq	$18, 8, $18		# E : account for tail stuff
	ble	$18, $nomoredata	# U :
	nop
	nop

$misalign_byte:
	ldbu	$1, 0($17)		# L : fetch 1
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($4)		# L : store
	addq	$4, 1, $4		# E : dest++
	bgt	$18, $misalign_byte	# U : more to go?
	nop


$nomoredata:
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	.end memcpy
	EXPORT_SYMBOL(memcpy)

/* For backwards module compatibility.  */
__memcpy = memcpy
.globl __memcpy