/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version  contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memset16 to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 *
 * The three entry points below share one body, differing only in how the
 * 64-bit store pattern in $17 is prepared:
 *	___memset		replicates the low byte of $17 into all 8 bytes
 *	__memset16		replicates the low 16 bits of $17 into all 4 words
 *	__constant_c_memset	stores $17 exactly as passed in
 * Labels carry a _b, _w or no suffix respectively.
 */
#include <linux/export.h>
	.set noat
	.set noreorder
.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset

	/*
	 * ___memset (also reachable as memset and __memset, see the
	 * aliases at the end of this file).
	 *
	 * In:	$16 = destination address
	 *	$17 = fill value; only the low 8 bits are used
	 *	$18 = byte count; nothing is stored when it is <= 0
	 * Out:	$0  = the destination address as passed in
	 * Returns through $26.  Overwrites $1-$7 and $16-$18.
	 */
	.ent ___memset
.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and	$17,255,$1		# E : 00000000000000ch
	insbl	$17,1,$2		# U : 000000000000ch00
	bis	$16,$16,$0		# E : return value
	ble	$18,end_b		# U : zero length requested?

	addq	$18,$16,$6		# E : max address to write to
	bis	$1,$2,$17		# E : 000000000000chch
	insbl	$1,2,$3			# U : 0000000000ch0000
	insbl	$1,3,$4			# U : 00000000ch000000

	or	$3,$4,$3		# E : 00000000chch0000
	inswl	$17,4,$5		# U : 0000chch00000000
	xor	$16,$6,$1		# E : will complete write be within one quadword?
	inswl	$17,6,$2		# U : chch000000000000

	or	$17,$3,$17		# E : 00000000chchchch
	or	$2,$5,$2		# E : chchchch00000000
	bic	$1,7,$1			# E : fit within a single quadword?
	and	$16,7,$3		# E : Target addr misalignment

	or	$17,$2,$17		# E : chchchchchchchch
	beq	$1,within_quad_b	# U :
	nop				# E :
	beq	$3,aligned_b		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u	$4,0($16)		# L : Fetch first partial
	bis	$16,$16,$5		# E : Save the address
	insql	$17,$16,$2		# U : Insert new bytes
	subq	$3,8,$3			# E : Invert (for addressing uses)

	addq	$18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql	$4,$16,$4		# U : clear relevant parts of the quad
	subq	$16,$3,$16		# E : $16 is new aligned destination
	bis	$2,$4,$1		# E : Final bytes

	nop
	stq_u	$1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra	$18,3,$3		# U : Number of remaining quads to write
	and	$18,7,$18		# E : Number of trailing bytes to write
	bis	$16,$16,$5		# E : Save dest address
	beq	$3,no_quad_b		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number of trailer bytes
	 * $3	Number of quads to write
	 */

	and	$16, 0x3f, $2		# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4		# E : Only try to unroll if >= 128 bytes
	subq	$2, 0x40, $1		# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b		# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE: $16 is quad aligned here, so $1 = ($16 & 0x3f) - 0x40 lies
	 * in [-64, -8] and the beq below is never taken.  A destination
	 * that is already 0mod64 therefore stores 8 single quads first;
	 * that pass is also what loads $4 with the first wh64 address.
	 */

	nop				# E :
	nop				# E :
	nop				# E :
	beq	$1, $bigalign_b		# U :

$alignmod64_b:
	stq	$17, 0($5)		# L :
	subq	$3, 1, $3		# E : For consistency later
	addq	$1, 8, $1		# E : Increment towards zero for alignment
	addq	$5, 8, $4		# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5		# E : Inc address
	blt	$1, $alignmod64_b	# U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64	($4)			# L1 : memory subsystem write hint
	subq	$3, 24, $2		# E : For determining future wh64 addresses
	stq	$17, 0($5)		# L :
	nop				# E :

	addq	$5, 128, $4		# E : speculative target of next wh64
	stq	$17, 8($5)		# L :
	stq	$17, 16($5)		# L :
	addq	$5, 64, $7		# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)		# L :
	stq	$17, 32($5)		# L :
	cmovlt	$2, $7, $4		# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)		# L :
	stq	$17, 48($5)		# L :
	subq	$3, 16, $2		# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)		# L :
	addq	$5, 64, $5		# E :
	subq	$3, 8, $3		# E :
	bge	$2, $do_wh64_b		# U :

	nop
	nop
	nop
	beq	$3, no_quad_b		# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq	$17,0($5)		# L :
	subq	$3,1,$3			# E : Decrement number quads left
	addq	$5,8,$5			# E : Inc address
	bne	$3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop				# E :
	beq	$18,end_b		# U : All done?
	ldq	$7,0($5)		# L :
	mskqh	$7,$6,$2		# U : Mask final quad

	insqh	$17,$6,$4		# U : New bits
	bis	$2,$4,$1		# E : Put it all together
	stq	$1,0($5)		# L : And back to memory
	ret	$31,($26),1		# L0 :

	/*
	 * The whole write lies inside one quadword: merge the new bytes
	 * between offsets ($16 & 7) and ($6 & 7) into the existing quad.
	 */
within_quad_b:
	ldq_u	$1,0($16)		# L :
	insql	$17,$16,$2		# U : New bits
	mskql	$1,$16,$4		# U : Clear old
	bis	$2,$4,$2		# E : New result

	mskql	$2,$6,$4		# U :
	mskqh	$1,$6,$2		# U :
	bis	$2,$4,$1		# E :
	stq_u	$1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret	$31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)

	/*
	 * This is the original body of code, prior to replication and
	 * rescheduling.  Leave it here, as there may be calls to this
	 * entry point.
	 *
	 * In:	$16 = destination address
	 *	$17 = 64-bit pattern, stored as passed in (no replication
	 *	      is done here)
	 *	$18 = byte count; nothing is stored when it is <= 0
	 * Out:	$0  = the destination address as passed in
	 * Returns through $26.  Overwrites $1-$7, $16 and $18.
	 */
.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq	$18,$16,$6		# E : max address to write to
	bis	$16,$16,$0		# E : return value
	xor	$16,$6,$1		# E : will complete write be within one quadword?
	ble	$18,end			# U : zero length requested?

	bic	$1,7,$1			# E : fit within a single quadword
	beq	$1,within_one_quad	# U :
	and	$16,7,$3		# E : Target addr misalignment
	beq	$3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u	$4,0($16)		# L : Fetch first partial
	bis	$16,$16,$5		# E : Save the address
	insql	$17,$16,$2		# U : Insert new bytes
	subq	$3,8,$3			# E : Invert (for addressing uses)

	addq	$18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql	$4,$16,$4		# U : clear relevant parts of the quad
	subq	$16,$3,$16		# E : $16 is new aligned destination
	bis	$2,$4,$1		# E : Final bytes

	nop
	stq_u	$1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra	$18,3,$3		# U : Number of remaining quads to write
	and	$18,7,$18		# E : Number of trailing bytes to write
	bis	$16,$16,$5		# E : Save dest address
	beq	$3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number of trailer bytes
	 * $3	Number of quads to write
	 */

	and	$16, 0x3f, $2		# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4		# E : Only try to unroll if >= 128 bytes
	subq	$2, 0x40, $1		# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop		# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE: $16 is quad aligned here, so $1 = ($16 & 0x3f) - 0x40 lies
	 * in [-64, -8] and the beq below is never taken.  A destination
	 * that is already 0mod64 therefore stores 8 single quads first;
	 * that pass is also what loads $4 with the first wh64 address.
	 */

	nop				# E :
	nop				# E :
	nop				# E :
	beq	$1, $bigalign		# U :

$alignmod64:
	stq	$17, 0($5)		# L :
	subq	$3, 1, $3		# E : For consistency later
	addq	$1, 8, $1		# E : Increment towards zero for alignment
	addq	$5, 8, $4		# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5		# E : Inc address
	blt	$1, $alignmod64		# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64	($4)			# L1 : memory subsystem write hint
	subq	$3, 24, $2		# E : For determining future wh64 addresses
	stq	$17, 0($5)		# L :
	nop				# E :

	addq	$5, 128, $4		# E : speculative target of next wh64
	stq	$17, 8($5)		# L :
	stq	$17, 16($5)		# L :
	addq	$5, 64, $7		# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)		# L :
	stq	$17, 32($5)		# L :
	cmovlt	$2, $7, $4		# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)		# L :
	stq	$17, 48($5)		# L :
	subq	$3, 16, $2		# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)		# L :
	addq	$5, 64, $5		# E :
	subq	$3, 8, $3		# E :
	bge	$2, $do_wh64		# U :

	nop
	nop
	nop
	beq	$3, no_quad		# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq	$17,0($5)		# L :
	subq	$3,1,$3			# E : Decrement number quads left
	addq	$5,8,$5			# E : Inc address
	bne	$3,loop			# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop				# E :
	beq	$18,end			# U : All done?
	ldq	$7,0($5)		# L :
	mskqh	$7,$6,$2		# U : Mask final quad

	insqh	$17,$6,$4		# U : New bits
	bis	$2,$4,$1		# E : Put it all together
	stq	$1,0($5)		# L : And back to memory
	ret	$31,($26),1		# L0 :

	/*
	 * The whole write lies inside one quadword: merge the new bytes
	 * between offsets ($16 & 7) and ($6 & 7) into the existing quad.
	 */
within_one_quad:
	ldq_u	$1,0($16)		# L :
	insql	$17,$16,$2		# U : New bits
	mskql	$1,$16,$4		# U : Clear old
	bis	$2,$4,$2		# E : New result

	mskql	$2,$6,$4		# U :
	mskqh	$1,$6,$2		# U :
	bis	$2,$4,$1		# E :
	stq_u	$1,0($16)		# L :

end:
	nop
	nop
	nop
	ret	$31,($26),1		# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)

	/*
	 * This is a replica of the __constant_c_memset code, rescheduled
	 * to mask stalls.  Note that entry point names also had to change
	 *
	 * In:	$16 = destination address
	 *	$17 = fill value; the low 16 bits are replicated into all
	 *	      four words of the store pattern
	 *	$18 = count in bytes (it is added to $16 to form the end
	 *	      address); nothing is stored when it is <= 0
	 * Out:	$0  = the destination address as passed in
	 * Returns through $26.  Overwrites $1-$7 and $16-$18.
	 */
	.align 5
	.ent __memset16

__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl	$17,0,$5		# U : 000000000000c1c2
	inswl	$17,2,$2		# U : 00000000c1c20000
	bis	$16,$16,$0		# E : return value
	addq	$18,$16,$6		# E : max address to write to

	ble	$18, end_w		# U : zero length requested?
	inswl	$17,4,$3		# U : 0000c1c200000000
	inswl	$17,6,$4		# U : c1c2000000000000
	xor	$16,$6,$1		# E : will complete write be within one quadword?

	or	$2,$5,$2		# E : 00000000c1c2c1c2
	or	$3,$4,$17		# E : c1c2c1c200000000
	bic	$1,7,$1			# E : fit within a single quadword
	and	$16,7,$3		# E : Target addr misalignment

	or	$17,$2,$17		# E : c1c2c1c2c1c2c1c2
	beq	$1,within_quad_w	# U :
	nop
	beq	$3,aligned_w		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u	$4,0($16)		# L : Fetch first partial
	bis	$16,$16,$5		# E : Save the address
	insql	$17,$16,$2		# U : Insert new bytes
	subq	$3,8,$3			# E : Invert (for addressing uses)

	addq	$18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql	$4,$16,$4		# U : clear relevant parts of the quad
	subq	$16,$3,$16		# E : $16 is new aligned destination
	bis	$2,$4,$1		# E : Final bytes

	nop
	stq_u	$1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra	$18,3,$3		# U : Number of remaining quads to write
	and	$18,7,$18		# E : Number of trailing bytes to write
	bis	$16,$16,$5		# E : Save dest address
	beq	$3,no_quad_w		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number of trailer bytes
	 * $3	Number of quads to write
	 */

	and	$16, 0x3f, $2		# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4		# E : Only try to unroll if >= 128 bytes
	subq	$2, 0x40, $1		# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_w		# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE: $16 is quad aligned here, so $1 = ($16 & 0x3f) - 0x40 lies
	 * in [-64, -8] and the beq below is never taken.  A destination
	 * that is already 0mod64 therefore stores 8 single quads first;
	 * that pass is also what loads $4 with the first wh64 address.
	 */

	nop				# E :
	nop				# E :
	nop				# E :
	beq	$1, $bigalign_w		# U :

$alignmod64_w:
	stq	$17, 0($5)		# L :
	subq	$3, 1, $3		# E : For consistency later
	addq	$1, 8, $1		# E : Increment towards zero for alignment
	addq	$5, 8, $4		# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5		# E : Inc address
	blt	$1, $alignmod64_w	# U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_w:
	wh64	($4)			# L1 : memory subsystem write hint
	subq	$3, 24, $2		# E : For determining future wh64 addresses
	stq	$17, 0($5)		# L :
	nop				# E :

	addq	$5, 128, $4		# E : speculative target of next wh64
	stq	$17, 8($5)		# L :
	stq	$17, 16($5)		# L :
	addq	$5, 64, $7		# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)		# L :
	stq	$17, 32($5)		# L :
	cmovlt	$2, $7, $4		# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)		# L :
	stq	$17, 48($5)		# L :
	subq	$3, 16, $2		# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)		# L :
	addq	$5, 64, $5		# E :
	subq	$3, 8, $3		# E :
	bge	$2, $do_wh64_w		# U :

	nop
	nop
	nop
	beq	$3, no_quad_w		# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq	$17,0($5)		# L :
	subq	$3,1,$3			# E : Decrement number quads left
	addq	$5,8,$5			# E : Inc address
	bne	$3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop				# E :
	beq	$18,end_w		# U : All done?
	ldq	$7,0($5)		# L :
	mskqh	$7,$6,$2		# U : Mask final quad

	insqh	$17,$6,$4		# U : New bits
	bis	$2,$4,$1		# E : Put it all together
	stq	$1,0($5)		# L : And back to memory
	ret	$31,($26),1		# L0 :

	/*
	 * The whole write lies inside one quadword: merge the new bytes
	 * between offsets ($16 & 7) and ($6 & 7) into the existing quad.
	 */
within_quad_w:
	ldq_u	$1,0($16)		# L :
	insql	$17,$16,$2		# U : New bits
	mskql	$1,$16,$4		# U : Clear old
	bis	$2,$4,$2		# E : New result

	mskql	$2,$6,$4		# U :
	mskqh	$1,$6,$2		# U :
	bis	$2,$4,$1		# E :
	stq_u	$1,0($16)		# L :

end_w:
	nop
	nop
	nop
	ret	$31,($26),1		# L0 :

	.end __memset16
	EXPORT_SYMBOL(__memset16)

/* memset and __memset are plain aliases for the byte-fill entry point. */
memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)