/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-stxcpy.S
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Copy a null-terminated string from SRC to DST.
 *
 * This is an internal routine used by strcpy, stpcpy, and strcat.
 * As such, it uses special linkage conventions to make implementation
 * of these public functions more efficient.
 *
 * On input:
 *	t9 = return address
 *	a0 = DST
 *	a1 = SRC
 *
 * On output:
 *	t12 = bitmask (with one bit set) indicating the last byte written
 *	a0  = unaligned address of the last *word* written
 *
 * Furthermore, v0, a3-a5, and t11 are untouched.
 * The routine writes t0-t6, t8, t12, a0 and a1.  Note that t12 is both
 * used as scratch on the unaligned path and holds the result bitmask on
 * return, so callers must not expect it to be preserved.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <asm/regdef.h>

	.set noat
	.set noreorder

	.text

/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
   doesn't like putting the entry point for a procedure somewhere in the
   middle of the procedure descriptor.  Work around this by putting the
   aligned copy in its own procedure descriptor */


	.ent stxcpy_aligned
	.align 4
stxcpy_aligned:
	.frame sp, 0, t9
	.prologue 0

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word.  */

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U :
	ornot	t1, t2, t2	# E : (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t1	# E : (stall)
	bne	t8, $a_eos	# U : (stall)

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == a source word not containing a null.  */
	/* Nops here to separate store quads from load quads */

$a_loop:
	stq_u	t1, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	nop
	nop

	ldq_u	t1, 0(a1)	# L : Latency=3
	addq	a1, 8, a1	# E :
	cmpbge	zero, t1, t8	# E : (3 cycle stall)
	beq	t8, $a_loop	# U : (stall for t8)

	/* Take care of the final (partial) word store.
	   On entry to this basic block we have:
	   t1 == the source word containing the null
	   t8 == the cmpbge mask that found it.  */
$a_eos:
	negq	t8, t6		# E : find low bit set
	and	t8, t6, t12	# E : t12 = bit of the first null byte (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it: a null in byte 7 (bit 0x80)
	   means the whole word is overwritten.  */
	and	t12, 0x80, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t0, 0(a0)	# L : Latency=3
	subq	t12, 1, t6	# E : t6 = byte mask of bytes below the null
	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
	or	t12, t6, t8	# E : t8 = byte mask of bytes <= null (stall)

	zap	t0, t8, t0	# E : clear dst bytes <= null
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	.end stxcpy_aligned

	.align 4
	.ent __stxcpy
	.globl __stxcpy
__stxcpy:
	.frame sp, 0, t9
	.prologue 0

	/* Are source and destination co-aligned?  */
	xor	a0, a1, t0	# E :
	unop			# E :
	and	t0, 7, t0	# E : (stall)
	bne	t0, $unaligned	# U : (stall)

	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)		# L : load first src word
	and	a0, 7, t0		# E : take care not to load a word ...
	addq	a1, 8, a1		# E :
	beq	t0, stxcpy_aligned	# U : ... if it is not needed (stall)

	ldq_u	t0, 0(a0)	# L :
	br	stxcpy_aligned	# L0 : Latency=3
	nop
	nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, for masking back in, if needed else 0
	   t1 == the low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L :
	addq	a1, 8, a1	# E :
	extql	t1, a1, t1	# U : (stall on a1)
	extqh	t2, a1, t4	# U : (stall on a1)

	mskql	t0, a0, t0	# U :
	or	t1, t4, t1	# E :
	mskqh	t1, a0, t1	# U : (stall on t1)
	or	t0, t1, t1	# E : (stall on t1)

	or	t1, t6, t6	# E :
	cmpbge	zero, t6, t8	# E : (stall)
	lda	t6, -1		# E : for masking just below
	bne	t8, $u_final	# U : (stall)

	mskql	t6, a1, t6		# U : mask out the bits we have
	or	t6, t2, t2		# E :   already extracted before (stall)
	cmpbge	zero, t2, t8		# E :   testing eos (stall)
	bne	t8, $u_late_head_exit	# U : (stall)

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	stq_u	t1, 0(a0)	# L : store first output word
	addq	a0, 8, a0	# E :
	extql	t2, a1, t0	# U : position ho-bits of lo word
	ldq_u	t2, 8(a1)	# L : read next high-order source word

	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t8	# E : (stall for t2)
	nop			# E :
	bne	t8, $u_eos	# U : (stall)

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	.align 3
$u_loop:
	extqh	t2, a1, t1	# U : extract high bits for current word
	addq	a1, 8, a1	# E : (stall)
	extql	t2, a1, t3	# U : extract low bits for next time (stall)
	addq	a0, 8, a0	# E :

	or	t0, t1, t1	# E : current dst word now complete
	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
	stq_u	t1, -8(a0)	# L : save the current word (stall)
	mov	t3, t0		# E :

	cmpbge	zero, t2, t8	# E : test new word for eos
	beq	t8, $u_loop	# U : (stall)
	nop
	nop

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	extqh	t2, a1, t1	# U :
	or	t0, t1, t1	# E : first (partial) source word complete (stall)
	cmpbge	zero, t1, t8	# E : is the null in this first bit? (stall)
	bne	t8, $u_final	# U : (stall)

$u_late_head_exit:
	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	extql	t2, a1, t1	# U :
	cmpbge	zero, t1, t8	# E : (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t1 == assembled source word
	   t8 == cmpbge mask that found the null.  */
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E : t12 = bit of the first null byte (stall)
	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t0, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)

	zap	t0, t8, t0	# U : kill dest bytes <= null (2 cycle data stall)
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	nop
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr
	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
	cmplt	t4, t5, t12	# E : t12 is only scratch here
	beq	t12, $u_head	# U :
	lda	t2, -1		# E : mask out leading garbage in source

	mskqh	t2, t5, t2	# U :
	ornot	t1, t2, t3	# E : (stall)
	cmpbge	zero, t3, t8	# E : is there a zero? (stall)
	beq	t8, $u_head	# U : (stall)

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */

	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	and	t6, t8, t12	# E : (stall)
	and	a1, 7, t5	# E :

	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	srl	t12, t5, t12	# U : adjust final null return value
	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)

	and	t1, t2, t1	# E : to source validity mask
	extql	t2, a1, t2	# U :
	extql	t1, a1, t1	# U : (stall)
	andnot	t0, t2, t0	# E : zero place for source to reside (stall)

	or	t0, t1, t1	# E : and put it there
	stq_u	t1, 0(a0)	# L : (stall)
	ret	(t9)		# L0 :
	nop

	.end __stxcpy