/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-stxncpy.S
 * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
 *
 * Copy no more than COUNT bytes of the null-terminated string from
 * SRC to DST.
 *
 * This is an internal routine used by strncpy, stpncpy, and strncat.
 * As such, it uses special linkage conventions to make implementation
 * of these public functions more efficient.
 *
 * On input:
 *	t9 = return address
 *	a0 = DST
 *	a1 = SRC
 *	a2 = COUNT
 *
 * Furthermore, COUNT may not be zero.
 *
 * On output:
 *	t0  = last word written
 *	t10 = bitmask (with one bit set) indicating the byte position of
 *	      the end of the range specified by COUNT
 *	t12 = bitmask (with one bit set) indicating the last byte written
 *	a0  = unaligned address of the last *word* written
 *	a2  = the number of full words left in COUNT
 *
 * Furthermore, v0, a3-a5, t11, and $at are untouched.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <asm/regdef.h>

	.set noat
	.set noreorder

	.text

/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
   doesn't like putting the entry point for a procedure somewhere in the
   middle of the procedure descriptor.  Work around this by putting the
   aligned copy in its own procedure descriptor */


	.ent stxncpy_aligned
	.align 4
stxncpy_aligned:
	.frame sp, 0, t9, 0
	.prologue 0

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word.  */

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U :
	ornot	t1, t2, t2	# E : (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t0	# E : (stall)
	beq	a2, $a_eoc	# U :

	bne	t8, $a_eos	# U :
	nop
	nop
	nop

	/* On entry to this basic block:
	   t0 == a source word not containing a null.  */

	/*
	 * nops here to:
	 *	separate store quads from load quads
	 *	limit of 1 bcond/quad to permit training
	 */
$a_loop:
	stq_u	t0, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	nop

	ldq_u	t0, 0(a1)	# L :
	addq	a1, 8, a1	# E :
	cmpbge	zero, t0, t8	# E :
	beq	a2, $a_eoc	# U :

	beq	t8, $a_loop	# U :
	nop
	nop
	nop

	/* Take care of the final (partial) word store.  At this point
	   the end-of-count bit is set in t8 iff it applies.

	   On entry to this basic block we have:
	   t0 == the source word containing the null
	   t8 == the cmpbge mask that found it.  */

$a_eos:
	negq	t8, t12		# E : find low bit set
	and	t8, t12, t12	# E : (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  */
	and	t12, 0x80, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t12, t6, t8	# E : (stall)
	zapnot	t0, t8, t0	# U : clear src bytes > null (stall)

	zap	t1, t8, t1	# U : clear dst bytes <= null
	or	t0, t1, t0	# E : (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Add the end-of-count bit to the eos detection bitmask.  */
$a_eoc:
	or	t10, t8, t8	# E :
	br	$a_eos		# L0 : Latency=3
	nop
	nop

	.end stxncpy_aligned

	.align 4
	.ent __stxncpy
	.globl __stxncpy
__stxncpy:
	.frame sp, 0, t9, 0
	.prologue 0

	/* Are source and destination co-aligned?  */
	xor	a0, a1, t1	# E :
	and	a0, 7, t0	# E : find dest misalignment
	and	t1, 7, t1	# E : (stall)
	addq	a2, t0, a2	# E : bias count by dest misalignment (stall)

	subq	a2, 1, a2	# E :
	and	a2, 7, t2	# E : (stall)
	srl	a2, 3, a2	# U : a2 = loop counter = (count - 1)/8 (stall)
	addq	zero, 1, t10	# E :

	sll	t10, t2, t10	# U : t10 = bitmask of last count byte
	bne	t1, $unaligned	# U :
	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)	# L : load first src word
	addq	a1, 8, a1	# E :

	beq	t0, stxncpy_aligned	# U : avoid loading dest word if not needed
	ldq_u	t0, 0(a0)	# L :
	nop
	nop

	br	stxncpy_aligned	# L0 : Latency=3
	nop
	nop
	nop



/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, unmasked
	   t1 == the shifted low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L : Latency=3 load second src word
	addq	a1, 8, a1	# E :
	mskql	t0, a0, t0	# U : mask trailing garbage in dst
	extqh	t2, a1, t4	# U : (3 cycle stall on t2)

	or	t1, t4, t1	# E : first aligned src word complete (stall)
	mskqh	t1, a0, t1	# U : mask leading garbage in src (stall)
	or	t0, t1, t0	# E : first output word complete (stall)
	or	t0, t6, t6	# E : mask original data for zero test (stall)

	cmpbge	zero, t6, t8	# E :
	beq	a2, $u_eocfin	# U :
	lda	t6, -1		# E :
	nop

	bne	t8, $u_final	# U :
	mskql	t6, a1, t6	# U : mask out bits already seen
	stq_u	t0, 0(a0)	# L : store first output word
	or	t6, t2, t2	# E : (stall)

	cmpbge	zero, t2, t8	# E : find nulls in second partial
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	bne	t8, $u_late_head_exit	# U :

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */
	extql	t2, a1, t1	# U : position hi-bits of lo word
	beq	a2, $u_eoc	# U :
	ldq_u	t2, 8(a1)	# L : read next high-order source word
	addq	a1, 8, a1	# E :

	extqh	t2, a1, t0	# U : position lo-bits of hi word (stall)
	cmpbge	zero, t2, t8	# E :
	nop
	bne	t8, $u_eos	# U :

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	.align 4
$u_loop:
	or	t0, t1, t0	# E : current dst word now complete
	subq	a2, 1, a2	# E : decrement word count
	extql	t2, a1, t1	# U : extract low bits for next time
	addq	a0, 8, a0	# E :

	stq_u	t0, -8(a0)	# L : save the current word
	beq	a2, $u_eoc	# U :
	ldq_u	t2, 8(a1)	# L : Latency=3 load high word for next time
	addq	a1, 8, a1	# E :

	extqh	t2, a1, t0	# U : extract low bits (2 cycle stall)
	cmpbge	zero, t2, t8	# E : test new word for eos
	nop
	beq	t8, $u_loop	# U :

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	or	t0, t1, t0	# E : first (partial) source word complete
	nop
	cmpbge	zero, t0, t8	# E : is the null in this first bit? (stall)
	bne	t8, $u_final	# U : (stall)

	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	nop

$u_late_head_exit:
	extql	t2, a1, t0	# U :
	cmpbge	zero, t0, t8	# E :
	or	t8, t10, t6	# E : (stall)
	cmoveq	a2, t6, t8	# E : Latency=2, extra map slot (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t0 == assembled source word
	   t8 == cmpbge mask that found the null.  */
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E : (stall)
	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	zapnot	t0, t8, t0	# U : kill source bytes > null

	zap	t1, t8, t1	# U : kill dest bytes <= null
	or	t0, t1, t0	# E : (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3

	/* Got to end-of-count before end of string.
	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word

	   The end-of-count bit (t10) is shifted left by the low three bits
	   of a1.  If it still falls inside the low byte, every byte that
	   COUNT allows is already present in t1 and the next source word
	   is not read.  */
$u_eoc:
	and	a1, 7, t6	# E : avoid final load if possible
	sll	t10, t6, t6	# U : (stall)
	and	t6, 0xff, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t2, 8(a1)	# L : load final src word
	nop
	extqh	t2, a1, t0	# U : extract low bits for last word (stall)
	or	t1, t0, t1	# E : (stall)

1:	cmpbge	zero, t1, t8	# E :
	mov	t1, t0		# E :

$u_eocfin:			# end-of-count, final word
	or	t10, t8, t8	# E :
	br	$u_final	# L0 : Latency=3

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	subq	a1, t4, a1	# E : sub dest misalignment from src addr

	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */

1:	cmplt	t4, t5, t12	# E :
	extql	t1, a1, t1	# U : shift src into place
	lda	t2, -1		# E : for creating masks later
	beq	t12, $u_head	# U : (stall)

	extql	t2, a1, t2	# U :
	cmpbge	zero, t1, t8	# E : is there a zero?
	andnot	t2, t6, t2	# E : dest mask for a single word copy
	or	t8, t10, t5	# E : test for end-of-count too

	cmpbge	zero, t2, t3	# E :
	cmoveq	a2, t5, t8	# E : Latency=2, extra map slot
	nop			# E : keep with cmoveq
	andnot	t8, t3, t8	# E : (stall)

	beq	t8, $u_head	# U :
	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */
	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	mskqh	t1, t4, t1	# U :

	and	t6, t8, t12	# E :
	subq	t12, 1, t6	# E : (stall)
	or	t6, t12, t8	# E : (stall)
	zapnot	t2, t8, t2	# U : trim dest mask to bytes <= zero (stall)

	zapnot	t1, t8, t1	# U : trim source word the same way
	andnot	t0, t2, t0	# E : zero place for source to reside
	or	t0, t1, t0	# E : and put it there (stall both t0, t1)
	stq_u	t0, 0(a0)	# L : (stall)

	ret	(t9)		# L0 : Latency=3
	nop
	nop
	nop

	.end __stxncpy