xref: /openbmc/linux/arch/alpha/lib/ev6-stxncpy.S (revision b2441318)
1b2441318SGreg Kroah-Hartman/* SPDX-License-Identifier: GPL-2.0 */
21da177e4SLinus Torvalds/*
31da177e4SLinus Torvalds * arch/alpha/lib/ev6-stxncpy.S
41da177e4SLinus Torvalds * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
51da177e4SLinus Torvalds *
61da177e4SLinus Torvalds * Copy no more than COUNT bytes of the null-terminated string from
71da177e4SLinus Torvalds * SRC to DST.
81da177e4SLinus Torvalds *
91da177e4SLinus Torvalds * This is an internal routine used by strncpy, stpncpy, and strncat.
101da177e4SLinus Torvalds * As such, it uses special linkage conventions to make implementation
111da177e4SLinus Torvalds * of these public functions more efficient.
121da177e4SLinus Torvalds *
131da177e4SLinus Torvalds * On input:
141da177e4SLinus Torvalds *	t9 = return address
151da177e4SLinus Torvalds *	a0 = DST
161da177e4SLinus Torvalds *	a1 = SRC
171da177e4SLinus Torvalds *	a2 = COUNT
181da177e4SLinus Torvalds *
191da177e4SLinus Torvalds * Furthermore, COUNT may not be zero.
201da177e4SLinus Torvalds *
211da177e4SLinus Torvalds * On output:
221da177e4SLinus Torvalds *	t0  = last word written
231da177e4SLinus Torvalds *	t10 = bitmask (with one bit set) indicating the byte position of
241da177e4SLinus Torvalds *	      the end of the range specified by COUNT
251da177e4SLinus Torvalds *	t12 = bitmask (with one bit set) indicating the last byte written
261da177e4SLinus Torvalds *	a0  = unaligned address of the last *word* written
271da177e4SLinus Torvalds *	a2  = the number of full words left in COUNT
281da177e4SLinus Torvalds *
291da177e4SLinus Torvalds * Furthermore, v0, a3-a5, t11, and $at are untouched.
301da177e4SLinus Torvalds *
311da177e4SLinus Torvalds * Much of the information about 21264 scheduling/coding comes from:
321da177e4SLinus Torvalds *	Compiler Writer's Guide for the Alpha 21264
331da177e4SLinus Torvalds *	abbreviated as 'CWG' in other comments here
341da177e4SLinus Torvalds *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
351da177e4SLinus Torvalds * Scheduling notation:
361da177e4SLinus Torvalds *	E	- either cluster
371da177e4SLinus Torvalds *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
381da177e4SLinus Torvalds *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
391da177e4SLinus Torvalds * Try not to change the actual algorithm if possible for consistency.
401da177e4SLinus Torvalds */
411da177e4SLinus Torvalds
421da177e4SLinus Torvalds#include <asm/regdef.h>
431da177e4SLinus Torvalds
441da177e4SLinus Torvalds	.set noat
451da177e4SLinus Torvalds	.set noreorder
461da177e4SLinus Torvalds
471da177e4SLinus Torvalds	.text
481da177e4SLinus Torvalds
491da177e4SLinus Torvalds/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
501da177e4SLinus Torvalds   doesn't like putting the entry point for a procedure somewhere in the
511da177e4SLinus Torvalds   middle of the procedure descriptor.  Work around this by putting the
521da177e4SLinus Torvalds   aligned copy in its own procedure descriptor */
531da177e4SLinus Torvalds
541da177e4SLinus Torvalds
551da177e4SLinus Torvalds	.ent stxncpy_aligned
561da177e4SLinus Torvalds	.align 4
571da177e4SLinus Torvaldsstxncpy_aligned:
581da177e4SLinus Torvalds	.frame sp, 0, t9, 0
591da177e4SLinus Torvalds	.prologue 0
601da177e4SLinus Torvalds
	/* stxncpy_aligned: copy path used when SRC and DST have the same
	   offset within a quadword.  It is entered by branch from
	   __stxncpy with a2 holding the word loop counter and t10 the
	   end-of-count byte bitmask, and it returns through t9.  */

611da177e4SLinus Torvalds	/* On entry to this basic block:
621da177e4SLinus Torvalds	   t0 == the first destination word for masking back in
631da177e4SLinus Torvalds	   t1 == the first source word.  */
641da177e4SLinus Torvalds
651da177e4SLinus Torvalds	/* Create the 1st output word and detect 0's in the 1st input word.  */
661da177e4SLinus Torvalds	lda	t2, -1		# E : build a mask against false zero
671da177e4SLinus Torvalds	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
681da177e4SLinus Torvalds	mskqh	t1, a1, t3	# U : t3 = src word with bytes below the offset cleared
691da177e4SLinus Torvalds	ornot	t1, t2, t2	# E : force bytes below the offset non-zero (stall)
701da177e4SLinus Torvalds
711da177e4SLinus Torvalds	mskql	t0, a1, t0	# U : assemble the first output word
721da177e4SLinus Torvalds	cmpbge	zero, t2, t8	# E : bits set iff null found
731da177e4SLinus Torvalds	or	t0, t3, t0	# E : (stall)
741da177e4SLinus Torvalds	beq	a2, $a_eoc	# U : count ends within this word
751da177e4SLinus Torvalds
761da177e4SLinus Torvalds	bne	t8, $a_eos	# U : null found in the first word
771da177e4SLinus Torvalds	nop
781da177e4SLinus Torvalds	nop
791da177e4SLinus Torvalds	nop
801da177e4SLinus Torvalds
811da177e4SLinus Torvalds	/* On entry to this basic block:
821da177e4SLinus Torvalds	   t0 == a source word not containing a null.  */
831da177e4SLinus Torvalds
841da177e4SLinus Torvalds	/*
851da177e4SLinus Torvalds	 * nops here to:
861da177e4SLinus Torvalds	 *	separate store quads from load quads
871da177e4SLinus Torvalds	 *	limit of 1 bcond/quad to permit training
881da177e4SLinus Torvalds	 */
891da177e4SLinus Torvalds$a_loop:
901da177e4SLinus Torvalds	stq_u	t0, 0(a0)	# L : store the current output word
911da177e4SLinus Torvalds	addq	a0, 8, a0	# E :
921da177e4SLinus Torvalds	subq	a2, 1, a2	# E : one fewer word left in COUNT
931da177e4SLinus Torvalds	nop
941da177e4SLinus Torvalds
951da177e4SLinus Torvalds	ldq_u	t0, 0(a1)	# L : fetch the next source word
961da177e4SLinus Torvalds	addq	a1, 8, a1	# E :
971da177e4SLinus Torvalds	cmpbge	zero, t0, t8	# E : bits set iff null found
981da177e4SLinus Torvalds	beq	a2, $a_eoc      # U : count ends within this word
991da177e4SLinus Torvalds
1001da177e4SLinus Torvalds	beq	t8, $a_loop	# U : no null yet, keep copying
1011da177e4SLinus Torvalds	nop
1021da177e4SLinus Torvalds	nop
1031da177e4SLinus Torvalds	nop
1041da177e4SLinus Torvalds
1051da177e4SLinus Torvalds	/* Take care of the final (partial) word store.  At this point
1061da177e4SLinus Torvalds	   the end-of-count bit is set in t8 iff it applies.
1071da177e4SLinus Torvalds
1081da177e4SLinus Torvalds	   On entry to this basic block we have:
1091da177e4SLinus Torvalds	   t0 == the source word containing the null
1101da177e4SLinus Torvalds	   t8 == the cmpbge mask that found it.  */
1111da177e4SLinus Torvalds
1121da177e4SLinus Torvalds$a_eos:
1131da177e4SLinus Torvalds	negq	t8, t12		# E : find low bit set
1141da177e4SLinus Torvalds	and	t8, t12, t12	# E : (stall)
1151da177e4SLinus Torvalds	/* For the sake of the cache, don't read a destination word
1161da177e4SLinus Torvalds	   if we're not going to need it.  */
1171da177e4SLinus Torvalds	and	t12, 0x80, t6	# E : is the last byte to write byte 7? (stall)
1181da177e4SLinus Torvalds	bne	t6, 1f		# U : yes, the whole word is stored (stall)
1191da177e4SLinus Torvalds
1201da177e4SLinus Torvalds	/* We're doing a partial word store and so need to combine
1211da177e4SLinus Torvalds	   our source and original destination words.  */
1221da177e4SLinus Torvalds	ldq_u	t1, 0(a0)	# L :
1231da177e4SLinus Torvalds	subq	t12, 1, t6	# E :
1241da177e4SLinus Torvalds	or	t12, t6, t8	# E : t8 = mask of bytes <= last byte written (stall)
1251da177e4SLinus Torvalds	zapnot	t0, t8, t0	# U : clear src bytes > null (stall)
1261da177e4SLinus Torvalds
1271da177e4SLinus Torvalds	zap	t1, t8, t1	# U : clear dst bytes <= null
1281da177e4SLinus Torvalds	or	t0, t1, t0	# E : (stall)
1291da177e4SLinus Torvalds	nop
1301da177e4SLinus Torvalds	nop
1311da177e4SLinus Torvalds
1321da177e4SLinus Torvalds1:	stq_u	t0, 0(a0)	# L :
1331da177e4SLinus Torvalds	ret	(t9)		# L0 : Latency=3
1341da177e4SLinus Torvalds	nop
1351da177e4SLinus Torvalds	nop
1361da177e4SLinus Torvalds
1371da177e4SLinus Torvalds	/* Add the end-of-count bit to the eos detection bitmask.  */
1381da177e4SLinus Torvalds$a_eoc:
1391da177e4SLinus Torvalds	or	t10, t8, t8	# E :
1401da177e4SLinus Torvalds	br	$a_eos		# L0 : Latency=3
1411da177e4SLinus Torvalds	nop
1421da177e4SLinus Torvalds	nop
1431da177e4SLinus Torvalds
1441da177e4SLinus Torvalds	.end stxncpy_aligned
1451da177e4SLinus Torvalds
1461da177e4SLinus Torvalds	.align 4
1471da177e4SLinus Torvalds	.ent __stxncpy
1481da177e4SLinus Torvalds	.globl __stxncpy
1491da177e4SLinus Torvalds__stxncpy:
1501da177e4SLinus Torvalds	.frame sp, 0, t9, 0
1511da177e4SLinus Torvalds	.prologue 0
1521da177e4SLinus Torvalds
	/* __stxncpy: entry point.  Biases COUNT by the destination
	   misalignment, derives the word loop counter (a2) and the
	   end-of-count byte bitmask (t10), then dispatches to
	   stxncpy_aligned when SRC and DST are co-aligned, or to
	   $unaligned otherwise.  */

1531da177e4SLinus Torvalds	/* Are source and destination co-aligned?  */
1541da177e4SLinus Torvalds	xor	a0, a1, t1	# E :
1551da177e4SLinus Torvalds	and	a0, 7, t0	# E : find dest misalignment
1561da177e4SLinus Torvalds	and	t1, 7, t1	# E : (stall)
1571da177e4SLinus Torvalds	addq	a2, t0, a2	# E : bias count by dest misalignment (stall)
1581da177e4SLinus Torvalds
1591da177e4SLinus Torvalds	subq	a2, 1, a2	# E :
1601da177e4SLinus Torvalds	and	a2, 7, t2	# E : (stall)
1611da177e4SLinus Torvalds	srl	a2, 3, a2	# U : a2 = loop counter = (biased count - 1)/8 (stall)
1621da177e4SLinus Torvalds	addq	zero, 1, t10	# E :
1631da177e4SLinus Torvalds
1641da177e4SLinus Torvalds	sll	t10, t2, t10	# U : t10 = bitmask of last count byte
1651da177e4SLinus Torvalds	bne	t1, $unaligned	# U : low address bits of src and dst differ
1661da177e4SLinus Torvalds	/* We are co-aligned; take care of a partial first word.  */
1671da177e4SLinus Torvalds	ldq_u	t1, 0(a1)	# L : load first src word
1681da177e4SLinus Torvalds	addq	a1, 8, a1	# E :
1691da177e4SLinus Torvalds
1701da177e4SLinus Torvalds	beq	t0, stxncpy_aligned     # U : avoid loading dest word if not needed
1711da177e4SLinus Torvalds	ldq_u	t0, 0(a0)	# L :
1721da177e4SLinus Torvalds	nop
1731da177e4SLinus Torvalds	nop
1741da177e4SLinus Torvalds
1751da177e4SLinus Torvalds	br	stxncpy_aligned	# L0 : Latency=3
1761da177e4SLinus Torvalds	nop
1771da177e4SLinus Torvalds	nop
1781da177e4SLinus Torvalds	nop
1791da177e4SLinus Torvalds
1801da177e4SLinus Torvalds
1811da177e4SLinus Torvalds
1821da177e4SLinus Torvalds/* The source and destination are not co-aligned.  Align the destination
1831da177e4SLinus Torvalds   and cope.  We have to be very careful about not reading too much and
1841da177e4SLinus Torvalds   causing a SEGV.  */
1851da177e4SLinus Torvalds
1861da177e4SLinus Torvalds	.align 4
1871da177e4SLinus Torvalds$u_head:
1881da177e4SLinus Torvalds	/* We know just enough now to be able to assemble the first
1891da177e4SLinus Torvalds	   full source word.  We can still find a zero at the end of it
1901da177e4SLinus Torvalds	   that prevents us from outputting the whole thing.
1911da177e4SLinus Torvalds
1921da177e4SLinus Torvalds	   On entry to this basic block:
1931da177e4SLinus Torvalds	   t0 == the first dest word, unmasked
1941da177e4SLinus Torvalds	   t1 == the shifted low bits of the first source word
1951da177e4SLinus Torvalds	   t6 == bytemask that is -1 in dest word bytes */
1961da177e4SLinus Torvalds
1971da177e4SLinus Torvalds	ldq_u	t2, 8(a1)	# L : Latency=3 load second src word
1981da177e4SLinus Torvalds	addq	a1, 8, a1	# E :
1991da177e4SLinus Torvalds	mskql	t0, a0, t0	# U : mask trailing garbage in dst
2001da177e4SLinus Torvalds	extqh	t2, a1, t4	# U : (3 cycle stall on t2)
2011da177e4SLinus Torvalds
2021da177e4SLinus Torvalds	or	t1, t4, t1	# E : first aligned src word complete (stall)
2031da177e4SLinus Torvalds	mskqh	t1, a0, t1	# U : mask leading garbage in src (stall)
2041da177e4SLinus Torvalds	or	t0, t1, t0	# E : first output word complete (stall)
2051da177e4SLinus Torvalds	or	t0, t6, t6	# E : mask original data for zero test (stall)
2061da177e4SLinus Torvalds
2071da177e4SLinus Torvalds	cmpbge	zero, t6, t8	# E :
2081da177e4SLinus Torvalds	beq	a2, $u_eocfin	# U : count ends within this word
2091da177e4SLinus Torvalds	lda	t6, -1		# E :
2101da177e4SLinus Torvalds	nop
2111da177e4SLinus Torvalds
2121da177e4SLinus Torvalds	bne	t8, $u_final	# U : null found in the first output word
2131da177e4SLinus Torvalds	mskql	t6, a1, t6	# U : mask out bits already seen
2141da177e4SLinus Torvalds	stq_u	t0, 0(a0)	# L : store first output word
2151da177e4SLinus Torvalds	or      t6, t2, t2	# E : (stall)
2161da177e4SLinus Torvalds
2171da177e4SLinus Torvalds	cmpbge	zero, t2, t8	# E : find nulls in second partial
2181da177e4SLinus Torvalds	addq	a0, 8, a0	# E :
2191da177e4SLinus Torvalds	subq	a2, 1, a2	# E :
2201da177e4SLinus Torvalds	bne	t8, $u_late_head_exit	# U :
2211da177e4SLinus Torvalds
2221da177e4SLinus Torvalds	/* Finally, we've got all the stupid leading edge cases taken care
2231da177e4SLinus Torvalds	   of and we can set up to enter the main loop.  */
2241da177e4SLinus Torvalds	extql	t2, a1, t1	# U : position hi-bits of lo word
2251da177e4SLinus Torvalds	beq	a2, $u_eoc	# U :
2261da177e4SLinus Torvalds	ldq_u	t2, 8(a1)	# L : read next high-order source word
2271da177e4SLinus Torvalds	addq	a1, 8, a1	# E :
2281da177e4SLinus Torvalds
2291da177e4SLinus Torvalds	extqh	t2, a1, t0	# U : position lo-bits of hi word (stall)
2301da177e4SLinus Torvalds	cmpbge	zero, t2, t8	# E :
2311da177e4SLinus Torvalds	nop
2321da177e4SLinus Torvalds	bne	t8, $u_eos	# U :
2331da177e4SLinus Torvalds
2341da177e4SLinus Torvalds	/* Unaligned copy main loop.  In order to avoid reading too much,
2351da177e4SLinus Torvalds	   the loop is structured to detect zeros in aligned source words.
2361da177e4SLinus Torvalds	   This has, unfortunately, effectively pulled half of a loop
2371da177e4SLinus Torvalds	   iteration out into the head and half into the tail, but it does
2381da177e4SLinus Torvalds	   prevent nastiness from accumulating in the very thing we want
2391da177e4SLinus Torvalds	   to run as fast as possible.
2401da177e4SLinus Torvalds
2411da177e4SLinus Torvalds	   On entry to this basic block:
2421da177e4SLinus Torvalds	   t0 == the shifted low-order bits from the current source word
2431da177e4SLinus Torvalds	   t1 == the shifted high-order bits from the previous source word
2441da177e4SLinus Torvalds	   t2 == the unshifted current source word
2451da177e4SLinus Torvalds
2461da177e4SLinus Torvalds	   We further know that t2 does not contain a null terminator.  */
2471da177e4SLinus Torvalds
2481da177e4SLinus Torvalds	.align 4
2491da177e4SLinus Torvalds$u_loop:
2501da177e4SLinus Torvalds	or	t0, t1, t0	# E : current dst word now complete
2511da177e4SLinus Torvalds	subq	a2, 1, a2	# E : decrement word count
2521da177e4SLinus Torvalds	extql	t2, a1, t1	# U : extract low bits for next time
2531da177e4SLinus Torvalds	addq	a0, 8, a0	# E :
2541da177e4SLinus Torvalds
2551da177e4SLinus Torvalds	stq_u	t0, -8(a0)	# L : save the current word
2561da177e4SLinus Torvalds	beq	a2, $u_eoc	# U :
2571da177e4SLinus Torvalds	ldq_u	t2, 8(a1)	# L : Latency=3 load high word for next time
2581da177e4SLinus Torvalds	addq	a1, 8, a1	# E :
2591da177e4SLinus Torvalds
2601da177e4SLinus Torvalds	extqh	t2, a1, t0	# U : extract low bits (2 cycle stall)
2611da177e4SLinus Torvalds	cmpbge	zero, t2, t8	# E : test new word for eos
2621da177e4SLinus Torvalds	nop
2631da177e4SLinus Torvalds	beq	t8, $u_loop	# U :
2641da177e4SLinus Torvalds
2651da177e4SLinus Torvalds	/* We've found a zero somewhere in the source word we just read.
2661da177e4SLinus Torvalds	   If it resides in the lower half, we have one (probably partial)
2671da177e4SLinus Torvalds	   word to write out, and if it resides in the upper half, we
2681da177e4SLinus Torvalds	   have one full and one partial word left to write out.
2691da177e4SLinus Torvalds
2701da177e4SLinus Torvalds	   On entry to this basic block:
2711da177e4SLinus Torvalds	   t0 == the shifted low-order bits from the current source word
2721da177e4SLinus Torvalds	   t1 == the shifted high-order bits from the previous source word
2731da177e4SLinus Torvalds	   t2 == the unshifted current source word.  */
2741da177e4SLinus Torvalds$u_eos:
2751da177e4SLinus Torvalds	or	t0, t1, t0	# E : first (partial) source word complete
2761da177e4SLinus Torvalds	nop
2771da177e4SLinus Torvalds	cmpbge	zero, t0, t8	# E : is the null in this first bit? (stall)
2781da177e4SLinus Torvalds	bne	t8, $u_final	# U : (stall)
2791da177e4SLinus Torvalds
2801da177e4SLinus Torvalds	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
2811da177e4SLinus Torvalds	addq	a0, 8, a0	# E :
2821da177e4SLinus Torvalds	subq	a2, 1, a2	# E :
2831da177e4SLinus Torvalds	nop
2841da177e4SLinus Torvalds
2851da177e4SLinus Torvalds$u_late_head_exit:
2861da177e4SLinus Torvalds	extql	t2, a1, t0	# U :
2871da177e4SLinus Torvalds	cmpbge	zero, t0, t8	# E :
2881da177e4SLinus Torvalds	or	t8, t10, t6	# E : null mask plus end-of-count bit (stall)
2891da177e4SLinus Torvalds	cmoveq	a2, t6, t8	# E : Latency=2, extra map slot (stall)
2901da177e4SLinus Torvalds
2911da177e4SLinus Torvalds	/* Take care of a final (probably partial) result word.
2921da177e4SLinus Torvalds	   On entry to this basic block:
2931da177e4SLinus Torvalds	   t0 == assembled source word
2941da177e4SLinus Torvalds	   t8 == cmpbge mask that found the null.  */
2951da177e4SLinus Torvalds$u_final:
2961da177e4SLinus Torvalds	negq	t8, t6		# E : isolate low bit set
2971da177e4SLinus Torvalds	and	t6, t8, t12	# E : (stall)
2981da177e4SLinus Torvalds	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
2991da177e4SLinus Torvalds	bne	t6, 1f		# U : (stall)
3001da177e4SLinus Torvalds
3011da177e4SLinus Torvalds	ldq_u	t1, 0(a0)	# L :
3021da177e4SLinus Torvalds	subq	t12, 1, t6	# E :
3031da177e4SLinus Torvalds	or	t6, t12, t8	# E : (stall)
3041da177e4SLinus Torvalds	zapnot	t0, t8, t0	# U : kill source bytes > null
3051da177e4SLinus Torvalds
3061da177e4SLinus Torvalds	zap	t1, t8, t1	# U : kill dest bytes <= null
3071da177e4SLinus Torvalds	or	t0, t1, t0	# E : (stall)
3081da177e4SLinus Torvalds	nop
3091da177e4SLinus Torvalds	nop
3101da177e4SLinus Torvalds
3111da177e4SLinus Torvalds1:	stq_u	t0, 0(a0)	# L :
3121da177e4SLinus Torvalds	ret	(t9)		# L0 : Latency=3
3131da177e4SLinus Torvalds
3141da177e4SLinus Torvalds	  /* Got to end-of-count before end of string.
3151da177e4SLinus Torvalds	     On entry to this basic block:
3161da177e4SLinus Torvalds	     t1 == the shifted high-order bits from the previous source word  */
3171da177e4SLinus Torvalds$u_eoc:
3181da177e4SLinus Torvalds	and	a1, 7, t6	# E : avoid final load if possible
3191da177e4SLinus Torvalds	sll	t10, t6, t6	# U : (stall)
3201da177e4SLinus Torvalds	and	t6, 0xff, t6	# E : (stall)
3211da177e4SLinus Torvalds	bne	t6, 1f		# U : (stall)
3221da177e4SLinus Torvalds
3231da177e4SLinus Torvalds	ldq_u	t2, 8(a1)	# L : load final src word
3241da177e4SLinus Torvalds	nop
3251da177e4SLinus Torvalds	extqh	t2, a1, t0	# U : extract low bits for last word (stall)
3261da177e4SLinus Torvalds	or	t1, t0, t1	# E : (stall)
3271da177e4SLinus Torvalds
3281da177e4SLinus Torvalds1:	cmpbge	zero, t1, t8	# E :
3291da177e4SLinus Torvalds	mov	t1, t0		# E :
3301da177e4SLinus Torvalds
3311da177e4SLinus Torvalds$u_eocfin:			# end-of-count, final word
3321da177e4SLinus Torvalds	or	t10, t8, t8	# E :
3331da177e4SLinus Torvalds	br	$u_final	# L0 : Latency=3
3341da177e4SLinus Torvalds
3351da177e4SLinus Torvalds	/* Unaligned copy entry point.  */
3361da177e4SLinus Torvalds	.align 4
3371da177e4SLinus Torvalds$unaligned:
3381da177e4SLinus Torvalds
3391da177e4SLinus Torvalds	ldq_u	t1, 0(a1)	# L : load first source word
3401da177e4SLinus Torvalds	and	a0, 7, t4	# E : find dest misalignment
3411da177e4SLinus Torvalds	and	a1, 7, t5	# E : find src misalignment
3421da177e4SLinus Torvalds	/* Conditionally load the first destination word and a bytemask
3431da177e4SLinus Torvalds	   with 0xff indicating that the destination byte is sacrosanct.  */
3441da177e4SLinus Torvalds	mov	zero, t0	# E :
3451da177e4SLinus Torvalds
3461da177e4SLinus Torvalds	mov	zero, t6	# E :
3471da177e4SLinus Torvalds	beq	t4, 1f		# U : dest is aligned, nothing to preserve
3481da177e4SLinus Torvalds	ldq_u	t0, 0(a0)	# L :
3491da177e4SLinus Torvalds	lda	t6, -1		# E :
3501da177e4SLinus Torvalds
3511da177e4SLinus Torvalds	mskql	t6, a0, t6	# U :
3521da177e4SLinus Torvalds	nop
3531da177e4SLinus Torvalds	nop
3541da177e4SLinus Torvalds	subq	a1, t4, a1	# E : sub dest misalignment from src addr
3551da177e4SLinus Torvalds
3561da177e4SLinus Torvalds	/* If source misalignment is larger than dest misalignment, we need
3571da177e4SLinus Torvalds	   extra startup checks to avoid SEGV.  */
3581da177e4SLinus Torvalds
3591da177e4SLinus Torvalds1:	cmplt	t4, t5, t12	# E :
3601da177e4SLinus Torvalds	extql	t1, a1, t1	# U : shift src into place
3611da177e4SLinus Torvalds	lda	t2, -1		# E : for creating masks later
3621da177e4SLinus Torvalds	beq	t12, $u_head	# U : (stall)
3631da177e4SLinus Torvalds
3641da177e4SLinus Torvalds	extql	t2, a1, t2	# U :
3651da177e4SLinus Torvalds	cmpbge	zero, t1, t8	# E : is there a zero?
366fe4304baSIvan Kokshaysky	andnot	t2, t6, t2	# E : dest mask for a single word copy
3671da177e4SLinus Torvalds	or	t8, t10, t5	# E : test for end-of-count too
3681da177e4SLinus Torvalds
369fe4304baSIvan Kokshaysky	cmpbge	zero, t2, t3	# E :
3701da177e4SLinus Torvalds	cmoveq	a2, t5, t8	# E : Latency=2, extra map slot
3711da177e4SLinus Torvalds	nop			# E : keep with cmoveq
3721da177e4SLinus Torvalds	andnot	t8, t3, t8	# E : (stall)
3731da177e4SLinus Torvalds
3741da177e4SLinus Torvalds	beq	t8, $u_head	# U :
3751da177e4SLinus Torvalds	/* At this point we've found a zero in the first partial word of
3761da177e4SLinus Torvalds	   the source.  We need to isolate the valid source data and mask
3771da177e4SLinus Torvalds	   it into the original destination data.  (Incidentally, we know
3781da177e4SLinus Torvalds	   that we'll need at least one byte of that original dest word.) */
3791da177e4SLinus Torvalds	ldq_u	t0, 0(a0)	# L :
3801da177e4SLinus Torvalds	negq	t8, t6		# E : build bitmask of bytes <= zero
3811da177e4SLinus Torvalds	mskqh	t1, t4, t1	# U :
3821da177e4SLinus Torvalds
383fe4304baSIvan Kokshaysky	and	t6, t8, t12	# E :
384fe4304baSIvan Kokshaysky	subq	t12, 1, t6	# E : (stall)
385fe4304baSIvan Kokshaysky	or	t6, t12, t8	# E : (stall)
386fe4304baSIvan Kokshaysky	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)
3871da177e4SLinus Torvalds
3881da177e4SLinus Torvalds	zapnot	t1, t8, t1	# U : to source validity mask
389fe4304baSIvan Kokshaysky	andnot	t0, t2, t0	# E : zero place for source to reside
3901da177e4SLinus Torvalds	or	t0, t1, t0	# E : and put it there (stall both t0, t1)
3911da177e4SLinus Torvalds	stq_u	t0, 0(a0)	# L : (stall)
3921da177e4SLinus Torvalds
3931da177e4SLinus Torvalds	ret	(t9)		# L0 : Latency=3
3941da177e4SLinus Torvalds	nop
3951da177e4SLinus Torvalds	nop
3961da177e4SLinus Torvalds	nop
3971da177e4SLinus Torvalds
3981da177e4SLinus Torvalds	.end __stxncpy
399