xref: /openbmc/linux/arch/alpha/lib/ev6-stxcpy.S (revision 498495dba268b20e8eadd7fe93c140c68b6cc9d2)
1*b2441318SGreg Kroah-Hartman/* SPDX-License-Identifier: GPL-2.0 */
21da177e4SLinus Torvalds/*
31da177e4SLinus Torvalds * arch/alpha/lib/ev6-stxcpy.S
41da177e4SLinus Torvalds * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
51da177e4SLinus Torvalds *
61da177e4SLinus Torvalds * Copy a null-terminated string from SRC to DST.
71da177e4SLinus Torvalds *
81da177e4SLinus Torvalds * This is an internal routine used by strcpy, stpcpy, and strcat.
91da177e4SLinus Torvalds * As such, it uses special linkage conventions to make implementation
101da177e4SLinus Torvalds * of these public functions more efficient.
111da177e4SLinus Torvalds *
121da177e4SLinus Torvalds * On input:
131da177e4SLinus Torvalds *	t9 = return address
141da177e4SLinus Torvalds *	a0 = DST
151da177e4SLinus Torvalds *	a1 = SRC
161da177e4SLinus Torvalds *
171da177e4SLinus Torvalds * On output:
181da177e4SLinus Torvalds *	t12 = bitmask (with one bit set) indicating the last byte written
191da177e4SLinus Torvalds *	a0  = unaligned address of the last *word* written
201da177e4SLinus Torvalds *
211da177e4SLinus Torvalds * Furthermore, v0, a3-a5, and t11 are untouched.
221da177e4SLinus Torvalds *
231da177e4SLinus Torvalds * Much of the information about 21264 scheduling/coding comes from:
241da177e4SLinus Torvalds *	Compiler Writer's Guide for the Alpha 21264
251da177e4SLinus Torvalds *	abbreviated as 'CWG' in other comments here
261da177e4SLinus Torvalds *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
271da177e4SLinus Torvalds * Scheduling notation:
281da177e4SLinus Torvalds *	E	- either cluster
291da177e4SLinus Torvalds *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
301da177e4SLinus Torvalds *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
311da177e4SLinus Torvalds * Try not to change the actual algorithm if possible for consistency.
321da177e4SLinus Torvalds */
331da177e4SLinus Torvalds
341da177e4SLinus Torvalds#include <asm/regdef.h>
351da177e4SLinus Torvalds
361da177e4SLinus Torvalds	.set noat
371da177e4SLinus Torvalds	.set noreorder
381da177e4SLinus Torvalds
391da177e4SLinus Torvalds	.text
401da177e4SLinus Torvalds
411da177e4SLinus Torvalds/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
421da177e4SLinus Torvalds   doesn't like putting the entry point for a procedure somewhere in the
431da177e4SLinus Torvalds   middle of the procedure descriptor.  Work around this by putting the
441da177e4SLinus Torvalds   aligned copy in its own procedure descriptor */
451da177e4SLinus Torvalds
461da177e4SLinus Torvalds
/* stxcpy_aligned -- copy path used when SRC and DST have the same offset
   within a quadword.  Reached only by branch from __stxcpy, which has
   already loaded the first source quadword into t1 and advanced a1 by 8.
   Returns through t9 with:
     t12 = one-bit cmpbge mask marking the byte that holds the null
     a0  = address used for the last quadword store
   Registers written: t0-t3, t6, t8, t12, a0, a1.  */
471da177e4SLinus Torvalds	.ent stxcpy_aligned
481da177e4SLinus Torvalds	.align 4
491da177e4SLinus Torvaldsstxcpy_aligned:
501da177e4SLinus Torvalds	.frame sp, 0, t9
511da177e4SLinus Torvalds	.prologue 0
521da177e4SLinus Torvalds
531da177e4SLinus Torvalds	/* On entry to this basic block:
541da177e4SLinus Torvalds	   t0 == the first destination word for masking back in
551da177e4SLinus Torvalds	   t1 == the first source word.  */
561da177e4SLinus Torvalds
571da177e4SLinus Torvalds	/* Create the 1st output word and detect 0's in the 1st input word.  */
581da177e4SLinus Torvalds	lda	t2, -1		# E : build a mask against false zero
591da177e4SLinus Torvalds	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
601da177e4SLinus Torvalds	mskqh	t1, a1, t3	# U : t3 = src bytes at/after the start offset
611da177e4SLinus Torvalds	ornot	t1, t2, t2	# E : t2 = src with bytes before the start forced to 0xff (stall)
621da177e4SLinus Torvalds
631da177e4SLinus Torvalds	mskql	t0, a1, t0	# U : assemble the first output word
641da177e4SLinus Torvalds	cmpbge	zero, t2, t8	# E : bits set iff null found
651da177e4SLinus Torvalds	or	t0, t3, t1	# E : t1 = kept dst bytes | src bytes (stall)
661da177e4SLinus Torvalds	bne	t8, $a_eos	# U : (stall)
671da177e4SLinus Torvalds
681da177e4SLinus Torvalds	/* On entry to this basic block:
691da177e4SLinus Torvalds	   t0 == the first destination word for masking back in
701da177e4SLinus Torvalds	   t1 == a source word not containing a null.  */
711da177e4SLinus Torvalds	/* Nops here to separate store quads from load quads */
721da177e4SLinus Torvalds
731da177e4SLinus Torvalds$a_loop:
741da177e4SLinus Torvalds	stq_u	t1, 0(a0)	# L : store the null-free word
751da177e4SLinus Torvalds	addq	a0, 8, a0	# E : advance dst
761da177e4SLinus Torvalds	nop
771da177e4SLinus Torvalds	nop
781da177e4SLinus Torvalds
791da177e4SLinus Torvalds	ldq_u	t1, 0(a1)	# L : Latency=3
801da177e4SLinus Torvalds	addq	a1, 8, a1	# E : advance src
811da177e4SLinus Torvalds	cmpbge	zero, t1, t8	# E : (3 cycle stall)
821da177e4SLinus Torvalds	beq	t8, $a_loop	# U : (stall for t8)
831da177e4SLinus Torvalds
841da177e4SLinus Torvalds	/* Take care of the final (partial) word store.
851da177e4SLinus Torvalds	   On entry to this basic block we have:
861da177e4SLinus Torvalds	   t1 == the source word containing the null
871da177e4SLinus Torvalds	   t8 == the cmpbge mask that found it.  */
881da177e4SLinus Torvalds$a_eos:
891da177e4SLinus Torvalds	negq	t8, t6		# E : find low bit set
901da177e4SLinus Torvalds	and	t8, t6, t12	# E : t12 = one-bit mask of the first null (stall)
911da177e4SLinus Torvalds	/* For the sake of the cache, don't read a destination word
921da177e4SLinus Torvalds	   if we're not going to need it.  */
931da177e4SLinus Torvalds	and	t12, 0x80, t6	# E : null in byte 7 => all 8 bytes are written (stall)
941da177e4SLinus Torvalds	bne	t6, 1f		# U : (stall)
951da177e4SLinus Torvalds
961da177e4SLinus Torvalds	/* We're doing a partial word store and so need to combine
971da177e4SLinus Torvalds	   our source and original destination words.  */
981da177e4SLinus Torvalds	ldq_u	t0, 0(a0)	# L : Latency=3
991da177e4SLinus Torvalds	subq	t12, 1, t6	# E : t6 = byte mask of bytes below the null
1001da177e4SLinus Torvalds	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
1011da177e4SLinus Torvalds	or	t12, t6, t8	# E : t8 = byte mask of bytes up to and including the null (stall)
1021da177e4SLinus Torvalds
1031da177e4SLinus Torvalds	zap	t0, t8, t0	# E : clear dst bytes <= null
1041da177e4SLinus Torvalds	or	t0, t1, t1	# E : merge src bytes into the preserved dst bytes (stall)
1051da177e4SLinus Torvalds	nop
1061da177e4SLinus Torvalds	nop
1071da177e4SLinus Torvalds
1081da177e4SLinus Torvalds1:	stq_u	t1, 0(a0)	# L : final store; a0 is left pointing at this word
1091da177e4SLinus Torvalds	ret	(t9)		# L0 : Latency=3
1101da177e4SLinus Torvalds	nop
1111da177e4SLinus Torvalds	nop
1121da177e4SLinus Torvalds
1131da177e4SLinus Torvalds	.end stxcpy_aligned
1141da177e4SLinus Torvalds
/* __stxcpy -- entry point.  Copies the null-terminated string at a1 (SRC)
   to a0 (DST) a quadword at a time and returns through t9.
   On return:
     t12 = one-bit mask marking the byte that holds the null in the last
           quadword stored
     a0  = address used for the last quadword store
   If SRC and DST share the same offset within a quadword, control moves
   to stxcpy_aligned; otherwise the $unaligned path below is taken.
   Registers written in this procedure: t0-t6, t8, t12, a0, a1.  */
1151da177e4SLinus Torvalds	.align 4
1161da177e4SLinus Torvalds	.ent __stxcpy
1171da177e4SLinus Torvalds	.globl __stxcpy
1181da177e4SLinus Torvalds__stxcpy:
1191da177e4SLinus Torvalds	.frame sp, 0, t9
1201da177e4SLinus Torvalds	.prologue 0
1211da177e4SLinus Torvalds
1221da177e4SLinus Torvalds	/* Are source and destination co-aligned?  */
1231da177e4SLinus Torvalds	xor	a0, a1, t0	# E :
1241da177e4SLinus Torvalds	unop			# E :
1251da177e4SLinus Torvalds	and	t0, 7, t0	# E : low three address bits differ? (stall)
1261da177e4SLinus Torvalds	bne	t0, $unaligned	# U : (stall)
1271da177e4SLinus Torvalds
1281da177e4SLinus Torvalds	/* We are co-aligned; take care of a partial first word.  */
1291da177e4SLinus Torvalds	ldq_u	t1, 0(a1)		# L : load first src word
1301da177e4SLinus Torvalds	and	a0, 7, t0		# E : take care not to load a word ...
1311da177e4SLinus Torvalds	addq	a1, 8, a1		# E :
1321da177e4SLinus Torvalds	beq	t0, stxcpy_aligned	# U : ... if we won't need it (stall)
1331da177e4SLinus Torvalds
1341da177e4SLinus Torvalds	ldq_u	t0, 0(a0)	# L : first dst word, for masking back in
1351da177e4SLinus Torvalds	br	stxcpy_aligned	# L0 : Latency=3
1361da177e4SLinus Torvalds	nop
1371da177e4SLinus Torvalds	nop
1381da177e4SLinus Torvalds
1391da177e4SLinus Torvalds
1401da177e4SLinus Torvalds/* The source and destination are not co-aligned.  Align the destination
1411da177e4SLinus Torvalds   and cope.  We have to be very careful about not reading too much and
1421da177e4SLinus Torvalds   causing a SEGV.  */
1431da177e4SLinus Torvalds
1441da177e4SLinus Torvalds	.align 4
1451da177e4SLinus Torvalds$u_head:
1461da177e4SLinus Torvalds	/* We know just enough now to be able to assemble the first
1471da177e4SLinus Torvalds	   full source word.  We can still find a zero at the end of it
1481da177e4SLinus Torvalds	   that prevents us from outputting the whole thing.
1491da177e4SLinus Torvalds
1501da177e4SLinus Torvalds	   On entry to this basic block:
1511da177e4SLinus Torvalds	   t0 == the first dest word, for masking back in, if needed else 0
1521da177e4SLinus Torvalds	   t1 == the low bits of the first source word
1531da177e4SLinus Torvalds	   t6 == bytemask that is -1 in dest word bytes */
1541da177e4SLinus Torvalds
1551da177e4SLinus Torvalds	ldq_u	t2, 8(a1)	# L : load the second source word
1561da177e4SLinus Torvalds	addq	a1, 8, a1	# E :
1571da177e4SLinus Torvalds	extql	t1, a1, t1	# U : (stall on a1)
1581da177e4SLinus Torvalds	extqh	t2, a1, t4	# U : (stall on a1)
1591da177e4SLinus Torvalds
1601da177e4SLinus Torvalds	mskql	t0, a0, t0	# U : keep dst bytes below the dst offset
1611da177e4SLinus Torvalds	or	t1, t4, t1	# E : t1 = first full source word
1621da177e4SLinus Torvalds	mskqh	t1, a0, t1	# U : drop src bytes below the dst offset (stall on t1)
1631da177e4SLinus Torvalds	or	t0, t1, t1	# E : t1 = first output word (stall on t1)
1641da177e4SLinus Torvalds
1651da177e4SLinus Torvalds	or	t1, t6, t6	# E : force the preserved dst bytes non-zero
1661da177e4SLinus Torvalds	cmpbge	zero, t6, t8	# E : (stall)
1671da177e4SLinus Torvalds	lda	t6, -1		# E : for masking just below
1681da177e4SLinus Torvalds	bne	t8, $u_final	# U : (stall)
1691da177e4SLinus Torvalds
1701da177e4SLinus Torvalds	mskql	t6, a1, t6		# U : mask out the bits we have
1711da177e4SLinus Torvalds	or	t6, t2, t2		# E :   already extracted before (stall)
1721da177e4SLinus Torvalds	cmpbge	zero, t2, t8		# E :   testing eos (stall)
1731da177e4SLinus Torvalds	bne	t8, $u_late_head_exit	# U : (stall)
1741da177e4SLinus Torvalds
1751da177e4SLinus Torvalds	/* Finally, we've got all the stupid leading edge cases taken care
1761da177e4SLinus Torvalds	   of and we can set up to enter the main loop.  */
1771da177e4SLinus Torvalds
1781da177e4SLinus Torvalds	stq_u	t1, 0(a0)	# L : store first output word
1791da177e4SLinus Torvalds	addq	a0, 8, a0	# E :
1801da177e4SLinus Torvalds	extql	t2, a1, t0	# U : position ho-bits of lo word
1811da177e4SLinus Torvalds	ldq_u	t2, 8(a1)	# L : read next high-order source word
1821da177e4SLinus Torvalds
1831da177e4SLinus Torvalds	addq	a1, 8, a1	# E :
1841da177e4SLinus Torvalds	cmpbge	zero, t2, t8	# E : (stall for t2)
1851da177e4SLinus Torvalds	nop			# E :
1861da177e4SLinus Torvalds	bne	t8, $u_eos	# U : (stall)
1871da177e4SLinus Torvalds
1881da177e4SLinus Torvalds	/* Unaligned copy main loop.  In order to avoid reading too much,
1891da177e4SLinus Torvalds	   the loop is structured to detect zeros in aligned source words.
1901da177e4SLinus Torvalds	   This has, unfortunately, effectively pulled half of a loop
1911da177e4SLinus Torvalds	   iteration out into the head and half into the tail, but it does
1921da177e4SLinus Torvalds	   prevent nastiness from accumulating in the very thing we want
1931da177e4SLinus Torvalds	   to run as fast as possible.
1941da177e4SLinus Torvalds
1951da177e4SLinus Torvalds	   On entry to this basic block:
1961da177e4SLinus Torvalds	   t0 == the shifted high-order bits from the previous source word
1971da177e4SLinus Torvalds	   t2 == the unshifted current source word
1981da177e4SLinus Torvalds
1991da177e4SLinus Torvalds	   We further know that t2 does not contain a null terminator.  */
2001da177e4SLinus Torvalds
2011da177e4SLinus Torvalds	.align 3
2021da177e4SLinus Torvalds$u_loop:
2031da177e4SLinus Torvalds	extqh	t2, a1, t1	# U : extract high bits for current word
2041da177e4SLinus Torvalds	addq	a1, 8, a1	# E : (stall)
2051da177e4SLinus Torvalds	extql	t2, a1, t3	# U : extract low bits for next time (stall)
2061da177e4SLinus Torvalds	addq	a0, 8, a0	# E :
2071da177e4SLinus Torvalds
2081da177e4SLinus Torvalds	or	t0, t1, t1	# E : current dst word now complete
2091da177e4SLinus Torvalds	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
2101da177e4SLinus Torvalds	stq_u	t1, -8(a0)	# L : save the current word (stall)
2111da177e4SLinus Torvalds	mov	t3, t0		# E : carry the low bits into the next iteration
2121da177e4SLinus Torvalds
2131da177e4SLinus Torvalds	cmpbge	zero, t2, t8	# E : test new word for eos
2141da177e4SLinus Torvalds	beq	t8, $u_loop	# U : (stall)
2151da177e4SLinus Torvalds	nop
2161da177e4SLinus Torvalds	nop
2171da177e4SLinus Torvalds
2181da177e4SLinus Torvalds	/* We've found a zero somewhere in the source word we just read.
2191da177e4SLinus Torvalds	   If it resides in the lower half, we have one (probably partial)
2201da177e4SLinus Torvalds	   word to write out, and if it resides in the upper half, we
2211da177e4SLinus Torvalds	   have one full and one partial word left to write out.
2221da177e4SLinus Torvalds
2231da177e4SLinus Torvalds	   On entry to this basic block:
2241da177e4SLinus Torvalds	   t0 == the shifted high-order bits from the previous source word
2251da177e4SLinus Torvalds	   t2 == the unshifted current source word.  */
2261da177e4SLinus Torvalds$u_eos:
2271da177e4SLinus Torvalds	extqh	t2, a1, t1	# U :
2281da177e4SLinus Torvalds	or	t0, t1, t1	# E : first (partial) source word complete (stall)
2291da177e4SLinus Torvalds	cmpbge	zero, t1, t8	# E : is the null in this first bit? (stall)
2301da177e4SLinus Torvalds	bne	t8, $u_final	# U : (stall)
2311da177e4SLinus Torvalds
2321da177e4SLinus Torvalds$u_late_head_exit:
2331da177e4SLinus Torvalds	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
2341da177e4SLinus Torvalds	addq	a0, 8, a0	# E :
2351da177e4SLinus Torvalds	extql	t2, a1, t1	# U : t1 = remaining bytes of the current source word
2361da177e4SLinus Torvalds	cmpbge	zero, t1, t8	# E : (stall)
2371da177e4SLinus Torvalds
2381da177e4SLinus Torvalds	/* Take care of a final (probably partial) result word.
2391da177e4SLinus Torvalds	   On entry to this basic block:
2401da177e4SLinus Torvalds	   t1 == assembled source word
2411da177e4SLinus Torvalds	   t8 == cmpbge mask that found the null.  */
2421da177e4SLinus Torvalds$u_final:
2431da177e4SLinus Torvalds	negq	t8, t6		# E : isolate low bit set
2441da177e4SLinus Torvalds	and	t6, t8, t12	# E : t12 = one-bit mask of the first null (stall)
2451da177e4SLinus Torvalds	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
2461da177e4SLinus Torvalds	bne	t6, 1f		# U : (stall)
2471da177e4SLinus Torvalds
2481da177e4SLinus Torvalds	ldq_u	t0, 0(a0)	# L :
2491da177e4SLinus Torvalds	subq	t12, 1, t6	# E : t6 = byte mask of bytes below the null
2501da177e4SLinus Torvalds	or	t6, t12, t8	# E : t8 = byte mask of bytes up to and including the null (stall)
2511da177e4SLinus Torvalds	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)
2521da177e4SLinus Torvalds
2531da177e4SLinus Torvalds	zap	t0, t8, t0	# U : kill dest bytes <= null (2 cycle data stall)
2541da177e4SLinus Torvalds	or	t0, t1, t1	# E : merge src bytes into the preserved dst bytes (stall)
2551da177e4SLinus Torvalds	nop
2561da177e4SLinus Torvalds	nop
2571da177e4SLinus Torvalds
2581da177e4SLinus Torvalds1:	stq_u	t1, 0(a0)	# L : final store; a0 is left pointing at this word
2591da177e4SLinus Torvalds	ret	(t9)		# L0 : Latency=3
2601da177e4SLinus Torvalds	nop
2611da177e4SLinus Torvalds	nop
2621da177e4SLinus Torvalds
2631da177e4SLinus Torvalds	/* Unaligned copy entry point.  */
2641da177e4SLinus Torvalds	.align 4
2651da177e4SLinus Torvalds$unaligned:
2661da177e4SLinus Torvalds
2671da177e4SLinus Torvalds	ldq_u	t1, 0(a1)	# L : load first source word
2681da177e4SLinus Torvalds	and	a0, 7, t4	# E : find dest misalignment
2691da177e4SLinus Torvalds	and	a1, 7, t5	# E : find src misalignment
2701da177e4SLinus Torvalds	/* Conditionally load the first destination word and a bytemask
2711da177e4SLinus Torvalds	   with 0xff indicating that the destination byte is sacrosanct.  */
2721da177e4SLinus Torvalds	mov	zero, t0	# E : default: no dest word to merge
2731da177e4SLinus Torvalds
2741da177e4SLinus Torvalds	mov	zero, t6	# E : default: no dest bytes to protect
2751da177e4SLinus Torvalds	beq	t4, 1f		# U : dest is quadword aligned, keep the defaults
2761da177e4SLinus Torvalds	ldq_u	t0, 0(a0)	# L :
2771da177e4SLinus Torvalds	lda	t6, -1		# E :
2781da177e4SLinus Torvalds
2791da177e4SLinus Torvalds	mskql	t6, a0, t6	# U : t6 = 0xff in the bytes below the dest offset
2801da177e4SLinus Torvalds	nop
2811da177e4SLinus Torvalds	nop
2821da177e4SLinus Torvalds	nop
2831da177e4SLinus Torvalds1:
2841da177e4SLinus Torvalds	subq	a1, t4, a1	# E : sub dest misalignment from src addr
2851da177e4SLinus Torvalds	/* If source misalignment is larger than dest misalignment, we need
2861da177e4SLinus Torvalds	   extra startup checks to avoid SEGV.  */
2871da177e4SLinus Torvalds	cmplt	t4, t5, t12	# E : t12 = (dest misalignment < src misalignment)
2881da177e4SLinus Torvalds	beq	t12, $u_head	# U :
2891da177e4SLinus Torvalds	lda	t2, -1		# E : mask out leading garbage in source
2901da177e4SLinus Torvalds
2911da177e4SLinus Torvalds	mskqh	t2, t5, t2	# U : t2 = 0xff in the bytes at/after the src offset
2921da177e4SLinus Torvalds	ornot	t1, t2, t3	# E : force the leading garbage bytes non-zero (stall)
2931da177e4SLinus Torvalds	cmpbge	zero, t3, t8	# E : is there a zero? (stall)
2941da177e4SLinus Torvalds	beq	t8, $u_head	# U : (stall)
2951da177e4SLinus Torvalds
2961da177e4SLinus Torvalds	/* At this point we've found a zero in the first partial word of
2971da177e4SLinus Torvalds	   the source.  We need to isolate the valid source data and mask
2981da177e4SLinus Torvalds	   it into the original destination data.  (Incidentally, we know
2991da177e4SLinus Torvalds	   that we'll need at least one byte of that original dest word.) */
3001da177e4SLinus Torvalds
3011da177e4SLinus Torvalds	ldq_u	t0, 0(a0)	# L :
3021da177e4SLinus Torvalds	negq	t8, t6		# E : build bitmask of bytes <= zero
3031da177e4SLinus Torvalds	and	t6, t8, t12	# E : t12 = one-bit mask of the first null (stall)
3041da177e4SLinus Torvalds	and	a1, 7, t5	# E : t5 = src misalignment minus dest misalignment
3051da177e4SLinus Torvalds
3061da177e4SLinus Torvalds	subq	t12, 1, t6	# E :
3071da177e4SLinus Torvalds	or	t6, t12, t8	# E : (stall)
3081da177e4SLinus Torvalds	srl	t12, t5, t12	# U : adjust final null return value
3091da177e4SLinus Torvalds	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)
3101da177e4SLinus Torvalds
3111da177e4SLinus Torvalds	and	t1, t2, t1	# E : to source validity mask
3121da177e4SLinus Torvalds	extql	t2, a1, t2	# U : shift the validity mask down to dest position
3131da177e4SLinus Torvalds	extql	t1, a1, t1	# U : shift the source bytes down likewise (stall)
3141da177e4SLinus Torvalds	andnot	t0, t2, t0	# E : zero place for source to reside (stall)
3151da177e4SLinus Torvalds
3161da177e4SLinus Torvalds	or	t0, t1, t1	# E : and put it there
3171da177e4SLinus Torvalds	stq_u	t1, 0(a0)	# L : (stall)
3181da177e4SLinus Torvalds	ret	(t9)		# L0 :
3191da177e4SLinus Torvalds	nop
3201da177e4SLinus Torvalds
3211da177e4SLinus Torvalds	.end __stxcpy
3221da177e4SLinus Torvalds
323