/* xref: /openbmc/linux/arch/alpha/lib/ev6-memset.S (revision f3c78e94) */
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memset16 to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */
#include <linux/export.h>
	/*
	 * Hand-scheduled code: forbid assembler macros from using the $at
	 * temporary and ask that instructions be left in the order written.
	 */
	.set noat
	.set noreorder
.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset

/*
 * ___memset - byte fill; also reachable as memset and __memset.
 *
 * Register usage (as established by the code below):
 *	$16	destination address; advanced to the first aligned quadword
 *	$17	fill value; only the low byte is used, and it is replicated
 *		into all eight bytes of $17 before any store
 *	$18	byte count; a signed count <= 0 returns without storing
 *	$0	return value, the original destination
 *	$26	return address
 *	$1-$7	scratch
 */
	.ent ___memset
.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?

	addq $18,$16,$6		# E : max address to write to
	bis	$1,$2,$17	# E : 000000000000chch
	insbl	$1,2,$3		# U : 0000000000ch0000
	insbl	$1,3,$4		# U : 00000000ch000000

	or	$3,$4,$3	# E : 00000000chch0000
	inswl	$17,4,$5	# U : 0000chch00000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?
	inswl	$17,6,$2	# U : chch000000000000

	or	$17,$3,$17	# E : 00000000chchchch
	or	$2,$5,$2	# E : chchchch00000000
	bic	$1,7,$1		# E : fit within a single quadword?
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : chchchchchchchch
	beq	$1,within_quad_b # U :
	nop			# E :
	beq	$3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is always negative, so the
	 * beq below is never taken; a destination that is already 0mod64
	 * makes eight trips through the alignment loop.  As a side effect $4
	 * always holds an address (not the quad count) at the first wh64.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_b	# U :

$alignmod64_b:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_b # U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq	$3, no_quad_b	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

	/*
	 * The whole write starts and ends inside a single quadword:
	 * merge the new bytes between the start and end offsets.
	 */
within_quad_b:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U : Keep new result below the end offset
	mskqh $1,$6,$2		# U : Keep old bytes from the end offset up
	bis $2,$4,$1		# E : Merge
	stq_u $1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)

/*
 * __constant_c_memset
 *
 * This is the original body of code, prior to replication and
 * rescheduling.  Leave it here, as there may be calls to this
 * entry point.
 *
 * Register usage (as established by the code below):
 *	$16	destination address; advanced to the first aligned quadword
 *	$17	64-bit fill pattern, stored as-is (no replication is done here)
 *	$18	byte count; a signed count <= 0 returns without storing
 *	$0	return value, the original destination
 *	$26	return address
 *	$1-$7	scratch
 */
.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : max address to write to
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero length requested?

	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U :
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is always negative, so the
	 * beq below is never taken; a destination that is already 0mod64
	 * makes eight trips through the alignment loop.  As a side effect $4
	 * always holds an address (not the quad count) at the first wh64.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign	# U :

$alignmod64:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64	# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64	# U :

	nop
	nop
	nop
	beq	$3, no_quad	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

	/*
	 * The whole write starts and ends inside a single quadword:
	 * merge the new bytes between the start and end offsets.
	 */
within_one_quad:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U : Keep new result below the end offset
	mskqh $1,$6,$2		# U : Keep old bytes from the end offset up
	bis $2,$4,$1		# E : Merge
	stq_u $1,0($16)		# L :

end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)

/*
 * __memset16
 *
 * This is a replicant of the __constant_c_memset code, rescheduled
 * to mask stalls.  Note that entry point names also had to change
 *
 * Register usage (as established by the code below):
 *	$16	destination address; advanced to the first aligned quadword
 *	$17	fill value; the low 16 bits are replicated into all four
 *		words of $17 before any store
 *	$18	length in bytes; a signed length <= 0 returns without storing
 *	$0	return value, the original destination
 *	$26	return address
 *	$1-$7	scratch
 *
 * NOTE(review): presumably the destination is 2-byte aligned and the
 * length even, so that the pattern stays in phase -- confirm against
 * callers.
 */
	.align 5
	.ent __memset16

__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq	$18,$16,$6	# E : max address to write to

	ble $18, end_w		# U : zero length requested?
	inswl	$17,4,$3	# U : 0000c1c200000000
	inswl	$17,6,$4	# U : c1c2000000000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?

	or	$2,$5,$2	# E : 00000000c1c2c1c2
	or	$3,$4,$17	# E : c1c2c1c200000000
	bic	$1,7,$1		# E : fit within a single quadword
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U :
	nop
	beq $3,aligned_w	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_w	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is always negative, so the
	 * beq below is never taken; a destination that is already 0mod64
	 * makes eight trips through the alignment loop.  As a side effect $4
	 * always holds an address (not the quad count) at the first wh64.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_w	# U :

$alignmod64_w:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_w	# U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_w:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_w	# U :

	nop
	nop
	nop
	beq	$3, no_quad_w	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_w		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

	/*
	 * The whole write starts and ends inside a single quadword:
	 * merge the new bytes between the start and end offsets.
	 */
within_quad_w:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U : Keep new result below the end offset
	mskqh $1,$6,$2		# U : Keep old bytes from the end offset up
	bis $2,$4,$1		# E : Merge
	stq_u $1,0($16)		# L :

end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	.end __memset16
	EXPORT_SYMBOL(__memset16)

/* memset and __memset are aliases for the ___memset entry point. */
memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)
