/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memcpy.S
 * (xref: /openbmc/linux/arch/alpha/lib/ev6-memcpy.S, revision f3c78e94)
 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Reasonably optimized memcpy() routine for the Alpha 21264
 *
 *	- the bulk of the data is moved as aligned quadwords (ldq/stq);
 *	  the bytes needed to reach 8-byte alignment and the final
 *	  sub-quadword tail are moved one at a time (ldbu/stb)
 *	- when source and destination differ mod 8, the source is read
 *	  with ldq_u and realigned with extql/extqh so that the stores
 *	  are still aligned quadwords
 *	- copies of 128 bytes or more with matching alignment go through
 *	  a 64-byte unrolled loop that issues wh64 write hints
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 *
 * Temp usage notes:
 *	$0		- return value (the original dest)
 *	$1,$2		- scratch
 *	$3,$4,$5,$6	- data in flight in the unrolled loop
 *	$7		- wh64 hint address in the unrolled loop
 *	$4		- dest pointer on the misaligned path, where $3 and
 *			  $16 hold the previous/next source quadwords
 */
#include <linux/export.h>
	.set noreorder
	.set noat

/*
 * void *memcpy(void *dest, const void *src, size_t count)
 *
 * In:		$16 = dest, $17 = src, $18 = count
 * Out:		$0  = the dest pointer as passed in
 * Clobbers:	$1-$7, $16-$18
 * Stack:	none (leaf routine, zero-size frame, returns through $26)
 *
 * count is tested with signed branches, so a count that is zero or
 * negative when viewed as a signed quadword copies nothing.
 *
 * Bytes are copied in ascending address order on every path.
 *
 * Paths:
 *   - src and dest equal mod 8: byte-copy up to an 8-byte boundary, then
 *     (for 128 bytes or more) quad-copy up to a 64-byte boundary and run
 *     the 64-byte unrolled loop, then whole quads, then trailing bytes.
 *   - src and dest differ mod 8 ($misaligned): byte-copy until dest is
 *     8-byte aligned, then build each destination quad from two ldq_u
 *     loads of the source, then trailing bytes.
 *
 * The instruction order below is hand-scheduled in groups of four for
 * the 21264; the nops are slotting padding.  Do not reorder.
 */
	.align	4
	.globl memcpy
	.ent memcpy
memcpy:
	.frame $30,0,$26,0
	.prologue 0

	mov	$16, $0			# E : copy dest to return
	ble	$18, $nomoredata	# U : done with the copy?
	xor	$16, $17, $1		# E : are source and dest alignments the same?
	and	$1, 7, $1		# E : are they the same mod 8?

	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
	/* source and dest are same mod 8 address */
	and	$16, 7, $1		# E : Are both 0mod8?
	beq	$1, $both_0mod8		# U : Yes
	nop				# E :

	/*
	 * source and dest are same misalignment.  move a byte at a time
	 * until a 0mod8 alignment for both is reached.
	 * At least one byte more to move
	 *
	 * The count can run out before the boundary is reached, hence the
	 * exit test ahead of the loop-back branch.
	 */

$head_align:
	ldbu	$1, 0($17)		# L : grab a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	stb	$1, 0($16)		# L :
	addq	$16, 1, $16		# E : dest++
	and	$16, 7, $1		# E : Are we at 0mod8 yet?
	ble	$18, $nomoredata	# U : done with the copy?
	bne	$1, $head_align		# U :

	/*
	 * Here src and dest are both 8-byte aligned and count > 0.
	 * Fewer than 128 bytes: skip the unrolled loop entirely.
	 */
$both_0mod8:
	cmple	$18, 127, $1		# E : Can we unroll the loop?
	bne	$1, $no_unroll		# U :
	and	$16, 63, $1		# E : get mod64 alignment
	beq	$1, $do_unroll		# U : no single quads to fiddle

	/*
	 * Copy single quads until dest is 64-byte aligned.  dest is already
	 * 8-byte aligned, so this moves at most 7 quads (56 bytes) out of a
	 * count that was at least 128 on entry.
	 */
$single_head_quad:
	ldq	$1, 0($17)		# L : get 8 bytes
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store
	addq	$16, 8, $16		# E : dest += 8
	and	$16, 63, $1		# E : get mod64 alignment
	bne	$1, $single_head_quad	# U : still not fully aligned

	/*
	 * dest is 64-byte aligned.  The head quads above may have taken the
	 * count below 128, so it is tested again before entering the loop.
	 */
$do_unroll:
	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
	bne	$1, $tail_quads		# U : Nope
	nop				# E :

	/*
	 * Unrolled loop: 64 bytes per trip, moved as two 32-byte halves.
	 *
	 * $7 is the address given to wh64.  It normally runs one 64-byte
	 * block ahead of the block being stored.  $2 = count - 192 is
	 * computed from the count at the top of the trip; when it is
	 * negative, cmovlt replaces the advanced $7 with $1 (dest + 64 as
	 * of the top of the trip), which is the block the next trip itself
	 * stores to.  Since a next trip only happens while at least 64
	 * bytes remain, every hinted block is one that is fully stored to.
	 */
$unroll_body:
	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
					# ($7) are about to be over-written
	ldq	$6, 0($17)		# L0 : bytes 0..7
	nop				# E :
	nop				# E :

	ldq	$4, 8($17)		# L : bytes 8..15
	ldq	$5, 16($17)		# L : bytes 16..23
	addq	$7, 64, $7		# E : Update next wh64 address
	nop				# E :

	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 64, $1		# E : fallback value for wh64
	nop				# E :
	nop				# E :

	addq	$17, 32, $17		# E : src += 32 bytes
	stq	$6, 0($16)		# L : bytes 0..7
	nop				# E :
	nop				# E :

	stq	$4, 8($16)		# L : bytes 8..15
	stq	$5, 16($16)		# L : bytes 16..23
	subq	$18, 192, $2		# E : At least two more trips to go?
	nop				# E :

	stq	$3, 24($16)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32 bytes
	nop				# E :
	nop				# E :

	/* Second 32-byte half; stores use negative offsets from the new dest */
	ldq	$6, 0($17)		# L : bytes 0..7
	ldq	$4, 8($17)		# L : bytes 8..15
	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
					# fallback wh64 address if < 2 more trips
	nop				# E :

	ldq	$5, 16($17)		# L : bytes 16..23
	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32
	subq	$18, 64, $18		# E : count -= 64

	addq	$17, 32, $17		# E : src += 32
	stq	$6, -32($16)		# L : bytes 0..7
	stq	$4, -24($16)		# L : bytes 8..15
	cmple	$18, 63, $1		# E : At least one more trip?

	stq	$5, -16($16)		# L : bytes 16..23
	stq	$3, -8($16)		# L : bytes 24..31
	nop				# E :
	beq	$1, $unroll_body

	/*
	 * Whole quads that remain.  The count is biased by -8 on entry so
	 * that "count >= 0" means another full quad is available; the bias
	 * is removed again at $less_than_8.
	 */
$tail_quads:
$no_unroll:
	.align 4
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $less_than_8	# U : Nope
	nop				# E :
	nop				# E :

$move_a_quad:
	ldq	$1, 0($17)		# L : fetch 8
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store 8
	addq	$16, 8, $16		# E : dest += 8
	bge	$18, $move_a_quad	# U :
	nop				# E :

$less_than_8:
	.align 4
	addq	$18, 8, $18		# E : add back for trailing bytes
	ble	$18, $nomoredata	# U : All-done
	nop				# E :
	nop				# E :

	/* Trailing bytes: 1..7 left, count > 0 on entry */
$tail_bytes:
	subq	$18, 1, $18		# E : count--
	ldbu	$1, 0($17)		# L : fetch a byte
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($16)		# L : store a byte
	addq	$16, 1, $16		# E : dest++
	bgt	$18, $tail_bytes	# U : more to be done?
	nop				# E :

	/* branching to exit takes 3 extra cycles, so replicate exit here */
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	/*
	 * src and dest differ mod 8.  $0 still holds the original dest;
	 * $4 becomes the running dest pointer on this path, which leaves
	 * $16 free to be reused as a data register in $mis_quad.
	 */
$misaligned:
	mov	$0, $4			# E : dest temp
	and	$0, 7, $1		# E : dest alignment mod8
	beq	$1, $dest_0mod8		# U : life doesnt totally suck
	nop

	/* Byte-copy until dest is 8-byte aligned or the count runs out */
$aligndest:
	ble	$18, $nomoredata	# U :
	ldbu	$1, 0($17)		# L : fetch a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++

	stb	$1, 0($4)		# L : store it
	addq	$4, 1, $4		# E : dest++
	and	$4, 7, $1		# E : dest 0mod8 yet?
	bne	$1, $aligndest		# U : go until we are aligned.

	/* Source has unknown alignment, but dest is known to be 0mod8 */
$dest_0mod8:
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $misalign_tail	# U : Nope
	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
	nop				# E :

	/*
	 * Each trip stores one aligned quad built from two source quads:
	 * $3 holds the ldq_u quad at 0($17), carried over from the seed
	 * load or the previous trip, and $16 receives the ldq_u quad at
	 * 8($17).  extql/extqh take their shift from the low bits of $17
	 * and bis merges the two pieces.  $16 is then kept in $3 for the
	 * next trip.  The count carries the same -8 bias as in the
	 * aligned quad loop.
	 */
$mis_quad:
	ldq_u	$16, 8($17)		# L : Fetch next 8
	extql	$3, $17, $3		# U : masking
	extqh	$16, $17, $1		# U : masking
	bis	$3, $1, $1		# E : merged bytes to store

	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	stq	$1, 0($4)		# L : store 8 (aligned)
	mov	$16, $3			# E : "rotate" source data

	addq	$4, 8, $4		# E : dest += 8
	bge	$18, $mis_quad		# U : More quads to move
	nop
	nop

$misalign_tail:
	addq	$18, 8, $18		# E : account for tail stuff
	ble	$18, $nomoredata	# U :
	nop
	nop

	/* Trailing bytes on the misaligned path: count > 0 on entry */
$misalign_byte:
	ldbu	$1, 0($17)		# L : fetch 1
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($4)		# L : store
	addq	$4, 1, $4		# E : dest++
	bgt	$18, $misalign_byte	# U : more to go?
	nop


	/* Common exit; $0 was set on entry and is never modified */
$nomoredata:
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	.end memcpy
	EXPORT_SYMBOL(memcpy)

/*
 * For backwards module compatibility: __memcpy is a global symbol
 * assigned the value of memcpy, so both names resolve to the routine
 * above.
 */
__memcpy = memcpy
.globl __memcpy