xref: /openbmc/linux/arch/ia64/lib/memcpy.S (revision c900529f3d9161bfde5cca0754f83b4d3c3e0220)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Optimized version of the standard memcpy() function
 *
 * Inputs:
 * 	in0:	destination address
 *	in1:	source address
 *	in2:	number of bytes to copy
 * Output:
 *	r8:	destination address (copy of in0)
 *
 * Copyright (C) 2000-2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *	David Mosberger-Tang <davidm@hpl.hp.com>
 */
17*ab03e604SMasahiro Yamada#include <linux/export.h>
181da177e4SLinus Torvalds#include <asm/asmmacro.h>
191da177e4SLinus Torvalds
/*
 * memcpy: copy in2 bytes from the buffer at in1 to the buffer at in0.
 *
 * Register contract (as used by the code below):
 *	in0	destination address
 *	in1	source address
 *	in2	number of bytes to copy
 *	r8	(retval) set to in0 before any copying starts
 *
 * ar.pfs, ar.lc and pr are saved on entry and restored on every exit
 * path that modifies them.
 *
 * Dispatch:
 *	in2 == 0			return immediately
 *	in2 < 16			.memcpy_short (byte-at-a-time loop)
 *	(in0 | in1 | in2) & 7 != 0	.memcpy_long  (align dst, then shrp loop)
 *	otherwise			fall through to the 8-byte ld8/st8 loop
 */
GLOBAL_ENTRY(memcpy)

#	define MEM_LAT	21		/* latency to memory */

#	define dst	r2
#	define src	r3
#	define retval	r8
#	define saved_pfs r9
#	define saved_lc	r10
#	define saved_pr	r11
#	define cnt	r16
#	define src2	r17
#	define t0	r18
#	define t1	r19
#	define t2	r20
#	define t3	r21
#	define t4	r22
#	define src_end	r23

#	define N	(MEM_LAT + 4)
#	define Nrot	((N + 7) & ~7)

	/*
	 * First, check if everything (src, dst, len) is a multiple of eight.  If
	 * so, we handle everything with no taken branches (other than the loop
	 * itself) and a small icache footprint.  Otherwise, we jump off to
	 * the more general copy routine handling arbitrary
	 * sizes/alignment etc.
	 */
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc
	or t0=in0,in1
	;;

	or t0=t0,in2		// t0 = dst | src | len; low 3 bits tested below
	.save pr, saved_pr
	mov saved_pr=pr

	.body

	cmp.eq p6,p0=in2,r0	// zero length?
	mov retval=in0		// return dst
(p6)	br.ret.spnt.many rp	// zero length, return immediately
	;;

	mov dst=in0		// copy because of rotation
	shr.u cnt=in2,3		// number of 8-byte words to copy
	mov pr.rot=1<<16	// only p16 (= p[0]) set: start with just stage 0 enabled
	;;

	adds cnt=-1,cnt		// br.ctop is repeat/until
	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
	mov ar.ec=N
	;;

	and t0=0x7,t0
	mov ar.lc=cnt
	;;
	cmp.ne p6,p0=t0,r0	// p6 = some of dst/src/len is not a multiple of 8

	mov src=in1		// copy because of rotation
(p7)	br.cond.spnt.few .memcpy_short	// tested first: any copy < 16 bytes goes here
(p6)	br.cond.spnt.few .memcpy_long
	;;
	// NOTE(review): the purpose of the nop groups below is not evident from
	// this file; keep them as-is unless their reason is confirmed.
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	.rotr val[N]
	.rotp p[N]
	.align 32
	// Aligned loop: stage 0 loads a word, stage N-1 stores it.
1: { .mib
(p[0])	ld8 val[0]=[src],8
	nop.i 0
	brp.loop.imp 1b, 2f
}
2: { .mfb
(p[N-1])st8 [dst]=val[N-1],8
	nop.f 0
	br.ctop.dptk.few 1b
}
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	/*
	 * Small (<16 bytes) copying, whatever the alignment, is done via a simple
	 * byte-at-a-time copy loop.  This performs relatively poorly on Itanium,
	 * but it doesn't get used very often (gcc inlines small copies) and due to
	 * atomicity issues, we want to avoid read-modify-write of entire words.
	 */
	.align 32
.memcpy_short:
	adds cnt=-1,in2		// br.ctop is repeat/until
	mov ar.ec=MEM_LAT
	brp.loop.imp 1f, 2f
	;;
	mov ar.lc=cnt		// overrides the word count written to ar.lc on entry
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	nop.m	0
	;;
	/*
	 * It is faster to put a stop bit in the loop here because it makes
	 * the pipeline shorter (and latency is what matters on short copies).
	 */
	.align 32
1: { .mib
(p[0])	ld1 val[0]=[src],1
	nop.i 0
	brp.loop.imp 1b, 2f
} ;;
2: { .mfb
(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
	nop.f 0
	br.ctop.dptk.few 1b
} ;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	/*
	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
	 * an overriding concern here, but throughput is.  We first do
	 * sub-word copying until the destination is aligned, then we check
	 * if the source is also aligned.  If so, we do a simple load/store-loop
	 * until there are less than 8 bytes left over and then we do the tail,
	 * by storing the last few bytes using sub-word copying.  If the source
	 * is not aligned, we branch off to the non-congruent loop.
	 *
	 *   stage:   op:
	 *         0  ld
	 *	   :
	 * MEM_LAT+3  shrp
	 * MEM_LAT+4  st
	 *
	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
	 * seems to introduce an unavoidable bubble in the pipeline so the overall
	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
	 * of 4 byte/cycle.  Still not bad.
	 */
#	undef N
#	undef Nrot
#	define N	(MEM_LAT + 5)		/* number of stages */
#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */

/*
 * The dispatch below jumps to .memcpy_loops + ((src & 7) << LOG_LOOP_SIZE),
 * so it relies on every COPY() expansion occupying exactly
 * (1 << LOG_LOOP_SIZE) bytes.
 */
#define LOG_LOOP_SIZE	6

.memcpy_long:
	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
	and t0=-8,src		// t0 = src & ~7
	and t2=7,src		// t2 = src & 7
	;;
	ld8 t0=[t0]		// t0 = 1st source word
	adds src2=7,src		// src2 = (src + 7)
	sub t4=r0,dst		// t4 = -dst
	;;
	and src2=-8,src2	// src2 = (src + 7) & ~7
	shl t2=t2,3		// t2 = 8*(src & 7)
	shl t4=t4,3		// t4 = 8*(-dst): bits 3..5 now hold (-dst) & 7
	;;
	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
	sub t3=64,t2		// t3 = 64-8*(src & 7)
	shr.u t0=t0,t2
	;;
	add src_end=src,in2
	shl t1=t1,t3
	mov pr=t4,0x38		// (p5,p4,p3)=(-dst & 7) = # of bytes needed to 8-byte align dst
	;;
	or t0=t0,t1		// t0 = leading source bytes, starting at src
	mov cnt=r0		// cnt counts the bytes stored while aligning dst
	adds src_end=-1,src_end
	;;
(p3)	st1 [dst]=t0,1
(p3)	shr.u t0=t0,8
(p3)	adds cnt=1,cnt
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
(p4)	adds cnt=2,cnt
	;;
(p5)	st4 [dst]=t0,4
(p5)	adds cnt=4,cnt
	and src_end=-8,src_end	// src_end = last word of source buffer
	;;

	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:

1:{	add src=cnt,src			// make src point to remainder of source buffer
	sub cnt=in2,cnt			// cnt = number of bytes left to copy
	mov t4=ip			// t4 = address of this bundle (label 1)
  }	;;
	and src2=-8,src			// align source pointer
	adds t4=.memcpy_loops-1b,t4	// t4 = run-time address of .memcpy_loops
	mov ar.ec=N

	and t0=7,src			// t0 = src & 7
	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
	shl cnt=cnt,3			// move bits 0-2 to 3-5
	;;

	.rotr val[N+1], w[2]
	.rotp p[N]

	cmp.ne p6,p0=t0,r0		// is src aligned, too?
	shl t0=t0,LOG_LOOP_SIZE		// t0 = (src & 7) << LOG_LOOP_SIZE = offset of the matching COPY() instance
	adds t2=-1,t2			// br.ctop is repeat/until
	;;
	add t4=t0,t4			// t4 = address of the COPY() instance to run
	mov pr=cnt,0x38			// set (p5,p4,p3) to # of last-word bytes to copy
	mov ar.lc=t2
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
(p6)	ld8 val[1]=[src2],8		// prime the pump...
	mov b6=t4
	br.sptk.few b6
	;;

.memcpy_tail:
	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
	// less than 8) and t0 contains the last few bytes of the src buffer:
(p5)	st4 [dst]=t0,4
(p5)	shr.u t0=t0,32
	mov ar.lc=saved_lc
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
	mov ar.pfs=saved_pfs
	;;
(p3)	st1 [dst]=t0
	mov pr=saved_pr,-1
	br.ret.sptk.many rp

///////////////////////////////////////////////////////
	.align 64

/*
 * COPY(shift,index): one software-pipelined copy loop plus its epilogue.
 *
 *   shift	bit count handed to shrp; the instances below use
 *		0, 8, ..., 56, one per possible value of (src & 7)
 *   index	selects the second shrp operand: val[MEM_LAT+4-index] in the
 *		loop, val[N-index] in the epilogue.  With index 1 both shrp
 *		operands in the loop are val[MEM_LAT+3].
 *
 * Loop: stage 0 loads a word from src2, stage MEM_LAT+3 merges words with
 * shrp into w[0], stage MEM_LAT+4 stores w[1] to dst.  After the loop, the
 * word at src_end is loaded and merged into t0 for .memcpy_tail.
 */
#define COPY(shift,index)									\
 1: { .mib											\
	(p[0])		ld8 val[0]=[src2],8;							\
	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
			brp.loop.imp 1b, 2f							\
    };												\
 2: { .mfb											\
	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
			nop.f 0;								\
			br.ctop.dptk.few 1b;							\
    };												\
			;;									\
			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
			;;									\
			shrp t0=val[N-1],val[N-index],shift;					\
			br .memcpy_tail
.memcpy_loops:
	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
	COPY(8, 0)
	COPY(16, 0)
	COPY(24, 0)
	COPY(32, 0)
	COPY(40, 0)
	COPY(48, 0)
	COPY(56, 0)

END(memcpy)
EXPORT_SYMBOL(memcpy)