xref: /openbmc/linux/arch/ia64/lib/memcpy.S (revision f3a8b664)
/*
 *
 * Optimized version of the standard memcpy() function
 *
 * Inputs:
 * 	in0:	destination address
 *	in1:	source address
 *	in2:	number of bytes to copy
 * Output:
 * 	no return value
 *
 * Copyright (C) 2000-2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *	David Mosberger-Tang <davidm@hpl.hp.com>
 */
#include <asm/asmmacro.h>
#include <asm/export.h>

GLOBAL_ENTRY(memcpy)

#	define MEM_LAT	21		/* latency to memory */

#	define dst	r2
#	define src	r3
#	define retval	r8
#	define saved_pfs r9
#	define saved_lc	r10
#	define saved_pr	r11
#	define cnt	r16
#	define src2	r17
#	define t0	r18
#	define t1	r19
#	define t2	r20
#	define t3	r21
#	define t4	r22
#	define src_end	r23

#	define N	(MEM_LAT + 4)
#	define Nrot	((N + 7) & ~7)

	/*
	 * First, check if everything (src, dst, len) is a multiple of eight.  If
	 * so, we handle everything with no taken branches (other than the loop
	 * itself) and a small icache footprint.  Otherwise, we jump off to
	 * the more general copy routine handling arbitrary
	 * sizes/alignment etc.
	 */
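
	/*
	 * A rough C sketch of that dispatch (illustrative only: memcpy_short()
	 * and memcpy_long() are hypothetical stand-ins for the labels of the
	 * same names below, not real functions):
	 *
	 *	#include <stdint.h>
	 *	#include <stddef.h>
	 *
	 *	void *memcpy_sketch(void *dst, const void *src, size_t len)
	 *	{
	 *		if (len == 0)
	 *			return dst;
	 *		if (len < 16)
	 *			return memcpy_short(dst, src, len);
	 *		if (((uintptr_t)dst | (uintptr_t)src | len) & 7)
	 *			return memcpy_long(dst, src, len);
	 *		uint64_t *d = dst;		// all 8-byte aligned:
	 *		const uint64_t *s = src;	// one ld8/st8 per word
	 *		for (size_t i = 0; i < len / 8; i++)
	 *			d[i] = s[i];
	 *		return dst;
	 *	}
	 *
	 * Note the or/and pair below: OR-ing dst, src, and len and masking with
	 * 7 tests all three for 8-byte multiples in one go.
	 */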
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc
	or t0=in0,in1
	;;

	or t0=t0,in2
	.save pr, saved_pr
	mov saved_pr=pr

	.body

	cmp.eq p6,p0=in2,r0	// zero length?
	mov retval=in0		// return dst
(p6)	br.ret.spnt.many rp	// zero length, return immediately
	;;

	mov dst=in0		// copy because of rotation
	shr.u cnt=in2,3		// number of 8-byte words to copy
	mov pr.rot=1<<16
	;;

	adds cnt=-1,cnt		// br.ctop is repeat/until
	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
	mov ar.ec=N
	;;

	and t0=0x7,t0
	mov ar.lc=cnt
	;;
	cmp.ne p6,p0=t0,r0

	mov src=in1		// copy because of rotation
(p7)	br.cond.spnt.few .memcpy_short
(p6)	br.cond.spnt.few .memcpy_long
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	.rotr val[N]
	.rotp p[N]
	.align 32
1: { .mib
(p[0])	ld8 val[0]=[src],8
	nop.i 0
	brp.loop.imp 1b, 2f
}
2: { .mfb
(p[N-1])st8 [dst]=val[N-1],8
	nop.f 0
	br.ctop.dptk.few 1b
}
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp
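
	/*
	 * The loop above is modulo-scheduled via rotating registers: stage 0
	 * issues the ld8, stage N-1 retires the matching st8, and ar.ec drains
	 * the in-flight words after the last load.  A C model of the rotation
	 * (a sketch only; the hardware renames registers instead of copying):
	 *
	 *	#include <stdint.h>
	 *	#include <stddef.h>
	 *
	 *	enum { MEM_LAT = 21, STAGES = MEM_LAT + 4 };	// = N above
	 *
	 *	static void pipelined_copy(uint64_t *d, const uint64_t *s,
	 *				   size_t words)
	 *	{
	 *		uint64_t val[STAGES];		// models .rotr val[N]
	 *		for (size_t i = 0; i + 1 < words + STAGES; i++) {
	 *			if (i < words)		// (p[0]) ld8
	 *				val[i % STAGES] = s[i];
	 *			if (i >= STAGES - 1) {	// (p[N-1]) st8
	 *				size_t j = i - (STAGES - 1);
	 *				d[j] = val[j % STAGES];
	 *			}
	 *		}
	 *	}
	 */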

	/*
	 * Small (<16 bytes) unaligned copying is done via a simple byte-at-a-time
	 * copy loop.  This performs relatively poorly on Itanium, but it doesn't
	 * get used very often (gcc inlines small copies) and, due to atomicity
	 * issues, we want to avoid read-modify-write of entire words.
	 */
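
	/*
	 * In C, this path is just the following sketch; the point of the
	 * atomicity remark above is that only byte-sized stores ever touch
	 * the destination, so no neighboring bytes get read-modify-written:
	 *
	 *	unsigned char *d = dst;
	 *	const unsigned char *s = src;
	 *	while (len--)
	 *		*d++ = *s++;		// ld1/st1 pairs
	 */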
	.align 32
.memcpy_short:
	adds cnt=-1,in2		// br.ctop is repeat/until
	mov ar.ec=MEM_LAT
	brp.loop.imp 1f, 2f
	;;
	mov ar.lc=cnt
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	nop.m	0
	;;
	/*
	 * It is faster to put a stop bit in the loop here because it makes
	 * the pipeline shorter (and latency is what matters on short copies).
	 */
	.align 32
1: { .mib
(p[0])	ld1 val[0]=[src],1
	nop.i 0
	brp.loop.imp 1b, 2f
} ;;
2: { .mfb
(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
	nop.f 0
	br.ctop.dptk.few 1b
} ;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	/*
	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
	 * an overriding concern here, but throughput is.  We first do
	 * sub-word copying until the destination is aligned, then we check
	 * whether the source is also aligned.  If so, we run a simple
	 * load/store loop until fewer than 8 bytes are left over and then
	 * handle the tail by storing the last few bytes using sub-word
	 * copying.  If the source is not aligned, we branch off to the
	 * non-congruent loop.
	 *
	 *   stage:   op:
	 *         0  ld
	 *	   :
	 * MEM_LAT+3  shrp
	 * MEM_LAT+4  st
	 *
	 * On Itanium, the pipeline itself runs without stalls.  However, br.ctop
	 * seems to introduce an unavoidable bubble in the pipeline, so the overall
	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
	 * of 4 bytes/cycle.  Still not bad.
	 */
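
	/*
	 * As a C sketch of that flow (copy_head(), copy_words(), and
	 * copy_tail() are hypothetical names for the code sections below,
	 * not real functions):
	 *
	 *	unsigned char *d = dst;
	 *	const unsigned char *s = src;
	 *	size_t head = (0 - (uintptr_t)d) & 7;	// bytes to align dst
	 *	copy_head(d, s, head);			// the (p3)/(p4)/(p5) stores
	 *	d += head; s += head; len -= head;
	 *	copy_words(d, s, len / 8,		// ld8/st8 or shrp loop,
	 *		   (uintptr_t)s & 7);		// picked by src alignment
	 *	copy_tail(d + (len & ~(size_t)7),	// last (len & 7) bytes,
	 *		  len & 7);			// stored st4/st2/st1
	 */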
#	undef N
#	undef Nrot
#	define N	(MEM_LAT + 5)		/* number of stages */
#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */

#define LOG_LOOP_SIZE	6

.memcpy_long:
	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
	and t0=-8,src		// t0 = src & ~7
	and t2=7,src		// t2 = src & 7
	;;
	ld8 t0=[t0]		// t0 = 1st source word
	adds src2=7,src		// src2 = (src + 7)
	sub t4=r0,dst		// t4 = -dst
	;;
	and src2=-8,src2	// src2 = (src + 7) & ~7
	shl t2=t2,3		// t2 = 8*(src & 7)
	shl t4=t4,3		// t4 = -8*dst; bits 3-5 = (-dst) & 7
	;;
	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
	sub t3=64,t2		// t3 = 64-8*(src & 7)
	shr.u t0=t0,t2
	;;
	add src_end=src,in2
	shl t1=t1,t3
	mov pr=t4,0x38		// (p5,p4,p3) = (-dst) & 7 = bytes needed to align dst
	;;
	or t0=t0,t1
	mov cnt=r0
	adds src_end=-1,src_end
	;;
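
	/*
	 * The groups above assemble the first destination word from up to two
	 * source words.  In C (a sketch; the k == 0 case needs a guard because
	 * a C shift by 64 is undefined, whereas the ia64 shl above simply
	 * yields 0 for counts of 64 or more):
	 *
	 *	uint64_t w0 = *(const uint64_t *)((uintptr_t)src & ~7UL);
	 *	uint64_t w1 = *(const uint64_t *)(((uintptr_t)src + 7) & ~7UL);
	 *	unsigned k = 8 * ((uintptr_t)src & 7);		// t2
	 *	uint64_t t0 = w0 >> k;
	 *	if (k)						// t3 = 64 - k
	 *		t0 |= w1 << (64 - k);
	 *	// t0 now holds the first 8 bytes the destination should get
	 */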
(p3)	st1 [dst]=t0,1
(p3)	shr.u t0=t0,8
(p3)	adds cnt=1,cnt
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
(p4)	adds cnt=2,cnt
	;;
(p5)	st4 [dst]=t0,4
(p5)	adds cnt=4,cnt
	and src_end=-8,src_end	// src_end = last word of source buffer
	;;
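
	/*
	 * The predicated stores above drain (-dst) & 7 bytes out of t0 so that
	 * dst becomes 8-byte aligned; cnt counts the bytes consumed.  C sketch
	 * (the casts mirror st1/st2/st4; alignment of the wider stores holds
	 * by construction):
	 *
	 *	unsigned head = (0 - (uintptr_t)dst) & 7;	// = (p5,p4,p3)
	 *	unsigned char *d = dst;
	 *	uint64_t v = t0;
	 *	size_t cnt = 0;
	 *	if (head & 1) {					// (p3)
	 *		*d = (unsigned char)v;  d += 1; v >>= 8;  cnt += 1;
	 *	}
	 *	if (head & 2) {					// (p4)
	 *		*(uint16_t *)d = (uint16_t)v; d += 2; v >>= 16; cnt += 2;
	 *	}
	 *	if (head & 4) {					// (p5)
	 *		*(uint32_t *)d = (uint32_t)v; d += 4; cnt += 4;
	 *	}
	 */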

	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:

1:{	add src=cnt,src			// make src point to remainder of source buffer
	sub cnt=in2,cnt			// cnt = number of bytes left to copy
	mov t4=ip
  }	;;
	and src2=-8,src			// align source pointer
	adds t4=.memcpy_loops-1b,t4
	mov ar.ec=N

	and t0=7,src			// t0 = src & 7
	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
	shl cnt=cnt,3			// move bits 0-2 to 3-5
	;;

	.rotr val[N+1], w[2]
	.rotp p[N]

	cmp.ne p6,p0=t0,r0		// is src aligned, too?
	shl t0=t0,LOG_LOOP_SIZE		// t0 = 64*(src & 7) = offset of matching COPY expansion
	adds t2=-1,t2			// br.ctop is repeat/until
	;;
	add t4=t0,t4
	mov pr=cnt,0x38			// set (p5,p4,p3) to # of last-word bytes to copy
	mov ar.lc=t2
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
(p6)	ld8 val[1]=[src2],8		// prime the pump...
	mov b6=t4
	br.sptk.few b6
	;;
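
	/*
	 * The mov b6 / br.sptk pair above is a computed goto: each COPY()
	 * expansion below occupies exactly 64 bytes (hence LOG_LOOP_SIZE = 6),
	 * so the target is .memcpy_loops + 64*(src & 7).  In C terms (sketch;
	 * copy_shift() is a hypothetical stand-in for one COPY body):
	 *
	 *	unsigned shift = 8 * ((uintptr_t)src & 7);
	 *	copy_shift(dst, src, words, shift);	// picks COPY(shift, ...)
	 */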

.memcpy_tail:
	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
	// less than 8) and t0 contains the last few bytes of the src buffer:
(p5)	st4 [dst]=t0,4
(p5)	shr.u t0=t0,32
	mov ar.lc=saved_lc
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
	mov ar.pfs=saved_pfs
	;;
(p3)	st1 [dst]=t0
	mov pr=saved_pr,-1
	br.ret.sptk.many rp

///////////////////////////////////////////////////////
	.align 64

#define COPY(shift,index)									\
 1: { .mib											\
	(p[0])		ld8 val[0]=[src2],8;							\
	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
			brp.loop.imp 1b, 2f							\
    };												\
 2: { .mfb											\
	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
			nop.f 0;								\
			br.ctop.dptk.few 1b;							\
    };												\
			;;									\
			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
			;;									\
			shrp t0=val[N-1],val[N-index],shift;					\
			br .memcpy_tail
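
/*
 * Each COPY(shift, index) expansion is a funnel-shift copy: consecutive
 * aligned source words are merged pairwise with shrp to synthesize aligned
 * destination words.  A C sketch for the shift != 0 cases, modeling
 * shrp r=hi,lo,count as (lo >> count) | (hi << (64 - count)); copy_shift()
 * is a hypothetical name and s points at the aligned source base:
 *
 *	static void copy_shift(uint64_t *d, const uint64_t *s,
 *			       size_t words, unsigned shift)
 *	{
 *		uint64_t lo = s[0];		// primed by the ld8 before br b6
 *		for (size_t i = 0; i < words; i++) {
 *			uint64_t hi = s[i + 1];			// (p[0]) ld8
 *			d[i] = (lo >> shift) | (hi << (64 - shift)); // shrp
 *			lo = hi;
 *		}
 *	}
 *
 * COPY(0, 1) handles the congruent case, where shrp with a shift of 0
 * degenerates into a plain copy of the loaded word.
 */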
.memcpy_loops:
	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
	COPY(8, 0)
	COPY(16, 0)
	COPY(24, 0)
	COPY(32, 0)
	COPY(40, 0)
	COPY(48, 0)
	COPY(56, 0)

END(memcpy)
EXPORT_SYMBOL(memcpy)