/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

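/*
 * Copy 16 bytes: four word loads into r7-r10, then four word stores.
 * r4 and r6 point 4 bytes before the source and destination; the
 * final lwzu/stwu update them so consecutive expansions walk forward.
 */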
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

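/*
 * As COPY_16_BYTES, but each load and store carries a numbered label
 * (8n0-8n7) so that COPY_16_BYTES_EXCODE can generate exception-table
 * entries for every instruction that may fault on a user access.
 */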
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

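/*
 * Fixup code for COPY_16_BYTES_WITHEX(n): subtract from r5 the 16*n
 * bytes already completed by earlier expansions in the current
 * cacheline, then branch to the read-fault (104f) or write-fault
 * (105f) handler.  Faulting loads (8n0-8n3) map to 9n0, faulting
 * stores (8n4-8n7) to 9n1.
 */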
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

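/*
 * void *memset(void *s, int c, size_t n)
 * r3 = s, r4 = c, r5 = n.  The two rlwimi instructions replicate the
 * low byte of r4 into all four bytes; the word-aligned body then
 * stores a word per iteration and a byte loop finishes the tail.
 */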
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

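/*
 * void *memmove(void *dest, const void *src, size_t n)
 * If dest > src the regions may overlap in a way a forward copy
 * would corrupt, so copy backwards; otherwise fall into memcpy.
 */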
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

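/*
 * void *memcpy(void *dest, const void *src, size_t n)
 * r3 = dest, r4 = src, r5 = n.  The main loop (1:) moves 8 bytes per
 * iteration once the destination is word-aligned (5:/6: align it by
 * bytes first); 2:/3:/4: handle the trailing word and bytes.
 */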
_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

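/*
 * Backwards variant of memcpy, used by memmove for overlapping
 * regions with dest > src: start at the end of both buffers and
 * work down, mirroring the alignment and tail handling above.
 */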
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

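/*
 * unsigned long __copy_tofrom_user(void *to, const void *from,
 *				    unsigned long size)
 * r3 = to, r4 = from, r5 = size.  Returns the number of bytes NOT
 * copied (0 on success).  Copies byte- then word-wise up to a
 * destination cacheline boundary, streams whole cachelines with
 * dcbt/dcbz, and finishes with word and byte tails; every user
 * access is covered by an exception-table entry.
 */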
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

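/* Main loop: r0 = number of complete cachelines, r5 = leftover bytes. */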
58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

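/*
 * Per-cacheline loop: touch the next source line (dcbt), establish
 * the destination line with dcbz so it is zeroed in cache rather
 * than fetched from memory, then copy it 16 bytes at a time.
 */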
53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

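/*
 * Fault fixups.  Each handler records the fault type in r9 (0 = read,
 * 1 = write) and sets r3 to log2 of the copy unit, so the count of
 * bytes not copied can be computed as r5 + (ctr << r3) at 99:/106:.
 */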
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * This code handles faults in the cacheline loop and branches to either
 * 104f (if in the read part) or 105f (if in the write part), after
 * updating r5.
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

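/*
 * Exception table for the tail word/byte loops (30/31, 40/41) and
 * for the byte-at-a-time recovery copy above (130/131, 112, 114).
 */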
	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text