xref: /openbmc/linux/arch/powerpc/lib/copy_32.S (revision 87c2ce3b)
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

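/*
 * Copy 16 bytes from the source pointer in r4 to the destination pointer
 * in r6 (both pre-decremented by 4), using r7-r10 as scratch; the final
 * lwzu/stwu advance r4 and r6 by 16 so the macro can be repeated.
 */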
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

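/*
 * The same 16-byte copy, but with a local label of the form 8<n><i> on
 * every load and store so that COPY_16_BYTES_EXCODE(n) below can attach
 * exception-table fixups to each access when copying user memory.
 */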
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

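/*
 * Fault fixups for COPY_16_BYTES_WITHEX(n): a fault on one of its loads
 * is sent to 9<n>0 and a fault on one of its stores to 9<n>1; both adjust
 * the running byte count in r5 by the 16*n bytes already handled in this
 * cache line, then branch to the common recovery code at 104f or 105f.
 */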
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 */
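/*
 * In outline (a rough C-level sketch, not the exact code below): zero
 * words up to the first cache line boundary, then
 *
 *	while (a whole cache line remains) {
 *		dcbz(dest);			zero a full line in the dcache
 *		dest += CACHELINE_BYTES;
 *	}
 *
 * and finish the remaining words and bytes with ordinary stores.
 */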
_GLOBAL(cacheable_memzero)
	mr	r5,r4
	li	r4,0
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
#if !defined(CONFIG_8xx)
10:	dcbz	r7,r6
#else
10:	stw	r4, 4(r6)
	stw	r4, 8(r6)
	stw	r4, 12(r6)
	stw	r4, 16(r6)
#if L1_CACHE_BYTES >= 32
	stw	r4, 20(r6)
	stw	r4, 24(r6)
	stw	r4, 28(r6)
	stw	r4, 32(r6)
#endif /* L1_CACHE_BYTES */
#endif
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

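/*
 * memset(r3 = dest, r4 = fill byte, r5 = length).  The two rlwimi
 * instructions replicate the low byte of r4 into all four byte lanes:
 * for example r4 = 0x000000ab becomes 0x0000abab and then 0xabababab,
 * so the word loop stores four fill bytes per iteration.
 */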
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 */
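/*
 * The overlap check at the top of the routine is, in C terms, roughly
 *
 *	if (src < dst + n && dst < src + n)
 *		return memcpy(dst, src, n);
 *
 * i.e. the dcbz-based loop is only used on disjoint regions, since dcbz
 * zeroes whole destination lines before they are refilled and could
 * clobber not-yet-read source data if the two regions overlapped.
 */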
_GLOBAL(cacheable_memcpy)
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	memcpy			/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
#if !defined(CONFIG_8xx)
	dcbz	r11,r6
#endif
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	blr

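/*
 * memmove: when the destination starts above the source an overlapping
 * forward copy would overwrite source bytes before they are read, so
 * copy backwards; otherwise fall through to the forward memcpy below.
 */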
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

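/*
 * Forward memcpy (dest in r3, src in r4, length in r5): once the
 * destination is word-aligned it moves eight bytes per iteration with
 * paired lwz/lwzu and stw/stwu, then mops up the trailing word and bytes.
 */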
_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

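/*
 * Same as memcpy but starting at the end of both buffers and working
 * downwards, which is what memmove needs when the destination overlaps
 * the tail of the source.
 */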
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

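/*
 * __copy_tofrom_user(to, from, size): the same cache-line-at-a-time copy
 * as cacheable_memcpy, but with every access covered by an __ex_table
 * entry.  On a read fault the recovery code below retries byte by byte
 * and then zeroes whatever part of the destination it could not fill;
 * r3 returns the number of bytes not copied (0 on complete success).
 */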
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

#ifdef CONFIG_8xx
	/* Don't use prefetch on 8xx */
	mtctr	r0
	li	r0,0
53:	COPY_16_BYTES_WITHEX(0)
	bdnz	53b

#else /* not CONFIG_8xx */
	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
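/* At this point r7 is the number of cache lines being prefetched ahead
   (0, 1 or MAX_COPY_PREFETCH) and r3 is the byte offset of the next line
   to touch with dcbt; the loop below copies r0 - r7 lines with prefetch,
   then comes back for the last r7 lines without prefetching further. */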

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b
#endif /* CONFIG_8xx */

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
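/*
 * For example, a fault at label 8<n>4 (the first stw of the n'th 16-byte
 * group) is routed by the table above to 9<n>1, which knocks the 16*n
 * bytes already copied in this line off r5 and branches to the
 * write-fault handler at 105.
 */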
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
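/*
 * For instance, a fault in the final word loop arrives here with r3 = 2,
 * so with ctr = 5 words still to go and r5 = 3 trailing bytes the number
 * of uncopied bytes is 3 + (5 << 2) = 23.
 */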
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
544