/* xref: /openbmc/linux/arch/powerpc/lib/copy_32.S (revision f220d3eb) */
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/code-patching-asm.h>

/*
 * COPY_16_BYTES - copy one 16-byte chunk as four words.
 *
 * Loads the words at 4(r4)..16(r4) and stores them to 4(r6)..16(r6).
 * The last load and the last store use the update form, so both r4 and
 * r6 end up advanced by 16.  All four loads are issued before the first
 * store.
 *
 * Uses: r7, r8, r9, r10 as data registers.
 */
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

/*
 * COPY_16_BYTES_WITHEX(n) - same instruction sequence as COPY_16_BYTES,
 * but every instruction carries its own numeric local label so that an
 * exception-table entry can be attached to it.
 *
 * For chunk number n the labels are 8<n>0 .. 8<n>7:
 *   8<n>0 - 8<n>3  the four loads  (lwz, lwz, lwz, lwzu)
 *   8<n>4 - 8<n>7  the four stores (stw, stw, stw, stwu)
 * The matching fixup code and EX_TABLE entries are emitted by
 * COPY_16_BYTES_EXCODE(n).
 */
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

/*
 * COPY_16_BYTES_EXCODE(n) - fault fixups for COPY_16_BYTES_WITHEX(n).
 *
 * Emits two landing pads and the eight exception-table entries that
 * point at them:
 *   9<n>0  reached from a faulting load  (8<n>0..8<n>3), continues at 104f
 *   9<n>1  reached from a faulting store (8<n>4..8<n>7), continues at 105f
 * Both pads first subtract 16 * n from r5, i.e. the bytes that the n
 * preceding chunks of the current cacheline account for, so that the
 * common handler can compute the residual byte count.
 */
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)

62	.text
63	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
64	.stabs	"copy_32.S",N_SO,0,0,0f
650:
66
67CACHELINE_BYTES = L1_CACHE_BYTES
68LG_CACHELINE_BYTES = L1_CACHE_SHIFT
69CACHELINE_MASK = (L1_CACHE_BYTES-1)
70
/*
 * memset16 - fill memory with a 16-bit pattern.
 *
 * In:   r3 = destination, r4 = 16-bit value (low halfword),
 *       r5 = number of 16-bit items to store
 * Out:  r3 is left untouched
 * Uses: r0, r6, ctr, cr0
 *
 * Items are stored two at a time as full words; an odd trailing item
 * is stored as a single halfword.
 */
_GLOBAL(memset16)
	rlwinm.	r0,r5,31,1,31		/* r0 = r5 >> 1 = number of words */
	addi	r6,r3,-4		/* bias for the update-form store */
	beq-	2f			/* no full word to store */
	rlwimi	r4,r4,16,0,15		/* copy low halfword into high halfword */
	mtctr	r0
1:	stwu	r4,4(r6)
	bdnz	1b
2:	andi.	r0,r5,1			/* odd count? */
	beqlr
	sth	r4,4(r6)		/* store the last halfword */
	blr
EXPORT_SYMBOL(memset16)

/*
 * memset - fill a memory area with a byte value.
 *
 * In:   r3 = destination, r4 = fill value (only the low byte is used),
 *       r5 = length in bytes
 * Out:  r3 is left untouched
 * Uses: r0, r6, r7, r8, r9, ctr, cr0
 *
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore skip the optimised block that uses dcbz: the unconditional
 * branch at label 5 below is replaced by a 'bne' once cache is active, so
 * that only non-zero fills keep bypassing the dcbz block.
 * This is done in machine_init()
 */
_GLOBAL(memset)
	cmplwi	0,r5,4
	blt	7f			/* fewer than 4 bytes: byte loop */

	/* replicate the low byte of r4 into all four bytes */
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	stw	r4,0(r3)		/* first word, possibly unaligned */
	beqlr				/* cr0 from the cmplwi above: len == 4 */
	andi.	r0,r3,3			/* r0 = misalignment of the destination */
	add	r5,r0,r5		/* count is now relative to ... */
	subf	r6,r0,r3		/* ... r6 = destination rounded down to a word */
	cmplwi	0,r4,0			/* cr0 for the patched 'bne' below */
	/*
	 * Skip optimised block until cache is enabled. Will be replaced
	 * by 'bne' during boot to use normal procedure if r4 is not zero
	 */
5:	b	2f
	patch_site	5b, patch__memset_nocache

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES	/* r7 = offset of r6 in its line */
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2			/* words left before the next line starts */
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4			/* dcbz targets 4(r6), like the stwu above */
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES	/* bytes left after the last line */
	addi	r5,r5,4			/* compensates the word skipped by bdz below */

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f			/* the word at 0(r6) is already written */
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3			/* trailing bytes */
	beqlr
	mtctr	r5
	addi	r6,r6,3			/* so that 1(r6) is the next unwritten byte */
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

	/* short fill: 0 to 3 bytes, one byte at a time */
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r3,-1
9:	stbu	r4,1(r6)
	bdnz	9b
	blr
EXPORT_SYMBOL(memset)

/*
 * memmove / memcpy
 *
 * In:   r3 = destination, r4 = source, r5 = length in bytes
 * Out:  r3 is left untouched
 * Uses: r0, r4, r5, r6, r7, r8, r9, r10, r11, ctr, cr0, cr1
 *
 * memmove branches to backwards_memcpy when the destination is above the
 * source and otherwise falls through into memcpy.
 *
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy	/* dst > src: copy from the end */
	/* fall through */

_GLOBAL(memcpy)
1:	b	generic_memcpy
	patch_site	1b, patch__memcpy_nocache

	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7			/* cr0.lt = src < dst + len */
	cmplw	1,r3,r8			/* cr1.lt = dst < src + len */
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4		/* bias both pointers by -4 for the */
	addi	r6,r3,-4		/* 4(rX) / update-form accesses below */
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)		/* 3(r6): r6 was already incremented */
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4			/* dcbz offset matching the -4 bias of r6 */
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6			/* zero the destination line before filling it */
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

	/* remaining whole words */
63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

	/* remaining 0 to 3 bytes */
64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3			/* so that 1(rX) is the next byte */
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)

/*
 * generic_memcpy - forward copy that does not use dcbz.
 *
 * In:   r3 = destination, r4 = source, r5 = length in bytes
 * Out:  r3 is left untouched
 * Uses: r0, r4, r5, r6, r7, r8, ctr, cr0
 *
 * Copies 8 bytes per iteration once the destination is word aligned,
 * then at most one more word, then the trailing bytes.
 */
generic_memcpy:
	srwi.	r7,r5,3			/* r7 = number of 8-byte pairs */
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7			/* bytes left after the 8-byte loop */
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)		/* one more whole word */
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3			/* so that 1(rX) is the next byte */
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
	/* destination not word aligned: copy 4 - (dst & 3) bytes first */
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b
	mtctr	r7
	b	1b

/*
 * backwards_memcpy - copy from the last byte down to the first.
 *
 * In:   r3 = destination, r4 = source, r5 = length in bytes
 * Out:  r3 is left untouched
 * Uses: r0, r4, r5, r6, r7, r8, ctr, cr0
 *
 * r4 and r6 start one past the end of the source and destination and
 * are moved downwards by the negative-displacement update forms.
 */
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f			/* less than 8 bytes to do */
	andi.	r0,r6,3			/* bytes above the last word boundary of dst */
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7			/* bytes left after the 8-byte loop */
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)		/* one more whole word */
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
	/* end of destination not word aligned: copy r0 bytes first */
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b
	mtctr	r7
	b	1b

/*
 * __copy_tofrom_user - forward copy in which every load, store and dcbz
 * has an exception-table entry.
 *
 * In:   r3 = destination, r4 = source, r5 = length in bytes
 * Out:  r3 = number of bytes NOT copied (0 when everything was copied)
 * Uses: r0, r4, r5, r6, r7, r8, r9, r10, r11, ctr, cr0
 *
 * Phases: bytes then words up to the next destination cacheline
 * boundary, whole cachelines (dcbt prefetch of the source, dcbz of the
 * destination), remaining words, remaining bytes.
 *
 * Each fault handler below sets r9 (0 = read fault, 1 = write fault),
 * r3 (shift to apply to the loop counter) and r5 (bytes outside the
 * faulting loop) and then joins the common code at 99 / 106.
 */
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4		/* bias both pointers by -4 for the */
	addi	r6,r3,-4		/* 4(rX) / update-form accesses below */
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5		/* done after the byte loop: handlers 100/101 rely on it */
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4			/* dcbz offset matching the -4 bias of r6 */
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4			/* r3 = prefetch offset from r4 */
	cmpwi	r0,1
	li	r7,0			/* r7 = number of lines prefetched ahead */
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

	/*
	 * First pass copies r0 - r7 lines while prefetching ahead; r0 keeps
	 * the r7 lines that are left for the second pass (see 'bne 114b').
	 */
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0			/* lines left for a second pass? */
	li	r3,4
	li	r7,0
	bne	114b

	/* remaining whole words */
63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

	/* remaining 0 to 3 bytes */
64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0			/* everything copied */
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5		/* r8 = size of the initial byte run */
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8		/* lines of this pass + lines kept in r0 */
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f			/* write fault: return the count as is */
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/*
 * Return what is left in ctr as the number of bytes not copied: 0 when
 * the byte-wise retry ran to completion, otherwise the count at the
 * load that faulted again.  The destination is not cleared here.
 */
132:	mfctr	r3
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)
