/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

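/*
 * COPY_16_BYTES moves 16 bytes per expansion as four word loads followed
 * by four word stores.  The update forms (lwzu/stwu) leave r4 and r6
 * pointing at the last word copied, so consecutive expansions simply
 * continue from offset 4.
 */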
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

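/*
 * The same 16-byte copy, but every load and store carries a local label
 * (8n0 .. 8n7) so that __copy_tofrom_user can attach exception-table
 * fixups to each individual access.
 */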
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

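/*
 * Fixup code paired with COPY_16_BYTES_WITHEX(n): on a fault, r5 is
 * reduced by the 16 * n bytes of the current cacheline that earlier
 * expansions already copied, then control goes to 104f (fault on a
 * load) or 105f (fault on a store).  The __ex_table entries map each
 * labelled access to the appropriate stub.
 */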
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore skip the optimised block that uses dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL(memset)
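/*
 * Replicate the fill byte in r4 across the whole word: the first rlwimi
 * copies it into bits 16-23 (giving a halfword pattern), the second
 * copies that halfword into the upper half.
 */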
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	cmplwi	0,r4,0
	bne	2f	/* Use normal procedure if r4 is not zero */
EXPORT_SYMBOL(memset)
_GLOBAL(memset_nocache_branch)
	b	2f	/* Skip optimised block until cache is enabled */

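/*
 * Zeroing with the cache enabled: store words up to the first cacheline
 * boundary, clear whole cachelines with dcbz, then fall through to 2:
 * for whatever is left.
 */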
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

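/* Word-at-a-time stores for the rest, then up to three trailing bytes. */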
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	b	generic_memcpy
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

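/*
 * Align the destination to a cacheline boundary: copy the odd leading
 * bytes first, then leading words, before entering the dcbz loop at 58:.
 */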
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

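/*
 * Main loop: dcbz establishes each destination line in the cache without
 * reading it from memory, then COPY_16_BYTES is repeated enough times to
 * fill one L1 cacheline per iteration.
 */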
58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)

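/*
 * Plain forward copy with no cache tricks, used before the data cache is
 * enabled and when the cacheline-optimised path is not safe.  If the
 * destination is word aligned it moves two words per iteration, otherwise
 * it copies single bytes at 5:/6: until it is aligned.
 *
 * Roughly, as an illustrative sketch (not a literal translation):
 *
 *	while (len >= 8, dest word aligned)  copy two 32-bit words;
 *	if (len >= 4)                        copy one word;
 *	copy the remaining 0-3 bytes one at a time;
 */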
generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

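/*
 * Copy from the end of the buffer towards the start.  memmove branches
 * here when the destination lies above the source, so a descending copy
 * cannot overwrite source bytes that have not been read yet.
 */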
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r0 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

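/*
 * Copy between user and kernel space: r3 = destination, r4 = source,
 * r5 = byte count.  Returns in r3 the number of bytes that could not be
 * copied; every access that may fault is listed in __ex_table so a fault
 * lands in the fixup code further down instead of oopsing.
 */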
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

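/*
 * r7 is the chosen prefetch distance in cachelines.  The first pass
 * copies all but the last r7 lines while touching lines ahead with dcbt;
 * it then loops back once with r7 = 0 to copy the remaining lines, so the
 * prefetch does not run past the end of the source.
 */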
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

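/*
 * Fault fixups for __copy_tofrom_user.  Each stub records whether the
 * fault was on a read (r9 = 0) or a write (r9 = 1) and sets r3 to the
 * log2 of the unit the interrupted loop was copying, so that 99:/106:
 * below can compute how many bytes remain as r5 + (ctr << r3).
 */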
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

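/*
 * Exception table entries for the trailing word and byte loops above and
 * for the read-fault recovery loops, pairing each faulting access with
 * its fixup address.
 */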
	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
EXPORT_SYMBOL(__copy_tofrom_user)
