/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

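/*
 * Copy 16 bytes per expansion: four loads from 4(r4)..16(r4), then four
 * stores to 4(r6)..16(r6); the trailing lwzu/stwu advance r4 and r6 by 16.
 */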
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

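/*
 * The same 16-byte copy, but with a numeric label (8n0..8n7) on every
 * load and store so __copy_tofrom_user can attach exception table
 * entries to each access.
 */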
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

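/*
 * Fault fixup stubs for COPY_16_BYTES_WITHEX(n): subtract from r5 the
 * 16*n bytes of the current cache line that had already been copied,
 * then branch to the read-fault (104) or write-fault (105) handler.
 * The EX_TABLE entries map the loads to 9n0 and the stores to 9n1.
 */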
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

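/*
 * memset16(r3 = dest, r4 = 16-bit value, r5 = count of u16 elements).
 * The value is replicated into both halfwords of a word so that r5/2
 * word stores can be used, followed by one trailing halfword store when
 * the count is odd.  Roughly equivalent C, for illustration only:
 *
 *	void *memset16(uint16_t *s, uint16_t v, __kernel_size_t n)
 *	{
 *		uint16_t *p = s;
 *
 *		while (n--)
 *			*p++ = v;
 *		return s;
 *	}
 */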
_GLOBAL(memset16)
	rlwinm.	r0,r5,31,1,31		/* r0 = r5 >> 1 = number of full words */
	addi	r6,r3,-4
	beq-	2f
	rlwimi	r4,r4,16,0,15		/* replicate the halfword into both halves */
	mtctr	r0
1:	stwu	r4,4(r6)
	bdnz	1b
2:	andi.	r0,r5,1
	beqlr
	sth	r4,4(r6)		/* odd count: store the final halfword */
	blr
EXPORT_SYMBOL(memset16)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used. We therefore skip the optimised block that uses dcbz. This jump is
 * replaced by a nop once the cache is active. This is done in machine_init().
 */
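/* memset(r3 = dest, r4 = fill byte, r5 = count); r3 is preserved and returned */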
_GLOBAL(memset)
	cmplwi	0,r5,4
	blt	7f

	rlwimi	r4,r4,8,16,23		/* replicate the fill byte into a halfword */
	rlwimi	r4,r4,16,0,15		/* ... and the halfword into the full word */

	stw	r4,0(r3)
	beqlr
	andi.	r0,r3,3
	add	r5,r0,r5		/* adjust the count to match */
	subf	r6,r0,r3		/* word-aligned start address */
	cmplwi	0,r4,0			/* cr0 is tested by the patched 'bne' below */
	/*
	 * Skip optimised block until cache is enabled. Will be replaced
	 * by 'bne' during boot to use normal procedure if r4 is not zero
	 */
_GLOBAL(memset_nocache_branch)
	b	2f

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES	/* offset of r6 within its cache line */
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6		/* zero one whole cache line per iteration */
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)	/* store the remaining full words */
	bdnz	1b
6:	andi.	r5,r5,3
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)	/* then the trailing bytes */
	bdnz	8b
	blr

7:	cmpwi	0,r5,0		/* fewer than 4 bytes: simple byte loop */
	beqlr
	mtctr	r5
	addi	r6,r3,-1
9:	stbu	r4,1(r6)
	bdnz	9b
	blr
EXPORT_SYMBOL(memset)

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used. We therefore jump to generic_memcpy which doesn't use dcbz. This
 * jump is replaced by a nop once the cache is active. This is done in
 * machine_init().
 */
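/*
 * memmove/memcpy(r3 = dest, r4 = src, r5 = count); both return dest in r3.
 * memmove falls through to memcpy when dest <= src and branches to
 * backwards_memcpy otherwise, so overlapping moves are handled safely.
 */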
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	b	generic_memcpy		/* patched to a nop in machine_init() once the cache is active */
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

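/*
 * Main loop: r0 complete cache lines remain.  Each iteration establishes
 * the destination line with dcbz, so it need not be fetched from memory
 * before being overwritten, and then copies it 16 bytes at a time.
 */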
58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

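/* less than a cache line remains: copy the leftover words, then bytes */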
63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)

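/*
 * Plain copy loop with no dcbz: used before the caches are enabled (via
 * the patched branch at the top of memcpy) and whenever the dcbz-based
 * fast path cannot be used because the regions overlap.  Copies 8 bytes
 * per iteration once the destination is word-aligned.
 */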
generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4			/* 4 - (dest & 3) bytes to word-align the destination */
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31		/* r7 = remaining count >> 3 */
	beq	2b
	mtctr	r7
	b	1b

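/*
 * backwards_memcpy(r3 = dest, r4 = src, r5 = count): copies from the end
 * of the buffers towards the start, so an overlapping move with dest > src
 * (the memmove case) never overwrites bytes before they have been read.
 */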
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

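/*
 * __copy_tofrom_user(r3 = to, r4 = from, r5 = count): copy between user
 * and kernel space with every access covered by an exception table entry.
 * Returns the number of bytes NOT copied in r3 (0 on success).
 */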
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

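/*
 * Cacheline copy loop with software prefetch: r0 = lines left to copy,
 * r7 = lines being prefetched ahead, r3 = offset of the next line to
 * touch.  The first pass copies r0 - r7 lines while prefetching ahead;
 * the code then loops back once with r7 = 0 to copy the final r7 lines.
 */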
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

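/*
 * Exception fixup code.  r9 records whether the fault was on a load (0)
 * or a store (1), r3 is set to log2 of the element size of the loop that
 * faulted, and the number of uncopied bytes is computed at 99/106 below
 * as r5 + (ctr << r3).
 */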
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * This code handles faults in the cacheline loop and branches to either
 * 104f (if in the read part) or 105f (if in the write part), after
 * updating r5.
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
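/*
 * For example, a fault in the word loop at 30/31 is fixed up at 108/109:
 * r3 = 2, so the value returned below becomes (ctr << 2) + (r5 & 3), i.e.
 * the words not yet copied plus the trailing bytes that were never reached.
 */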
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* return the number of bytes still uncopied (0 if the retry completed) */
132:	mfctr	r3
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)