/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/code-patching-asm.h>
#include <asm/kasan.h>

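/*
 * Copy 16 bytes from 4(r4) to 4(r6) using word loads and stores.
 * The update forms (lwzu/stwu) leave r4 and r6 pointing at the last
 * word copied, so repeating the macro walks through a cache line.
 */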
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

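/*
 * Same as COPY_16_BYTES, but each access carries a numbered label
 * (8n0..8n7) so that COPY_16_BYTES_EXCODE can attach exception table
 * entries to every load and store in the user-copy loop.
 */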
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

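/*
 * Fixup code for COPY_16_BYTES_WITHEX(n): a faulting load lands at
 * 9n0, a faulting store at 9n1.  Both subtract from r5 the 16*n bytes
 * already handled by the preceding copies of the macro, then join the
 * common fault handlers at 104:/105: below.
 */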
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

#ifndef CONFIG_KASAN
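/*
 * memset16(dst, val, count): store count halfwords of val at dst.
 * The 16-bit value is replicated into both halves of r4 so pairs of
 * halfwords can be written with word stores; a final sth handles an
 * odd count.
 */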
_GLOBAL(memset16)
	rlwinm.	r0, r5, 31, 1, 31
	addi	r6, r3, -4
	beq-	2f
	rlwimi	r4, r4, 16, 0, 15
	mtctr	r0
1:	stwu	r4, 4(r6)
	bdnz	1b
2:	andi.	r0, r5, 1
	beqlr
	sth	r4, 4(r6)
	blr
EXPORT_SYMBOL(memset16)
#endif

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore skip the optimised block that uses dcbz.  This jump is
 * replaced by a nop once the cache is active.  This is done in machine_init()
 */
_GLOBAL_KASAN(memset)
	cmplwi	0,r5,4
	blt	7f

	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	stw	r4,0(r3)
	beqlr
	andi.	r0,r3,3
	add	r5,r0,r5
	subf	r6,r0,r3
	cmplwi	0,r4,0
	/*
	 * Skip the optimised block until the cache is enabled.  This branch is
	 * replaced by 'bne' during boot, so the normal procedure is used
	 * whenever r4 is not zero.
	 */
5:	b	2f
	patch_site	5b, patch__memset_nocache

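	/*
	 * Zeroing a cacheable area: store words up to the first cache
	 * line boundary, clear each complete cache line with dcbz,
	 * then handle the remainder at 2: below.  This path is only
	 * reached once the cache is enabled and the fill value is zero
	 * (see the patch_site above).
	 */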
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

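	/* Store the remaining words, then any trailing bytes. */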
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

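	/* Fewer than 4 bytes to set: store them one at a time. */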
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r3,-1
9:	stbu	r4,1(r6)
	bdnz	9b
	blr
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL_KASAN(memset)

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore jump to generic_memcpy which doesn't use dcbz.  This
 * jump is replaced by a nop once the cache is active.  This is done in
 * machine_init()
 */
_GLOBAL_KASAN(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL_KASAN(memcpy)
1:	b	generic_memcpy
	patch_site	1b, patch__memcpy_nocache

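	/*
	 * The regions overlap iff src < dst + n and dst < src + n.
	 * crand ands the two "lt" results into cr0, so blt falls back
	 * to generic_memcpy, which copies without using dcbz.
	 */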
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

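	/*
	 * Main loop: clear each destination cache line with dcbz, then
	 * fill it 16 bytes at a time.  r11 = 4 because r6 points one
	 * word below the line being cleared.
	 */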
58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)
EXPORT_SYMBOL_KASAN(memcpy)
EXPORT_SYMBOL_KASAN(memmove)

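/*
 * Plain word-at-a-time copy, used before the caches are enabled and
 * whenever memcpy detects that source and destination overlap.
 */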
generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

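/*
 * Copy starting from the end of both buffers, so that memmove is safe
 * when the destination overlaps and lies above the source.
 */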
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

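/*
 * Copy between user and kernel space.  Same structure as the memcpy
 * cacheline loop, but every access is covered by an exception table
 * entry; on a fault the fixup code below computes how many bytes were
 * not copied and returns that count in r3 (0 on success).
 */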
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

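	/*
	 * r7 is the prefetch distance chosen above.  Copy r0 - r7
	 * lines in this pass, then loop back to 114: with r7 = 0 to
	 * finish the last r7 lines, whose prefetches have already
	 * been issued.
	 */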
53:	dcbt	r3,r4
54:	dcbz	r11,r6
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

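/*
 * Fault fixups.  Each handler records the direction in r9 (0 = fault
 * on a load from the source, 1 = fault on a store to the destination)
 * and sets r3 to log2 of the copy unit in use when the fault hit, so
 * that the code at 99:/106: can compute the number of bytes not
 * copied as r5 + (ctr << r3).
 */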
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* whether the retry completed or faulted again, ctr holds the number of bytes still not copied */
132:	mfctr	r3
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)
