xref: /openbmc/linux/arch/powerpc/lib/copy_32.S (revision 3dc4b6fb)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/code-patching-asm.h>
#include <asm/kasan.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)
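/*
 * COPY_16_BYTES copies one aligned 16-byte chunk.  r4 and r6 point 4 bytes
 * before the next source/destination word, so the plain lwz/stw pairs use
 * offsets 4, 8 and 12, and the final lwzu/stwu at offset 16 both copies the
 * last word and advances both pointers by 16 ready for the next chunk.
 */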

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)
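/*
 * COPY_16_BYTES_WITHEX(n) is the same 16-byte copy, but each load and store
 * carries a numeric label (8n0-8n7) so user-access faults can be fixed up.
 * COPY_16_BYTES_EXCODE(n) provides the matching fixup stubs: a fault on one
 * of the loads (8n0-8n3) is routed to 9n0, a fault on one of the stores
 * (8n4-8n7) to 9n1.  Both stubs subtract the 16 * n bytes already copied in
 * the current cacheline from the residual count in r5 (the common fixup code
 * otherwise treats the whole faulting cacheline as uncopied), then branch to
 * the shared read (104) or write (105) fault handler.
 */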

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

#ifndef CONFIG_KASAN
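/*
 * memset16(dst, val, count) fills count 16-bit halfwords at dst with val.
 * The halfword is replicated into a full word so the loop below can store
 * count / 2 words at a time; a trailing sth handles an odd count.  dst is
 * a u16 pointer in the C prototype, so 2-byte alignment is assumed.
 */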
_GLOBAL(memset16)
	rlwinm.	r0 ,r5, 31, 1, 31
	addi	r6, r3, -4
	beq-	2f
	rlwimi	r4 ,r4 ,16 ,0 ,15
	mtctr	r0
1:	stwu	r4, 4(r6)
	bdnz	1b
2:	andi.	r0, r5, 1
	beqlr
	sth	r4, 4(r6)
	blr
EXPORT_SYMBOL(memset16)
#endif

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore skip the optimised block that uses dcbz.  This jump
 * is replaced by a nop once the cache is active; this is done in
 * machine_init().
 */
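/*
 * Rough shape of the code below, in illustrative C only:
 *
 *	if (n < 4)
 *		goto small;			// byte loop at label 7
 *	replicate c into all four bytes of r4;
 *	store one word at dst to cover the unaligned head;
 *	round the pointer down to a word boundary;
 *	if (c == 0 && the cache is up)		// the patched 'b 2f' below
 *		dcbz each complete destination cacheline;
 *	store the remaining words, then the trailing bytes;
 */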
_GLOBAL_KASAN(memset)
	cmplwi	0,r5,4
	blt	7f

	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	stw	r4,0(r3)
	beqlr
	andi.	r0,r3,3
	add	r5,r0,r5
	subf	r6,r0,r3
	cmplwi	0,r4,0
	/*
	 * Skip the optimised block until the cache is enabled.  This branch is
	 * replaced by 'bne' during boot so the normal path is used when r4 is
	 * not zero.
	 */
5:	b	2f
	patch_site	5b, patch__memset_nocache

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r3,-1
9:	stbu	r4,1(r6)
	bdnz	9b
	blr
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL_KASAN(memset)

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore jump to generic_memcpy, which doesn't use dcbz.  This
 * jump is replaced by a nop once the cache is active; this is done in
 * machine_init().
 */
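/*
 * Rough shape of the cache-assisted copy below, in illustrative C only:
 *
 *	copy bytes, then words, until dst reaches a cacheline boundary;
 *	for each complete destination cacheline:
 *		dcbz the line;			// avoid fetching it from memory
 *		copy it with COPY_16_BYTES;
 *	copy the remaining words, then the trailing bytes;
 */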
_GLOBAL_KASAN(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL_KASAN(memcpy)
1:	b	generic_memcpy
	patch_site	1b, patch__memcpy_nocache

	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)
EXPORT_SYMBOL_KASAN(memcpy)
EXPORT_SYMBOL_KASAN(memmove)

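/*
 * generic_memcpy: plain forward copy used while the cache is still off and
 * as the fallback when the source and destination ranges overlap.  Once the
 * destination is word aligned it moves two words per loop iteration, then
 * finishes with at most one word and three byte stores.
 */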
generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

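/*
 * backwards_memcpy: copy from the end towards the start so that overlapping
 * moves with dst above src (the case memmove sends here) never overwrite
 * source bytes before they have been read.  The structure mirrors
 * generic_memcpy, only with decrementing pointers.
 */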
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

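/*
 * __copy_tofrom_user(to, from, size) is the same cacheline-oriented copy as
 * memcpy above, but every load and store is covered by an EX_TABLE entry so
 * a fault on a user address drops into fixup code instead of oopsing.  It
 * returns 0 on success, or the number of bytes not copied on a fault; the
 * fixup paths below reconstruct that count from r5 and the loop count left
 * in CTR.
 */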
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
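/*
 * For example, COPY_16_BYTES_EXCODE(0) below pairs with the loads labelled
 * 800-803 and the stores labelled 804-807 in the cacheline loop: a fault in
 * 800-803 is routed to 900 (read side) and a fault in 804-807 to 901 (write
 * side), each of which subtracts the bytes already done in that cacheline
 * from r5 before branching to 104 or 105.
 */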
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
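/*
 * For instance, the cacheline-loop path (label 92 above) enters 106 with r0
 * holding the number of unfinished cachelines and r3 = LG_CACHELINE_BYTES,
 * so the slw below scales that count to bytes; the final word loop enters 99
 * with r3 = 2, scaling the remaining word count by 4.
 */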
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* on a further fault, return the number of bytes still not copied (from CTR) */
132:	mfctr	r3
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)