xref: /openbmc/linux/arch/powerpc/lib/memcpy_power7.S (revision c75df6f9)
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)
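/*
 * STK_REG(Rn) gives the save slot for nonvolatile GPR n inside our 256 byte
 * frame: offset 112 is the first doubleword above the fixed stack header and
 * parameter save area, so r14, r15, ... land at 112, 120, ...
 */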

_GLOBAL(memcpy_power7)
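	/*
	 * r3 = destination, r4 = source, r5 = length.  memcpy() must return
	 * the original destination, so stash r3 at 48(r1) (the first slot of
	 * the parameter save area our caller allocated) and reload it before
	 * returning.  Copies under 16 bytes go straight to the byte/word
	 * tail; with ALTIVEC, copies larger than 4096 bytes try the VMX path.
	 */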
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
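	/*
	 * neg leaves the distance to the next 8B boundary in the low bits of
	 * r6.  mtocrf 0x01 copies those low bits into cr7, so the bf tests
	 * below peel off a 1, 2 and 4 byte copy as needed; clrldi keeps the
	 * same count in r6 so it can be subtracted from the length at 3:.
	 */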
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

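	/*
	 * At least 128 bytes remain: set up a stack frame, save the
	 * nonvolatile GPRs r14-r22, and load the number of 128 byte chunks
	 * into CTR for the bdnz loop below.
	 */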
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
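	/*
	 * Each iteration moves one 128 byte chunk through 16 GPRs, with all
	 * of the loads issued before any of the stores.
	 */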
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
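	/*
	 * The remaining length is now below 128.  Bits of r5 (via cr7)
	 * select optional 64, 32 and 16 byte copies, then .Lshort_copy
	 * finishes the last 15 or fewer bytes.
	 */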
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
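	/*
	 * r5 is now below 16; its low four bits in cr7 select optional
	 * 8, 4, 2 and 1 byte copies.
	 */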
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,48(r1)
	blr

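	/*
	 * Reached from the VMX path when enter_vmx_copy() fails: the frame
	 * set up there is still on the stack, so pop it and fall back to
	 * the GPR copy.
	 */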
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
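	/*
	 * enter_vmx_copy() decides whether the vector unit may be used here
	 * (it returns 0 if not, e.g. from interrupt context).  It is a C
	 * call, so the live arguments are saved around it: r4/r5 in the
	 * caller's frame and r3 already at 48(r1) from the entry code.
	 */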
	mflr	r0
	std	r4,56(r1)
	std	r5,64(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_copy
	cmpwi	r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	cr1,r7,0x3FF
	ble	cr1,1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

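	/*
	 * The dcbt/dcbtst forms below program the prefetch streams: the
	 * 0b01000 variants describe a stream (cacheline aligned start
	 * address plus the stream ID in the low bits) and the 0b01010
	 * variants carry the cacheline count, depth and finally the GO bit,
	 * with the eieio ordering the descriptions before the GO.  The
	 * .machine "power4" push/pop lets the assembler accept these
	 * extended three operand dcbt forms.
	 */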
.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop

	beq	.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

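	/*
	 * r9/r10/r11 hold the indexed offsets 16/32/48 for lvx/stvx.  r0 in
	 * the RA position means a zero base, so lvx vrN,r0,r4 simply loads
	 * the 16 bytes at r4.
	 */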
	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
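	/*
	 * Eight lvx/stvx pairs move 128 bytes per iteration; since the
	 * destination is now 128 byte aligned, each group of stores fills a
	 * complete cacheline.
	 */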
	.align	5
8:
	lvx	vr7,r0,r4
	lvx	vr6,r4,r9
	lvx	vr5,r4,r10
	lvx	vr4,r4,r11
	lvx	vr3,r4,r12
	lvx	vr2,r4,r14
	lvx	vr1,r4,r15
	lvx	vr0,r4,r16
	addi	r4,r4,128
	stvx	vr7,r0,r3
	stvx	vr6,r3,r9
	stvx	vr5,r3,r10
	stvx	vr4,r3,r11
	stvx	vr3,r3,r12
	stvx	vr2,r3,r14
	stvx	vr1,r3,r15
	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

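	/*
	 * All done: pop the frame, reload the saved destination pointer
	 * (memcpy's return value) and tail call exit_vmx_copy(), whose
	 * return goes straight back to our caller.
	 */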
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)
	b	.exit_vmx_copy		/* tail call optimise */

.Lvmx_unaligned_copy:
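	/*
	 * Source and destination are misaligned with respect to each other.
	 * lvsl builds a permute control vector from the source offset, and
	 * each 16 bytes of output is produced by vperm-ing two consecutive
	 * aligned loads (lvx ignores the low four address bits).
	 */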
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
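	/*
	 * vr0 always holds the most recently loaded source quadword, so each
	 * vperm below combines the previous and the current 16 bytes; only
	 * one new lvx is needed per 16 bytes of output.
	 */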
	.align	5
8:
	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	stvx	vr12,r3,r12
	stvx	vr13,r3,r14
	stvx	vr14,r3,r15
	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
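	/*
	 * Throughout the permute loop r4 ran 16 bytes ahead of the data
	 * actually consumed (the next aligned quadword was always
	 * preloaded), so step it back before the scalar tail copy.
	 */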
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)
	b	.exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */