/* xref: /openbmc/linux/arch/powerpc/lib/memcpy_power7.S (revision d4295e12) */
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

/*
 * Endian-dependent helpers used by the unaligned VMX copy path.
 *
 * LVS builds the permute control vector from an address (lvsl on big
 * endian, lvsr on little endian).  VPERM wraps vperm; the little-endian
 * variant swaps the two data operands (VRA/VRB) so that callers can pass
 * the "previous" and "current" source quadwords in the same order on
 * both endians.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

/*
 * memcpy_power7: copy r5 bytes from the address in r4 to the address in r3.
 *
 * In:   r3 = destination, r4 = source, r5 = length in bytes
 * Out:  r3 = original destination (saved below the stack pointer on entry
 *            and reloaded on every exit path)
 *
 * Dispatch:
 *   length <  16              -> .Lshort_copy
 *   length >  4096 (Altivec)  -> .Lvmx_copy
 *   otherwise                 -> falls through to .Lnonvmx_copy
 *
 * The "bgt cr1" branch sits inside a CPU feature section keyed on
 * CPU_FTR_ALTIVEC; NOTE(review): presumably it is patched out on CPUs
 * without Altivec - confirm against the feature-fixup macros.
 */
_GLOBAL(memcpy_power7)
	cmpldi	r5,16			/* cr0: length vs 16 */
	cmpldi	cr1,r5,4096		/* cr1: length vs 4096 */
	/* Stash the destination so it can be returned in r3 later */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

/*
 * Scalar (GPR only) copy path.
 *
 * r3 = destination cursor, r4 = source cursor, r5 = bytes remaining.
 * Steps: align the source to 8 bytes, copy whole 128B chunks in a counted
 * loop, then peel off 64/32/16 byte pieces.  Execution falls through to
 * .Lshort_copy with at most 15 bytes left in r5.
 */
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4			/* low bits = bytes to next 8B boundary */
	mtocrf	0x01,r6			/* low 4 bits of r6 -> cr7 */
	clrldi	r6,r6,(64-3)		/* r6 = 0..7 alignment bytes */

	bf	cr7*4+3,1f		/* 1 byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6		/* account for the alignment bytes */
	cmpldi	r5,128
	blt	5f			/* less than one 128B chunk: skip the loop */

	/*
	 * The loop needs more registers than the scratch set, so create a
	 * frame and save r14-r22 in it.  (r22 is saved and restored here
	 * although the loop below only uses r14-r21.)  LR is stored in the
	 * caller's frame; nothing in this path modifies LR afterwards.
	 */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7			/* number of 128B chunks (>= 1 here) */
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)		/* r5 = remaining bytes (0..127) */

	/* Restore the non-volatile registers and drop the frame */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4			/* remaining length in 16B units */
	mtocrf	0x01,r6			/* cr7 bits 1..3 <- 64B/32B/16B flags */

6:	bf	cr7*4+1,7f		/* 64 bytes */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f		/* 32 bytes */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f		/* 16 bytes */
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)		/* r5 = remaining bytes (0..15) */

192	/* Up to 15B to go */
193.Lshort_copy:
194	mtocrf	0x01,r5
195	bf	cr7*4+0,12f
196	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
197	lwz	r6,4(r4)
198	addi	r4,r4,8
199	stw	r0,0(r3)
200	stw	r6,4(r3)
201	addi	r3,r3,8
202
20312:	bf	cr7*4+1,13f
204	lwz	r0,0(r4)
205	addi	r4,r4,4
206	stw	r0,0(r3)
207	addi	r3,r3,4
208
20913:	bf	cr7*4+2,14f
210	lhz	r0,0(r4)
211	addi	r4,r4,2
212	sth	r0,0(r3)
213	addi	r3,r3,2
214
21514:	bf	cr7*4+3,15f
216	lbz	r0,0(r4)
217	stb	r0,0(r3)
218
21915:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
220	blr
221
/*
 * Fallback taken from the VMX path when enter_vmx_ops returned 0 in r3:
 * release the stack frame that the VMX path allocated and continue with
 * the scalar copy.
 */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

/*
 * VMX (Altivec) copy path, reached from the entry code when the length
 * is above 4096 bytes.
 *
 * Saves r4/r5/LR, allocates a stack frame and calls enter_vmx_ops.  The
 * arguments are reloaded afterwards (r3 from the slot written at function
 * entry) and prefetch streams are started for source and destination.  If
 * enter_vmx_ops returned 0 the code falls back to the scalar path via
 * .Lunwind_stack_nonvmx_copy.  Otherwise, when source and destination
 * share the same offset within 16 bytes, the aligned lvx/stvx copy below
 * runs; if they do not, control moves to .Lvmx_unaligned_copy.
 *
 * The frame stays allocated until label 15, where it is released before
 * the tail call to exit_vmx_ops.
 */
.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	/* These slots become STK_REG(R30)/STK_REG(R29) after the stdu below */
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_ops
	cmpwi	cr1,r3,0		/* result is tested after the prefetch setup */
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7			/* source rounded down to 128B */
	clrrdi	r9,r3,7			/* destination rounded down to 128B */
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	dcbt	0,r6,0b01000
	dcbt	0,r7,0b01010
	dcbtst	0,r9,0b01000
	dcbtst	0,r10,0b01010
	eieio
	dcbt	0,r8,0b01010	/* GO */

	/* enter_vmx_ops returned 0: drop the frame and copy with GPRs */
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)		/* low 4 address bits differ? */
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6			/* low 4 bits of r6 -> cr7 */
	clrldi	r6,r6,(64-4)		/* r6 = 0..15 alignment bytes */

	bf	cr7*4+3,1f		/* 1 byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f		/* 8 bytes */
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6		/* account for the alignment bytes */

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4			/* distance to 128B boundary, in 16B units */
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)		/* r6 = 0..127 alignment bytes */

	/* Index registers for the lvx/stvx offsets */
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f		/* 16 bytes */
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f		/* 32 bytes */
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f		/* 64 bytes */
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6		/* account for the alignment bytes */
	srdi	r6,r5,7			/* number of 128B chunks */

	/* r14-r16 are used as extra index registers; save them in the frame */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4			/* remaining length in 16B units */
	mtocrf	0x01,r6

	bf	cr7*4+1,9f		/* 64 bytes */
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f		/* 32 bytes */
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f		/* 16 bytes */
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5			/* low 4 bits of length -> cr7 */
	bf	cr7*4+0,12f		/* 8 bytes */
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f		/* 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f		/* 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f		/* 1 byte */
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Drop the frame, reload the original destination into r3 and leave */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_ops		/* tail call optimise */

/*
 * VMX copy for the case where source and destination have different
 * offsets within a 16 byte quadword.
 *
 * The destination is aligned first (to 16B, then to 128B).  The source
 * is read with lvx and each 16 byte output is assembled with VPERM from
 * two consecutive source quadwords, using the control vector that LVS
 * derives from the source address into v16.  v0 always holds the most
 * recently loaded source quadword, so r4 runs 16 bytes ahead of the data
 * actually consumed; this is undone at label 11 before the scalar tail.
 *
 * Runs with the stack frame allocated at .Lvmx_copy still in place; the
 * frame is released at label 15 before the tail call to exit_vmx_ops.
 */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6			/* low 4 bits of r6 -> cr7 */
	clrldi	r6,r6,(64-4)		/* r6 = 0..15 alignment bytes */

	bf	cr7*4+3,1f		/* 1 byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f		/* 8 bytes, done as two words */
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6		/* account for the alignment bytes */

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4			/* distance to 128B boundary, in 16B units */
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)		/* r6 = 0..127 alignment bytes */

	/* Index registers for the lvx/stvx offsets */
	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4			/* prime v0 with the first source quadword */
	addi	r4,r4,16		/* r4 now runs 16B ahead */

	bf	cr7*4+3,5f		/* 16 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1		/* v0 = last loaded quadword */

5:	bf	cr7*4+2,6f		/* 32 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f		/* 64 bytes */
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6		/* account for the alignment bytes */
	srdi	r6,r5,7			/* number of 128B chunks */

	/* r14-r16 are used as extra index registers; save them in the frame */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4			/* remaining length in 16B units */
	mtocrf	0x01,r6

	bf	cr7*4+1,9f		/* 64 bytes */
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f		/* 32 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f		/* 16 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5			/* low 4 bits of length -> cr7 */
	bf	cr7*4+0,12f		/* 8 bytes, done as two words */
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f		/* 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f		/* 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f		/* 1 byte */
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Drop the frame, reload the original destination into r3 and leave */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_ops		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */
