/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

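/*
 * memcpy_power7(dst in r3, src in r4, len in r5)
 *
 * Three paths are picked by length; a sketch of the dispatch below,
 * assuming CONFIG_ALTIVEC (without it anything >= 16 bytes takes the
 * scalar loop):
 *
 *	if (len < 16)          -> .Lshort_copy   (word/half/byte tail copy)
 *	else if (len <= 4096)  -> .Lnonvmx_copy  (128B-unrolled GPR loop)
 *	else                   -> .Lvmx_copy     (VMX copy, with a vperm
 *	                                          loop when src and dst are
 *	                                          misaligned relative to
 *	                                          each other)
 *
 * The destination pointer is stashed at 48(r1) on entry so it can be
 * reloaded and returned in r3, as memcpy() requires.
 */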
_GLOBAL(memcpy_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)
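	/*
	 * r6 = (-src) & 7, the number of bytes needed to reach 8B
	 * alignment.  The mtocrf above copied the low bits of -src into
	 * CR7, so the bf tests below peel off the 1-, 2- and 4-byte
	 * pieces in turn.
	 */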

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)
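	/*
	 * A STACKFRAMESIZE frame was just created, so the caller's LR
	 * save slot is at STACKFRAMESIZE+16(r1).  The non-volatile
	 * r14-r22 are saved so the unrolled loop below has enough
	 * scratch GPRs to move a whole 128B cacheline per iteration.
	 */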

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6
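	/*
	 * CR7 now holds (len >> 4) & 0xf, so the bf cr7*4+1/+2/+3 tests
	 * below peel off a 64B, 32B and 16B chunk of what remains.
	 */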

6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
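	/*
	 * CR7 now holds the low 4 bits of the length, so cr7*4+0/+1/+2/+3
	 * select an 8-, 4-, 2- and 1-byte copy respectively.
	 */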
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,48(r1)
	blr

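	/*
	 * Reached from the VMX path when enter_vmx_copy says VMX cannot
	 * be used: pop the frame set up in .Lvmx_copy and do the copy
	 * with the scalar loop instead.
	 */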
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r4,56(r1)
	std	r5,64(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_copy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0
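	/*
	 * enter_vmx_copy may clobber the volatile registers, so dst, src
	 * and len were stashed in the caller's frame above and are
	 * reloaded here.  Its result is kept in cr1; a zero result (VMX
	 * unavailable) sends us to .Lunwind_stack_nonvmx_copy, but only
	 * after the prefetch streams below have been started.
	 */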

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	cr1,r7,0x3FF
	ble	cr1,1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

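	/*
	 * The TH=0b01000 forms name the start address and direction of
	 * the load (r6) and store (r9) streams; the TH=0b01010 forms set
	 * each stream's length and depth from r7/r10, and the final dcbt
	 * with the GO bit set in r8 starts both streams.  The .machine
	 * "power4" push is there so the assembler accepts these hinted
	 * three-operand dcbt/dcbtst forms.
	 */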
.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy
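	/*
	 * Source and destination agree in their low 4 address bits, so
	 * once the destination is 16B aligned the source is too, and
	 * plain lvx/stvx can be used throughout.
	 */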

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

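	/*
	 * r9/r10/r11 hold the 16/32/48 byte offsets for the indexed
	 * lvx/stvx below; the bf chain copies 16, 32 and 64 bytes as
	 * needed (r6 holds the total) to make the destination 128B
	 * aligned.
	 */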
	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	lvx	vr6,r4,r9
	lvx	vr5,r4,r10
	lvx	vr4,r4,r11
	lvx	vr3,r4,r12
	lvx	vr2,r4,r14
	lvx	vr1,r4,r15
	lvx	vr0,r4,r16
	addi	r4,r4,128
	stvx	vr7,r0,r3
	stvx	vr6,r3,r9
	stvx	vr5,r3,r10
	stvx	vr4,r3,r11
	stvx	vr3,r3,r12
	stvx	vr2,r3,r14
	stvx	vr1,r3,r15
	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

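	/*
	 * The VMX copy is done: pop the frame allocated in .Lvmx_copy,
	 * reload the original destination so it can be returned in r3,
	 * and tail call exit_vmx_copy, whose return goes straight back
	 * to memcpy_power7's caller.
	 */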
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)
	b	.exit_vmx_copy		/* tail call optimise */

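	/*
	 * Here source and destination disagree in alignment modulo 16,
	 * so every aligned 16B store has to be stitched together from
	 * two adjacent aligned 16B loads.  lvsl turns the source
	 * misalignment into a permute control vector (vr16), and vperm
	 * then picks the wanted 16 bytes out of the previous and current
	 * loads.  A rough sketch of the steady state (not the literal
	 * register usage):
	 *
	 *	prev = first aligned 16B block of src;
	 *	while (bytes left) {
	 *		cur  = next aligned 16B block of src;
	 *		*dst = vperm(prev, cur, vr16);
	 *		prev = cur;  dst += 16;
	 *	}
	 *
	 * vr0 carries "prev" from one block to the next below.
	 */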
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
	lvx	vr0,0,r4
	addi	r4,r4,16

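	/*
	 * vr0 is primed with the aligned 16B block containing the first
	 * remaining source byte and r4 has been bumped past it, so every
	 * lvx below fetches the next aligned block.  The extra 16 is
	 * undone at label 11 before the scalar tail.
	 */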
	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	stvx	vr12,r3,r12
	stvx	vr13,r3,r14
	stvx	vr14,r3,r15
	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)
	b	.exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */