/* xref: /openbmc/linux/arch/powerpc/lib/memcpy_power7.S (revision 1ac731c529cd4d6adbce134754b51ff7d822b145) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

/*
 * Endian-dependent helpers used by the unaligned VMX copy path.
 *
 * LVS   builds the permute control vector from an address: lvsl on
 *       big endian, lvsr on little endian.
 * VPERM merges two source vectors under that control vector; on little
 *       endian the two source operands (VRA, VRB) are swapped.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

/*
 * memcpy_power7 - copy r5 bytes from the buffer at r4 to the buffer at r3.
 *
 * In:   r3 = destination, r4 = source, r5 = byte count
 * Out:  r3 = the destination pointer that was passed in
 *
 * The incoming r3 is parked below the stack pointer, in the slot that
 * STK_REG(R31) names once a STACKFRAMESIZE frame has been pushed, so
 * that every exit path can reload it as the return value.
 *
 * Dispatch:
 *   count < 16    -> .Lshort_copy
 *   count > 4096  -> .Lvmx_copy (branch exists only inside the
 *                    CPU_FTR_ALTIVEC feature section)
 *   otherwise     -> falls through to .Lnonvmx_copy
 */
_GLOBAL(memcpy_power7)
	cmpldi	r5,16			/* cr0: count vs 16 */
	cmpldi	cr1,r5,4096		/* cr1: count vs 4096 */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
/*
 * NOTE(review): test_feature is presumably consumed by the feature
 * section macros when this file is built as a selftest - confirm.
 */
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

/*
 * Scalar (GPR only) copy path.
 *
 * Reached by fall-through from the function entry or by a branch from
 * .Lunwind_stack_nonvmx_copy. r3 = destination cursor, r4 = source
 * cursor, r5 = bytes remaining; all three are updated as data is moved.
 *
 * "mtocrf 0x01,rX" loads cr7 from the low four bits of rX, so each
 * "bf cr7*4+N" below skips a step when one bit of a byte count is clear:
 * cr7*4+3 is the 1s bit, +2 the 2s bit, +1 the 4s bit, +0 the 8s bit.
 */
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)		/* r6 = bytes needed to align (0-7) */

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f			/* less than one cacheline left */

	/*
	 * Push a frame and save the non-volatile GPRs. The loop below
	 * uses r14-r21 as data registers. LR is stored at old r1 + 16;
	 * this path makes no calls and never reloads it.
	 */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7			/* number of whole 128B blocks */
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)		/* r5 = count mod 128 */

	/* Restore the non-volatile GPRs and pop the frame. */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4			/* cr7 <- 64s/32s/16s bits of r5 */
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f		/* 64B chunk */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f		/* 32B chunk */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f		/* 16B chunk */
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)		/* r5 = count mod 16 */

	/*
	 * Up to 15B to go. Also the direct target for copies shorter than
	 * 16 bytes; the low four bits of r5 select 8/4/2/1 byte moves.
	 */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Return the destination pointer saved at function entry. */
15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

/*
 * Fallback taken from .Lvmx_copy when enter_vmx_ops returned 0: pop
 * the frame that was pushed for the call and run the scalar copy.
 * r3/r4/r5 have already been reloaded with the original arguments.
 */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

/*
 * VMX copy path, selected at function entry for counts above 4096.
 *
 * r3 = destination, r4 = source, r5 = byte count. The arguments are
 * saved around the call to enter_vmx_ops; if that call returns 0 the
 * copy falls back to the scalar path via .Lunwind_stack_nonvmx_copy.
 * The frame pushed here stays live until label 15 below.
 *
 * The #ifdef CONFIG_ALTIVEC opened below is closed after
 * .Lvmx_unaligned_copy.
 */
.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	/* Slots below r1 become STK_REG(R30)/STK_REG(R29) after the stdu */
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	CFUNC(enter_vmx_ops)
	cmpwi	cr1,r3,0		/* cr1 is tested after the prefetch setup */
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 *
	 * The prefetch is issued before the cr1 test, so it is started
	 * whether or not the VMX copy goes ahead.
	 */
	clrrdi	r6,r4,7			/* source rounded down to 128B */
	clrrdi	r9,r3,7			/* destination rounded down to 128B */
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32		/* drop the sign-extended upper half */

	dcbt	0,r6,0b01000
	dcbt	0,r7,0b01010
	dcbtst	0,r9,0b01000
	dcbtst	0,r10,0b01010
	eieio
	dcbt	0,r8,0b01010	/* GO */

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)		/* low 4 bits of (src ^ dest) */
	bne	.Lvmx_unaligned_copy

	/*
	 * Get the destination 16B aligned.
	 * cr7 <- low four bits of -dest: cr7*4+3 is the 1s bit, +2 the 2s
	 * bit, +1 the 4s bit, +0 the 8s bit.
	 */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)		/* r6 = bytes needed to align (0-15) */

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/*
	 * Get the destination 128B aligned.
	 * cr7 <- bits 4-7 of -dest: cr7*4+3 selects a 16B move, +2 a 32B
	 * move, +1 a 64B move.
	 */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)		/* r6 = bytes to the 128B boundary */

	/* Index registers for the lvx/stvx offsets */
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* number of whole 128B blocks */

	/* r14-r16 are needed as extra index registers; save them first */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4			/* cr7 <- 64s/32s/16s bits of r5 */
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/*
	 * Pop the frame and reload the original destination into r3 before
	 * handing over to exit_vmx_ops.
	 * NOTE(review): presumably exit_vmx_ops returns its r3 argument so
	 * the caller still receives the destination pointer - confirm.
	 */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	CFUNC(exit_vmx_ops)		/* tail call optimise */

/*
 * VMX copy for the case where the low four bits of source and
 * destination differ (branched to from .Lvmx_copy).
 *
 * r3 = destination cursor, r4 = source cursor, r5 = bytes remaining.
 * The frame pushed in .Lvmx_copy is still live and is popped at
 * label 15.
 *
 * Once the destination is 16B aligned, v16 holds the permute control
 * vector built from the source address and v0 always holds the most
 * recently loaded source quadword. Each output quadword is produced
 * by VPERM from the previous and the next source quadword, so r4 runs
 * 16 bytes ahead of the data actually consumed until label 11 undoes
 * that offset.
 *
 * This code is inside the #ifdef CONFIG_ALTIVEC opened after
 * .Lvmx_copy; the matching #endif is the last line of this block.
 */
.Lvmx_unaligned_copy:
	/*
	 * Get the destination 16B aligned.
	 * cr7 <- low four bits of -dest: cr7*4+3 is the 1s bit, +2 the 2s
	 * bit, +1 the 4s bit, +0 the 8s bit.
	 */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)		/* r6 = bytes needed to align (0-15) */

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/*
	 * Get the destination 128B aligned.
	 * cr7 <- bits 4-7 of -dest: cr7*4+3 selects a 16B move, +2 a 32B
	 * move, +1 a 64B move.
	 */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)		/* r6 = bytes to the 128B boundary */

	/* Index registers for the lvx/stvx offsets */
	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4			/* prime v0 with the first quadword */
	addi	r4,r4,16

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1		/* v0 = v1: keep the newest quadword in v0 */

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* number of whole 128B blocks */

	/* r14-r16 are needed as extra index registers; save them first */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4			/* cr7 <- 64s/32s/16s bits of r5 */
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/*
	 * Pop the frame and reload the original destination into r3 before
	 * handing over to exit_vmx_ops.
	 * NOTE(review): presumably exit_vmx_ops returns its r3 argument so
	 * the caller still receives the destination pointer - confirm.
	 */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	CFUNC(exit_vmx_ops)		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */
