xref: /openbmc/linux/arch/powerpc/kernel/vector.S (revision be58f710)
1/* SPDX-License-Identifier: GPL-2.0 */
2#include <asm/processor.h>
3#include <asm/ppc_asm.h>
4#include <asm/reg.h>
5#include <asm/asm-offsets.h>
6#include <asm/cputable.h>
7#include <asm/thread_info.h>
8#include <asm/page.h>
9#include <asm/ptrace.h>
10#include <asm/export.h>
11#include <asm/asm-compat.h>
12
/*
 * load_vr_state
 *
 * Reload the complete VMX register file, VSCR included, from the
 * save image whose base address is in r3.
 *
 * In:       r3 = base of the saved vector state
 * Scratch:  r4 (index register), v0 (staging for VSCR before it is
 *           itself reloaded by REST_32VRS)
 *
 * The caller must already have enabled VMX in the MSR.
 */
_GLOBAL(load_vr_state)
	/* VSCR can only be written from a vector register: stage it in v0 */
	li	r4, VRSTATE_VSCR
	lvx	v0, r4, r3
	mtvscr	v0
	/* now reload v0-v31 (this overwrites the staging copy in v0) */
	REST_32VRS(0, r4, r3)
	blr
EXPORT_SYMBOL(load_vr_state)
_ASM_NOKPROBE_SYMBOL(load_vr_state); /* used by restore_math */
25
/*
 * store_vr_state
 *
 * Write the complete VMX register file, VSCR included, to the save
 * image whose base address is in r3.
 *
 * In:       r3 = base of the vector state save image
 * Scratch:  r4 (index register); v0 is overwritten with VSCR after its
 *           own contents have been stored
 *
 * The caller must already have enabled VMX in the MSR.
 */
_GLOBAL(store_vr_state)
	/* v0-v31 first, so that v0 is free to carry VSCR afterwards */
	SAVE_32VRS(0,r4,r3)
	mfvscr	v0
	li	r4,VRSTATE_VSCR
	stvx	v0,r4,r3
	blr
EXPORT_SYMBOL(store_vr_state)
37
/*
 * load_up_altivec
 *
 * Give the current task use of VMX (AltiVec):
 *   - turn MSR_VEC on in the live MSR so the kernel can touch the
 *     vector registers here,
 *   - set MSR_VEC in the MSR image that is restored on return
 *     (r9 on 32-bit; r12, written back to _MSR(r1), on 64-bit),
 *   - mark the thread as having loaded/used VMX,
 *   - load VSCR and v0-v31 from the thread's saved vector state.
 *
 * Nothing is saved by this routine; it only loads.
 *
 * Note that on 32-bit this can only use registers that will be
 * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
 */
_GLOBAL(load_up_altivec)
	/* Enable AltiVec for the kernel right now. */
	mfmsr	r5
	oris	r5,r5,MSR_VEC@h
	MTMSRD(r5)
	isync

	/*
	 * Userspace mostly ignores VRSAVE, but glibc treats it as a boolean
	 * to optimise its own context save/restore.  So whenever an altivec
	 * unavailable exception is taken, VRSAVE must end up non-zero: if it
	 * is currently zero, set it to all 1s.  See also the programming
	 * note in the ISA.
	 */
	mfspr	r4,SPRN_VRSAVE
	cmpwi	r4,0
	bne+	1f			/* already non-zero: leave it alone */
	li	r4,-1
	mtspr	SPRN_VRSAVE,r4
1:
	/* Make VMX stay enabled after return, and find the thread_struct. */
#ifdef CONFIG_PPC32
	oris	r9,r9,MSR_VEC@h
	addi	r5,r2,THREAD		/* r5 = r2 + THREAD */
#else
	oris	r12,r12,MSR_VEC@h
	std	r12,_MSR(r1)		/* update the saved MSR image */
	ld	r4,PACACURRENT(r13)
	addi	r5,r4,THREAD		/* r5 = current's THREAD */
#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * The saved MSR image was just modified, so clear PACASRR_VALID
	 * (presumably forcing the return path to reload it - confirm
	 * against the interrupt return code).
	 */
	li	r4,0
	stb	r4,PACASRR_VALID(r13)
#endif
#endif
	/* Record that this thread has VMX loaded and has used it. */
	li	r4,1
	stb	r4,THREAD_LOAD_VEC(r5)
	stw	r4,THREAD_USED_VR(r5)

	/* r6 = saved vector state; VSCR goes in via v0, then v0-v31. */
	addi	r6,r5,THREAD_VRSTATE
	li	r10,VRSTATE_VSCR
	lvx	v0,r10,r6
	mtvscr	v0
	REST_32VRS(0,r4,r6)
	blr
_ASM_NOKPROBE_SYMBOL(load_up_altivec)
91
/*
 * save_altivec(tsk)
 *
 * Save v0-v31 and VSCR of the task whose task pointer is in r3.
 *
 * In:       r3 = tsk
 * Scratch:  r3 (becomes tsk + THREAD), r4, r7; v0 is overwritten with
 *           VSCR once its own contents have been stored
 *
 * Destination: the buffer whose address is stored at
 * THREAD_VRSAVEAREA in the thread_struct if that pointer is non-zero,
 * otherwise the THREAD_VRSTATE area inside the thread_struct itself.
 */
_GLOBAL(save_altivec)
	addi	r3,r3,THREAD		/* want THREAD of task */
	PPC_LL	r7,THREAD_VRSAVEAREA(r3) /* alternate save area, if set */
	PPC_LCMPI	0,r7,0
	bne	2f
	addi	r7,r3,THREAD_VRSTATE	/* none: use the in-thread state */
2:	SAVE_32VRS(0,r4,r7)
	/* VSCR can only be read into a vector register; v0 is free now */
	mfvscr	v0
	li	r4,VRSTATE_VSCR
	stvx	v0,r4,r7
	blr
108
#ifdef CONFIG_VSX

#ifdef CONFIG_PPC32
#error This asm code isn't ready for 32-bit kernels
#endif

/*
 * load_up_vsx
 *
 * Make VSX usable by the current task.  The VSX registers overlay the
 * FP and VMX register files, so this simply reuses load_up_fpu and
 * load_up_altivec for whichever of the two is not yet enabled, then
 * marks the thread as a VSX user and sets MSR_VSX in the MSR image
 * that is restored on return.
 *
 * In:       r12 = MSR image being returned to (also written to _MSR(r1))
 *           r13 = PACA, r1 = interrupt stack frame
 * Scratch:  r4, r5, r6, cr0, LR, plus whatever the two helpers use
 *
 * Does not return to its caller: exits via fast_interrupt_return_srr.
 */
_GLOBAL(load_up_vsx)
	/* FP state: load it unless MSR_FP is already set in r12 */
	andi.	r5,r12,MSR_FP
	beql+	load_up_fpu
	/* VMX state: load it unless MSR_VEC is already set in r12 */
	andis.	r5,r12,MSR_VEC@h
	beql+	load_up_altivec

	/* note in the thread_struct that this task has used VSX */
	li	r6,1
	ld	r4,PACACURRENT(r13)
	addi	r4,r4,THREAD		/* Get THREAD */
	stw	r6,THREAD_USED_VSR(r4)

	/* the saved MSR image is about to change: clear PACASRR_VALID */
	li	r4,0
	stb	r4,PACASRR_VALID(r13)

	/* enable use of VSX after return */
	oris	r12,r12,MSR_VSX@h
	std	r12,_MSR(r1)
	b	fast_interrupt_return_srr

#endif /* CONFIG_VSX */
141
142
/*
 * Everything from here on is hand-written assembler so that the use of
 * the floating-point registers is under exact control.  Callers must
 * have preemption disabled.
 *
 * fpzero / fpone / fphalf are the constants 0.0, 1.0 and 0.5, and
 * LDCONST(fr, name) loads one of them into FP register 'fr':
 *   - 32-bit: single-precision words in .data, fetched with lis/lfs
 *             (clobbers r11);
 *   - 64-bit: double-precision TOC entries, fetched with lfd via r2.
 */
#ifdef CONFIG_PPC32
	.data
fpzero:
	.long	0x00000000	/* 0.0 in single-precision FP */
fpone:
	.long	0x3f800000	/* 1.0 in single-precision FP */
fphalf:
	.long	0x3f000000	/* 0.5 in single-precision FP */

#define LDCONST(fr, name)	\
	lis	r11,name@ha;	\
	lfs	fr,name@l(r11)
#else

	.section ".toc","aw"
fpzero:
	.tc	FD_0_0[TC],0				/* 0.0 */
fpone:
	.tc	FD_3ff00000_0[TC],0x3ff0000000000000	/* 1.0 */
fphalf:
	.tc	FD_3fe00000_0[TC],0x3fe0000000000000	/* 0.5 */

#define LDCONST(fr, name)	\
	lfd	fr,name@toc(r2)
#endif

	.text
/*
 * fpenable - internal prologue shared by the vector-emulation helpers.
 *
 * Not callable from C: it uses a private convention.
 *
 *   On entry:  r12 = the helper's own return address (the helper does
 *                    "mflr r12" before "bl fpenable")
 *   On exit:   r1  = new 64-byte stack frame
 *              r10 = MSR as it was on entry (consumed by fpdisable)
 *              r11 = scratch
 *              fr0, fr1, fr31 saved at 24(r1), 16(r1), 8(r1);
 *              offsets 32..56 are used by helpers for fr2..fr5
 *              fr31 = FPSCR as it was on entry
 *              FPSCR set from fr1, which holds fpzero (0.0)
 *
 * Floating point is enabled in the MSR on return.
 */
fpenable:
#ifdef CONFIG_PPC32
	stwu	r1,-64(r1)
#else
	stdu	r1,-64(r1)
#endif
	/* switch FP on, remembering the old MSR in r10 */
	mfmsr	r10
	ori	r11,r10,MSR_FP
	mtmsr	r11
	isync
	/* preserve the FP registers every helper uses */
	stfd	fr31,8(r1)
	stfd	fr1,16(r1)
	stfd	fr0,24(r1)
	/* capture the old FPSCR, then clear it */
	mffs	fr31
	LDCONST(fr1, fpzero)
	MTFSF_L(fr1)
	blr
196
/*
 * fpdisable - internal epilogue matching fpenable.
 *
 * Reached with a plain branch from the helpers, and returns to the
 * helper's caller through the address kept in r12.
 *
 *   In:  r12 = return address, r10 = MSR to restore,
 *        fr31 = FPSCR to restore, r1 = frame set up by fpenable
 */
fpdisable:
	/* put FPSCR back before fr31 is reloaded with its saved value */
	MTFSF_L(fr31)
	lfd	fr0,24(r1)
	lfd	fr1,16(r1)
	lfd	fr31,8(r1)
	mtlr	r12
	/* restore the original MSR; no FP access is allowed past here */
	mtmsr	r10
	isync
	addi	r1,r1,64		/* drop fpenable's frame */
	blr
207
/*
 * Vector add, floating point: dst[i] = a[i] + b[i] for the four
 * single-precision elements.
 * r3 -> destination, r4 -> a, r5 -> b.
 * Uses r0, r6, r10-r12, ctr; fr0/fr1 are preserved via fpenable.
 */
_GLOBAL(vaddfp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	li	r6,0			/* r6 = byte offset of the element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr1,r5,r6
	lfsx	fr0,r4,r6
	fadds	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
224
/*
 * Vector subtract, floating point: dst[i] = a[i] - b[i] for the four
 * single-precision elements.
 * r3 -> destination, r4 -> a, r5 -> b.
 * Uses r0, r6, r10-r12, ctr; fr0/fr1 are preserved via fpenable.
 */
_GLOBAL(vsubfp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	li	r6,0			/* r6 = byte offset of the element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr1,r5,r6
	lfsx	fr0,r4,r6
	fsubs	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
241
/*
 * Vector multiply and add, floating point:
 * dst[i] = a[i] * c[i] + b[i] for the four single-precision elements.
 * r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
 * Uses r0, r7, r10-r12, ctr; fr2 is saved at 32(r1) in the frame
 * created by fpenable.
 */
_GLOBAL(vmaddfp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	stfd	fr2,32(r1)
	li	r7,0			/* r7 = byte offset of the element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr2,r6,r7
	lfsx	fr1,r5,r7
	lfsx	fr0,r4,r7
	fmadds	fr0,fr0,fr2,fr1		/* a * c + b */
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable
261
/*
 * Vector negative multiply and subtract, floating point:
 * dst[i] = -(a[i] * c[i] - b[i]) for the four single-precision
 * elements.
 * r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
 * Uses r0, r7, r10-r12, ctr; fr2 is saved at 32(r1) in the frame
 * created by fpenable.
 */
_GLOBAL(vnmsubfp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	stfd	fr2,32(r1)
	li	r7,0			/* r7 = byte offset of the element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr2,r6,r7
	lfsx	fr1,r5,r7
	lfsx	fr0,r4,r7
	fnmsubs	fr0,fr0,fr2,fr1		/* -(a * c - b) */
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable
281
/*
 * Vector reciprocal estimate.  No estimate instruction is used: each
 * of the four single-precision elements is computed as 1.0 / x with a
 * real divide.
 * r3 -> destination, r4 -> source.
 * Uses r0, r6, r10-r12, ctr; fr0/fr1 are preserved via fpenable.
 */
_GLOBAL(vrefp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	LDCONST(fr1, fpone)		/* fr1 = 1.0 for the whole loop */
	li	r6,0			/* r6 = byte offset of the element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr0,r4,r6
	fdivs	fr0,fr1,fr0		/* 1.0 / x */
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
299
/*
 * Vector reciprocal square-root estimate, floating point.
 *
 * For each of the four single-precision elements s, start from the
 * frsqrte estimate r and refine it with two Newton-Raphson steps
 *	r = r + 0.5 * r * (1 - s * r * r)
 * to get sufficient accuracy.
 *
 * r3 -> destination, r4 -> source.
 * Uses r0, r6, r10-r12, ctr; fr2-fr5 are saved at 32..56(r1) in the
 * frame created by fpenable.  fr4 = 1.0 and fr5 = 0.5 throughout.
 */
_GLOBAL(vrsqrtefp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	stfd	fr5,56(r1)
	stfd	fr4,48(r1)
	stfd	fr3,40(r1)
	stfd	fr2,32(r1)
	LDCONST(fr5, fphalf)
	LDCONST(fr4, fpone)
	li	r6,0			/* r6 = byte offset of the element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr0,r4,r6		/* s */
	frsqrte	fr1,fr0			/* r = frsqrte(s) */
	/* first Newton-Raphson step */
	fmuls	fr2,fr1,fr5		/* r * 0.5 */
	fmuls	fr3,fr1,fr0		/* r * s */
	fnmsubs	fr3,fr1,fr3,fr4		/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1		/* r = r + 0.5 * r * (1 - s * r * r) */
	/* second Newton-Raphson step */
	fmuls	fr2,fr1,fr5		/* r * 0.5 */
	fmuls	fr3,fr1,fr0		/* r * s */
	fnmsubs	fr3,fr1,fr3,fr4		/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1		/* r = r + 0.5 * r * (1 - s * r * r) */
	stfsx	fr1,r3,r6
	addi	r6,r6,4
	bdnz	1b
	lfd	fr2,32(r1)
	lfd	fr3,40(r1)
	lfd	fr4,48(r1)
	lfd	fr5,56(r1)
	b	fpdisable
336