xref: /openbmc/linux/arch/powerpc/kernel/vector.S (revision b58c6630)
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/reg.h>
#include <asm/asm-offsets.h>
#include <asm/cputable.h>
#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/ptrace.h>
#include <asm/export.h>
#include <asm/asm-compat.h>

/*
 * Load state from memory into VMX registers including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(load_vr_state)
	li	r4,VRSTATE_VSCR
	lvx	v0,r4,r3
	mtvscr	v0
	REST_32VRS(0,r4,r3)
	blr
EXPORT_SYMBOL(load_vr_state)
_ASM_NOKPROBE_SYMBOL(load_vr_state); /* used by restore_math */

/*
 * Store VMX state into memory, including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(store_vr_state)
	SAVE_32VRS(0, r4, r3)
	mfvscr	v0
	li	r4, VRSTATE_VSCR
	stvx	v0, r4, r3
	blr
EXPORT_SYMBOL(store_vr_state)

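/*
 * A rough C-level view of the two helpers above (an illustrative sketch,
 * not the canonical prototypes): the caller must already have MSR[VEC] set,
 * e.g. via enable_kernel_altivec(), and passes a struct thread_vr_state:
 *
 *	load_vr_state(&current->thread.vr_state);   // memory -> v0..v31 + VSCR
 *	store_vr_state(&current->thread.vr_state);  // v0..v31 + VSCR -> memory
 *
 * VRSTATE_VSCR is the byte offset of the VSCR image within that struct,
 * which is why a single lvx/stvx with r4 = VRSTATE_VSCR reaches it.
 */
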
/*
 * Disable VMX for the task which had it previously,
 * and save its vector registers in its thread_struct.
 * Enables the VMX for use in the kernel on return.
 * On SMP we know the VMX is free, since we give it up every
 * switch (ie, no lazy save of the vector registers).
 *
 * Note that on 32-bit this can only use registers that will be
 * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
 */
_GLOBAL(load_up_altivec)
	mfmsr	r5			/* grab the current MSR */
	oris	r5,r5,MSR_VEC@h
	MTMSRD(r5)			/* enable use of AltiVec now */
	isync

	/*
	 * While userspace in general ignores VRSAVE, glibc uses it as a boolean
	 * to optimise userspace context save/restore. Whenever we take an
	 * altivec unavailable exception we must set VRSAVE to something non
	 * zero. Set it to all 1s. See also the programming note in the ISA.
	 */
	mfspr	r4,SPRN_VRSAVE
	cmpwi	0,r4,0
	bne+	1f
	li	r4,-1
	mtspr	SPRN_VRSAVE,r4
1:
	/* enable use of VMX after return */
#ifdef CONFIG_PPC32
	mfspr	r5,SPRN_SPRG_THREAD		/* current task's THREAD (phys) */
	oris	r9,r9,MSR_VEC@h
#ifdef CONFIG_VMAP_STACK
	tovirt(r5, r5)
#endif
#else
	ld	r4,PACACURRENT(r13)
	addi	r5,r4,THREAD		/* Get THREAD */
	oris	r12,r12,MSR_VEC@h
	std	r12,_MSR(r1)
#endif
	/* Don't care if r4 overflows, this is desired behaviour */
	lbz	r4,THREAD_LOAD_VEC(r5)
	addi	r4,r4,1
	stb	r4,THREAD_LOAD_VEC(r5)
	addi	r6,r5,THREAD_VRSTATE
	li	r4,1
	li	r10,VRSTATE_VSCR
	stw	r4,THREAD_USED_VR(r5)
	lvx	v0,r10,r6
	mtvscr	v0
	REST_32VRS(0,r4,r6)
	/* restore registers and return */
	blr

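/*
 * Roughly, in C (an illustrative sketch; the real constraints are the
 * register conventions described above):
 *
 *	mtmsr(mfmsr() | MSR_VEC);                  // kernel may touch VMX now
 *	if (mfspr(SPRN_VRSAVE) == 0)
 *		mtspr(SPRN_VRSAVE, -1);            // keep glibc's VRSAVE hint useful
 *	regs->msr |= MSR_VEC;                      // user regains VMX on return
 *	current->thread.load_vec++;                // wrap-around is fine
 *	current->thread.used_vr = 1;
 *	load_vr_state(&current->thread.vr_state);
 */
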
/*
 * save_altivec(tsk)
 * Save tsk's vector registers to its thread_struct
 */
_GLOBAL(save_altivec)
	addi	r3,r3,THREAD		/* want THREAD of task */
	PPC_LL	r7,THREAD_VRSAVEAREA(r3)
	PPC_LL	r5,PT_REGS(r3)
	PPC_LCMPI	0,r7,0
	bne	2f
	addi	r7,r3,THREAD_VRSTATE
2:	SAVE_32VRS(0,r4,r7)
	mfvscr	v0
	li	r4,VRSTATE_VSCR
	stvx	v0,r4,r7
	blr

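/*
 * Approximate C equivalent of save_altivec(tsk) (a sketch):
 *
 *	struct thread_vr_state *v = tsk->thread.vr_save_area;
 *
 *	if (!v)
 *		v = &tsk->thread.vr_state;
 *	store_vr_state(v);          // v0..v31 and VSCR into the chosen area
 */
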
#ifdef CONFIG_VSX

#ifdef CONFIG_PPC32
#error This asm code isn't ready for 32-bit kernels
#endif

/*
 * load_up_vsx(unused, unused, tsk)
 * Disable VSX for the task which had it previously,
 * and save its vector registers in its thread_struct.
 * Reuse the FP and VMX load paths, but first check whether those
 * units have already been loaded.
 */
_GLOBAL(load_up_vsx)
/* Load FP and VSX registers if they haven't been done yet */
	andi.	r5,r12,MSR_FP
	beql+	load_up_fpu		/* skip if already loaded */
	andis.	r5,r12,MSR_VEC@h
	beql+	load_up_altivec		/* skip if already loaded */

	ld	r4,PACACURRENT(r13)
	addi	r4,r4,THREAD		/* Get THREAD */
	li	r6,1
	stw	r6,THREAD_USED_VSR(r4) /* ... also set thread used vsr */
	/* enable use of VSX after return */
	oris	r12,r12,MSR_VSX@h
	std	r12,_MSR(r1)
	b	fast_interrupt_return

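/*
 * Sketch of the flow above in C terms (r12 holds the MSR of the interrupted
 * context, _MSR(r1) its saved copy):
 *
 *	if (!(msr & MSR_FP))
 *		load_up_fpu();          // FP half of the VSX register set
 *	if (!(msr & MSR_VEC))
 *		load_up_altivec();      // VMX half of the VSX register set
 *	current->thread.used_vsr = 1;
 *	regs->msr |= MSR_VSX;           // user regains VSX on interrupt return
 */
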
#endif /* CONFIG_VSX */


/*
 * The routines below are in assembler so we can closely control the
 * usage of floating-point registers.  These routines must be called
 * with preempt disabled.
 */
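/*
 * Each routine below walks the four 32-bit lanes of a 16-byte vector with
 * the scalar FPU.  As a rough C picture (a sketch; argument names are
 * illustrative, the callers pass pointers to float[4] lanes):
 *
 *	void vaddfp(float *dst, const float *a, const float *b)
 *	{
 *		int i;
 *
 *		for (i = 0; i < 4; i++)
 *			dst[i] = a[i] + b[i];
 *	}
 *
 * vsubfp, vmaddfp, vnmsubfp, vrefp and vrsqrtefp follow the same per-lane
 * pattern with the corresponding operation substituted.
 */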
#ifdef CONFIG_PPC32
	.data
fpzero:
	.long	0
fpone:
	.long	0x3f800000	/* 1.0 in single-precision FP */
fphalf:
	.long	0x3f000000	/* 0.5 in single-precision FP */

#define LDCONST(fr, name)	\
	lis	r11,name@ha;	\
	lfs	fr,name@l(r11)
#else

	.section ".toc","aw"
fpzero:
	.tc	FD_0_0[TC],0
fpone:
	.tc	FD_3ff00000_0[TC],0x3ff0000000000000	/* 1.0 */
fphalf:
	.tc	FD_3fe00000_0[TC],0x3fe0000000000000	/* 0.5 */

#define LDCONST(fr, name)	\
	lfd	fr,name@toc(r2)
#endif

	.text
/*
 * Internal routine to enable floating point and set FPSCR to 0.
 * Don't call it from C; it doesn't use the normal calling convention.
 */
fpenable:
#ifdef CONFIG_PPC32
	stwu	r1,-64(r1)
#else
	stdu	r1,-64(r1)
#endif
	mfmsr	r10
	ori	r11,r10,MSR_FP
	mtmsr	r11
	isync
	stfd	fr0,24(r1)
	stfd	fr1,16(r1)
	stfd	fr31,8(r1)
	LDCONST(fr1, fpzero)
	mffs	fr31
	MTFSF_L(fr1)
	blr

fpdisable:
	mtlr	r12
	MTFSF_L(fr31)
	lfd	fr31,8(r1)
	lfd	fr1,16(r1)
	lfd	fr0,24(r1)
	mtmsr	r10
	isync
	addi	r1,r1,64
	blr

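/*
 * Convention shared by the routines below: each caller does
 *
 *	mflr	r12
 *	bl	fpenable
 *	...per-lane FP work using fr0/fr1 (extra FPRs saved by the caller)...
 *	b	fpdisable
 *
 * fpenable makes a small stack frame, sets MSR[FP], saves fr0, fr1 and fr31,
 * and clears FPSCR while keeping the old value in fr31; fpdisable restores
 * all of that, reloads LR from r12 and returns to the original caller.
 */
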
/*
 * Vector add, floating point.
 */
_GLOBAL(vaddfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fadds	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector subtract, floating point.
 */
_GLOBAL(vsubfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fsubs	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector multiply and add, floating point.
 */
_GLOBAL(vmaddfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fmadds	fr0,fr0,fr2,fr1	/* fr0*fr2 + fr1, i.e. a*c + b as vmaddfp expects */
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector negative multiply and subtract, floating point.
 */
_GLOBAL(vnmsubfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fnmsubs	fr0,fr0,fr2,fr1	/* -(fr0*fr2 - fr1), i.e. -(a*c - b) as vnmsubfp expects */
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector reciprocal estimate.  We just compute 1.0/x.
 * r3 -> destination, r4 -> source.
 */
_GLOBAL(vrefp)
	mflr	r12
	bl	fpenable
	li	r0,4
	LDCONST(fr1, fpone)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	fdivs	fr0,fr1,fr0
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector reciprocal square-root estimate, floating point.
 * We use the frsqrte instruction for the initial estimate followed
 * by 2 iterations of Newton-Raphson to get sufficient accuracy.
 * r3 -> destination, r4 -> source.
 */
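/*
 * The update below is standard Newton-Raphson on f(r) = 1/r^2 - s, whose
 * root is r = 1/sqrt(s):
 *
 *	r' = r - f(r)/f'(r)
 *	   = r + (1/r^2 - s) * r^3 / 2
 *	   = r + 0.5 * r * (1 - s * r * r)
 *
 * which is exactly the fmuls/fnmsubs/fmadds sequence, applied twice per lane
 * to refine the frsqrte estimate.
 */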
_GLOBAL(vrsqrtefp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	stfd	fr3,40(r1)
	stfd	fr4,48(r1)
	stfd	fr5,56(r1)
	li	r0,4
	LDCONST(fr4, fpone)
	LDCONST(fr5, fphalf)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	frsqrte	fr1,fr0		/* r = frsqrte(s) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	stfsx	fr1,r3,r6
	addi	r6,r6,4
	bdnz	1b
	lfd	fr5,56(r1)
	lfd	fr4,48(r1)
	lfd	fr3,40(r1)
	lfd	fr2,32(r1)
	b	fpdisable