/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/reg.h>
#include <asm/asm-offsets.h>
#include <asm/cputable.h>
#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/ptrace.h>
#include <asm/export.h>
#include <asm/asm-compat.h>

/*
 * Load state from memory into VMX registers including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(load_vr_state)
	li	r4,VRSTATE_VSCR
	lvx	v0,r4,r3
	mtvscr	v0
	REST_32VRS(0,r4,r3)
	blr
EXPORT_SYMBOL(load_vr_state)
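
/*
 * A rough C-level view of the routine above (a sketch only; the
 * declaration on the C side of the kernel is the authority):
 *
 *	void load_vr_state(struct thread_vr_state *v);
 *
 * r3 carries the pointer: the 32 VRs sit at the start of the buffer
 * and VSCR at offset VRSTATE_VSCR.
 */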

/*
 * Store VMX state into memory, including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(store_vr_state)
	SAVE_32VRS(0, r4, r3)
	mfvscr	v0
	li	r4, VRSTATE_VSCR
	stvx	v0, r4, r3
	blr
EXPORT_SYMBOL(store_vr_state)
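
/*
 * Both routines assume a buffer laid out like struct thread_vr_state
 * (shown here only as a sketch of the expected layout):
 *
 *	struct thread_vr_state {
 *		vector128 vr[32];	at offset 0
 *		vector128 vscr;		at offset VRSTATE_VSCR
 *	};
 */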

/*
 * Load the current task's vector state from its thread_struct and
 * enable VMX in the saved MSR so it is available again when we return
 * to the interrupted context.  The vector registers are given up on
 * every context switch (i.e. there is no lazy save), so the state in
 * the thread_struct is always the one to load.
 *
 * Note that on 32-bit this can only use registers that will be
 * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
 */
_GLOBAL(load_up_altivec)
	mfmsr	r5			/* grab the current MSR */
	oris	r5,r5,MSR_VEC@h
	MTMSRD(r5)			/* enable use of AltiVec now */
	isync

	/*
	 * While userspace in general ignores VRSAVE, glibc uses it as a
	 * boolean to optimise userspace context save/restore. Whenever we
	 * take an AltiVec unavailable exception we must set VRSAVE to
	 * something non-zero. Set it to all 1s. See also the programming
	 * note in the ISA.
	 */
	mfspr	r4,SPRN_VRSAVE
	cmpwi	0,r4,0
	bne+	1f
	li	r4,-1
	mtspr	SPRN_VRSAVE,r4
1:
	/* enable use of VMX after return */
#ifdef CONFIG_PPC32
	mfspr	r5,SPRN_SPRG_THREAD		/* current task's THREAD (phys) */
	oris	r9,r9,MSR_VEC@h
#else
	ld	r4,PACACURRENT(r13)
	addi	r5,r4,THREAD		/* Get THREAD */
	oris	r12,r12,MSR_VEC@h
	std	r12,_MSR(r1)
#endif
	/* Don't care if r4 overflows, this is desired behaviour */
	lbz	r4,THREAD_LOAD_VEC(r5)
	addi	r4,r4,1
	stb	r4,THREAD_LOAD_VEC(r5)
	addi	r6,r5,THREAD_VRSTATE
	li	r4,1
	li	r10,VRSTATE_VSCR
	stw	r4,THREAD_USED_VR(r5)
	lvx	v0,r10,r6
	mtvscr	v0
	REST_32VRS(0,r4,r6)
	/* restore registers and return */
	blr
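
/*
 * Roughly, in C terms, the thread-state bookkeeping above amounts to
 * the sketch below (illustrative only; the real work must stay in
 * assembly because of the register constraints noted in the header
 * comment):
 *
 *	current->thread.load_vec++;
 *	current->thread.used_vr = 1;
 *	load_vr_state(&current->thread.vr_state);
 *	regs->msr |= MSR_VEC;	(via _MSR(r1) on 64-bit, r9 on 32-bit)
 */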

/*
 * save_altivec(tsk)
 * Save tsk's vector registers into its thread_struct, or into the
 * alternate save area if one has been set up.
 */
_GLOBAL(save_altivec)
	addi	r3,r3,THREAD		/* want THREAD of task */
	PPC_LL	r7,THREAD_VRSAVEAREA(r3)
	PPC_LL	r5,PT_REGS(r3)
	PPC_LCMPI	0,r7,0
	bne	2f
	addi	r7,r3,THREAD_VRSTATE
2:	SAVE_32VRS(0,r4,r7)
	mfvscr	v0
	li	r4,VRSTATE_VSCR
	stvx	v0,r4,r7
	blr
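
/*
 * In C terms this is approximately (a sketch, assuming the
 * thread_struct field names that the asm offsets above refer to):
 *
 *	struct thread_struct *t = &tsk->thread;
 *	store_vr_state(t->vr_save_area ? t->vr_save_area : &t->vr_state);
 */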

#ifdef CONFIG_VSX

#ifdef CONFIG_PPC32
#error This asm code isn't ready for 32-bit kernels
#endif

/*
 * load_up_vsx(unused, unused, tsk)
 * Load up the current task's FP and VMX state by reusing load_up_fpu
 * and load_up_altivec, but first check whether that state has already
 * been loaded.  Then enable VSX for the task on return to the
 * interrupted context.
 */
_GLOBAL(load_up_vsx)
/* Load FP and VMX state if it hasn't been loaded yet */
	andi.	r5,r12,MSR_FP
	beql+	load_up_fpu		/* skip if already loaded */
	andis.	r5,r12,MSR_VEC@h
	beql+	load_up_altivec		/* skip if already loaded */

	ld	r4,PACACURRENT(r13)
	addi	r4,r4,THREAD		/* Get THREAD */
	li	r6,1
	stw	r6,THREAD_USED_VSR(r4) /* ... also set thread used vsr */
	/* enable use of VSX after return */
	oris	r12,r12,MSR_VSX@h
	std	r12,_MSR(r1)
	b	fast_exception_return
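
/*
 * VSX has no architected register state of its own: the 64 VSRs
 * overlay the 32 FPRs and the 32 VRs.  Loading the FP and AltiVec
 * state above and setting MSR_VSX in the saved MSR is therefore all
 * that is needed before returning to the interrupted context.
 */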

#endif /* CONFIG_VSX */


/*
 * The routines below are in assembler so we can closely control the
 * usage of floating-point registers.  These routines must be called
 * with preempt disabled.
 */
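
/*
 * They are the scalar fall-backs used when AltiVec floating-point
 * instructions have to be emulated.  Roughly, on the C side each
 * routine takes pointers to 128-bit quantities holding four
 * single-precision values, for example (a sketch, assuming the
 * vector128 type used elsewhere in the port):
 *
 *	void vaddfp(vector128 *dst, vector128 *a, vector128 *b);
 *	void vrefp(vector128 *dst, vector128 *src);
 *
 * and each processes one 32-bit element per loop iteration using
 * ordinary scalar floating point.
 */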
#ifdef CONFIG_PPC32
	.data
fpzero:
	.long	0
fpone:
	.long	0x3f800000	/* 1.0 in single-precision FP */
fphalf:
	.long	0x3f000000	/* 0.5 in single-precision FP */

#define LDCONST(fr, name)	\
	lis	r11,name@ha;	\
	lfs	fr,name@l(r11)
#else

	.section ".toc","aw"
fpzero:
	.tc	FD_0_0[TC],0
fpone:
	.tc	FD_3ff00000_0[TC],0x3ff0000000000000	/* 1.0 */
fphalf:
	.tc	FD_3fe00000_0[TC],0x3fe0000000000000	/* 0.5 */

#define LDCONST(fr, name)	\
	lfd	fr,name@toc(r2)
#endif

	.text
/*
 * Internal routine to enable floating point and set FPSCR to 0.
 * Don't call it from C; it doesn't use the normal calling convention.
 */
fpenable:
#ifdef CONFIG_PPC32
	stwu	r1,-64(r1)
#else
	stdu	r1,-64(r1)
#endif
	mfmsr	r10
	ori	r11,r10,MSR_FP
	mtmsr	r11
	isync
	stfd	fr0,24(r1)
	stfd	fr1,16(r1)
	stfd	fr31,8(r1)
	LDCONST(fr1, fpzero)
	mffs	fr31
	MTFSF_L(fr1)
	blr

fpdisable:
	mtlr	r12
	MTFSF_L(fr31)
	lfd	fr31,8(r1)
	lfd	fr1,16(r1)
	lfd	fr0,24(r1)
	mtmsr	r10
	isync
	addi	r1,r1,64
	blr
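
/*
 * fpenable/fpdisable bracket every routine below.  fpenable allocates
 * a 64-byte frame, enables MSR_FP (the old MSR stays in r10), saves
 * fr0, fr1 and fr31, preserves the caller's FPSCR in fr31 and clears
 * FPSCR.  fpdisable undoes all of that and returns through the link
 * register value the caller stashed in r12 before calling fpenable.
 * Frame slots 32..56 are left free for routines that need to save
 * additional FPRs (fr2..fr5).
 */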

/*
 * Vector add, floating point.
 */
_GLOBAL(vaddfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fadds	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
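
/*
 * The loop shape above is shared by all of the routines that follow:
 * CTR is loaded with 4 and an index register (r6 or r7) steps through
 * the four single-precision elements of each operand, 4 bytes at a
 * time.
 */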

/*
 * Vector subtract, floating point.
 */
_GLOBAL(vsubfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fsubs	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector multiply and add, floating point.
 */
_GLOBAL(vmaddfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fmadds	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector negative multiply and subtract, floating point.
 */
_GLOBAL(vnmsubfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fnmsubs	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector reciprocal estimate.  We just compute 1.0/x.
 * r3 -> destination, r4 -> source.
 */
_GLOBAL(vrefp)
	mflr	r12
	bl	fpenable
	li	r0,4
	LDCONST(fr1, fpone)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	fdivs	fr0,fr1,fr0
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
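
/*
 * Note that fdivs yields the correctly rounded single-precision
 * quotient, so this software fallback is at least as accurate as the
 * estimate the hardware vrefp instruction is permitted to return.
 */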

/*
 * Vector reciprocal square-root estimate, floating point.
 * We use the frsqrte instruction for the initial estimate followed
 * by 2 iterations of Newton-Raphson to get sufficient accuracy.
 * r3 -> destination, r4 -> source.
 */
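/*
 * The update used below follows from Newton's method applied to
 * f(r) = 1/r^2 - s, whose positive root is r = 1/sqrt(s):
 *
 *	r' = r - f(r)/f'(r)
 *	   = r + (1/r^2 - s) * r^3 / 2
 *	   = r + 0.5 * r * (1 - s * r * r)
 *
 * Each iteration roughly doubles the number of correct bits, so two
 * iterations on top of the frsqrte estimate suffice for single
 * precision.
 */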
_GLOBAL(vrsqrtefp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	stfd	fr3,40(r1)
	stfd	fr4,48(r1)
	stfd	fr5,56(r1)
	li	r0,4
	LDCONST(fr4, fpone)
	LDCONST(fr5, fphalf)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	frsqrte	fr1,fr0		/* r = frsqrte(s) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	stfsx	fr1,r3,r6
	addi	r6,r6,4
	bdnz	1b
	lfd	fr5,56(r1)
	lfd	fr4,48(r1)
	lfd	fr3,40(r1)
	lfd	fr2,32(r1)
	b	fpdisable