xref: /openbmc/linux/arch/xtensa/lib/umulsidi3.S (revision e65e175b07bef5974045cc42238de99057669ca7)
1/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */
2#include <linux/linkage.h>
3#include <asm/asmmacro.h>
4#include <asm/core.h>
5
/* XCHAL_NO_MUL is tested with plain #if/#elif later in this file, so it
   must have a value in every configuration: 1 when the core has none of
   MUL16, MUL32 or MAC16, and 0 otherwise.  Leaving it undefined on cores
   that do have multiply hardware makes those tests evaluate an undefined
   identifier, which -Wundef reports.  */
#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
#define XCHAL_NO_MUL 1
#else
#define XCHAL_NO_MUL 0
#endif
9
/* __umulsidi3: multiply the 32-bit values in a2 and a3 as unsigned
   integers and leave the 64-bit product in the a2/a3 pair: high word in
   wh, low word in wl (the __XTENSA_EB__ defines below pick which register
   is which).  The multiply strategy is chosen at build time: MUL32_HIGH,
   16x16 partial products using MUL16 or MUL32, MAC16, or the
   .Lmul_mulsi3 software helper when XCHAL_NO_MUL is set.  */
10ENTRY(__umulsidi3)
11
12#ifdef __XTENSA_CALL0_ABI__
	/* CALL0: keep the incoming a12-a15 at sp+16..sp+28; they are
	   reloaded just before the return.  The software-multiply do_mul
	   variant below uses a12-a14 for its arguments and result.  */
13	abi_entry(32)
14	s32i	a12, sp, 16
15	s32i	a13, sp, 20
16	s32i	a14, sp, 24
17	s32i	a15, sp, 28
18#elif XCHAL_NO_MUL
19	/* This is not really a leaf function; allocate enough stack space
20	   to allow CALL12s to a helper function.  */
21	abi_entry(32)
22#else
23	abi_entry_default
24#endif
25
/* wh/wl name the registers that receive the high and low result words.
   Both are also input registers, so they may only be written once the
   inputs they hold are no longer needed.  */
26#ifdef __XTENSA_EB__
27#define wh a2
28#define wl a3
29#else
30#define wh a3
31#define wl a2
32#endif /* __XTENSA_EB__ */
33
34	/* This code is taken from the mulsf3 routine in ieee754-sf.S.
35	   See more comments there.  */
36
37#if XCHAL_HAVE_MUL32_HIGH
	/* The low product goes to a6 first because wl is one of the
	   inputs; it is copied to wl only after muluh has read both.  */
38	mull	a6, a2, a3
39	muluh	wh, a2, a3
40	mov	wl, a6
41
42#else /* ! MUL32_HIGH */
43
	/* Without MUL32_HIGH the product is assembled from four 16x16
	   partial products computed by do_mul (l/h = low/high 16 bits):
	     pp0 = a2.l * a3.l    pp1 = a2.l * a3.h
	     pp2 = a2.h * a3.l    pp3 = a2.h * a3.h
	     a2 * a3 = (pp3 << 32) + ((pp1 + pp2) << 16) + pp0  */
44#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL
45	/* a0 and a8 will be clobbered by calling the multiply function
46	   but a8 is not used here and need not be saved.  */
47	s32i	a0, sp, 0
48#endif
49
50#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
51
52#define a2h a4
53#define a3h a5
54
55	/* Get the high halves of the inputs into registers.  */
56	srli	a2h, a2, 16
57	srli	a3h, a3, 16
58
/* The low halves stay in the input registers themselves.  */
59#define a2l a2
60#define a3l a3
61
62#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
63	/* Clear the high halves of the inputs.  This does not matter
64	   for MUL16 because the high bits are ignored.  */
65	extui	a2, a2, 0, 16
66	extui	a3, a3, 0, 16
67#endif
68#endif /* MUL16 || MUL32 */
69
70
/* do_mul(dst, xreg, xhalf, yreg, yhalf): dst = (half xhalf of xreg) *
   (half yhalf of yreg), with each half given as l or h.  In the MUL16
   and MUL32 variants the register and half tokens are pasted together
   to form the a2l/a2h/a3l/a3h aliases defined above.  */
71#if XCHAL_HAVE_MUL16
72
73#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
74	mul16u	dst, xreg ## xhalf, yreg ## yhalf
75
76#elif XCHAL_HAVE_MUL32
77
78#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
79	mull	dst, xreg ## xhalf, yreg ## yhalf
80
81#elif XCHAL_HAVE_MAC16
82
83/* The preprocessor insists on inserting a space when concatenating after
84   a period in the definition of do_mul below.  These macros are a workaround
85   using underscores instead of periods when doing the concatenation.  */
86#define umul_aa_ll umul.aa.ll
87#define umul_aa_lh umul.aa.lh
88#define umul_aa_hl umul.aa.hl
89#define umul_aa_hh umul.aa.hh
90
/* MAC16: the half selection is part of the opcode; the product is read
   back from ACCLO.  */
91#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
92	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
93	rsr	dst, ACCLO
94
95#else /* no multiply hardware */
96
/* set_arg_l/set_arg_h copy the low/high 16 bits of src into dst.  */
97#define set_arg_l(dst, src) \
98	extui	dst, src, 0, 16
99#define set_arg_h(dst, src) \
100	srli	dst, src, 16
101
/* Software multiply: extract the requested halves into the helper's
   argument registers and call .Lmul_mulsi3.  With CALL0 the arguments
   go in a13/a14 and the result comes back in a12; otherwise call12 is
   used with the arguments in a14/a15 and the result read from a14.  */
102#ifdef __XTENSA_CALL0_ABI__
103#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
104	set_arg_ ## xhalf (a13, xreg); \
105	set_arg_ ## yhalf (a14, yreg); \
106	call0	.Lmul_mulsi3; \
107	mov	dst, a12
108#else
109#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
110	set_arg_ ## xhalf (a14, xreg); \
111	set_arg_ ## yhalf (a15, yreg); \
112	call12	.Lmul_mulsi3; \
113	mov	dst, a14
114#endif /* __XTENSA_CALL0_ABI__ */
115
116#endif /* no multiply hardware */
117
118	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
119	do_mul(a6, a2, l, a3, h)	/* pp 1 */
120	do_mul(a11, a2, h, a3, l)	/* pp 2 */
121	movi	a9, 0
122	add	a6, a6, a11
	/* If the sum is below one of its addends the add wrapped, so
	   record the carry.  */
123	bgeu	a6, a11, 1f
124	addi	a9, a9, 1
1251:
126	/* Shift the high half of a9/a6 into position in a9.  Note that
127	   this value can be safely incremented without any carry-outs.  */
	/* With SAR = 16, src leaves (a9 << 16) | (a6 >> 16) in a9: the
	   part of (pp1 + pp2) << 16 that belongs to the high word.  */
128	ssai	16
129	src	a9, a9, a6
130
131	/* Compute the low word into a6.  */
132	do_mul(a11, a2, l, a3, l)	/* pp 0 */
	/* Relies on SAR still holding the 16 set by ssai above: sll then
	   shifts left by 32 - 16, moving the low half of pp1 + pp2 into
	   the upper half of a6.  */
133	sll	a6, a6
134	add	a6, a6, a11
	/* Carry out of the low word is folded into the high-word
	   contribution in a9.  */
135	bgeu	a6, a11, 1f
136	addi	a9, a9, 1
1371:
138	/* Compute the high word into wh.  */
	/* pp3 is the last partial product, so it may overwrite the input
	   register that wh names; wl is written only after that.  */
139	do_mul(wh, a2, h, a3, h)	/* pp 3 */
140	add	wh, wh, a9
141	mov	wl, a6
142
143#endif /* !MUL32_HIGH */
144
145#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL
146	/* Restore the original return address.  */
147	l32i	a0, sp, 0
148#endif
149#ifdef __XTENSA_CALL0_ABI__
	/* Reload the a12-a15 values stored in the prologue.  */
150	l32i	a12, sp, 16
151	l32i	a13, sp, 20
152	l32i	a14, sp, 24
153	l32i	a15, sp, 28
154	abi_ret(32)
155#else
156	abi_ret_default
157#endif
158
159#if XCHAL_NO_MUL
160
/* do_addx2 res, scaled, addend, scratch
   res = (scaled << 1) + addend.  When XCHAL_HAVE_ADDX is set this is a
   single addx2 and scratch is untouched; otherwise the shifted value is
   staged in scratch before the add.  */
	.macro	do_addx2 res, scaled, addend, scratch
#if !XCHAL_HAVE_ADDX
	slli	\scratch, \scaled, 1
	add	\res, \scratch, \addend
#else
	addx2	\res, \scaled, \addend
#endif
	.endm
169
/* do_addx4 res, scaled, addend, scratch
   res = (scaled << 2) + addend.  When XCHAL_HAVE_ADDX is set this is a
   single addx4 and scratch is untouched; otherwise the shifted value is
   staged in scratch before the add.  */
	.macro	do_addx4 res, scaled, addend, scratch
#if !XCHAL_HAVE_ADDX
	slli	\scratch, \scaled, 2
	add	\res, \scratch, \addend
#else
	addx4	\res, \scaled, \addend
#endif
	.endm
178
/* do_addx8 res, scaled, addend, scratch
   res = (scaled << 3) + addend.  When XCHAL_HAVE_ADDX is set this is a
   single addx8 and scratch is untouched; otherwise the shifted value is
   staged in scratch before the add.  */
	.macro	do_addx8 res, scaled, addend, scratch
#if !XCHAL_HAVE_ADDX
	slli	\scratch, \scaled, 3
	add	\res, \scratch, \addend
#else
	addx8	\res, \scaled, \addend
#endif
	.endm
187
/* .Lmul_mulsi3: software multiply for Xtensa cores with no multiply
   hardware, reached only through do_mul.  It is a simplified _mulsi3
   that builds the product by shift-and-add, four multiplier bits per
   loop pass, and stops once no multiplier bits remain.

   Register interface:
     CALL0 ABI  - private convention: operands in a13 and a14, product
                  returned in a12; a8 and a15 are clobbered, and a13
                  and a14 are shifted in place by the loop.
     otherwise  - operands in a2 and a3, product returned in a2; a3 is
                  shifted in place and a4, a5, a6 are used as scratch.  */

	/* mul_mulsi3_body acc, mplier, mcand, psum, flag
	   acc = mplier * mcand.  mplier and mcand are consumed; psum
	   holds each candidate sum and flag the multiplier bit that
	   decides whether movnez commits it to acc.  */
	.macro mul_mulsi3_body acc, mplier, mcand, psum, flag
	movi	\acc, 0
1:
	/* Multiplier bit 0 selects mcand * 1.  */
	add	\psum, \mcand, \acc
	extui	\flag, \mplier, 0, 1
	movnez	\acc, \psum, \flag

	/* Bit 1 selects mcand * 2.  */
	do_addx2 \psum, \mcand, \acc, \psum
	extui	\flag, \mplier, 1, 1
	movnez	\acc, \psum, \flag

	/* Bit 2 selects mcand * 4.  */
	do_addx4 \psum, \mcand, \acc, \psum
	extui	\flag, \mplier, 2, 1
	movnez	\acc, \psum, \flag

	/* Bit 3 selects mcand * 8.  */
	do_addx8 \psum, \mcand, \acc, \psum
	extui	\flag, \mplier, 3, 1
	movnez	\acc, \psum, \flag

	/* Advance to the next four multiplier bits, if any are left.  */
	srli	\mplier, \mplier, 4
	slli	\mcand, \mcand, 4
	bnez	\mplier, 1b
	.endm

	.align	4
.Lmul_mulsi3:
	abi_entry_default

#ifndef __XTENSA_CALL0_ABI__
	/* a2 doubles as the result register, so the first operand is
	   consumed from a copy in a4.  */
	mov	a4, a2
	mul_mulsi3_body a2, a4, a3, a5, a6
#else
	mul_mulsi3_body a12, a13, a14, a15, a8
#endif
	abi_ret_default
228#endif /* XCHAL_NO_MUL */
229
230ENDPROC(__umulsidi3)
231