/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */
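/*
 * __mulsi3: 32-bit (SImode) multiply with the libgcc calling convention,
 * returning a2 * a3 in a2.  The compiler emits calls to this helper when
 * it cannot inline the multiplication; the body below picks the best
 * strategy the configured core offers (MUL32, MUL16, MAC16, or a pure
 * shift-and-add fallback).
 */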
#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

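/*
 * do_addxN dst, as, at, tmp: dst = as * N + at.  Uses the single
 * ADDX2/ADDX4/ADDX8 instruction when the core has the ADDX option,
 * otherwise an explicit shift through a scratch register plus an add.
 */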
	.macro	do_addx2 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx2	\dst, \as, \at
#else
	slli	\tmp, \as, 1
	add	\dst, \tmp, \at
#endif
	.endm

	.macro	do_addx4 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx4	\dst, \as, \at
#else
	slli	\tmp, \as, 2
	add	\dst, \tmp, \at
#endif
	.endm

	.macro	do_addx8 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx8	\dst, \as, \at
#else
	slli	\tmp, \as, 3
	add	\dst, \tmp, \at
#endif
	.endm

ENTRY(__mulsi3)

	abi_entry_default

#if XCHAL_HAVE_MUL32
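	/* MUL32 provides MULL, the low 32 bits of the product, directly.  */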
	mull	a2, a2, a3

#elif XCHAL_HAVE_MUL16
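	/* Fast path: if neither operand has bits set above bit 15, a
	   single MUL16U already yields the full 32-bit product.  */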
	or	a4, a2, a3
	srai	a4, a4, 16
	bnez	a4, .LMUL16
	mul16u	a2, a2, a3
	abi_ret_default
.LMUL16:
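	/* Split each operand into 16-bit halves.  Modulo 2^32 the product
	   is ((hi(a2) * lo(a3) + hi(a3) * lo(a2)) << 16) + lo(a2) * lo(a3);
	   the hi * hi term only affects bits above 31 and is dropped.  */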
	srai	a4, a2, 16
	srai	a5, a3, 16
	mul16u	a7, a4, a3
	mul16u	a6, a5, a2
	mul16u	a4, a2, a3
	add	a7, a7, a6
	slli	a7, a7, 16
	add	a2, a7, a4

#elif XCHAL_HAVE_MAC16
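	/* Accumulate the two 16x16 cross products (hi*lo + lo*hi) in the
	   MAC16 accumulator, save them, then form the unsigned lo*lo
	   product and combine: result = (cross << 16) + lo*lo mod 2^32.  */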
	mul.aa.hl a2, a3
	mula.aa.lh a2, a3
	rsr	a5, ACCLO
	umul.aa.ll a2, a3
	rsr	a4, ACCLO
	slli	a5, a5, 16
	add	a2, a4, a5

#else /* !MUL32 && !MUL16 && !MAC16 */

	/* Multiply one bit at a time, but unroll the loop 4x to better
	   exploit the addx instructions and avoid overhead.
	   Peel the first iteration to save a cycle on init.  */

	/* Avoid negative numbers.  */
	xor	a5, a2, a3	/* Top bit is 1 if one input is negative.  */
	do_abs	a3, a3, a6
	do_abs	a2, a2, a6

	/* Swap so the second argument is smaller.  */
	sub	a7, a2, a3
	mov	a4, a3
	movgez	a4, a2, a7	/* a4 = max (a2, a3) */
	movltz	a3, a2, a7	/* a3 = min (a2, a3) */

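	/* Peeled first iteration: test multiplier bits 0..3, adding
	   a4 * 1, 2, 4 or 8 into the partial product for each set bit.  */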
	movi	a2, 0
	extui	a6, a3, 0, 1
	movnez	a2, a4, a6

	do_addx2 a7, a4, a2, a7
	extui	a6, a3, 1, 1
	movnez	a2, a7, a6

	do_addx4 a7, a4, a2, a7
	extui	a6, a3, 2, 1
	movnez	a2, a7, a6

	do_addx8 a7, a4, a2, a7
	extui	a6, a3, 3, 1
	movnez	a2, a7, a6

	bgeui	a3, 16, .Lmult_main_loop
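	/* Multiplier exhausted (no bits above bit 3): negate the result
	   if exactly one input was negative, then return.  */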
	neg	a3, a2
	movltz	a2, a3, a5
	abi_ret_default

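	/* Main loop: four multiplier bits per iteration.  Drop the bits
	   already consumed from a3, scale the multiplicand a4 by 16, and
	   conditionally accumulate a4 * 1, 2, 4 and 8 as above.  */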
	.align	4
.Lmult_main_loop:
	srli	a3, a3, 4
	slli	a4, a4, 4

	add	a7, a4, a2
	extui	a6, a3, 0, 1
	movnez	a2, a7, a6

	do_addx2 a7, a4, a2, a7
	extui	a6, a3, 1, 1
	movnez	a2, a7, a6

	do_addx4 a7, a4, a2, a7
	extui	a6, a3, 2, 1
	movnez	a2, a7, a6

	do_addx8 a7, a4, a2, a7
	extui	a6, a3, 3, 1
	movnez	a2, a7, a6

	bgeui	a3, 16, .Lmult_main_loop

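	/* Fix up the sign: negate if exactly one input was negative.  */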
	neg	a3, a2
	movltz	a2, a3, a5

#endif /* !MUL32 && !MUL16 && !MAC16 */

	abi_ret_default

ENDPROC(__mulsi3)