/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */
#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
#define XCHAL_NO_MUL 1
#endif

ENTRY(__umulsidi3)

#ifdef __XTENSA_CALL0_ABI__
	abi_entry(32)
	s32i	a12, sp, 16
	s32i	a13, sp, 20
	s32i	a14, sp, 24
	s32i	a15, sp, 28
#elif XCHAL_NO_MUL
	/* This is not really a leaf function; allocate enough stack space
	   to allow CALL12s to a helper function.  */
	abi_entry(32)
#else
	abi_entry_default
#endif

#ifdef __XTENSA_EB__
#define wh a2
#define wl a3
#else
#define wh a3
#define wl a2
#endif /* __XTENSA_EB__ */

	/* This code is taken from the mulsf3 routine in ieee754-sf.S.
	   See more comments there.  */

#if XCHAL_HAVE_MUL32_HIGH
	mull	a6, a2, a3
	muluh	wh, a2, a3
	mov	wl, a6

#else /* ! MUL32_HIGH */

#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL
	/* a0 and a8 will be clobbered by calling the multiply function
	   but a8 is not used here and need not be saved.  */
	s32i	a0, sp, 0
#endif

#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32

#define a2h a4
#define a3h a5

	/* Get the high halves of the inputs into registers.  */
	srli	a2h, a2, 16
	srli	a3h, a3, 16

#define a2l a2
#define a3l a3

#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
	/* Clear the high halves of the inputs.  This does not matter
	   for MUL16 because the high bits are ignored.  */
	extui	a2, a2, 0, 16
	extui	a3, a3, 0, 16
#endif
#endif /* MUL16 || MUL32 */


#if XCHAL_HAVE_MUL16

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mul16u	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MUL32

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mull	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MAC16

/* The preprocessor insists on inserting a space when concatenating after
   a period in the definition of do_mul below.  These macros are a
   workaround using underscores instead of periods when doing the
   concatenation.  */
#define umul_aa_ll umul.aa.ll
#define umul_aa_lh umul.aa.lh
#define umul_aa_hl umul.aa.hl
#define umul_aa_hh umul.aa.hh

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
	rsr	dst, ACCLO

#else /* no multiply hardware */

#define set_arg_l(dst, src) \
	extui	dst, src, 0, 16
#define set_arg_h(dst, src) \
	srli	dst, src, 16

#ifdef __XTENSA_CALL0_ABI__
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a13, xreg); \
	set_arg_ ## yhalf (a14, yreg); \
	call0	.Lmul_mulsi3; \
	mov	dst, a12
#else
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a14, xreg); \
	set_arg_ ## yhalf (a15, yreg); \
	call12	.Lmul_mulsi3; \
	mov	dst, a14
#endif /* __XTENSA_CALL0_ABI__ */

#endif /* no multiply hardware */

	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
	do_mul(a6, a2, l, a3, h)	/* pp 1 */
	do_mul(a11, a2, h, a3, l)	/* pp 2 */
	movi	a9, 0
	add	a6, a6, a11
	bgeu	a6, a11, 1f
	addi	a9, a9, 1
1:
	/* Shift the high half of a9/a6 into position in a9.  Note that
	   this value can be safely incremented without any carry-outs.  */
	ssai	16
	src	a9, a9, a6

	/* Compute the low word into a6.  */
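	/* This routine uses the standard 16x16 partial-product
	   decomposition: with x = (xh << 16) + xl and y = (yh << 16) + yl,

		x * y = (pp3 << 32) + ((pp1 + pp2) << 16) + pp0

	   where pp0 = xl*yl, pp1 = xl*yh, pp2 = xh*yl and pp3 = xh*yh.
	   At this point a9 holds (pp1 + pp2) >> 16 (carry included) and
	   a6 holds the low 32 bits of pp1 + pp2.  SAR is still 16 from
	   the ssai above, so the sll below shifts a6 left by
	   32 - SAR = 16 bits; adding pp0 then yields the low word.  */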
	do_mul(a11, a2, l, a3, l)	/* pp 0 */
	sll	a6, a6
	add	a6, a6, a11
	bgeu	a6, a11, 1f
	addi	a9, a9, 1
1:
	/* Compute the high word into wh.  */
	do_mul(wh, a2, h, a3, h)	/* pp 3 */
	add	wh, wh, a9
	mov	wl, a6

#endif /* !MUL32_HIGH */

#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL
	/* Restore the original return address.  */
	l32i	a0, sp, 0
#endif
#ifdef __XTENSA_CALL0_ABI__
	l32i	a12, sp, 16
	l32i	a13, sp, 20
	l32i	a14, sp, 24
	l32i	a15, sp, 28
	abi_ret(32)
#else
	abi_ret_default
#endif

#if XCHAL_NO_MUL

	.macro	do_addx2 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx2	\dst, \as, \at
#else
	slli	\tmp, \as, 1
	add	\dst, \tmp, \at
#endif
	.endm

	.macro	do_addx4 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx4	\dst, \as, \at
#else
	slli	\tmp, \as, 2
	add	\dst, \tmp, \at
#endif
	.endm

	.macro	do_addx8 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx8	\dst, \as, \at
#else
	slli	\tmp, \as, 3
	add	\dst, \tmp, \at
#endif
	.endm

	/* For Xtensa processors with no multiply hardware, this simplified
	   version of _mulsi3 is used to multiply the 16-bit chunks of the
	   operands (in the original mulsf3 routine it multiplied chunks of
	   the floating-point mantissas).  When using CALL0, this function
	   uses a custom ABI: the inputs are passed in a13 and a14, the
	   result is returned in a12, and a8 and a15 are clobbered.  A C
	   model of the loop appears at the end of this file.  */
	.align	4
.Lmul_mulsi3:
	abi_entry_default

	.macro	mul_mulsi3_body dst, src1, src2, tmp1, tmp2
	movi	\dst, 0
1:	add	\tmp1, \src2, \dst
	extui	\tmp2, \src1, 0, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx2 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 1, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx4 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 2, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx8 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 3, 1
	movnez	\dst, \tmp1, \tmp2

	srli	\src1, \src1, 4
	slli	\src2, \src2, 4
	bnez	\src1, 1b
	.endm

#ifdef __XTENSA_CALL0_ABI__
	mul_mulsi3_body a12, a13, a14, a15, a8
#else
	/* The result will be written into a2, so save that argument in a4.  */
	mov	a4, a2
	mul_mulsi3_body a2, a4, a3, a5, a6
#endif
	abi_ret_default
#endif /* XCHAL_NO_MUL */

ENDPROC(__umulsidi3)
EXPORT_SYMBOL(__umulsidi3)
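
/* Illustrative C sketch (documentation only, not part of the build) of
   what the shift-and-add loop in mul_mulsi3_body computes; the function
   name mulsi3_model is hypothetical, and a 32-bit unsigned int is
   assumed.  Each pass conditionally accumulates y, 2*y, 4*y and 8*y
   according to the low four bits of x, then consumes four bits of x:

	unsigned int mulsi3_model(unsigned int x, unsigned int y)
	{
		unsigned int result = 0;

		do {
			if (x & 1)
				result += y;		// movnez on bit 0
			if (x & 2)
				result += 2 * y;	// do_addx2 path
			if (x & 4)
				result += 4 * y;	// do_addx4 path
			if (x & 8)
				result += 8 * y;	// do_addx8 path
			x >>= 4;			// srli \src1, \src1, 4
			y <<= 4;			// slli \src2, \src2, 4
		} while (x);				// bnez \src1, 1b
		return result;
	}
 */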