1 /* multi_arith.h: multi-precision integer arithmetic functions, needed 2 to do extended-precision floating point. 3 4 (c) 1998 David Huggins-Daines. 5 6 Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c) 7 David Mosberger-Tang. 8 9 You may copy, modify, and redistribute this file under the terms of 10 the GNU General Public License, version 2, or any later version, at 11 your convenience. */ 12 13 /* Note: 14 15 These are not general multi-precision math routines. Rather, they 16 implement the subset of integer arithmetic that we need in order to 17 multiply, divide, and normalize 128-bit unsigned mantissae. */ 18 19 #ifndef MULTI_ARITH_H 20 #define MULTI_ARITH_H 21 22 #if 0 /* old code... */ 23 24 /* Unsigned only, because we don't need signs to multiply and divide. */ 25 typedef unsigned int int128[4]; 26 27 /* Word order */ 28 enum { 29 MSW128, 30 NMSW128, 31 NLSW128, 32 LSW128 33 }; 34 35 /* big-endian */ 36 #define LO_WORD(ll) (((unsigned int *) &ll)[1]) 37 #define HI_WORD(ll) (((unsigned int *) &ll)[0]) 38 39 /* Convenience functions to stuff various integer values into int128s */ 40 41 static inline void zero128(int128 a) 42 { 43 a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0; 44 } 45 46 /* Human-readable word order in the arguments */ 47 static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1, 48 unsigned int i0, int128 a) 49 { 50 a[LSW128] = i0; 51 a[NLSW128] = i1; 52 a[NMSW128] = i2; 53 a[MSW128] = i3; 54 } 55 56 /* Convenience functions (for testing as well) */ 57 static inline void int64_to_128(unsigned long long src, int128 dest) 58 { 59 dest[LSW128] = (unsigned int) src; 60 dest[NLSW128] = src >> 32; 61 dest[NMSW128] = dest[MSW128] = 0; 62 } 63 64 static inline void int128_to_64(const int128 src, unsigned long long *dest) 65 { 66 *dest = src[LSW128] | (long long) src[NLSW128] << 32; 67 } 68 69 static inline void put_i128(const int128 a) 70 { 71 printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128], 72 a[NLSW128], a[LSW128]); 73 } 74 75 /* Internal shifters: 76 77 Note that these are only good for 0 < count < 32. 78 */ 79 80 static inline void _lsl128(unsigned int count, int128 a) 81 { 82 a[MSW128] = (a[MSW128] << count) | (a[NMSW128] >> (32 - count)); 83 a[NMSW128] = (a[NMSW128] << count) | (a[NLSW128] >> (32 - count)); 84 a[NLSW128] = (a[NLSW128] << count) | (a[LSW128] >> (32 - count)); 85 a[LSW128] <<= count; 86 } 87 88 static inline void _lsr128(unsigned int count, int128 a) 89 { 90 a[LSW128] = (a[LSW128] >> count) | (a[NLSW128] << (32 - count)); 91 a[NLSW128] = (a[NLSW128] >> count) | (a[NMSW128] << (32 - count)); 92 a[NMSW128] = (a[NMSW128] >> count) | (a[MSW128] << (32 - count)); 93 a[MSW128] >>= count; 94 } 95 96 /* Should be faster, one would hope */ 97 98 static inline void lslone128(int128 a) 99 { 100 asm volatile ("lsl.l #1,%0\n" 101 "roxl.l #1,%1\n" 102 "roxl.l #1,%2\n" 103 "roxl.l #1,%3\n" 104 : 105 "=d" (a[LSW128]), 106 "=d"(a[NLSW128]), 107 "=d"(a[NMSW128]), 108 "=d"(a[MSW128]) 109 : 110 "0"(a[LSW128]), 111 "1"(a[NLSW128]), 112 "2"(a[NMSW128]), 113 "3"(a[MSW128])); 114 } 115 116 static inline void lsrone128(int128 a) 117 { 118 asm volatile ("lsr.l #1,%0\n" 119 "roxr.l #1,%1\n" 120 "roxr.l #1,%2\n" 121 "roxr.l #1,%3\n" 122 : 123 "=d" (a[MSW128]), 124 "=d"(a[NMSW128]), 125 "=d"(a[NLSW128]), 126 "=d"(a[LSW128]) 127 : 128 "0"(a[MSW128]), 129 "1"(a[NMSW128]), 130 "2"(a[NLSW128]), 131 "3"(a[LSW128])); 132 } 133 134 /* Generalized 128-bit shifters: 135 136 These bit-shift to a multiple of 32, then move whole longwords. */ 137 138 static inline void lsl128(unsigned int count, int128 a) 139 { 140 int wordcount, i; 141 142 if (count % 32) 143 _lsl128(count % 32, a); 144 145 if (0 == (wordcount = count / 32)) 146 return; 147 148 /* argh, gak, endian-sensitive */ 149 for (i = 0; i < 4 - wordcount; i++) { 150 a[i] = a[i + wordcount]; 151 } 152 for (i = 3; i >= 4 - wordcount; --i) { 153 a[i] = 0; 154 } 155 } 156 157 static inline void lsr128(unsigned int count, int128 a) 158 { 159 int wordcount, i; 160 161 if (count % 32) 162 _lsr128(count % 32, a); 163 164 if (0 == (wordcount = count / 32)) 165 return; 166 167 for (i = 3; i >= wordcount; --i) { 168 a[i] = a[i - wordcount]; 169 } 170 for (i = 0; i < wordcount; i++) { 171 a[i] = 0; 172 } 173 } 174 175 static inline int orl128(int a, int128 b) 176 { 177 b[LSW128] |= a; 178 } 179 180 static inline int btsthi128(const int128 a) 181 { 182 return a[MSW128] & 0x80000000; 183 } 184 185 /* test bits (numbered from 0 = LSB) up to and including "top" */ 186 static inline int bftestlo128(int top, const int128 a) 187 { 188 int r = 0; 189 190 if (top > 31) 191 r |= a[LSW128]; 192 if (top > 63) 193 r |= a[NLSW128]; 194 if (top > 95) 195 r |= a[NMSW128]; 196 197 r |= a[3 - (top / 32)] & ((1 << (top % 32 + 1)) - 1); 198 199 return (r != 0); 200 } 201 202 /* Aargh. We need these because GCC is broken */ 203 /* FIXME: do them in assembly, for goodness' sake! */ 204 static inline void mask64(int pos, unsigned long long *mask) 205 { 206 *mask = 0; 207 208 if (pos < 32) { 209 LO_WORD(*mask) = (1 << pos) - 1; 210 return; 211 } 212 LO_WORD(*mask) = -1; 213 HI_WORD(*mask) = (1 << (pos - 32)) - 1; 214 } 215 216 static inline void bset64(int pos, unsigned long long *dest) 217 { 218 /* This conditional will be optimized away. Thanks, GCC! */ 219 if (pos < 32) 220 asm volatile ("bset %1,%0":"=m" 221 (LO_WORD(*dest)):"id"(pos)); 222 else 223 asm volatile ("bset %1,%0":"=m" 224 (HI_WORD(*dest)):"id"(pos - 32)); 225 } 226 227 static inline int btst64(int pos, unsigned long long dest) 228 { 229 if (pos < 32) 230 return (0 != (LO_WORD(dest) & (1 << pos))); 231 else 232 return (0 != (HI_WORD(dest) & (1 << (pos - 32)))); 233 } 234 235 static inline void lsl64(int count, unsigned long long *dest) 236 { 237 if (count < 32) { 238 HI_WORD(*dest) = (HI_WORD(*dest) << count) 239 | (LO_WORD(*dest) >> count); 240 LO_WORD(*dest) <<= count; 241 return; 242 } 243 count -= 32; 244 HI_WORD(*dest) = LO_WORD(*dest) << count; 245 LO_WORD(*dest) = 0; 246 } 247 248 static inline void lsr64(int count, unsigned long long *dest) 249 { 250 if (count < 32) { 251 LO_WORD(*dest) = (LO_WORD(*dest) >> count) 252 | (HI_WORD(*dest) << (32 - count)); 253 HI_WORD(*dest) >>= count; 254 return; 255 } 256 count -= 32; 257 LO_WORD(*dest) = HI_WORD(*dest) >> count; 258 HI_WORD(*dest) = 0; 259 } 260 #endif 261 262 static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt) 263 { 264 reg->exp += cnt; 265 266 switch (cnt) { 267 case 0 ... 8: 268 reg->lowmant = reg->mant.m32[1] << (8 - cnt); 269 reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) | 270 (reg->mant.m32[0] << (32 - cnt)); 271 reg->mant.m32[0] = reg->mant.m32[0] >> cnt; 272 break; 273 case 9 ... 32: 274 reg->lowmant = reg->mant.m32[1] >> (cnt - 8); 275 if (reg->mant.m32[1] << (40 - cnt)) 276 reg->lowmant |= 1; 277 reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) | 278 (reg->mant.m32[0] << (32 - cnt)); 279 reg->mant.m32[0] = reg->mant.m32[0] >> cnt; 280 break; 281 case 33 ... 39: 282 asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant) 283 : "m" (reg->mant.m32[0]), "d" (64 - cnt)); 284 if (reg->mant.m32[1] << (40 - cnt)) 285 reg->lowmant |= 1; 286 reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32); 287 reg->mant.m32[0] = 0; 288 break; 289 case 40 ... 71: 290 reg->lowmant = reg->mant.m32[0] >> (cnt - 40); 291 if ((reg->mant.m32[0] << (72 - cnt)) || reg->mant.m32[1]) 292 reg->lowmant |= 1; 293 reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32); 294 reg->mant.m32[0] = 0; 295 break; 296 default: 297 reg->lowmant = reg->mant.m32[0] || reg->mant.m32[1]; 298 reg->mant.m32[0] = 0; 299 reg->mant.m32[1] = 0; 300 break; 301 } 302 } 303 304 static inline int fp_overnormalize(struct fp_ext *reg) 305 { 306 int shift; 307 308 if (reg->mant.m32[0]) { 309 asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[0])); 310 reg->mant.m32[0] = (reg->mant.m32[0] << shift) | (reg->mant.m32[1] >> (32 - shift)); 311 reg->mant.m32[1] = (reg->mant.m32[1] << shift); 312 } else { 313 asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[1])); 314 reg->mant.m32[0] = (reg->mant.m32[1] << shift); 315 reg->mant.m32[1] = 0; 316 shift += 32; 317 } 318 319 return shift; 320 } 321 322 static inline int fp_addmant(struct fp_ext *dest, struct fp_ext *src) 323 { 324 int carry; 325 326 /* we assume here, gcc only insert move and a clr instr */ 327 asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant) 328 : "g,d" (src->lowmant), "0,0" (dest->lowmant)); 329 asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1]) 330 : "d" (src->mant.m32[1]), "0" (dest->mant.m32[1])); 331 asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0]) 332 : "d" (src->mant.m32[0]), "0" (dest->mant.m32[0])); 333 asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0)); 334 335 return carry; 336 } 337 338 static inline int fp_addcarry(struct fp_ext *reg) 339 { 340 if (++reg->exp == 0x7fff) { 341 if (reg->mant.m64) 342 fp_set_sr(FPSR_EXC_INEX2); 343 reg->mant.m64 = 0; 344 fp_set_sr(FPSR_EXC_OVFL); 345 return 0; 346 } 347 reg->lowmant = (reg->mant.m32[1] << 7) | (reg->lowmant ? 1 : 0); 348 reg->mant.m32[1] = (reg->mant.m32[1] >> 1) | 349 (reg->mant.m32[0] << 31); 350 reg->mant.m32[0] = (reg->mant.m32[0] >> 1) | 0x80000000; 351 352 return 1; 353 } 354 355 static inline void fp_submant(struct fp_ext *dest, struct fp_ext *src1, 356 struct fp_ext *src2) 357 { 358 /* we assume here, gcc only insert move and a clr instr */ 359 asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant) 360 : "g,d" (src2->lowmant), "0,0" (src1->lowmant)); 361 asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1]) 362 : "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1])); 363 asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0]) 364 : "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0])); 365 } 366 367 #define fp_mul64(desth, destl, src1, src2) ({ \ 368 asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth) \ 369 : "g" (src1), "0" (src2)); \ 370 }) 371 #define fp_div64(quot, rem, srch, srcl, div) \ 372 asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem) \ 373 : "dm" (div), "1" (srch), "0" (srcl)) 374 #define fp_add64(dest1, dest2, src1, src2) ({ \ 375 asm ("add.l %1,%0" : "=d,dm" (dest2) \ 376 : "dm,d" (src2), "0,0" (dest2)); \ 377 asm ("addx.l %1,%0" : "=d" (dest1) \ 378 : "d" (src1), "0" (dest1)); \ 379 }) 380 #define fp_addx96(dest, src) ({ \ 381 /* we assume here, gcc only insert move and a clr instr */ \ 382 asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2]) \ 383 : "g,d" (temp.m32[1]), "0,0" (dest->m32[2])); \ 384 asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1]) \ 385 : "d" (temp.m32[0]), "0" (dest->m32[1])); \ 386 asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0]) \ 387 : "d" (0), "0" (dest->m32[0])); \ 388 }) 389 #define fp_sub64(dest, src) ({ \ 390 asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1]) \ 391 : "dm,d" (src.m32[1]), "0,0" (dest.m32[1])); \ 392 asm ("subx.l %1,%0" : "=d" (dest.m32[0]) \ 393 : "d" (src.m32[0]), "0" (dest.m32[0])); \ 394 }) 395 #define fp_sub96c(dest, srch, srcm, srcl) ({ \ 396 char carry; \ 397 asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2]) \ 398 : "dm,d" (srcl), "0,0" (dest.m32[2])); \ 399 asm ("subx.l %1,%0" : "=d" (dest.m32[1]) \ 400 : "d" (srcm), "0" (dest.m32[1])); \ 401 asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0]) \ 402 : "d" (srch), "1" (dest.m32[0])); \ 403 carry; \ 404 }) 405 406 static inline void fp_multiplymant(union fp_mant128 *dest, struct fp_ext *src1, 407 struct fp_ext *src2) 408 { 409 union fp_mant64 temp; 410 411 fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]); 412 fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]); 413 414 fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]); 415 fp_addx96(dest, temp); 416 417 fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]); 418 fp_addx96(dest, temp); 419 } 420 421 static inline void fp_dividemant(union fp_mant128 *dest, struct fp_ext *src, 422 struct fp_ext *div) 423 { 424 union fp_mant128 tmp; 425 union fp_mant64 tmp64; 426 unsigned long *mantp = dest->m32; 427 unsigned long fix, rem, first, dummy; 428 int i; 429 430 /* the algorithm below requires dest to be smaller than div, 431 but both have the high bit set */ 432 if (src->mant.m64 >= div->mant.m64) { 433 fp_sub64(src->mant, div->mant); 434 *mantp = 1; 435 } else 436 *mantp = 0; 437 mantp++; 438 439 /* basic idea behind this algorithm: we can't divide two 64bit numbers 440 (AB/CD) directly, but we can calculate AB/C0, but this means this 441 quotient is off by C0/CD, so we have to multiply the first result 442 to fix the result, after that we have nearly the correct result 443 and only a few corrections are needed. */ 444 445 /* C0/CD can be precalculated, but it's an 64bit division again, but 446 we can make it a bit easier, by dividing first through C so we get 447 10/1D and now only a single shift and the value fits into 32bit. */ 448 fix = 0x80000000; 449 dummy = div->mant.m32[1] / div->mant.m32[0] + 1; 450 dummy = (dummy >> 1) | fix; 451 fp_div64(fix, dummy, fix, 0, dummy); 452 fix--; 453 454 for (i = 0; i < 3; i++, mantp++) { 455 if (src->mant.m32[0] == div->mant.m32[0]) { 456 fp_div64(first, rem, 0, src->mant.m32[1], div->mant.m32[0]); 457 458 fp_mul64(*mantp, dummy, first, fix); 459 *mantp += fix; 460 } else { 461 fp_div64(first, rem, src->mant.m32[0], src->mant.m32[1], div->mant.m32[0]); 462 463 fp_mul64(*mantp, dummy, first, fix); 464 } 465 466 fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0], first - *mantp); 467 fp_add64(tmp.m32[0], tmp.m32[1], 0, rem); 468 tmp.m32[2] = 0; 469 470 fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp, div->mant.m32[1]); 471 fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]); 472 473 src->mant.m32[0] = tmp.m32[1]; 474 src->mant.m32[1] = tmp.m32[2]; 475 476 while (!fp_sub96c(tmp, 0, div->mant.m32[0], div->mant.m32[1])) { 477 src->mant.m32[0] = tmp.m32[1]; 478 src->mant.m32[1] = tmp.m32[2]; 479 *mantp += 1; 480 } 481 } 482 } 483 484 #if 0 485 static inline unsigned int fp_fls128(union fp_mant128 *src) 486 { 487 unsigned long data; 488 unsigned int res, off; 489 490 if ((data = src->m32[0])) 491 off = 0; 492 else if ((data = src->m32[1])) 493 off = 32; 494 else if ((data = src->m32[2])) 495 off = 64; 496 else if ((data = src->m32[3])) 497 off = 96; 498 else 499 return 128; 500 501 asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data)); 502 return res + off; 503 } 504 505 static inline void fp_shiftmant128(union fp_mant128 *src, int shift) 506 { 507 unsigned long sticky; 508 509 switch (shift) { 510 case 0: 511 return; 512 case 1: 513 asm volatile ("lsl.l #1,%0" 514 : "=d" (src->m32[3]) : "0" (src->m32[3])); 515 asm volatile ("roxl.l #1,%0" 516 : "=d" (src->m32[2]) : "0" (src->m32[2])); 517 asm volatile ("roxl.l #1,%0" 518 : "=d" (src->m32[1]) : "0" (src->m32[1])); 519 asm volatile ("roxl.l #1,%0" 520 : "=d" (src->m32[0]) : "0" (src->m32[0])); 521 return; 522 case 2 ... 31: 523 src->m32[0] = (src->m32[0] << shift) | (src->m32[1] >> (32 - shift)); 524 src->m32[1] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift)); 525 src->m32[2] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift)); 526 src->m32[3] = (src->m32[3] << shift); 527 return; 528 case 32 ... 63: 529 shift -= 32; 530 src->m32[0] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift)); 531 src->m32[1] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift)); 532 src->m32[2] = (src->m32[3] << shift); 533 src->m32[3] = 0; 534 return; 535 case 64 ... 95: 536 shift -= 64; 537 src->m32[0] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift)); 538 src->m32[1] = (src->m32[3] << shift); 539 src->m32[2] = src->m32[3] = 0; 540 return; 541 case 96 ... 127: 542 shift -= 96; 543 src->m32[0] = (src->m32[3] << shift); 544 src->m32[1] = src->m32[2] = src->m32[3] = 0; 545 return; 546 case -31 ... -1: 547 shift = -shift; 548 sticky = 0; 549 if (src->m32[3] << (32 - shift)) 550 sticky = 1; 551 src->m32[3] = (src->m32[3] >> shift) | (src->m32[2] << (32 - shift)) | sticky; 552 src->m32[2] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)); 553 src->m32[1] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)); 554 src->m32[0] = (src->m32[0] >> shift); 555 return; 556 case -63 ... -32: 557 shift = -shift - 32; 558 sticky = 0; 559 if ((src->m32[2] << (32 - shift)) || src->m32[3]) 560 sticky = 1; 561 src->m32[3] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)) | sticky; 562 src->m32[2] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)); 563 src->m32[1] = (src->m32[0] >> shift); 564 src->m32[0] = 0; 565 return; 566 case -95 ... -64: 567 shift = -shift - 64; 568 sticky = 0; 569 if ((src->m32[1] << (32 - shift)) || src->m32[2] || src->m32[3]) 570 sticky = 1; 571 src->m32[3] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)) | sticky; 572 src->m32[2] = (src->m32[0] >> shift); 573 src->m32[1] = src->m32[0] = 0; 574 return; 575 case -127 ... -96: 576 shift = -shift - 96; 577 sticky = 0; 578 if ((src->m32[0] << (32 - shift)) || src->m32[1] || src->m32[2] || src->m32[3]) 579 sticky = 1; 580 src->m32[3] = (src->m32[0] >> shift) | sticky; 581 src->m32[2] = src->m32[1] = src->m32[0] = 0; 582 return; 583 } 584 585 if (shift < 0 && (src->m32[0] || src->m32[1] || src->m32[2] || src->m32[3])) 586 src->m32[3] = 1; 587 else 588 src->m32[3] = 0; 589 src->m32[2] = 0; 590 src->m32[1] = 0; 591 src->m32[0] = 0; 592 } 593 #endif 594 595 static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src, 596 int shift) 597 { 598 unsigned long tmp; 599 600 switch (shift) { 601 case 0: 602 dest->mant.m64 = src->m64[0]; 603 dest->lowmant = src->m32[2] >> 24; 604 if (src->m32[3] || (src->m32[2] << 8)) 605 dest->lowmant |= 1; 606 break; 607 case 1: 608 asm volatile ("lsl.l #1,%0" 609 : "=d" (tmp) : "0" (src->m32[2])); 610 asm volatile ("roxl.l #1,%0" 611 : "=d" (dest->mant.m32[1]) : "0" (src->m32[1])); 612 asm volatile ("roxl.l #1,%0" 613 : "=d" (dest->mant.m32[0]) : "0" (src->m32[0])); 614 dest->lowmant = tmp >> 24; 615 if (src->m32[3] || (tmp << 8)) 616 dest->lowmant |= 1; 617 break; 618 case 31: 619 asm volatile ("lsr.l #1,%1; roxr.l #1,%0" 620 : "=d" (dest->mant.m32[0]) 621 : "d" (src->m32[0]), "0" (src->m32[1])); 622 asm volatile ("roxr.l #1,%0" 623 : "=d" (dest->mant.m32[1]) : "0" (src->m32[2])); 624 asm volatile ("roxr.l #1,%0" 625 : "=d" (tmp) : "0" (src->m32[3])); 626 dest->lowmant = tmp >> 24; 627 if (src->m32[3] << 7) 628 dest->lowmant |= 1; 629 break; 630 case 32: 631 dest->mant.m32[0] = src->m32[1]; 632 dest->mant.m32[1] = src->m32[2]; 633 dest->lowmant = src->m32[3] >> 24; 634 if (src->m32[3] << 8) 635 dest->lowmant |= 1; 636 break; 637 } 638 } 639 640 #if 0 /* old code... */ 641 static inline int fls(unsigned int a) 642 { 643 int r; 644 645 asm volatile ("bfffo %1{#0,#32},%0" 646 : "=d" (r) : "md" (a)); 647 return r; 648 } 649 650 /* fls = "find last set" (cf. ffs(3)) */ 651 static inline int fls128(const int128 a) 652 { 653 if (a[MSW128]) 654 return fls(a[MSW128]); 655 if (a[NMSW128]) 656 return fls(a[NMSW128]) + 32; 657 /* XXX: it probably never gets beyond this point in actual 658 use, but that's indicative of a more general problem in the 659 algorithm (i.e. as per the actual 68881 implementation, we 660 really only need at most 67 bits of precision [plus 661 overflow]) so I'm not going to fix it. */ 662 if (a[NLSW128]) 663 return fls(a[NLSW128]) + 64; 664 if (a[LSW128]) 665 return fls(a[LSW128]) + 96; 666 else 667 return -1; 668 } 669 670 static inline int zerop128(const int128 a) 671 { 672 return !(a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]); 673 } 674 675 static inline int nonzerop128(const int128 a) 676 { 677 return (a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]); 678 } 679 680 /* Addition and subtraction */ 681 /* Do these in "pure" assembly, because "extended" asm is unmanageable 682 here */ 683 static inline void add128(const int128 a, int128 b) 684 { 685 /* rotating carry flags */ 686 unsigned int carry[2]; 687 688 carry[0] = a[LSW128] > (0xffffffff - b[LSW128]); 689 b[LSW128] += a[LSW128]; 690 691 carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]); 692 b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0]; 693 694 carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]); 695 b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1]; 696 697 b[MSW128] = a[MSW128] + b[MSW128] + carry[0]; 698 } 699 700 /* Note: assembler semantics: "b -= a" */ 701 static inline void sub128(const int128 a, int128 b) 702 { 703 /* rotating borrow flags */ 704 unsigned int borrow[2]; 705 706 borrow[0] = b[LSW128] < a[LSW128]; 707 b[LSW128] -= a[LSW128]; 708 709 borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0]; 710 b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0]; 711 712 borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1]; 713 b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1]; 714 715 b[MSW128] = b[MSW128] - a[MSW128] - borrow[0]; 716 } 717 718 /* Poor man's 64-bit expanding multiply */ 719 static inline void mul64(unsigned long long a, unsigned long long b, int128 c) 720 { 721 unsigned long long acc; 722 int128 acc128; 723 724 zero128(acc128); 725 zero128(c); 726 727 /* first the low words */ 728 if (LO_WORD(a) && LO_WORD(b)) { 729 acc = (long long) LO_WORD(a) * LO_WORD(b); 730 c[NLSW128] = HI_WORD(acc); 731 c[LSW128] = LO_WORD(acc); 732 } 733 /* Next the high words */ 734 if (HI_WORD(a) && HI_WORD(b)) { 735 acc = (long long) HI_WORD(a) * HI_WORD(b); 736 c[MSW128] = HI_WORD(acc); 737 c[NMSW128] = LO_WORD(acc); 738 } 739 /* The middle words */ 740 if (LO_WORD(a) && HI_WORD(b)) { 741 acc = (long long) LO_WORD(a) * HI_WORD(b); 742 acc128[NMSW128] = HI_WORD(acc); 743 acc128[NLSW128] = LO_WORD(acc); 744 add128(acc128, c); 745 } 746 /* The first and last words */ 747 if (HI_WORD(a) && LO_WORD(b)) { 748 acc = (long long) HI_WORD(a) * LO_WORD(b); 749 acc128[NMSW128] = HI_WORD(acc); 750 acc128[NLSW128] = LO_WORD(acc); 751 add128(acc128, c); 752 } 753 } 754 755 /* Note: unsigned */ 756 static inline int cmp128(int128 a, int128 b) 757 { 758 if (a[MSW128] < b[MSW128]) 759 return -1; 760 if (a[MSW128] > b[MSW128]) 761 return 1; 762 if (a[NMSW128] < b[NMSW128]) 763 return -1; 764 if (a[NMSW128] > b[NMSW128]) 765 return 1; 766 if (a[NLSW128] < b[NLSW128]) 767 return -1; 768 if (a[NLSW128] > b[NLSW128]) 769 return 1; 770 771 return (signed) a[LSW128] - b[LSW128]; 772 } 773 774 inline void div128(int128 a, int128 b, int128 c) 775 { 776 int128 mask; 777 778 /* Algorithm: 779 780 Shift the divisor until it's at least as big as the 781 dividend, keeping track of the position to which we've 782 shifted it, i.e. the power of 2 which we've multiplied it 783 by. 784 785 Then, for this power of 2 (the mask), and every one smaller 786 than it, subtract the mask from the dividend and add it to 787 the quotient until the dividend is smaller than the raised 788 divisor. At this point, divide the dividend and the mask 789 by 2 (i.e. shift one place to the right). Lather, rinse, 790 and repeat, until there are no more powers of 2 left. */ 791 792 /* FIXME: needless to say, there's room for improvement here too. */ 793 794 /* Shift up */ 795 /* XXX: since it just has to be "at least as big", we can 796 probably eliminate this horribly wasteful loop. I will 797 have to prove this first, though */ 798 set128(0, 0, 0, 1, mask); 799 while (cmp128(b, a) < 0 && !btsthi128(b)) { 800 lslone128(b); 801 lslone128(mask); 802 } 803 804 /* Shift down */ 805 zero128(c); 806 do { 807 if (cmp128(a, b) >= 0) { 808 sub128(b, a); 809 add128(mask, c); 810 } 811 lsrone128(mask); 812 lsrone128(b); 813 } while (nonzerop128(mask)); 814 815 /* The remainder is in a... */ 816 } 817 #endif 818 819 #endif /* MULTI_ARITH_H */ 820