11da177e4SLinus Torvalds /* IEEE754 floating point arithmetic 21da177e4SLinus Torvalds * double precision square root 31da177e4SLinus Torvalds */ 41da177e4SLinus Torvalds /* 51da177e4SLinus Torvalds * MIPS floating point support 61da177e4SLinus Torvalds * Copyright (C) 1994-2000 Algorithmics Ltd. 71da177e4SLinus Torvalds * 81da177e4SLinus Torvalds * ######################################################################## 91da177e4SLinus Torvalds * 101da177e4SLinus Torvalds * This program is free software; you can distribute it and/or modify it 111da177e4SLinus Torvalds * under the terms of the GNU General Public License (Version 2) as 121da177e4SLinus Torvalds * published by the Free Software Foundation. 131da177e4SLinus Torvalds * 141da177e4SLinus Torvalds * This program is distributed in the hope it will be useful, but WITHOUT 151da177e4SLinus Torvalds * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 161da177e4SLinus Torvalds * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 171da177e4SLinus Torvalds * for more details. 181da177e4SLinus Torvalds * 191da177e4SLinus Torvalds * You should have received a copy of the GNU General Public License along 201da177e4SLinus Torvalds * with this program; if not, write to the Free Software Foundation, Inc., 211da177e4SLinus Torvalds * 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 221da177e4SLinus Torvalds * 231da177e4SLinus Torvalds * ######################################################################## 241da177e4SLinus Torvalds */ 251da177e4SLinus Torvalds 261da177e4SLinus Torvalds 271da177e4SLinus Torvalds #include "ieee754dp.h" 281da177e4SLinus Torvalds 291da177e4SLinus Torvalds static const unsigned table[] = { 301da177e4SLinus Torvalds 0, 1204, 3062, 5746, 9193, 13348, 18162, 23592, 311da177e4SLinus Torvalds 29598, 36145, 43202, 50740, 58733, 67158, 75992, 321da177e4SLinus Torvalds 85215, 83599, 71378, 60428, 50647, 41945, 34246, 331da177e4SLinus Torvalds 27478, 21581, 16499, 12183, 8588, 5674, 3403, 341da177e4SLinus Torvalds 1742, 661, 130 351da177e4SLinus Torvalds }; 361da177e4SLinus Torvalds 372209bcb1SRalf Baechle union ieee754dp ieee754dp_sqrt(union ieee754dp x) 381da177e4SLinus Torvalds { 39cd21dfcfSRalf Baechle struct _ieee754_csr oldcsr; 402209bcb1SRalf Baechle union ieee754dp y, z, t; 411da177e4SLinus Torvalds unsigned scalx, yh; 421da177e4SLinus Torvalds COMPXDP; 431da177e4SLinus Torvalds 441da177e4SLinus Torvalds EXPLODEXDP; 459e8bad1fSRalf Baechle ieee754_clearcx(); 461da177e4SLinus Torvalds FLUSHXDP; 471da177e4SLinus Torvalds 481da177e4SLinus Torvalds /* x == INF or NAN? */ 491da177e4SLinus Torvalds switch (xc) { 501da177e4SLinus Torvalds case IEEE754_CLASS_QNAN: 511da177e4SLinus Torvalds /* sqrt(Nan) = Nan */ 5290efba36SRalf Baechle return ieee754dp_nanxcpt(x); 531da177e4SLinus Torvalds case IEEE754_CLASS_SNAN: 549e8bad1fSRalf Baechle ieee754_setcx(IEEE754_INVALID_OPERATION); 5590efba36SRalf Baechle return ieee754dp_nanxcpt(ieee754dp_indef()); 561da177e4SLinus Torvalds case IEEE754_CLASS_ZERO: 571da177e4SLinus Torvalds /* sqrt(0) = 0 */ 581da177e4SLinus Torvalds return x; 591da177e4SLinus Torvalds case IEEE754_CLASS_INF: 601da177e4SLinus Torvalds if (xs) { 611da177e4SLinus Torvalds /* sqrt(-Inf) = Nan */ 629e8bad1fSRalf Baechle ieee754_setcx(IEEE754_INVALID_OPERATION); 6390efba36SRalf Baechle return ieee754dp_nanxcpt(ieee754dp_indef()); 641da177e4SLinus Torvalds } 651da177e4SLinus Torvalds /* sqrt(+Inf) = Inf */ 661da177e4SLinus Torvalds return x; 671da177e4SLinus Torvalds case IEEE754_CLASS_DNORM: 681da177e4SLinus Torvalds DPDNORMX; 691da177e4SLinus Torvalds /* fall through */ 701da177e4SLinus Torvalds case IEEE754_CLASS_NORM: 711da177e4SLinus Torvalds if (xs) { 721da177e4SLinus Torvalds /* sqrt(-x) = Nan */ 739e8bad1fSRalf Baechle ieee754_setcx(IEEE754_INVALID_OPERATION); 7490efba36SRalf Baechle return ieee754dp_nanxcpt(ieee754dp_indef()); 751da177e4SLinus Torvalds } 761da177e4SLinus Torvalds break; 771da177e4SLinus Torvalds } 781da177e4SLinus Torvalds 791da177e4SLinus Torvalds /* save old csr; switch off INX enable & flag; set RN rounding */ 801da177e4SLinus Torvalds oldcsr = ieee754_csr; 811da177e4SLinus Torvalds ieee754_csr.mx &= ~IEEE754_INEXACT; 821da177e4SLinus Torvalds ieee754_csr.sx &= ~IEEE754_INEXACT; 831da177e4SLinus Torvalds ieee754_csr.rm = IEEE754_RN; 841da177e4SLinus Torvalds 851da177e4SLinus Torvalds /* adjust exponent to prevent overflow */ 861da177e4SLinus Torvalds scalx = 0; 871da177e4SLinus Torvalds if (xe > 512) { /* x > 2**-512? */ 881da177e4SLinus Torvalds xe -= 512; /* x = x / 2**512 */ 891da177e4SLinus Torvalds scalx += 256; 901da177e4SLinus Torvalds } else if (xe < -512) { /* x < 2**-512? */ 911da177e4SLinus Torvalds xe += 512; /* x = x * 2**512 */ 921da177e4SLinus Torvalds scalx -= 256; 931da177e4SLinus Torvalds } 941da177e4SLinus Torvalds 951da177e4SLinus Torvalds y = x = builddp(0, xe + DP_EBIAS, xm & ~DP_HIDDEN_BIT); 961da177e4SLinus Torvalds 971da177e4SLinus Torvalds /* magic initial approximation to almost 8 sig. bits */ 981da177e4SLinus Torvalds yh = y.bits >> 32; 991da177e4SLinus Torvalds yh = (yh >> 1) + 0x1ff80000; 1001da177e4SLinus Torvalds yh = yh - table[(yh >> 15) & 31]; 1011da177e4SLinus Torvalds y.bits = ((u64) yh << 32) | (y.bits & 0xffffffff); 1021da177e4SLinus Torvalds 1031da177e4SLinus Torvalds /* Heron's rule once with correction to improve to ~18 sig. bits */ 1041da177e4SLinus Torvalds /* t=x/y; y=y+t; py[n0]=py[n0]-0x00100006; py[n1]=0; */ 1051da177e4SLinus Torvalds t = ieee754dp_div(x, y); 1061da177e4SLinus Torvalds y = ieee754dp_add(y, t); 1071da177e4SLinus Torvalds y.bits -= 0x0010000600000000LL; 1081da177e4SLinus Torvalds y.bits &= 0xffffffff00000000LL; 1091da177e4SLinus Torvalds 1101da177e4SLinus Torvalds /* triple to almost 56 sig. bits: y ~= sqrt(x) to within 1 ulp */ 1111da177e4SLinus Torvalds /* t=y*y; z=t; pt[n0]+=0x00100000; t+=z; z=(x-z)*y; */ 1121da177e4SLinus Torvalds z = t = ieee754dp_mul(y, y); 1131da177e4SLinus Torvalds t.parts.bexp += 0x001; 1141da177e4SLinus Torvalds t = ieee754dp_add(t, z); 1151da177e4SLinus Torvalds z = ieee754dp_mul(ieee754dp_sub(x, z), y); 1161da177e4SLinus Torvalds 1171da177e4SLinus Torvalds /* t=z/(t+x) ; pt[n0]+=0x00100000; y+=t; */ 1181da177e4SLinus Torvalds t = ieee754dp_div(z, ieee754dp_add(t, x)); 1191da177e4SLinus Torvalds t.parts.bexp += 0x001; 1201da177e4SLinus Torvalds y = ieee754dp_add(y, t); 1211da177e4SLinus Torvalds 1221da177e4SLinus Torvalds /* twiddle last bit to force y correctly rounded */ 1231da177e4SLinus Torvalds 1241da177e4SLinus Torvalds /* set RZ, clear INEX flag */ 1251da177e4SLinus Torvalds ieee754_csr.rm = IEEE754_RZ; 1261da177e4SLinus Torvalds ieee754_csr.sx &= ~IEEE754_INEXACT; 1271da177e4SLinus Torvalds 1281da177e4SLinus Torvalds /* t=x/y; ...chopped quotient, possibly inexact */ 1291da177e4SLinus Torvalds t = ieee754dp_div(x, y); 1301da177e4SLinus Torvalds 1311da177e4SLinus Torvalds if (ieee754_csr.sx & IEEE754_INEXACT || t.bits != y.bits) { 1321da177e4SLinus Torvalds 1331da177e4SLinus Torvalds if (!(ieee754_csr.sx & IEEE754_INEXACT)) 1341da177e4SLinus Torvalds /* t = t-ulp */ 1351da177e4SLinus Torvalds t.bits -= 1; 1361da177e4SLinus Torvalds 1371da177e4SLinus Torvalds /* add inexact to result status */ 1381da177e4SLinus Torvalds oldcsr.cx |= IEEE754_INEXACT; 1391da177e4SLinus Torvalds oldcsr.sx |= IEEE754_INEXACT; 1401da177e4SLinus Torvalds 1411da177e4SLinus Torvalds switch (oldcsr.rm) { 1421da177e4SLinus Torvalds case IEEE754_RP: 1431da177e4SLinus Torvalds y.bits += 1; 1441da177e4SLinus Torvalds /* drop through */ 1451da177e4SLinus Torvalds case IEEE754_RN: 1461da177e4SLinus Torvalds t.bits += 1; 1471da177e4SLinus Torvalds break; 1481da177e4SLinus Torvalds } 1491da177e4SLinus Torvalds 1501da177e4SLinus Torvalds /* y=y+t; ...chopped sum */ 1511da177e4SLinus Torvalds y = ieee754dp_add(y, t); 1521da177e4SLinus Torvalds 1531da177e4SLinus Torvalds /* adjust scalx for correctly rounded sqrt(x) */ 1541da177e4SLinus Torvalds scalx -= 1; 1551da177e4SLinus Torvalds } 1561da177e4SLinus Torvalds 1571da177e4SLinus Torvalds /* py[n0]=py[n0]+scalx; ...scale back y */ 1581da177e4SLinus Torvalds y.parts.bexp += scalx; 1591da177e4SLinus Torvalds 1601da177e4SLinus Torvalds /* restore rounding mode, possibly set inexact */ 1611da177e4SLinus Torvalds ieee754_csr = oldcsr; 1621da177e4SLinus Torvalds 1631da177e4SLinus Torvalds return y; 1641da177e4SLinus Torvalds } 165