xref: /openbmc/linux/arch/mips/math-emu/dp_sqrt.c (revision 61100500)
11da177e4SLinus Torvalds /* IEEE754 floating point arithmetic
21da177e4SLinus Torvalds  * double precision square root
31da177e4SLinus Torvalds  */
41da177e4SLinus Torvalds /*
51da177e4SLinus Torvalds  * MIPS floating point support
61da177e4SLinus Torvalds  * Copyright (C) 1994-2000 Algorithmics Ltd.
71da177e4SLinus Torvalds  *
81da177e4SLinus Torvalds  *  This program is free software; you can distribute it and/or modify it
91da177e4SLinus Torvalds  *  under the terms of the GNU General Public License (Version 2) as
101da177e4SLinus Torvalds  *  published by the Free Software Foundation.
111da177e4SLinus Torvalds  *
121da177e4SLinus Torvalds  *  This program is distributed in the hope it will be useful, but WITHOUT
131da177e4SLinus Torvalds  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
141da177e4SLinus Torvalds  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
151da177e4SLinus Torvalds  *  for more details.
161da177e4SLinus Torvalds  *
171da177e4SLinus Torvalds  *  You should have received a copy of the GNU General Public License along
181da177e4SLinus Torvalds  *  with this program; if not, write to the Free Software Foundation, Inc.,
193f7cac41SRalf Baechle  *  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
201da177e4SLinus Torvalds  */
211da177e4SLinus Torvalds 
221da177e4SLinus Torvalds #include "ieee754dp.h"
231da177e4SLinus Torvalds 
24a58f85b5SAleksandar Markovic static const unsigned int table[] = {
251da177e4SLinus Torvalds 	0, 1204, 3062, 5746, 9193, 13348, 18162, 23592,
261da177e4SLinus Torvalds 	29598, 36145, 43202, 50740, 58733, 67158, 75992,
271da177e4SLinus Torvalds 	85215, 83599, 71378, 60428, 50647, 41945, 34246,
281da177e4SLinus Torvalds 	27478, 21581, 16499, 12183, 8588, 5674, 3403,
291da177e4SLinus Torvalds 	1742, 661, 130
301da177e4SLinus Torvalds };
311da177e4SLinus Torvalds 
322209bcb1SRalf Baechle union ieee754dp ieee754dp_sqrt(union ieee754dp x)
331da177e4SLinus Torvalds {
34cd21dfcfSRalf Baechle 	struct _ieee754_csr oldcsr;
352209bcb1SRalf Baechle 	union ieee754dp y, z, t;
36a58f85b5SAleksandar Markovic 	unsigned int scalx, yh;
371da177e4SLinus Torvalds 	COMPXDP;
381da177e4SLinus Torvalds 
391da177e4SLinus Torvalds 	EXPLODEXDP;
409e8bad1fSRalf Baechle 	ieee754_clearcx();
411da177e4SLinus Torvalds 	FLUSHXDP;
421da177e4SLinus Torvalds 
431da177e4SLinus Torvalds 	/* x == INF or NAN? */
441da177e4SLinus Torvalds 	switch (xc) {
45d5afa7e9SMaciej W. Rozycki 	case IEEE754_CLASS_SNAN:
46d5afa7e9SMaciej W. Rozycki 		return ieee754dp_nanxcpt(x);
47d5afa7e9SMaciej W. Rozycki 
481da177e4SLinus Torvalds 	case IEEE754_CLASS_QNAN:
491da177e4SLinus Torvalds 		/* sqrt(Nan) = Nan */
50539bfb57SMaciej W. Rozycki 		return x;
513f7cac41SRalf Baechle 
521da177e4SLinus Torvalds 	case IEEE754_CLASS_ZERO:
531da177e4SLinus Torvalds 		/* sqrt(0) = 0 */
541da177e4SLinus Torvalds 		return x;
553f7cac41SRalf Baechle 
561da177e4SLinus Torvalds 	case IEEE754_CLASS_INF:
571da177e4SLinus Torvalds 		if (xs) {
581da177e4SLinus Torvalds 			/* sqrt(-Inf) = Nan */
599e8bad1fSRalf Baechle 			ieee754_setcx(IEEE754_INVALID_OPERATION);
60539bfb57SMaciej W. Rozycki 			return ieee754dp_indef();
611da177e4SLinus Torvalds 		}
621da177e4SLinus Torvalds 		/* sqrt(+Inf) = Inf */
631da177e4SLinus Torvalds 		return x;
643f7cac41SRalf Baechle 
651da177e4SLinus Torvalds 	case IEEE754_CLASS_DNORM:
661da177e4SLinus Torvalds 		DPDNORMX;
671da177e4SLinus Torvalds 		/* fall through */
683f7cac41SRalf Baechle 
691da177e4SLinus Torvalds 	case IEEE754_CLASS_NORM:
701da177e4SLinus Torvalds 		if (xs) {
711da177e4SLinus Torvalds 			/* sqrt(-x) = Nan */
729e8bad1fSRalf Baechle 			ieee754_setcx(IEEE754_INVALID_OPERATION);
73539bfb57SMaciej W. Rozycki 			return ieee754dp_indef();
741da177e4SLinus Torvalds 		}
751da177e4SLinus Torvalds 		break;
761da177e4SLinus Torvalds 	}
771da177e4SLinus Torvalds 
781da177e4SLinus Torvalds 	/* save old csr; switch off INX enable & flag; set RN rounding */
791da177e4SLinus Torvalds 	oldcsr = ieee754_csr;
801da177e4SLinus Torvalds 	ieee754_csr.mx &= ~IEEE754_INEXACT;
811da177e4SLinus Torvalds 	ieee754_csr.sx &= ~IEEE754_INEXACT;
8256a64733SRalf Baechle 	ieee754_csr.rm = FPU_CSR_RN;
831da177e4SLinus Torvalds 
841da177e4SLinus Torvalds 	/* adjust exponent to prevent overflow */
851da177e4SLinus Torvalds 	scalx = 0;
861da177e4SLinus Torvalds 	if (xe > 512) {		/* x > 2**-512? */
871da177e4SLinus Torvalds 		xe -= 512;	/* x = x / 2**512 */
881da177e4SLinus Torvalds 		scalx += 256;
891da177e4SLinus Torvalds 	} else if (xe < -512) { /* x < 2**-512? */
901da177e4SLinus Torvalds 		xe += 512;	/* x = x * 2**512 */
911da177e4SLinus Torvalds 		scalx -= 256;
921da177e4SLinus Torvalds 	}
931da177e4SLinus Torvalds 
9461100500SAleksandar Markovic 	x = builddp(0, xe + DP_EBIAS, xm & ~DP_HIDDEN_BIT);
9561100500SAleksandar Markovic 	y = x;
961da177e4SLinus Torvalds 
971da177e4SLinus Torvalds 	/* magic initial approximation to almost 8 sig. bits */
981da177e4SLinus Torvalds 	yh = y.bits >> 32;
991da177e4SLinus Torvalds 	yh = (yh >> 1) + 0x1ff80000;
1001da177e4SLinus Torvalds 	yh = yh - table[(yh >> 15) & 31];
1011da177e4SLinus Torvalds 	y.bits = ((u64) yh << 32) | (y.bits & 0xffffffff);
1021da177e4SLinus Torvalds 
1031da177e4SLinus Torvalds 	/* Heron's rule once with correction to improve to ~18 sig. bits */
1041da177e4SLinus Torvalds 	/* t=x/y; y=y+t; py[n0]=py[n0]-0x00100006; py[n1]=0; */
1051da177e4SLinus Torvalds 	t = ieee754dp_div(x, y);
1061da177e4SLinus Torvalds 	y = ieee754dp_add(y, t);
1071da177e4SLinus Torvalds 	y.bits -= 0x0010000600000000LL;
1081da177e4SLinus Torvalds 	y.bits &= 0xffffffff00000000LL;
1091da177e4SLinus Torvalds 
1101da177e4SLinus Torvalds 	/* triple to almost 56 sig. bits: y ~= sqrt(x) to within 1 ulp */
1111da177e4SLinus Torvalds 	/* t=y*y; z=t;	pt[n0]+=0x00100000; t+=z; z=(x-z)*y; */
11261100500SAleksandar Markovic 	t = ieee754dp_mul(y, y);
11361100500SAleksandar Markovic 	z = t;
11449548b09SRalf Baechle 	t.bexp += 0x001;
1151da177e4SLinus Torvalds 	t = ieee754dp_add(t, z);
1161da177e4SLinus Torvalds 	z = ieee754dp_mul(ieee754dp_sub(x, z), y);
1171da177e4SLinus Torvalds 
1181da177e4SLinus Torvalds 	/* t=z/(t+x) ;	pt[n0]+=0x00100000; y+=t; */
1191da177e4SLinus Torvalds 	t = ieee754dp_div(z, ieee754dp_add(t, x));
12049548b09SRalf Baechle 	t.bexp += 0x001;
1211da177e4SLinus Torvalds 	y = ieee754dp_add(y, t);
1221da177e4SLinus Torvalds 
1231da177e4SLinus Torvalds 	/* twiddle last bit to force y correctly rounded */
1241da177e4SLinus Torvalds 
1251da177e4SLinus Torvalds 	/* set RZ, clear INEX flag */
12656a64733SRalf Baechle 	ieee754_csr.rm = FPU_CSR_RZ;
1271da177e4SLinus Torvalds 	ieee754_csr.sx &= ~IEEE754_INEXACT;
1281da177e4SLinus Torvalds 
1291da177e4SLinus Torvalds 	/* t=x/y; ...chopped quotient, possibly inexact */
1301da177e4SLinus Torvalds 	t = ieee754dp_div(x, y);
1311da177e4SLinus Torvalds 
1321da177e4SLinus Torvalds 	if (ieee754_csr.sx & IEEE754_INEXACT || t.bits != y.bits) {
1331da177e4SLinus Torvalds 
1341da177e4SLinus Torvalds 		if (!(ieee754_csr.sx & IEEE754_INEXACT))
1351da177e4SLinus Torvalds 			/* t = t-ulp */
1361da177e4SLinus Torvalds 			t.bits -= 1;
1371da177e4SLinus Torvalds 
1381da177e4SLinus Torvalds 		/* add inexact to result status */
1391da177e4SLinus Torvalds 		oldcsr.cx |= IEEE754_INEXACT;
1401da177e4SLinus Torvalds 		oldcsr.sx |= IEEE754_INEXACT;
1411da177e4SLinus Torvalds 
1421da177e4SLinus Torvalds 		switch (oldcsr.rm) {
14356a64733SRalf Baechle 		case FPU_CSR_RU:
1441da177e4SLinus Torvalds 			y.bits += 1;
1451da177e4SLinus Torvalds 			/* drop through */
14656a64733SRalf Baechle 		case FPU_CSR_RN:
1471da177e4SLinus Torvalds 			t.bits += 1;
1481da177e4SLinus Torvalds 			break;
1491da177e4SLinus Torvalds 		}
1501da177e4SLinus Torvalds 
1511da177e4SLinus Torvalds 		/* y=y+t; ...chopped sum */
1521da177e4SLinus Torvalds 		y = ieee754dp_add(y, t);
1531da177e4SLinus Torvalds 
1541da177e4SLinus Torvalds 		/* adjust scalx for correctly rounded sqrt(x) */
1551da177e4SLinus Torvalds 		scalx -= 1;
1561da177e4SLinus Torvalds 	}
1571da177e4SLinus Torvalds 
1581da177e4SLinus Torvalds 	/* py[n0]=py[n0]+scalx; ...scale back y */
15949548b09SRalf Baechle 	y.bexp += scalx;
1601da177e4SLinus Torvalds 
1611da177e4SLinus Torvalds 	/* restore rounding mode, possibly set inexact */
1621da177e4SLinus Torvalds 	ieee754_csr = oldcsr;
1631da177e4SLinus Torvalds 
1641da177e4SLinus Torvalds 	return y;
1651da177e4SLinus Torvalds }
166