xref: /openbmc/linux/arch/mips/math-emu/dp_sqrt.c (revision 49548b09)
11da177e4SLinus Torvalds /* IEEE754 floating point arithmetic
21da177e4SLinus Torvalds  * double precision square root
31da177e4SLinus Torvalds  */
41da177e4SLinus Torvalds /*
51da177e4SLinus Torvalds  * MIPS floating point support
61da177e4SLinus Torvalds  * Copyright (C) 1994-2000 Algorithmics Ltd.
71da177e4SLinus Torvalds  *
81da177e4SLinus Torvalds  * ########################################################################
91da177e4SLinus Torvalds  *
101da177e4SLinus Torvalds  *  This program is free software; you can distribute it and/or modify it
111da177e4SLinus Torvalds  *  under the terms of the GNU General Public License (Version 2) as
121da177e4SLinus Torvalds  *  published by the Free Software Foundation.
131da177e4SLinus Torvalds  *
141da177e4SLinus Torvalds  *  This program is distributed in the hope it will be useful, but WITHOUT
151da177e4SLinus Torvalds  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
161da177e4SLinus Torvalds  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
171da177e4SLinus Torvalds  *  for more details.
181da177e4SLinus Torvalds  *
191da177e4SLinus Torvalds  *  You should have received a copy of the GNU General Public License along
201da177e4SLinus Torvalds  *  with this program; if not, write to the Free Software Foundation, Inc.,
211da177e4SLinus Torvalds  *  59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
221da177e4SLinus Torvalds  *
231da177e4SLinus Torvalds  * ########################################################################
241da177e4SLinus Torvalds  */
251da177e4SLinus Torvalds 
261da177e4SLinus Torvalds 
271da177e4SLinus Torvalds #include "ieee754dp.h"
281da177e4SLinus Torvalds 
291da177e4SLinus Torvalds static const unsigned table[] = {
301da177e4SLinus Torvalds 	0, 1204, 3062, 5746, 9193, 13348, 18162, 23592,
311da177e4SLinus Torvalds 	29598, 36145, 43202, 50740, 58733, 67158, 75992,
321da177e4SLinus Torvalds 	85215, 83599, 71378, 60428, 50647, 41945, 34246,
331da177e4SLinus Torvalds 	27478, 21581, 16499, 12183, 8588, 5674, 3403,
341da177e4SLinus Torvalds 	1742, 661, 130
351da177e4SLinus Torvalds };
361da177e4SLinus Torvalds 
372209bcb1SRalf Baechle union ieee754dp ieee754dp_sqrt(union ieee754dp x)
381da177e4SLinus Torvalds {
39cd21dfcfSRalf Baechle 	struct _ieee754_csr oldcsr;
402209bcb1SRalf Baechle 	union ieee754dp y, z, t;
411da177e4SLinus Torvalds 	unsigned scalx, yh;
421da177e4SLinus Torvalds 	COMPXDP;
431da177e4SLinus Torvalds 
441da177e4SLinus Torvalds 	EXPLODEXDP;
459e8bad1fSRalf Baechle 	ieee754_clearcx();
461da177e4SLinus Torvalds 	FLUSHXDP;
471da177e4SLinus Torvalds 
481da177e4SLinus Torvalds 	/* x == INF or NAN? */
491da177e4SLinus Torvalds 	switch (xc) {
501da177e4SLinus Torvalds 	case IEEE754_CLASS_QNAN:
511da177e4SLinus Torvalds 		/* sqrt(Nan) = Nan */
5290efba36SRalf Baechle 		return ieee754dp_nanxcpt(x);
531da177e4SLinus Torvalds 	case IEEE754_CLASS_SNAN:
549e8bad1fSRalf Baechle 		ieee754_setcx(IEEE754_INVALID_OPERATION);
5590efba36SRalf Baechle 		return ieee754dp_nanxcpt(ieee754dp_indef());
561da177e4SLinus Torvalds 	case IEEE754_CLASS_ZERO:
571da177e4SLinus Torvalds 		/* sqrt(0) = 0 */
581da177e4SLinus Torvalds 		return x;
591da177e4SLinus Torvalds 	case IEEE754_CLASS_INF:
601da177e4SLinus Torvalds 		if (xs) {
611da177e4SLinus Torvalds 			/* sqrt(-Inf) = Nan */
629e8bad1fSRalf Baechle 			ieee754_setcx(IEEE754_INVALID_OPERATION);
6390efba36SRalf Baechle 			return ieee754dp_nanxcpt(ieee754dp_indef());
641da177e4SLinus Torvalds 		}
651da177e4SLinus Torvalds 		/* sqrt(+Inf) = Inf */
661da177e4SLinus Torvalds 		return x;
671da177e4SLinus Torvalds 	case IEEE754_CLASS_DNORM:
681da177e4SLinus Torvalds 		DPDNORMX;
691da177e4SLinus Torvalds 		/* fall through */
701da177e4SLinus Torvalds 	case IEEE754_CLASS_NORM:
711da177e4SLinus Torvalds 		if (xs) {
721da177e4SLinus Torvalds 			/* sqrt(-x) = Nan */
739e8bad1fSRalf Baechle 			ieee754_setcx(IEEE754_INVALID_OPERATION);
7490efba36SRalf Baechle 			return ieee754dp_nanxcpt(ieee754dp_indef());
751da177e4SLinus Torvalds 		}
761da177e4SLinus Torvalds 		break;
771da177e4SLinus Torvalds 	}
781da177e4SLinus Torvalds 
791da177e4SLinus Torvalds 	/* save old csr; switch off INX enable & flag; set RN rounding */
801da177e4SLinus Torvalds 	oldcsr = ieee754_csr;
811da177e4SLinus Torvalds 	ieee754_csr.mx &= ~IEEE754_INEXACT;
821da177e4SLinus Torvalds 	ieee754_csr.sx &= ~IEEE754_INEXACT;
831da177e4SLinus Torvalds 	ieee754_csr.rm = IEEE754_RN;
841da177e4SLinus Torvalds 
851da177e4SLinus Torvalds 	/* adjust exponent to prevent overflow */
861da177e4SLinus Torvalds 	scalx = 0;
871da177e4SLinus Torvalds 	if (xe > 512) {		/* x > 2**-512? */
881da177e4SLinus Torvalds 		xe -= 512;	/* x = x / 2**512 */
891da177e4SLinus Torvalds 		scalx += 256;
901da177e4SLinus Torvalds 	} else if (xe < -512) { /* x < 2**-512? */
911da177e4SLinus Torvalds 		xe += 512;	/* x = x * 2**512 */
921da177e4SLinus Torvalds 		scalx -= 256;
931da177e4SLinus Torvalds 	}
941da177e4SLinus Torvalds 
951da177e4SLinus Torvalds 	y = x = builddp(0, xe + DP_EBIAS, xm & ~DP_HIDDEN_BIT);
961da177e4SLinus Torvalds 
971da177e4SLinus Torvalds 	/* magic initial approximation to almost 8 sig. bits */
981da177e4SLinus Torvalds 	yh = y.bits >> 32;
991da177e4SLinus Torvalds 	yh = (yh >> 1) + 0x1ff80000;
1001da177e4SLinus Torvalds 	yh = yh - table[(yh >> 15) & 31];
1011da177e4SLinus Torvalds 	y.bits = ((u64) yh << 32) | (y.bits & 0xffffffff);
1021da177e4SLinus Torvalds 
1031da177e4SLinus Torvalds 	/* Heron's rule once with correction to improve to ~18 sig. bits */
1041da177e4SLinus Torvalds 	/* t=x/y; y=y+t; py[n0]=py[n0]-0x00100006; py[n1]=0; */
1051da177e4SLinus Torvalds 	t = ieee754dp_div(x, y);
1061da177e4SLinus Torvalds 	y = ieee754dp_add(y, t);
1071da177e4SLinus Torvalds 	y.bits -= 0x0010000600000000LL;
1081da177e4SLinus Torvalds 	y.bits &= 0xffffffff00000000LL;
1091da177e4SLinus Torvalds 
1101da177e4SLinus Torvalds 	/* triple to almost 56 sig. bits: y ~= sqrt(x) to within 1 ulp */
1111da177e4SLinus Torvalds 	/* t=y*y; z=t;	pt[n0]+=0x00100000; t+=z; z=(x-z)*y; */
1121da177e4SLinus Torvalds 	z = t = ieee754dp_mul(y, y);
11349548b09SRalf Baechle 	t.bexp += 0x001;
1141da177e4SLinus Torvalds 	t = ieee754dp_add(t, z);
1151da177e4SLinus Torvalds 	z = ieee754dp_mul(ieee754dp_sub(x, z), y);
1161da177e4SLinus Torvalds 
1171da177e4SLinus Torvalds 	/* t=z/(t+x) ;	pt[n0]+=0x00100000; y+=t; */
1181da177e4SLinus Torvalds 	t = ieee754dp_div(z, ieee754dp_add(t, x));
11949548b09SRalf Baechle 	t.bexp += 0x001;
1201da177e4SLinus Torvalds 	y = ieee754dp_add(y, t);
1211da177e4SLinus Torvalds 
1221da177e4SLinus Torvalds 	/* twiddle last bit to force y correctly rounded */
1231da177e4SLinus Torvalds 
1241da177e4SLinus Torvalds 	/* set RZ, clear INEX flag */
1251da177e4SLinus Torvalds 	ieee754_csr.rm = IEEE754_RZ;
1261da177e4SLinus Torvalds 	ieee754_csr.sx &= ~IEEE754_INEXACT;
1271da177e4SLinus Torvalds 
1281da177e4SLinus Torvalds 	/* t=x/y; ...chopped quotient, possibly inexact */
1291da177e4SLinus Torvalds 	t = ieee754dp_div(x, y);
1301da177e4SLinus Torvalds 
1311da177e4SLinus Torvalds 	if (ieee754_csr.sx & IEEE754_INEXACT || t.bits != y.bits) {
1321da177e4SLinus Torvalds 
1331da177e4SLinus Torvalds 		if (!(ieee754_csr.sx & IEEE754_INEXACT))
1341da177e4SLinus Torvalds 			/* t = t-ulp */
1351da177e4SLinus Torvalds 			t.bits -= 1;
1361da177e4SLinus Torvalds 
1371da177e4SLinus Torvalds 		/* add inexact to result status */
1381da177e4SLinus Torvalds 		oldcsr.cx |= IEEE754_INEXACT;
1391da177e4SLinus Torvalds 		oldcsr.sx |= IEEE754_INEXACT;
1401da177e4SLinus Torvalds 
1411da177e4SLinus Torvalds 		switch (oldcsr.rm) {
1421da177e4SLinus Torvalds 		case IEEE754_RP:
1431da177e4SLinus Torvalds 			y.bits += 1;
1441da177e4SLinus Torvalds 			/* drop through */
1451da177e4SLinus Torvalds 		case IEEE754_RN:
1461da177e4SLinus Torvalds 			t.bits += 1;
1471da177e4SLinus Torvalds 			break;
1481da177e4SLinus Torvalds 		}
1491da177e4SLinus Torvalds 
1501da177e4SLinus Torvalds 		/* y=y+t; ...chopped sum */
1511da177e4SLinus Torvalds 		y = ieee754dp_add(y, t);
1521da177e4SLinus Torvalds 
1531da177e4SLinus Torvalds 		/* adjust scalx for correctly rounded sqrt(x) */
1541da177e4SLinus Torvalds 		scalx -= 1;
1551da177e4SLinus Torvalds 	}
1561da177e4SLinus Torvalds 
1571da177e4SLinus Torvalds 	/* py[n0]=py[n0]+scalx; ...scale back y */
15849548b09SRalf Baechle 	y.bexp += scalx;
1591da177e4SLinus Torvalds 
1601da177e4SLinus Torvalds 	/* restore rounding mode, possibly set inexact */
1611da177e4SLinus Torvalds 	ieee754_csr = oldcsr;
1621da177e4SLinus Torvalds 
1631da177e4SLinus Torvalds 	return y;
1641da177e4SLinus Torvalds }
165