xref: /openbmc/linux/arch/mips/math-emu/dp_sqrt.c (revision c9b02990)
19d5a6349SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
21da177e4SLinus Torvalds /* IEEE754 floating point arithmetic
31da177e4SLinus Torvalds  * double precision square root
41da177e4SLinus Torvalds  */
51da177e4SLinus Torvalds /*
61da177e4SLinus Torvalds  * MIPS floating point support
71da177e4SLinus Torvalds  * Copyright (C) 1994-2000 Algorithmics Ltd.
81da177e4SLinus Torvalds  */
91da177e4SLinus Torvalds 
101da177e4SLinus Torvalds #include "ieee754dp.h"
111da177e4SLinus Torvalds 
12a58f85b5SAleksandar Markovic static const unsigned int table[] = {
131da177e4SLinus Torvalds 	0, 1204, 3062, 5746, 9193, 13348, 18162, 23592,
141da177e4SLinus Torvalds 	29598, 36145, 43202, 50740, 58733, 67158, 75992,
151da177e4SLinus Torvalds 	85215, 83599, 71378, 60428, 50647, 41945, 34246,
161da177e4SLinus Torvalds 	27478, 21581, 16499, 12183, 8588, 5674, 3403,
171da177e4SLinus Torvalds 	1742, 661, 130
181da177e4SLinus Torvalds };
191da177e4SLinus Torvalds 
ieee754dp_sqrt(union ieee754dp x)202209bcb1SRalf Baechle union ieee754dp ieee754dp_sqrt(union ieee754dp x)
211da177e4SLinus Torvalds {
22cd21dfcfSRalf Baechle 	struct _ieee754_csr oldcsr;
232209bcb1SRalf Baechle 	union ieee754dp y, z, t;
24a58f85b5SAleksandar Markovic 	unsigned int scalx, yh;
251da177e4SLinus Torvalds 	COMPXDP;
261da177e4SLinus Torvalds 
271da177e4SLinus Torvalds 	EXPLODEXDP;
289e8bad1fSRalf Baechle 	ieee754_clearcx();
291da177e4SLinus Torvalds 	FLUSHXDP;
301da177e4SLinus Torvalds 
311da177e4SLinus Torvalds 	/* x == INF or NAN? */
321da177e4SLinus Torvalds 	switch (xc) {
33d5afa7e9SMaciej W. Rozycki 	case IEEE754_CLASS_SNAN:
34d5afa7e9SMaciej W. Rozycki 		return ieee754dp_nanxcpt(x);
35d5afa7e9SMaciej W. Rozycki 
361da177e4SLinus Torvalds 	case IEEE754_CLASS_QNAN:
371da177e4SLinus Torvalds 		/* sqrt(Nan) = Nan */
38539bfb57SMaciej W. Rozycki 		return x;
393f7cac41SRalf Baechle 
401da177e4SLinus Torvalds 	case IEEE754_CLASS_ZERO:
411da177e4SLinus Torvalds 		/* sqrt(0) = 0 */
421da177e4SLinus Torvalds 		return x;
433f7cac41SRalf Baechle 
441da177e4SLinus Torvalds 	case IEEE754_CLASS_INF:
451da177e4SLinus Torvalds 		if (xs) {
461da177e4SLinus Torvalds 			/* sqrt(-Inf) = Nan */
479e8bad1fSRalf Baechle 			ieee754_setcx(IEEE754_INVALID_OPERATION);
48539bfb57SMaciej W. Rozycki 			return ieee754dp_indef();
491da177e4SLinus Torvalds 		}
501da177e4SLinus Torvalds 		/* sqrt(+Inf) = Inf */
511da177e4SLinus Torvalds 		return x;
523f7cac41SRalf Baechle 
531da177e4SLinus Torvalds 	case IEEE754_CLASS_DNORM:
541da177e4SLinus Torvalds 		DPDNORMX;
55c9b02990SLiangliang Huang 		fallthrough;
561da177e4SLinus Torvalds 	case IEEE754_CLASS_NORM:
571da177e4SLinus Torvalds 		if (xs) {
581da177e4SLinus Torvalds 			/* sqrt(-x) = Nan */
599e8bad1fSRalf Baechle 			ieee754_setcx(IEEE754_INVALID_OPERATION);
60539bfb57SMaciej W. Rozycki 			return ieee754dp_indef();
611da177e4SLinus Torvalds 		}
621da177e4SLinus Torvalds 		break;
631da177e4SLinus Torvalds 	}
641da177e4SLinus Torvalds 
651da177e4SLinus Torvalds 	/* save old csr; switch off INX enable & flag; set RN rounding */
661da177e4SLinus Torvalds 	oldcsr = ieee754_csr;
671da177e4SLinus Torvalds 	ieee754_csr.mx &= ~IEEE754_INEXACT;
681da177e4SLinus Torvalds 	ieee754_csr.sx &= ~IEEE754_INEXACT;
6956a64733SRalf Baechle 	ieee754_csr.rm = FPU_CSR_RN;
701da177e4SLinus Torvalds 
711da177e4SLinus Torvalds 	/* adjust exponent to prevent overflow */
721da177e4SLinus Torvalds 	scalx = 0;
731da177e4SLinus Torvalds 	if (xe > 512) {		/* x > 2**-512? */
741da177e4SLinus Torvalds 		xe -= 512;	/* x = x / 2**512 */
751da177e4SLinus Torvalds 		scalx += 256;
761da177e4SLinus Torvalds 	} else if (xe < -512) { /* x < 2**-512? */
771da177e4SLinus Torvalds 		xe += 512;	/* x = x * 2**512 */
781da177e4SLinus Torvalds 		scalx -= 256;
791da177e4SLinus Torvalds 	}
801da177e4SLinus Torvalds 
8161100500SAleksandar Markovic 	x = builddp(0, xe + DP_EBIAS, xm & ~DP_HIDDEN_BIT);
8261100500SAleksandar Markovic 	y = x;
831da177e4SLinus Torvalds 
841da177e4SLinus Torvalds 	/* magic initial approximation to almost 8 sig. bits */
851da177e4SLinus Torvalds 	yh = y.bits >> 32;
861da177e4SLinus Torvalds 	yh = (yh >> 1) + 0x1ff80000;
871da177e4SLinus Torvalds 	yh = yh - table[(yh >> 15) & 31];
881da177e4SLinus Torvalds 	y.bits = ((u64) yh << 32) | (y.bits & 0xffffffff);
891da177e4SLinus Torvalds 
901da177e4SLinus Torvalds 	/* Heron's rule once with correction to improve to ~18 sig. bits */
911da177e4SLinus Torvalds 	/* t=x/y; y=y+t; py[n0]=py[n0]-0x00100006; py[n1]=0; */
921da177e4SLinus Torvalds 	t = ieee754dp_div(x, y);
931da177e4SLinus Torvalds 	y = ieee754dp_add(y, t);
941da177e4SLinus Torvalds 	y.bits -= 0x0010000600000000LL;
951da177e4SLinus Torvalds 	y.bits &= 0xffffffff00000000LL;
961da177e4SLinus Torvalds 
971da177e4SLinus Torvalds 	/* triple to almost 56 sig. bits: y ~= sqrt(x) to within 1 ulp */
981da177e4SLinus Torvalds 	/* t=y*y; z=t;	pt[n0]+=0x00100000; t+=z; z=(x-z)*y; */
9961100500SAleksandar Markovic 	t = ieee754dp_mul(y, y);
10061100500SAleksandar Markovic 	z = t;
10149548b09SRalf Baechle 	t.bexp += 0x001;
1021da177e4SLinus Torvalds 	t = ieee754dp_add(t, z);
1031da177e4SLinus Torvalds 	z = ieee754dp_mul(ieee754dp_sub(x, z), y);
1041da177e4SLinus Torvalds 
1051da177e4SLinus Torvalds 	/* t=z/(t+x) ;	pt[n0]+=0x00100000; y+=t; */
1061da177e4SLinus Torvalds 	t = ieee754dp_div(z, ieee754dp_add(t, x));
10749548b09SRalf Baechle 	t.bexp += 0x001;
1081da177e4SLinus Torvalds 	y = ieee754dp_add(y, t);
1091da177e4SLinus Torvalds 
1101da177e4SLinus Torvalds 	/* twiddle last bit to force y correctly rounded */
1111da177e4SLinus Torvalds 
1121da177e4SLinus Torvalds 	/* set RZ, clear INEX flag */
11356a64733SRalf Baechle 	ieee754_csr.rm = FPU_CSR_RZ;
1141da177e4SLinus Torvalds 	ieee754_csr.sx &= ~IEEE754_INEXACT;
1151da177e4SLinus Torvalds 
1161da177e4SLinus Torvalds 	/* t=x/y; ...chopped quotient, possibly inexact */
1171da177e4SLinus Torvalds 	t = ieee754dp_div(x, y);
1181da177e4SLinus Torvalds 
1191da177e4SLinus Torvalds 	if (ieee754_csr.sx & IEEE754_INEXACT || t.bits != y.bits) {
1201da177e4SLinus Torvalds 
1211da177e4SLinus Torvalds 		if (!(ieee754_csr.sx & IEEE754_INEXACT))
1221da177e4SLinus Torvalds 			/* t = t-ulp */
1231da177e4SLinus Torvalds 			t.bits -= 1;
1241da177e4SLinus Torvalds 
1251da177e4SLinus Torvalds 		/* add inexact to result status */
1261da177e4SLinus Torvalds 		oldcsr.cx |= IEEE754_INEXACT;
1271da177e4SLinus Torvalds 		oldcsr.sx |= IEEE754_INEXACT;
1281da177e4SLinus Torvalds 
1291da177e4SLinus Torvalds 		switch (oldcsr.rm) {
13056a64733SRalf Baechle 		case FPU_CSR_RU:
1311da177e4SLinus Torvalds 			y.bits += 1;
132c9b02990SLiangliang Huang 			fallthrough;
13356a64733SRalf Baechle 		case FPU_CSR_RN:
1341da177e4SLinus Torvalds 			t.bits += 1;
1351da177e4SLinus Torvalds 			break;
1361da177e4SLinus Torvalds 		}
1371da177e4SLinus Torvalds 
1381da177e4SLinus Torvalds 		/* y=y+t; ...chopped sum */
1391da177e4SLinus Torvalds 		y = ieee754dp_add(y, t);
1401da177e4SLinus Torvalds 
1411da177e4SLinus Torvalds 		/* adjust scalx for correctly rounded sqrt(x) */
1421da177e4SLinus Torvalds 		scalx -= 1;
1431da177e4SLinus Torvalds 	}
1441da177e4SLinus Torvalds 
1451da177e4SLinus Torvalds 	/* py[n0]=py[n0]+scalx; ...scale back y */
14649548b09SRalf Baechle 	y.bexp += scalx;
1471da177e4SLinus Torvalds 
1481da177e4SLinus Torvalds 	/* restore rounding mode, possibly set inexact */
1491da177e4SLinus Torvalds 	ieee754_csr = oldcsr;
1501da177e4SLinus Torvalds 
1511da177e4SLinus Torvalds 	return y;
1521da177e4SLinus Torvalds }
153