19d5a6349SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
21da177e4SLinus Torvalds /* IEEE754 floating point arithmetic
31da177e4SLinus Torvalds * double precision square root
41da177e4SLinus Torvalds */
51da177e4SLinus Torvalds /*
61da177e4SLinus Torvalds * MIPS floating point support
71da177e4SLinus Torvalds * Copyright (C) 1994-2000 Algorithmics Ltd.
81da177e4SLinus Torvalds */
91da177e4SLinus Torvalds
101da177e4SLinus Torvalds #include "ieee754dp.h"
111da177e4SLinus Torvalds
12a58f85b5SAleksandar Markovic static const unsigned int table[] = {
131da177e4SLinus Torvalds 0, 1204, 3062, 5746, 9193, 13348, 18162, 23592,
141da177e4SLinus Torvalds 29598, 36145, 43202, 50740, 58733, 67158, 75992,
151da177e4SLinus Torvalds 85215, 83599, 71378, 60428, 50647, 41945, 34246,
161da177e4SLinus Torvalds 27478, 21581, 16499, 12183, 8588, 5674, 3403,
171da177e4SLinus Torvalds 1742, 661, 130
181da177e4SLinus Torvalds };
191da177e4SLinus Torvalds
ieee754dp_sqrt(union ieee754dp x)202209bcb1SRalf Baechle union ieee754dp ieee754dp_sqrt(union ieee754dp x)
211da177e4SLinus Torvalds {
22cd21dfcfSRalf Baechle struct _ieee754_csr oldcsr;
232209bcb1SRalf Baechle union ieee754dp y, z, t;
24a58f85b5SAleksandar Markovic unsigned int scalx, yh;
251da177e4SLinus Torvalds COMPXDP;
261da177e4SLinus Torvalds
271da177e4SLinus Torvalds EXPLODEXDP;
289e8bad1fSRalf Baechle ieee754_clearcx();
291da177e4SLinus Torvalds FLUSHXDP;
301da177e4SLinus Torvalds
311da177e4SLinus Torvalds /* x == INF or NAN? */
321da177e4SLinus Torvalds switch (xc) {
33d5afa7e9SMaciej W. Rozycki case IEEE754_CLASS_SNAN:
34d5afa7e9SMaciej W. Rozycki return ieee754dp_nanxcpt(x);
35d5afa7e9SMaciej W. Rozycki
361da177e4SLinus Torvalds case IEEE754_CLASS_QNAN:
371da177e4SLinus Torvalds /* sqrt(Nan) = Nan */
38539bfb57SMaciej W. Rozycki return x;
393f7cac41SRalf Baechle
401da177e4SLinus Torvalds case IEEE754_CLASS_ZERO:
411da177e4SLinus Torvalds /* sqrt(0) = 0 */
421da177e4SLinus Torvalds return x;
433f7cac41SRalf Baechle
441da177e4SLinus Torvalds case IEEE754_CLASS_INF:
451da177e4SLinus Torvalds if (xs) {
461da177e4SLinus Torvalds /* sqrt(-Inf) = Nan */
479e8bad1fSRalf Baechle ieee754_setcx(IEEE754_INVALID_OPERATION);
48539bfb57SMaciej W. Rozycki return ieee754dp_indef();
491da177e4SLinus Torvalds }
501da177e4SLinus Torvalds /* sqrt(+Inf) = Inf */
511da177e4SLinus Torvalds return x;
523f7cac41SRalf Baechle
531da177e4SLinus Torvalds case IEEE754_CLASS_DNORM:
541da177e4SLinus Torvalds DPDNORMX;
55c9b02990SLiangliang Huang fallthrough;
561da177e4SLinus Torvalds case IEEE754_CLASS_NORM:
571da177e4SLinus Torvalds if (xs) {
581da177e4SLinus Torvalds /* sqrt(-x) = Nan */
599e8bad1fSRalf Baechle ieee754_setcx(IEEE754_INVALID_OPERATION);
60539bfb57SMaciej W. Rozycki return ieee754dp_indef();
611da177e4SLinus Torvalds }
621da177e4SLinus Torvalds break;
631da177e4SLinus Torvalds }
641da177e4SLinus Torvalds
651da177e4SLinus Torvalds /* save old csr; switch off INX enable & flag; set RN rounding */
661da177e4SLinus Torvalds oldcsr = ieee754_csr;
671da177e4SLinus Torvalds ieee754_csr.mx &= ~IEEE754_INEXACT;
681da177e4SLinus Torvalds ieee754_csr.sx &= ~IEEE754_INEXACT;
6956a64733SRalf Baechle ieee754_csr.rm = FPU_CSR_RN;
701da177e4SLinus Torvalds
711da177e4SLinus Torvalds /* adjust exponent to prevent overflow */
721da177e4SLinus Torvalds scalx = 0;
731da177e4SLinus Torvalds if (xe > 512) { /* x > 2**-512? */
741da177e4SLinus Torvalds xe -= 512; /* x = x / 2**512 */
751da177e4SLinus Torvalds scalx += 256;
761da177e4SLinus Torvalds } else if (xe < -512) { /* x < 2**-512? */
771da177e4SLinus Torvalds xe += 512; /* x = x * 2**512 */
781da177e4SLinus Torvalds scalx -= 256;
791da177e4SLinus Torvalds }
801da177e4SLinus Torvalds
8161100500SAleksandar Markovic x = builddp(0, xe + DP_EBIAS, xm & ~DP_HIDDEN_BIT);
8261100500SAleksandar Markovic y = x;
831da177e4SLinus Torvalds
841da177e4SLinus Torvalds /* magic initial approximation to almost 8 sig. bits */
851da177e4SLinus Torvalds yh = y.bits >> 32;
861da177e4SLinus Torvalds yh = (yh >> 1) + 0x1ff80000;
871da177e4SLinus Torvalds yh = yh - table[(yh >> 15) & 31];
881da177e4SLinus Torvalds y.bits = ((u64) yh << 32) | (y.bits & 0xffffffff);
891da177e4SLinus Torvalds
901da177e4SLinus Torvalds /* Heron's rule once with correction to improve to ~18 sig. bits */
911da177e4SLinus Torvalds /* t=x/y; y=y+t; py[n0]=py[n0]-0x00100006; py[n1]=0; */
921da177e4SLinus Torvalds t = ieee754dp_div(x, y);
931da177e4SLinus Torvalds y = ieee754dp_add(y, t);
941da177e4SLinus Torvalds y.bits -= 0x0010000600000000LL;
951da177e4SLinus Torvalds y.bits &= 0xffffffff00000000LL;
961da177e4SLinus Torvalds
971da177e4SLinus Torvalds /* triple to almost 56 sig. bits: y ~= sqrt(x) to within 1 ulp */
981da177e4SLinus Torvalds /* t=y*y; z=t; pt[n0]+=0x00100000; t+=z; z=(x-z)*y; */
9961100500SAleksandar Markovic t = ieee754dp_mul(y, y);
10061100500SAleksandar Markovic z = t;
10149548b09SRalf Baechle t.bexp += 0x001;
1021da177e4SLinus Torvalds t = ieee754dp_add(t, z);
1031da177e4SLinus Torvalds z = ieee754dp_mul(ieee754dp_sub(x, z), y);
1041da177e4SLinus Torvalds
1051da177e4SLinus Torvalds /* t=z/(t+x) ; pt[n0]+=0x00100000; y+=t; */
1061da177e4SLinus Torvalds t = ieee754dp_div(z, ieee754dp_add(t, x));
10749548b09SRalf Baechle t.bexp += 0x001;
1081da177e4SLinus Torvalds y = ieee754dp_add(y, t);
1091da177e4SLinus Torvalds
1101da177e4SLinus Torvalds /* twiddle last bit to force y correctly rounded */
1111da177e4SLinus Torvalds
1121da177e4SLinus Torvalds /* set RZ, clear INEX flag */
11356a64733SRalf Baechle ieee754_csr.rm = FPU_CSR_RZ;
1141da177e4SLinus Torvalds ieee754_csr.sx &= ~IEEE754_INEXACT;
1151da177e4SLinus Torvalds
1161da177e4SLinus Torvalds /* t=x/y; ...chopped quotient, possibly inexact */
1171da177e4SLinus Torvalds t = ieee754dp_div(x, y);
1181da177e4SLinus Torvalds
1191da177e4SLinus Torvalds if (ieee754_csr.sx & IEEE754_INEXACT || t.bits != y.bits) {
1201da177e4SLinus Torvalds
1211da177e4SLinus Torvalds if (!(ieee754_csr.sx & IEEE754_INEXACT))
1221da177e4SLinus Torvalds /* t = t-ulp */
1231da177e4SLinus Torvalds t.bits -= 1;
1241da177e4SLinus Torvalds
1251da177e4SLinus Torvalds /* add inexact to result status */
1261da177e4SLinus Torvalds oldcsr.cx |= IEEE754_INEXACT;
1271da177e4SLinus Torvalds oldcsr.sx |= IEEE754_INEXACT;
1281da177e4SLinus Torvalds
1291da177e4SLinus Torvalds switch (oldcsr.rm) {
13056a64733SRalf Baechle case FPU_CSR_RU:
1311da177e4SLinus Torvalds y.bits += 1;
132c9b02990SLiangliang Huang fallthrough;
13356a64733SRalf Baechle case FPU_CSR_RN:
1341da177e4SLinus Torvalds t.bits += 1;
1351da177e4SLinus Torvalds break;
1361da177e4SLinus Torvalds }
1371da177e4SLinus Torvalds
1381da177e4SLinus Torvalds /* y=y+t; ...chopped sum */
1391da177e4SLinus Torvalds y = ieee754dp_add(y, t);
1401da177e4SLinus Torvalds
1411da177e4SLinus Torvalds /* adjust scalx for correctly rounded sqrt(x) */
1421da177e4SLinus Torvalds scalx -= 1;
1431da177e4SLinus Torvalds }
1441da177e4SLinus Torvalds
1451da177e4SLinus Torvalds /* py[n0]=py[n0]+scalx; ...scale back y */
14649548b09SRalf Baechle y.bexp += scalx;
1471da177e4SLinus Torvalds
1481da177e4SLinus Torvalds /* restore rounding mode, possibly set inexact */
1491da177e4SLinus Torvalds ieee754_csr = oldcsr;
1501da177e4SLinus Torvalds
1511da177e4SLinus Torvalds return y;
1521da177e4SLinus Torvalds }
153