1| 2| stan.sa 3.3 7/29/91 3| 4| The entry point stan computes the tangent of 5| an input argument; 6| stand does the same except for denormalized input. 7| 8| Input: Double-extended number X in location pointed to 9| by address register a0. 10| 11| Output: The value tan(X) returned in floating-point register Fp0. 12| 13| Accuracy and Monotonicity: The returned result is within 3 ulp in 14| 64 significant bit, i.e. within 0.5001 ulp to 53 bits if the 15| result is subsequently rounded to double precision. The 16| result is provably monotonic in double precision. 17| 18| Speed: The program sTAN takes approximately 170 cycles for 19| input argument X such that |X| < 15Pi, which is the usual 20| situation. 21| 22| Algorithm: 23| 24| 1. If |X| >= 15Pi or |X| < 2**(-40), go to 6. 25| 26| 2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let 27| k = N mod 2, so in particular, k = 0 or 1. 28| 29| 3. If k is odd, go to 5. 30| 31| 4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a 32| rational function U/V where 33| U = r + r*s*(P1 + s*(P2 + s*P3)), and 34| V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r. 35| Exit. 36| 37| 4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a 38| rational function U/V where 39| U = r + r*s*(P1 + s*(P2 + s*P3)), and 40| V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r, 41| -Cot(r) = -V/U. Exit. 42| 43| 6. If |X| > 1, go to 8. 44| 45| 7. (|X|<2**(-40)) Tan(X) = X. Exit. 46| 47| 8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2. 48| 49 50| Copyright (C) Motorola, Inc. 1990 51| All Rights Reserved 52| 53| For details on the license for this file, please see the 54| file, README, in this same directory. 55 56|STAN idnt 2,1 | Motorola 040 Floating Point Software Package 57 58 |section 8 59 60#include "fpsp.h" 61 62BOUNDS1: .long 0x3FD78000,0x4004BC7E 63TWOBYPI: .long 0x3FE45F30,0x6DC9C883 64 65TANQ4: .long 0x3EA0B759,0xF50F8688 66TANP3: .long 0xBEF2BAA5,0xA8924F04 67 68TANQ3: .long 0xBF346F59,0xB39BA65F,0x00000000,0x00000000 69 70TANP2: .long 0x3FF60000,0xE073D3FC,0x199C4A00,0x00000000 71 72TANQ2: .long 0x3FF90000,0xD23CD684,0x15D95FA1,0x00000000 73 74TANP1: .long 0xBFFC0000,0x8895A6C5,0xFB423BCA,0x00000000 75 76TANQ1: .long 0xBFFD0000,0xEEF57E0D,0xA84BC8CE,0x00000000 77 78INVTWOPI: .long 0x3FFC0000,0xA2F9836E,0x4E44152A,0x00000000 79 80TWOPI1: .long 0x40010000,0xC90FDAA2,0x00000000,0x00000000 81TWOPI2: .long 0x3FDF0000,0x85A308D4,0x00000000,0x00000000 82 83|--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING 84|--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT 85|--MOST 69 BITS LONG. 86 .global PITBL 87PITBL: 88 .long 0xC0040000,0xC90FDAA2,0x2168C235,0x21800000 89 .long 0xC0040000,0xC2C75BCD,0x105D7C23,0xA0D00000 90 .long 0xC0040000,0xBC7EDCF7,0xFF523611,0xA1E80000 91 .long 0xC0040000,0xB6365E22,0xEE46F000,0x21480000 92 .long 0xC0040000,0xAFEDDF4D,0xDD3BA9EE,0xA1200000 93 .long 0xC0040000,0xA9A56078,0xCC3063DD,0x21FC0000 94 .long 0xC0040000,0xA35CE1A3,0xBB251DCB,0x21100000 95 .long 0xC0040000,0x9D1462CE,0xAA19D7B9,0xA1580000 96 .long 0xC0040000,0x96CBE3F9,0x990E91A8,0x21E00000 97 .long 0xC0040000,0x90836524,0x88034B96,0x20B00000 98 .long 0xC0040000,0x8A3AE64F,0x76F80584,0xA1880000 99 .long 0xC0040000,0x83F2677A,0x65ECBF73,0x21C40000 100 .long 0xC0030000,0xFB53D14A,0xA9C2F2C2,0x20000000 101 .long 0xC0030000,0xEEC2D3A0,0x87AC669F,0x21380000 102 .long 0xC0030000,0xE231D5F6,0x6595DA7B,0xA1300000 103 .long 0xC0030000,0xD5A0D84C,0x437F4E58,0x9FC00000 104 .long 0xC0030000,0xC90FDAA2,0x2168C235,0x21000000 105 .long 0xC0030000,0xBC7EDCF7,0xFF523611,0xA1680000 106 .long 0xC0030000,0xAFEDDF4D,0xDD3BA9EE,0xA0A00000 107 .long 0xC0030000,0xA35CE1A3,0xBB251DCB,0x20900000 108 .long 0xC0030000,0x96CBE3F9,0x990E91A8,0x21600000 109 .long 0xC0030000,0x8A3AE64F,0x76F80584,0xA1080000 110 .long 0xC0020000,0xFB53D14A,0xA9C2F2C2,0x1F800000 111 .long 0xC0020000,0xE231D5F6,0x6595DA7B,0xA0B00000 112 .long 0xC0020000,0xC90FDAA2,0x2168C235,0x20800000 113 .long 0xC0020000,0xAFEDDF4D,0xDD3BA9EE,0xA0200000 114 .long 0xC0020000,0x96CBE3F9,0x990E91A8,0x20E00000 115 .long 0xC0010000,0xFB53D14A,0xA9C2F2C2,0x1F000000 116 .long 0xC0010000,0xC90FDAA2,0x2168C235,0x20000000 117 .long 0xC0010000,0x96CBE3F9,0x990E91A8,0x20600000 118 .long 0xC0000000,0xC90FDAA2,0x2168C235,0x1F800000 119 .long 0xBFFF0000,0xC90FDAA2,0x2168C235,0x1F000000 120 .long 0x00000000,0x00000000,0x00000000,0x00000000 121 .long 0x3FFF0000,0xC90FDAA2,0x2168C235,0x9F000000 122 .long 0x40000000,0xC90FDAA2,0x2168C235,0x9F800000 123 .long 0x40010000,0x96CBE3F9,0x990E91A8,0xA0600000 124 .long 0x40010000,0xC90FDAA2,0x2168C235,0xA0000000 125 .long 0x40010000,0xFB53D14A,0xA9C2F2C2,0x9F000000 126 .long 0x40020000,0x96CBE3F9,0x990E91A8,0xA0E00000 127 .long 0x40020000,0xAFEDDF4D,0xDD3BA9EE,0x20200000 128 .long 0x40020000,0xC90FDAA2,0x2168C235,0xA0800000 129 .long 0x40020000,0xE231D5F6,0x6595DA7B,0x20B00000 130 .long 0x40020000,0xFB53D14A,0xA9C2F2C2,0x9F800000 131 .long 0x40030000,0x8A3AE64F,0x76F80584,0x21080000 132 .long 0x40030000,0x96CBE3F9,0x990E91A8,0xA1600000 133 .long 0x40030000,0xA35CE1A3,0xBB251DCB,0xA0900000 134 .long 0x40030000,0xAFEDDF4D,0xDD3BA9EE,0x20A00000 135 .long 0x40030000,0xBC7EDCF7,0xFF523611,0x21680000 136 .long 0x40030000,0xC90FDAA2,0x2168C235,0xA1000000 137 .long 0x40030000,0xD5A0D84C,0x437F4E58,0x1FC00000 138 .long 0x40030000,0xE231D5F6,0x6595DA7B,0x21300000 139 .long 0x40030000,0xEEC2D3A0,0x87AC669F,0xA1380000 140 .long 0x40030000,0xFB53D14A,0xA9C2F2C2,0xA0000000 141 .long 0x40040000,0x83F2677A,0x65ECBF73,0xA1C40000 142 .long 0x40040000,0x8A3AE64F,0x76F80584,0x21880000 143 .long 0x40040000,0x90836524,0x88034B96,0xA0B00000 144 .long 0x40040000,0x96CBE3F9,0x990E91A8,0xA1E00000 145 .long 0x40040000,0x9D1462CE,0xAA19D7B9,0x21580000 146 .long 0x40040000,0xA35CE1A3,0xBB251DCB,0xA1100000 147 .long 0x40040000,0xA9A56078,0xCC3063DD,0xA1FC0000 148 .long 0x40040000,0xAFEDDF4D,0xDD3BA9EE,0x21200000 149 .long 0x40040000,0xB6365E22,0xEE46F000,0xA1480000 150 .long 0x40040000,0xBC7EDCF7,0xFF523611,0x21E80000 151 .long 0x40040000,0xC2C75BCD,0x105D7C23,0x20D00000 152 .long 0x40040000,0xC90FDAA2,0x2168C235,0xA1800000 153 154 .set INARG,FP_SCR4 155 156 .set TWOTO63,L_SCR1 157 .set ENDFLAG,L_SCR2 158 .set N,L_SCR3 159 160 | xref t_frcinx 161 |xref t_extdnrm 162 163 .global stand 164stand: 165|--TAN(X) = X FOR DENORMALIZED X 166 167 bra t_extdnrm 168 169 .global stan 170stan: 171 fmovex (%a0),%fp0 | ...LOAD INPUT 172 173 movel (%a0),%d0 174 movew 4(%a0),%d0 175 andil #0x7FFFFFFF,%d0 176 177 cmpil #0x3FD78000,%d0 | ...|X| >= 2**(-40)? 178 bges TANOK1 179 bra TANSM 180TANOK1: 181 cmpil #0x4004BC7E,%d0 | ...|X| < 15 PI? 182 blts TANMAIN 183 bra REDUCEX 184 185 186TANMAIN: 187|--THIS IS THE USUAL CASE, |X| <= 15 PI. 188|--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP. 189 fmovex %fp0,%fp1 190 fmuld TWOBYPI,%fp1 | ...X*2/PI 191 192|--HIDE THE NEXT TWO INSTRUCTIONS 193 leal PITBL+0x200,%a1 | ...TABLE OF N*PI/2, N = -32,...,32 194 195|--FP1 IS NOW READY 196 fmovel %fp1,%d0 | ...CONVERT TO INTEGER 197 198 asll #4,%d0 199 addal %d0,%a1 | ...ADDRESS N*PIBY2 IN Y1, Y2 200 201 fsubx (%a1)+,%fp0 | ...X-Y1 202|--HIDE THE NEXT ONE 203 204 fsubs (%a1),%fp0 | ...FP0 IS R = (X-Y1)-Y2 205 206 rorl #5,%d0 207 andil #0x80000000,%d0 | ...D0 WAS ODD IFF D0 < 0 208 209TANCONT: 210 211 cmpil #0,%d0 212 blt NODD 213 214 fmovex %fp0,%fp1 215 fmulx %fp1,%fp1 | ...S = R*R 216 217 fmoved TANQ4,%fp3 218 fmoved TANP3,%fp2 219 220 fmulx %fp1,%fp3 | ...SQ4 221 fmulx %fp1,%fp2 | ...SP3 222 223 faddd TANQ3,%fp3 | ...Q3+SQ4 224 faddx TANP2,%fp2 | ...P2+SP3 225 226 fmulx %fp1,%fp3 | ...S(Q3+SQ4) 227 fmulx %fp1,%fp2 | ...S(P2+SP3) 228 229 faddx TANQ2,%fp3 | ...Q2+S(Q3+SQ4) 230 faddx TANP1,%fp2 | ...P1+S(P2+SP3) 231 232 fmulx %fp1,%fp3 | ...S(Q2+S(Q3+SQ4)) 233 fmulx %fp1,%fp2 | ...S(P1+S(P2+SP3)) 234 235 faddx TANQ1,%fp3 | ...Q1+S(Q2+S(Q3+SQ4)) 236 fmulx %fp0,%fp2 | ...RS(P1+S(P2+SP3)) 237 238 fmulx %fp3,%fp1 | ...S(Q1+S(Q2+S(Q3+SQ4))) 239 240 241 faddx %fp2,%fp0 | ...R+RS(P1+S(P2+SP3)) 242 243 244 fadds #0x3F800000,%fp1 | ...1+S(Q1+...) 245 246 fmovel %d1,%fpcr |restore users exceptions 247 fdivx %fp1,%fp0 |last inst - possible exception set 248 249 bra t_frcinx 250 251NODD: 252 fmovex %fp0,%fp1 253 fmulx %fp0,%fp0 | ...S = R*R 254 255 fmoved TANQ4,%fp3 256 fmoved TANP3,%fp2 257 258 fmulx %fp0,%fp3 | ...SQ4 259 fmulx %fp0,%fp2 | ...SP3 260 261 faddd TANQ3,%fp3 | ...Q3+SQ4 262 faddx TANP2,%fp2 | ...P2+SP3 263 264 fmulx %fp0,%fp3 | ...S(Q3+SQ4) 265 fmulx %fp0,%fp2 | ...S(P2+SP3) 266 267 faddx TANQ2,%fp3 | ...Q2+S(Q3+SQ4) 268 faddx TANP1,%fp2 | ...P1+S(P2+SP3) 269 270 fmulx %fp0,%fp3 | ...S(Q2+S(Q3+SQ4)) 271 fmulx %fp0,%fp2 | ...S(P1+S(P2+SP3)) 272 273 faddx TANQ1,%fp3 | ...Q1+S(Q2+S(Q3+SQ4)) 274 fmulx %fp1,%fp2 | ...RS(P1+S(P2+SP3)) 275 276 fmulx %fp3,%fp0 | ...S(Q1+S(Q2+S(Q3+SQ4))) 277 278 279 faddx %fp2,%fp1 | ...R+RS(P1+S(P2+SP3)) 280 fadds #0x3F800000,%fp0 | ...1+S(Q1+...) 281 282 283 fmovex %fp1,-(%sp) 284 eoril #0x80000000,(%sp) 285 286 fmovel %d1,%fpcr |restore users exceptions 287 fdivx (%sp)+,%fp0 |last inst - possible exception set 288 289 bra t_frcinx 290 291TANBORS: 292|--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION. 293|--IF |X| < 2**(-40), RETURN X OR 1. 294 cmpil #0x3FFF8000,%d0 295 bgts REDUCEX 296 297TANSM: 298 299 fmovex %fp0,-(%sp) 300 fmovel %d1,%fpcr |restore users exceptions 301 fmovex (%sp)+,%fp0 |last inst - possible exception set 302 303 bra t_frcinx 304 305 306REDUCEX: 307|--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW. 308|--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING 309|--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE. 310 311 fmovemx %fp2-%fp5,-(%a7) | ...save FP2 through FP5 312 movel %d2,-(%a7) 313 fmoves #0x00000000,%fp1 314 315|--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that 316|--there is a danger of unwanted overflow in first LOOP iteration. In this 317|--case, reduce argument by one remainder step to make subsequent reduction 318|--safe. 319 cmpil #0x7ffeffff,%d0 |is argument dangerously large? 320 bnes LOOP 321 movel #0x7ffe0000,FP_SCR2(%a6) |yes 322| ;create 2**16383*PI/2 323 movel #0xc90fdaa2,FP_SCR2+4(%a6) 324 clrl FP_SCR2+8(%a6) 325 ftstx %fp0 |test sign of argument 326 movel #0x7fdc0000,FP_SCR3(%a6) |create low half of 2**16383* 327| ;PI/2 at FP_SCR3 328 movel #0x85a308d3,FP_SCR3+4(%a6) 329 clrl FP_SCR3+8(%a6) 330 fblt red_neg 331 orw #0x8000,FP_SCR2(%a6) |positive arg 332 orw #0x8000,FP_SCR3(%a6) 333red_neg: 334 faddx FP_SCR2(%a6),%fp0 |high part of reduction is exact 335 fmovex %fp0,%fp1 |save high result in fp1 336 faddx FP_SCR3(%a6),%fp0 |low part of reduction 337 fsubx %fp0,%fp1 |determine low component of result 338 faddx FP_SCR3(%a6),%fp1 |fp0/fp1 are reduced argument. 339 340|--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4. 341|--integer quotient will be stored in N 342|--Intermediate remainder is 66-bit long; (R,r) in (FP0,FP1) 343 344LOOP: 345 fmovex %fp0,INARG(%a6) | ...+-2**K * F, 1 <= F < 2 346 movew INARG(%a6),%d0 347 movel %d0,%a1 | ...save a copy of D0 348 andil #0x00007FFF,%d0 349 subil #0x00003FFF,%d0 | ...D0 IS K 350 cmpil #28,%d0 351 bles LASTLOOP 352CONTLOOP: 353 subil #27,%d0 | ...D0 IS L := K-27 354 movel #0,ENDFLAG(%a6) 355 bras WORK 356LASTLOOP: 357 clrl %d0 | ...D0 IS L := 0 358 movel #1,ENDFLAG(%a6) 359 360WORK: 361|--FIND THE REMAINDER OF (R,r) W.R.T. 2**L * (PI/2). L IS SO CHOSEN 362|--THAT INT( X * (2/PI) / 2**(L) ) < 2**29. 363 364|--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63), 365|--2**L * (PIby2_1), 2**L * (PIby2_2) 366 367 movel #0x00003FFE,%d2 | ...BIASED EXPO OF 2/PI 368 subl %d0,%d2 | ...BIASED EXPO OF 2**(-L)*(2/PI) 369 370 movel #0xA2F9836E,FP_SCR1+4(%a6) 371 movel #0x4E44152A,FP_SCR1+8(%a6) 372 movew %d2,FP_SCR1(%a6) | ...FP_SCR1 is 2**(-L)*(2/PI) 373 374 fmovex %fp0,%fp2 375 fmulx FP_SCR1(%a6),%fp2 376|--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN 377|--FLOATING POINT FORMAT, THE TWO FMOVE'S FMOVE.L FP <--> N 378|--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT 379|--(SIGN(INARG)*2**63 + FP2) - SIGN(INARG)*2**63 WILL GIVE 380|--US THE DESIRED VALUE IN FLOATING POINT. 381 382|--HIDE SIX CYCLES OF INSTRUCTION 383 movel %a1,%d2 384 swap %d2 385 andil #0x80000000,%d2 386 oril #0x5F000000,%d2 | ...D2 IS SIGN(INARG)*2**63 IN SGL 387 movel %d2,TWOTO63(%a6) 388 389 movel %d0,%d2 390 addil #0x00003FFF,%d2 | ...BIASED EXPO OF 2**L * (PI/2) 391 392|--FP2 IS READY 393 fadds TWOTO63(%a6),%fp2 | ...THE FRACTIONAL PART OF FP1 IS ROUNDED 394 395|--HIDE 4 CYCLES OF INSTRUCTION; creating 2**(L)*Piby2_1 and 2**(L)*Piby2_2 396 movew %d2,FP_SCR2(%a6) 397 clrw FP_SCR2+2(%a6) 398 movel #0xC90FDAA2,FP_SCR2+4(%a6) 399 clrl FP_SCR2+8(%a6) | ...FP_SCR2 is 2**(L) * Piby2_1 400 401|--FP2 IS READY 402 fsubs TWOTO63(%a6),%fp2 | ...FP2 is N 403 404 addil #0x00003FDD,%d0 405 movew %d0,FP_SCR3(%a6) 406 clrw FP_SCR3+2(%a6) 407 movel #0x85A308D3,FP_SCR3+4(%a6) 408 clrl FP_SCR3+8(%a6) | ...FP_SCR3 is 2**(L) * Piby2_2 409 410 movel ENDFLAG(%a6),%d0 411 412|--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and 413|--P2 = 2**(L) * Piby2_2 414 fmovex %fp2,%fp4 415 fmulx FP_SCR2(%a6),%fp4 | ...W = N*P1 416 fmovex %fp2,%fp5 417 fmulx FP_SCR3(%a6),%fp5 | ...w = N*P2 418 fmovex %fp4,%fp3 419|--we want P+p = W+w but |p| <= half ulp of P 420|--Then, we need to compute A := R-P and a := r-p 421 faddx %fp5,%fp3 | ...FP3 is P 422 fsubx %fp3,%fp4 | ...W-P 423 424 fsubx %fp3,%fp0 | ...FP0 is A := R - P 425 faddx %fp5,%fp4 | ...FP4 is p = (W-P)+w 426 427 fmovex %fp0,%fp3 | ...FP3 A 428 fsubx %fp4,%fp1 | ...FP1 is a := r - p 429 430|--Now we need to normalize (A,a) to "new (R,r)" where R+r = A+a but 431|--|r| <= half ulp of R. 432 faddx %fp1,%fp0 | ...FP0 is R := A+a 433|--No need to calculate r if this is the last loop 434 cmpil #0,%d0 435 bgt RESTORE 436 437|--Need to calculate r 438 fsubx %fp0,%fp3 | ...A-R 439 faddx %fp3,%fp1 | ...FP1 is r := (A-R)+a 440 bra LOOP 441 442RESTORE: 443 fmovel %fp2,N(%a6) 444 movel (%a7)+,%d2 445 fmovemx (%a7)+,%fp2-%fp5 446 447 448 movel N(%a6),%d0 449 rorl #1,%d0 450 451 452 bra TANCONT 453 454 |end 455