1 /* 2 * QEMU float support macros 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 #ifndef FPU_SOFTFLOAT_MACROS_H 83 #define FPU_SOFTFLOAT_MACROS_H 84 85 #include "fpu/softfloat-types.h" 86 #include "qemu/host-utils.h" 87 88 /*---------------------------------------------------------------------------- 89 | Shifts `a' right by the number of bits given in `count'. If any nonzero 90 | bits are shifted off, they are ``jammed'' into the least significant bit of 91 | the result by setting the least significant bit to 1. The value of `count' 92 | can be arbitrarily large; in particular, if `count' is greater than 32, the 93 | result will be either 0 or 1, depending on whether `a' is zero or nonzero. 94 | The result is stored in the location pointed to by `zPtr'. 95 *----------------------------------------------------------------------------*/ 96 97 static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr) 98 { 99 uint32_t z; 100 101 if ( count == 0 ) { 102 z = a; 103 } 104 else if ( count < 32 ) { 105 z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 ); 106 } 107 else { 108 z = ( a != 0 ); 109 } 110 *zPtr = z; 111 112 } 113 114 /*---------------------------------------------------------------------------- 115 | Shifts `a' right by the number of bits given in `count'. If any nonzero 116 | bits are shifted off, they are ``jammed'' into the least significant bit of 117 | the result by setting the least significant bit to 1. The value of `count' 118 | can be arbitrarily large; in particular, if `count' is greater than 64, the 119 | result will be either 0 or 1, depending on whether `a' is zero or nonzero. 120 | The result is stored in the location pointed to by `zPtr'. 121 *----------------------------------------------------------------------------*/ 122 123 static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr) 124 { 125 uint64_t z; 126 127 if ( count == 0 ) { 128 z = a; 129 } 130 else if ( count < 64 ) { 131 z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 ); 132 } 133 else { 134 z = ( a != 0 ); 135 } 136 *zPtr = z; 137 138 } 139 140 /*---------------------------------------------------------------------------- 141 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64 142 | _plus_ the number of bits given in `count'. The shifted result is at most 143 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The 144 | bits shifted off form a second 64-bit result as follows: The _last_ bit 145 | shifted off is the most-significant bit of the extra result, and the other 146 | 63 bits of the extra result are all zero if and only if _all_but_the_last_ 147 | bits shifted off were all zero. This extra result is stored in the location 148 | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large. 149 | (This routine makes more sense if `a0' and `a1' are considered to form a 150 | fixed-point value with binary point between `a0' and `a1'. This fixed-point 151 | value is shifted right by the number of bits given in `count', and the 152 | integer part of the result is returned at the location pointed to by 153 | `z0Ptr'. The fractional part of the result may be slightly corrupted as 154 | described above, and is returned at the location pointed to by `z1Ptr'.) 155 *----------------------------------------------------------------------------*/ 156 157 static inline void 158 shift64ExtraRightJamming( 159 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) 160 { 161 uint64_t z0, z1; 162 int8_t negCount = ( - count ) & 63; 163 164 if ( count == 0 ) { 165 z1 = a1; 166 z0 = a0; 167 } 168 else if ( count < 64 ) { 169 z1 = ( a0<<negCount ) | ( a1 != 0 ); 170 z0 = a0>>count; 171 } 172 else { 173 if ( count == 64 ) { 174 z1 = a0 | ( a1 != 0 ); 175 } 176 else { 177 z1 = ( ( a0 | a1 ) != 0 ); 178 } 179 z0 = 0; 180 } 181 *z1Ptr = z1; 182 *z0Ptr = z0; 183 184 } 185 186 /*---------------------------------------------------------------------------- 187 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the 188 | number of bits given in `count'. Any bits shifted off are lost. The value 189 | of `count' can be arbitrarily large; in particular, if `count' is greater 190 | than 128, the result will be 0. The result is broken into two 64-bit pieces 191 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 192 *----------------------------------------------------------------------------*/ 193 194 static inline void 195 shift128Right( 196 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) 197 { 198 uint64_t z0, z1; 199 int8_t negCount = ( - count ) & 63; 200 201 if ( count == 0 ) { 202 z1 = a1; 203 z0 = a0; 204 } 205 else if ( count < 64 ) { 206 z1 = ( a0<<negCount ) | ( a1>>count ); 207 z0 = a0>>count; 208 } 209 else { 210 z1 = (count < 128) ? (a0 >> (count & 63)) : 0; 211 z0 = 0; 212 } 213 *z1Ptr = z1; 214 *z0Ptr = z0; 215 216 } 217 218 /*---------------------------------------------------------------------------- 219 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the 220 | number of bits given in `count'. If any nonzero bits are shifted off, they 221 | are ``jammed'' into the least significant bit of the result by setting the 222 | least significant bit to 1. The value of `count' can be arbitrarily large; 223 | in particular, if `count' is greater than 128, the result will be either 224 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or 225 | nonzero. The result is broken into two 64-bit pieces which are stored at 226 | the locations pointed to by `z0Ptr' and `z1Ptr'. 227 *----------------------------------------------------------------------------*/ 228 229 static inline void 230 shift128RightJamming( 231 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) 232 { 233 uint64_t z0, z1; 234 int8_t negCount = ( - count ) & 63; 235 236 if ( count == 0 ) { 237 z1 = a1; 238 z0 = a0; 239 } 240 else if ( count < 64 ) { 241 z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 ); 242 z0 = a0>>count; 243 } 244 else { 245 if ( count == 64 ) { 246 z1 = a0 | ( a1 != 0 ); 247 } 248 else if ( count < 128 ) { 249 z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 ); 250 } 251 else { 252 z1 = ( ( a0 | a1 ) != 0 ); 253 } 254 z0 = 0; 255 } 256 *z1Ptr = z1; 257 *z0Ptr = z0; 258 259 } 260 261 /*---------------------------------------------------------------------------- 262 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right 263 | by 64 _plus_ the number of bits given in `count'. The shifted result is 264 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are 265 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted 266 | off form a third 64-bit result as follows: The _last_ bit shifted off is 267 | the most-significant bit of the extra result, and the other 63 bits of the 268 | extra result are all zero if and only if _all_but_the_last_ bits shifted off 269 | were all zero. This extra result is stored in the location pointed to by 270 | `z2Ptr'. The value of `count' can be arbitrarily large. 271 | (This routine makes more sense if `a0', `a1', and `a2' are considered 272 | to form a fixed-point value with binary point between `a1' and `a2'. This 273 | fixed-point value is shifted right by the number of bits given in `count', 274 | and the integer part of the result is returned at the locations pointed to 275 | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly 276 | corrupted as described above, and is returned at the location pointed to by 277 | `z2Ptr'.) 278 *----------------------------------------------------------------------------*/ 279 280 static inline void 281 shift128ExtraRightJamming( 282 uint64_t a0, 283 uint64_t a1, 284 uint64_t a2, 285 int count, 286 uint64_t *z0Ptr, 287 uint64_t *z1Ptr, 288 uint64_t *z2Ptr 289 ) 290 { 291 uint64_t z0, z1, z2; 292 int8_t negCount = ( - count ) & 63; 293 294 if ( count == 0 ) { 295 z2 = a2; 296 z1 = a1; 297 z0 = a0; 298 } 299 else { 300 if ( count < 64 ) { 301 z2 = a1<<negCount; 302 z1 = ( a0<<negCount ) | ( a1>>count ); 303 z0 = a0>>count; 304 } 305 else { 306 if ( count == 64 ) { 307 z2 = a1; 308 z1 = a0; 309 } 310 else { 311 a2 |= a1; 312 if ( count < 128 ) { 313 z2 = a0<<negCount; 314 z1 = a0>>( count & 63 ); 315 } 316 else { 317 z2 = ( count == 128 ) ? a0 : ( a0 != 0 ); 318 z1 = 0; 319 } 320 } 321 z0 = 0; 322 } 323 z2 |= ( a2 != 0 ); 324 } 325 *z2Ptr = z2; 326 *z1Ptr = z1; 327 *z0Ptr = z0; 328 329 } 330 331 /*---------------------------------------------------------------------------- 332 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the 333 | number of bits given in `count'. Any bits shifted off are lost. The value 334 | of `count' must be less than 64. The result is broken into two 64-bit 335 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 336 *----------------------------------------------------------------------------*/ 337 338 static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count, 339 uint64_t *z0Ptr, uint64_t *z1Ptr) 340 { 341 *z1Ptr = a1 << count; 342 *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63)); 343 } 344 345 /*---------------------------------------------------------------------------- 346 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the 347 | number of bits given in `count'. Any bits shifted off are lost. The value 348 | of `count' may be greater than 64. The result is broken into two 64-bit 349 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 350 *----------------------------------------------------------------------------*/ 351 352 static inline void shift128Left(uint64_t a0, uint64_t a1, int count, 353 uint64_t *z0Ptr, uint64_t *z1Ptr) 354 { 355 if (count < 64) { 356 *z1Ptr = a1 << count; 357 *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63)); 358 } else { 359 *z1Ptr = 0; 360 *z0Ptr = a1 << (count - 64); 361 } 362 } 363 364 /*---------------------------------------------------------------------------- 365 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left 366 | by the number of bits given in `count'. Any bits shifted off are lost. 367 | The value of `count' must be less than 64. The result is broken into three 368 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr', 369 | `z1Ptr', and `z2Ptr'. 370 *----------------------------------------------------------------------------*/ 371 372 static inline void 373 shortShift192Left( 374 uint64_t a0, 375 uint64_t a1, 376 uint64_t a2, 377 int count, 378 uint64_t *z0Ptr, 379 uint64_t *z1Ptr, 380 uint64_t *z2Ptr 381 ) 382 { 383 uint64_t z0, z1, z2; 384 int8_t negCount; 385 386 z2 = a2<<count; 387 z1 = a1<<count; 388 z0 = a0<<count; 389 if ( 0 < count ) { 390 negCount = ( ( - count ) & 63 ); 391 z1 |= a2>>negCount; 392 z0 |= a1>>negCount; 393 } 394 *z2Ptr = z2; 395 *z1Ptr = z1; 396 *z0Ptr = z0; 397 398 } 399 400 /*---------------------------------------------------------------------------- 401 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit 402 | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so 403 | any carry out is lost. The result is broken into two 64-bit pieces which 404 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 405 *----------------------------------------------------------------------------*/ 406 407 static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, 408 uint64_t *z0Ptr, uint64_t *z1Ptr) 409 { 410 bool c = 0; 411 *z1Ptr = uadd64_carry(a1, b1, &c); 412 *z0Ptr = uadd64_carry(a0, b0, &c); 413 } 414 415 /*---------------------------------------------------------------------------- 416 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the 417 | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is 418 | modulo 2^192, so any carry out is lost. The result is broken into three 419 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr', 420 | `z1Ptr', and `z2Ptr'. 421 *----------------------------------------------------------------------------*/ 422 423 static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2, 424 uint64_t b0, uint64_t b1, uint64_t b2, 425 uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr) 426 { 427 bool c = 0; 428 *z2Ptr = uadd64_carry(a2, b2, &c); 429 *z1Ptr = uadd64_carry(a1, b1, &c); 430 *z0Ptr = uadd64_carry(a0, b0, &c); 431 } 432 433 /*---------------------------------------------------------------------------- 434 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the 435 | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo 436 | 2^128, so any borrow out (carry out) is lost. The result is broken into two 437 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and 438 | `z1Ptr'. 439 *----------------------------------------------------------------------------*/ 440 441 static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, 442 uint64_t *z0Ptr, uint64_t *z1Ptr) 443 { 444 bool c = 0; 445 *z1Ptr = usub64_borrow(a1, b1, &c); 446 *z0Ptr = usub64_borrow(a0, b0, &c); 447 } 448 449 /*---------------------------------------------------------------------------- 450 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2' 451 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'. 452 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The 453 | result is broken into three 64-bit pieces which are stored at the locations 454 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2, 458 uint64_t b0, uint64_t b1, uint64_t b2, 459 uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr) 460 { 461 bool c = 0; 462 *z2Ptr = usub64_borrow(a2, b2, &c); 463 *z1Ptr = usub64_borrow(a1, b1, &c); 464 *z0Ptr = usub64_borrow(a0, b0, &c); 465 } 466 467 /*---------------------------------------------------------------------------- 468 | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken 469 | into two 64-bit pieces which are stored at the locations pointed to by 470 | `z0Ptr' and `z1Ptr'. 471 *----------------------------------------------------------------------------*/ 472 473 static inline void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr ) 474 { 475 uint32_t aHigh, aLow, bHigh, bLow; 476 uint64_t z0, zMiddleA, zMiddleB, z1; 477 478 aLow = a; 479 aHigh = a>>32; 480 bLow = b; 481 bHigh = b>>32; 482 z1 = ( (uint64_t) aLow ) * bLow; 483 zMiddleA = ( (uint64_t) aLow ) * bHigh; 484 zMiddleB = ( (uint64_t) aHigh ) * bLow; 485 z0 = ( (uint64_t) aHigh ) * bHigh; 486 zMiddleA += zMiddleB; 487 z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 ); 488 zMiddleA <<= 32; 489 z1 += zMiddleA; 490 z0 += ( z1 < zMiddleA ); 491 *z1Ptr = z1; 492 *z0Ptr = z0; 493 494 } 495 496 /*---------------------------------------------------------------------------- 497 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by 498 | `b' to obtain a 192-bit product. The product is broken into three 64-bit 499 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and 500 | `z2Ptr'. 501 *----------------------------------------------------------------------------*/ 502 503 static inline void 504 mul128By64To192( 505 uint64_t a0, 506 uint64_t a1, 507 uint64_t b, 508 uint64_t *z0Ptr, 509 uint64_t *z1Ptr, 510 uint64_t *z2Ptr 511 ) 512 { 513 uint64_t z0, z1, z2, more1; 514 515 mul64To128( a1, b, &z1, &z2 ); 516 mul64To128( a0, b, &z0, &more1 ); 517 add128( z0, more1, 0, z1, &z0, &z1 ); 518 *z2Ptr = z2; 519 *z1Ptr = z1; 520 *z0Ptr = z0; 521 522 } 523 524 /*---------------------------------------------------------------------------- 525 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the 526 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit 527 | product. The product is broken into four 64-bit pieces which are stored at 528 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'. 529 *----------------------------------------------------------------------------*/ 530 531 static inline void 532 mul128To256( 533 uint64_t a0, 534 uint64_t a1, 535 uint64_t b0, 536 uint64_t b1, 537 uint64_t *z0Ptr, 538 uint64_t *z1Ptr, 539 uint64_t *z2Ptr, 540 uint64_t *z3Ptr 541 ) 542 { 543 uint64_t z0, z1, z2, z3; 544 uint64_t more1, more2; 545 546 mul64To128( a1, b1, &z2, &z3 ); 547 mul64To128( a1, b0, &z1, &more2 ); 548 add128( z1, more2, 0, z2, &z1, &z2 ); 549 mul64To128( a0, b0, &z0, &more1 ); 550 add128( z0, more1, 0, z1, &z0, &z1 ); 551 mul64To128( a0, b1, &more1, &more2 ); 552 add128( more1, more2, 0, z2, &more1, &z2 ); 553 add128( z0, z1, 0, more1, &z0, &z1 ); 554 *z3Ptr = z3; 555 *z2Ptr = z2; 556 *z1Ptr = z1; 557 *z0Ptr = z0; 558 559 } 560 561 /*---------------------------------------------------------------------------- 562 | Returns an approximation to the 64-bit integer quotient obtained by dividing 563 | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The 564 | divisor `b' must be at least 2^63. If q is the exact quotient truncated 565 | toward zero, the approximation returned lies between q and q + 2 inclusive. 566 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit 567 | unsigned integer is returned. 568 *----------------------------------------------------------------------------*/ 569 570 static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b) 571 { 572 uint64_t b0, b1; 573 uint64_t rem0, rem1, term0, term1; 574 uint64_t z; 575 576 if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF); 577 b0 = b>>32; 578 z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32; 579 mul64To128( b, z, &term0, &term1 ); 580 sub128( a0, a1, term0, term1, &rem0, &rem1 ); 581 while ( ( (int64_t) rem0 ) < 0 ) { 582 z -= UINT64_C(0x100000000); 583 b1 = b<<32; 584 add128( rem0, rem1, b0, b1, &rem0, &rem1 ); 585 } 586 rem0 = ( rem0<<32 ) | ( rem1>>32 ); 587 z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0; 588 return z; 589 590 } 591 592 /* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd 593 * (https://gmplib.org/repo/gmp/file/tip/longlong.h) 594 * 595 * Licensed under the GPLv2/LGPLv3 596 */ 597 static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, 598 uint64_t n0, uint64_t d) 599 { 600 #if defined(__x86_64__) 601 uint64_t q; 602 asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d)); 603 return q; 604 #elif defined(__s390x__) && !defined(__clang__) 605 /* Need to use a TImode type to get an even register pair for DLGR. */ 606 unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; 607 asm("dlgr %0, %1" : "+r"(n) : "r"(d)); 608 *r = n >> 64; 609 return n; 610 #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7) 611 /* From Power ISA 2.06, programming note for divdeu. */ 612 uint64_t q1, q2, Q, r1, r2, R; 613 asm("divdeu %0,%2,%4; divdu %1,%3,%4" 614 : "=&r"(q1), "=r"(q2) 615 : "r"(n1), "r"(n0), "r"(d)); 616 r1 = -(q1 * d); /* low part of (n1<<64) - (q1 * d) */ 617 r2 = n0 - (q2 * d); 618 Q = q1 + q2; 619 R = r1 + r2; 620 if (R >= d || R < r2) { /* overflow implies R > d */ 621 Q += 1; 622 R -= d; 623 } 624 *r = R; 625 return Q; 626 #else 627 uint64_t d0, d1, q0, q1, r1, r0, m; 628 629 d0 = (uint32_t)d; 630 d1 = d >> 32; 631 632 r1 = n1 % d1; 633 q1 = n1 / d1; 634 m = q1 * d0; 635 r1 = (r1 << 32) | (n0 >> 32); 636 if (r1 < m) { 637 q1 -= 1; 638 r1 += d; 639 if (r1 >= d) { 640 if (r1 < m) { 641 q1 -= 1; 642 r1 += d; 643 } 644 } 645 } 646 r1 -= m; 647 648 r0 = r1 % d1; 649 q0 = r1 / d1; 650 m = q0 * d0; 651 r0 = (r0 << 32) | (uint32_t)n0; 652 if (r0 < m) { 653 q0 -= 1; 654 r0 += d; 655 if (r0 >= d) { 656 if (r0 < m) { 657 q0 -= 1; 658 r0 += d; 659 } 660 } 661 } 662 r0 -= m; 663 664 *r = r0; 665 return (q1 << 32) | q0; 666 #endif 667 } 668 669 /*---------------------------------------------------------------------------- 670 | Returns an approximation to the square root of the 32-bit significand given 671 | by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of 672 | `aExp' (the least significant bit) is 1, the integer returned approximates 673 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp' 674 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either 675 | case, the approximation returned lies strictly within +/-2 of the exact 676 | value. 677 *----------------------------------------------------------------------------*/ 678 679 static inline uint32_t estimateSqrt32(int aExp, uint32_t a) 680 { 681 static const uint16_t sqrtOddAdjustments[] = { 682 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0, 683 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67 684 }; 685 static const uint16_t sqrtEvenAdjustments[] = { 686 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E, 687 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002 688 }; 689 int8_t index; 690 uint32_t z; 691 692 index = ( a>>27 ) & 15; 693 if ( aExp & 1 ) { 694 z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ]; 695 z = ( ( a / z )<<14 ) + ( z<<15 ); 696 a >>= 1; 697 } 698 else { 699 z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ]; 700 z = a / z + z; 701 z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); 702 if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 ); 703 } 704 return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 ); 705 706 } 707 708 /*---------------------------------------------------------------------------- 709 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' 710 | is equal to the 128-bit value formed by concatenating `b0' and `b1'. 711 | Otherwise, returns 0. 712 *----------------------------------------------------------------------------*/ 713 714 static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1) 715 { 716 return a0 == b0 && a1 == b1; 717 } 718 719 /*---------------------------------------------------------------------------- 720 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less 721 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'. 722 | Otherwise, returns 0. 723 *----------------------------------------------------------------------------*/ 724 725 static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1) 726 { 727 return a0 < b0 || (a0 == b0 && a1 <= b1); 728 } 729 730 /*---------------------------------------------------------------------------- 731 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less 732 | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise, 733 | returns 0. 734 *----------------------------------------------------------------------------*/ 735 736 static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1) 737 { 738 return a0 < b0 || (a0 == b0 && a1 < b1); 739 } 740 741 /*---------------------------------------------------------------------------- 742 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is 743 | not equal to the 128-bit value formed by concatenating `b0' and `b1'. 744 | Otherwise, returns 0. 745 *----------------------------------------------------------------------------*/ 746 747 static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1) 748 { 749 return a0 != b0 || a1 != b1; 750 } 751 752 #endif 753