1 /* 2 * QEMU float support macros 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 #ifndef FPU_SOFTFLOAT_MACROS_H 83 #define FPU_SOFTFLOAT_MACROS_H 84 85 #include "fpu/softfloat-types.h" 86 87 /*---------------------------------------------------------------------------- 88 | Shifts `a' right by the number of bits given in `count'. If any nonzero 89 | bits are shifted off, they are ``jammed'' into the least significant bit of 90 | the result by setting the least significant bit to 1. The value of `count' 91 | can be arbitrarily large; in particular, if `count' is greater than 32, the 92 | result will be either 0 or 1, depending on whether `a' is zero or nonzero. 93 | The result is stored in the location pointed to by `zPtr'. 94 *----------------------------------------------------------------------------*/ 95 96 static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr) 97 { 98 uint32_t z; 99 100 if ( count == 0 ) { 101 z = a; 102 } 103 else if ( count < 32 ) { 104 z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 ); 105 } 106 else { 107 z = ( a != 0 ); 108 } 109 *zPtr = z; 110 111 } 112 113 /*---------------------------------------------------------------------------- 114 | Shifts `a' right by the number of bits given in `count'. If any nonzero 115 | bits are shifted off, they are ``jammed'' into the least significant bit of 116 | the result by setting the least significant bit to 1. The value of `count' 117 | can be arbitrarily large; in particular, if `count' is greater than 64, the 118 | result will be either 0 or 1, depending on whether `a' is zero or nonzero. 119 | The result is stored in the location pointed to by `zPtr'. 120 *----------------------------------------------------------------------------*/ 121 122 static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr) 123 { 124 uint64_t z; 125 126 if ( count == 0 ) { 127 z = a; 128 } 129 else if ( count < 64 ) { 130 z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 ); 131 } 132 else { 133 z = ( a != 0 ); 134 } 135 *zPtr = z; 136 137 } 138 139 /*---------------------------------------------------------------------------- 140 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64 141 | _plus_ the number of bits given in `count'. The shifted result is at most 142 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The 143 | bits shifted off form a second 64-bit result as follows: The _last_ bit 144 | shifted off is the most-significant bit of the extra result, and the other 145 | 63 bits of the extra result are all zero if and only if _all_but_the_last_ 146 | bits shifted off were all zero. This extra result is stored in the location 147 | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large. 148 | (This routine makes more sense if `a0' and `a1' are considered to form a 149 | fixed-point value with binary point between `a0' and `a1'. This fixed-point 150 | value is shifted right by the number of bits given in `count', and the 151 | integer part of the result is returned at the location pointed to by 152 | `z0Ptr'. The fractional part of the result may be slightly corrupted as 153 | described above, and is returned at the location pointed to by `z1Ptr'.) 154 *----------------------------------------------------------------------------*/ 155 156 static inline void 157 shift64ExtraRightJamming( 158 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) 159 { 160 uint64_t z0, z1; 161 int8_t negCount = ( - count ) & 63; 162 163 if ( count == 0 ) { 164 z1 = a1; 165 z0 = a0; 166 } 167 else if ( count < 64 ) { 168 z1 = ( a0<<negCount ) | ( a1 != 0 ); 169 z0 = a0>>count; 170 } 171 else { 172 if ( count == 64 ) { 173 z1 = a0 | ( a1 != 0 ); 174 } 175 else { 176 z1 = ( ( a0 | a1 ) != 0 ); 177 } 178 z0 = 0; 179 } 180 *z1Ptr = z1; 181 *z0Ptr = z0; 182 183 } 184 185 /*---------------------------------------------------------------------------- 186 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the 187 | number of bits given in `count'. Any bits shifted off are lost. The value 188 | of `count' can be arbitrarily large; in particular, if `count' is greater 189 | than 128, the result will be 0. The result is broken into two 64-bit pieces 190 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 191 *----------------------------------------------------------------------------*/ 192 193 static inline void 194 shift128Right( 195 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) 196 { 197 uint64_t z0, z1; 198 int8_t negCount = ( - count ) & 63; 199 200 if ( count == 0 ) { 201 z1 = a1; 202 z0 = a0; 203 } 204 else if ( count < 64 ) { 205 z1 = ( a0<<negCount ) | ( a1>>count ); 206 z0 = a0>>count; 207 } 208 else { 209 z1 = (count < 128) ? (a0 >> (count & 63)) : 0; 210 z0 = 0; 211 } 212 *z1Ptr = z1; 213 *z0Ptr = z0; 214 215 } 216 217 /*---------------------------------------------------------------------------- 218 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the 219 | number of bits given in `count'. If any nonzero bits are shifted off, they 220 | are ``jammed'' into the least significant bit of the result by setting the 221 | least significant bit to 1. The value of `count' can be arbitrarily large; 222 | in particular, if `count' is greater than 128, the result will be either 223 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or 224 | nonzero. The result is broken into two 64-bit pieces which are stored at 225 | the locations pointed to by `z0Ptr' and `z1Ptr'. 226 *----------------------------------------------------------------------------*/ 227 228 static inline void 229 shift128RightJamming( 230 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) 231 { 232 uint64_t z0, z1; 233 int8_t negCount = ( - count ) & 63; 234 235 if ( count == 0 ) { 236 z1 = a1; 237 z0 = a0; 238 } 239 else if ( count < 64 ) { 240 z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 ); 241 z0 = a0>>count; 242 } 243 else { 244 if ( count == 64 ) { 245 z1 = a0 | ( a1 != 0 ); 246 } 247 else if ( count < 128 ) { 248 z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 ); 249 } 250 else { 251 z1 = ( ( a0 | a1 ) != 0 ); 252 } 253 z0 = 0; 254 } 255 *z1Ptr = z1; 256 *z0Ptr = z0; 257 258 } 259 260 /*---------------------------------------------------------------------------- 261 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right 262 | by 64 _plus_ the number of bits given in `count'. The shifted result is 263 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are 264 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted 265 | off form a third 64-bit result as follows: The _last_ bit shifted off is 266 | the most-significant bit of the extra result, and the other 63 bits of the 267 | extra result are all zero if and only if _all_but_the_last_ bits shifted off 268 | were all zero. This extra result is stored in the location pointed to by 269 | `z2Ptr'. The value of `count' can be arbitrarily large. 270 | (This routine makes more sense if `a0', `a1', and `a2' are considered 271 | to form a fixed-point value with binary point between `a1' and `a2'. This 272 | fixed-point value is shifted right by the number of bits given in `count', 273 | and the integer part of the result is returned at the locations pointed to 274 | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly 275 | corrupted as described above, and is returned at the location pointed to by 276 | `z2Ptr'.) 277 *----------------------------------------------------------------------------*/ 278 279 static inline void 280 shift128ExtraRightJamming( 281 uint64_t a0, 282 uint64_t a1, 283 uint64_t a2, 284 int count, 285 uint64_t *z0Ptr, 286 uint64_t *z1Ptr, 287 uint64_t *z2Ptr 288 ) 289 { 290 uint64_t z0, z1, z2; 291 int8_t negCount = ( - count ) & 63; 292 293 if ( count == 0 ) { 294 z2 = a2; 295 z1 = a1; 296 z0 = a0; 297 } 298 else { 299 if ( count < 64 ) { 300 z2 = a1<<negCount; 301 z1 = ( a0<<negCount ) | ( a1>>count ); 302 z0 = a0>>count; 303 } 304 else { 305 if ( count == 64 ) { 306 z2 = a1; 307 z1 = a0; 308 } 309 else { 310 a2 |= a1; 311 if ( count < 128 ) { 312 z2 = a0<<negCount; 313 z1 = a0>>( count & 63 ); 314 } 315 else { 316 z2 = ( count == 128 ) ? a0 : ( a0 != 0 ); 317 z1 = 0; 318 } 319 } 320 z0 = 0; 321 } 322 z2 |= ( a2 != 0 ); 323 } 324 *z2Ptr = z2; 325 *z1Ptr = z1; 326 *z0Ptr = z0; 327 328 } 329 330 /*---------------------------------------------------------------------------- 331 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the 332 | number of bits given in `count'. Any bits shifted off are lost. The value 333 | of `count' must be less than 64. The result is broken into two 64-bit 334 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 335 *----------------------------------------------------------------------------*/ 336 337 static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count, 338 uint64_t *z0Ptr, uint64_t *z1Ptr) 339 { 340 *z1Ptr = a1 << count; 341 *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63)); 342 } 343 344 /*---------------------------------------------------------------------------- 345 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the 346 | number of bits given in `count'. Any bits shifted off are lost. The value 347 | of `count' may be greater than 64. The result is broken into two 64-bit 348 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 349 *----------------------------------------------------------------------------*/ 350 351 static inline void shift128Left(uint64_t a0, uint64_t a1, int count, 352 uint64_t *z0Ptr, uint64_t *z1Ptr) 353 { 354 if (count < 64) { 355 *z1Ptr = a1 << count; 356 *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63)); 357 } else { 358 *z1Ptr = 0; 359 *z0Ptr = a1 << (count - 64); 360 } 361 } 362 363 /*---------------------------------------------------------------------------- 364 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left 365 | by the number of bits given in `count'. Any bits shifted off are lost. 366 | The value of `count' must be less than 64. The result is broken into three 367 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr', 368 | `z1Ptr', and `z2Ptr'. 369 *----------------------------------------------------------------------------*/ 370 371 static inline void 372 shortShift192Left( 373 uint64_t a0, 374 uint64_t a1, 375 uint64_t a2, 376 int count, 377 uint64_t *z0Ptr, 378 uint64_t *z1Ptr, 379 uint64_t *z2Ptr 380 ) 381 { 382 uint64_t z0, z1, z2; 383 int8_t negCount; 384 385 z2 = a2<<count; 386 z1 = a1<<count; 387 z0 = a0<<count; 388 if ( 0 < count ) { 389 negCount = ( ( - count ) & 63 ); 390 z1 |= a2>>negCount; 391 z0 |= a1>>negCount; 392 } 393 *z2Ptr = z2; 394 *z1Ptr = z1; 395 *z0Ptr = z0; 396 397 } 398 399 /*---------------------------------------------------------------------------- 400 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit 401 | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so 402 | any carry out is lost. The result is broken into two 64-bit pieces which 403 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 404 *----------------------------------------------------------------------------*/ 405 406 static inline void 407 add128( 408 uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr ) 409 { 410 uint64_t z1; 411 412 z1 = a1 + b1; 413 *z1Ptr = z1; 414 *z0Ptr = a0 + b0 + ( z1 < a1 ); 415 416 } 417 418 /*---------------------------------------------------------------------------- 419 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the 420 | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is 421 | modulo 2^192, so any carry out is lost. The result is broken into three 422 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr', 423 | `z1Ptr', and `z2Ptr'. 424 *----------------------------------------------------------------------------*/ 425 426 static inline void 427 add192( 428 uint64_t a0, 429 uint64_t a1, 430 uint64_t a2, 431 uint64_t b0, 432 uint64_t b1, 433 uint64_t b2, 434 uint64_t *z0Ptr, 435 uint64_t *z1Ptr, 436 uint64_t *z2Ptr 437 ) 438 { 439 uint64_t z0, z1, z2; 440 int8_t carry0, carry1; 441 442 z2 = a2 + b2; 443 carry1 = ( z2 < a2 ); 444 z1 = a1 + b1; 445 carry0 = ( z1 < a1 ); 446 z0 = a0 + b0; 447 z1 += carry1; 448 z0 += ( z1 < carry1 ); 449 z0 += carry0; 450 *z2Ptr = z2; 451 *z1Ptr = z1; 452 *z0Ptr = z0; 453 454 } 455 456 /*---------------------------------------------------------------------------- 457 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the 458 | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo 459 | 2^128, so any borrow out (carry out) is lost. The result is broken into two 460 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and 461 | `z1Ptr'. 462 *----------------------------------------------------------------------------*/ 463 464 static inline void 465 sub128( 466 uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr ) 467 { 468 469 *z1Ptr = a1 - b1; 470 *z0Ptr = a0 - b0 - ( a1 < b1 ); 471 472 } 473 474 /*---------------------------------------------------------------------------- 475 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2' 476 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'. 477 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The 478 | result is broken into three 64-bit pieces which are stored at the locations 479 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'. 480 *----------------------------------------------------------------------------*/ 481 482 static inline void 483 sub192( 484 uint64_t a0, 485 uint64_t a1, 486 uint64_t a2, 487 uint64_t b0, 488 uint64_t b1, 489 uint64_t b2, 490 uint64_t *z0Ptr, 491 uint64_t *z1Ptr, 492 uint64_t *z2Ptr 493 ) 494 { 495 uint64_t z0, z1, z2; 496 int8_t borrow0, borrow1; 497 498 z2 = a2 - b2; 499 borrow1 = ( a2 < b2 ); 500 z1 = a1 - b1; 501 borrow0 = ( a1 < b1 ); 502 z0 = a0 - b0; 503 z0 -= ( z1 < borrow1 ); 504 z1 -= borrow1; 505 z0 -= borrow0; 506 *z2Ptr = z2; 507 *z1Ptr = z1; 508 *z0Ptr = z0; 509 510 } 511 512 /*---------------------------------------------------------------------------- 513 | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken 514 | into two 64-bit pieces which are stored at the locations pointed to by 515 | `z0Ptr' and `z1Ptr'. 516 *----------------------------------------------------------------------------*/ 517 518 static inline void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr ) 519 { 520 uint32_t aHigh, aLow, bHigh, bLow; 521 uint64_t z0, zMiddleA, zMiddleB, z1; 522 523 aLow = a; 524 aHigh = a>>32; 525 bLow = b; 526 bHigh = b>>32; 527 z1 = ( (uint64_t) aLow ) * bLow; 528 zMiddleA = ( (uint64_t) aLow ) * bHigh; 529 zMiddleB = ( (uint64_t) aHigh ) * bLow; 530 z0 = ( (uint64_t) aHigh ) * bHigh; 531 zMiddleA += zMiddleB; 532 z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 ); 533 zMiddleA <<= 32; 534 z1 += zMiddleA; 535 z0 += ( z1 < zMiddleA ); 536 *z1Ptr = z1; 537 *z0Ptr = z0; 538 539 } 540 541 /*---------------------------------------------------------------------------- 542 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by 543 | `b' to obtain a 192-bit product. The product is broken into three 64-bit 544 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and 545 | `z2Ptr'. 546 *----------------------------------------------------------------------------*/ 547 548 static inline void 549 mul128By64To192( 550 uint64_t a0, 551 uint64_t a1, 552 uint64_t b, 553 uint64_t *z0Ptr, 554 uint64_t *z1Ptr, 555 uint64_t *z2Ptr 556 ) 557 { 558 uint64_t z0, z1, z2, more1; 559 560 mul64To128( a1, b, &z1, &z2 ); 561 mul64To128( a0, b, &z0, &more1 ); 562 add128( z0, more1, 0, z1, &z0, &z1 ); 563 *z2Ptr = z2; 564 *z1Ptr = z1; 565 *z0Ptr = z0; 566 567 } 568 569 /*---------------------------------------------------------------------------- 570 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the 571 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit 572 | product. The product is broken into four 64-bit pieces which are stored at 573 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'. 574 *----------------------------------------------------------------------------*/ 575 576 static inline void 577 mul128To256( 578 uint64_t a0, 579 uint64_t a1, 580 uint64_t b0, 581 uint64_t b1, 582 uint64_t *z0Ptr, 583 uint64_t *z1Ptr, 584 uint64_t *z2Ptr, 585 uint64_t *z3Ptr 586 ) 587 { 588 uint64_t z0, z1, z2, z3; 589 uint64_t more1, more2; 590 591 mul64To128( a1, b1, &z2, &z3 ); 592 mul64To128( a1, b0, &z1, &more2 ); 593 add128( z1, more2, 0, z2, &z1, &z2 ); 594 mul64To128( a0, b0, &z0, &more1 ); 595 add128( z0, more1, 0, z1, &z0, &z1 ); 596 mul64To128( a0, b1, &more1, &more2 ); 597 add128( more1, more2, 0, z2, &more1, &z2 ); 598 add128( z0, z1, 0, more1, &z0, &z1 ); 599 *z3Ptr = z3; 600 *z2Ptr = z2; 601 *z1Ptr = z1; 602 *z0Ptr = z0; 603 604 } 605 606 /*---------------------------------------------------------------------------- 607 | Returns an approximation to the 64-bit integer quotient obtained by dividing 608 | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The 609 | divisor `b' must be at least 2^63. If q is the exact quotient truncated 610 | toward zero, the approximation returned lies between q and q + 2 inclusive. 611 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit 612 | unsigned integer is returned. 613 *----------------------------------------------------------------------------*/ 614 615 static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b) 616 { 617 uint64_t b0, b1; 618 uint64_t rem0, rem1, term0, term1; 619 uint64_t z; 620 621 if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF); 622 b0 = b>>32; 623 z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32; 624 mul64To128( b, z, &term0, &term1 ); 625 sub128( a0, a1, term0, term1, &rem0, &rem1 ); 626 while ( ( (int64_t) rem0 ) < 0 ) { 627 z -= UINT64_C(0x100000000); 628 b1 = b<<32; 629 add128( rem0, rem1, b0, b1, &rem0, &rem1 ); 630 } 631 rem0 = ( rem0<<32 ) | ( rem1>>32 ); 632 z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0; 633 return z; 634 635 } 636 637 /* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd 638 * (https://gmplib.org/repo/gmp/file/tip/longlong.h) 639 * 640 * Licensed under the GPLv2/LGPLv3 641 */ 642 static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, 643 uint64_t n0, uint64_t d) 644 { 645 #if defined(__x86_64__) 646 uint64_t q; 647 asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d)); 648 return q; 649 #elif defined(__s390x__) && !defined(__clang__) 650 /* Need to use a TImode type to get an even register pair for DLGR. */ 651 unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; 652 asm("dlgr %0, %1" : "+r"(n) : "r"(d)); 653 *r = n >> 64; 654 return n; 655 #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7) 656 /* From Power ISA 2.06, programming note for divdeu. */ 657 uint64_t q1, q2, Q, r1, r2, R; 658 asm("divdeu %0,%2,%4; divdu %1,%3,%4" 659 : "=&r"(q1), "=r"(q2) 660 : "r"(n1), "r"(n0), "r"(d)); 661 r1 = -(q1 * d); /* low part of (n1<<64) - (q1 * d) */ 662 r2 = n0 - (q2 * d); 663 Q = q1 + q2; 664 R = r1 + r2; 665 if (R >= d || R < r2) { /* overflow implies R > d */ 666 Q += 1; 667 R -= d; 668 } 669 *r = R; 670 return Q; 671 #else 672 uint64_t d0, d1, q0, q1, r1, r0, m; 673 674 d0 = (uint32_t)d; 675 d1 = d >> 32; 676 677 r1 = n1 % d1; 678 q1 = n1 / d1; 679 m = q1 * d0; 680 r1 = (r1 << 32) | (n0 >> 32); 681 if (r1 < m) { 682 q1 -= 1; 683 r1 += d; 684 if (r1 >= d) { 685 if (r1 < m) { 686 q1 -= 1; 687 r1 += d; 688 } 689 } 690 } 691 r1 -= m; 692 693 r0 = r1 % d1; 694 q0 = r1 / d1; 695 m = q0 * d0; 696 r0 = (r0 << 32) | (uint32_t)n0; 697 if (r0 < m) { 698 q0 -= 1; 699 r0 += d; 700 if (r0 >= d) { 701 if (r0 < m) { 702 q0 -= 1; 703 r0 += d; 704 } 705 } 706 } 707 r0 -= m; 708 709 *r = r0; 710 return (q1 << 32) | q0; 711 #endif 712 } 713 714 /*---------------------------------------------------------------------------- 715 | Returns an approximation to the square root of the 32-bit significand given 716 | by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of 717 | `aExp' (the least significant bit) is 1, the integer returned approximates 718 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp' 719 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either 720 | case, the approximation returned lies strictly within +/-2 of the exact 721 | value. 722 *----------------------------------------------------------------------------*/ 723 724 static inline uint32_t estimateSqrt32(int aExp, uint32_t a) 725 { 726 static const uint16_t sqrtOddAdjustments[] = { 727 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0, 728 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67 729 }; 730 static const uint16_t sqrtEvenAdjustments[] = { 731 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E, 732 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002 733 }; 734 int8_t index; 735 uint32_t z; 736 737 index = ( a>>27 ) & 15; 738 if ( aExp & 1 ) { 739 z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ]; 740 z = ( ( a / z )<<14 ) + ( z<<15 ); 741 a >>= 1; 742 } 743 else { 744 z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ]; 745 z = a / z + z; 746 z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); 747 if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 ); 748 } 749 return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 ); 750 751 } 752 753 /*---------------------------------------------------------------------------- 754 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' 755 | is equal to the 128-bit value formed by concatenating `b0' and `b1'. 756 | Otherwise, returns 0. 757 *----------------------------------------------------------------------------*/ 758 759 static inline flag eq128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) 760 { 761 762 return ( a0 == b0 ) && ( a1 == b1 ); 763 764 } 765 766 /*---------------------------------------------------------------------------- 767 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less 768 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'. 769 | Otherwise, returns 0. 770 *----------------------------------------------------------------------------*/ 771 772 static inline flag le128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) 773 { 774 775 return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) ); 776 777 } 778 779 /*---------------------------------------------------------------------------- 780 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less 781 | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise, 782 | returns 0. 783 *----------------------------------------------------------------------------*/ 784 785 static inline flag lt128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) 786 { 787 788 return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) ); 789 790 } 791 792 /*---------------------------------------------------------------------------- 793 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is 794 | not equal to the 128-bit value formed by concatenating `b0' and `b1'. 795 | Otherwise, returns 0. 796 *----------------------------------------------------------------------------*/ 797 798 static inline flag ne128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) 799 { 800 801 return ( a0 != b0 ) || ( a1 != b1 ); 802 803 } 804 805 #endif 806