1 /* 2 * QEMU float support macros 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 #ifndef FPU_SOFTFLOAT_MACROS_H 83 #define FPU_SOFTFLOAT_MACROS_H 84 85 /*---------------------------------------------------------------------------- 86 | Shifts `a' right by the number of bits given in `count'. If any nonzero 87 | bits are shifted off, they are ``jammed'' into the least significant bit of 88 | the result by setting the least significant bit to 1. The value of `count' 89 | can be arbitrarily large; in particular, if `count' is greater than 32, the 90 | result will be either 0 or 1, depending on whether `a' is zero or nonzero. 91 | The result is stored in the location pointed to by `zPtr'. 92 *----------------------------------------------------------------------------*/ 93 94 static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr) 95 { 96 uint32_t z; 97 98 if ( count == 0 ) { 99 z = a; 100 } 101 else if ( count < 32 ) { 102 z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 ); 103 } 104 else { 105 z = ( a != 0 ); 106 } 107 *zPtr = z; 108 109 } 110 111 /*---------------------------------------------------------------------------- 112 | Shifts `a' right by the number of bits given in `count'. If any nonzero 113 | bits are shifted off, they are ``jammed'' into the least significant bit of 114 | the result by setting the least significant bit to 1. The value of `count' 115 | can be arbitrarily large; in particular, if `count' is greater than 64, the 116 | result will be either 0 or 1, depending on whether `a' is zero or nonzero. 117 | The result is stored in the location pointed to by `zPtr'. 118 *----------------------------------------------------------------------------*/ 119 120 static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr) 121 { 122 uint64_t z; 123 124 if ( count == 0 ) { 125 z = a; 126 } 127 else if ( count < 64 ) { 128 z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 ); 129 } 130 else { 131 z = ( a != 0 ); 132 } 133 *zPtr = z; 134 135 } 136 137 /*---------------------------------------------------------------------------- 138 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64 139 | _plus_ the number of bits given in `count'. The shifted result is at most 140 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The 141 | bits shifted off form a second 64-bit result as follows: The _last_ bit 142 | shifted off is the most-significant bit of the extra result, and the other 143 | 63 bits of the extra result are all zero if and only if _all_but_the_last_ 144 | bits shifted off were all zero. This extra result is stored in the location 145 | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large. 146 | (This routine makes more sense if `a0' and `a1' are considered to form a 147 | fixed-point value with binary point between `a0' and `a1'. This fixed-point 148 | value is shifted right by the number of bits given in `count', and the 149 | integer part of the result is returned at the location pointed to by 150 | `z0Ptr'. The fractional part of the result may be slightly corrupted as 151 | described above, and is returned at the location pointed to by `z1Ptr'.) 152 *----------------------------------------------------------------------------*/ 153 154 static inline void 155 shift64ExtraRightJamming( 156 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) 157 { 158 uint64_t z0, z1; 159 int8_t negCount = ( - count ) & 63; 160 161 if ( count == 0 ) { 162 z1 = a1; 163 z0 = a0; 164 } 165 else if ( count < 64 ) { 166 z1 = ( a0<<negCount ) | ( a1 != 0 ); 167 z0 = a0>>count; 168 } 169 else { 170 if ( count == 64 ) { 171 z1 = a0 | ( a1 != 0 ); 172 } 173 else { 174 z1 = ( ( a0 | a1 ) != 0 ); 175 } 176 z0 = 0; 177 } 178 *z1Ptr = z1; 179 *z0Ptr = z0; 180 181 } 182 183 /*---------------------------------------------------------------------------- 184 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the 185 | number of bits given in `count'. Any bits shifted off are lost. The value 186 | of `count' can be arbitrarily large; in particular, if `count' is greater 187 | than 128, the result will be 0. The result is broken into two 64-bit pieces 188 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 189 *----------------------------------------------------------------------------*/ 190 191 static inline void 192 shift128Right( 193 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) 194 { 195 uint64_t z0, z1; 196 int8_t negCount = ( - count ) & 63; 197 198 if ( count == 0 ) { 199 z1 = a1; 200 z0 = a0; 201 } 202 else if ( count < 64 ) { 203 z1 = ( a0<<negCount ) | ( a1>>count ); 204 z0 = a0>>count; 205 } 206 else { 207 z1 = (count < 128) ? (a0 >> (count & 63)) : 0; 208 z0 = 0; 209 } 210 *z1Ptr = z1; 211 *z0Ptr = z0; 212 213 } 214 215 /*---------------------------------------------------------------------------- 216 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the 217 | number of bits given in `count'. If any nonzero bits are shifted off, they 218 | are ``jammed'' into the least significant bit of the result by setting the 219 | least significant bit to 1. The value of `count' can be arbitrarily large; 220 | in particular, if `count' is greater than 128, the result will be either 221 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or 222 | nonzero. The result is broken into two 64-bit pieces which are stored at 223 | the locations pointed to by `z0Ptr' and `z1Ptr'. 224 *----------------------------------------------------------------------------*/ 225 226 static inline void 227 shift128RightJamming( 228 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) 229 { 230 uint64_t z0, z1; 231 int8_t negCount = ( - count ) & 63; 232 233 if ( count == 0 ) { 234 z1 = a1; 235 z0 = a0; 236 } 237 else if ( count < 64 ) { 238 z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 ); 239 z0 = a0>>count; 240 } 241 else { 242 if ( count == 64 ) { 243 z1 = a0 | ( a1 != 0 ); 244 } 245 else if ( count < 128 ) { 246 z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 ); 247 } 248 else { 249 z1 = ( ( a0 | a1 ) != 0 ); 250 } 251 z0 = 0; 252 } 253 *z1Ptr = z1; 254 *z0Ptr = z0; 255 256 } 257 258 /*---------------------------------------------------------------------------- 259 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right 260 | by 64 _plus_ the number of bits given in `count'. The shifted result is 261 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are 262 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted 263 | off form a third 64-bit result as follows: The _last_ bit shifted off is 264 | the most-significant bit of the extra result, and the other 63 bits of the 265 | extra result are all zero if and only if _all_but_the_last_ bits shifted off 266 | were all zero. This extra result is stored in the location pointed to by 267 | `z2Ptr'. The value of `count' can be arbitrarily large. 268 | (This routine makes more sense if `a0', `a1', and `a2' are considered 269 | to form a fixed-point value with binary point between `a1' and `a2'. This 270 | fixed-point value is shifted right by the number of bits given in `count', 271 | and the integer part of the result is returned at the locations pointed to 272 | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly 273 | corrupted as described above, and is returned at the location pointed to by 274 | `z2Ptr'.) 275 *----------------------------------------------------------------------------*/ 276 277 static inline void 278 shift128ExtraRightJamming( 279 uint64_t a0, 280 uint64_t a1, 281 uint64_t a2, 282 int count, 283 uint64_t *z0Ptr, 284 uint64_t *z1Ptr, 285 uint64_t *z2Ptr 286 ) 287 { 288 uint64_t z0, z1, z2; 289 int8_t negCount = ( - count ) & 63; 290 291 if ( count == 0 ) { 292 z2 = a2; 293 z1 = a1; 294 z0 = a0; 295 } 296 else { 297 if ( count < 64 ) { 298 z2 = a1<<negCount; 299 z1 = ( a0<<negCount ) | ( a1>>count ); 300 z0 = a0>>count; 301 } 302 else { 303 if ( count == 64 ) { 304 z2 = a1; 305 z1 = a0; 306 } 307 else { 308 a2 |= a1; 309 if ( count < 128 ) { 310 z2 = a0<<negCount; 311 z1 = a0>>( count & 63 ); 312 } 313 else { 314 z2 = ( count == 128 ) ? a0 : ( a0 != 0 ); 315 z1 = 0; 316 } 317 } 318 z0 = 0; 319 } 320 z2 |= ( a2 != 0 ); 321 } 322 *z2Ptr = z2; 323 *z1Ptr = z1; 324 *z0Ptr = z0; 325 326 } 327 328 /*---------------------------------------------------------------------------- 329 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the 330 | number of bits given in `count'. Any bits shifted off are lost. The value 331 | of `count' must be less than 64. The result is broken into two 64-bit 332 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 333 *----------------------------------------------------------------------------*/ 334 335 static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count, 336 uint64_t *z0Ptr, uint64_t *z1Ptr) 337 { 338 *z1Ptr = a1 << count; 339 *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63)); 340 } 341 342 /*---------------------------------------------------------------------------- 343 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the 344 | number of bits given in `count'. Any bits shifted off are lost. The value 345 | of `count' may be greater than 64. The result is broken into two 64-bit 346 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 347 *----------------------------------------------------------------------------*/ 348 349 static inline void shift128Left(uint64_t a0, uint64_t a1, int count, 350 uint64_t *z0Ptr, uint64_t *z1Ptr) 351 { 352 if (count < 64) { 353 *z1Ptr = a1 << count; 354 *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63)); 355 } else { 356 *z1Ptr = 0; 357 *z0Ptr = a1 << (count - 64); 358 } 359 } 360 361 /*---------------------------------------------------------------------------- 362 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left 363 | by the number of bits given in `count'. Any bits shifted off are lost. 364 | The value of `count' must be less than 64. The result is broken into three 365 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr', 366 | `z1Ptr', and `z2Ptr'. 367 *----------------------------------------------------------------------------*/ 368 369 static inline void 370 shortShift192Left( 371 uint64_t a0, 372 uint64_t a1, 373 uint64_t a2, 374 int count, 375 uint64_t *z0Ptr, 376 uint64_t *z1Ptr, 377 uint64_t *z2Ptr 378 ) 379 { 380 uint64_t z0, z1, z2; 381 int8_t negCount; 382 383 z2 = a2<<count; 384 z1 = a1<<count; 385 z0 = a0<<count; 386 if ( 0 < count ) { 387 negCount = ( ( - count ) & 63 ); 388 z1 |= a2>>negCount; 389 z0 |= a1>>negCount; 390 } 391 *z2Ptr = z2; 392 *z1Ptr = z1; 393 *z0Ptr = z0; 394 395 } 396 397 /*---------------------------------------------------------------------------- 398 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit 399 | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so 400 | any carry out is lost. The result is broken into two 64-bit pieces which 401 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 402 *----------------------------------------------------------------------------*/ 403 404 static inline void 405 add128( 406 uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr ) 407 { 408 uint64_t z1; 409 410 z1 = a1 + b1; 411 *z1Ptr = z1; 412 *z0Ptr = a0 + b0 + ( z1 < a1 ); 413 414 } 415 416 /*---------------------------------------------------------------------------- 417 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the 418 | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is 419 | modulo 2^192, so any carry out is lost. The result is broken into three 420 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr', 421 | `z1Ptr', and `z2Ptr'. 422 *----------------------------------------------------------------------------*/ 423 424 static inline void 425 add192( 426 uint64_t a0, 427 uint64_t a1, 428 uint64_t a2, 429 uint64_t b0, 430 uint64_t b1, 431 uint64_t b2, 432 uint64_t *z0Ptr, 433 uint64_t *z1Ptr, 434 uint64_t *z2Ptr 435 ) 436 { 437 uint64_t z0, z1, z2; 438 int8_t carry0, carry1; 439 440 z2 = a2 + b2; 441 carry1 = ( z2 < a2 ); 442 z1 = a1 + b1; 443 carry0 = ( z1 < a1 ); 444 z0 = a0 + b0; 445 z1 += carry1; 446 z0 += ( z1 < carry1 ); 447 z0 += carry0; 448 *z2Ptr = z2; 449 *z1Ptr = z1; 450 *z0Ptr = z0; 451 452 } 453 454 /*---------------------------------------------------------------------------- 455 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the 456 | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo 457 | 2^128, so any borrow out (carry out) is lost. The result is broken into two 458 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and 459 | `z1Ptr'. 460 *----------------------------------------------------------------------------*/ 461 462 static inline void 463 sub128( 464 uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr ) 465 { 466 467 *z1Ptr = a1 - b1; 468 *z0Ptr = a0 - b0 - ( a1 < b1 ); 469 470 } 471 472 /*---------------------------------------------------------------------------- 473 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2' 474 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'. 475 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The 476 | result is broken into three 64-bit pieces which are stored at the locations 477 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'. 478 *----------------------------------------------------------------------------*/ 479 480 static inline void 481 sub192( 482 uint64_t a0, 483 uint64_t a1, 484 uint64_t a2, 485 uint64_t b0, 486 uint64_t b1, 487 uint64_t b2, 488 uint64_t *z0Ptr, 489 uint64_t *z1Ptr, 490 uint64_t *z2Ptr 491 ) 492 { 493 uint64_t z0, z1, z2; 494 int8_t borrow0, borrow1; 495 496 z2 = a2 - b2; 497 borrow1 = ( a2 < b2 ); 498 z1 = a1 - b1; 499 borrow0 = ( a1 < b1 ); 500 z0 = a0 - b0; 501 z0 -= ( z1 < borrow1 ); 502 z1 -= borrow1; 503 z0 -= borrow0; 504 *z2Ptr = z2; 505 *z1Ptr = z1; 506 *z0Ptr = z0; 507 508 } 509 510 /*---------------------------------------------------------------------------- 511 | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken 512 | into two 64-bit pieces which are stored at the locations pointed to by 513 | `z0Ptr' and `z1Ptr'. 514 *----------------------------------------------------------------------------*/ 515 516 static inline void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr ) 517 { 518 uint32_t aHigh, aLow, bHigh, bLow; 519 uint64_t z0, zMiddleA, zMiddleB, z1; 520 521 aLow = a; 522 aHigh = a>>32; 523 bLow = b; 524 bHigh = b>>32; 525 z1 = ( (uint64_t) aLow ) * bLow; 526 zMiddleA = ( (uint64_t) aLow ) * bHigh; 527 zMiddleB = ( (uint64_t) aHigh ) * bLow; 528 z0 = ( (uint64_t) aHigh ) * bHigh; 529 zMiddleA += zMiddleB; 530 z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 ); 531 zMiddleA <<= 32; 532 z1 += zMiddleA; 533 z0 += ( z1 < zMiddleA ); 534 *z1Ptr = z1; 535 *z0Ptr = z0; 536 537 } 538 539 /*---------------------------------------------------------------------------- 540 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by 541 | `b' to obtain a 192-bit product. The product is broken into three 64-bit 542 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and 543 | `z2Ptr'. 544 *----------------------------------------------------------------------------*/ 545 546 static inline void 547 mul128By64To192( 548 uint64_t a0, 549 uint64_t a1, 550 uint64_t b, 551 uint64_t *z0Ptr, 552 uint64_t *z1Ptr, 553 uint64_t *z2Ptr 554 ) 555 { 556 uint64_t z0, z1, z2, more1; 557 558 mul64To128( a1, b, &z1, &z2 ); 559 mul64To128( a0, b, &z0, &more1 ); 560 add128( z0, more1, 0, z1, &z0, &z1 ); 561 *z2Ptr = z2; 562 *z1Ptr = z1; 563 *z0Ptr = z0; 564 565 } 566 567 /*---------------------------------------------------------------------------- 568 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the 569 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit 570 | product. The product is broken into four 64-bit pieces which are stored at 571 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'. 572 *----------------------------------------------------------------------------*/ 573 574 static inline void 575 mul128To256( 576 uint64_t a0, 577 uint64_t a1, 578 uint64_t b0, 579 uint64_t b1, 580 uint64_t *z0Ptr, 581 uint64_t *z1Ptr, 582 uint64_t *z2Ptr, 583 uint64_t *z3Ptr 584 ) 585 { 586 uint64_t z0, z1, z2, z3; 587 uint64_t more1, more2; 588 589 mul64To128( a1, b1, &z2, &z3 ); 590 mul64To128( a1, b0, &z1, &more2 ); 591 add128( z1, more2, 0, z2, &z1, &z2 ); 592 mul64To128( a0, b0, &z0, &more1 ); 593 add128( z0, more1, 0, z1, &z0, &z1 ); 594 mul64To128( a0, b1, &more1, &more2 ); 595 add128( more1, more2, 0, z2, &more1, &z2 ); 596 add128( z0, z1, 0, more1, &z0, &z1 ); 597 *z3Ptr = z3; 598 *z2Ptr = z2; 599 *z1Ptr = z1; 600 *z0Ptr = z0; 601 602 } 603 604 /*---------------------------------------------------------------------------- 605 | Returns an approximation to the 64-bit integer quotient obtained by dividing 606 | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The 607 | divisor `b' must be at least 2^63. If q is the exact quotient truncated 608 | toward zero, the approximation returned lies between q and q + 2 inclusive. 609 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit 610 | unsigned integer is returned. 611 *----------------------------------------------------------------------------*/ 612 613 static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b) 614 { 615 uint64_t b0, b1; 616 uint64_t rem0, rem1, term0, term1; 617 uint64_t z; 618 619 if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF ); 620 b0 = b>>32; 621 z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32; 622 mul64To128( b, z, &term0, &term1 ); 623 sub128( a0, a1, term0, term1, &rem0, &rem1 ); 624 while ( ( (int64_t) rem0 ) < 0 ) { 625 z -= LIT64( 0x100000000 ); 626 b1 = b<<32; 627 add128( rem0, rem1, b0, b1, &rem0, &rem1 ); 628 } 629 rem0 = ( rem0<<32 ) | ( rem1>>32 ); 630 z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0; 631 return z; 632 633 } 634 635 /* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd 636 * (https://gmplib.org/repo/gmp/file/tip/longlong.h) 637 * 638 * Licensed under the GPLv2/LGPLv3 639 */ 640 static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, 641 uint64_t n0, uint64_t d) 642 { 643 #if defined(__x86_64__) 644 uint64_t q; 645 asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d)); 646 return q; 647 #elif defined(__s390x__) && !defined(__clang__) 648 /* Need to use a TImode type to get an even register pair for DLGR. */ 649 unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; 650 asm("dlgr %0, %1" : "+r"(n) : "r"(d)); 651 *r = n >> 64; 652 return n; 653 #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7) 654 /* From Power ISA 2.06, programming note for divdeu. */ 655 uint64_t q1, q2, Q, r1, r2, R; 656 asm("divdeu %0,%2,%4; divdu %1,%3,%4" 657 : "=&r"(q1), "=r"(q2) 658 : "r"(n1), "r"(n0), "r"(d)); 659 r1 = -(q1 * d); /* low part of (n1<<64) - (q1 * d) */ 660 r2 = n0 - (q2 * d); 661 Q = q1 + q2; 662 R = r1 + r2; 663 if (R >= d || R < r2) { /* overflow implies R > d */ 664 Q += 1; 665 R -= d; 666 } 667 *r = R; 668 return Q; 669 #else 670 uint64_t d0, d1, q0, q1, r1, r0, m; 671 672 d0 = (uint32_t)d; 673 d1 = d >> 32; 674 675 r1 = n1 % d1; 676 q1 = n1 / d1; 677 m = q1 * d0; 678 r1 = (r1 << 32) | (n0 >> 32); 679 if (r1 < m) { 680 q1 -= 1; 681 r1 += d; 682 if (r1 >= d) { 683 if (r1 < m) { 684 q1 -= 1; 685 r1 += d; 686 } 687 } 688 } 689 r1 -= m; 690 691 r0 = r1 % d1; 692 q0 = r1 / d1; 693 m = q0 * d0; 694 r0 = (r0 << 32) | (uint32_t)n0; 695 if (r0 < m) { 696 q0 -= 1; 697 r0 += d; 698 if (r0 >= d) { 699 if (r0 < m) { 700 q0 -= 1; 701 r0 += d; 702 } 703 } 704 } 705 r0 -= m; 706 707 *r = r0; 708 return (q1 << 32) | q0; 709 #endif 710 } 711 712 /*---------------------------------------------------------------------------- 713 | Returns an approximation to the square root of the 32-bit significand given 714 | by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of 715 | `aExp' (the least significant bit) is 1, the integer returned approximates 716 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp' 717 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either 718 | case, the approximation returned lies strictly within +/-2 of the exact 719 | value. 720 *----------------------------------------------------------------------------*/ 721 722 static inline uint32_t estimateSqrt32(int aExp, uint32_t a) 723 { 724 static const uint16_t sqrtOddAdjustments[] = { 725 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0, 726 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67 727 }; 728 static const uint16_t sqrtEvenAdjustments[] = { 729 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E, 730 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002 731 }; 732 int8_t index; 733 uint32_t z; 734 735 index = ( a>>27 ) & 15; 736 if ( aExp & 1 ) { 737 z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ]; 738 z = ( ( a / z )<<14 ) + ( z<<15 ); 739 a >>= 1; 740 } 741 else { 742 z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ]; 743 z = a / z + z; 744 z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); 745 if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 ); 746 } 747 return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 ); 748 749 } 750 751 /*---------------------------------------------------------------------------- 752 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' 753 | is equal to the 128-bit value formed by concatenating `b0' and `b1'. 754 | Otherwise, returns 0. 755 *----------------------------------------------------------------------------*/ 756 757 static inline flag eq128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) 758 { 759 760 return ( a0 == b0 ) && ( a1 == b1 ); 761 762 } 763 764 /*---------------------------------------------------------------------------- 765 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less 766 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'. 767 | Otherwise, returns 0. 768 *----------------------------------------------------------------------------*/ 769 770 static inline flag le128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) 771 { 772 773 return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) ); 774 775 } 776 777 /*---------------------------------------------------------------------------- 778 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less 779 | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise, 780 | returns 0. 781 *----------------------------------------------------------------------------*/ 782 783 static inline flag lt128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) 784 { 785 786 return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) ); 787 788 } 789 790 /*---------------------------------------------------------------------------- 791 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is 792 | not equal to the 128-bit value formed by concatenating `b0' and `b1'. 793 | Otherwise, returns 0. 794 *----------------------------------------------------------------------------*/ 795 796 static inline flag ne128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) 797 { 798 799 return ( a0 != b0 ) || ( a1 != b1 ); 800 801 } 802 803 #endif 804