1 /* 2 * QEMU float support macros 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Portions of this work are licensed under the terms of the GNU GPL,
 * version 2 or later. See the COPYING file in the top-level directory.
 */

#ifndef FPU_SOFTFLOAT_MACROS_H
#define FPU_SOFTFLOAT_MACROS_H

#include "fpu/softfloat-types.h"
#include "qemu/host-utils.h"

/**
 * shl_double: double-word merging left shift
 * @l: left or most-significant word
 * @r: right or least-significant word
 * @c: shift count
 *
 * Shift @l left by @c bits, shifting in bits from @r.
 *
 * NOTE(review): assumes 0 <= @c < 64 -- the C fallback special-cases
 * c == 0 because "r >> (64 - 0)" would be an undefined 64-bit shift.
 */
static inline uint64_t shl_double(uint64_t l, uint64_t r, int c)
{
#if defined(__x86_64__)
    /* SHLD performs the whole merging shift in one instruction; its count
     * operand must be an immediate or the CL byte register, hence the "ci"
     * constraint and the %b2 (byte register) operand modifier. */
    asm("shld %b2, %1, %0" : "+r"(l) : "r"(r), "ci"(c));
    return l;
#else
    return c ? (l << c) | (r >> (64 - c)) : l;
#endif
}

/**
 * shr_double: double-word merging right shift
 * @l: left or most-significant word
 * @r: right or least-significant word
 * @c: shift count
 *
 * Shift @r right by @c bits, shifting in bits from @l.
 *
 * NOTE(review): assumes 0 <= @c < 64 -- the C fallback special-cases
 * c == 0 because "l << (64 - 0)" would be an undefined 64-bit shift.
 */
static inline uint64_t shr_double(uint64_t l, uint64_t r, int c)
{
#if defined(__x86_64__)
    /* SHRD mirrors SHLD; see the constraint note in shl_double(). */
    asm("shrd %b2, %1, %0" : "+r"(r) : "r"(l), "ci"(c));
    return r;
#else
    return c ? (r >> c) | (l << (64 - c)) : r;
#endif
}
/*----------------------------------------------------------------------------
| Shifts `a' right by the number of bits given in `count'.  If any nonzero
| bits are shifted off, they are ``jammed'' into the least significant bit of
| the result by setting the least significant bit to 1.  The value of `count'
| can be arbitrarily large; in particular, if `count' is greater than 32, the
| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
| The result is stored in the location pointed to by `zPtr'.
*----------------------------------------------------------------------------*/

static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
{
    uint32_t result;

    if (count == 0) {
        result = a;
    } else if (count < 32) {
        /* OR in a sticky bit recording whether any nonzero bit was lost. */
        uint32_t lost = a << (-count & 31);
        result = (a >> count) | (lost != 0);
    } else {
        /* Everything is shifted out; only the sticky bit survives. */
        result = (a != 0);
    }
    *zPtr = result;
}

/*----------------------------------------------------------------------------
| Shifts `a' right by the number of bits given in `count'.  If any nonzero
| bits are shifted off, they are ``jammed'' into the least significant bit of
| the result by setting the least significant bit to 1.  The value of `count'
| can be arbitrarily large; in particular, if `count' is greater than 64, the
| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
| The result is stored in the location pointed to by `zPtr'.
*----------------------------------------------------------------------------*/

static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
{
    uint64_t result;

    if (count == 0) {
        result = a;
    } else if (count < 64) {
        /* OR in a sticky bit recording whether any nonzero bit was lost. */
        uint64_t lost = a << (-count & 63);
        result = (a >> count) | (lost != 0);
    } else {
        /* Everything is shifted out; only the sticky bit survives. */
        result = (a != 0);
    }
    *zPtr = result;
}
/*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
| _plus_ the number of bits given in `count'.  The shifted result is at most
| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
| bits shifted off form a second 64-bit result as follows:  The _last_ bit
| shifted off is the most-significant bit of the extra result, and the other
| 63 bits of the extra result are all zero if and only if _all_but_the_last_
| bits shifted off were all zero.  This extra result is stored in the location
| pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
|     (This routine makes more sense if `a0' and `a1' are considered to form a
| fixed-point value with binary point between `a0' and `a1'.  This fixed-point
| value is shifted right by the number of bits given in `count', and the
| integer part of the result is returned at the location pointed to by
| `z0Ptr'.  The fractional part of the result may be slightly corrupted as
| described above, and is returned at the location pointed to by `z1Ptr'.)
*----------------------------------------------------------------------------*/

static inline void shift64ExtraRightJamming(uint64_t a0, uint64_t a1,
                                            int count,
                                            uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    uint64_t intPart, fracPart;

    if (count == 0) {
        intPart = a0;
        fracPart = a1;
    } else if (count < 64) {
        intPart = a0 >> count;
        /* Bits of a0 falling below the binary point, plus a sticky bit
         * summarizing all of a1. */
        fracPart = (a0 << (-count & 63)) | (a1 != 0);
    } else if (count == 64) {
        intPart = 0;
        fracPart = a0 | (a1 != 0);
    } else {
        intPart = 0;
        fracPart = ((a0 | a1) != 0);
    }
    *z1Ptr = fracPart;
    *z0Ptr = intPart;
}
/*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
| number of bits given in `count'.  Any bits shifted off are lost.  The value
| of `count' can be arbitrarily large; in particular, if `count' is greater
| than 128, the result will be 0.  The result is broken into two 64-bit pieces
| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void shift128Right(uint64_t a0, uint64_t a1, int count,
                                 uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    uint64_t hi = 0, lo;

    if (count == 0) {
        hi = a0;
        lo = a1;
    } else if (count < 64) {
        hi = a0 >> count;
        lo = (a0 << (-count & 63)) | (a1 >> count);
    } else if (count < 128) {
        /* a1 is entirely shifted out; only a0's remnant survives. */
        lo = a0 >> (count & 63);
    } else {
        lo = 0;
    }
    *z1Ptr = lo;
    *z0Ptr = hi;
}
/*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
| number of bits given in `count'.  If any nonzero bits are shifted off, they
| are ``jammed'' into the least significant bit of the result by setting the
| least significant bit to 1.  The value of `count' can be arbitrarily large;
| in particular, if `count' is greater than 128, the result will be either
| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
| nonzero.  The result is broken into two 64-bit pieces which are stored at
| the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void shift128RightJamming(uint64_t a0, uint64_t a1, int count,
                                        uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    uint64_t hi = 0, lo;
    int neg = -count & 63;

    if (count == 0) {
        hi = a0;
        lo = a1;
    } else if (count < 64) {
        hi = a0 >> count;
        /* Sticky bit records any nonzero bits shifted out of a1. */
        lo = (a0 << neg) | (a1 >> count) | ((a1 << neg) != 0);
    } else if (count == 64) {
        lo = a0 | (a1 != 0);
    } else if (count < 128) {
        /* a1 and the low bits of a0 collapse into the sticky bit. */
        lo = (a0 >> (count & 63)) | (((a0 << neg) | a1) != 0);
    } else {
        lo = ((a0 | a1) != 0);
    }
    *z1Ptr = lo;
    *z0Ptr = hi;
}
The fractional part of the result may be slightly 312 | corrupted as described above, and is returned at the location pointed to by 313 | `z2Ptr'.) 314 *----------------------------------------------------------------------------*/ 315 316 static inline void 317 shift128ExtraRightJamming( 318 uint64_t a0, 319 uint64_t a1, 320 uint64_t a2, 321 int count, 322 uint64_t *z0Ptr, 323 uint64_t *z1Ptr, 324 uint64_t *z2Ptr 325 ) 326 { 327 uint64_t z0, z1, z2; 328 int8_t negCount = ( - count ) & 63; 329 330 if ( count == 0 ) { 331 z2 = a2; 332 z1 = a1; 333 z0 = a0; 334 } 335 else { 336 if ( count < 64 ) { 337 z2 = a1<<negCount; 338 z1 = ( a0<<negCount ) | ( a1>>count ); 339 z0 = a0>>count; 340 } 341 else { 342 if ( count == 64 ) { 343 z2 = a1; 344 z1 = a0; 345 } 346 else { 347 a2 |= a1; 348 if ( count < 128 ) { 349 z2 = a0<<negCount; 350 z1 = a0>>( count & 63 ); 351 } 352 else { 353 z2 = ( count == 128 ) ? a0 : ( a0 != 0 ); 354 z1 = 0; 355 } 356 } 357 z0 = 0; 358 } 359 z2 |= ( a2 != 0 ); 360 } 361 *z2Ptr = z2; 362 *z1Ptr = z1; 363 *z0Ptr = z0; 364 365 } 366 367 /*---------------------------------------------------------------------------- 368 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the 369 | number of bits given in `count'. Any bits shifted off are lost. The value 370 | of `count' must be less than 64. The result is broken into two 64-bit 371 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. 372 *----------------------------------------------------------------------------*/ 373 374 static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count, 375 uint64_t *z0Ptr, uint64_t *z1Ptr) 376 { 377 *z1Ptr = a1 << count; 378 *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63)); 379 } 380 381 /*---------------------------------------------------------------------------- 382 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the 383 | number of bits given in `count'. 
/*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
| number of bits given in `count'.  Any bits shifted off are lost.  The value
| of `count' may be greater than 64.  The result is broken into two 64-bit
| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
                                uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    if (count >= 64) {
        /* The whole low word moves into the high word. */
        *z1Ptr = 0;
        *z0Ptr = a1 << (count - 64);
    } else if (count == 0) {
        *z1Ptr = a1;
        *z0Ptr = a0;
    } else {
        *z1Ptr = a1 << count;
        *z0Ptr = (a0 << count) | (a1 >> (-count & 63));
    }
}

/*----------------------------------------------------------------------------
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
| by the number of bits given in `count'.  Any bits shifted off are lost.
| The value of `count' must be less than 64.  The result is broken into three
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
| `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void shortShift192Left(uint64_t a0, uint64_t a1, uint64_t a2,
                                     int count, uint64_t *z0Ptr,
                                     uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    uint64_t hi = a0 << count;
    uint64_t mid = a1 << count;
    uint64_t lo = a2 << count;

    if (count > 0) {
        /* Carry the top bits of each lower word into the word above. */
        int neg = -count & 63;
        mid |= a2 >> neg;
        hi |= a1 >> neg;
    }
    *z2Ptr = lo;
    *z1Ptr = mid;
    *z0Ptr = hi;
}
/*----------------------------------------------------------------------------
| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
| value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
| any carry out is lost.  The result is broken into two 64-bit pieces which
| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                          uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    bool c = 0;
    /* Add the low words first; uadd64_carry (from "qemu/host-utils.h")
     * updates `c' so the carry ripples into the high-word addition. */
    *z1Ptr = uadd64_carry(a1, b1, &c);
    *z0Ptr = uadd64_carry(a0, b0, &c);
}

/*----------------------------------------------------------------------------
| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
| 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
| modulo 2^192, so any carry out is lost.  The result is broken into three
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
| `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
                          uint64_t b0, uint64_t b1, uint64_t b2,
                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    bool c = 0;
    /* Ripple the carry from the least- to the most-significant word. */
    *z2Ptr = uadd64_carry(a2, b2, &c);
    *z1Ptr = uadd64_carry(a1, b1, &c);
    *z0Ptr = uadd64_carry(a0, b0, &c);
}
/*----------------------------------------------------------------------------
| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
| 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
| 2^128, so any borrow out (carry out) is lost.  The result is broken into two
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
| `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                          uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    bool c = 0;
    /* Subtract the low words first; usub64_borrow (from "qemu/host-utils.h")
     * updates `c' so the borrow propagates into the high-word subtraction. */
    *z1Ptr = usub64_borrow(a1, b1, &c);
    *z0Ptr = usub64_borrow(a0, b0, &c);
}

/*----------------------------------------------------------------------------
| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
| Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
| result is broken into three 64-bit pieces which are stored at the locations
| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
                          uint64_t b0, uint64_t b1, uint64_t b2,
                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    bool c = 0;
    /* Ripple the borrow from the least- to the most-significant word. */
    *z2Ptr = usub64_borrow(a2, b2, &c);
    *z1Ptr = usub64_borrow(a1, b1, &c);
    *z0Ptr = usub64_borrow(a0, b0, &c);
}

/*----------------------------------------------------------------------------
| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
| into two 64-bit pieces which are stored at the locations pointed to by
| `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void
mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    /* mulu64 (from "qemu/host-utils.h") computes the full 64x64->128
     * product; it takes (low, high) out-pointers, hence the swapped order. */
    mulu64(z1Ptr, z0Ptr, a, b);
}
/*----------------------------------------------------------------------------
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
| `b' to obtain a 192-bit product.  The product is broken into three 64-bit
| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
| `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void
mul128By64To192(uint64_t a0, uint64_t a1, uint64_t b,
                uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    uint64_t z0, z1, m1;

    /* a1*b contributes to the low two words, a0*b to the high two;
     * the overlapping middle word m1 is folded in with a 128-bit add. */
    mul64To128(a1, b, &m1, z2Ptr);
    mul64To128(a0, b, &z0, &z1);
    add128(z0, z1, 0, m1, z0Ptr, z1Ptr);
}

/*----------------------------------------------------------------------------
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
| product.  The product is broken into four 64-bit pieces which are stored at
| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
*----------------------------------------------------------------------------*/

static inline void mul128To256(uint64_t a0, uint64_t a1,
                               uint64_t b0, uint64_t b1,
                               uint64_t *z0Ptr, uint64_t *z1Ptr,
                               uint64_t *z2Ptr, uint64_t *z3Ptr)
{
    uint64_t z0, z1, z2;
    uint64_t m0, m1, m2, n1, n2;

    /* Schoolbook multiplication: four 64x64->128 partial products.
     * a1*b1 fixes the low two words, a0*b0 the high two; the cross
     * products a1*b0 and a0*b1 overlap the middle words. */
    mul64To128(a1, b0, &m1, &m2);
    mul64To128(a0, b1, &n1, &n2);
    mul64To128(a1, b1, &z2, z3Ptr);
    mul64To128(a0, b0, &z0, &z1);

    /* Sum the two cross products, then fold them into the result;
     * the 192-bit adds capture the carries between the middle words. */
    add192( 0, m1, m2, 0, n1, n2, &m0, &m1, &m2);
    add192(m0, m1, m2, z0, z1, z2, z0Ptr, z1Ptr, z2Ptr);
}
/*----------------------------------------------------------------------------
| Returns an approximation to the 64-bit integer quotient obtained by dividing
| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
| toward zero, the approximation returned lies between q and q + 2 inclusive.
| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
| unsigned integer is returned.
*----------------------------------------------------------------------------*/

static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
{
    uint64_t b0, b1;
    uint64_t rem0, rem1, term0, term1;
    uint64_t z;

    /* Quotient would not fit in 64 bits: saturate. */
    if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
    b0 = b>>32;
    /* First 32-bit quotient digit, estimated from the high halves. */
    z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
    mul64To128( b, z, &term0, &term1 );
    sub128( a0, a1, term0, term1, &rem0, &rem1 );
    /* The estimate may be too large; step z down until the partial
     * remainder becomes non-negative again. */
    while ( ( (int64_t) rem0 ) < 0 ) {
        z -= UINT64_C(0x100000000);
        b1 = b<<32;
        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
    }
    rem0 = ( rem0<<32 ) | ( rem1>>32 );
    /* Second 32-bit quotient digit, from the shifted remainder. */
    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
    return z;

}

/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 *
 * Licensed under the GPLv2/LGPLv3
 */
/* Divides the 128-bit value n1:n0 by `d', returning the 64-bit quotient and
 * storing the 64-bit remainder through `r'.
 * NOTE(review): as with GMP's __udiv_qrnnd, this assumes n1 < d so the
 * quotient fits in 64 bits; the generic fallback divides by d>>32 and so
 * additionally appears to require d >= 2^32 -- confirm against callers. */
static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
                                  uint64_t n0, uint64_t d)
{
#if defined(__x86_64__)
    uint64_t q;
    /* DIVQ divides RDX:RAX by the operand, leaving quotient in RAX and
     * remainder in RDX. */
    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
    return q;
#elif defined(__s390x__) && !defined(__clang__)
    /* Need to use a TImode type to get an even register pair for DLGR.  */
    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
    *r = n >> 64;
    return n;
#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
    /* From Power ISA 2.06, programming note for divdeu.  */
    uint64_t q1, q2, Q, r1, r2, R;
    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
        : "=&r"(q1), "=r"(q2)
        : "r"(n1), "r"(n0), "r"(d));
    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
    r2 = n0 - (q2 * d);
    Q = q1 + q2;
    R = r1 + r2;
    if (R >= d || R < r2) { /* overflow implies R > d */
        Q += 1;
        R -= d;
    }
    *r = R;
    return Q;
#else
    /* Generic fallback: long division with 32-bit digits, dividing by the
     * high half of `d' and then correcting each trial quotient digit. */
    uint64_t d0, d1, q0, q1, r1, r0, m;

    d0 = (uint32_t)d;
    d1 = d >> 32;

    r1 = n1 % d1;
    q1 = n1 / d1;
    m = q1 * d0;
    r1 = (r1 << 32) | (n0 >> 32);
    if (r1 < m) {
        /* Trial digit was too big: decrement (at most twice). */
        q1 -= 1;
        r1 += d;
        if (r1 >= d) {
            if (r1 < m) {
                q1 -= 1;
                r1 += d;
            }
        }
    }
    r1 -= m;

    r0 = r1 % d1;
    q0 = r1 / d1;
    m = q0 * d0;
    r0 = (r0 << 32) | (uint32_t)n0;
    if (r0 < m) {
        /* Same correction for the low quotient digit. */
        q0 -= 1;
        r0 += d;
        if (r0 >= d) {
            if (r0 < m) {
                q0 -= 1;
                r0 += d;
            }
        }
    }
    r0 -= m;

    *r = r0;
    return (q1 << 32) | q0;
#endif
}
/*----------------------------------------------------------------------------
| Returns an approximation to the square root of the 32-bit significand given
| by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
| `aExp' (the least significant bit) is 1, the integer returned approximates
| 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
| case, the approximation returned lies strictly within +/-2 of the exact
| value.
*----------------------------------------------------------------------------*/

static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
{
    static const uint16_t sqrtOddAdjustments[] = {
        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
    };
    static const uint16_t sqrtEvenAdjustments[] = {
        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
    };
    /* Table index comes from the four bits just below a's leading bit. */
    unsigned tableIndex = (a >> 27) & 15;
    uint32_t estimate;

    if (aExp & 1) {
        estimate = 0x4000 + (a >> 17) - sqrtOddAdjustments[tableIndex];
        estimate = ((a / estimate) << 14) + (estimate << 15);
        a >>= 1;
    } else {
        estimate = 0x8000 + (a >> 17) - sqrtEvenAdjustments[tableIndex];
        estimate = a / estimate + estimate;
        estimate = (estimate >= 0x20000) ? 0xFFFF8000 : (estimate << 15);
        if (estimate <= a) {
            return (uint32_t)(((int32_t)a) >> 1);
        }
    }
    /* One Newton-Raphson-style refinement of the initial estimate. */
    return (uint32_t)((((uint64_t)a) << 31) / estimate) + (estimate >> 1);
}

/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
| Otherwise, returns 0.
*----------------------------------------------------------------------------*/

static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    return ((a0 ^ b0) | (a1 ^ b1)) == 0;
}
/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
| Otherwise, returns 0.
*----------------------------------------------------------------------------*/

static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* Compare high words first; the low words only decide ties. */
    if (a0 != b0) {
        return a0 < b0;
    }
    return a1 <= b1;
}

/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
| than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
| returns 0.
*----------------------------------------------------------------------------*/

static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* Compare high words first; the low words only decide ties. */
    if (a0 != b0) {
        return a0 < b0;
    }
    return a1 < b1;
}

/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
| not equal to the 128-bit value formed by concatenating `b0' and `b1'.
| Otherwise, returns 0.
*----------------------------------------------------------------------------*/

static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    return ((a0 ^ b0) | (a1 ^ b1)) != 0;
}

#endif