xref: /openbmc/qemu/include/fpu/softfloat-macros.h (revision da10a907)
1 /*
2  * QEMU float support macros
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 #ifndef FPU_SOFTFLOAT_MACROS_H
83 #define FPU_SOFTFLOAT_MACROS_H
84 
85 #include "fpu/softfloat-types.h"
86 #include "qemu/host-utils.h"
87 
88 /*----------------------------------------------------------------------------
89 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
90 | bits are shifted off, they are ``jammed'' into the least significant bit of
91 | the result by setting the least significant bit to 1.  The value of `count'
92 | can be arbitrarily large; in particular, if `count' is greater than 32, the
93 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
94 | The result is stored in the location pointed to by `zPtr'.
95 *----------------------------------------------------------------------------*/
96 
static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
{
    uint32_t result;

    if (count == 0) {
        result = a;
    } else if (count < 32) {
        /* OR in a sticky bit if any of the shifted-off bits were nonzero. */
        uint32_t dropped = a << ((-count) & 31);
        result = (a >> count) | (dropped != 0);
    } else {
        /* Everything is shifted out; only the sticky bit can survive. */
        result = (a != 0);
    }
    *zPtr = result;
}
113 
114 /*----------------------------------------------------------------------------
115 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
116 | bits are shifted off, they are ``jammed'' into the least significant bit of
117 | the result by setting the least significant bit to 1.  The value of `count'
118 | can be arbitrarily large; in particular, if `count' is greater than 64, the
119 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
120 | The result is stored in the location pointed to by `zPtr'.
121 *----------------------------------------------------------------------------*/
122 
static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
{
    uint64_t result;

    if (count == 0) {
        result = a;
    } else if (count < 64) {
        /* OR in a sticky bit if any of the shifted-off bits were nonzero. */
        uint64_t dropped = a << ((-count) & 63);
        result = (a >> count) | (dropped != 0);
    } else {
        /* Everything is shifted out; only the sticky bit can survive. */
        result = (a != 0);
    }
    *zPtr = result;
}
139 
140 /*----------------------------------------------------------------------------
141 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
142 | _plus_ the number of bits given in `count'.  The shifted result is at most
143 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
144 | bits shifted off form a second 64-bit result as follows:  The _last_ bit
145 | shifted off is the most-significant bit of the extra result, and the other
146 | 63 bits of the extra result are all zero if and only if _all_but_the_last_
147 | bits shifted off were all zero.  This extra result is stored in the location
148 | pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
149 |     (This routine makes more sense if `a0' and `a1' are considered to form a
150 | fixed-point value with binary point between `a0' and `a1'.  This fixed-point
151 | value is shifted right by the number of bits given in `count', and the
152 | integer part of the result is returned at the location pointed to by
153 | `z0Ptr'.  The fractional part of the result may be slightly corrupted as
154 | described above, and is returned at the location pointed to by `z1Ptr'.)
155 *----------------------------------------------------------------------------*/
156 
static inline void shift64ExtraRightJamming(uint64_t a0, uint64_t a1, int count,
                                            uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    uint64_t z0, z1;

    if (count == 0) {
        z0 = a0;
        z1 = a1;
    } else if (count < 64) {
        /* Bits of a0 move into the extra word; any nonzero a1 is sticky. */
        z0 = a0 >> count;
        z1 = (a0 << ((-count) & 63)) | (a1 != 0);
    } else if (count == 64) {
        /* Exactly one word's worth: a0 becomes the extra word. */
        z0 = 0;
        z1 = a0 | (a1 != 0);
    } else {
        /* The whole input collapses to a single sticky bit. */
        z0 = 0;
        z1 = ((a0 | a1) != 0);
    }
    *z1Ptr = z1;
    *z0Ptr = z0;
}
185 
186 /*----------------------------------------------------------------------------
187 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
188 | number of bits given in `count'.  Any bits shifted off are lost.  The value
189 | of `count' can be arbitrarily large; in particular, if `count' is greater
190 | than 128, the result will be 0.  The result is broken into two 64-bit pieces
191 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
192 *----------------------------------------------------------------------------*/
193 
static inline void shift128Right(uint64_t a0, uint64_t a1, int count,
                                 uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    uint64_t z0, z1;

    if (count == 0) {
        z0 = a0;
        z1 = a1;
    } else if (count < 64) {
        /* Bits of a0 spill into the low word; shifted-off bits are lost. */
        z0 = a0 >> count;
        z1 = (a0 << ((-count) & 63)) | (a1 >> count);
    } else if (count < 128) {
        /* a1 is gone entirely; only part of a0 survives in the low word. */
        z0 = 0;
        z1 = a0 >> (count & 63);
    } else {
        z0 = 0;
        z1 = 0;
    }
    *z1Ptr = z1;
    *z0Ptr = z0;
}
217 
218 /*----------------------------------------------------------------------------
219 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
220 | number of bits given in `count'.  If any nonzero bits are shifted off, they
221 | are ``jammed'' into the least significant bit of the result by setting the
222 | least significant bit to 1.  The value of `count' can be arbitrarily large;
223 | in particular, if `count' is greater than 128, the result will be either
224 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
225 | nonzero.  The result is broken into two 64-bit pieces which are stored at
226 | the locations pointed to by `z0Ptr' and `z1Ptr'.
227 *----------------------------------------------------------------------------*/
228 
static inline void shift128RightJamming(uint64_t a0, uint64_t a1, int count,
                                        uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    int negCount = (-count) & 63;
    uint64_t z0, z1;

    if (count == 0) {
        z0 = a0;
        z1 = a1;
    } else if (count < 64) {
        /* Shift across the word boundary, jamming dropped a1 bits. */
        z0 = a0 >> count;
        z1 = (a0 << negCount) | (a1 >> count) | ((a1 << negCount) != 0);
    } else if (count == 64) {
        z0 = 0;
        z1 = a0 | (a1 != 0);
    } else if (count < 128) {
        /* Only part of a0 survives; everything below it becomes sticky. */
        z0 = 0;
        z1 = (a0 >> (count & 63)) | (((a0 << negCount) | a1) != 0);
    } else {
        /* The whole input collapses to a single sticky bit. */
        z0 = 0;
        z1 = ((a0 | a1) != 0);
    }
    *z1Ptr = z1;
    *z0Ptr = z0;
}
260 
261 /*----------------------------------------------------------------------------
262 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
263 | by 64 _plus_ the number of bits given in `count'.  The shifted result is
264 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
265 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
266 | off form a third 64-bit result as follows:  The _last_ bit shifted off is
267 | the most-significant bit of the extra result, and the other 63 bits of the
268 | extra result are all zero if and only if _all_but_the_last_ bits shifted off
269 | were all zero.  This extra result is stored in the location pointed to by
270 | `z2Ptr'.  The value of `count' can be arbitrarily large.
271 |     (This routine makes more sense if `a0', `a1', and `a2' are considered
272 | to form a fixed-point value with binary point between `a1' and `a2'.  This
273 | fixed-point value is shifted right by the number of bits given in `count',
274 | and the integer part of the result is returned at the locations pointed to
275 | by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
276 | corrupted as described above, and is returned at the location pointed to by
277 | `z2Ptr'.)
278 *----------------------------------------------------------------------------*/
279 
static inline void
 shift128ExtraRightJamming(
     uint64_t a0,
     uint64_t a1,
     uint64_t a2,
     int count,
     uint64_t *z0Ptr,
     uint64_t *z1Ptr,
     uint64_t *z2Ptr
 )
{
    uint64_t z0, z1, z2;
    int8_t negCount = ( - count ) & 63;

    if ( count == 0 ) {
        /* No shift: pass all three words through unchanged. */
        z2 = a2;
        z1 = a1;
        z0 = a0;
    }
    else {
        if ( count < 64 ) {
            /* a1's low bits fall into the extra word; a0 spills into z1. */
            z2 = a1<<negCount;
            z1 = ( a0<<negCount ) | ( a1>>count );
            z0 = a0>>count;
        }
        else {
            if ( count == 64 ) {
                /* Exactly one word's worth: a1 becomes the extra word. */
                z2 = a1;
                z1 = a0;
            }
            else {
                /* count > 64: all of a1 is shifted past the extra word; fold
                 * it into a2 so the sticky-bit OR below accounts for it. */
                a2 |= a1;
                if ( count < 128 ) {
                    z2 = a0<<negCount;
                    z1 = a0>>( count & 63 );
                }
                else {
                    /* count == 128 leaves a0 as the extra word; beyond that
                     * a0 itself collapses to a sticky bit. */
                    z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
                    z1 = 0;
                }
            }
            z0 = 0;
        }
        /* Jam: any nonzero bits shifted past the extra word set its LSB. */
        z2 |= ( a2 != 0 );
    }
    *z2Ptr = z2;
    *z1Ptr = z1;
    *z0Ptr = z0;

}
330 
331 /*----------------------------------------------------------------------------
332 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
333 | number of bits given in `count'.  Any bits shifted off are lost.  The value
334 | of `count' must be less than 64.  The result is broken into two 64-bit
335 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
336 *----------------------------------------------------------------------------*/
337 
static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
                                     uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    uint64_t lo = a1 << count;
    uint64_t hi = a0 << count;

    /* For a nonzero shift, a1's high bits carry into the high word.
     * (count must be < 64, per the contract above.) */
    if (count != 0) {
        hi |= a1 >> ((-count) & 63);
    }
    *z1Ptr = lo;
    *z0Ptr = hi;
}
344 
345 /*----------------------------------------------------------------------------
346 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
347 | number of bits given in `count'.  Any bits shifted off are lost.  The value
348 | of `count' may be greater than 64.  The result is broken into two 64-bit
349 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
350 *----------------------------------------------------------------------------*/
351 
static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
                                uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    if (count < 64) {
        *z1Ptr = a1 << count;
        *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
    } else if (count < 128) {
        /* a0 is shifted out entirely; a1 supplies the high word. */
        *z1Ptr = 0;
        *z0Ptr = a1 << (count - 64);
    } else {
        /*
         * count >= 128: every input bit is shifted out.  The previous code
         * evaluated a1 << (count - 64) here, which is undefined behavior
         * once the shift amount reaches 64; return zero explicitly instead.
         */
        *z1Ptr = 0;
        *z0Ptr = 0;
    }
}
363 
364 /*----------------------------------------------------------------------------
365 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
366 | by the number of bits given in `count'.  Any bits shifted off are lost.
367 | The value of `count' must be less than 64.  The result is broken into three
368 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
369 | `z1Ptr', and `z2Ptr'.
370 *----------------------------------------------------------------------------*/
371 
static inline void shortShift192Left(uint64_t a0, uint64_t a1, uint64_t a2,
                                     int count, uint64_t *z0Ptr,
                                     uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    uint64_t z0 = a0 << count;
    uint64_t z1 = a1 << count;
    uint64_t z2 = a2 << count;

    /* For a nonzero shift, each lower word carries into the one above.
     * (count must be < 64, per the contract above.) */
    if (count > 0) {
        int negCount = (-count) & 63;
        z1 |= a2 >> negCount;
        z0 |= a1 >> negCount;
    }
    *z2Ptr = z2;
    *z1Ptr = z1;
    *z0Ptr = z0;
}
399 
400 /*----------------------------------------------------------------------------
401 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
402 | value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
403 | any carry out is lost.  The result is broken into two 64-bit pieces which
404 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
405 *----------------------------------------------------------------------------*/
406 
static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                          uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    bool carry = false;

    /* Sum the low words first so the carry can ripple into the high words. */
    *z1Ptr = uadd64_carry(a1, b1, &carry);
    *z0Ptr = uadd64_carry(a0, b0, &carry);
}
414 
415 /*----------------------------------------------------------------------------
416 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
417 | 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
418 | modulo 2^192, so any carry out is lost.  The result is broken into three
419 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
420 | `z1Ptr', and `z2Ptr'.
421 *----------------------------------------------------------------------------*/
422 
static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
                          uint64_t b0, uint64_t b1, uint64_t b2,
                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    bool carry = false;

    /* Ripple the carry upward from the least significant word. */
    *z2Ptr = uadd64_carry(a2, b2, &carry);
    *z1Ptr = uadd64_carry(a1, b1, &carry);
    *z0Ptr = uadd64_carry(a0, b0, &carry);
}
432 
433 /*----------------------------------------------------------------------------
434 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
435 | 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
436 | 2^128, so any borrow out (carry out) is lost.  The result is broken into two
437 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
438 | `z1Ptr'.
439 *----------------------------------------------------------------------------*/
440 
static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                          uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    bool borrow = false;

    /* Subtract the low words first so the borrow propagates upward. */
    *z1Ptr = usub64_borrow(a1, b1, &borrow);
    *z0Ptr = usub64_borrow(a0, b0, &borrow);
}
448 
449 /*----------------------------------------------------------------------------
450 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
451 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
452 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
453 | result is broken into three 64-bit pieces which are stored at the locations
454 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
455 *----------------------------------------------------------------------------*/
456 
static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
                          uint64_t b0, uint64_t b1, uint64_t b2,
                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    bool borrow = false;

    /* Ripple the borrow upward from the least significant word. */
    *z2Ptr = usub64_borrow(a2, b2, &borrow);
    *z1Ptr = usub64_borrow(a1, b1, &borrow);
    *z0Ptr = usub64_borrow(a0, b0, &borrow);
}
466 
467 /*----------------------------------------------------------------------------
468 | Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
469 | into two 64-bit pieces which are stored at the locations pointed to by
470 | `z0Ptr' and `z1Ptr'.
471 *----------------------------------------------------------------------------*/
472 
static inline void mul64To128(uint64_t a, uint64_t b,
                              uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    /* Schoolbook 64x64->128 multiply built from four 32x32->64 products. */
    uint32_t aHi = a >> 32, aLo = (uint32_t)a;
    uint32_t bHi = b >> 32, bLo = (uint32_t)b;
    uint64_t lo, hi, mid, midB;

    lo = (uint64_t)aLo * bLo;
    mid = (uint64_t)aLo * bHi;
    midB = (uint64_t)aHi * bLo;
    hi = (uint64_t)aHi * bHi;

    /* Combine the two middle products, propagating their carry into hi. */
    mid += midB;
    hi += ((uint64_t)(mid < midB) << 32) + (mid >> 32);

    /* Fold the low half of the middle term into lo, carrying into hi. */
    mid <<= 32;
    lo += mid;
    hi += (lo < mid);

    *z1Ptr = lo;
    *z0Ptr = hi;
}
495 
496 /*----------------------------------------------------------------------------
497 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
498 | `b' to obtain a 192-bit product.  The product is broken into three 64-bit
499 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
500 | `z2Ptr'.
501 *----------------------------------------------------------------------------*/
502 
static inline void mul128By64To192(uint64_t a0, uint64_t a1, uint64_t b,
                                   uint64_t *z0Ptr, uint64_t *z1Ptr,
                                   uint64_t *z2Ptr)
{
    uint64_t hi, mid, lo, carryMid;

    /* Two 64x64->128 partial products; the high product's low half
     * overlaps the low product's high half and must be summed in. */
    mul64To128(a1, b, &mid, &lo);
    mul64To128(a0, b, &hi, &carryMid);
    add128(hi, carryMid, 0, mid, &hi, &mid);

    *z2Ptr = lo;
    *z1Ptr = mid;
    *z0Ptr = hi;
}
523 
524 /*----------------------------------------------------------------------------
525 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
526 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
527 | product.  The product is broken into four 64-bit pieces which are stored at
528 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
529 *----------------------------------------------------------------------------*/
530 
static inline void
 mul128To256(
     uint64_t a0,
     uint64_t a1,
     uint64_t b0,
     uint64_t b1,
     uint64_t *z0Ptr,
     uint64_t *z1Ptr,
     uint64_t *z2Ptr,
     uint64_t *z3Ptr
 )
{
    uint64_t z0, z1, z2, z3;
    uint64_t more1, more2;

    /* Schoolbook 128x128->256 multiply: form the four 64x64->128 partial
     * products and accumulate them with add128, which also propagates each
     * carry into the next-higher word.  The statement order matters: every
     * add128 consumes results of the calls before it. */
    mul64To128( a1, b1, &z2, &z3 );       /* a1*b1 -> z2:z3 (lowest)      */
    mul64To128( a1, b0, &z1, &more2 );    /* a1*b0 -> z1:more2            */
    add128( z1, more2, 0, z2, &z1, &z2 );
    mul64To128( a0, b0, &z0, &more1 );    /* a0*b0 -> z0:more1 (highest)  */
    add128( z0, more1, 0, z1, &z0, &z1 );
    mul64To128( a0, b1, &more1, &more2 ); /* a0*b1 -> more1:more2         */
    add128( more1, more2, 0, z2, &more1, &z2 );
    add128( z0, z1, 0, more1, &z0, &z1 );
    *z3Ptr = z3;
    *z2Ptr = z2;
    *z1Ptr = z1;
    *z0Ptr = z0;

}
560 
561 /*----------------------------------------------------------------------------
562 | Returns an approximation to the 64-bit integer quotient obtained by dividing
563 | `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
564 | divisor `b' must be at least 2^63.  If q is the exact quotient truncated
565 | toward zero, the approximation returned lies between q and q + 2 inclusive.
566 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
567 | unsigned integer is returned.
568 *----------------------------------------------------------------------------*/
569 
static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
{
    uint64_t b0, b1;
    uint64_t rem0, rem1, term0, term1;
    uint64_t z;

    /* Quotient would not fit in 64 bits: saturate, per the contract above. */
    if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
    b0 = b>>32;
    /* First guess at the high 32 quotient bits, dividing by just the top
     * 32 bits of b; saturate the partial quotient if it would overflow. */
    z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
    mul64To128( b, z, &term0, &term1 );
    sub128( a0, a1, term0, term1, &rem0, &rem1 );
    /* The guess may be too large; step it down (adding back b, scaled by
     * 2^32 to match the partial quotient's position as b0:b1 = b<<32)
     * until the remainder turns non-negative again. */
    while ( ( (int64_t) rem0 ) < 0 ) {
        z -= UINT64_C(0x100000000);
        b1 = b<<32;
        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
    }
    rem0 = ( rem0<<32 ) | ( rem1>>32 );
    /* Estimate the low 32 quotient bits the same way from the remainder. */
    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
    return z;

}
591 
592 /* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
593  * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
594  *
595  * Licensed under the GPLv2/LGPLv3
596  */
static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
                                  uint64_t n0, uint64_t d)
{
    /*
     * Divide the 128-bit value n1:n0 by d, returning the 64-bit quotient
     * and storing the 64-bit remainder in *r.
     * NOTE(review): as with GMP's __udiv_qrnnd, the caller appears to be
     * expected to guarantee n1 < d (so the quotient fits in 64 bits); the
     * generic path below also divides by d >> 32 and so needs d's upper
     * half nonzero -- confirm against callers before reusing elsewhere.
     */
#if defined(__x86_64__)
    uint64_t q;
    /* divq divides rdx:rax (here n1:n0) by the operand; quotient lands in
     * rax, remainder in rdx. */
    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
    return q;
#elif defined(__s390x__) && !defined(__clang__)
    /* Need to use a TImode type to get an even register pair for DLGR.  */
    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
    *r = n >> 64;
    return n;
#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
    /* From Power ISA 2.06, programming note for divdeu.  */
    uint64_t q1, q2, Q, r1, r2, R;
    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
        : "=&r"(q1), "=r"(q2)
        : "r"(n1), "r"(n0), "r"(d));
    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
    r2 = n0 - (q2 * d);
    Q = q1 + q2;
    R = r1 + r2;
    if (R >= d || R < r2) { /* overflow implies R > d */
        Q += 1;
        R -= d;
    }
    *r = R;
    return Q;
#else
    /* Generic schoolbook fallback: produce the quotient 32 bits at a time
     * by dividing by the top half of d and then correcting the estimate. */
    uint64_t d0, d1, q0, q1, r1, r0, m;

    d0 = (uint32_t)d;
    d1 = d >> 32;

    /* High quotient word: estimate from n1 / d1, then step q1 down (at
     * most twice) to account for the ignored low half d0. */
    r1 = n1 % d1;
    q1 = n1 / d1;
    m = q1 * d0;
    r1 = (r1 << 32) | (n0 >> 32);
    if (r1 < m) {
        q1 -= 1;
        r1 += d;
        if (r1 >= d) {      /* r1 + d did not wrap: correction may repeat */
            if (r1 < m) {
                q1 -= 1;
                r1 += d;
            }
        }
    }
    r1 -= m;

    /* Low quotient word: the same estimate-and-correct step applied to
     * the running remainder combined with the low half of n0. */
    r0 = r1 % d1;
    q0 = r1 / d1;
    m = q0 * d0;
    r0 = (r0 << 32) | (uint32_t)n0;
    if (r0 < m) {
        q0 -= 1;
        r0 += d;
        if (r0 >= d) {
            if (r0 < m) {
                q0 -= 1;
                r0 += d;
            }
        }
    }
    r0 -= m;

    *r = r0;
    return (q1 << 32) | q0;
#endif
}
668 
669 /*----------------------------------------------------------------------------
670 | Returns an approximation to the square root of the 32-bit significand given
671 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
672 | `aExp' (the least significant bit) is 1, the integer returned approximates
673 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
674 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
675 | case, the approximation returned lies strictly within +/-2 of the exact
676 | value.
677 *----------------------------------------------------------------------------*/
678 
static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
{
    /* Seed tables indexed by the 4 bits below a's (always-set) top bit. */
    static const uint16_t sqrtOddAdjustments[] = {
        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
    };
    static const uint16_t sqrtEvenAdjustments[] = {
        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
    };
    int index = (a >> 27) & 15;
    uint32_t z;

    if (aExp & 1) {
        /* Odd exponent: table-seeded estimate, then one refinement step. */
        z = 0x4000 + (a >> 17) - sqrtOddAdjustments[index];
        z = ((a / z) << 14) + (z << 15);
        a >>= 1;
    } else {
        /* Even exponent: seed, average with a/z, and clamp the estimate. */
        z = 0x8000 + (a >> 17) - sqrtEvenAdjustments[index];
        z = a / z + z;
        z = (z >= 0x20000) ? 0xFFFF8000 : (z << 15);
        if (z <= a) {
            /* Arithmetic shift keeps the top bit set (a >= 2^31). */
            return (uint32_t)(((int32_t)a) >> 1);
        }
    }
    /* Final Newton-style step: (a/z + z) / 2, in fixed point. */
    return ((uint32_t)(((uint64_t)a << 31) / z)) + (z >> 1);
}
707 
708 /*----------------------------------------------------------------------------
709 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
710 | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
711 | Otherwise, returns 0.
712 *----------------------------------------------------------------------------*/
713 
static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* Equal exactly when neither half differs. */
    return ((a0 ^ b0) | (a1 ^ b1)) == 0;
}
718 
719 /*----------------------------------------------------------------------------
720 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
721 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
722 | Otherwise, returns 0.
723 *----------------------------------------------------------------------------*/
724 
static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* Compare high halves first; the low halves only break ties. */
    if (a0 != b0) {
        return a0 < b0;
    }
    return a1 <= b1;
}
729 
730 /*----------------------------------------------------------------------------
731 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
732 | than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
733 | returns 0.
734 *----------------------------------------------------------------------------*/
735 
static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* Compare high halves first; the low halves only break ties. */
    if (a0 != b0) {
        return a0 < b0;
    }
    return a1 < b1;
}
740 
741 /*----------------------------------------------------------------------------
742 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
743 | not equal to the 128-bit value formed by concatenating `b0' and `b1'.
744 | Otherwise, returns 0.
745 *----------------------------------------------------------------------------*/
746 
static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* Unequal when any bit differs in either half. */
    return ((a0 ^ b0) | (a1 ^ b1)) != 0;
}
751 
752 #endif
753