xref: /openbmc/qemu/include/fpu/softfloat-macros.h (revision cc1838c2)
1 /*
2  * QEMU float support macros
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 #ifndef FPU_SOFTFLOAT_MACROS_H
83 #define FPU_SOFTFLOAT_MACROS_H
84 
85 #include "fpu/softfloat-types.h"
86 #include "qemu/host-utils.h"
87 
88 /**
89  * shl_double: double-word merging left shift
90  * @l: left or most-significant word
91  * @r: right or least-significant word
92  * @c: shift count
93  *
94  * Shift @l left by @c bits, shifting in bits from @r.
95  */
96 static inline uint64_t shl_double(uint64_t l, uint64_t r, int c)
97 {
98 #if defined(__x86_64__)
99     asm("shld %b2, %1, %0" : "+r"(l) : "r"(r), "ci"(c));
100     return l;
101 #else
102     return c ? (l << c) | (r >> (64 - c)) : l;
103 #endif
104 }
105 
106 /**
107  * shr_double: double-word merging right shift
108  * @l: left or most-significant word
109  * @r: right or least-significant word
110  * @c: shift count
111  *
112  * Shift @r right by @c bits, shifting in bits from @l.
113  */
114 static inline uint64_t shr_double(uint64_t l, uint64_t r, int c)
115 {
116 #if defined(__x86_64__)
117     asm("shrd %b2, %1, %0" : "+r"(r) : "r"(l), "ci"(c));
118     return r;
119 #else
120     return c ? (r >> c) | (l << (64 - c)) : r;
121 #endif
122 }
123 
124 /*----------------------------------------------------------------------------
125 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
126 | bits are shifted off, they are ``jammed'' into the least significant bit of
127 | the result by setting the least significant bit to 1.  The value of `count'
128 | can be arbitrarily large; in particular, if `count' is greater than 32, the
129 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
130 | The result is stored in the location pointed to by `zPtr'.
131 *----------------------------------------------------------------------------*/
132 
133 static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
134 {
135     uint32_t z;
136 
137     if ( count == 0 ) {
138         z = a;
139     }
140     else if ( count < 32 ) {
141         z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
142     }
143     else {
144         z = ( a != 0 );
145     }
146     *zPtr = z;
147 
148 }
149 
150 /*----------------------------------------------------------------------------
151 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
152 | bits are shifted off, they are ``jammed'' into the least significant bit of
153 | the result by setting the least significant bit to 1.  The value of `count'
154 | can be arbitrarily large; in particular, if `count' is greater than 64, the
155 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
156 | The result is stored in the location pointed to by `zPtr'.
157 *----------------------------------------------------------------------------*/
158 
159 static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
160 {
161     uint64_t z;
162 
163     if ( count == 0 ) {
164         z = a;
165     }
166     else if ( count < 64 ) {
167         z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
168     }
169     else {
170         z = ( a != 0 );
171     }
172     *zPtr = z;
173 
174 }
175 
176 /*----------------------------------------------------------------------------
177 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
178 | _plus_ the number of bits given in `count'.  The shifted result is at most
179 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
180 | bits shifted off form a second 64-bit result as follows:  The _last_ bit
181 | shifted off is the most-significant bit of the extra result, and the other
182 | 63 bits of the extra result are all zero if and only if _all_but_the_last_
183 | bits shifted off were all zero.  This extra result is stored in the location
184 | pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
185 |     (This routine makes more sense if `a0' and `a1' are considered to form a
186 | fixed-point value with binary point between `a0' and `a1'.  This fixed-point
187 | value is shifted right by the number of bits given in `count', and the
188 | integer part of the result is returned at the location pointed to by
189 | `z0Ptr'.  The fractional part of the result may be slightly corrupted as
190 | described above, and is returned at the location pointed to by `z1Ptr'.)
191 *----------------------------------------------------------------------------*/
192 
193 static inline void
194  shift64ExtraRightJamming(
195      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
196 {
197     uint64_t z0, z1;
198     int8_t negCount = ( - count ) & 63;
199 
200     if ( count == 0 ) {
201         z1 = a1;
202         z0 = a0;
203     }
204     else if ( count < 64 ) {
205         z1 = ( a0<<negCount ) | ( a1 != 0 );
206         z0 = a0>>count;
207     }
208     else {
209         if ( count == 64 ) {
210             z1 = a0 | ( a1 != 0 );
211         }
212         else {
213             z1 = ( ( a0 | a1 ) != 0 );
214         }
215         z0 = 0;
216     }
217     *z1Ptr = z1;
218     *z0Ptr = z0;
219 
220 }
221 
222 /*----------------------------------------------------------------------------
223 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
224 | number of bits given in `count'.  Any bits shifted off are lost.  The value
225 | of `count' can be arbitrarily large; in particular, if `count' is greater
226 | than 128, the result will be 0.  The result is broken into two 64-bit pieces
227 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
228 *----------------------------------------------------------------------------*/
229 
230 static inline void
231  shift128Right(
232      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
233 {
234     uint64_t z0, z1;
235     int8_t negCount = ( - count ) & 63;
236 
237     if ( count == 0 ) {
238         z1 = a1;
239         z0 = a0;
240     }
241     else if ( count < 64 ) {
242         z1 = ( a0<<negCount ) | ( a1>>count );
243         z0 = a0>>count;
244     }
245     else {
246         z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
247         z0 = 0;
248     }
249     *z1Ptr = z1;
250     *z0Ptr = z0;
251 
252 }
253 
254 /*----------------------------------------------------------------------------
255 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
256 | number of bits given in `count'.  If any nonzero bits are shifted off, they
257 | are ``jammed'' into the least significant bit of the result by setting the
258 | least significant bit to 1.  The value of `count' can be arbitrarily large;
259 | in particular, if `count' is greater than 128, the result will be either
260 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
261 | nonzero.  The result is broken into two 64-bit pieces which are stored at
262 | the locations pointed to by `z0Ptr' and `z1Ptr'.
263 *----------------------------------------------------------------------------*/
264 
265 static inline void
266  shift128RightJamming(
267      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
268 {
269     uint64_t z0, z1;
270     int8_t negCount = ( - count ) & 63;
271 
272     if ( count == 0 ) {
273         z1 = a1;
274         z0 = a0;
275     }
276     else if ( count < 64 ) {
277         z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
278         z0 = a0>>count;
279     }
280     else {
281         if ( count == 64 ) {
282             z1 = a0 | ( a1 != 0 );
283         }
284         else if ( count < 128 ) {
285             z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
286         }
287         else {
288             z1 = ( ( a0 | a1 ) != 0 );
289         }
290         z0 = 0;
291     }
292     *z1Ptr = z1;
293     *z0Ptr = z0;
294 
295 }
296 
297 /*----------------------------------------------------------------------------
298 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
299 | by 64 _plus_ the number of bits given in `count'.  The shifted result is
300 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
301 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
302 | off form a third 64-bit result as follows:  The _last_ bit shifted off is
303 | the most-significant bit of the extra result, and the other 63 bits of the
304 | extra result are all zero if and only if _all_but_the_last_ bits shifted off
305 | were all zero.  This extra result is stored in the location pointed to by
306 | `z2Ptr'.  The value of `count' can be arbitrarily large.
307 |     (This routine makes more sense if `a0', `a1', and `a2' are considered
308 | to form a fixed-point value with binary point between `a1' and `a2'.  This
309 | fixed-point value is shifted right by the number of bits given in `count',
310 | and the integer part of the result is returned at the locations pointed to
311 | by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
312 | corrupted as described above, and is returned at the location pointed to by
313 | `z2Ptr'.)
314 *----------------------------------------------------------------------------*/
315 
316 static inline void
317  shift128ExtraRightJamming(
318      uint64_t a0,
319      uint64_t a1,
320      uint64_t a2,
321      int count,
322      uint64_t *z0Ptr,
323      uint64_t *z1Ptr,
324      uint64_t *z2Ptr
325  )
326 {
327     uint64_t z0, z1, z2;
328     int8_t negCount = ( - count ) & 63;
329 
330     if ( count == 0 ) {
331         z2 = a2;
332         z1 = a1;
333         z0 = a0;
334     }
335     else {
336         if ( count < 64 ) {
337             z2 = a1<<negCount;
338             z1 = ( a0<<negCount ) | ( a1>>count );
339             z0 = a0>>count;
340         }
341         else {
342             if ( count == 64 ) {
343                 z2 = a1;
344                 z1 = a0;
345             }
346             else {
347                 a2 |= a1;
348                 if ( count < 128 ) {
349                     z2 = a0<<negCount;
350                     z1 = a0>>( count & 63 );
351                 }
352                 else {
353                     z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
354                     z1 = 0;
355                 }
356             }
357             z0 = 0;
358         }
359         z2 |= ( a2 != 0 );
360     }
361     *z2Ptr = z2;
362     *z1Ptr = z1;
363     *z0Ptr = z0;
364 
365 }
366 
367 /*----------------------------------------------------------------------------
368 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
369 | number of bits given in `count'.  Any bits shifted off are lost.  The value
370 | of `count' must be less than 64.  The result is broken into two 64-bit
371 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
372 *----------------------------------------------------------------------------*/
373 
374 static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
375                                      uint64_t *z0Ptr, uint64_t *z1Ptr)
376 {
377     *z1Ptr = a1 << count;
378     *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
379 }
380 
381 /*----------------------------------------------------------------------------
382 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
383 | number of bits given in `count'.  Any bits shifted off are lost.  The value
384 | of `count' may be greater than 64.  The result is broken into two 64-bit
385 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
386 *----------------------------------------------------------------------------*/
387 
388 static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
389                                 uint64_t *z0Ptr, uint64_t *z1Ptr)
390 {
391     if (count < 64) {
392         *z1Ptr = a1 << count;
393         *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
394     } else {
395         *z1Ptr = 0;
396         *z0Ptr = a1 << (count - 64);
397     }
398 }
399 
400 /*----------------------------------------------------------------------------
401 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
402 | by the number of bits given in `count'.  Any bits shifted off are lost.
403 | The value of `count' must be less than 64.  The result is broken into three
404 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
405 | `z1Ptr', and `z2Ptr'.
406 *----------------------------------------------------------------------------*/
407 
408 static inline void
409  shortShift192Left(
410      uint64_t a0,
411      uint64_t a1,
412      uint64_t a2,
413      int count,
414      uint64_t *z0Ptr,
415      uint64_t *z1Ptr,
416      uint64_t *z2Ptr
417  )
418 {
419     uint64_t z0, z1, z2;
420     int8_t negCount;
421 
422     z2 = a2<<count;
423     z1 = a1<<count;
424     z0 = a0<<count;
425     if ( 0 < count ) {
426         negCount = ( ( - count ) & 63 );
427         z1 |= a2>>negCount;
428         z0 |= a1>>negCount;
429     }
430     *z2Ptr = z2;
431     *z1Ptr = z1;
432     *z0Ptr = z0;
433 
434 }
435 
436 /*----------------------------------------------------------------------------
437 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
438 | value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
439 | any carry out is lost.  The result is broken into two 64-bit pieces which
440 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
441 *----------------------------------------------------------------------------*/
442 
443 static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
444                           uint64_t *z0Ptr, uint64_t *z1Ptr)
445 {
446     bool c = 0;
447     *z1Ptr = uadd64_carry(a1, b1, &c);
448     *z0Ptr = uadd64_carry(a0, b0, &c);
449 }
450 
451 /*----------------------------------------------------------------------------
452 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
453 | 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
454 | modulo 2^192, so any carry out is lost.  The result is broken into three
455 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
456 | `z1Ptr', and `z2Ptr'.
457 *----------------------------------------------------------------------------*/
458 
459 static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
460                           uint64_t b0, uint64_t b1, uint64_t b2,
461                           uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
462 {
463     bool c = 0;
464     *z2Ptr = uadd64_carry(a2, b2, &c);
465     *z1Ptr = uadd64_carry(a1, b1, &c);
466     *z0Ptr = uadd64_carry(a0, b0, &c);
467 }
468 
469 /*----------------------------------------------------------------------------
470 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
471 | 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
472 | 2^128, so any borrow out (carry out) is lost.  The result is broken into two
473 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
474 | `z1Ptr'.
475 *----------------------------------------------------------------------------*/
476 
477 static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
478                           uint64_t *z0Ptr, uint64_t *z1Ptr)
479 {
480     bool c = 0;
481     *z1Ptr = usub64_borrow(a1, b1, &c);
482     *z0Ptr = usub64_borrow(a0, b0, &c);
483 }
484 
485 /*----------------------------------------------------------------------------
486 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
487 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
488 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
489 | result is broken into three 64-bit pieces which are stored at the locations
490 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
491 *----------------------------------------------------------------------------*/
492 
493 static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
494                           uint64_t b0, uint64_t b1, uint64_t b2,
495                           uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
496 {
497     bool c = 0;
498     *z2Ptr = usub64_borrow(a2, b2, &c);
499     *z1Ptr = usub64_borrow(a1, b1, &c);
500     *z0Ptr = usub64_borrow(a0, b0, &c);
501 }
502 
503 /*----------------------------------------------------------------------------
504 | Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
505 | into two 64-bit pieces which are stored at the locations pointed to by
506 | `z0Ptr' and `z1Ptr'.
507 *----------------------------------------------------------------------------*/
508 
509 static inline void
510 mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr)
511 {
512     mulu64(z1Ptr, z0Ptr, a, b);
513 }
514 
515 /*----------------------------------------------------------------------------
516 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
517 | `b' to obtain a 192-bit product.  The product is broken into three 64-bit
518 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
519 | `z2Ptr'.
520 *----------------------------------------------------------------------------*/
521 
522 static inline void
523 mul128By64To192(uint64_t a0, uint64_t a1, uint64_t b,
524                 uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
525 {
526     uint64_t z0, z1, m1;
527 
528     mul64To128(a1, b, &m1, z2Ptr);
529     mul64To128(a0, b, &z0, &z1);
530     add128(z0, z1, 0, m1, z0Ptr, z1Ptr);
531 }
532 
533 /*----------------------------------------------------------------------------
534 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
535 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
536 | product.  The product is broken into four 64-bit pieces which are stored at
537 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
538 *----------------------------------------------------------------------------*/
539 
540 static inline void mul128To256(uint64_t a0, uint64_t a1,
541                                uint64_t b0, uint64_t b1,
542                                uint64_t *z0Ptr, uint64_t *z1Ptr,
543                                uint64_t *z2Ptr, uint64_t *z3Ptr)
544 {
545     uint64_t z0, z1, z2;
546     uint64_t m0, m1, m2, n1, n2;
547 
548     mul64To128(a1, b0, &m1, &m2);
549     mul64To128(a0, b1, &n1, &n2);
550     mul64To128(a1, b1, &z2, z3Ptr);
551     mul64To128(a0, b0, &z0, &z1);
552 
553     add192( 0, m1, m2,  0, n1, n2, &m0, &m1, &m2);
554     add192(m0, m1, m2, z0, z1, z2, z0Ptr, z1Ptr, z2Ptr);
555 }
556 
557 /*----------------------------------------------------------------------------
558 | Returns an approximation to the 64-bit integer quotient obtained by dividing
559 | `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
560 | divisor `b' must be at least 2^63.  If q is the exact quotient truncated
561 | toward zero, the approximation returned lies between q and q + 2 inclusive.
562 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
563 | unsigned integer is returned.
564 *----------------------------------------------------------------------------*/
565 
566 static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
567 {
568     uint64_t b0, b1;
569     uint64_t rem0, rem1, term0, term1;
570     uint64_t z;
571 
572     if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
573     b0 = b>>32;
574     z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
575     mul64To128( b, z, &term0, &term1 );
576     sub128( a0, a1, term0, term1, &rem0, &rem1 );
577     while ( ( (int64_t) rem0 ) < 0 ) {
578         z -= UINT64_C(0x100000000);
579         b1 = b<<32;
580         add128( rem0, rem1, b0, b1, &rem0, &rem1 );
581     }
582     rem0 = ( rem0<<32 ) | ( rem1>>32 );
583     z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
584     return z;
585 
586 }
587 
588 /* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
589  * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
590  *
591  * Licensed under the GPLv2/LGPLv3
592  */
593 static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
594                                   uint64_t n0, uint64_t d)
595 {
596 #if defined(__x86_64__)
597     uint64_t q;
598     asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
599     return q;
600 #elif defined(__s390x__) && !defined(__clang__)
601     /* Need to use a TImode type to get an even register pair for DLGR.  */
602     unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
603     asm("dlgr %0, %1" : "+r"(n) : "r"(d));
604     *r = n >> 64;
605     return n;
606 #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
607     /* From Power ISA 2.06, programming note for divdeu.  */
608     uint64_t q1, q2, Q, r1, r2, R;
609     asm("divdeu %0,%2,%4; divdu %1,%3,%4"
610         : "=&r"(q1), "=r"(q2)
611         : "r"(n1), "r"(n0), "r"(d));
612     r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
613     r2 = n0 - (q2 * d);
614     Q = q1 + q2;
615     R = r1 + r2;
616     if (R >= d || R < r2) { /* overflow implies R > d */
617         Q += 1;
618         R -= d;
619     }
620     *r = R;
621     return Q;
622 #else
623     uint64_t d0, d1, q0, q1, r1, r0, m;
624 
625     d0 = (uint32_t)d;
626     d1 = d >> 32;
627 
628     r1 = n1 % d1;
629     q1 = n1 / d1;
630     m = q1 * d0;
631     r1 = (r1 << 32) | (n0 >> 32);
632     if (r1 < m) {
633         q1 -= 1;
634         r1 += d;
635         if (r1 >= d) {
636             if (r1 < m) {
637                 q1 -= 1;
638                 r1 += d;
639             }
640         }
641     }
642     r1 -= m;
643 
644     r0 = r1 % d1;
645     q0 = r1 / d1;
646     m = q0 * d0;
647     r0 = (r0 << 32) | (uint32_t)n0;
648     if (r0 < m) {
649         q0 -= 1;
650         r0 += d;
651         if (r0 >= d) {
652             if (r0 < m) {
653                 q0 -= 1;
654                 r0 += d;
655             }
656         }
657     }
658     r0 -= m;
659 
660     *r = r0;
661     return (q1 << 32) | q0;
662 #endif
663 }
664 
665 /*----------------------------------------------------------------------------
666 | Returns an approximation to the square root of the 32-bit significand given
667 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
668 | `aExp' (the least significant bit) is 1, the integer returned approximates
669 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
670 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
671 | case, the approximation returned lies strictly within +/-2 of the exact
672 | value.
673 *----------------------------------------------------------------------------*/
674 
675 static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
676 {
677     static const uint16_t sqrtOddAdjustments[] = {
678         0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
679         0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
680     };
681     static const uint16_t sqrtEvenAdjustments[] = {
682         0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
683         0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
684     };
685     int8_t index;
686     uint32_t z;
687 
688     index = ( a>>27 ) & 15;
689     if ( aExp & 1 ) {
690         z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
691         z = ( ( a / z )<<14 ) + ( z<<15 );
692         a >>= 1;
693     }
694     else {
695         z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
696         z = a / z + z;
697         z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
698         if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
699     }
700     return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
701 
702 }
703 
704 /*----------------------------------------------------------------------------
705 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
706 | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
707 | Otherwise, returns 0.
708 *----------------------------------------------------------------------------*/
709 
710 static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
711 {
712     return a0 == b0 && a1 == b1;
713 }
714 
715 /*----------------------------------------------------------------------------
716 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
717 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
718 | Otherwise, returns 0.
719 *----------------------------------------------------------------------------*/
720 
721 static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
722 {
723     return a0 < b0 || (a0 == b0 && a1 <= b1);
724 }
725 
726 /*----------------------------------------------------------------------------
727 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
728 | than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
729 | returns 0.
730 *----------------------------------------------------------------------------*/
731 
732 static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
733 {
734     return a0 < b0 || (a0 == b0 && a1 < b1);
735 }
736 
737 /*----------------------------------------------------------------------------
738 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
739 | not equal to the 128-bit value formed by concatenating `b0' and `b1'.
740 | Otherwise, returns 0.
741 *----------------------------------------------------------------------------*/
742 
743 static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
744 {
745     return a0 != b0 || a1 != b1;
746 }
747 
748 /*
749  * Similarly, comparisons of 192-bit values.
750  */
751 
752 static inline bool eq192(uint64_t a0, uint64_t a1, uint64_t a2,
753                          uint64_t b0, uint64_t b1, uint64_t b2)
754 {
755     return ((a0 ^ b0) | (a1 ^ b1) | (a2 ^ b2)) == 0;
756 }
757 
758 static inline bool le192(uint64_t a0, uint64_t a1, uint64_t a2,
759                          uint64_t b0, uint64_t b1, uint64_t b2)
760 {
761     if (a0 != b0) {
762         return a0 < b0;
763     }
764     if (a1 != b1) {
765         return a1 < b1;
766     }
767     return a2 <= b2;
768 }
769 
770 static inline bool lt192(uint64_t a0, uint64_t a1, uint64_t a2,
771                          uint64_t b0, uint64_t b1, uint64_t b2)
772 {
773     if (a0 != b0) {
774         return a0 < b0;
775     }
776     if (a1 != b1) {
777         return a1 < b1;
778     }
779     return a2 < b2;
780 }
781 
782 #endif
783