xref: /openbmc/qemu/include/fpu/softfloat-macros.h (revision 40cf6a54)
1 /*
2  * QEMU float support macros
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /*----------------------------------------------------------------------------
83 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
84 | bits are shifted off, they are ``jammed'' into the least significant bit of
85 | the result by setting the least significant bit to 1.  The value of `count'
86 | can be arbitrarily large; in particular, if `count' is greater than 32, the
87 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
88 | The result is stored in the location pointed to by `zPtr'.
89 *----------------------------------------------------------------------------*/
90 
91 static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
92 {
93     uint32_t z;
94 
95     if ( count == 0 ) {
96         z = a;
97     }
98     else if ( count < 32 ) {
99         z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
100     }
101     else {
102         z = ( a != 0 );
103     }
104     *zPtr = z;
105 
106 }
107 
108 /*----------------------------------------------------------------------------
109 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
110 | bits are shifted off, they are ``jammed'' into the least significant bit of
111 | the result by setting the least significant bit to 1.  The value of `count'
112 | can be arbitrarily large; in particular, if `count' is greater than 64, the
113 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
114 | The result is stored in the location pointed to by `zPtr'.
115 *----------------------------------------------------------------------------*/
116 
117 static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
118 {
119     uint64_t z;
120 
121     if ( count == 0 ) {
122         z = a;
123     }
124     else if ( count < 64 ) {
125         z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
126     }
127     else {
128         z = ( a != 0 );
129     }
130     *zPtr = z;
131 
132 }
133 
134 /*----------------------------------------------------------------------------
135 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
136 | _plus_ the number of bits given in `count'.  The shifted result is at most
137 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
138 | bits shifted off form a second 64-bit result as follows:  The _last_ bit
139 | shifted off is the most-significant bit of the extra result, and the other
140 | 63 bits of the extra result are all zero if and only if _all_but_the_last_
141 | bits shifted off were all zero.  This extra result is stored in the location
142 | pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
143 |     (This routine makes more sense if `a0' and `a1' are considered to form a
144 | fixed-point value with binary point between `a0' and `a1'.  This fixed-point
145 | value is shifted right by the number of bits given in `count', and the
146 | integer part of the result is returned at the location pointed to by
147 | `z0Ptr'.  The fractional part of the result may be slightly corrupted as
148 | described above, and is returned at the location pointed to by `z1Ptr'.)
149 *----------------------------------------------------------------------------*/
150 
151 static inline void
152  shift64ExtraRightJamming(
153      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
154 {
155     uint64_t z0, z1;
156     int8_t negCount = ( - count ) & 63;
157 
158     if ( count == 0 ) {
159         z1 = a1;
160         z0 = a0;
161     }
162     else if ( count < 64 ) {
163         z1 = ( a0<<negCount ) | ( a1 != 0 );
164         z0 = a0>>count;
165     }
166     else {
167         if ( count == 64 ) {
168             z1 = a0 | ( a1 != 0 );
169         }
170         else {
171             z1 = ( ( a0 | a1 ) != 0 );
172         }
173         z0 = 0;
174     }
175     *z1Ptr = z1;
176     *z0Ptr = z0;
177 
178 }
179 
180 /*----------------------------------------------------------------------------
181 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
182 | number of bits given in `count'.  Any bits shifted off are lost.  The value
183 | of `count' can be arbitrarily large; in particular, if `count' is greater
184 | than 128, the result will be 0.  The result is broken into two 64-bit pieces
185 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
186 *----------------------------------------------------------------------------*/
187 
188 static inline void
189  shift128Right(
190      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
191 {
192     uint64_t z0, z1;
193     int8_t negCount = ( - count ) & 63;
194 
195     if ( count == 0 ) {
196         z1 = a1;
197         z0 = a0;
198     }
199     else if ( count < 64 ) {
200         z1 = ( a0<<negCount ) | ( a1>>count );
201         z0 = a0>>count;
202     }
203     else {
204         z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
205         z0 = 0;
206     }
207     *z1Ptr = z1;
208     *z0Ptr = z0;
209 
210 }
211 
212 /*----------------------------------------------------------------------------
213 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
214 | number of bits given in `count'.  If any nonzero bits are shifted off, they
215 | are ``jammed'' into the least significant bit of the result by setting the
216 | least significant bit to 1.  The value of `count' can be arbitrarily large;
217 | in particular, if `count' is greater than 128, the result will be either
218 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
219 | nonzero.  The result is broken into two 64-bit pieces which are stored at
220 | the locations pointed to by `z0Ptr' and `z1Ptr'.
221 *----------------------------------------------------------------------------*/
222 
223 static inline void
224  shift128RightJamming(
225      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
226 {
227     uint64_t z0, z1;
228     int8_t negCount = ( - count ) & 63;
229 
230     if ( count == 0 ) {
231         z1 = a1;
232         z0 = a0;
233     }
234     else if ( count < 64 ) {
235         z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
236         z0 = a0>>count;
237     }
238     else {
239         if ( count == 64 ) {
240             z1 = a0 | ( a1 != 0 );
241         }
242         else if ( count < 128 ) {
243             z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
244         }
245         else {
246             z1 = ( ( a0 | a1 ) != 0 );
247         }
248         z0 = 0;
249     }
250     *z1Ptr = z1;
251     *z0Ptr = z0;
252 
253 }
254 
255 /*----------------------------------------------------------------------------
256 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
257 | by 64 _plus_ the number of bits given in `count'.  The shifted result is
258 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
259 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
260 | off form a third 64-bit result as follows:  The _last_ bit shifted off is
261 | the most-significant bit of the extra result, and the other 63 bits of the
262 | extra result are all zero if and only if _all_but_the_last_ bits shifted off
263 | were all zero.  This extra result is stored in the location pointed to by
264 | `z2Ptr'.  The value of `count' can be arbitrarily large.
265 |     (This routine makes more sense if `a0', `a1', and `a2' are considered
266 | to form a fixed-point value with binary point between `a1' and `a2'.  This
267 | fixed-point value is shifted right by the number of bits given in `count',
268 | and the integer part of the result is returned at the locations pointed to
269 | by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
270 | corrupted as described above, and is returned at the location pointed to by
271 | `z2Ptr'.)
272 *----------------------------------------------------------------------------*/
273 
274 static inline void
275  shift128ExtraRightJamming(
276      uint64_t a0,
277      uint64_t a1,
278      uint64_t a2,
279      int count,
280      uint64_t *z0Ptr,
281      uint64_t *z1Ptr,
282      uint64_t *z2Ptr
283  )
284 {
285     uint64_t z0, z1, z2;
286     int8_t negCount = ( - count ) & 63;
287 
288     if ( count == 0 ) {
289         z2 = a2;
290         z1 = a1;
291         z0 = a0;
292     }
293     else {
294         if ( count < 64 ) {
295             z2 = a1<<negCount;
296             z1 = ( a0<<negCount ) | ( a1>>count );
297             z0 = a0>>count;
298         }
299         else {
300             if ( count == 64 ) {
301                 z2 = a1;
302                 z1 = a0;
303             }
304             else {
305                 a2 |= a1;
306                 if ( count < 128 ) {
307                     z2 = a0<<negCount;
308                     z1 = a0>>( count & 63 );
309                 }
310                 else {
311                     z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
312                     z1 = 0;
313                 }
314             }
315             z0 = 0;
316         }
317         z2 |= ( a2 != 0 );
318     }
319     *z2Ptr = z2;
320     *z1Ptr = z1;
321     *z0Ptr = z0;
322 
323 }
324 
325 /*----------------------------------------------------------------------------
326 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
327 | number of bits given in `count'.  Any bits shifted off are lost.  The value
328 | of `count' must be less than 64.  The result is broken into two 64-bit
329 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
330 *----------------------------------------------------------------------------*/
331 
332 static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
333                                      uint64_t *z0Ptr, uint64_t *z1Ptr)
334 {
335     *z1Ptr = a1 << count;
336     *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
337 }
338 
339 /*----------------------------------------------------------------------------
340 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
341 | number of bits given in `count'.  Any bits shifted off are lost.  The value
342 | of `count' may be greater than 64.  The result is broken into two 64-bit
343 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
344 *----------------------------------------------------------------------------*/
345 
346 static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
347                                 uint64_t *z0Ptr, uint64_t *z1Ptr)
348 {
349     if (count < 64) {
350         *z1Ptr = a1 << count;
351         *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
352     } else {
353         *z1Ptr = 0;
354         *z0Ptr = a1 << (count - 64);
355     }
356 }
357 
358 /*----------------------------------------------------------------------------
359 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
360 | by the number of bits given in `count'.  Any bits shifted off are lost.
361 | The value of `count' must be less than 64.  The result is broken into three
362 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
363 | `z1Ptr', and `z2Ptr'.
364 *----------------------------------------------------------------------------*/
365 
366 static inline void
367  shortShift192Left(
368      uint64_t a0,
369      uint64_t a1,
370      uint64_t a2,
371      int count,
372      uint64_t *z0Ptr,
373      uint64_t *z1Ptr,
374      uint64_t *z2Ptr
375  )
376 {
377     uint64_t z0, z1, z2;
378     int8_t negCount;
379 
380     z2 = a2<<count;
381     z1 = a1<<count;
382     z0 = a0<<count;
383     if ( 0 < count ) {
384         negCount = ( ( - count ) & 63 );
385         z1 |= a2>>negCount;
386         z0 |= a1>>negCount;
387     }
388     *z2Ptr = z2;
389     *z1Ptr = z1;
390     *z0Ptr = z0;
391 
392 }
393 
394 /*----------------------------------------------------------------------------
395 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
396 | value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
397 | any carry out is lost.  The result is broken into two 64-bit pieces which
398 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
399 *----------------------------------------------------------------------------*/
400 
401 static inline void
402  add128(
403      uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
404 {
405     uint64_t z1;
406 
407     z1 = a1 + b1;
408     *z1Ptr = z1;
409     *z0Ptr = a0 + b0 + ( z1 < a1 );
410 
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
415 | 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
416 | modulo 2^192, so any carry out is lost.  The result is broken into three
417 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
418 | `z1Ptr', and `z2Ptr'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline void
422  add192(
423      uint64_t a0,
424      uint64_t a1,
425      uint64_t a2,
426      uint64_t b0,
427      uint64_t b1,
428      uint64_t b2,
429      uint64_t *z0Ptr,
430      uint64_t *z1Ptr,
431      uint64_t *z2Ptr
432  )
433 {
434     uint64_t z0, z1, z2;
435     int8_t carry0, carry1;
436 
437     z2 = a2 + b2;
438     carry1 = ( z2 < a2 );
439     z1 = a1 + b1;
440     carry0 = ( z1 < a1 );
441     z0 = a0 + b0;
442     z1 += carry1;
443     z0 += ( z1 < carry1 );
444     z0 += carry0;
445     *z2Ptr = z2;
446     *z1Ptr = z1;
447     *z0Ptr = z0;
448 
449 }
450 
451 /*----------------------------------------------------------------------------
452 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
453 | 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
454 | 2^128, so any borrow out (carry out) is lost.  The result is broken into two
455 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
456 | `z1Ptr'.
457 *----------------------------------------------------------------------------*/
458 
459 static inline void
460  sub128(
461      uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
462 {
463 
464     *z1Ptr = a1 - b1;
465     *z0Ptr = a0 - b0 - ( a1 < b1 );
466 
467 }
468 
469 /*----------------------------------------------------------------------------
470 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
471 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
472 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
473 | result is broken into three 64-bit pieces which are stored at the locations
474 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
475 *----------------------------------------------------------------------------*/
476 
477 static inline void
478  sub192(
479      uint64_t a0,
480      uint64_t a1,
481      uint64_t a2,
482      uint64_t b0,
483      uint64_t b1,
484      uint64_t b2,
485      uint64_t *z0Ptr,
486      uint64_t *z1Ptr,
487      uint64_t *z2Ptr
488  )
489 {
490     uint64_t z0, z1, z2;
491     int8_t borrow0, borrow1;
492 
493     z2 = a2 - b2;
494     borrow1 = ( a2 < b2 );
495     z1 = a1 - b1;
496     borrow0 = ( a1 < b1 );
497     z0 = a0 - b0;
498     z0 -= ( z1 < borrow1 );
499     z1 -= borrow1;
500     z0 -= borrow0;
501     *z2Ptr = z2;
502     *z1Ptr = z1;
503     *z0Ptr = z0;
504 
505 }
506 
507 /*----------------------------------------------------------------------------
508 | Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
509 | into two 64-bit pieces which are stored at the locations pointed to by
510 | `z0Ptr' and `z1Ptr'.
511 *----------------------------------------------------------------------------*/
512 
513 static inline void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr )
514 {
515     uint32_t aHigh, aLow, bHigh, bLow;
516     uint64_t z0, zMiddleA, zMiddleB, z1;
517 
518     aLow = a;
519     aHigh = a>>32;
520     bLow = b;
521     bHigh = b>>32;
522     z1 = ( (uint64_t) aLow ) * bLow;
523     zMiddleA = ( (uint64_t) aLow ) * bHigh;
524     zMiddleB = ( (uint64_t) aHigh ) * bLow;
525     z0 = ( (uint64_t) aHigh ) * bHigh;
526     zMiddleA += zMiddleB;
527     z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
528     zMiddleA <<= 32;
529     z1 += zMiddleA;
530     z0 += ( z1 < zMiddleA );
531     *z1Ptr = z1;
532     *z0Ptr = z0;
533 
534 }
535 
536 /*----------------------------------------------------------------------------
537 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
538 | `b' to obtain a 192-bit product.  The product is broken into three 64-bit
539 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
540 | `z2Ptr'.
541 *----------------------------------------------------------------------------*/
542 
543 static inline void
544  mul128By64To192(
545      uint64_t a0,
546      uint64_t a1,
547      uint64_t b,
548      uint64_t *z0Ptr,
549      uint64_t *z1Ptr,
550      uint64_t *z2Ptr
551  )
552 {
553     uint64_t z0, z1, z2, more1;
554 
555     mul64To128( a1, b, &z1, &z2 );
556     mul64To128( a0, b, &z0, &more1 );
557     add128( z0, more1, 0, z1, &z0, &z1 );
558     *z2Ptr = z2;
559     *z1Ptr = z1;
560     *z0Ptr = z0;
561 
562 }
563 
564 /*----------------------------------------------------------------------------
565 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
566 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
567 | product.  The product is broken into four 64-bit pieces which are stored at
568 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
569 *----------------------------------------------------------------------------*/
570 
571 static inline void
572  mul128To256(
573      uint64_t a0,
574      uint64_t a1,
575      uint64_t b0,
576      uint64_t b1,
577      uint64_t *z0Ptr,
578      uint64_t *z1Ptr,
579      uint64_t *z2Ptr,
580      uint64_t *z3Ptr
581  )
582 {
583     uint64_t z0, z1, z2, z3;
584     uint64_t more1, more2;
585 
586     mul64To128( a1, b1, &z2, &z3 );
587     mul64To128( a1, b0, &z1, &more2 );
588     add128( z1, more2, 0, z2, &z1, &z2 );
589     mul64To128( a0, b0, &z0, &more1 );
590     add128( z0, more1, 0, z1, &z0, &z1 );
591     mul64To128( a0, b1, &more1, &more2 );
592     add128( more1, more2, 0, z2, &more1, &z2 );
593     add128( z0, z1, 0, more1, &z0, &z1 );
594     *z3Ptr = z3;
595     *z2Ptr = z2;
596     *z1Ptr = z1;
597     *z0Ptr = z0;
598 
599 }
600 
601 /*----------------------------------------------------------------------------
602 | Returns an approximation to the 64-bit integer quotient obtained by dividing
603 | `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
604 | divisor `b' must be at least 2^63.  If q is the exact quotient truncated
605 | toward zero, the approximation returned lies between q and q + 2 inclusive.
606 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
607 | unsigned integer is returned.
608 *----------------------------------------------------------------------------*/
609 
610 static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
611 {
612     uint64_t b0, b1;
613     uint64_t rem0, rem1, term0, term1;
614     uint64_t z;
615 
616     if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF );
617     b0 = b>>32;
618     z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32;
619     mul64To128( b, z, &term0, &term1 );
620     sub128( a0, a1, term0, term1, &rem0, &rem1 );
621     while ( ( (int64_t) rem0 ) < 0 ) {
622         z -= LIT64( 0x100000000 );
623         b1 = b<<32;
624         add128( rem0, rem1, b0, b1, &rem0, &rem1 );
625     }
626     rem0 = ( rem0<<32 ) | ( rem1>>32 );
627     z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
628     return z;
629 
630 }
631 
632 /* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
633  * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
634  *
635  * Licensed under the GPLv2/LGPLv3
636  */
637 static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
638                                   uint64_t n0, uint64_t d)
639 {
640 #if defined(__x86_64__)
641     uint64_t q;
642     asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
643     return q;
644 #elif defined(__s390x__)
645     /* Need to use a TImode type to get an even register pair for DLGR.  */
646     unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
647     asm("dlgr %0, %1" : "+r"(n) : "r"(d));
648     *r = n >> 64;
649     return n;
650 #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
651     /* From Power ISA 2.06, programming note for divdeu.  */
652     uint64_t q1, q2, Q, r1, r2, R;
653     asm("divdeu %0,%2,%4; divdu %1,%3,%4"
654         : "=&r"(q1), "=r"(q2)
655         : "r"(n1), "r"(n0), "r"(d));
656     r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
657     r2 = n0 - (q2 * d);
658     Q = q1 + q2;
659     R = r1 + r2;
660     if (R >= d || R < r2) { /* overflow implies R > d */
661         Q += 1;
662         R -= d;
663     }
664     *r = R;
665     return Q;
666 #else
667     uint64_t d0, d1, q0, q1, r1, r0, m;
668 
669     d0 = (uint32_t)d;
670     d1 = d >> 32;
671 
672     r1 = n1 % d1;
673     q1 = n1 / d1;
674     m = q1 * d0;
675     r1 = (r1 << 32) | (n0 >> 32);
676     if (r1 < m) {
677         q1 -= 1;
678         r1 += d;
679         if (r1 >= d) {
680             if (r1 < m) {
681                 q1 -= 1;
682                 r1 += d;
683             }
684         }
685     }
686     r1 -= m;
687 
688     r0 = r1 % d1;
689     q0 = r1 / d1;
690     m = q0 * d0;
691     r0 = (r0 << 32) | (uint32_t)n0;
692     if (r0 < m) {
693         q0 -= 1;
694         r0 += d;
695         if (r0 >= d) {
696             if (r0 < m) {
697                 q0 -= 1;
698                 r0 += d;
699             }
700         }
701     }
702     r0 -= m;
703 
704     *r = r0;
705     return (q1 << 32) | q0;
706 #endif
707 }
708 
709 /*----------------------------------------------------------------------------
710 | Returns an approximation to the square root of the 32-bit significand given
711 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
712 | `aExp' (the least significant bit) is 1, the integer returned approximates
713 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
714 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
715 | case, the approximation returned lies strictly within +/-2 of the exact
716 | value.
717 *----------------------------------------------------------------------------*/
718 
719 static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
720 {
721     static const uint16_t sqrtOddAdjustments[] = {
722         0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
723         0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
724     };
725     static const uint16_t sqrtEvenAdjustments[] = {
726         0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
727         0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
728     };
729     int8_t index;
730     uint32_t z;
731 
732     index = ( a>>27 ) & 15;
733     if ( aExp & 1 ) {
734         z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
735         z = ( ( a / z )<<14 ) + ( z<<15 );
736         a >>= 1;
737     }
738     else {
739         z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
740         z = a / z + z;
741         z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
742         if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
743     }
744     return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
745 
746 }
747 
748 /*----------------------------------------------------------------------------
749 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
750 | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
751 | Otherwise, returns 0.
752 *----------------------------------------------------------------------------*/
753 
754 static inline flag eq128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
755 {
756 
757     return ( a0 == b0 ) && ( a1 == b1 );
758 
759 }
760 
761 /*----------------------------------------------------------------------------
762 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
763 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
764 | Otherwise, returns 0.
765 *----------------------------------------------------------------------------*/
766 
767 static inline flag le128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
768 {
769 
770     return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) );
771 
772 }
773 
774 /*----------------------------------------------------------------------------
775 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
776 | than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
777 | returns 0.
778 *----------------------------------------------------------------------------*/
779 
780 static inline flag lt128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
781 {
782 
783     return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) );
784 
785 }
786 
787 /*----------------------------------------------------------------------------
788 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
789 | not equal to the 128-bit value formed by concatenating `b0' and `b1'.
790 | Otherwise, returns 0.
791 *----------------------------------------------------------------------------*/
792 
793 static inline flag ne128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
794 {
795 
796     return ( a0 != b0 ) || ( a1 != b1 );
797 
798 }
799