xref: /openbmc/qemu/include/fpu/softfloat-macros.h (revision 8e6fe6b8)
1 /*
2  * QEMU float support macros
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 #ifndef FPU_SOFTFLOAT_MACROS_H
83 #define FPU_SOFTFLOAT_MACROS_H
84 
85 /*----------------------------------------------------------------------------
86 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
87 | bits are shifted off, they are ``jammed'' into the least significant bit of
88 | the result by setting the least significant bit to 1.  The value of `count'
89 | can be arbitrarily large; in particular, if `count' is greater than 32, the
90 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
91 | The result is stored in the location pointed to by `zPtr'.
92 *----------------------------------------------------------------------------*/
93 
94 static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
95 {
96     uint32_t z;
97 
98     if ( count == 0 ) {
99         z = a;
100     }
101     else if ( count < 32 ) {
102         z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
103     }
104     else {
105         z = ( a != 0 );
106     }
107     *zPtr = z;
108 
109 }
110 
111 /*----------------------------------------------------------------------------
112 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
113 | bits are shifted off, they are ``jammed'' into the least significant bit of
114 | the result by setting the least significant bit to 1.  The value of `count'
115 | can be arbitrarily large; in particular, if `count' is greater than 64, the
116 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
117 | The result is stored in the location pointed to by `zPtr'.
118 *----------------------------------------------------------------------------*/
119 
120 static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
121 {
122     uint64_t z;
123 
124     if ( count == 0 ) {
125         z = a;
126     }
127     else if ( count < 64 ) {
128         z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
129     }
130     else {
131         z = ( a != 0 );
132     }
133     *zPtr = z;
134 
135 }
136 
137 /*----------------------------------------------------------------------------
138 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
139 | _plus_ the number of bits given in `count'.  The shifted result is at most
140 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
141 | bits shifted off form a second 64-bit result as follows:  The _last_ bit
142 | shifted off is the most-significant bit of the extra result, and the other
143 | 63 bits of the extra result are all zero if and only if _all_but_the_last_
144 | bits shifted off were all zero.  This extra result is stored in the location
145 | pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
146 |     (This routine makes more sense if `a0' and `a1' are considered to form a
147 | fixed-point value with binary point between `a0' and `a1'.  This fixed-point
148 | value is shifted right by the number of bits given in `count', and the
149 | integer part of the result is returned at the location pointed to by
150 | `z0Ptr'.  The fractional part of the result may be slightly corrupted as
151 | described above, and is returned at the location pointed to by `z1Ptr'.)
152 *----------------------------------------------------------------------------*/
153 
154 static inline void
155  shift64ExtraRightJamming(
156      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
157 {
158     uint64_t z0, z1;
159     int8_t negCount = ( - count ) & 63;
160 
161     if ( count == 0 ) {
162         z1 = a1;
163         z0 = a0;
164     }
165     else if ( count < 64 ) {
166         z1 = ( a0<<negCount ) | ( a1 != 0 );
167         z0 = a0>>count;
168     }
169     else {
170         if ( count == 64 ) {
171             z1 = a0 | ( a1 != 0 );
172         }
173         else {
174             z1 = ( ( a0 | a1 ) != 0 );
175         }
176         z0 = 0;
177     }
178     *z1Ptr = z1;
179     *z0Ptr = z0;
180 
181 }
182 
183 /*----------------------------------------------------------------------------
184 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
185 | number of bits given in `count'.  Any bits shifted off are lost.  The value
186 | of `count' can be arbitrarily large; in particular, if `count' is greater
187 | than 128, the result will be 0.  The result is broken into two 64-bit pieces
188 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
189 *----------------------------------------------------------------------------*/
190 
191 static inline void
192  shift128Right(
193      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
194 {
195     uint64_t z0, z1;
196     int8_t negCount = ( - count ) & 63;
197 
198     if ( count == 0 ) {
199         z1 = a1;
200         z0 = a0;
201     }
202     else if ( count < 64 ) {
203         z1 = ( a0<<negCount ) | ( a1>>count );
204         z0 = a0>>count;
205     }
206     else {
207         z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
208         z0 = 0;
209     }
210     *z1Ptr = z1;
211     *z0Ptr = z0;
212 
213 }
214 
215 /*----------------------------------------------------------------------------
216 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
217 | number of bits given in `count'.  If any nonzero bits are shifted off, they
218 | are ``jammed'' into the least significant bit of the result by setting the
219 | least significant bit to 1.  The value of `count' can be arbitrarily large;
220 | in particular, if `count' is greater than 128, the result will be either
221 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
222 | nonzero.  The result is broken into two 64-bit pieces which are stored at
223 | the locations pointed to by `z0Ptr' and `z1Ptr'.
224 *----------------------------------------------------------------------------*/
225 
226 static inline void
227  shift128RightJamming(
228      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
229 {
230     uint64_t z0, z1;
231     int8_t negCount = ( - count ) & 63;
232 
233     if ( count == 0 ) {
234         z1 = a1;
235         z0 = a0;
236     }
237     else if ( count < 64 ) {
238         z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
239         z0 = a0>>count;
240     }
241     else {
242         if ( count == 64 ) {
243             z1 = a0 | ( a1 != 0 );
244         }
245         else if ( count < 128 ) {
246             z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
247         }
248         else {
249             z1 = ( ( a0 | a1 ) != 0 );
250         }
251         z0 = 0;
252     }
253     *z1Ptr = z1;
254     *z0Ptr = z0;
255 
256 }
257 
258 /*----------------------------------------------------------------------------
259 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
260 | by 64 _plus_ the number of bits given in `count'.  The shifted result is
261 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
262 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
263 | off form a third 64-bit result as follows:  The _last_ bit shifted off is
264 | the most-significant bit of the extra result, and the other 63 bits of the
265 | extra result are all zero if and only if _all_but_the_last_ bits shifted off
266 | were all zero.  This extra result is stored in the location pointed to by
267 | `z2Ptr'.  The value of `count' can be arbitrarily large.
268 |     (This routine makes more sense if `a0', `a1', and `a2' are considered
269 | to form a fixed-point value with binary point between `a1' and `a2'.  This
270 | fixed-point value is shifted right by the number of bits given in `count',
271 | and the integer part of the result is returned at the locations pointed to
272 | by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
273 | corrupted as described above, and is returned at the location pointed to by
274 | `z2Ptr'.)
275 *----------------------------------------------------------------------------*/
276 
277 static inline void
278  shift128ExtraRightJamming(
279      uint64_t a0,
280      uint64_t a1,
281      uint64_t a2,
282      int count,
283      uint64_t *z0Ptr,
284      uint64_t *z1Ptr,
285      uint64_t *z2Ptr
286  )
287 {
288     uint64_t z0, z1, z2;
289     int8_t negCount = ( - count ) & 63;
290 
291     if ( count == 0 ) {
292         z2 = a2;
293         z1 = a1;
294         z0 = a0;
295     }
296     else {
297         if ( count < 64 ) {
298             z2 = a1<<negCount;
299             z1 = ( a0<<negCount ) | ( a1>>count );
300             z0 = a0>>count;
301         }
302         else {
303             if ( count == 64 ) {
304                 z2 = a1;
305                 z1 = a0;
306             }
307             else {
308                 a2 |= a1;
309                 if ( count < 128 ) {
310                     z2 = a0<<negCount;
311                     z1 = a0>>( count & 63 );
312                 }
313                 else {
314                     z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
315                     z1 = 0;
316                 }
317             }
318             z0 = 0;
319         }
320         z2 |= ( a2 != 0 );
321     }
322     *z2Ptr = z2;
323     *z1Ptr = z1;
324     *z0Ptr = z0;
325 
326 }
327 
328 /*----------------------------------------------------------------------------
329 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
330 | number of bits given in `count'.  Any bits shifted off are lost.  The value
331 | of `count' must be less than 64.  The result is broken into two 64-bit
332 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
333 *----------------------------------------------------------------------------*/
334 
335 static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
336                                      uint64_t *z0Ptr, uint64_t *z1Ptr)
337 {
338     *z1Ptr = a1 << count;
339     *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
340 }
341 
342 /*----------------------------------------------------------------------------
343 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
344 | number of bits given in `count'.  Any bits shifted off are lost.  The value
345 | of `count' may be greater than 64.  The result is broken into two 64-bit
346 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
347 *----------------------------------------------------------------------------*/
348 
349 static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
350                                 uint64_t *z0Ptr, uint64_t *z1Ptr)
351 {
352     if (count < 64) {
353         *z1Ptr = a1 << count;
354         *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
355     } else {
356         *z1Ptr = 0;
357         *z0Ptr = a1 << (count - 64);
358     }
359 }
360 
361 /*----------------------------------------------------------------------------
362 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
363 | by the number of bits given in `count'.  Any bits shifted off are lost.
364 | The value of `count' must be less than 64.  The result is broken into three
365 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
366 | `z1Ptr', and `z2Ptr'.
367 *----------------------------------------------------------------------------*/
368 
369 static inline void
370  shortShift192Left(
371      uint64_t a0,
372      uint64_t a1,
373      uint64_t a2,
374      int count,
375      uint64_t *z0Ptr,
376      uint64_t *z1Ptr,
377      uint64_t *z2Ptr
378  )
379 {
380     uint64_t z0, z1, z2;
381     int8_t negCount;
382 
383     z2 = a2<<count;
384     z1 = a1<<count;
385     z0 = a0<<count;
386     if ( 0 < count ) {
387         negCount = ( ( - count ) & 63 );
388         z1 |= a2>>negCount;
389         z0 |= a1>>negCount;
390     }
391     *z2Ptr = z2;
392     *z1Ptr = z1;
393     *z0Ptr = z0;
394 
395 }
396 
397 /*----------------------------------------------------------------------------
398 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
399 | value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
400 | any carry out is lost.  The result is broken into two 64-bit pieces which
401 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
402 *----------------------------------------------------------------------------*/
403 
404 static inline void
405  add128(
406      uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
407 {
408     uint64_t z1;
409 
410     z1 = a1 + b1;
411     *z1Ptr = z1;
412     *z0Ptr = a0 + b0 + ( z1 < a1 );
413 
414 }
415 
416 /*----------------------------------------------------------------------------
417 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
418 | 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
419 | modulo 2^192, so any carry out is lost.  The result is broken into three
420 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
421 | `z1Ptr', and `z2Ptr'.
422 *----------------------------------------------------------------------------*/
423 
424 static inline void
425  add192(
426      uint64_t a0,
427      uint64_t a1,
428      uint64_t a2,
429      uint64_t b0,
430      uint64_t b1,
431      uint64_t b2,
432      uint64_t *z0Ptr,
433      uint64_t *z1Ptr,
434      uint64_t *z2Ptr
435  )
436 {
437     uint64_t z0, z1, z2;
438     int8_t carry0, carry1;
439 
440     z2 = a2 + b2;
441     carry1 = ( z2 < a2 );
442     z1 = a1 + b1;
443     carry0 = ( z1 < a1 );
444     z0 = a0 + b0;
445     z1 += carry1;
446     z0 += ( z1 < carry1 );
447     z0 += carry0;
448     *z2Ptr = z2;
449     *z1Ptr = z1;
450     *z0Ptr = z0;
451 
452 }
453 
454 /*----------------------------------------------------------------------------
455 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
456 | 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
457 | 2^128, so any borrow out (carry out) is lost.  The result is broken into two
458 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
459 | `z1Ptr'.
460 *----------------------------------------------------------------------------*/
461 
462 static inline void
463  sub128(
464      uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
465 {
466 
467     *z1Ptr = a1 - b1;
468     *z0Ptr = a0 - b0 - ( a1 < b1 );
469 
470 }
471 
472 /*----------------------------------------------------------------------------
473 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
474 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
475 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
476 | result is broken into three 64-bit pieces which are stored at the locations
477 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
478 *----------------------------------------------------------------------------*/
479 
480 static inline void
481  sub192(
482      uint64_t a0,
483      uint64_t a1,
484      uint64_t a2,
485      uint64_t b0,
486      uint64_t b1,
487      uint64_t b2,
488      uint64_t *z0Ptr,
489      uint64_t *z1Ptr,
490      uint64_t *z2Ptr
491  )
492 {
493     uint64_t z0, z1, z2;
494     int8_t borrow0, borrow1;
495 
496     z2 = a2 - b2;
497     borrow1 = ( a2 < b2 );
498     z1 = a1 - b1;
499     borrow0 = ( a1 < b1 );
500     z0 = a0 - b0;
501     z0 -= ( z1 < borrow1 );
502     z1 -= borrow1;
503     z0 -= borrow0;
504     *z2Ptr = z2;
505     *z1Ptr = z1;
506     *z0Ptr = z0;
507 
508 }
509 
510 /*----------------------------------------------------------------------------
511 | Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
512 | into two 64-bit pieces which are stored at the locations pointed to by
513 | `z0Ptr' and `z1Ptr'.
514 *----------------------------------------------------------------------------*/
515 
516 static inline void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr )
517 {
518     uint32_t aHigh, aLow, bHigh, bLow;
519     uint64_t z0, zMiddleA, zMiddleB, z1;
520 
521     aLow = a;
522     aHigh = a>>32;
523     bLow = b;
524     bHigh = b>>32;
525     z1 = ( (uint64_t) aLow ) * bLow;
526     zMiddleA = ( (uint64_t) aLow ) * bHigh;
527     zMiddleB = ( (uint64_t) aHigh ) * bLow;
528     z0 = ( (uint64_t) aHigh ) * bHigh;
529     zMiddleA += zMiddleB;
530     z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
531     zMiddleA <<= 32;
532     z1 += zMiddleA;
533     z0 += ( z1 < zMiddleA );
534     *z1Ptr = z1;
535     *z0Ptr = z0;
536 
537 }
538 
539 /*----------------------------------------------------------------------------
540 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
541 | `b' to obtain a 192-bit product.  The product is broken into three 64-bit
542 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
543 | `z2Ptr'.
544 *----------------------------------------------------------------------------*/
545 
546 static inline void
547  mul128By64To192(
548      uint64_t a0,
549      uint64_t a1,
550      uint64_t b,
551      uint64_t *z0Ptr,
552      uint64_t *z1Ptr,
553      uint64_t *z2Ptr
554  )
555 {
556     uint64_t z0, z1, z2, more1;
557 
558     mul64To128( a1, b, &z1, &z2 );
559     mul64To128( a0, b, &z0, &more1 );
560     add128( z0, more1, 0, z1, &z0, &z1 );
561     *z2Ptr = z2;
562     *z1Ptr = z1;
563     *z0Ptr = z0;
564 
565 }
566 
567 /*----------------------------------------------------------------------------
568 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
569 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
570 | product.  The product is broken into four 64-bit pieces which are stored at
571 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
572 *----------------------------------------------------------------------------*/
573 
574 static inline void
575  mul128To256(
576      uint64_t a0,
577      uint64_t a1,
578      uint64_t b0,
579      uint64_t b1,
580      uint64_t *z0Ptr,
581      uint64_t *z1Ptr,
582      uint64_t *z2Ptr,
583      uint64_t *z3Ptr
584  )
585 {
586     uint64_t z0, z1, z2, z3;
587     uint64_t more1, more2;
588 
589     mul64To128( a1, b1, &z2, &z3 );
590     mul64To128( a1, b0, &z1, &more2 );
591     add128( z1, more2, 0, z2, &z1, &z2 );
592     mul64To128( a0, b0, &z0, &more1 );
593     add128( z0, more1, 0, z1, &z0, &z1 );
594     mul64To128( a0, b1, &more1, &more2 );
595     add128( more1, more2, 0, z2, &more1, &z2 );
596     add128( z0, z1, 0, more1, &z0, &z1 );
597     *z3Ptr = z3;
598     *z2Ptr = z2;
599     *z1Ptr = z1;
600     *z0Ptr = z0;
601 
602 }
603 
604 /*----------------------------------------------------------------------------
605 | Returns an approximation to the 64-bit integer quotient obtained by dividing
606 | `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
607 | divisor `b' must be at least 2^63.  If q is the exact quotient truncated
608 | toward zero, the approximation returned lies between q and q + 2 inclusive.
609 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
610 | unsigned integer is returned.
611 *----------------------------------------------------------------------------*/
612 
613 static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
614 {
615     uint64_t b0, b1;
616     uint64_t rem0, rem1, term0, term1;
617     uint64_t z;
618 
619     if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF );
620     b0 = b>>32;
621     z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32;
622     mul64To128( b, z, &term0, &term1 );
623     sub128( a0, a1, term0, term1, &rem0, &rem1 );
624     while ( ( (int64_t) rem0 ) < 0 ) {
625         z -= LIT64( 0x100000000 );
626         b1 = b<<32;
627         add128( rem0, rem1, b0, b1, &rem0, &rem1 );
628     }
629     rem0 = ( rem0<<32 ) | ( rem1>>32 );
630     z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
631     return z;
632 
633 }
634 
635 /* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
636  * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
637  *
638  * Licensed under the GPLv2/LGPLv3
639  */
640 static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
641                                   uint64_t n0, uint64_t d)
642 {
643 #if defined(__x86_64__)
644     uint64_t q;
645     asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
646     return q;
647 #elif defined(__s390x__) && !defined(__clang__)
648     /* Need to use a TImode type to get an even register pair for DLGR.  */
649     unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
650     asm("dlgr %0, %1" : "+r"(n) : "r"(d));
651     *r = n >> 64;
652     return n;
653 #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
654     /* From Power ISA 2.06, programming note for divdeu.  */
655     uint64_t q1, q2, Q, r1, r2, R;
656     asm("divdeu %0,%2,%4; divdu %1,%3,%4"
657         : "=&r"(q1), "=r"(q2)
658         : "r"(n1), "r"(n0), "r"(d));
659     r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
660     r2 = n0 - (q2 * d);
661     Q = q1 + q2;
662     R = r1 + r2;
663     if (R >= d || R < r2) { /* overflow implies R > d */
664         Q += 1;
665         R -= d;
666     }
667     *r = R;
668     return Q;
669 #else
670     uint64_t d0, d1, q0, q1, r1, r0, m;
671 
672     d0 = (uint32_t)d;
673     d1 = d >> 32;
674 
675     r1 = n1 % d1;
676     q1 = n1 / d1;
677     m = q1 * d0;
678     r1 = (r1 << 32) | (n0 >> 32);
679     if (r1 < m) {
680         q1 -= 1;
681         r1 += d;
682         if (r1 >= d) {
683             if (r1 < m) {
684                 q1 -= 1;
685                 r1 += d;
686             }
687         }
688     }
689     r1 -= m;
690 
691     r0 = r1 % d1;
692     q0 = r1 / d1;
693     m = q0 * d0;
694     r0 = (r0 << 32) | (uint32_t)n0;
695     if (r0 < m) {
696         q0 -= 1;
697         r0 += d;
698         if (r0 >= d) {
699             if (r0 < m) {
700                 q0 -= 1;
701                 r0 += d;
702             }
703         }
704     }
705     r0 -= m;
706 
707     *r = r0;
708     return (q1 << 32) | q0;
709 #endif
710 }
711 
712 /*----------------------------------------------------------------------------
713 | Returns an approximation to the square root of the 32-bit significand given
714 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
715 | `aExp' (the least significant bit) is 1, the integer returned approximates
716 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
717 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
718 | case, the approximation returned lies strictly within +/-2 of the exact
719 | value.
720 *----------------------------------------------------------------------------*/
721 
722 static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
723 {
724     static const uint16_t sqrtOddAdjustments[] = {
725         0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
726         0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
727     };
728     static const uint16_t sqrtEvenAdjustments[] = {
729         0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
730         0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
731     };
732     int8_t index;
733     uint32_t z;
734 
735     index = ( a>>27 ) & 15;
736     if ( aExp & 1 ) {
737         z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
738         z = ( ( a / z )<<14 ) + ( z<<15 );
739         a >>= 1;
740     }
741     else {
742         z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
743         z = a / z + z;
744         z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
745         if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
746     }
747     return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
748 
749 }
750 
751 /*----------------------------------------------------------------------------
752 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
753 | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
754 | Otherwise, returns 0.
755 *----------------------------------------------------------------------------*/
756 
757 static inline flag eq128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
758 {
759 
760     return ( a0 == b0 ) && ( a1 == b1 );
761 
762 }
763 
764 /*----------------------------------------------------------------------------
765 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
766 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
767 | Otherwise, returns 0.
768 *----------------------------------------------------------------------------*/
769 
770 static inline flag le128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
771 {
772 
773     return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) );
774 
775 }
776 
777 /*----------------------------------------------------------------------------
778 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
779 | than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
780 | returns 0.
781 *----------------------------------------------------------------------------*/
782 
783 static inline flag lt128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
784 {
785 
786     return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) );
787 
788 }
789 
790 /*----------------------------------------------------------------------------
791 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
792 | not equal to the 128-bit value formed by concatenating `b0' and `b1'.
793 | Otherwise, returns 0.
794 *----------------------------------------------------------------------------*/
795 
796 static inline flag ne128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
797 {
798 
799     return ( a0 != b0 ) || ( a1 != b1 );
800 
801 }
802 
803 #endif
804