xref: /openbmc/qemu/include/fpu/softfloat-macros.h (revision f58f084e71c6aeee066a30fc87422820e94a6cfc)
1 /*
2  * QEMU float support macros
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 #ifndef FPU_SOFTFLOAT_MACROS_H
83 #define FPU_SOFTFLOAT_MACROS_H
84 
85 #include "fpu/softfloat-types.h"
86 
87 /*----------------------------------------------------------------------------
88 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
89 | bits are shifted off, they are ``jammed'' into the least significant bit of
90 | the result by setting the least significant bit to 1.  The value of `count'
91 | can be arbitrarily large; in particular, if `count' is greater than 32, the
92 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
93 | The result is stored in the location pointed to by `zPtr'.
94 *----------------------------------------------------------------------------*/
95 
96 static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
97 {
98     uint32_t z;
99 
100     if ( count == 0 ) {
101         z = a;
102     }
103     else if ( count < 32 ) {
104         z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
105     }
106     else {
107         z = ( a != 0 );
108     }
109     *zPtr = z;
110 
111 }
112 
113 /*----------------------------------------------------------------------------
114 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
115 | bits are shifted off, they are ``jammed'' into the least significant bit of
116 | the result by setting the least significant bit to 1.  The value of `count'
117 | can be arbitrarily large; in particular, if `count' is greater than 64, the
118 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
119 | The result is stored in the location pointed to by `zPtr'.
120 *----------------------------------------------------------------------------*/
121 
122 static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
123 {
124     uint64_t z;
125 
126     if ( count == 0 ) {
127         z = a;
128     }
129     else if ( count < 64 ) {
130         z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
131     }
132     else {
133         z = ( a != 0 );
134     }
135     *zPtr = z;
136 
137 }
138 
139 /*----------------------------------------------------------------------------
140 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
141 | _plus_ the number of bits given in `count'.  The shifted result is at most
142 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
143 | bits shifted off form a second 64-bit result as follows:  The _last_ bit
144 | shifted off is the most-significant bit of the extra result, and the other
145 | 63 bits of the extra result are all zero if and only if _all_but_the_last_
146 | bits shifted off were all zero.  This extra result is stored in the location
147 | pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
148 |     (This routine makes more sense if `a0' and `a1' are considered to form a
149 | fixed-point value with binary point between `a0' and `a1'.  This fixed-point
150 | value is shifted right by the number of bits given in `count', and the
151 | integer part of the result is returned at the location pointed to by
152 | `z0Ptr'.  The fractional part of the result may be slightly corrupted as
153 | described above, and is returned at the location pointed to by `z1Ptr'.)
154 *----------------------------------------------------------------------------*/
155 
156 static inline void
157  shift64ExtraRightJamming(
158      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
159 {
160     uint64_t z0, z1;
161     int8_t negCount = ( - count ) & 63;
162 
163     if ( count == 0 ) {
164         z1 = a1;
165         z0 = a0;
166     }
167     else if ( count < 64 ) {
168         z1 = ( a0<<negCount ) | ( a1 != 0 );
169         z0 = a0>>count;
170     }
171     else {
172         if ( count == 64 ) {
173             z1 = a0 | ( a1 != 0 );
174         }
175         else {
176             z1 = ( ( a0 | a1 ) != 0 );
177         }
178         z0 = 0;
179     }
180     *z1Ptr = z1;
181     *z0Ptr = z0;
182 
183 }
184 
185 /*----------------------------------------------------------------------------
186 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
187 | number of bits given in `count'.  Any bits shifted off are lost.  The value
188 | of `count' can be arbitrarily large; in particular, if `count' is greater
189 | than 128, the result will be 0.  The result is broken into two 64-bit pieces
190 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
191 *----------------------------------------------------------------------------*/
192 
193 static inline void
194  shift128Right(
195      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
196 {
197     uint64_t z0, z1;
198     int8_t negCount = ( - count ) & 63;
199 
200     if ( count == 0 ) {
201         z1 = a1;
202         z0 = a0;
203     }
204     else if ( count < 64 ) {
205         z1 = ( a0<<negCount ) | ( a1>>count );
206         z0 = a0>>count;
207     }
208     else {
209         z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
210         z0 = 0;
211     }
212     *z1Ptr = z1;
213     *z0Ptr = z0;
214 
215 }
216 
217 /*----------------------------------------------------------------------------
218 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
219 | number of bits given in `count'.  If any nonzero bits are shifted off, they
220 | are ``jammed'' into the least significant bit of the result by setting the
221 | least significant bit to 1.  The value of `count' can be arbitrarily large;
222 | in particular, if `count' is greater than 128, the result will be either
223 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
224 | nonzero.  The result is broken into two 64-bit pieces which are stored at
225 | the locations pointed to by `z0Ptr' and `z1Ptr'.
226 *----------------------------------------------------------------------------*/
227 
228 static inline void
229  shift128RightJamming(
230      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
231 {
232     uint64_t z0, z1;
233     int8_t negCount = ( - count ) & 63;
234 
235     if ( count == 0 ) {
236         z1 = a1;
237         z0 = a0;
238     }
239     else if ( count < 64 ) {
240         z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
241         z0 = a0>>count;
242     }
243     else {
244         if ( count == 64 ) {
245             z1 = a0 | ( a1 != 0 );
246         }
247         else if ( count < 128 ) {
248             z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
249         }
250         else {
251             z1 = ( ( a0 | a1 ) != 0 );
252         }
253         z0 = 0;
254     }
255     *z1Ptr = z1;
256     *z0Ptr = z0;
257 
258 }
259 
260 /*----------------------------------------------------------------------------
261 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
262 | by 64 _plus_ the number of bits given in `count'.  The shifted result is
263 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
264 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
265 | off form a third 64-bit result as follows:  The _last_ bit shifted off is
266 | the most-significant bit of the extra result, and the other 63 bits of the
267 | extra result are all zero if and only if _all_but_the_last_ bits shifted off
268 | were all zero.  This extra result is stored in the location pointed to by
269 | `z2Ptr'.  The value of `count' can be arbitrarily large.
270 |     (This routine makes more sense if `a0', `a1', and `a2' are considered
271 | to form a fixed-point value with binary point between `a1' and `a2'.  This
272 | fixed-point value is shifted right by the number of bits given in `count',
273 | and the integer part of the result is returned at the locations pointed to
274 | by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
275 | corrupted as described above, and is returned at the location pointed to by
276 | `z2Ptr'.)
277 *----------------------------------------------------------------------------*/
278 
279 static inline void
280  shift128ExtraRightJamming(
281      uint64_t a0,
282      uint64_t a1,
283      uint64_t a2,
284      int count,
285      uint64_t *z0Ptr,
286      uint64_t *z1Ptr,
287      uint64_t *z2Ptr
288  )
289 {
290     uint64_t z0, z1, z2;
291     int8_t negCount = ( - count ) & 63;
292 
293     if ( count == 0 ) {
294         z2 = a2;
295         z1 = a1;
296         z0 = a0;
297     }
298     else {
299         if ( count < 64 ) {
300             z2 = a1<<negCount;
301             z1 = ( a0<<negCount ) | ( a1>>count );
302             z0 = a0>>count;
303         }
304         else {
305             if ( count == 64 ) {
306                 z2 = a1;
307                 z1 = a0;
308             }
309             else {
310                 a2 |= a1;
311                 if ( count < 128 ) {
312                     z2 = a0<<negCount;
313                     z1 = a0>>( count & 63 );
314                 }
315                 else {
316                     z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
317                     z1 = 0;
318                 }
319             }
320             z0 = 0;
321         }
322         z2 |= ( a2 != 0 );
323     }
324     *z2Ptr = z2;
325     *z1Ptr = z1;
326     *z0Ptr = z0;
327 
328 }
329 
330 /*----------------------------------------------------------------------------
331 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
332 | number of bits given in `count'.  Any bits shifted off are lost.  The value
333 | of `count' must be less than 64.  The result is broken into two 64-bit
334 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
335 *----------------------------------------------------------------------------*/
336 
337 static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
338                                      uint64_t *z0Ptr, uint64_t *z1Ptr)
339 {
340     *z1Ptr = a1 << count;
341     *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
342 }
343 
344 /*----------------------------------------------------------------------------
345 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
346 | number of bits given in `count'.  Any bits shifted off are lost.  The value
347 | of `count' may be greater than 64.  The result is broken into two 64-bit
348 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
349 *----------------------------------------------------------------------------*/
350 
351 static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
352                                 uint64_t *z0Ptr, uint64_t *z1Ptr)
353 {
354     if (count < 64) {
355         *z1Ptr = a1 << count;
356         *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
357     } else {
358         *z1Ptr = 0;
359         *z0Ptr = a1 << (count - 64);
360     }
361 }
362 
363 /*----------------------------------------------------------------------------
364 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
365 | by the number of bits given in `count'.  Any bits shifted off are lost.
366 | The value of `count' must be less than 64.  The result is broken into three
367 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
368 | `z1Ptr', and `z2Ptr'.
369 *----------------------------------------------------------------------------*/
370 
371 static inline void
372  shortShift192Left(
373      uint64_t a0,
374      uint64_t a1,
375      uint64_t a2,
376      int count,
377      uint64_t *z0Ptr,
378      uint64_t *z1Ptr,
379      uint64_t *z2Ptr
380  )
381 {
382     uint64_t z0, z1, z2;
383     int8_t negCount;
384 
385     z2 = a2<<count;
386     z1 = a1<<count;
387     z0 = a0<<count;
388     if ( 0 < count ) {
389         negCount = ( ( - count ) & 63 );
390         z1 |= a2>>negCount;
391         z0 |= a1>>negCount;
392     }
393     *z2Ptr = z2;
394     *z1Ptr = z1;
395     *z0Ptr = z0;
396 
397 }
398 
399 /*----------------------------------------------------------------------------
400 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
401 | value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
402 | any carry out is lost.  The result is broken into two 64-bit pieces which
403 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
404 *----------------------------------------------------------------------------*/
405 
406 static inline void
407  add128(
408      uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
409 {
410     uint64_t z1;
411 
412     z1 = a1 + b1;
413     *z1Ptr = z1;
414     *z0Ptr = a0 + b0 + ( z1 < a1 );
415 
416 }
417 
418 /*----------------------------------------------------------------------------
419 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
420 | 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
421 | modulo 2^192, so any carry out is lost.  The result is broken into three
422 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
423 | `z1Ptr', and `z2Ptr'.
424 *----------------------------------------------------------------------------*/
425 
426 static inline void
427  add192(
428      uint64_t a0,
429      uint64_t a1,
430      uint64_t a2,
431      uint64_t b0,
432      uint64_t b1,
433      uint64_t b2,
434      uint64_t *z0Ptr,
435      uint64_t *z1Ptr,
436      uint64_t *z2Ptr
437  )
438 {
439     uint64_t z0, z1, z2;
440     int8_t carry0, carry1;
441 
442     z2 = a2 + b2;
443     carry1 = ( z2 < a2 );
444     z1 = a1 + b1;
445     carry0 = ( z1 < a1 );
446     z0 = a0 + b0;
447     z1 += carry1;
448     z0 += ( z1 < carry1 );
449     z0 += carry0;
450     *z2Ptr = z2;
451     *z1Ptr = z1;
452     *z0Ptr = z0;
453 
454 }
455 
456 /*----------------------------------------------------------------------------
457 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
458 | 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
459 | 2^128, so any borrow out (carry out) is lost.  The result is broken into two
460 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
461 | `z1Ptr'.
462 *----------------------------------------------------------------------------*/
463 
464 static inline void
465  sub128(
466      uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
467 {
468 
469     *z1Ptr = a1 - b1;
470     *z0Ptr = a0 - b0 - ( a1 < b1 );
471 
472 }
473 
474 /*----------------------------------------------------------------------------
475 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
476 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
477 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
478 | result is broken into three 64-bit pieces which are stored at the locations
479 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
480 *----------------------------------------------------------------------------*/
481 
482 static inline void
483  sub192(
484      uint64_t a0,
485      uint64_t a1,
486      uint64_t a2,
487      uint64_t b0,
488      uint64_t b1,
489      uint64_t b2,
490      uint64_t *z0Ptr,
491      uint64_t *z1Ptr,
492      uint64_t *z2Ptr
493  )
494 {
495     uint64_t z0, z1, z2;
496     int8_t borrow0, borrow1;
497 
498     z2 = a2 - b2;
499     borrow1 = ( a2 < b2 );
500     z1 = a1 - b1;
501     borrow0 = ( a1 < b1 );
502     z0 = a0 - b0;
503     z0 -= ( z1 < borrow1 );
504     z1 -= borrow1;
505     z0 -= borrow0;
506     *z2Ptr = z2;
507     *z1Ptr = z1;
508     *z0Ptr = z0;
509 
510 }
511 
512 /*----------------------------------------------------------------------------
513 | Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
514 | into two 64-bit pieces which are stored at the locations pointed to by
515 | `z0Ptr' and `z1Ptr'.
516 *----------------------------------------------------------------------------*/
517 
518 static inline void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr )
519 {
520     uint32_t aHigh, aLow, bHigh, bLow;
521     uint64_t z0, zMiddleA, zMiddleB, z1;
522 
523     aLow = a;
524     aHigh = a>>32;
525     bLow = b;
526     bHigh = b>>32;
527     z1 = ( (uint64_t) aLow ) * bLow;
528     zMiddleA = ( (uint64_t) aLow ) * bHigh;
529     zMiddleB = ( (uint64_t) aHigh ) * bLow;
530     z0 = ( (uint64_t) aHigh ) * bHigh;
531     zMiddleA += zMiddleB;
532     z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
533     zMiddleA <<= 32;
534     z1 += zMiddleA;
535     z0 += ( z1 < zMiddleA );
536     *z1Ptr = z1;
537     *z0Ptr = z0;
538 
539 }
540 
541 /*----------------------------------------------------------------------------
542 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
543 | `b' to obtain a 192-bit product.  The product is broken into three 64-bit
544 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
545 | `z2Ptr'.
546 *----------------------------------------------------------------------------*/
547 
548 static inline void
549  mul128By64To192(
550      uint64_t a0,
551      uint64_t a1,
552      uint64_t b,
553      uint64_t *z0Ptr,
554      uint64_t *z1Ptr,
555      uint64_t *z2Ptr
556  )
557 {
558     uint64_t z0, z1, z2, more1;
559 
560     mul64To128( a1, b, &z1, &z2 );
561     mul64To128( a0, b, &z0, &more1 );
562     add128( z0, more1, 0, z1, &z0, &z1 );
563     *z2Ptr = z2;
564     *z1Ptr = z1;
565     *z0Ptr = z0;
566 
567 }
568 
569 /*----------------------------------------------------------------------------
570 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
571 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
572 | product.  The product is broken into four 64-bit pieces which are stored at
573 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
574 *----------------------------------------------------------------------------*/
575 
576 static inline void
577  mul128To256(
578      uint64_t a0,
579      uint64_t a1,
580      uint64_t b0,
581      uint64_t b1,
582      uint64_t *z0Ptr,
583      uint64_t *z1Ptr,
584      uint64_t *z2Ptr,
585      uint64_t *z3Ptr
586  )
587 {
588     uint64_t z0, z1, z2, z3;
589     uint64_t more1, more2;
590 
591     mul64To128( a1, b1, &z2, &z3 );
592     mul64To128( a1, b0, &z1, &more2 );
593     add128( z1, more2, 0, z2, &z1, &z2 );
594     mul64To128( a0, b0, &z0, &more1 );
595     add128( z0, more1, 0, z1, &z0, &z1 );
596     mul64To128( a0, b1, &more1, &more2 );
597     add128( more1, more2, 0, z2, &more1, &z2 );
598     add128( z0, z1, 0, more1, &z0, &z1 );
599     *z3Ptr = z3;
600     *z2Ptr = z2;
601     *z1Ptr = z1;
602     *z0Ptr = z0;
603 
604 }
605 
606 /*----------------------------------------------------------------------------
607 | Returns an approximation to the 64-bit integer quotient obtained by dividing
608 | `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
609 | divisor `b' must be at least 2^63.  If q is the exact quotient truncated
610 | toward zero, the approximation returned lies between q and q + 2 inclusive.
611 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
612 | unsigned integer is returned.
613 *----------------------------------------------------------------------------*/
614 
615 static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
616 {
617     uint64_t b0, b1;
618     uint64_t rem0, rem1, term0, term1;
619     uint64_t z;
620 
621     if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
622     b0 = b>>32;
623     z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
624     mul64To128( b, z, &term0, &term1 );
625     sub128( a0, a1, term0, term1, &rem0, &rem1 );
626     while ( ( (int64_t) rem0 ) < 0 ) {
627         z -= UINT64_C(0x100000000);
628         b1 = b<<32;
629         add128( rem0, rem1, b0, b1, &rem0, &rem1 );
630     }
631     rem0 = ( rem0<<32 ) | ( rem1>>32 );
632     z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
633     return z;
634 
635 }
636 
637 /* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
638  * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
639  *
640  * Licensed under the GPLv2/LGPLv3
641  */
642 static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
643                                   uint64_t n0, uint64_t d)
644 {
645 #if defined(__x86_64__)
646     uint64_t q;
647     asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
648     return q;
649 #elif defined(__s390x__) && !defined(__clang__)
650     /* Need to use a TImode type to get an even register pair for DLGR.  */
651     unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
652     asm("dlgr %0, %1" : "+r"(n) : "r"(d));
653     *r = n >> 64;
654     return n;
655 #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
656     /* From Power ISA 2.06, programming note for divdeu.  */
657     uint64_t q1, q2, Q, r1, r2, R;
658     asm("divdeu %0,%2,%4; divdu %1,%3,%4"
659         : "=&r"(q1), "=r"(q2)
660         : "r"(n1), "r"(n0), "r"(d));
661     r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
662     r2 = n0 - (q2 * d);
663     Q = q1 + q2;
664     R = r1 + r2;
665     if (R >= d || R < r2) { /* overflow implies R > d */
666         Q += 1;
667         R -= d;
668     }
669     *r = R;
670     return Q;
671 #else
672     uint64_t d0, d1, q0, q1, r1, r0, m;
673 
674     d0 = (uint32_t)d;
675     d1 = d >> 32;
676 
677     r1 = n1 % d1;
678     q1 = n1 / d1;
679     m = q1 * d0;
680     r1 = (r1 << 32) | (n0 >> 32);
681     if (r1 < m) {
682         q1 -= 1;
683         r1 += d;
684         if (r1 >= d) {
685             if (r1 < m) {
686                 q1 -= 1;
687                 r1 += d;
688             }
689         }
690     }
691     r1 -= m;
692 
693     r0 = r1 % d1;
694     q0 = r1 / d1;
695     m = q0 * d0;
696     r0 = (r0 << 32) | (uint32_t)n0;
697     if (r0 < m) {
698         q0 -= 1;
699         r0 += d;
700         if (r0 >= d) {
701             if (r0 < m) {
702                 q0 -= 1;
703                 r0 += d;
704             }
705         }
706     }
707     r0 -= m;
708 
709     *r = r0;
710     return (q1 << 32) | q0;
711 #endif
712 }
713 
714 /*----------------------------------------------------------------------------
715 | Returns an approximation to the square root of the 32-bit significand given
716 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
717 | `aExp' (the least significant bit) is 1, the integer returned approximates
718 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
719 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
720 | case, the approximation returned lies strictly within +/-2 of the exact
721 | value.
722 *----------------------------------------------------------------------------*/
723 
724 static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
725 {
726     static const uint16_t sqrtOddAdjustments[] = {
727         0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
728         0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
729     };
730     static const uint16_t sqrtEvenAdjustments[] = {
731         0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
732         0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
733     };
734     int8_t index;
735     uint32_t z;
736 
737     index = ( a>>27 ) & 15;
738     if ( aExp & 1 ) {
739         z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
740         z = ( ( a / z )<<14 ) + ( z<<15 );
741         a >>= 1;
742     }
743     else {
744         z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
745         z = a / z + z;
746         z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
747         if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
748     }
749     return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
750 
751 }
752 
753 /*----------------------------------------------------------------------------
754 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
755 | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
756 | Otherwise, returns 0.
757 *----------------------------------------------------------------------------*/
758 
759 static inline flag eq128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
760 {
761 
762     return ( a0 == b0 ) && ( a1 == b1 );
763 
764 }
765 
766 /*----------------------------------------------------------------------------
767 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
768 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
769 | Otherwise, returns 0.
770 *----------------------------------------------------------------------------*/
771 
772 static inline flag le128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
773 {
774 
775     return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) );
776 
777 }
778 
779 /*----------------------------------------------------------------------------
780 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
781 | than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
782 | returns 0.
783 *----------------------------------------------------------------------------*/
784 
785 static inline flag lt128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
786 {
787 
788     return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) );
789 
790 }
791 
792 /*----------------------------------------------------------------------------
793 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
794 | not equal to the 128-bit value formed by concatenating `b0' and `b1'.
795 | Otherwise, returns 0.
796 *----------------------------------------------------------------------------*/
797 
798 static inline flag ne128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
799 {
800 
801     return ( a0 != b0 ) || ( a1 != b1 );
802 
803 }
804 
805 #endif
806