xref: /openbmc/qemu/include/fpu/softfloat-macros.h (revision 8ac2d6c5)
1 /*
2  * QEMU float support macros
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *
12  * Any future contributions to this file after December 1st 2014 will be
13  * taken to be licensed under the Softfloat-2a license unless specifically
14  * indicated otherwise.
15  */
16 
17 /*
18 ===============================================================================
19 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
20 Arithmetic Package, Release 2a.
21 
22 Written by John R. Hauser.  This work was made possible in part by the
23 International Computer Science Institute, located at Suite 600, 1947 Center
24 Street, Berkeley, California 94704.  Funding was partially provided by the
25 National Science Foundation under grant MIP-9311980.  The original version
26 of this code was written as part of a project to build a fixed-point vector
27 processor in collaboration with the University of California at Berkeley,
28 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
29 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
30 arithmetic/SoftFloat.html'.
31 
32 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
33 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
34 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
35 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
36 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
37 
38 Derivative works are acceptable, even for commercial purposes, so long as
39 (1) they include prominent notice that the work is derivative, and (2) they
40 include prominent notice akin to these four paragraphs for those parts of
41 this code that are retained.
42 
43 ===============================================================================
44 */
45 
46 /* BSD licensing:
47  * Copyright (c) 2006, Fabrice Bellard
48  * All rights reserved.
49  *
50  * Redistribution and use in source and binary forms, with or without
51  * modification, are permitted provided that the following conditions are met:
52  *
53  * 1. Redistributions of source code must retain the above copyright notice,
54  * this list of conditions and the following disclaimer.
55  *
56  * 2. Redistributions in binary form must reproduce the above copyright notice,
57  * this list of conditions and the following disclaimer in the documentation
58  * and/or other materials provided with the distribution.
59  *
60  * 3. Neither the name of the copyright holder nor the names of its contributors
61  * may be used to endorse or promote products derived from this software without
62  * specific prior written permission.
63  *
64  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
65  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
68  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
69  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
70  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
71  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
72  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
73  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
74  * THE POSSIBILITY OF SUCH DAMAGE.
75  */
76 
77 #ifndef FPU_SOFTFLOAT_MACROS_H
78 #define FPU_SOFTFLOAT_MACROS_H
79 
80 #include "fpu/softfloat-types.h"
81 #include "qemu/host-utils.h"
82 
83 /**
84  * shl_double: double-word merging left shift
85  * @l: left or most-significant word
86  * @r: right or least-significant word
87  * @c: shift count
88  *
89  * Shift @l left by @c bits, shifting in bits from @r.
90  */
shl_double(uint64_t l,uint64_t r,int c)91 static inline uint64_t shl_double(uint64_t l, uint64_t r, int c)
92 {
93 #if defined(__x86_64__)
94     asm("shld %b2, %1, %0" : "+r"(l) : "r"(r), "ci"(c));
95     return l;
96 #else
97     return c ? (l << c) | (r >> (64 - c)) : l;
98 #endif
99 }
100 
101 /**
102  * shr_double: double-word merging right shift
103  * @l: left or most-significant word
104  * @r: right or least-significant word
105  * @c: shift count
106  *
107  * Shift @r right by @c bits, shifting in bits from @l.
108  */
shr_double(uint64_t l,uint64_t r,int c)109 static inline uint64_t shr_double(uint64_t l, uint64_t r, int c)
110 {
111 #if defined(__x86_64__)
112     asm("shrd %b2, %1, %0" : "+r"(r) : "r"(l), "ci"(c));
113     return r;
114 #else
115     return c ? (r >> c) | (l << (64 - c)) : r;
116 #endif
117 }
118 
119 /*----------------------------------------------------------------------------
120 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
121 | bits are shifted off, they are ``jammed'' into the least significant bit of
122 | the result by setting the least significant bit to 1.  The value of `count'
123 | can be arbitrarily large; in particular, if `count' is greater than 32, the
124 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
125 | The result is stored in the location pointed to by `zPtr'.
126 *----------------------------------------------------------------------------*/
127 
shift32RightJamming(uint32_t a,int count,uint32_t * zPtr)128 static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
129 {
130     uint32_t z;
131 
132     if ( count == 0 ) {
133         z = a;
134     }
135     else if ( count < 32 ) {
136         z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
137     }
138     else {
139         z = ( a != 0 );
140     }
141     *zPtr = z;
142 
143 }
144 
145 /*----------------------------------------------------------------------------
146 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
147 | bits are shifted off, they are ``jammed'' into the least significant bit of
148 | the result by setting the least significant bit to 1.  The value of `count'
149 | can be arbitrarily large; in particular, if `count' is greater than 64, the
150 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
151 | The result is stored in the location pointed to by `zPtr'.
152 *----------------------------------------------------------------------------*/
153 
shift64RightJamming(uint64_t a,int count,uint64_t * zPtr)154 static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
155 {
156     uint64_t z;
157 
158     if ( count == 0 ) {
159         z = a;
160     }
161     else if ( count < 64 ) {
162         z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
163     }
164     else {
165         z = ( a != 0 );
166     }
167     *zPtr = z;
168 
169 }
170 
171 /*----------------------------------------------------------------------------
172 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
173 | _plus_ the number of bits given in `count'.  The shifted result is at most
174 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
175 | bits shifted off form a second 64-bit result as follows:  The _last_ bit
176 | shifted off is the most-significant bit of the extra result, and the other
177 | 63 bits of the extra result are all zero if and only if _all_but_the_last_
178 | bits shifted off were all zero.  This extra result is stored in the location
179 | pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
180 |     (This routine makes more sense if `a0' and `a1' are considered to form a
181 | fixed-point value with binary point between `a0' and `a1'.  This fixed-point
182 | value is shifted right by the number of bits given in `count', and the
183 | integer part of the result is returned at the location pointed to by
184 | `z0Ptr'.  The fractional part of the result may be slightly corrupted as
185 | described above, and is returned at the location pointed to by `z1Ptr'.)
186 *----------------------------------------------------------------------------*/
187 
188 static inline void
shift64ExtraRightJamming(uint64_t a0,uint64_t a1,int count,uint64_t * z0Ptr,uint64_t * z1Ptr)189  shift64ExtraRightJamming(
190      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
191 {
192     uint64_t z0, z1;
193     int8_t negCount = ( - count ) & 63;
194 
195     if ( count == 0 ) {
196         z1 = a1;
197         z0 = a0;
198     }
199     else if ( count < 64 ) {
200         z1 = ( a0<<negCount ) | ( a1 != 0 );
201         z0 = a0>>count;
202     }
203     else {
204         if ( count == 64 ) {
205             z1 = a0 | ( a1 != 0 );
206         }
207         else {
208             z1 = ( ( a0 | a1 ) != 0 );
209         }
210         z0 = 0;
211     }
212     *z1Ptr = z1;
213     *z0Ptr = z0;
214 
215 }
216 
217 /*----------------------------------------------------------------------------
218 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
219 | number of bits given in `count'.  Any bits shifted off are lost.  The value
220 | of `count' can be arbitrarily large; in particular, if `count' is greater
221 | than 128, the result will be 0.  The result is broken into two 64-bit pieces
222 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
223 *----------------------------------------------------------------------------*/
224 
225 static inline void
shift128Right(uint64_t a0,uint64_t a1,int count,uint64_t * z0Ptr,uint64_t * z1Ptr)226  shift128Right(
227      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
228 {
229     uint64_t z0, z1;
230     int8_t negCount = ( - count ) & 63;
231 
232     if ( count == 0 ) {
233         z1 = a1;
234         z0 = a0;
235     }
236     else if ( count < 64 ) {
237         z1 = ( a0<<negCount ) | ( a1>>count );
238         z0 = a0>>count;
239     }
240     else {
241         z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
242         z0 = 0;
243     }
244     *z1Ptr = z1;
245     *z0Ptr = z0;
246 
247 }
248 
249 /*----------------------------------------------------------------------------
250 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
251 | number of bits given in `count'.  If any nonzero bits are shifted off, they
252 | are ``jammed'' into the least significant bit of the result by setting the
253 | least significant bit to 1.  The value of `count' can be arbitrarily large;
254 | in particular, if `count' is greater than 128, the result will be either
255 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
256 | nonzero.  The result is broken into two 64-bit pieces which are stored at
257 | the locations pointed to by `z0Ptr' and `z1Ptr'.
258 *----------------------------------------------------------------------------*/
259 
260 static inline void
shift128RightJamming(uint64_t a0,uint64_t a1,int count,uint64_t * z0Ptr,uint64_t * z1Ptr)261  shift128RightJamming(
262      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
263 {
264     uint64_t z0, z1;
265     int8_t negCount = ( - count ) & 63;
266 
267     if ( count == 0 ) {
268         z1 = a1;
269         z0 = a0;
270     }
271     else if ( count < 64 ) {
272         z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
273         z0 = a0>>count;
274     }
275     else {
276         if ( count == 64 ) {
277             z1 = a0 | ( a1 != 0 );
278         }
279         else if ( count < 128 ) {
280             z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
281         }
282         else {
283             z1 = ( ( a0 | a1 ) != 0 );
284         }
285         z0 = 0;
286     }
287     *z1Ptr = z1;
288     *z0Ptr = z0;
289 
290 }
291 
292 /*----------------------------------------------------------------------------
293 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
294 | by 64 _plus_ the number of bits given in `count'.  The shifted result is
295 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
296 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
297 | off form a third 64-bit result as follows:  The _last_ bit shifted off is
298 | the most-significant bit of the extra result, and the other 63 bits of the
299 | extra result are all zero if and only if _all_but_the_last_ bits shifted off
300 | were all zero.  This extra result is stored in the location pointed to by
301 | `z2Ptr'.  The value of `count' can be arbitrarily large.
302 |     (This routine makes more sense if `a0', `a1', and `a2' are considered
303 | to form a fixed-point value with binary point between `a1' and `a2'.  This
304 | fixed-point value is shifted right by the number of bits given in `count',
305 | and the integer part of the result is returned at the locations pointed to
306 | by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
307 | corrupted as described above, and is returned at the location pointed to by
308 | `z2Ptr'.)
309 *----------------------------------------------------------------------------*/
310 
311 static inline void
shift128ExtraRightJamming(uint64_t a0,uint64_t a1,uint64_t a2,int count,uint64_t * z0Ptr,uint64_t * z1Ptr,uint64_t * z2Ptr)312  shift128ExtraRightJamming(
313      uint64_t a0,
314      uint64_t a1,
315      uint64_t a2,
316      int count,
317      uint64_t *z0Ptr,
318      uint64_t *z1Ptr,
319      uint64_t *z2Ptr
320  )
321 {
322     uint64_t z0, z1, z2;
323     int8_t negCount = ( - count ) & 63;
324 
325     if ( count == 0 ) {
326         z2 = a2;
327         z1 = a1;
328         z0 = a0;
329     }
330     else {
331         if ( count < 64 ) {
332             z2 = a1<<negCount;
333             z1 = ( a0<<negCount ) | ( a1>>count );
334             z0 = a0>>count;
335         }
336         else {
337             if ( count == 64 ) {
338                 z2 = a1;
339                 z1 = a0;
340             }
341             else {
342                 a2 |= a1;
343                 if ( count < 128 ) {
344                     z2 = a0<<negCount;
345                     z1 = a0>>( count & 63 );
346                 }
347                 else {
348                     z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
349                     z1 = 0;
350                 }
351             }
352             z0 = 0;
353         }
354         z2 |= ( a2 != 0 );
355     }
356     *z2Ptr = z2;
357     *z1Ptr = z1;
358     *z0Ptr = z0;
359 
360 }
361 
362 /*----------------------------------------------------------------------------
363 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
364 | number of bits given in `count'.  Any bits shifted off are lost.  The value
365 | of `count' must be less than 64.  The result is broken into two 64-bit
366 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
367 *----------------------------------------------------------------------------*/
368 
shortShift128Left(uint64_t a0,uint64_t a1,int count,uint64_t * z0Ptr,uint64_t * z1Ptr)369 static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
370                                      uint64_t *z0Ptr, uint64_t *z1Ptr)
371 {
372     *z1Ptr = a1 << count;
373     *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
374 }
375 
376 /*----------------------------------------------------------------------------
377 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
378 | number of bits given in `count'.  Any bits shifted off are lost.  The value
379 | of `count' may be greater than 64.  The result is broken into two 64-bit
380 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
381 *----------------------------------------------------------------------------*/
382 
shift128Left(uint64_t a0,uint64_t a1,int count,uint64_t * z0Ptr,uint64_t * z1Ptr)383 static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
384                                 uint64_t *z0Ptr, uint64_t *z1Ptr)
385 {
386     if (count < 64) {
387         *z1Ptr = a1 << count;
388         *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
389     } else {
390         *z1Ptr = 0;
391         *z0Ptr = a1 << (count - 64);
392     }
393 }
394 
395 /*----------------------------------------------------------------------------
396 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
397 | by the number of bits given in `count'.  Any bits shifted off are lost.
398 | The value of `count' must be less than 64.  The result is broken into three
399 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
400 | `z1Ptr', and `z2Ptr'.
401 *----------------------------------------------------------------------------*/
402 
403 static inline void
shortShift192Left(uint64_t a0,uint64_t a1,uint64_t a2,int count,uint64_t * z0Ptr,uint64_t * z1Ptr,uint64_t * z2Ptr)404  shortShift192Left(
405      uint64_t a0,
406      uint64_t a1,
407      uint64_t a2,
408      int count,
409      uint64_t *z0Ptr,
410      uint64_t *z1Ptr,
411      uint64_t *z2Ptr
412  )
413 {
414     uint64_t z0, z1, z2;
415     int8_t negCount;
416 
417     z2 = a2<<count;
418     z1 = a1<<count;
419     z0 = a0<<count;
420     if ( 0 < count ) {
421         negCount = ( ( - count ) & 63 );
422         z1 |= a2>>negCount;
423         z0 |= a1>>negCount;
424     }
425     *z2Ptr = z2;
426     *z1Ptr = z1;
427     *z0Ptr = z0;
428 
429 }
430 
431 /*----------------------------------------------------------------------------
432 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
433 | value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
434 | any carry out is lost.  The result is broken into two 64-bit pieces which
435 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
436 *----------------------------------------------------------------------------*/
437 
add128(uint64_t a0,uint64_t a1,uint64_t b0,uint64_t b1,uint64_t * z0Ptr,uint64_t * z1Ptr)438 static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
439                           uint64_t *z0Ptr, uint64_t *z1Ptr)
440 {
441     bool c = 0;
442     *z1Ptr = uadd64_carry(a1, b1, &c);
443     *z0Ptr = uadd64_carry(a0, b0, &c);
444 }
445 
446 /*----------------------------------------------------------------------------
447 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
448 | 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
449 | modulo 2^192, so any carry out is lost.  The result is broken into three
450 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
451 | `z1Ptr', and `z2Ptr'.
452 *----------------------------------------------------------------------------*/
453 
add192(uint64_t a0,uint64_t a1,uint64_t a2,uint64_t b0,uint64_t b1,uint64_t b2,uint64_t * z0Ptr,uint64_t * z1Ptr,uint64_t * z2Ptr)454 static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
455                           uint64_t b0, uint64_t b1, uint64_t b2,
456                           uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
457 {
458     bool c = 0;
459     *z2Ptr = uadd64_carry(a2, b2, &c);
460     *z1Ptr = uadd64_carry(a1, b1, &c);
461     *z0Ptr = uadd64_carry(a0, b0, &c);
462 }
463 
464 /*----------------------------------------------------------------------------
465 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
466 | 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
467 | 2^128, so any borrow out (carry out) is lost.  The result is broken into two
468 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
469 | `z1Ptr'.
470 *----------------------------------------------------------------------------*/
471 
sub128(uint64_t a0,uint64_t a1,uint64_t b0,uint64_t b1,uint64_t * z0Ptr,uint64_t * z1Ptr)472 static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
473                           uint64_t *z0Ptr, uint64_t *z1Ptr)
474 {
475     bool c = 0;
476     *z1Ptr = usub64_borrow(a1, b1, &c);
477     *z0Ptr = usub64_borrow(a0, b0, &c);
478 }
479 
480 /*----------------------------------------------------------------------------
481 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
482 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
483 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
484 | result is broken into three 64-bit pieces which are stored at the locations
485 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
486 *----------------------------------------------------------------------------*/
487 
sub192(uint64_t a0,uint64_t a1,uint64_t a2,uint64_t b0,uint64_t b1,uint64_t b2,uint64_t * z0Ptr,uint64_t * z1Ptr,uint64_t * z2Ptr)488 static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
489                           uint64_t b0, uint64_t b1, uint64_t b2,
490                           uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
491 {
492     bool c = 0;
493     *z2Ptr = usub64_borrow(a2, b2, &c);
494     *z1Ptr = usub64_borrow(a1, b1, &c);
495     *z0Ptr = usub64_borrow(a0, b0, &c);
496 }
497 
498 /*----------------------------------------------------------------------------
499 | Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
500 | into two 64-bit pieces which are stored at the locations pointed to by
501 | `z0Ptr' and `z1Ptr'.
502 *----------------------------------------------------------------------------*/
503 
504 static inline void
mul64To128(uint64_t a,uint64_t b,uint64_t * z0Ptr,uint64_t * z1Ptr)505 mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr)
506 {
507     mulu64(z1Ptr, z0Ptr, a, b);
508 }
509 
510 /*----------------------------------------------------------------------------
511 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
512 | `b' to obtain a 192-bit product.  The product is broken into three 64-bit
513 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
514 | `z2Ptr'.
515 *----------------------------------------------------------------------------*/
516 
517 static inline void
mul128By64To192(uint64_t a0,uint64_t a1,uint64_t b,uint64_t * z0Ptr,uint64_t * z1Ptr,uint64_t * z2Ptr)518 mul128By64To192(uint64_t a0, uint64_t a1, uint64_t b,
519                 uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
520 {
521     uint64_t z0, z1, m1;
522 
523     mul64To128(a1, b, &m1, z2Ptr);
524     mul64To128(a0, b, &z0, &z1);
525     add128(z0, z1, 0, m1, z0Ptr, z1Ptr);
526 }
527 
528 /*----------------------------------------------------------------------------
529 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
530 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
531 | product.  The product is broken into four 64-bit pieces which are stored at
532 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
533 *----------------------------------------------------------------------------*/
534 
mul128To256(uint64_t a0,uint64_t a1,uint64_t b0,uint64_t b1,uint64_t * z0Ptr,uint64_t * z1Ptr,uint64_t * z2Ptr,uint64_t * z3Ptr)535 static inline void mul128To256(uint64_t a0, uint64_t a1,
536                                uint64_t b0, uint64_t b1,
537                                uint64_t *z0Ptr, uint64_t *z1Ptr,
538                                uint64_t *z2Ptr, uint64_t *z3Ptr)
539 {
540     uint64_t z0, z1, z2;
541     uint64_t m0, m1, m2, n1, n2;
542 
543     mul64To128(a1, b0, &m1, &m2);
544     mul64To128(a0, b1, &n1, &n2);
545     mul64To128(a1, b1, &z2, z3Ptr);
546     mul64To128(a0, b0, &z0, &z1);
547 
548     add192( 0, m1, m2,  0, n1, n2, &m0, &m1, &m2);
549     add192(m0, m1, m2, z0, z1, z2, z0Ptr, z1Ptr, z2Ptr);
550 }
551 
552 /*----------------------------------------------------------------------------
553 | Returns an approximation to the 64-bit integer quotient obtained by dividing
554 | `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
555 | divisor `b' must be at least 2^63.  If q is the exact quotient truncated
556 | toward zero, the approximation returned lies between q and q + 2 inclusive.
557 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
558 | unsigned integer is returned.
559 *----------------------------------------------------------------------------*/
560 
estimateDiv128To64(uint64_t a0,uint64_t a1,uint64_t b)561 static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
562 {
563     uint64_t b0, b1;
564     uint64_t rem0, rem1, term0, term1;
565     uint64_t z;
566 
567     if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
568     b0 = b>>32;
569     z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
570     mul64To128( b, z, &term0, &term1 );
571     sub128( a0, a1, term0, term1, &rem0, &rem1 );
572     while ( ( (int64_t) rem0 ) < 0 ) {
573         z -= UINT64_C(0x100000000);
574         b1 = b<<32;
575         add128( rem0, rem1, b0, b1, &rem0, &rem1 );
576     }
577     rem0 = ( rem0<<32 ) | ( rem1>>32 );
578     z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
579     return z;
580 
581 }
582 
583 /*----------------------------------------------------------------------------
584 | Returns an approximation to the square root of the 32-bit significand given
585 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
586 | `aExp' (the least significant bit) is 1, the integer returned approximates
587 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
588 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
589 | case, the approximation returned lies strictly within +/-2 of the exact
590 | value.
591 *----------------------------------------------------------------------------*/
592 
estimateSqrt32(int aExp,uint32_t a)593 static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
594 {
595     static const uint16_t sqrtOddAdjustments[] = {
596         0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
597         0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
598     };
599     static const uint16_t sqrtEvenAdjustments[] = {
600         0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
601         0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
602     };
603     int8_t index;
604     uint32_t z;
605 
606     index = ( a>>27 ) & 15;
607     if ( aExp & 1 ) {
608         z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
609         z = ( ( a / z )<<14 ) + ( z<<15 );
610         a >>= 1;
611     }
612     else {
613         z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
614         z = a / z + z;
615         z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
616         if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
617     }
618     return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
619 
620 }
621 
622 /*----------------------------------------------------------------------------
623 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
624 | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
625 | Otherwise, returns 0.
626 *----------------------------------------------------------------------------*/
627 
eq128(uint64_t a0,uint64_t a1,uint64_t b0,uint64_t b1)628 static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
629 {
630     return a0 == b0 && a1 == b1;
631 }
632 
633 /*----------------------------------------------------------------------------
634 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
635 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
636 | Otherwise, returns 0.
637 *----------------------------------------------------------------------------*/
638 
le128(uint64_t a0,uint64_t a1,uint64_t b0,uint64_t b1)639 static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
640 {
641     return a0 < b0 || (a0 == b0 && a1 <= b1);
642 }
643 
644 /*----------------------------------------------------------------------------
645 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
646 | than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
647 | returns 0.
648 *----------------------------------------------------------------------------*/
649 
lt128(uint64_t a0,uint64_t a1,uint64_t b0,uint64_t b1)650 static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
651 {
652     return a0 < b0 || (a0 == b0 && a1 < b1);
653 }
654 
655 /*----------------------------------------------------------------------------
656 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
657 | not equal to the 128-bit value formed by concatenating `b0' and `b1'.
658 | Otherwise, returns 0.
659 *----------------------------------------------------------------------------*/
660 
ne128(uint64_t a0,uint64_t a1,uint64_t b0,uint64_t b1)661 static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
662 {
663     return a0 != b0 || a1 != b1;
664 }
665 
666 /*
667  * Similarly, comparisons of 192-bit values.
668  */
669 
eq192(uint64_t a0,uint64_t a1,uint64_t a2,uint64_t b0,uint64_t b1,uint64_t b2)670 static inline bool eq192(uint64_t a0, uint64_t a1, uint64_t a2,
671                          uint64_t b0, uint64_t b1, uint64_t b2)
672 {
673     return ((a0 ^ b0) | (a1 ^ b1) | (a2 ^ b2)) == 0;
674 }
675 
le192(uint64_t a0,uint64_t a1,uint64_t a2,uint64_t b0,uint64_t b1,uint64_t b2)676 static inline bool le192(uint64_t a0, uint64_t a1, uint64_t a2,
677                          uint64_t b0, uint64_t b1, uint64_t b2)
678 {
679     if (a0 != b0) {
680         return a0 < b0;
681     }
682     if (a1 != b1) {
683         return a1 < b1;
684     }
685     return a2 <= b2;
686 }
687 
lt192(uint64_t a0,uint64_t a1,uint64_t a2,uint64_t b0,uint64_t b1,uint64_t b2)688 static inline bool lt192(uint64_t a0, uint64_t a1, uint64_t a2,
689                          uint64_t b0, uint64_t b1, uint64_t b2)
690 {
691     if (a0 != b0) {
692         return a0 < b0;
693     }
694     if (a1 != b1) {
695         return a1 < b1;
696     }
697     return a2 < b2;
698 }
699 
700 #endif
701