Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20211016' into staging
[qemu.git] / include / fpu / softfloat-macros.h
1 /*
2 * QEMU float support macros
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 #ifndef FPU_SOFTFLOAT_MACROS_H
83 #define FPU_SOFTFLOAT_MACROS_H
84
85 #include "fpu/softfloat-types.h"
86 #include "qemu/host-utils.h"
87
88 /**
89 * shl_double: double-word merging left shift
90 * @l: left or most-significant word
91 * @r: right or least-significant word
92 * @c: shift count
93 *
94 * Shift @l left by @c bits, shifting in bits from @r.
95 */
96 static inline uint64_t shl_double(uint64_t l, uint64_t r, int c)
97 {
98 #if defined(__x86_64__)
99 asm("shld %b2, %1, %0" : "+r"(l) : "r"(r), "ci"(c));
100 return l;
101 #else
102 return c ? (l << c) | (r >> (64 - c)) : l;
103 #endif
104 }
105
106 /**
107 * shr_double: double-word merging right shift
108 * @l: left or most-significant word
109 * @r: right or least-significant word
110 * @c: shift count
111 *
112 * Shift @r right by @c bits, shifting in bits from @l.
113 */
114 static inline uint64_t shr_double(uint64_t l, uint64_t r, int c)
115 {
116 #if defined(__x86_64__)
117 asm("shrd %b2, %1, %0" : "+r"(r) : "r"(l), "ci"(c));
118 return r;
119 #else
120 return c ? (r >> c) | (l << (64 - c)) : r;
121 #endif
122 }
123
124 /*----------------------------------------------------------------------------
125 | Shifts `a' right by the number of bits given in `count'. If any nonzero
126 | bits are shifted off, they are ``jammed'' into the least significant bit of
127 | the result by setting the least significant bit to 1. The value of `count'
128 | can be arbitrarily large; in particular, if `count' is greater than 32, the
129 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
130 | The result is stored in the location pointed to by `zPtr'.
131 *----------------------------------------------------------------------------*/
132
133 static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
134 {
135 uint32_t z;
136
137 if ( count == 0 ) {
138 z = a;
139 }
140 else if ( count < 32 ) {
141 z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
142 }
143 else {
144 z = ( a != 0 );
145 }
146 *zPtr = z;
147
148 }
149
150 /*----------------------------------------------------------------------------
151 | Shifts `a' right by the number of bits given in `count'. If any nonzero
152 | bits are shifted off, they are ``jammed'' into the least significant bit of
153 | the result by setting the least significant bit to 1. The value of `count'
154 | can be arbitrarily large; in particular, if `count' is greater than 64, the
155 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
156 | The result is stored in the location pointed to by `zPtr'.
157 *----------------------------------------------------------------------------*/
158
159 static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
160 {
161 uint64_t z;
162
163 if ( count == 0 ) {
164 z = a;
165 }
166 else if ( count < 64 ) {
167 z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
168 }
169 else {
170 z = ( a != 0 );
171 }
172 *zPtr = z;
173
174 }
175
176 /*----------------------------------------------------------------------------
177 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
178 | _plus_ the number of bits given in `count'. The shifted result is at most
179 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
180 | bits shifted off form a second 64-bit result as follows: The _last_ bit
181 | shifted off is the most-significant bit of the extra result, and the other
182 | 63 bits of the extra result are all zero if and only if _all_but_the_last_
183 | bits shifted off were all zero. This extra result is stored in the location
184 | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
185 | (This routine makes more sense if `a0' and `a1' are considered to form a
186 | fixed-point value with binary point between `a0' and `a1'. This fixed-point
187 | value is shifted right by the number of bits given in `count', and the
188 | integer part of the result is returned at the location pointed to by
189 | `z0Ptr'. The fractional part of the result may be slightly corrupted as
190 | described above, and is returned at the location pointed to by `z1Ptr'.)
191 *----------------------------------------------------------------------------*/
192
193 static inline void
194 shift64ExtraRightJamming(
195 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
196 {
197 uint64_t z0, z1;
198 int8_t negCount = ( - count ) & 63;
199
200 if ( count == 0 ) {
201 z1 = a1;
202 z0 = a0;
203 }
204 else if ( count < 64 ) {
205 z1 = ( a0<<negCount ) | ( a1 != 0 );
206 z0 = a0>>count;
207 }
208 else {
209 if ( count == 64 ) {
210 z1 = a0 | ( a1 != 0 );
211 }
212 else {
213 z1 = ( ( a0 | a1 ) != 0 );
214 }
215 z0 = 0;
216 }
217 *z1Ptr = z1;
218 *z0Ptr = z0;
219
220 }
221
222 /*----------------------------------------------------------------------------
223 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
224 | number of bits given in `count'. Any bits shifted off are lost. The value
225 | of `count' can be arbitrarily large; in particular, if `count' is greater
226 | than 128, the result will be 0. The result is broken into two 64-bit pieces
227 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
228 *----------------------------------------------------------------------------*/
229
230 static inline void
231 shift128Right(
232 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
233 {
234 uint64_t z0, z1;
235 int8_t negCount = ( - count ) & 63;
236
237 if ( count == 0 ) {
238 z1 = a1;
239 z0 = a0;
240 }
241 else if ( count < 64 ) {
242 z1 = ( a0<<negCount ) | ( a1>>count );
243 z0 = a0>>count;
244 }
245 else {
246 z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
247 z0 = 0;
248 }
249 *z1Ptr = z1;
250 *z0Ptr = z0;
251
252 }
253
254 /*----------------------------------------------------------------------------
255 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
256 | number of bits given in `count'. If any nonzero bits are shifted off, they
257 | are ``jammed'' into the least significant bit of the result by setting the
258 | least significant bit to 1. The value of `count' can be arbitrarily large;
259 | in particular, if `count' is greater than 128, the result will be either
260 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
261 | nonzero. The result is broken into two 64-bit pieces which are stored at
262 | the locations pointed to by `z0Ptr' and `z1Ptr'.
263 *----------------------------------------------------------------------------*/
264
265 static inline void
266 shift128RightJamming(
267 uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
268 {
269 uint64_t z0, z1;
270 int8_t negCount = ( - count ) & 63;
271
272 if ( count == 0 ) {
273 z1 = a1;
274 z0 = a0;
275 }
276 else if ( count < 64 ) {
277 z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
278 z0 = a0>>count;
279 }
280 else {
281 if ( count == 64 ) {
282 z1 = a0 | ( a1 != 0 );
283 }
284 else if ( count < 128 ) {
285 z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
286 }
287 else {
288 z1 = ( ( a0 | a1 ) != 0 );
289 }
290 z0 = 0;
291 }
292 *z1Ptr = z1;
293 *z0Ptr = z0;
294
295 }
296
297 /*----------------------------------------------------------------------------
298 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
299 | by 64 _plus_ the number of bits given in `count'. The shifted result is
300 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
301 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
302 | off form a third 64-bit result as follows: The _last_ bit shifted off is
303 | the most-significant bit of the extra result, and the other 63 bits of the
304 | extra result are all zero if and only if _all_but_the_last_ bits shifted off
305 | were all zero. This extra result is stored in the location pointed to by
306 | `z2Ptr'. The value of `count' can be arbitrarily large.
307 | (This routine makes more sense if `a0', `a1', and `a2' are considered
308 | to form a fixed-point value with binary point between `a1' and `a2'. This
309 | fixed-point value is shifted right by the number of bits given in `count',
310 | and the integer part of the result is returned at the locations pointed to
311 | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
312 | corrupted as described above, and is returned at the location pointed to by
313 | `z2Ptr'.)
314 *----------------------------------------------------------------------------*/
315
316 static inline void
317 shift128ExtraRightJamming(
318 uint64_t a0,
319 uint64_t a1,
320 uint64_t a2,
321 int count,
322 uint64_t *z0Ptr,
323 uint64_t *z1Ptr,
324 uint64_t *z2Ptr
325 )
326 {
327 uint64_t z0, z1, z2;
328 int8_t negCount = ( - count ) & 63;
329
330 if ( count == 0 ) {
331 z2 = a2;
332 z1 = a1;
333 z0 = a0;
334 }
335 else {
336 if ( count < 64 ) {
337 z2 = a1<<negCount;
338 z1 = ( a0<<negCount ) | ( a1>>count );
339 z0 = a0>>count;
340 }
341 else {
342 if ( count == 64 ) {
343 z2 = a1;
344 z1 = a0;
345 }
346 else {
347 a2 |= a1;
348 if ( count < 128 ) {
349 z2 = a0<<negCount;
350 z1 = a0>>( count & 63 );
351 }
352 else {
353 z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
354 z1 = 0;
355 }
356 }
357 z0 = 0;
358 }
359 z2 |= ( a2 != 0 );
360 }
361 *z2Ptr = z2;
362 *z1Ptr = z1;
363 *z0Ptr = z0;
364
365 }
366
367 /*----------------------------------------------------------------------------
368 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
369 | number of bits given in `count'. Any bits shifted off are lost. The value
370 | of `count' must be less than 64. The result is broken into two 64-bit
371 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
372 *----------------------------------------------------------------------------*/
373
374 static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
375 uint64_t *z0Ptr, uint64_t *z1Ptr)
376 {
377 *z1Ptr = a1 << count;
378 *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
379 }
380
381 /*----------------------------------------------------------------------------
382 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
383 | number of bits given in `count'. Any bits shifted off are lost. The value
384 | of `count' may be greater than 64. The result is broken into two 64-bit
385 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
386 *----------------------------------------------------------------------------*/
387
388 static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
389 uint64_t *z0Ptr, uint64_t *z1Ptr)
390 {
391 if (count < 64) {
392 *z1Ptr = a1 << count;
393 *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
394 } else {
395 *z1Ptr = 0;
396 *z0Ptr = a1 << (count - 64);
397 }
398 }
399
400 /*----------------------------------------------------------------------------
401 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
402 | by the number of bits given in `count'. Any bits shifted off are lost.
403 | The value of `count' must be less than 64. The result is broken into three
404 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
405 | `z1Ptr', and `z2Ptr'.
406 *----------------------------------------------------------------------------*/
407
408 static inline void
409 shortShift192Left(
410 uint64_t a0,
411 uint64_t a1,
412 uint64_t a2,
413 int count,
414 uint64_t *z0Ptr,
415 uint64_t *z1Ptr,
416 uint64_t *z2Ptr
417 )
418 {
419 uint64_t z0, z1, z2;
420 int8_t negCount;
421
422 z2 = a2<<count;
423 z1 = a1<<count;
424 z0 = a0<<count;
425 if ( 0 < count ) {
426 negCount = ( ( - count ) & 63 );
427 z1 |= a2>>negCount;
428 z0 |= a1>>negCount;
429 }
430 *z2Ptr = z2;
431 *z1Ptr = z1;
432 *z0Ptr = z0;
433
434 }
435
436 /*----------------------------------------------------------------------------
437 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
438 | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
439 | any carry out is lost. The result is broken into two 64-bit pieces which
440 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
441 *----------------------------------------------------------------------------*/
442
443 static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
444 uint64_t *z0Ptr, uint64_t *z1Ptr)
445 {
446 bool c = 0;
447 *z1Ptr = uadd64_carry(a1, b1, &c);
448 *z0Ptr = uadd64_carry(a0, b0, &c);
449 }
450
451 /*----------------------------------------------------------------------------
452 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
453 | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
454 | modulo 2^192, so any carry out is lost. The result is broken into three
455 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
456 | `z1Ptr', and `z2Ptr'.
457 *----------------------------------------------------------------------------*/
458
459 static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
460 uint64_t b0, uint64_t b1, uint64_t b2,
461 uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
462 {
463 bool c = 0;
464 *z2Ptr = uadd64_carry(a2, b2, &c);
465 *z1Ptr = uadd64_carry(a1, b1, &c);
466 *z0Ptr = uadd64_carry(a0, b0, &c);
467 }
468
469 /*----------------------------------------------------------------------------
470 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
471 | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
472 | 2^128, so any borrow out (carry out) is lost. The result is broken into two
473 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
474 | `z1Ptr'.
475 *----------------------------------------------------------------------------*/
476
477 static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
478 uint64_t *z0Ptr, uint64_t *z1Ptr)
479 {
480 bool c = 0;
481 *z1Ptr = usub64_borrow(a1, b1, &c);
482 *z0Ptr = usub64_borrow(a0, b0, &c);
483 }
484
485 /*----------------------------------------------------------------------------
486 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
487 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
488 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
489 | result is broken into three 64-bit pieces which are stored at the locations
490 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
491 *----------------------------------------------------------------------------*/
492
493 static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
494 uint64_t b0, uint64_t b1, uint64_t b2,
495 uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
496 {
497 bool c = 0;
498 *z2Ptr = usub64_borrow(a2, b2, &c);
499 *z1Ptr = usub64_borrow(a1, b1, &c);
500 *z0Ptr = usub64_borrow(a0, b0, &c);
501 }
502
503 /*----------------------------------------------------------------------------
504 | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
505 | into two 64-bit pieces which are stored at the locations pointed to by
506 | `z0Ptr' and `z1Ptr'.
507 *----------------------------------------------------------------------------*/
508
509 static inline void
510 mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr)
511 {
512 mulu64(z1Ptr, z0Ptr, a, b);
513 }
514
515 /*----------------------------------------------------------------------------
516 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
517 | `b' to obtain a 192-bit product. The product is broken into three 64-bit
518 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
519 | `z2Ptr'.
520 *----------------------------------------------------------------------------*/
521
522 static inline void
523 mul128By64To192(uint64_t a0, uint64_t a1, uint64_t b,
524 uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
525 {
526 uint64_t z0, z1, m1;
527
528 mul64To128(a1, b, &m1, z2Ptr);
529 mul64To128(a0, b, &z0, &z1);
530 add128(z0, z1, 0, m1, z0Ptr, z1Ptr);
531 }
532
533 /*----------------------------------------------------------------------------
534 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
535 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
536 | product. The product is broken into four 64-bit pieces which are stored at
537 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
538 *----------------------------------------------------------------------------*/
539
540 static inline void mul128To256(uint64_t a0, uint64_t a1,
541 uint64_t b0, uint64_t b1,
542 uint64_t *z0Ptr, uint64_t *z1Ptr,
543 uint64_t *z2Ptr, uint64_t *z3Ptr)
544 {
545 uint64_t z0, z1, z2;
546 uint64_t m0, m1, m2, n1, n2;
547
548 mul64To128(a1, b0, &m1, &m2);
549 mul64To128(a0, b1, &n1, &n2);
550 mul64To128(a1, b1, &z2, z3Ptr);
551 mul64To128(a0, b0, &z0, &z1);
552
553 add192( 0, m1, m2, 0, n1, n2, &m0, &m1, &m2);
554 add192(m0, m1, m2, z0, z1, z2, z0Ptr, z1Ptr, z2Ptr);
555 }
556
557 /*----------------------------------------------------------------------------
558 | Returns an approximation to the 64-bit integer quotient obtained by dividing
559 | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
560 | divisor `b' must be at least 2^63. If q is the exact quotient truncated
561 | toward zero, the approximation returned lies between q and q + 2 inclusive.
562 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
563 | unsigned integer is returned.
564 *----------------------------------------------------------------------------*/
565
566 static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
567 {
568 uint64_t b0, b1;
569 uint64_t rem0, rem1, term0, term1;
570 uint64_t z;
571
572 if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
573 b0 = b>>32;
574 z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
575 mul64To128( b, z, &term0, &term1 );
576 sub128( a0, a1, term0, term1, &rem0, &rem1 );
577 while ( ( (int64_t) rem0 ) < 0 ) {
578 z -= UINT64_C(0x100000000);
579 b1 = b<<32;
580 add128( rem0, rem1, b0, b1, &rem0, &rem1 );
581 }
582 rem0 = ( rem0<<32 ) | ( rem1>>32 );
583 z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
584 return z;
585
586 }
587
588 /* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
589 * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
590 *
591 * Licensed under the GPLv2/LGPLv3
592 */
593 static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
594 uint64_t n0, uint64_t d)
595 {
596 #if defined(__x86_64__)
597 uint64_t q;
598 asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
599 return q;
600 #elif defined(__s390x__) && !defined(__clang__)
601 /* Need to use a TImode type to get an even register pair for DLGR. */
602 unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
603 asm("dlgr %0, %1" : "+r"(n) : "r"(d));
604 *r = n >> 64;
605 return n;
606 #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
607 /* From Power ISA 2.06, programming note for divdeu. */
608 uint64_t q1, q2, Q, r1, r2, R;
609 asm("divdeu %0,%2,%4; divdu %1,%3,%4"
610 : "=&r"(q1), "=r"(q2)
611 : "r"(n1), "r"(n0), "r"(d));
612 r1 = -(q1 * d); /* low part of (n1<<64) - (q1 * d) */
613 r2 = n0 - (q2 * d);
614 Q = q1 + q2;
615 R = r1 + r2;
616 if (R >= d || R < r2) { /* overflow implies R > d */
617 Q += 1;
618 R -= d;
619 }
620 *r = R;
621 return Q;
622 #else
623 uint64_t d0, d1, q0, q1, r1, r0, m;
624
625 d0 = (uint32_t)d;
626 d1 = d >> 32;
627
628 r1 = n1 % d1;
629 q1 = n1 / d1;
630 m = q1 * d0;
631 r1 = (r1 << 32) | (n0 >> 32);
632 if (r1 < m) {
633 q1 -= 1;
634 r1 += d;
635 if (r1 >= d) {
636 if (r1 < m) {
637 q1 -= 1;
638 r1 += d;
639 }
640 }
641 }
642 r1 -= m;
643
644 r0 = r1 % d1;
645 q0 = r1 / d1;
646 m = q0 * d0;
647 r0 = (r0 << 32) | (uint32_t)n0;
648 if (r0 < m) {
649 q0 -= 1;
650 r0 += d;
651 if (r0 >= d) {
652 if (r0 < m) {
653 q0 -= 1;
654 r0 += d;
655 }
656 }
657 }
658 r0 -= m;
659
660 *r = r0;
661 return (q1 << 32) | q0;
662 #endif
663 }
664
665 /*----------------------------------------------------------------------------
666 | Returns an approximation to the square root of the 32-bit significand given
667 | by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
668 | `aExp' (the least significant bit) is 1, the integer returned approximates
669 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
670 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
671 | case, the approximation returned lies strictly within +/-2 of the exact
672 | value.
673 *----------------------------------------------------------------------------*/
674
675 static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
676 {
677 static const uint16_t sqrtOddAdjustments[] = {
678 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
679 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
680 };
681 static const uint16_t sqrtEvenAdjustments[] = {
682 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
683 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
684 };
685 int8_t index;
686 uint32_t z;
687
688 index = ( a>>27 ) & 15;
689 if ( aExp & 1 ) {
690 z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
691 z = ( ( a / z )<<14 ) + ( z<<15 );
692 a >>= 1;
693 }
694 else {
695 z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
696 z = a / z + z;
697 z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
698 if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
699 }
700 return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
701
702 }
703
704 /*----------------------------------------------------------------------------
705 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
706 | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
707 | Otherwise, returns 0.
708 *----------------------------------------------------------------------------*/
709
710 static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
711 {
712 return a0 == b0 && a1 == b1;
713 }
714
715 /*----------------------------------------------------------------------------
716 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
717 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
718 | Otherwise, returns 0.
719 *----------------------------------------------------------------------------*/
720
721 static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
722 {
723 return a0 < b0 || (a0 == b0 && a1 <= b1);
724 }
725
726 /*----------------------------------------------------------------------------
727 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
728 | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
729 | returns 0.
730 *----------------------------------------------------------------------------*/
731
732 static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
733 {
734 return a0 < b0 || (a0 == b0 && a1 < b1);
735 }
736
737 /*----------------------------------------------------------------------------
738 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
739 | not equal to the 128-bit value formed by concatenating `b0' and `b1'.
740 | Otherwise, returns 0.
741 *----------------------------------------------------------------------------*/
742
743 static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
744 {
745 return a0 != b0 || a1 != b1;
746 }
747
748 /*
749 * Similarly, comparisons of 192-bit values.
750 */
751
752 static inline bool eq192(uint64_t a0, uint64_t a1, uint64_t a2,
753 uint64_t b0, uint64_t b1, uint64_t b2)
754 {
755 return ((a0 ^ b0) | (a1 ^ b1) | (a2 ^ b2)) == 0;
756 }
757
758 static inline bool le192(uint64_t a0, uint64_t a1, uint64_t a2,
759 uint64_t b0, uint64_t b1, uint64_t b2)
760 {
761 if (a0 != b0) {
762 return a0 < b0;
763 }
764 if (a1 != b1) {
765 return a1 < b1;
766 }
767 return a2 <= b2;
768 }
769
770 static inline bool lt192(uint64_t a0, uint64_t a1, uint64_t a2,
771 uint64_t b0, uint64_t b1, uint64_t b2)
772 {
773 if (a0 != b0) {
774 return a0 < b0;
775 }
776 if (a1 != b1) {
777 return a1 < b1;
778 }
779 return a2 < b2;
780 }
781
782 #endif