block: honor BDRV_O_ALLOW_RDWR when clearing bs->read_only
[qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86
87 #include "fpu/softfloat.h"
88
89 /* We only need stdlib for abort() */
90
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114 return float16_val(a) & 0x3ff;
115 }
116
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120
121 static inline int extractFloat16Exp(float16 a)
122 {
123 return (float16_val(a) >> 10) & 0x1f;
124 }
125
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129
130 static inline flag extractFloat16Sign(float16 a)
131 {
132 return float16_val(a)>>15;
133 }
134
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input. If `zSign' is 1, the input is negated before being converted to an
139 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer. However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
145
146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
147 {
148 int8_t roundingMode;
149 flag roundNearestEven;
150 int8_t roundIncrement, roundBits;
151 int32_t z;
152
153 roundingMode = status->float_rounding_mode;
154 roundNearestEven = ( roundingMode == float_round_nearest_even );
155 switch (roundingMode) {
156 case float_round_nearest_even:
157 case float_round_ties_away:
158 roundIncrement = 0x40;
159 break;
160 case float_round_to_zero:
161 roundIncrement = 0;
162 break;
163 case float_round_up:
164 roundIncrement = zSign ? 0 : 0x7f;
165 break;
166 case float_round_down:
167 roundIncrement = zSign ? 0x7f : 0;
168 break;
169 default:
170 abort();
171 }
172 roundBits = absZ & 0x7F;
173 absZ = ( absZ + roundIncrement )>>7;
174 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
175 z = absZ;
176 if ( zSign ) z = - z;
177 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
178 float_raise(float_flag_invalid, status);
179 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
180 }
181 if (roundBits) {
182 status->float_exception_flags |= float_flag_inexact;
183 }
184 return z;
185
186 }
187
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer. However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
197 | returned.
198 *----------------------------------------------------------------------------*/
199
200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
201 float_status *status)
202 {
203 int8_t roundingMode;
204 flag roundNearestEven, increment;
205 int64_t z;
206
207 roundingMode = status->float_rounding_mode;
208 roundNearestEven = ( roundingMode == float_round_nearest_even );
209 switch (roundingMode) {
210 case float_round_nearest_even:
211 case float_round_ties_away:
212 increment = ((int64_t) absZ1 < 0);
213 break;
214 case float_round_to_zero:
215 increment = 0;
216 break;
217 case float_round_up:
218 increment = !zSign && absZ1;
219 break;
220 case float_round_down:
221 increment = zSign && absZ1;
222 break;
223 default:
224 abort();
225 }
226 if ( increment ) {
227 ++absZ0;
228 if ( absZ0 == 0 ) goto overflow;
229 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
230 }
231 z = absZ0;
232 if ( zSign ) z = - z;
233 if ( z && ( ( z < 0 ) ^ zSign ) ) {
234 overflow:
235 float_raise(float_flag_invalid, status);
236 return
237 zSign ? (int64_t) LIT64( 0x8000000000000000 )
238 : LIT64( 0x7FFFFFFFFFFFFFFF );
239 }
240 if (absZ1) {
241 status->float_exception_flags |= float_flag_inexact;
242 }
243 return z;
244
245 }
246
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer. However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
256
257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
258 uint64_t absZ1, float_status *status)
259 {
260 int8_t roundingMode;
261 flag roundNearestEven, increment;
262
263 roundingMode = status->float_rounding_mode;
264 roundNearestEven = (roundingMode == float_round_nearest_even);
265 switch (roundingMode) {
266 case float_round_nearest_even:
267 case float_round_ties_away:
268 increment = ((int64_t)absZ1 < 0);
269 break;
270 case float_round_to_zero:
271 increment = 0;
272 break;
273 case float_round_up:
274 increment = !zSign && absZ1;
275 break;
276 case float_round_down:
277 increment = zSign && absZ1;
278 break;
279 default:
280 abort();
281 }
282 if (increment) {
283 ++absZ0;
284 if (absZ0 == 0) {
285 float_raise(float_flag_invalid, status);
286 return LIT64(0xFFFFFFFFFFFFFFFF);
287 }
288 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
289 }
290
291 if (zSign && absZ0) {
292 float_raise(float_flag_invalid, status);
293 return 0;
294 }
295
296 if (absZ1) {
297 status->float_exception_flags |= float_flag_inexact;
298 }
299 return absZ0;
300 }
301
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
305
306 static inline uint32_t extractFloat32Frac( float32 a )
307 {
308
309 return float32_val(a) & 0x007FFFFF;
310
311 }
312
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
316
317 static inline int extractFloat32Exp(float32 a)
318 {
319
320 return ( float32_val(a)>>23 ) & 0xFF;
321
322 }
323
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
327
328 static inline flag extractFloat32Sign( float32 a )
329 {
330
331 return float32_val(a)>>31;
332
333 }
334
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32 float32_squash_input_denormal(float32 a, float_status *status)
340 {
341 if (status->flush_inputs_to_zero) {
342 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
343 float_raise(float_flag_input_denormal, status);
344 return make_float32(float32_val(a) & 0x80000000);
345 }
346 }
347 return a;
348 }
349
350 /*----------------------------------------------------------------------------
351 | Normalizes the subnormal single-precision floating-point value represented
352 | by the denormalized significand `aSig'. The normalized exponent and
353 | significand are stored at the locations pointed to by `zExpPtr' and
354 | `zSigPtr', respectively.
355 *----------------------------------------------------------------------------*/
356
357 static void
358 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
359 {
360 int8_t shiftCount;
361
362 shiftCount = countLeadingZeros32( aSig ) - 8;
363 *zSigPtr = aSig<<shiftCount;
364 *zExpPtr = 1 - shiftCount;
365
366 }
367
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result. After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result. This means that any integer portion of `zSig'
373 | will be added into the exponent. Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
376 | significand.
377 *----------------------------------------------------------------------------*/
378
379 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
380 {
381
382 return make_float32(
383 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
384
385 }
386
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input. Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly. However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned. If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 | The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location. This shifted
401 | significand must be normalized or smaller. If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding. In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
408
409 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
410 float_status *status)
411 {
412 int8_t roundingMode;
413 flag roundNearestEven;
414 int8_t roundIncrement, roundBits;
415 flag isTiny;
416
417 roundingMode = status->float_rounding_mode;
418 roundNearestEven = ( roundingMode == float_round_nearest_even );
419 switch (roundingMode) {
420 case float_round_nearest_even:
421 case float_round_ties_away:
422 roundIncrement = 0x40;
423 break;
424 case float_round_to_zero:
425 roundIncrement = 0;
426 break;
427 case float_round_up:
428 roundIncrement = zSign ? 0 : 0x7f;
429 break;
430 case float_round_down:
431 roundIncrement = zSign ? 0x7f : 0;
432 break;
433 default:
434 abort();
435 break;
436 }
437 roundBits = zSig & 0x7F;
438 if ( 0xFD <= (uint16_t) zExp ) {
439 if ( ( 0xFD < zExp )
440 || ( ( zExp == 0xFD )
441 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
442 ) {
443 float_raise(float_flag_overflow | float_flag_inexact, status);
444 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
445 }
446 if ( zExp < 0 ) {
447 if (status->flush_to_zero) {
448 float_raise(float_flag_output_denormal, status);
449 return packFloat32(zSign, 0, 0);
450 }
451 isTiny =
452 (status->float_detect_tininess
453 == float_tininess_before_rounding)
454 || ( zExp < -1 )
455 || ( zSig + roundIncrement < 0x80000000 );
456 shift32RightJamming( zSig, - zExp, &zSig );
457 zExp = 0;
458 roundBits = zSig & 0x7F;
459 if (isTiny && roundBits) {
460 float_raise(float_flag_underflow, status);
461 }
462 }
463 }
464 if (roundBits) {
465 status->float_exception_flags |= float_flag_inexact;
466 }
467 zSig = ( zSig + roundIncrement )>>7;
468 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
469 if ( zSig == 0 ) zExp = 0;
470 return packFloat32( zSign, zExp, zSig );
471
472 }
473
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input. This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
482
483 static float32
484 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
485 float_status *status)
486 {
487 int8_t shiftCount;
488
489 shiftCount = countLeadingZeros32( zSig ) - 1;
490 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
491 status);
492
493 }
494
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
498
499 static inline uint64_t extractFloat64Frac( float64 a )
500 {
501
502 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
503
504 }
505
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
509
510 static inline int extractFloat64Exp(float64 a)
511 {
512
513 return ( float64_val(a)>>52 ) & 0x7FF;
514
515 }
516
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
520
521 static inline flag extractFloat64Sign( float64 a )
522 {
523
524 return float64_val(a)>>63;
525
526 }
527
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64 float64_squash_input_denormal(float64 a, float_status *status)
533 {
534 if (status->flush_inputs_to_zero) {
535 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
536 float_raise(float_flag_input_denormal, status);
537 return make_float64(float64_val(a) & (1ULL << 63));
538 }
539 }
540 return a;
541 }
542
543 /*----------------------------------------------------------------------------
544 | Normalizes the subnormal double-precision floating-point value represented
545 | by the denormalized significand `aSig'. The normalized exponent and
546 | significand are stored at the locations pointed to by `zExpPtr' and
547 | `zSigPtr', respectively.
548 *----------------------------------------------------------------------------*/
549
550 static void
551 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
552 {
553 int8_t shiftCount;
554
555 shiftCount = countLeadingZeros64( aSig ) - 11;
556 *zSigPtr = aSig<<shiftCount;
557 *zExpPtr = 1 - shiftCount;
558
559 }
560
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result. After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result. This means that any integer portion of `zSig'
566 | will be added into the exponent. Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
569 | significand.
570 *----------------------------------------------------------------------------*/
571
572 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
573 {
574
575 return make_float64(
576 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
577
578 }
579
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input. Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly. However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned. If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 | The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location. This shifted
594 | significand must be normalized or smaller. If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding. In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
601
602 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
603 float_status *status)
604 {
605 int8_t roundingMode;
606 flag roundNearestEven;
607 int roundIncrement, roundBits;
608 flag isTiny;
609
610 roundingMode = status->float_rounding_mode;
611 roundNearestEven = ( roundingMode == float_round_nearest_even );
612 switch (roundingMode) {
613 case float_round_nearest_even:
614 case float_round_ties_away:
615 roundIncrement = 0x200;
616 break;
617 case float_round_to_zero:
618 roundIncrement = 0;
619 break;
620 case float_round_up:
621 roundIncrement = zSign ? 0 : 0x3ff;
622 break;
623 case float_round_down:
624 roundIncrement = zSign ? 0x3ff : 0;
625 break;
626 case float_round_to_odd:
627 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
628 break;
629 default:
630 abort();
631 }
632 roundBits = zSig & 0x3FF;
633 if ( 0x7FD <= (uint16_t) zExp ) {
634 if ( ( 0x7FD < zExp )
635 || ( ( zExp == 0x7FD )
636 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
637 ) {
638 bool overflow_to_inf = roundingMode != float_round_to_odd &&
639 roundIncrement != 0;
640 float_raise(float_flag_overflow | float_flag_inexact, status);
641 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
642 }
643 if ( zExp < 0 ) {
644 if (status->flush_to_zero) {
645 float_raise(float_flag_output_denormal, status);
646 return packFloat64(zSign, 0, 0);
647 }
648 isTiny =
649 (status->float_detect_tininess
650 == float_tininess_before_rounding)
651 || ( zExp < -1 )
652 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
653 shift64RightJamming( zSig, - zExp, &zSig );
654 zExp = 0;
655 roundBits = zSig & 0x3FF;
656 if (isTiny && roundBits) {
657 float_raise(float_flag_underflow, status);
658 }
659 if (roundingMode == float_round_to_odd) {
660 /*
661 * For round-to-odd case, the roundIncrement depends on
662 * zSig which just changed.
663 */
664 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
665 }
666 }
667 }
668 if (roundBits) {
669 status->float_exception_flags |= float_flag_inexact;
670 }
671 zSig = ( zSig + roundIncrement )>>10;
672 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
673 if ( zSig == 0 ) zExp = 0;
674 return packFloat64( zSign, zExp, zSig );
675
676 }
677
678 /*----------------------------------------------------------------------------
679 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
680 | and significand `zSig', and returns the proper double-precision floating-
681 | point value corresponding to the abstract input. This routine is just like
682 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
683 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
684 | floating-point exponent.
685 *----------------------------------------------------------------------------*/
686
687 static float64
688 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
689 float_status *status)
690 {
691 int8_t shiftCount;
692
693 shiftCount = countLeadingZeros64( zSig ) - 1;
694 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
695 status);
696
697 }
698
699 /*----------------------------------------------------------------------------
700 | Returns the fraction bits of the extended double-precision floating-point
701 | value `a'.
702 *----------------------------------------------------------------------------*/
703
704 static inline uint64_t extractFloatx80Frac( floatx80 a )
705 {
706
707 return a.low;
708
709 }
710
711 /*----------------------------------------------------------------------------
712 | Returns the exponent bits of the extended double-precision floating-point
713 | value `a'.
714 *----------------------------------------------------------------------------*/
715
716 static inline int32_t extractFloatx80Exp( floatx80 a )
717 {
718
719 return a.high & 0x7FFF;
720
721 }
722
723 /*----------------------------------------------------------------------------
724 | Returns the sign bit of the extended double-precision floating-point value
725 | `a'.
726 *----------------------------------------------------------------------------*/
727
728 static inline flag extractFloatx80Sign( floatx80 a )
729 {
730
731 return a.high>>15;
732
733 }
734
735 /*----------------------------------------------------------------------------
736 | Normalizes the subnormal extended double-precision floating-point value
737 | represented by the denormalized significand `aSig'. The normalized exponent
738 | and significand are stored at the locations pointed to by `zExpPtr' and
739 | `zSigPtr', respectively.
740 *----------------------------------------------------------------------------*/
741
742 static void
743 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
744 {
745 int8_t shiftCount;
746
747 shiftCount = countLeadingZeros64( aSig );
748 *zSigPtr = aSig<<shiftCount;
749 *zExpPtr = 1 - shiftCount;
750
751 }
752
753 /*----------------------------------------------------------------------------
754 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
755 | extended double-precision floating-point value, returning the result.
756 *----------------------------------------------------------------------------*/
757
758 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
759 {
760 floatx80 z;
761
762 z.low = zSig;
763 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
764 return z;
765
766 }
767
768 /*----------------------------------------------------------------------------
769 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
770 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
771 | and returns the proper extended double-precision floating-point value
772 | corresponding to the abstract input. Ordinarily, the abstract value is
773 | rounded and packed into the extended double-precision format, with the
774 | inexact exception raised if the abstract input cannot be represented
775 | exactly. However, if the abstract value is too large, the overflow and
776 | inexact exceptions are raised and an infinity or maximal finite value is
777 | returned. If the abstract value is too small, the input value is rounded to
778 | a subnormal number, and the underflow and inexact exceptions are raised if
779 | the abstract input cannot be represented exactly as a subnormal extended
780 | double-precision floating-point number.
781 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
782 | number of bits as single or double precision, respectively. Otherwise, the
783 | result is rounded to the full precision of the extended double-precision
784 | format.
785 | The input significand must be normalized or smaller. If the input
786 | significand is not normalized, `zExp' must be 0; in that case, the result
787 | returned is a subnormal number, and it must not require rounding. The
788 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
789 | Floating-Point Arithmetic.
790 *----------------------------------------------------------------------------*/
791
792 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
793 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
794 float_status *status)
795 {
796 int8_t roundingMode;
797 flag roundNearestEven, increment, isTiny;
798 int64_t roundIncrement, roundMask, roundBits;
799
800 roundingMode = status->float_rounding_mode;
801 roundNearestEven = ( roundingMode == float_round_nearest_even );
802 if ( roundingPrecision == 80 ) goto precision80;
803 if ( roundingPrecision == 64 ) {
804 roundIncrement = LIT64( 0x0000000000000400 );
805 roundMask = LIT64( 0x00000000000007FF );
806 }
807 else if ( roundingPrecision == 32 ) {
808 roundIncrement = LIT64( 0x0000008000000000 );
809 roundMask = LIT64( 0x000000FFFFFFFFFF );
810 }
811 else {
812 goto precision80;
813 }
814 zSig0 |= ( zSig1 != 0 );
815 switch (roundingMode) {
816 case float_round_nearest_even:
817 case float_round_ties_away:
818 break;
819 case float_round_to_zero:
820 roundIncrement = 0;
821 break;
822 case float_round_up:
823 roundIncrement = zSign ? 0 : roundMask;
824 break;
825 case float_round_down:
826 roundIncrement = zSign ? roundMask : 0;
827 break;
828 default:
829 abort();
830 }
831 roundBits = zSig0 & roundMask;
832 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
833 if ( ( 0x7FFE < zExp )
834 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
835 ) {
836 goto overflow;
837 }
838 if ( zExp <= 0 ) {
839 if (status->flush_to_zero) {
840 float_raise(float_flag_output_denormal, status);
841 return packFloatx80(zSign, 0, 0);
842 }
843 isTiny =
844 (status->float_detect_tininess
845 == float_tininess_before_rounding)
846 || ( zExp < 0 )
847 || ( zSig0 <= zSig0 + roundIncrement );
848 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
849 zExp = 0;
850 roundBits = zSig0 & roundMask;
851 if (isTiny && roundBits) {
852 float_raise(float_flag_underflow, status);
853 }
854 if (roundBits) {
855 status->float_exception_flags |= float_flag_inexact;
856 }
857 zSig0 += roundIncrement;
858 if ( (int64_t) zSig0 < 0 ) zExp = 1;
859 roundIncrement = roundMask + 1;
860 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
861 roundMask |= roundIncrement;
862 }
863 zSig0 &= ~ roundMask;
864 return packFloatx80( zSign, zExp, zSig0 );
865 }
866 }
867 if (roundBits) {
868 status->float_exception_flags |= float_flag_inexact;
869 }
870 zSig0 += roundIncrement;
871 if ( zSig0 < roundIncrement ) {
872 ++zExp;
873 zSig0 = LIT64( 0x8000000000000000 );
874 }
875 roundIncrement = roundMask + 1;
876 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
877 roundMask |= roundIncrement;
878 }
879 zSig0 &= ~ roundMask;
880 if ( zSig0 == 0 ) zExp = 0;
881 return packFloatx80( zSign, zExp, zSig0 );
882 precision80:
883 switch (roundingMode) {
884 case float_round_nearest_even:
885 case float_round_ties_away:
886 increment = ((int64_t)zSig1 < 0);
887 break;
888 case float_round_to_zero:
889 increment = 0;
890 break;
891 case float_round_up:
892 increment = !zSign && zSig1;
893 break;
894 case float_round_down:
895 increment = zSign && zSig1;
896 break;
897 default:
898 abort();
899 }
900 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
901 if ( ( 0x7FFE < zExp )
902 || ( ( zExp == 0x7FFE )
903 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
904 && increment
905 )
906 ) {
907 roundMask = 0;
908 overflow:
909 float_raise(float_flag_overflow | float_flag_inexact, status);
910 if ( ( roundingMode == float_round_to_zero )
911 || ( zSign && ( roundingMode == float_round_up ) )
912 || ( ! zSign && ( roundingMode == float_round_down ) )
913 ) {
914 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
915 }
916 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
917 }
918 if ( zExp <= 0 ) {
919 isTiny =
920 (status->float_detect_tininess
921 == float_tininess_before_rounding)
922 || ( zExp < 0 )
923 || ! increment
924 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
925 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
926 zExp = 0;
927 if (isTiny && zSig1) {
928 float_raise(float_flag_underflow, status);
929 }
930 if (zSig1) {
931 status->float_exception_flags |= float_flag_inexact;
932 }
933 switch (roundingMode) {
934 case float_round_nearest_even:
935 case float_round_ties_away:
936 increment = ((int64_t)zSig1 < 0);
937 break;
938 case float_round_to_zero:
939 increment = 0;
940 break;
941 case float_round_up:
942 increment = !zSign && zSig1;
943 break;
944 case float_round_down:
945 increment = zSign && zSig1;
946 break;
947 default:
948 abort();
949 }
950 if ( increment ) {
951 ++zSig0;
952 zSig0 &=
953 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
954 if ( (int64_t) zSig0 < 0 ) zExp = 1;
955 }
956 return packFloatx80( zSign, zExp, zSig0 );
957 }
958 }
959 if (zSig1) {
960 status->float_exception_flags |= float_flag_inexact;
961 }
962 if ( increment ) {
963 ++zSig0;
964 if ( zSig0 == 0 ) {
965 ++zExp;
966 zSig0 = LIT64( 0x8000000000000000 );
967 }
968 else {
969 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
970 }
971 }
972 else {
973 if ( zSig0 == 0 ) zExp = 0;
974 }
975 return packFloatx80( zSign, zExp, zSig0 );
976
977 }
978
979 /*----------------------------------------------------------------------------
980 | Takes an abstract floating-point value having sign `zSign', exponent
981 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
982 | and returns the proper extended double-precision floating-point value
983 | corresponding to the abstract input. This routine is just like
984 | `roundAndPackFloatx80' except that the input significand does not have to be
985 | normalized.
986 *----------------------------------------------------------------------------*/
987
988 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
989 flag zSign, int32_t zExp,
990 uint64_t zSig0, uint64_t zSig1,
991 float_status *status)
992 {
993 int8_t shiftCount;
994
995 if ( zSig0 == 0 ) {
996 zSig0 = zSig1;
997 zSig1 = 0;
998 zExp -= 64;
999 }
1000 shiftCount = countLeadingZeros64( zSig0 );
1001 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1002 zExp -= shiftCount;
1003 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
1004 zSig0, zSig1, status);
1005
1006 }
1007
1008 /*----------------------------------------------------------------------------
1009 | Returns the least-significant 64 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1012
1013 static inline uint64_t extractFloat128Frac1( float128 a )
1014 {
1015
1016 return a.low;
1017
1018 }
1019
1020 /*----------------------------------------------------------------------------
1021 | Returns the most-significant 48 fraction bits of the quadruple-precision
1022 | floating-point value `a'.
1023 *----------------------------------------------------------------------------*/
1024
1025 static inline uint64_t extractFloat128Frac0( float128 a )
1026 {
1027
1028 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1029
1030 }
1031
1032 /*----------------------------------------------------------------------------
1033 | Returns the exponent bits of the quadruple-precision floating-point value
1034 | `a'.
1035 *----------------------------------------------------------------------------*/
1036
1037 static inline int32_t extractFloat128Exp( float128 a )
1038 {
1039
1040 return ( a.high>>48 ) & 0x7FFF;
1041
1042 }
1043
1044 /*----------------------------------------------------------------------------
1045 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1046 *----------------------------------------------------------------------------*/
1047
1048 static inline flag extractFloat128Sign( float128 a )
1049 {
1050
1051 return a.high>>63;
1052
1053 }
1054
1055 /*----------------------------------------------------------------------------
1056 | Normalizes the subnormal quadruple-precision floating-point value
1057 | represented by the denormalized significand formed by the concatenation of
1058 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1059 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1060 | significand are stored at the location pointed to by `zSig0Ptr', and the
1061 | least significant 64 bits of the normalized significand are stored at the
1062 | location pointed to by `zSig1Ptr'.
1063 *----------------------------------------------------------------------------*/
1064
1065 static void
1066 normalizeFloat128Subnormal(
1067 uint64_t aSig0,
1068 uint64_t aSig1,
1069 int32_t *zExpPtr,
1070 uint64_t *zSig0Ptr,
1071 uint64_t *zSig1Ptr
1072 )
1073 {
1074 int8_t shiftCount;
1075
1076 if ( aSig0 == 0 ) {
1077 shiftCount = countLeadingZeros64( aSig1 ) - 15;
1078 if ( shiftCount < 0 ) {
1079 *zSig0Ptr = aSig1>>( - shiftCount );
1080 *zSig1Ptr = aSig1<<( shiftCount & 63 );
1081 }
1082 else {
1083 *zSig0Ptr = aSig1<<shiftCount;
1084 *zSig1Ptr = 0;
1085 }
1086 *zExpPtr = - shiftCount - 63;
1087 }
1088 else {
1089 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1090 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1091 *zExpPtr = 1 - shiftCount;
1092 }
1093
1094 }
1095
1096 /*----------------------------------------------------------------------------
1097 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1098 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1099 | floating-point value, returning the result. After being shifted into the
1100 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1101 | added together to form the most significant 32 bits of the result. This
1102 | means that any integer portion of `zSig0' will be added into the exponent.
1103 | Since a properly normalized significand will have an integer portion equal
1104 | to 1, the `zExp' input should be 1 less than the desired result exponent
1105 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1106 | significand.
1107 *----------------------------------------------------------------------------*/
1108
1109 static inline float128
1110 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1111 {
1112 float128 z;
1113
1114 z.low = zSig1;
1115 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1116 return z;
1117
1118 }
1119
1120 /*----------------------------------------------------------------------------
1121 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1122 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1123 | and `zSig2', and returns the proper quadruple-precision floating-point value
1124 | corresponding to the abstract input. Ordinarily, the abstract value is
1125 | simply rounded and packed into the quadruple-precision format, with the
1126 | inexact exception raised if the abstract input cannot be represented
1127 | exactly. However, if the abstract value is too large, the overflow and
1128 | inexact exceptions are raised and an infinity or maximal finite value is
1129 | returned. If the abstract value is too small, the input value is rounded to
1130 | a subnormal number, and the underflow and inexact exceptions are raised if
1131 | the abstract input cannot be represented exactly as a subnormal quadruple-
1132 | precision floating-point number.
1133 | The input significand must be normalized or smaller. If the input
1134 | significand is not normalized, `zExp' must be 0; in that case, the result
1135 | returned is a subnormal number, and it must not require rounding. In the
1136 | usual case that the input significand is normalized, `zExp' must be 1 less
1137 | than the ``true'' floating-point exponent. The handling of underflow and
1138 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140
1141 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1142 uint64_t zSig0, uint64_t zSig1,
1143 uint64_t zSig2, float_status *status)
1144 {
1145 int8_t roundingMode;
1146 flag roundNearestEven, increment, isTiny;
1147
1148 roundingMode = status->float_rounding_mode;
1149 roundNearestEven = ( roundingMode == float_round_nearest_even );
1150 switch (roundingMode) {
1151 case float_round_nearest_even:
1152 case float_round_ties_away:
1153 increment = ((int64_t)zSig2 < 0);
1154 break;
1155 case float_round_to_zero:
1156 increment = 0;
1157 break;
1158 case float_round_up:
1159 increment = !zSign && zSig2;
1160 break;
1161 case float_round_down:
1162 increment = zSign && zSig2;
1163 break;
1164 case float_round_to_odd:
1165 increment = !(zSig1 & 0x1) && zSig2;
1166 break;
1167 default:
1168 abort();
1169 }
1170 if ( 0x7FFD <= (uint32_t) zExp ) {
1171 if ( ( 0x7FFD < zExp )
1172 || ( ( zExp == 0x7FFD )
1173 && eq128(
1174 LIT64( 0x0001FFFFFFFFFFFF ),
1175 LIT64( 0xFFFFFFFFFFFFFFFF ),
1176 zSig0,
1177 zSig1
1178 )
1179 && increment
1180 )
1181 ) {
1182 float_raise(float_flag_overflow | float_flag_inexact, status);
1183 if ( ( roundingMode == float_round_to_zero )
1184 || ( zSign && ( roundingMode == float_round_up ) )
1185 || ( ! zSign && ( roundingMode == float_round_down ) )
1186 || (roundingMode == float_round_to_odd)
1187 ) {
1188 return
1189 packFloat128(
1190 zSign,
1191 0x7FFE,
1192 LIT64( 0x0000FFFFFFFFFFFF ),
1193 LIT64( 0xFFFFFFFFFFFFFFFF )
1194 );
1195 }
1196 return packFloat128( zSign, 0x7FFF, 0, 0 );
1197 }
1198 if ( zExp < 0 ) {
1199 if (status->flush_to_zero) {
1200 float_raise(float_flag_output_denormal, status);
1201 return packFloat128(zSign, 0, 0, 0);
1202 }
1203 isTiny =
1204 (status->float_detect_tininess
1205 == float_tininess_before_rounding)
1206 || ( zExp < -1 )
1207 || ! increment
1208 || lt128(
1209 zSig0,
1210 zSig1,
1211 LIT64( 0x0001FFFFFFFFFFFF ),
1212 LIT64( 0xFFFFFFFFFFFFFFFF )
1213 );
1214 shift128ExtraRightJamming(
1215 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1216 zExp = 0;
1217 if (isTiny && zSig2) {
1218 float_raise(float_flag_underflow, status);
1219 }
1220 switch (roundingMode) {
1221 case float_round_nearest_even:
1222 case float_round_ties_away:
1223 increment = ((int64_t)zSig2 < 0);
1224 break;
1225 case float_round_to_zero:
1226 increment = 0;
1227 break;
1228 case float_round_up:
1229 increment = !zSign && zSig2;
1230 break;
1231 case float_round_down:
1232 increment = zSign && zSig2;
1233 break;
1234 case float_round_to_odd:
1235 increment = !(zSig1 & 0x1) && zSig2;
1236 break;
1237 default:
1238 abort();
1239 }
1240 }
1241 }
1242 if (zSig2) {
1243 status->float_exception_flags |= float_flag_inexact;
1244 }
1245 if ( increment ) {
1246 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1247 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1248 }
1249 else {
1250 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1251 }
1252 return packFloat128( zSign, zExp, zSig0, zSig1 );
1253
1254 }
1255
1256 /*----------------------------------------------------------------------------
1257 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1258 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1259 | returns the proper quadruple-precision floating-point value corresponding
1260 | to the abstract input. This routine is just like `roundAndPackFloat128'
1261 | except that the input significand has fewer bits and does not have to be
1262 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1263 | point exponent.
1264 *----------------------------------------------------------------------------*/
1265
1266 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1267 uint64_t zSig0, uint64_t zSig1,
1268 float_status *status)
1269 {
1270 int8_t shiftCount;
1271 uint64_t zSig2;
1272
1273 if ( zSig0 == 0 ) {
1274 zSig0 = zSig1;
1275 zSig1 = 0;
1276 zExp -= 64;
1277 }
1278 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1279 if ( 0 <= shiftCount ) {
1280 zSig2 = 0;
1281 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1282 }
1283 else {
1284 shift128ExtraRightJamming(
1285 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1286 }
1287 zExp -= shiftCount;
1288 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1289
1290 }
1291
1292 /*----------------------------------------------------------------------------
1293 | Returns the result of converting the 32-bit two's complement integer `a'
1294 | to the single-precision floating-point format. The conversion is performed
1295 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 *----------------------------------------------------------------------------*/
1297
1298 float32 int32_to_float32(int32_t a, float_status *status)
1299 {
1300 flag zSign;
1301
1302 if ( a == 0 ) return float32_zero;
1303 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1304 zSign = ( a < 0 );
1305 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1306 }
1307
1308 /*----------------------------------------------------------------------------
1309 | Returns the result of converting the 32-bit two's complement integer `a'
1310 | to the double-precision floating-point format. The conversion is performed
1311 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1312 *----------------------------------------------------------------------------*/
1313
1314 float64 int32_to_float64(int32_t a, float_status *status)
1315 {
1316 flag zSign;
1317 uint32_t absA;
1318 int8_t shiftCount;
1319 uint64_t zSig;
1320
1321 if ( a == 0 ) return float64_zero;
1322 zSign = ( a < 0 );
1323 absA = zSign ? - a : a;
1324 shiftCount = countLeadingZeros32( absA ) + 21;
1325 zSig = absA;
1326 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1327
1328 }
1329
1330 /*----------------------------------------------------------------------------
1331 | Returns the result of converting the 32-bit two's complement integer `a'
1332 | to the extended double-precision floating-point format. The conversion
1333 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1334 | Arithmetic.
1335 *----------------------------------------------------------------------------*/
1336
1337 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1338 {
1339 flag zSign;
1340 uint32_t absA;
1341 int8_t shiftCount;
1342 uint64_t zSig;
1343
1344 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1345 zSign = ( a < 0 );
1346 absA = zSign ? - a : a;
1347 shiftCount = countLeadingZeros32( absA ) + 32;
1348 zSig = absA;
1349 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1350
1351 }
1352
1353 /*----------------------------------------------------------------------------
1354 | Returns the result of converting the 32-bit two's complement integer `a' to
1355 | the quadruple-precision floating-point format. The conversion is performed
1356 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1357 *----------------------------------------------------------------------------*/
1358
1359 float128 int32_to_float128(int32_t a, float_status *status)
1360 {
1361 flag zSign;
1362 uint32_t absA;
1363 int8_t shiftCount;
1364 uint64_t zSig0;
1365
1366 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1367 zSign = ( a < 0 );
1368 absA = zSign ? - a : a;
1369 shiftCount = countLeadingZeros32( absA ) + 17;
1370 zSig0 = absA;
1371 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1372
1373 }
1374
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the 64-bit two's complement integer `a'
1377 | to the single-precision floating-point format. The conversion is performed
1378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1379 *----------------------------------------------------------------------------*/
1380
1381 float32 int64_to_float32(int64_t a, float_status *status)
1382 {
1383 flag zSign;
1384 uint64_t absA;
1385 int8_t shiftCount;
1386
1387 if ( a == 0 ) return float32_zero;
1388 zSign = ( a < 0 );
1389 absA = zSign ? - a : a;
1390 shiftCount = countLeadingZeros64( absA ) - 40;
1391 if ( 0 <= shiftCount ) {
1392 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1393 }
1394 else {
1395 shiftCount += 7;
1396 if ( shiftCount < 0 ) {
1397 shift64RightJamming( absA, - shiftCount, &absA );
1398 }
1399 else {
1400 absA <<= shiftCount;
1401 }
1402 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1403 }
1404
1405 }
1406
1407 /*----------------------------------------------------------------------------
1408 | Returns the result of converting the 64-bit two's complement integer `a'
1409 | to the double-precision floating-point format. The conversion is performed
1410 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1411 *----------------------------------------------------------------------------*/
1412
1413 float64 int64_to_float64(int64_t a, float_status *status)
1414 {
1415 flag zSign;
1416
1417 if ( a == 0 ) return float64_zero;
1418 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1419 return packFloat64( 1, 0x43E, 0 );
1420 }
1421 zSign = ( a < 0 );
1422 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1423 }
1424
1425 /*----------------------------------------------------------------------------
1426 | Returns the result of converting the 64-bit two's complement integer `a'
1427 | to the extended double-precision floating-point format. The conversion
1428 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1429 | Arithmetic.
1430 *----------------------------------------------------------------------------*/
1431
1432 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1433 {
1434 flag zSign;
1435 uint64_t absA;
1436 int8_t shiftCount;
1437
1438 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1439 zSign = ( a < 0 );
1440 absA = zSign ? - a : a;
1441 shiftCount = countLeadingZeros64( absA );
1442 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1443
1444 }
1445
1446 /*----------------------------------------------------------------------------
1447 | Returns the result of converting the 64-bit two's complement integer `a' to
1448 | the quadruple-precision floating-point format. The conversion is performed
1449 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1450 *----------------------------------------------------------------------------*/
1451
1452 float128 int64_to_float128(int64_t a, float_status *status)
1453 {
1454 flag zSign;
1455 uint64_t absA;
1456 int8_t shiftCount;
1457 int32_t zExp;
1458 uint64_t zSig0, zSig1;
1459
1460 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1461 zSign = ( a < 0 );
1462 absA = zSign ? - a : a;
1463 shiftCount = countLeadingZeros64( absA ) + 49;
1464 zExp = 0x406E - shiftCount;
1465 if ( 64 <= shiftCount ) {
1466 zSig1 = 0;
1467 zSig0 = absA;
1468 shiftCount -= 64;
1469 }
1470 else {
1471 zSig1 = absA;
1472 zSig0 = 0;
1473 }
1474 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1475 return packFloat128( zSign, zExp, zSig0, zSig1 );
1476
1477 }
1478
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of converting the 64-bit unsigned integer `a'
1481 | to the single-precision floating-point format. The conversion is performed
1482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1483 *----------------------------------------------------------------------------*/
1484
1485 float32 uint64_to_float32(uint64_t a, float_status *status)
1486 {
1487 int shiftcount;
1488
1489 if (a == 0) {
1490 return float32_zero;
1491 }
1492
1493 /* Determine (left) shift needed to put first set bit into bit posn 23
1494 * (since packFloat32() expects the binary point between bits 23 and 22);
1495 * this is the fast case for smallish numbers.
1496 */
1497 shiftcount = countLeadingZeros64(a) - 40;
1498 if (shiftcount >= 0) {
1499 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1500 }
1501 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1502 * expects the binary point between bits 30 and 29, hence the + 7.
1503 */
1504 shiftcount += 7;
1505 if (shiftcount < 0) {
1506 shift64RightJamming(a, -shiftcount, &a);
1507 } else {
1508 a <<= shiftcount;
1509 }
1510
1511 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1512 }
1513
1514 /*----------------------------------------------------------------------------
1515 | Returns the result of converting the 64-bit unsigned integer `a'
1516 | to the double-precision floating-point format. The conversion is performed
1517 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518 *----------------------------------------------------------------------------*/
1519
1520 float64 uint64_to_float64(uint64_t a, float_status *status)
1521 {
1522 int exp = 0x43C;
1523 int shiftcount;
1524
1525 if (a == 0) {
1526 return float64_zero;
1527 }
1528
1529 shiftcount = countLeadingZeros64(a) - 1;
1530 if (shiftcount < 0) {
1531 shift64RightJamming(a, -shiftcount, &a);
1532 } else {
1533 a <<= shiftcount;
1534 }
1535 return roundAndPackFloat64(0, exp - shiftcount, a, status);
1536 }
1537
1538 /*----------------------------------------------------------------------------
1539 | Returns the result of converting the 64-bit unsigned integer `a'
1540 | to the quadruple-precision floating-point format. The conversion is performed
1541 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1542 *----------------------------------------------------------------------------*/
1543
1544 float128 uint64_to_float128(uint64_t a, float_status *status)
1545 {
1546 if (a == 0) {
1547 return float128_zero;
1548 }
1549 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1550 }
1551
1552 /*----------------------------------------------------------------------------
1553 | Returns the result of converting the single-precision floating-point value
1554 | `a' to the 32-bit two's complement integer format. The conversion is
1555 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1556 | Arithmetic---which means in particular that the conversion is rounded
1557 | according to the current rounding mode. If `a' is a NaN, the largest
1558 | positive integer is returned. Otherwise, if the conversion overflows, the
1559 | largest integer with the same sign as `a' is returned.
1560 *----------------------------------------------------------------------------*/
1561
1562 int32_t float32_to_int32(float32 a, float_status *status)
1563 {
1564 flag aSign;
1565 int aExp;
1566 int shiftCount;
1567 uint32_t aSig;
1568 uint64_t aSig64;
1569
1570 a = float32_squash_input_denormal(a, status);
1571 aSig = extractFloat32Frac( a );
1572 aExp = extractFloat32Exp( a );
1573 aSign = extractFloat32Sign( a );
1574 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1575 if ( aExp ) aSig |= 0x00800000;
1576 shiftCount = 0xAF - aExp;
1577 aSig64 = aSig;
1578 aSig64 <<= 32;
1579 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1580 return roundAndPackInt32(aSign, aSig64, status);
1581
1582 }
1583
1584 /*----------------------------------------------------------------------------
1585 | Returns the result of converting the single-precision floating-point value
1586 | `a' to the 32-bit two's complement integer format. The conversion is
1587 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1588 | Arithmetic, except that the conversion is always rounded toward zero.
1589 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1590 | the conversion overflows, the largest integer with the same sign as `a' is
1591 | returned.
1592 *----------------------------------------------------------------------------*/
1593
1594 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1595 {
1596 flag aSign;
1597 int aExp;
1598 int shiftCount;
1599 uint32_t aSig;
1600 int32_t z;
1601 a = float32_squash_input_denormal(a, status);
1602
1603 aSig = extractFloat32Frac( a );
1604 aExp = extractFloat32Exp( a );
1605 aSign = extractFloat32Sign( a );
1606 shiftCount = aExp - 0x9E;
1607 if ( 0 <= shiftCount ) {
1608 if ( float32_val(a) != 0xCF000000 ) {
1609 float_raise(float_flag_invalid, status);
1610 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1611 }
1612 return (int32_t) 0x80000000;
1613 }
1614 else if ( aExp <= 0x7E ) {
1615 if (aExp | aSig) {
1616 status->float_exception_flags |= float_flag_inexact;
1617 }
1618 return 0;
1619 }
1620 aSig = ( aSig | 0x00800000 )<<8;
1621 z = aSig>>( - shiftCount );
1622 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1623 status->float_exception_flags |= float_flag_inexact;
1624 }
1625 if ( aSign ) z = - z;
1626 return z;
1627
1628 }
1629
1630 /*----------------------------------------------------------------------------
1631 | Returns the result of converting the single-precision floating-point value
1632 | `a' to the 16-bit two's complement integer format. The conversion is
1633 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1634 | Arithmetic, except that the conversion is always rounded toward zero.
1635 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1636 | the conversion overflows, the largest integer with the same sign as `a' is
1637 | returned.
1638 *----------------------------------------------------------------------------*/
1639
1640 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1641 {
1642 flag aSign;
1643 int aExp;
1644 int shiftCount;
1645 uint32_t aSig;
1646 int32_t z;
1647
1648 aSig = extractFloat32Frac( a );
1649 aExp = extractFloat32Exp( a );
1650 aSign = extractFloat32Sign( a );
1651 shiftCount = aExp - 0x8E;
1652 if ( 0 <= shiftCount ) {
1653 if ( float32_val(a) != 0xC7000000 ) {
1654 float_raise(float_flag_invalid, status);
1655 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1656 return 0x7FFF;
1657 }
1658 }
1659 return (int32_t) 0xffff8000;
1660 }
1661 else if ( aExp <= 0x7E ) {
1662 if ( aExp | aSig ) {
1663 status->float_exception_flags |= float_flag_inexact;
1664 }
1665 return 0;
1666 }
1667 shiftCount -= 0x10;
1668 aSig = ( aSig | 0x00800000 )<<8;
1669 z = aSig>>( - shiftCount );
1670 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1671 status->float_exception_flags |= float_flag_inexact;
1672 }
1673 if ( aSign ) {
1674 z = - z;
1675 }
1676 return z;
1677
1678 }
1679
1680 /*----------------------------------------------------------------------------
1681 | Returns the result of converting the single-precision floating-point value
1682 | `a' to the 64-bit two's complement integer format. The conversion is
1683 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1684 | Arithmetic---which means in particular that the conversion is rounded
1685 | according to the current rounding mode. If `a' is a NaN, the largest
1686 | positive integer is returned. Otherwise, if the conversion overflows, the
1687 | largest integer with the same sign as `a' is returned.
1688 *----------------------------------------------------------------------------*/
1689
1690 int64_t float32_to_int64(float32 a, float_status *status)
1691 {
1692 flag aSign;
1693 int aExp;
1694 int shiftCount;
1695 uint32_t aSig;
1696 uint64_t aSig64, aSigExtra;
1697 a = float32_squash_input_denormal(a, status);
1698
1699 aSig = extractFloat32Frac( a );
1700 aExp = extractFloat32Exp( a );
1701 aSign = extractFloat32Sign( a );
1702 shiftCount = 0xBE - aExp;
1703 if ( shiftCount < 0 ) {
1704 float_raise(float_flag_invalid, status);
1705 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1706 return LIT64( 0x7FFFFFFFFFFFFFFF );
1707 }
1708 return (int64_t) LIT64( 0x8000000000000000 );
1709 }
1710 if ( aExp ) aSig |= 0x00800000;
1711 aSig64 = aSig;
1712 aSig64 <<= 40;
1713 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1714 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1715
1716 }
1717
1718 /*----------------------------------------------------------------------------
1719 | Returns the result of converting the single-precision floating-point value
1720 | `a' to the 64-bit unsigned integer format. The conversion is
1721 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1722 | Arithmetic---which means in particular that the conversion is rounded
1723 | according to the current rounding mode. If `a' is a NaN, the largest
1724 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1725 | largest unsigned integer is returned. If the 'a' is negative, the result
1726 | is rounded and zero is returned; values that do not round to zero will
1727 | raise the inexact exception flag.
1728 *----------------------------------------------------------------------------*/
1729
1730 uint64_t float32_to_uint64(float32 a, float_status *status)
1731 {
1732 flag aSign;
1733 int aExp;
1734 int shiftCount;
1735 uint32_t aSig;
1736 uint64_t aSig64, aSigExtra;
1737 a = float32_squash_input_denormal(a, status);
1738
1739 aSig = extractFloat32Frac(a);
1740 aExp = extractFloat32Exp(a);
1741 aSign = extractFloat32Sign(a);
1742 if ((aSign) && (aExp > 126)) {
1743 float_raise(float_flag_invalid, status);
1744 if (float32_is_any_nan(a)) {
1745 return LIT64(0xFFFFFFFFFFFFFFFF);
1746 } else {
1747 return 0;
1748 }
1749 }
1750 shiftCount = 0xBE - aExp;
1751 if (aExp) {
1752 aSig |= 0x00800000;
1753 }
1754 if (shiftCount < 0) {
1755 float_raise(float_flag_invalid, status);
1756 return LIT64(0xFFFFFFFFFFFFFFFF);
1757 }
1758
1759 aSig64 = aSig;
1760 aSig64 <<= 40;
1761 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1762 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1763 }
1764
1765 /*----------------------------------------------------------------------------
1766 | Returns the result of converting the single-precision floating-point value
1767 | `a' to the 64-bit unsigned integer format. The conversion is
1768 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1769 | Arithmetic, except that the conversion is always rounded toward zero. If
1770 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1771 | conversion overflows, the largest unsigned integer is returned. If the
1772 | 'a' is negative, the result is rounded and zero is returned; values that do
1773 | not round to zero will raise the inexact flag.
1774 *----------------------------------------------------------------------------*/
1775
1776 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1777 {
1778 signed char current_rounding_mode = status->float_rounding_mode;
1779 set_float_rounding_mode(float_round_to_zero, status);
1780 int64_t v = float32_to_uint64(a, status);
1781 set_float_rounding_mode(current_rounding_mode, status);
1782 return v;
1783 }
1784
1785 /*----------------------------------------------------------------------------
1786 | Returns the result of converting the single-precision floating-point value
1787 | `a' to the 64-bit two's complement integer format. The conversion is
1788 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1789 | Arithmetic, except that the conversion is always rounded toward zero. If
1790 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1791 | conversion overflows, the largest integer with the same sign as `a' is
1792 | returned.
1793 *----------------------------------------------------------------------------*/
1794
1795 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1796 {
1797 flag aSign;
1798 int aExp;
1799 int shiftCount;
1800 uint32_t aSig;
1801 uint64_t aSig64;
1802 int64_t z;
1803 a = float32_squash_input_denormal(a, status);
1804
1805 aSig = extractFloat32Frac( a );
1806 aExp = extractFloat32Exp( a );
1807 aSign = extractFloat32Sign( a );
1808 shiftCount = aExp - 0xBE;
1809 if ( 0 <= shiftCount ) {
1810 if ( float32_val(a) != 0xDF000000 ) {
1811 float_raise(float_flag_invalid, status);
1812 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1813 return LIT64( 0x7FFFFFFFFFFFFFFF );
1814 }
1815 }
1816 return (int64_t) LIT64( 0x8000000000000000 );
1817 }
1818 else if ( aExp <= 0x7E ) {
1819 if (aExp | aSig) {
1820 status->float_exception_flags |= float_flag_inexact;
1821 }
1822 return 0;
1823 }
1824 aSig64 = aSig | 0x00800000;
1825 aSig64 <<= 40;
1826 z = aSig64>>( - shiftCount );
1827 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1828 status->float_exception_flags |= float_flag_inexact;
1829 }
1830 if ( aSign ) z = - z;
1831 return z;
1832
1833 }
1834
1835 /*----------------------------------------------------------------------------
1836 | Returns the result of converting the single-precision floating-point value
1837 | `a' to the double-precision floating-point format. The conversion is
1838 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1839 | Arithmetic.
1840 *----------------------------------------------------------------------------*/
1841
1842 float64 float32_to_float64(float32 a, float_status *status)
1843 {
1844 flag aSign;
1845 int aExp;
1846 uint32_t aSig;
1847 a = float32_squash_input_denormal(a, status);
1848
1849 aSig = extractFloat32Frac( a );
1850 aExp = extractFloat32Exp( a );
1851 aSign = extractFloat32Sign( a );
1852 if ( aExp == 0xFF ) {
1853 if (aSig) {
1854 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1855 }
1856 return packFloat64( aSign, 0x7FF, 0 );
1857 }
1858 if ( aExp == 0 ) {
1859 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1860 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1861 --aExp;
1862 }
1863 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1864
1865 }
1866
1867 /*----------------------------------------------------------------------------
1868 | Returns the result of converting the single-precision floating-point value
1869 | `a' to the extended double-precision floating-point format. The conversion
1870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1871 | Arithmetic.
1872 *----------------------------------------------------------------------------*/
1873
1874 floatx80 float32_to_floatx80(float32 a, float_status *status)
1875 {
1876 flag aSign;
1877 int aExp;
1878 uint32_t aSig;
1879
1880 a = float32_squash_input_denormal(a, status);
1881 aSig = extractFloat32Frac( a );
1882 aExp = extractFloat32Exp( a );
1883 aSign = extractFloat32Sign( a );
1884 if ( aExp == 0xFF ) {
1885 if (aSig) {
1886 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1887 }
1888 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1889 }
1890 if ( aExp == 0 ) {
1891 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1892 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1893 }
1894 aSig |= 0x00800000;
1895 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1896
1897 }
1898
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of converting the single-precision floating-point value
1901 | `a' to the double-precision floating-point format. The conversion is
1902 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1903 | Arithmetic.
1904 *----------------------------------------------------------------------------*/
1905
1906 float128 float32_to_float128(float32 a, float_status *status)
1907 {
1908 flag aSign;
1909 int aExp;
1910 uint32_t aSig;
1911
1912 a = float32_squash_input_denormal(a, status);
1913 aSig = extractFloat32Frac( a );
1914 aExp = extractFloat32Exp( a );
1915 aSign = extractFloat32Sign( a );
1916 if ( aExp == 0xFF ) {
1917 if (aSig) {
1918 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1919 }
1920 return packFloat128( aSign, 0x7FFF, 0, 0 );
1921 }
1922 if ( aExp == 0 ) {
1923 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1924 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1925 --aExp;
1926 }
1927 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1928
1929 }
1930
1931 /*----------------------------------------------------------------------------
1932 | Rounds the single-precision floating-point value `a' to an integer, and
1933 | returns the result as a single-precision floating-point value. The
1934 | operation is performed according to the IEC/IEEE Standard for Binary
1935 | Floating-Point Arithmetic.
1936 *----------------------------------------------------------------------------*/
1937
1938 float32 float32_round_to_int(float32 a, float_status *status)
1939 {
1940 flag aSign;
1941 int aExp;
1942 uint32_t lastBitMask, roundBitsMask;
1943 uint32_t z;
1944 a = float32_squash_input_denormal(a, status);
1945
1946 aExp = extractFloat32Exp( a );
1947 if ( 0x96 <= aExp ) {
1948 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1949 return propagateFloat32NaN(a, a, status);
1950 }
1951 return a;
1952 }
1953 if ( aExp <= 0x7E ) {
1954 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1955 status->float_exception_flags |= float_flag_inexact;
1956 aSign = extractFloat32Sign( a );
1957 switch (status->float_rounding_mode) {
1958 case float_round_nearest_even:
1959 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1960 return packFloat32( aSign, 0x7F, 0 );
1961 }
1962 break;
1963 case float_round_ties_away:
1964 if (aExp == 0x7E) {
1965 return packFloat32(aSign, 0x7F, 0);
1966 }
1967 break;
1968 case float_round_down:
1969 return make_float32(aSign ? 0xBF800000 : 0);
1970 case float_round_up:
1971 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1972 }
1973 return packFloat32( aSign, 0, 0 );
1974 }
1975 lastBitMask = 1;
1976 lastBitMask <<= 0x96 - aExp;
1977 roundBitsMask = lastBitMask - 1;
1978 z = float32_val(a);
1979 switch (status->float_rounding_mode) {
1980 case float_round_nearest_even:
1981 z += lastBitMask>>1;
1982 if ((z & roundBitsMask) == 0) {
1983 z &= ~lastBitMask;
1984 }
1985 break;
1986 case float_round_ties_away:
1987 z += lastBitMask >> 1;
1988 break;
1989 case float_round_to_zero:
1990 break;
1991 case float_round_up:
1992 if (!extractFloat32Sign(make_float32(z))) {
1993 z += roundBitsMask;
1994 }
1995 break;
1996 case float_round_down:
1997 if (extractFloat32Sign(make_float32(z))) {
1998 z += roundBitsMask;
1999 }
2000 break;
2001 default:
2002 abort();
2003 }
2004 z &= ~ roundBitsMask;
2005 if (z != float32_val(a)) {
2006 status->float_exception_flags |= float_flag_inexact;
2007 }
2008 return make_float32(z);
2009
2010 }
2011
2012 /*----------------------------------------------------------------------------
2013 | Returns the result of adding the absolute values of the single-precision
2014 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2015 | before being returned. `zSign' is ignored if the result is a NaN.
2016 | The addition is performed according to the IEC/IEEE Standard for Binary
2017 | Floating-Point Arithmetic.
2018 *----------------------------------------------------------------------------*/
2019
2020 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2021 float_status *status)
2022 {
2023 int aExp, bExp, zExp;
2024 uint32_t aSig, bSig, zSig;
2025 int expDiff;
2026
2027 aSig = extractFloat32Frac( a );
2028 aExp = extractFloat32Exp( a );
2029 bSig = extractFloat32Frac( b );
2030 bExp = extractFloat32Exp( b );
2031 expDiff = aExp - bExp;
2032 aSig <<= 6;
2033 bSig <<= 6;
2034 if ( 0 < expDiff ) {
2035 if ( aExp == 0xFF ) {
2036 if (aSig) {
2037 return propagateFloat32NaN(a, b, status);
2038 }
2039 return a;
2040 }
2041 if ( bExp == 0 ) {
2042 --expDiff;
2043 }
2044 else {
2045 bSig |= 0x20000000;
2046 }
2047 shift32RightJamming( bSig, expDiff, &bSig );
2048 zExp = aExp;
2049 }
2050 else if ( expDiff < 0 ) {
2051 if ( bExp == 0xFF ) {
2052 if (bSig) {
2053 return propagateFloat32NaN(a, b, status);
2054 }
2055 return packFloat32( zSign, 0xFF, 0 );
2056 }
2057 if ( aExp == 0 ) {
2058 ++expDiff;
2059 }
2060 else {
2061 aSig |= 0x20000000;
2062 }
2063 shift32RightJamming( aSig, - expDiff, &aSig );
2064 zExp = bExp;
2065 }
2066 else {
2067 if ( aExp == 0xFF ) {
2068 if (aSig | bSig) {
2069 return propagateFloat32NaN(a, b, status);
2070 }
2071 return a;
2072 }
2073 if ( aExp == 0 ) {
2074 if (status->flush_to_zero) {
2075 if (aSig | bSig) {
2076 float_raise(float_flag_output_denormal, status);
2077 }
2078 return packFloat32(zSign, 0, 0);
2079 }
2080 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2081 }
2082 zSig = 0x40000000 + aSig + bSig;
2083 zExp = aExp;
2084 goto roundAndPack;
2085 }
2086 aSig |= 0x20000000;
2087 zSig = ( aSig + bSig )<<1;
2088 --zExp;
2089 if ( (int32_t) zSig < 0 ) {
2090 zSig = aSig + bSig;
2091 ++zExp;
2092 }
2093 roundAndPack:
2094 return roundAndPackFloat32(zSign, zExp, zSig, status);
2095
2096 }
2097
2098 /*----------------------------------------------------------------------------
2099 | Returns the result of subtracting the absolute values of the single-
2100 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2101 | difference is negated before being returned. `zSign' is ignored if the
2102 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2103 | Standard for Binary Floating-Point Arithmetic.
2104 *----------------------------------------------------------------------------*/
2105
2106 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2107 float_status *status)
2108 {
2109 int aExp, bExp, zExp;
2110 uint32_t aSig, bSig, zSig;
2111 int expDiff;
2112
2113 aSig = extractFloat32Frac( a );
2114 aExp = extractFloat32Exp( a );
2115 bSig = extractFloat32Frac( b );
2116 bExp = extractFloat32Exp( b );
2117 expDiff = aExp - bExp;
2118 aSig <<= 7;
2119 bSig <<= 7;
2120 if ( 0 < expDiff ) goto aExpBigger;
2121 if ( expDiff < 0 ) goto bExpBigger;
2122 if ( aExp == 0xFF ) {
2123 if (aSig | bSig) {
2124 return propagateFloat32NaN(a, b, status);
2125 }
2126 float_raise(float_flag_invalid, status);
2127 return float32_default_nan(status);
2128 }
2129 if ( aExp == 0 ) {
2130 aExp = 1;
2131 bExp = 1;
2132 }
2133 if ( bSig < aSig ) goto aBigger;
2134 if ( aSig < bSig ) goto bBigger;
2135 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2136 bExpBigger:
2137 if ( bExp == 0xFF ) {
2138 if (bSig) {
2139 return propagateFloat32NaN(a, b, status);
2140 }
2141 return packFloat32( zSign ^ 1, 0xFF, 0 );
2142 }
2143 if ( aExp == 0 ) {
2144 ++expDiff;
2145 }
2146 else {
2147 aSig |= 0x40000000;
2148 }
2149 shift32RightJamming( aSig, - expDiff, &aSig );
2150 bSig |= 0x40000000;
2151 bBigger:
2152 zSig = bSig - aSig;
2153 zExp = bExp;
2154 zSign ^= 1;
2155 goto normalizeRoundAndPack;
2156 aExpBigger:
2157 if ( aExp == 0xFF ) {
2158 if (aSig) {
2159 return propagateFloat32NaN(a, b, status);
2160 }
2161 return a;
2162 }
2163 if ( bExp == 0 ) {
2164 --expDiff;
2165 }
2166 else {
2167 bSig |= 0x40000000;
2168 }
2169 shift32RightJamming( bSig, expDiff, &bSig );
2170 aSig |= 0x40000000;
2171 aBigger:
2172 zSig = aSig - bSig;
2173 zExp = aExp;
2174 normalizeRoundAndPack:
2175 --zExp;
2176 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2177
2178 }
2179
2180 /*----------------------------------------------------------------------------
2181 | Returns the result of adding the single-precision floating-point values `a'
2182 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2183 | Binary Floating-Point Arithmetic.
2184 *----------------------------------------------------------------------------*/
2185
2186 float32 float32_add(float32 a, float32 b, float_status *status)
2187 {
2188 flag aSign, bSign;
2189 a = float32_squash_input_denormal(a, status);
2190 b = float32_squash_input_denormal(b, status);
2191
2192 aSign = extractFloat32Sign( a );
2193 bSign = extractFloat32Sign( b );
2194 if ( aSign == bSign ) {
2195 return addFloat32Sigs(a, b, aSign, status);
2196 }
2197 else {
2198 return subFloat32Sigs(a, b, aSign, status);
2199 }
2200
2201 }
2202
2203 /*----------------------------------------------------------------------------
2204 | Returns the result of subtracting the single-precision floating-point values
2205 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2206 | for Binary Floating-Point Arithmetic.
2207 *----------------------------------------------------------------------------*/
2208
2209 float32 float32_sub(float32 a, float32 b, float_status *status)
2210 {
2211 flag aSign, bSign;
2212 a = float32_squash_input_denormal(a, status);
2213 b = float32_squash_input_denormal(b, status);
2214
2215 aSign = extractFloat32Sign( a );
2216 bSign = extractFloat32Sign( b );
2217 if ( aSign == bSign ) {
2218 return subFloat32Sigs(a, b, aSign, status);
2219 }
2220 else {
2221 return addFloat32Sigs(a, b, aSign, status);
2222 }
2223
2224 }
2225
2226 /*----------------------------------------------------------------------------
2227 | Returns the result of multiplying the single-precision floating-point values
2228 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2229 | for Binary Floating-Point Arithmetic.
2230 *----------------------------------------------------------------------------*/
2231
2232 float32 float32_mul(float32 a, float32 b, float_status *status)
2233 {
2234 flag aSign, bSign, zSign;
2235 int aExp, bExp, zExp;
2236 uint32_t aSig, bSig;
2237 uint64_t zSig64;
2238 uint32_t zSig;
2239
2240 a = float32_squash_input_denormal(a, status);
2241 b = float32_squash_input_denormal(b, status);
2242
2243 aSig = extractFloat32Frac( a );
2244 aExp = extractFloat32Exp( a );
2245 aSign = extractFloat32Sign( a );
2246 bSig = extractFloat32Frac( b );
2247 bExp = extractFloat32Exp( b );
2248 bSign = extractFloat32Sign( b );
2249 zSign = aSign ^ bSign;
2250 if ( aExp == 0xFF ) {
2251 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2252 return propagateFloat32NaN(a, b, status);
2253 }
2254 if ( ( bExp | bSig ) == 0 ) {
2255 float_raise(float_flag_invalid, status);
2256 return float32_default_nan(status);
2257 }
2258 return packFloat32( zSign, 0xFF, 0 );
2259 }
2260 if ( bExp == 0xFF ) {
2261 if (bSig) {
2262 return propagateFloat32NaN(a, b, status);
2263 }
2264 if ( ( aExp | aSig ) == 0 ) {
2265 float_raise(float_flag_invalid, status);
2266 return float32_default_nan(status);
2267 }
2268 return packFloat32( zSign, 0xFF, 0 );
2269 }
2270 if ( aExp == 0 ) {
2271 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2272 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2273 }
2274 if ( bExp == 0 ) {
2275 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2276 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2277 }
2278 zExp = aExp + bExp - 0x7F;
2279 aSig = ( aSig | 0x00800000 )<<7;
2280 bSig = ( bSig | 0x00800000 )<<8;
2281 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2282 zSig = zSig64;
2283 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2284 zSig <<= 1;
2285 --zExp;
2286 }
2287 return roundAndPackFloat32(zSign, zExp, zSig, status);
2288
2289 }
2290
2291 /*----------------------------------------------------------------------------
2292 | Returns the result of dividing the single-precision floating-point value `a'
2293 | by the corresponding value `b'. The operation is performed according to the
2294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2295 *----------------------------------------------------------------------------*/
2296
2297 float32 float32_div(float32 a, float32 b, float_status *status)
2298 {
2299 flag aSign, bSign, zSign;
2300 int aExp, bExp, zExp;
2301 uint32_t aSig, bSig, zSig;
2302 a = float32_squash_input_denormal(a, status);
2303 b = float32_squash_input_denormal(b, status);
2304
2305 aSig = extractFloat32Frac( a );
2306 aExp = extractFloat32Exp( a );
2307 aSign = extractFloat32Sign( a );
2308 bSig = extractFloat32Frac( b );
2309 bExp = extractFloat32Exp( b );
2310 bSign = extractFloat32Sign( b );
2311 zSign = aSign ^ bSign;
2312 if ( aExp == 0xFF ) {
2313 if (aSig) {
2314 return propagateFloat32NaN(a, b, status);
2315 }
2316 if ( bExp == 0xFF ) {
2317 if (bSig) {
2318 return propagateFloat32NaN(a, b, status);
2319 }
2320 float_raise(float_flag_invalid, status);
2321 return float32_default_nan(status);
2322 }
2323 return packFloat32( zSign, 0xFF, 0 );
2324 }
2325 if ( bExp == 0xFF ) {
2326 if (bSig) {
2327 return propagateFloat32NaN(a, b, status);
2328 }
2329 return packFloat32( zSign, 0, 0 );
2330 }
2331 if ( bExp == 0 ) {
2332 if ( bSig == 0 ) {
2333 if ( ( aExp | aSig ) == 0 ) {
2334 float_raise(float_flag_invalid, status);
2335 return float32_default_nan(status);
2336 }
2337 float_raise(float_flag_divbyzero, status);
2338 return packFloat32( zSign, 0xFF, 0 );
2339 }
2340 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2341 }
2342 if ( aExp == 0 ) {
2343 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2344 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2345 }
2346 zExp = aExp - bExp + 0x7D;
2347 aSig = ( aSig | 0x00800000 )<<7;
2348 bSig = ( bSig | 0x00800000 )<<8;
2349 if ( bSig <= ( aSig + aSig ) ) {
2350 aSig >>= 1;
2351 ++zExp;
2352 }
2353 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2354 if ( ( zSig & 0x3F ) == 0 ) {
2355 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2356 }
2357 return roundAndPackFloat32(zSign, zExp, zSig, status);
2358
2359 }
2360
2361 /*----------------------------------------------------------------------------
2362 | Returns the remainder of the single-precision floating-point value `a'
2363 | with respect to the corresponding value `b'. The operation is performed
2364 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2365 *----------------------------------------------------------------------------*/
2366
2367 float32 float32_rem(float32 a, float32 b, float_status *status)
2368 {
2369 flag aSign, zSign;
2370 int aExp, bExp, expDiff;
2371 uint32_t aSig, bSig;
2372 uint32_t q;
2373 uint64_t aSig64, bSig64, q64;
2374 uint32_t alternateASig;
2375 int32_t sigMean;
2376 a = float32_squash_input_denormal(a, status);
2377 b = float32_squash_input_denormal(b, status);
2378
2379 aSig = extractFloat32Frac( a );
2380 aExp = extractFloat32Exp( a );
2381 aSign = extractFloat32Sign( a );
2382 bSig = extractFloat32Frac( b );
2383 bExp = extractFloat32Exp( b );
2384 if ( aExp == 0xFF ) {
2385 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2386 return propagateFloat32NaN(a, b, status);
2387 }
2388 float_raise(float_flag_invalid, status);
2389 return float32_default_nan(status);
2390 }
2391 if ( bExp == 0xFF ) {
2392 if (bSig) {
2393 return propagateFloat32NaN(a, b, status);
2394 }
2395 return a;
2396 }
2397 if ( bExp == 0 ) {
2398 if ( bSig == 0 ) {
2399 float_raise(float_flag_invalid, status);
2400 return float32_default_nan(status);
2401 }
2402 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2403 }
2404 if ( aExp == 0 ) {
2405 if ( aSig == 0 ) return a;
2406 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2407 }
2408 expDiff = aExp - bExp;
2409 aSig |= 0x00800000;
2410 bSig |= 0x00800000;
2411 if ( expDiff < 32 ) {
2412 aSig <<= 8;
2413 bSig <<= 8;
2414 if ( expDiff < 0 ) {
2415 if ( expDiff < -1 ) return a;
2416 aSig >>= 1;
2417 }
2418 q = ( bSig <= aSig );
2419 if ( q ) aSig -= bSig;
2420 if ( 0 < expDiff ) {
2421 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2422 q >>= 32 - expDiff;
2423 bSig >>= 2;
2424 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2425 }
2426 else {
2427 aSig >>= 2;
2428 bSig >>= 2;
2429 }
2430 }
2431 else {
2432 if ( bSig <= aSig ) aSig -= bSig;
2433 aSig64 = ( (uint64_t) aSig )<<40;
2434 bSig64 = ( (uint64_t) bSig )<<40;
2435 expDiff -= 64;
2436 while ( 0 < expDiff ) {
2437 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2438 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2439 aSig64 = - ( ( bSig * q64 )<<38 );
2440 expDiff -= 62;
2441 }
2442 expDiff += 64;
2443 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2444 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2445 q = q64>>( 64 - expDiff );
2446 bSig <<= 6;
2447 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2448 }
2449 do {
2450 alternateASig = aSig;
2451 ++q;
2452 aSig -= bSig;
2453 } while ( 0 <= (int32_t) aSig );
2454 sigMean = aSig + alternateASig;
2455 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2456 aSig = alternateASig;
2457 }
2458 zSign = ( (int32_t) aSig < 0 );
2459 if ( zSign ) aSig = - aSig;
2460 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2461 }
2462
2463 /*----------------------------------------------------------------------------
2464 | Returns the result of multiplying the single-precision floating-point values
2465 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2466 | multiplication. The operation is performed according to the IEC/IEEE
2467 | Standard for Binary Floating-Point Arithmetic 754-2008.
2468 | The flags argument allows the caller to select negation of the
2469 | addend, the intermediate product, or the final result. (The difference
2470 | between this and having the caller do a separate negation is that negating
2471 | externally will flip the sign bit on NaNs.)
2472 *----------------------------------------------------------------------------*/
2473
2474 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2475 float_status *status)
2476 {
2477 flag aSign, bSign, cSign, zSign;
2478 int aExp, bExp, cExp, pExp, zExp, expDiff;
2479 uint32_t aSig, bSig, cSig;
2480 flag pInf, pZero, pSign;
2481 uint64_t pSig64, cSig64, zSig64;
2482 uint32_t pSig;
2483 int shiftcount;
2484 flag signflip, infzero;
2485
2486 a = float32_squash_input_denormal(a, status);
2487 b = float32_squash_input_denormal(b, status);
2488 c = float32_squash_input_denormal(c, status);
2489 aSig = extractFloat32Frac(a);
2490 aExp = extractFloat32Exp(a);
2491 aSign = extractFloat32Sign(a);
2492 bSig = extractFloat32Frac(b);
2493 bExp = extractFloat32Exp(b);
2494 bSign = extractFloat32Sign(b);
2495 cSig = extractFloat32Frac(c);
2496 cExp = extractFloat32Exp(c);
2497 cSign = extractFloat32Sign(c);
2498
2499 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2500 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2501
2502 /* It is implementation-defined whether the cases of (0,inf,qnan)
2503 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2504 * they return if they do), so we have to hand this information
2505 * off to the target-specific pick-a-NaN routine.
2506 */
2507 if (((aExp == 0xff) && aSig) ||
2508 ((bExp == 0xff) && bSig) ||
2509 ((cExp == 0xff) && cSig)) {
2510 return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2511 }
2512
2513 if (infzero) {
2514 float_raise(float_flag_invalid, status);
2515 return float32_default_nan(status);
2516 }
2517
2518 if (flags & float_muladd_negate_c) {
2519 cSign ^= 1;
2520 }
2521
2522 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2523
2524 /* Work out the sign and type of the product */
2525 pSign = aSign ^ bSign;
2526 if (flags & float_muladd_negate_product) {
2527 pSign ^= 1;
2528 }
2529 pInf = (aExp == 0xff) || (bExp == 0xff);
2530 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2531
2532 if (cExp == 0xff) {
2533 if (pInf && (pSign ^ cSign)) {
2534 /* addition of opposite-signed infinities => InvalidOperation */
2535 float_raise(float_flag_invalid, status);
2536 return float32_default_nan(status);
2537 }
2538 /* Otherwise generate an infinity of the same sign */
2539 return packFloat32(cSign ^ signflip, 0xff, 0);
2540 }
2541
2542 if (pInf) {
2543 return packFloat32(pSign ^ signflip, 0xff, 0);
2544 }
2545
2546 if (pZero) {
2547 if (cExp == 0) {
2548 if (cSig == 0) {
2549 /* Adding two exact zeroes */
2550 if (pSign == cSign) {
2551 zSign = pSign;
2552 } else if (status->float_rounding_mode == float_round_down) {
2553 zSign = 1;
2554 } else {
2555 zSign = 0;
2556 }
2557 return packFloat32(zSign ^ signflip, 0, 0);
2558 }
2559 /* Exact zero plus a denorm */
2560 if (status->flush_to_zero) {
2561 float_raise(float_flag_output_denormal, status);
2562 return packFloat32(cSign ^ signflip, 0, 0);
2563 }
2564 }
2565 /* Zero plus something non-zero : just return the something */
2566 if (flags & float_muladd_halve_result) {
2567 if (cExp == 0) {
2568 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2569 }
2570 /* Subtract one to halve, and one again because roundAndPackFloat32
2571 * wants one less than the true exponent.
2572 */
2573 cExp -= 2;
2574 cSig = (cSig | 0x00800000) << 7;
2575 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2576 }
2577 return packFloat32(cSign ^ signflip, cExp, cSig);
2578 }
2579
2580 if (aExp == 0) {
2581 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2582 }
2583 if (bExp == 0) {
2584 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2585 }
2586
2587 /* Calculate the actual result a * b + c */
2588
2589 /* Multiply first; this is easy. */
2590 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2591 * because we want the true exponent, not the "one-less-than"
2592 * flavour that roundAndPackFloat32() takes.
2593 */
2594 pExp = aExp + bExp - 0x7e;
2595 aSig = (aSig | 0x00800000) << 7;
2596 bSig = (bSig | 0x00800000) << 8;
2597 pSig64 = (uint64_t)aSig * bSig;
2598 if ((int64_t)(pSig64 << 1) >= 0) {
2599 pSig64 <<= 1;
2600 pExp--;
2601 }
2602
2603 zSign = pSign ^ signflip;
2604
2605 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2606 * position 62.
2607 */
2608 if (cExp == 0) {
2609 if (!cSig) {
2610 /* Throw out the special case of c being an exact zero now */
2611 shift64RightJamming(pSig64, 32, &pSig64);
2612 pSig = pSig64;
2613 if (flags & float_muladd_halve_result) {
2614 pExp--;
2615 }
2616 return roundAndPackFloat32(zSign, pExp - 1,
2617 pSig, status);
2618 }
2619 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2620 }
2621
2622 cSig64 = (uint64_t)cSig << (62 - 23);
2623 cSig64 |= LIT64(0x4000000000000000);
2624 expDiff = pExp - cExp;
2625
2626 if (pSign == cSign) {
2627 /* Addition */
2628 if (expDiff > 0) {
2629 /* scale c to match p */
2630 shift64RightJamming(cSig64, expDiff, &cSig64);
2631 zExp = pExp;
2632 } else if (expDiff < 0) {
2633 /* scale p to match c */
2634 shift64RightJamming(pSig64, -expDiff, &pSig64);
2635 zExp = cExp;
2636 } else {
2637 /* no scaling needed */
2638 zExp = cExp;
2639 }
2640 /* Add significands and make sure explicit bit ends up in posn 62 */
2641 zSig64 = pSig64 + cSig64;
2642 if ((int64_t)zSig64 < 0) {
2643 shift64RightJamming(zSig64, 1, &zSig64);
2644 } else {
2645 zExp--;
2646 }
2647 } else {
2648 /* Subtraction */
2649 if (expDiff > 0) {
2650 shift64RightJamming(cSig64, expDiff, &cSig64);
2651 zSig64 = pSig64 - cSig64;
2652 zExp = pExp;
2653 } else if (expDiff < 0) {
2654 shift64RightJamming(pSig64, -expDiff, &pSig64);
2655 zSig64 = cSig64 - pSig64;
2656 zExp = cExp;
2657 zSign ^= 1;
2658 } else {
2659 zExp = pExp;
2660 if (cSig64 < pSig64) {
2661 zSig64 = pSig64 - cSig64;
2662 } else if (pSig64 < cSig64) {
2663 zSig64 = cSig64 - pSig64;
2664 zSign ^= 1;
2665 } else {
2666 /* Exact zero */
2667 zSign = signflip;
2668 if (status->float_rounding_mode == float_round_down) {
2669 zSign ^= 1;
2670 }
2671 return packFloat32(zSign, 0, 0);
2672 }
2673 }
2674 --zExp;
2675 /* Normalize to put the explicit bit back into bit 62. */
2676 shiftcount = countLeadingZeros64(zSig64) - 1;
2677 zSig64 <<= shiftcount;
2678 zExp -= shiftcount;
2679 }
2680 if (flags & float_muladd_halve_result) {
2681 zExp--;
2682 }
2683
2684 shift64RightJamming(zSig64, 32, &zSig64);
2685 return roundAndPackFloat32(zSign, zExp, zSig64, status);
2686 }
2687
2688
2689 /*----------------------------------------------------------------------------
2690 | Returns the square root of the single-precision floating-point value `a'.
2691 | The operation is performed according to the IEC/IEEE Standard for Binary
2692 | Floating-Point Arithmetic.
2693 *----------------------------------------------------------------------------*/
2694
2695 float32 float32_sqrt(float32 a, float_status *status)
2696 {
2697 flag aSign;
2698 int aExp, zExp;
2699 uint32_t aSig, zSig;
2700 uint64_t rem, term;
2701 a = float32_squash_input_denormal(a, status);
2702
2703 aSig = extractFloat32Frac( a );
2704 aExp = extractFloat32Exp( a );
2705 aSign = extractFloat32Sign( a );
2706 if ( aExp == 0xFF ) {
2707 if (aSig) {
2708 return propagateFloat32NaN(a, float32_zero, status);
2709 }
2710 if ( ! aSign ) return a;
2711 float_raise(float_flag_invalid, status);
2712 return float32_default_nan(status);
2713 }
2714 if ( aSign ) {
2715 if ( ( aExp | aSig ) == 0 ) return a;
2716 float_raise(float_flag_invalid, status);
2717 return float32_default_nan(status);
2718 }
2719 if ( aExp == 0 ) {
2720 if ( aSig == 0 ) return float32_zero;
2721 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2722 }
2723 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2724 aSig = ( aSig | 0x00800000 )<<8;
2725 zSig = estimateSqrt32( aExp, aSig ) + 2;
2726 if ( ( zSig & 0x7F ) <= 5 ) {
2727 if ( zSig < 2 ) {
2728 zSig = 0x7FFFFFFF;
2729 goto roundAndPack;
2730 }
2731 aSig >>= aExp & 1;
2732 term = ( (uint64_t) zSig ) * zSig;
2733 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2734 while ( (int64_t) rem < 0 ) {
2735 --zSig;
2736 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2737 }
2738 zSig |= ( rem != 0 );
2739 }
2740 shift32RightJamming( zSig, 1, &zSig );
2741 roundAndPack:
2742 return roundAndPackFloat32(0, zExp, zSig, status);
2743
2744 }
2745
2746 /*----------------------------------------------------------------------------
2747 | Returns the binary exponential of the single-precision floating-point value
2748 | `a'. The operation is performed according to the IEC/IEEE Standard for
2749 | Binary Floating-Point Arithmetic.
2750 |
2751 | Uses the following identities:
2752 |
2753 | 1. -------------------------------------------------------------------------
2754 | x x*ln(2)
2755 | 2 = e
2756 |
2757 | 2. -------------------------------------------------------------------------
2758 | 2 3 4 5 n
2759 | x x x x x x x
2760 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2761 | 1! 2! 3! 4! 5! n!
2762 *----------------------------------------------------------------------------*/
2763
2764 static const float64 float32_exp2_coefficients[15] =
2765 {
2766 const_float64( 0x3ff0000000000000ll ), /* 1 */
2767 const_float64( 0x3fe0000000000000ll ), /* 2 */
2768 const_float64( 0x3fc5555555555555ll ), /* 3 */
2769 const_float64( 0x3fa5555555555555ll ), /* 4 */
2770 const_float64( 0x3f81111111111111ll ), /* 5 */
2771 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2772 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2773 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2774 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2775 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2776 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2777 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2778 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2779 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2780 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2781 };
2782
2783 float32 float32_exp2(float32 a, float_status *status)
2784 {
2785 flag aSign;
2786 int aExp;
2787 uint32_t aSig;
2788 float64 r, x, xn;
2789 int i;
2790 a = float32_squash_input_denormal(a, status);
2791
2792 aSig = extractFloat32Frac( a );
2793 aExp = extractFloat32Exp( a );
2794 aSign = extractFloat32Sign( a );
2795
2796 if ( aExp == 0xFF) {
2797 if (aSig) {
2798 return propagateFloat32NaN(a, float32_zero, status);
2799 }
2800 return (aSign) ? float32_zero : a;
2801 }
2802 if (aExp == 0) {
2803 if (aSig == 0) return float32_one;
2804 }
2805
2806 float_raise(float_flag_inexact, status);
2807
2808 /* ******************************* */
2809 /* using float64 for approximation */
2810 /* ******************************* */
2811 x = float32_to_float64(a, status);
2812 x = float64_mul(x, float64_ln2, status);
2813
2814 xn = x;
2815 r = float64_one;
2816 for (i = 0 ; i < 15 ; i++) {
2817 float64 f;
2818
2819 f = float64_mul(xn, float32_exp2_coefficients[i], status);
2820 r = float64_add(r, f, status);
2821
2822 xn = float64_mul(xn, x, status);
2823 }
2824
2825 return float64_to_float32(r, status);
2826 }
2827
2828 /*----------------------------------------------------------------------------
2829 | Returns the binary log of the single-precision floating-point value `a'.
2830 | The operation is performed according to the IEC/IEEE Standard for Binary
2831 | Floating-Point Arithmetic.
2832 *----------------------------------------------------------------------------*/
2833 float32 float32_log2(float32 a, float_status *status)
2834 {
2835 flag aSign, zSign;
2836 int aExp;
2837 uint32_t aSig, zSig, i;
2838
2839 a = float32_squash_input_denormal(a, status);
2840 aSig = extractFloat32Frac( a );
2841 aExp = extractFloat32Exp( a );
2842 aSign = extractFloat32Sign( a );
2843
2844 if ( aExp == 0 ) {
2845 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2846 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2847 }
2848 if ( aSign ) {
2849 float_raise(float_flag_invalid, status);
2850 return float32_default_nan(status);
2851 }
2852 if ( aExp == 0xFF ) {
2853 if (aSig) {
2854 return propagateFloat32NaN(a, float32_zero, status);
2855 }
2856 return a;
2857 }
2858
2859 aExp -= 0x7F;
2860 aSig |= 0x00800000;
2861 zSign = aExp < 0;
2862 zSig = aExp << 23;
2863
2864 for (i = 1 << 22; i > 0; i >>= 1) {
2865 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2866 if ( aSig & 0x01000000 ) {
2867 aSig >>= 1;
2868 zSig |= i;
2869 }
2870 }
2871
2872 if ( zSign )
2873 zSig = -zSig;
2874
2875 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2876 }
2877
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is equal to
2880 | the corresponding value `b', and 0 otherwise. The invalid exception is
2881 | raised if either operand is a NaN. Otherwise, the comparison is performed
2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2884
2885 int float32_eq(float32 a, float32 b, float_status *status)
2886 {
2887 uint32_t av, bv;
2888 a = float32_squash_input_denormal(a, status);
2889 b = float32_squash_input_denormal(b, status);
2890
2891 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2892 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2893 ) {
2894 float_raise(float_flag_invalid, status);
2895 return 0;
2896 }
2897 av = float32_val(a);
2898 bv = float32_val(b);
2899 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2900 }
2901
2902 /*----------------------------------------------------------------------------
2903 | Returns 1 if the single-precision floating-point value `a' is less than
2904 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2905 | exception is raised if either operand is a NaN. The comparison is performed
2906 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2907 *----------------------------------------------------------------------------*/
2908
2909 int float32_le(float32 a, float32 b, float_status *status)
2910 {
2911 flag aSign, bSign;
2912 uint32_t av, bv;
2913 a = float32_squash_input_denormal(a, status);
2914 b = float32_squash_input_denormal(b, status);
2915
2916 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2917 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2918 ) {
2919 float_raise(float_flag_invalid, status);
2920 return 0;
2921 }
2922 aSign = extractFloat32Sign( a );
2923 bSign = extractFloat32Sign( b );
2924 av = float32_val(a);
2925 bv = float32_val(b);
2926 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2927 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2928
2929 }
2930
2931 /*----------------------------------------------------------------------------
2932 | Returns 1 if the single-precision floating-point value `a' is less than
2933 | the corresponding value `b', and 0 otherwise. The invalid exception is
2934 | raised if either operand is a NaN. The comparison is performed according
2935 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2936 *----------------------------------------------------------------------------*/
2937
2938 int float32_lt(float32 a, float32 b, float_status *status)
2939 {
2940 flag aSign, bSign;
2941 uint32_t av, bv;
2942 a = float32_squash_input_denormal(a, status);
2943 b = float32_squash_input_denormal(b, status);
2944
2945 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2946 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2947 ) {
2948 float_raise(float_flag_invalid, status);
2949 return 0;
2950 }
2951 aSign = extractFloat32Sign( a );
2952 bSign = extractFloat32Sign( b );
2953 av = float32_val(a);
2954 bv = float32_val(b);
2955 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2956 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2957
2958 }
2959
2960 /*----------------------------------------------------------------------------
2961 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2962 | be compared, and 0 otherwise. The invalid exception is raised if either
2963 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2964 | Standard for Binary Floating-Point Arithmetic.
2965 *----------------------------------------------------------------------------*/
2966
2967 int float32_unordered(float32 a, float32 b, float_status *status)
2968 {
2969 a = float32_squash_input_denormal(a, status);
2970 b = float32_squash_input_denormal(b, status);
2971
2972 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2973 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2974 ) {
2975 float_raise(float_flag_invalid, status);
2976 return 1;
2977 }
2978 return 0;
2979 }
2980
2981 /*----------------------------------------------------------------------------
2982 | Returns 1 if the single-precision floating-point value `a' is equal to
2983 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2984 | exception. The comparison is performed according to the IEC/IEEE Standard
2985 | for Binary Floating-Point Arithmetic.
2986 *----------------------------------------------------------------------------*/
2987
2988 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2989 {
2990 a = float32_squash_input_denormal(a, status);
2991 b = float32_squash_input_denormal(b, status);
2992
2993 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2994 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2995 ) {
2996 if (float32_is_signaling_nan(a, status)
2997 || float32_is_signaling_nan(b, status)) {
2998 float_raise(float_flag_invalid, status);
2999 }
3000 return 0;
3001 }
3002 return ( float32_val(a) == float32_val(b) ) ||
3003 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3004 }
3005
3006 /*----------------------------------------------------------------------------
3007 | Returns 1 if the single-precision floating-point value `a' is less than or
3008 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3009 | cause an exception. Otherwise, the comparison is performed according to the
3010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011 *----------------------------------------------------------------------------*/
3012
3013 int float32_le_quiet(float32 a, float32 b, float_status *status)
3014 {
3015 flag aSign, bSign;
3016 uint32_t av, bv;
3017 a = float32_squash_input_denormal(a, status);
3018 b = float32_squash_input_denormal(b, status);
3019
3020 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3021 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3022 ) {
3023 if (float32_is_signaling_nan(a, status)
3024 || float32_is_signaling_nan(b, status)) {
3025 float_raise(float_flag_invalid, status);
3026 }
3027 return 0;
3028 }
3029 aSign = extractFloat32Sign( a );
3030 bSign = extractFloat32Sign( b );
3031 av = float32_val(a);
3032 bv = float32_val(b);
3033 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3034 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3035
3036 }
3037
3038 /*----------------------------------------------------------------------------
3039 | Returns 1 if the single-precision floating-point value `a' is less than
3040 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3041 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3042 | Standard for Binary Floating-Point Arithmetic.
3043 *----------------------------------------------------------------------------*/
3044
3045 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3046 {
3047 flag aSign, bSign;
3048 uint32_t av, bv;
3049 a = float32_squash_input_denormal(a, status);
3050 b = float32_squash_input_denormal(b, status);
3051
3052 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3053 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3054 ) {
3055 if (float32_is_signaling_nan(a, status)
3056 || float32_is_signaling_nan(b, status)) {
3057 float_raise(float_flag_invalid, status);
3058 }
3059 return 0;
3060 }
3061 aSign = extractFloat32Sign( a );
3062 bSign = extractFloat32Sign( b );
3063 av = float32_val(a);
3064 bv = float32_val(b);
3065 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3066 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3067
3068 }
3069
3070 /*----------------------------------------------------------------------------
3071 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3072 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3073 | comparison is performed according to the IEC/IEEE Standard for Binary
3074 | Floating-Point Arithmetic.
3075 *----------------------------------------------------------------------------*/
3076
3077 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3078 {
3079 a = float32_squash_input_denormal(a, status);
3080 b = float32_squash_input_denormal(b, status);
3081
3082 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3083 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3084 ) {
3085 if (float32_is_signaling_nan(a, status)
3086 || float32_is_signaling_nan(b, status)) {
3087 float_raise(float_flag_invalid, status);
3088 }
3089 return 1;
3090 }
3091 return 0;
3092 }
3093
3094 /*----------------------------------------------------------------------------
3095 | Returns the result of converting the double-precision floating-point value
3096 | `a' to the 32-bit two's complement integer format. The conversion is
3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3098 | Arithmetic---which means in particular that the conversion is rounded
3099 | according to the current rounding mode. If `a' is a NaN, the largest
3100 | positive integer is returned. Otherwise, if the conversion overflows, the
3101 | largest integer with the same sign as `a' is returned.
3102 *----------------------------------------------------------------------------*/
3103
3104 int32_t float64_to_int32(float64 a, float_status *status)
3105 {
3106 flag aSign;
3107 int aExp;
3108 int shiftCount;
3109 uint64_t aSig;
3110 a = float64_squash_input_denormal(a, status);
3111
3112 aSig = extractFloat64Frac( a );
3113 aExp = extractFloat64Exp( a );
3114 aSign = extractFloat64Sign( a );
3115 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3116 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3117 shiftCount = 0x42C - aExp;
3118 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3119 return roundAndPackInt32(aSign, aSig, status);
3120
3121 }
3122
3123 /*----------------------------------------------------------------------------
3124 | Returns the result of converting the double-precision floating-point value
3125 | `a' to the 32-bit two's complement integer format. The conversion is
3126 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3127 | Arithmetic, except that the conversion is always rounded toward zero.
3128 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3129 | the conversion overflows, the largest integer with the same sign as `a' is
3130 | returned.
3131 *----------------------------------------------------------------------------*/
3132
3133 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3134 {
3135 flag aSign;
3136 int aExp;
3137 int shiftCount;
3138 uint64_t aSig, savedASig;
3139 int32_t z;
3140 a = float64_squash_input_denormal(a, status);
3141
3142 aSig = extractFloat64Frac( a );
3143 aExp = extractFloat64Exp( a );
3144 aSign = extractFloat64Sign( a );
3145 if ( 0x41E < aExp ) {
3146 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3147 goto invalid;
3148 }
3149 else if ( aExp < 0x3FF ) {
3150 if (aExp || aSig) {
3151 status->float_exception_flags |= float_flag_inexact;
3152 }
3153 return 0;
3154 }
3155 aSig |= LIT64( 0x0010000000000000 );
3156 shiftCount = 0x433 - aExp;
3157 savedASig = aSig;
3158 aSig >>= shiftCount;
3159 z = aSig;
3160 if ( aSign ) z = - z;
3161 if ( ( z < 0 ) ^ aSign ) {
3162 invalid:
3163 float_raise(float_flag_invalid, status);
3164 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3165 }
3166 if ( ( aSig<<shiftCount ) != savedASig ) {
3167 status->float_exception_flags |= float_flag_inexact;
3168 }
3169 return z;
3170
3171 }
3172
3173 /*----------------------------------------------------------------------------
3174 | Returns the result of converting the double-precision floating-point value
3175 | `a' to the 16-bit two's complement integer format. The conversion is
3176 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3177 | Arithmetic, except that the conversion is always rounded toward zero.
3178 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3179 | the conversion overflows, the largest integer with the same sign as `a' is
3180 | returned.
3181 *----------------------------------------------------------------------------*/
3182
3183 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3184 {
3185 flag aSign;
3186 int aExp;
3187 int shiftCount;
3188 uint64_t aSig, savedASig;
3189 int32_t z;
3190
3191 aSig = extractFloat64Frac( a );
3192 aExp = extractFloat64Exp( a );
3193 aSign = extractFloat64Sign( a );
3194 if ( 0x40E < aExp ) {
3195 if ( ( aExp == 0x7FF ) && aSig ) {
3196 aSign = 0;
3197 }
3198 goto invalid;
3199 }
3200 else if ( aExp < 0x3FF ) {
3201 if ( aExp || aSig ) {
3202 status->float_exception_flags |= float_flag_inexact;
3203 }
3204 return 0;
3205 }
3206 aSig |= LIT64( 0x0010000000000000 );
3207 shiftCount = 0x433 - aExp;
3208 savedASig = aSig;
3209 aSig >>= shiftCount;
3210 z = aSig;
3211 if ( aSign ) {
3212 z = - z;
3213 }
3214 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3215 invalid:
3216 float_raise(float_flag_invalid, status);
3217 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3218 }
3219 if ( ( aSig<<shiftCount ) != savedASig ) {
3220 status->float_exception_flags |= float_flag_inexact;
3221 }
3222 return z;
3223 }
3224
3225 /*----------------------------------------------------------------------------
3226 | Returns the result of converting the double-precision floating-point value
3227 | `a' to the 64-bit two's complement integer format. The conversion is
3228 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3229 | Arithmetic---which means in particular that the conversion is rounded
3230 | according to the current rounding mode. If `a' is a NaN, the largest
3231 | positive integer is returned. Otherwise, if the conversion overflows, the
3232 | largest integer with the same sign as `a' is returned.
3233 *----------------------------------------------------------------------------*/
3234
3235 int64_t float64_to_int64(float64 a, float_status *status)
3236 {
3237 flag aSign;
3238 int aExp;
3239 int shiftCount;
3240 uint64_t aSig, aSigExtra;
3241 a = float64_squash_input_denormal(a, status);
3242
3243 aSig = extractFloat64Frac( a );
3244 aExp = extractFloat64Exp( a );
3245 aSign = extractFloat64Sign( a );
3246 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3247 shiftCount = 0x433 - aExp;
3248 if ( shiftCount <= 0 ) {
3249 if ( 0x43E < aExp ) {
3250 float_raise(float_flag_invalid, status);
3251 if ( ! aSign
3252 || ( ( aExp == 0x7FF )
3253 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3254 ) {
3255 return LIT64( 0x7FFFFFFFFFFFFFFF );
3256 }
3257 return (int64_t) LIT64( 0x8000000000000000 );
3258 }
3259 aSigExtra = 0;
3260 aSig <<= - shiftCount;
3261 }
3262 else {
3263 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3264 }
3265 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3266
3267 }
3268
3269 /*----------------------------------------------------------------------------
3270 | Returns the result of converting the double-precision floating-point value
3271 | `a' to the 64-bit two's complement integer format. The conversion is
3272 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3273 | Arithmetic, except that the conversion is always rounded toward zero.
3274 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3275 | the conversion overflows, the largest integer with the same sign as `a' is
3276 | returned.
3277 *----------------------------------------------------------------------------*/
3278
3279 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3280 {
3281 flag aSign;
3282 int aExp;
3283 int shiftCount;
3284 uint64_t aSig;
3285 int64_t z;
3286 a = float64_squash_input_denormal(a, status);
3287
3288 aSig = extractFloat64Frac( a );
3289 aExp = extractFloat64Exp( a );
3290 aSign = extractFloat64Sign( a );
3291 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3292 shiftCount = aExp - 0x433;
3293 if ( 0 <= shiftCount ) {
3294 if ( 0x43E <= aExp ) {
3295 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3296 float_raise(float_flag_invalid, status);
3297 if ( ! aSign
3298 || ( ( aExp == 0x7FF )
3299 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3300 ) {
3301 return LIT64( 0x7FFFFFFFFFFFFFFF );
3302 }
3303 }
3304 return (int64_t) LIT64( 0x8000000000000000 );
3305 }
3306 z = aSig<<shiftCount;
3307 }
3308 else {
3309 if ( aExp < 0x3FE ) {
3310 if (aExp | aSig) {
3311 status->float_exception_flags |= float_flag_inexact;
3312 }
3313 return 0;
3314 }
3315 z = aSig>>( - shiftCount );
3316 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3317 status->float_exception_flags |= float_flag_inexact;
3318 }
3319 }
3320 if ( aSign ) z = - z;
3321 return z;
3322
3323 }
3324
3325 /*----------------------------------------------------------------------------
3326 | Returns the result of converting the double-precision floating-point value
3327 | `a' to the single-precision floating-point format. The conversion is
3328 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3329 | Arithmetic.
3330 *----------------------------------------------------------------------------*/
3331
3332 float32 float64_to_float32(float64 a, float_status *status)
3333 {
3334 flag aSign;
3335 int aExp;
3336 uint64_t aSig;
3337 uint32_t zSig;
3338 a = float64_squash_input_denormal(a, status);
3339
3340 aSig = extractFloat64Frac( a );
3341 aExp = extractFloat64Exp( a );
3342 aSign = extractFloat64Sign( a );
3343 if ( aExp == 0x7FF ) {
3344 if (aSig) {
3345 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3346 }
3347 return packFloat32( aSign, 0xFF, 0 );
3348 }
3349 shift64RightJamming( aSig, 22, &aSig );
3350 zSig = aSig;
3351 if ( aExp || zSig ) {
3352 zSig |= 0x40000000;
3353 aExp -= 0x381;
3354 }
3355 return roundAndPackFloat32(aSign, aExp, zSig, status);
3356
3357 }
3358
3359
3360 /*----------------------------------------------------------------------------
3361 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3362 | half-precision floating-point value, returning the result. After being
3363 | shifted into the proper positions, the three fields are simply added
3364 | together to form the result. This means that any integer portion of `zSig'
3365 | will be added into the exponent. Since a properly normalized significand
3366 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3367 | than the desired result exponent whenever `zSig' is a complete, normalized
3368 | significand.
3369 *----------------------------------------------------------------------------*/
3370 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3371 {
3372 return make_float16(
3373 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) <<