Merge remote-tracking branch 'remotes/armbru/tags/for-upstream' into staging
[qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
6
7 /*============================================================================
8
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
11
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
21
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
35
36 =============================================================================*/
37
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
40 */
41 #include "config.h"
42
43 #include "fpu/softfloat.h"
44
45 /* We only need stdlib for abort() */
46 #include <stdlib.h>
47
48 /*----------------------------------------------------------------------------
49 | Primitive arithmetic functions, including multi-word arithmetic, and
50 | division and square root approximations. (Can be specialized to target if
51 | desired.)
52 *----------------------------------------------------------------------------*/
53 #include "softfloat-macros.h"
54
55 /*----------------------------------------------------------------------------
56 | Functions and definitions to determine: (1) whether tininess for underflow
57 | is detected before or after rounding by default, (2) what (if anything)
58 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
59 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60 | are propagated from function inputs to output. These details are target-
61 | specific.
62 *----------------------------------------------------------------------------*/
63 #include "softfloat-specialize.h"
64
65 /*----------------------------------------------------------------------------
66 | Returns the fraction bits of the half-precision floating-point value `a'.
67 *----------------------------------------------------------------------------*/
68
69 static inline uint32_t extractFloat16Frac(float16 a)
70 {
71 return float16_val(a) & 0x3ff;
72 }
73
74 /*----------------------------------------------------------------------------
75 | Returns the exponent bits of the half-precision floating-point value `a'.
76 *----------------------------------------------------------------------------*/
77
78 static inline int_fast16_t extractFloat16Exp(float16 a)
79 {
80 return (float16_val(a) >> 10) & 0x1f;
81 }
82
83 /*----------------------------------------------------------------------------
84 | Returns the sign bit of the single-precision floating-point value `a'.
85 *----------------------------------------------------------------------------*/
86
87 static inline flag extractFloat16Sign(float16 a)
88 {
89 return float16_val(a)>>15;
90 }
91
92 /*----------------------------------------------------------------------------
93 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
94 | and 7, and returns the properly rounded 32-bit integer corresponding to the
95 | input. If `zSign' is 1, the input is negated before being converted to an
96 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
97 | is simply rounded to an integer, with the inexact exception raised if the
98 | input cannot be represented exactly as an integer. However, if the fixed-
99 | point input is too large, the invalid exception is raised and the largest
100 | positive or negative integer is returned.
101 *----------------------------------------------------------------------------*/
102
103 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
104 {
105 int8 roundingMode;
106 flag roundNearestEven;
107 int8 roundIncrement, roundBits;
108 int32_t z;
109
110 roundingMode = STATUS(float_rounding_mode);
111 roundNearestEven = ( roundingMode == float_round_nearest_even );
112 switch (roundingMode) {
113 case float_round_nearest_even:
114 case float_round_ties_away:
115 roundIncrement = 0x40;
116 break;
117 case float_round_to_zero:
118 roundIncrement = 0;
119 break;
120 case float_round_up:
121 roundIncrement = zSign ? 0 : 0x7f;
122 break;
123 case float_round_down:
124 roundIncrement = zSign ? 0x7f : 0;
125 break;
126 default:
127 abort();
128 }
129 roundBits = absZ & 0x7F;
130 absZ = ( absZ + roundIncrement )>>7;
131 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
132 z = absZ;
133 if ( zSign ) z = - z;
134 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
135 float_raise( float_flag_invalid STATUS_VAR);
136 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
137 }
138 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
139 return z;
140
141 }
142
143 /*----------------------------------------------------------------------------
144 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
145 | `absZ1', with binary point between bits 63 and 64 (between the input words),
146 | and returns the properly rounded 64-bit integer corresponding to the input.
147 | If `zSign' is 1, the input is negated before being converted to an integer.
148 | Ordinarily, the fixed-point input is simply rounded to an integer, with
149 | the inexact exception raised if the input cannot be represented exactly as
150 | an integer. However, if the fixed-point input is too large, the invalid
151 | exception is raised and the largest positive or negative integer is
152 | returned.
153 *----------------------------------------------------------------------------*/
154
155 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
156 {
157 int8 roundingMode;
158 flag roundNearestEven, increment;
159 int64_t z;
160
161 roundingMode = STATUS(float_rounding_mode);
162 roundNearestEven = ( roundingMode == float_round_nearest_even );
163 switch (roundingMode) {
164 case float_round_nearest_even:
165 case float_round_ties_away:
166 increment = ((int64_t) absZ1 < 0);
167 break;
168 case float_round_to_zero:
169 increment = 0;
170 break;
171 case float_round_up:
172 increment = !zSign && absZ1;
173 break;
174 case float_round_down:
175 increment = zSign && absZ1;
176 break;
177 default:
178 abort();
179 }
180 if ( increment ) {
181 ++absZ0;
182 if ( absZ0 == 0 ) goto overflow;
183 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
184 }
185 z = absZ0;
186 if ( zSign ) z = - z;
187 if ( z && ( ( z < 0 ) ^ zSign ) ) {
188 overflow:
189 float_raise( float_flag_invalid STATUS_VAR);
190 return
191 zSign ? (int64_t) LIT64( 0x8000000000000000 )
192 : LIT64( 0x7FFFFFFFFFFFFFFF );
193 }
194 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
195 return z;
196
197 }
198
199 /*----------------------------------------------------------------------------
200 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
201 | `absZ1', with binary point between bits 63 and 64 (between the input words),
202 | and returns the properly rounded 64-bit unsigned integer corresponding to the
203 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
204 | with the inexact exception raised if the input cannot be represented exactly
205 | as an integer. However, if the fixed-point input is too large, the invalid
206 | exception is raised and the largest unsigned integer is returned.
207 *----------------------------------------------------------------------------*/
208
209 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
210 uint64_t absZ1 STATUS_PARAM)
211 {
212 int8 roundingMode;
213 flag roundNearestEven, increment;
214
215 roundingMode = STATUS(float_rounding_mode);
216 roundNearestEven = (roundingMode == float_round_nearest_even);
217 switch (roundingMode) {
218 case float_round_nearest_even:
219 case float_round_ties_away:
220 increment = ((int64_t)absZ1 < 0);
221 break;
222 case float_round_to_zero:
223 increment = 0;
224 break;
225 case float_round_up:
226 increment = !zSign && absZ1;
227 break;
228 case float_round_down:
229 increment = zSign && absZ1;
230 break;
231 default:
232 abort();
233 }
234 if (increment) {
235 ++absZ0;
236 if (absZ0 == 0) {
237 float_raise(float_flag_invalid STATUS_VAR);
238 return LIT64(0xFFFFFFFFFFFFFFFF);
239 }
240 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
241 }
242
243 if (zSign && absZ0) {
244 float_raise(float_flag_invalid STATUS_VAR);
245 return 0;
246 }
247
248 if (absZ1) {
249 STATUS(float_exception_flags) |= float_flag_inexact;
250 }
251 return absZ0;
252 }
253
254 /*----------------------------------------------------------------------------
255 | Returns the fraction bits of the single-precision floating-point value `a'.
256 *----------------------------------------------------------------------------*/
257
258 static inline uint32_t extractFloat32Frac( float32 a )
259 {
260
261 return float32_val(a) & 0x007FFFFF;
262
263 }
264
265 /*----------------------------------------------------------------------------
266 | Returns the exponent bits of the single-precision floating-point value `a'.
267 *----------------------------------------------------------------------------*/
268
269 static inline int_fast16_t extractFloat32Exp(float32 a)
270 {
271
272 return ( float32_val(a)>>23 ) & 0xFF;
273
274 }
275
276 /*----------------------------------------------------------------------------
277 | Returns the sign bit of the single-precision floating-point value `a'.
278 *----------------------------------------------------------------------------*/
279
280 static inline flag extractFloat32Sign( float32 a )
281 {
282
283 return float32_val(a)>>31;
284
285 }
286
287 /*----------------------------------------------------------------------------
288 | If `a' is denormal and we are in flush-to-zero mode then set the
289 | input-denormal exception and return zero. Otherwise just return the value.
290 *----------------------------------------------------------------------------*/
291 float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
292 {
293 if (STATUS(flush_inputs_to_zero)) {
294 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
295 float_raise(float_flag_input_denormal STATUS_VAR);
296 return make_float32(float32_val(a) & 0x80000000);
297 }
298 }
299 return a;
300 }
301
302 /*----------------------------------------------------------------------------
303 | Normalizes the subnormal single-precision floating-point value represented
304 | by the denormalized significand `aSig'. The normalized exponent and
305 | significand are stored at the locations pointed to by `zExpPtr' and
306 | `zSigPtr', respectively.
307 *----------------------------------------------------------------------------*/
308
309 static void
310 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
311 {
312 int8 shiftCount;
313
314 shiftCount = countLeadingZeros32( aSig ) - 8;
315 *zSigPtr = aSig<<shiftCount;
316 *zExpPtr = 1 - shiftCount;
317
318 }
319
320 /*----------------------------------------------------------------------------
321 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
322 | single-precision floating-point value, returning the result. After being
323 | shifted into the proper positions, the three fields are simply added
324 | together to form the result. This means that any integer portion of `zSig'
325 | will be added into the exponent. Since a properly normalized significand
326 | will have an integer portion equal to 1, the `zExp' input should be 1 less
327 | than the desired result exponent whenever `zSig' is a complete, normalized
328 | significand.
329 *----------------------------------------------------------------------------*/
330
331 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
332 {
333
334 return make_float32(
335 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
336
337 }
338
339 /*----------------------------------------------------------------------------
340 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
341 | and significand `zSig', and returns the proper single-precision floating-
342 | point value corresponding to the abstract input. Ordinarily, the abstract
343 | value is simply rounded and packed into the single-precision format, with
344 | the inexact exception raised if the abstract input cannot be represented
345 | exactly. However, if the abstract value is too large, the overflow and
346 | inexact exceptions are raised and an infinity or maximal finite value is
347 | returned. If the abstract value is too small, the input value is rounded to
348 | a subnormal number, and the underflow and inexact exceptions are raised if
349 | the abstract input cannot be represented exactly as a subnormal single-
350 | precision floating-point number.
351 | The input significand `zSig' has its binary point between bits 30
352 | and 29, which is 7 bits to the left of the usual location. This shifted
353 | significand must be normalized or smaller. If `zSig' is not normalized,
354 | `zExp' must be 0; in that case, the result returned is a subnormal number,
355 | and it must not require rounding. In the usual case that `zSig' is
356 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
357 | The handling of underflow and overflow follows the IEC/IEEE Standard for
358 | Binary Floating-Point Arithmetic.
359 *----------------------------------------------------------------------------*/
360
361 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
362 {
363 int8 roundingMode;
364 flag roundNearestEven;
365 int8 roundIncrement, roundBits;
366 flag isTiny;
367
368 roundingMode = STATUS(float_rounding_mode);
369 roundNearestEven = ( roundingMode == float_round_nearest_even );
370 switch (roundingMode) {
371 case float_round_nearest_even:
372 case float_round_ties_away:
373 roundIncrement = 0x40;
374 break;
375 case float_round_to_zero:
376 roundIncrement = 0;
377 break;
378 case float_round_up:
379 roundIncrement = zSign ? 0 : 0x7f;
380 break;
381 case float_round_down:
382 roundIncrement = zSign ? 0x7f : 0;
383 break;
384 default:
385 abort();
386 break;
387 }
388 roundBits = zSig & 0x7F;
389 if ( 0xFD <= (uint16_t) zExp ) {
390 if ( ( 0xFD < zExp )
391 || ( ( zExp == 0xFD )
392 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
393 ) {
394 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
395 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
396 }
397 if ( zExp < 0 ) {
398 if (STATUS(flush_to_zero)) {
399 float_raise(float_flag_output_denormal STATUS_VAR);
400 return packFloat32(zSign, 0, 0);
401 }
402 isTiny =
403 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
404 || ( zExp < -1 )
405 || ( zSig + roundIncrement < 0x80000000 );
406 shift32RightJamming( zSig, - zExp, &zSig );
407 zExp = 0;
408 roundBits = zSig & 0x7F;
409 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
410 }
411 }
412 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
413 zSig = ( zSig + roundIncrement )>>7;
414 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
415 if ( zSig == 0 ) zExp = 0;
416 return packFloat32( zSign, zExp, zSig );
417
418 }
419
420 /*----------------------------------------------------------------------------
421 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
422 | and significand `zSig', and returns the proper single-precision floating-
423 | point value corresponding to the abstract input. This routine is just like
424 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
425 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
426 | floating-point exponent.
427 *----------------------------------------------------------------------------*/
428
429 static float32
430 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
431 {
432 int8 shiftCount;
433
434 shiftCount = countLeadingZeros32( zSig ) - 1;
435 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
436
437 }
438
439 /*----------------------------------------------------------------------------
440 | Returns the fraction bits of the double-precision floating-point value `a'.
441 *----------------------------------------------------------------------------*/
442
443 static inline uint64_t extractFloat64Frac( float64 a )
444 {
445
446 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
447
448 }
449
450 /*----------------------------------------------------------------------------
451 | Returns the exponent bits of the double-precision floating-point value `a'.
452 *----------------------------------------------------------------------------*/
453
454 static inline int_fast16_t extractFloat64Exp(float64 a)
455 {
456
457 return ( float64_val(a)>>52 ) & 0x7FF;
458
459 }
460
461 /*----------------------------------------------------------------------------
462 | Returns the sign bit of the double-precision floating-point value `a'.
463 *----------------------------------------------------------------------------*/
464
465 static inline flag extractFloat64Sign( float64 a )
466 {
467
468 return float64_val(a)>>63;
469
470 }
471
472 /*----------------------------------------------------------------------------
473 | If `a' is denormal and we are in flush-to-zero mode then set the
474 | input-denormal exception and return zero. Otherwise just return the value.
475 *----------------------------------------------------------------------------*/
476 float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
477 {
478 if (STATUS(flush_inputs_to_zero)) {
479 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
480 float_raise(float_flag_input_denormal STATUS_VAR);
481 return make_float64(float64_val(a) & (1ULL << 63));
482 }
483 }
484 return a;
485 }
486
487 /*----------------------------------------------------------------------------
488 | Normalizes the subnormal double-precision floating-point value represented
489 | by the denormalized significand `aSig'. The normalized exponent and
490 | significand are stored at the locations pointed to by `zExpPtr' and
491 | `zSigPtr', respectively.
492 *----------------------------------------------------------------------------*/
493
494 static void
495 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
496 {
497 int8 shiftCount;
498
499 shiftCount = countLeadingZeros64( aSig ) - 11;
500 *zSigPtr = aSig<<shiftCount;
501 *zExpPtr = 1 - shiftCount;
502
503 }
504
505 /*----------------------------------------------------------------------------
506 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
507 | double-precision floating-point value, returning the result. After being
508 | shifted into the proper positions, the three fields are simply added
509 | together to form the result. This means that any integer portion of `zSig'
510 | will be added into the exponent. Since a properly normalized significand
511 | will have an integer portion equal to 1, the `zExp' input should be 1 less
512 | than the desired result exponent whenever `zSig' is a complete, normalized
513 | significand.
514 *----------------------------------------------------------------------------*/
515
516 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
517 {
518
519 return make_float64(
520 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
521
522 }
523
524 /*----------------------------------------------------------------------------
525 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
526 | and significand `zSig', and returns the proper double-precision floating-
527 | point value corresponding to the abstract input. Ordinarily, the abstract
528 | value is simply rounded and packed into the double-precision format, with
529 | the inexact exception raised if the abstract input cannot be represented
530 | exactly. However, if the abstract value is too large, the overflow and
531 | inexact exceptions are raised and an infinity or maximal finite value is
532 | returned. If the abstract value is too small, the input value is rounded
533 | to a subnormal number, and the underflow and inexact exceptions are raised
534 | if the abstract input cannot be represented exactly as a subnormal double-
535 | precision floating-point number.
536 | The input significand `zSig' has its binary point between bits 62
537 | and 61, which is 10 bits to the left of the usual location. This shifted
538 | significand must be normalized or smaller. If `zSig' is not normalized,
539 | `zExp' must be 0; in that case, the result returned is a subnormal number,
540 | and it must not require rounding. In the usual case that `zSig' is
541 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
542 | The handling of underflow and overflow follows the IEC/IEEE Standard for
543 | Binary Floating-Point Arithmetic.
544 *----------------------------------------------------------------------------*/
545
546 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
547 {
548 int8 roundingMode;
549 flag roundNearestEven;
550 int_fast16_t roundIncrement, roundBits;
551 flag isTiny;
552
553 roundingMode = STATUS(float_rounding_mode);
554 roundNearestEven = ( roundingMode == float_round_nearest_even );
555 switch (roundingMode) {
556 case float_round_nearest_even:
557 case float_round_ties_away:
558 roundIncrement = 0x200;
559 break;
560 case float_round_to_zero:
561 roundIncrement = 0;
562 break;
563 case float_round_up:
564 roundIncrement = zSign ? 0 : 0x3ff;
565 break;
566 case float_round_down:
567 roundIncrement = zSign ? 0x3ff : 0;
568 break;
569 default:
570 abort();
571 }
572 roundBits = zSig & 0x3FF;
573 if ( 0x7FD <= (uint16_t) zExp ) {
574 if ( ( 0x7FD < zExp )
575 || ( ( zExp == 0x7FD )
576 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
577 ) {
578 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
579 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
580 }
581 if ( zExp < 0 ) {
582 if (STATUS(flush_to_zero)) {
583 float_raise(float_flag_output_denormal STATUS_VAR);
584 return packFloat64(zSign, 0, 0);
585 }
586 isTiny =
587 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
588 || ( zExp < -1 )
589 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
590 shift64RightJamming( zSig, - zExp, &zSig );
591 zExp = 0;
592 roundBits = zSig & 0x3FF;
593 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
594 }
595 }
596 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
597 zSig = ( zSig + roundIncrement )>>10;
598 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
599 if ( zSig == 0 ) zExp = 0;
600 return packFloat64( zSign, zExp, zSig );
601
602 }
603
604 /*----------------------------------------------------------------------------
605 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
606 | and significand `zSig', and returns the proper double-precision floating-
607 | point value corresponding to the abstract input. This routine is just like
608 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
609 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
610 | floating-point exponent.
611 *----------------------------------------------------------------------------*/
612
613 static float64
614 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
615 {
616 int8 shiftCount;
617
618 shiftCount = countLeadingZeros64( zSig ) - 1;
619 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
620
621 }
622
623 /*----------------------------------------------------------------------------
624 | Returns the fraction bits of the extended double-precision floating-point
625 | value `a'.
626 *----------------------------------------------------------------------------*/
627
628 static inline uint64_t extractFloatx80Frac( floatx80 a )
629 {
630
631 return a.low;
632
633 }
634
635 /*----------------------------------------------------------------------------
636 | Returns the exponent bits of the extended double-precision floating-point
637 | value `a'.
638 *----------------------------------------------------------------------------*/
639
640 static inline int32 extractFloatx80Exp( floatx80 a )
641 {
642
643 return a.high & 0x7FFF;
644
645 }
646
647 /*----------------------------------------------------------------------------
648 | Returns the sign bit of the extended double-precision floating-point value
649 | `a'.
650 *----------------------------------------------------------------------------*/
651
652 static inline flag extractFloatx80Sign( floatx80 a )
653 {
654
655 return a.high>>15;
656
657 }
658
659 /*----------------------------------------------------------------------------
660 | Normalizes the subnormal extended double-precision floating-point value
661 | represented by the denormalized significand `aSig'. The normalized exponent
662 | and significand are stored at the locations pointed to by `zExpPtr' and
663 | `zSigPtr', respectively.
664 *----------------------------------------------------------------------------*/
665
666 static void
667 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
668 {
669 int8 shiftCount;
670
671 shiftCount = countLeadingZeros64( aSig );
672 *zSigPtr = aSig<<shiftCount;
673 *zExpPtr = 1 - shiftCount;
674
675 }
676
677 /*----------------------------------------------------------------------------
678 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
679 | extended double-precision floating-point value, returning the result.
680 *----------------------------------------------------------------------------*/
681
682 static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
683 {
684 floatx80 z;
685
686 z.low = zSig;
687 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
688 return z;
689
690 }
691
692 /*----------------------------------------------------------------------------
693 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
694 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
695 | and returns the proper extended double-precision floating-point value
696 | corresponding to the abstract input. Ordinarily, the abstract value is
697 | rounded and packed into the extended double-precision format, with the
698 | inexact exception raised if the abstract input cannot be represented
699 | exactly. However, if the abstract value is too large, the overflow and
700 | inexact exceptions are raised and an infinity or maximal finite value is
701 | returned. If the abstract value is too small, the input value is rounded to
702 | a subnormal number, and the underflow and inexact exceptions are raised if
703 | the abstract input cannot be represented exactly as a subnormal extended
704 | double-precision floating-point number.
705 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
706 | number of bits as single or double precision, respectively. Otherwise, the
707 | result is rounded to the full precision of the extended double-precision
708 | format.
709 | The input significand must be normalized or smaller. If the input
710 | significand is not normalized, `zExp' must be 0; in that case, the result
711 | returned is a subnormal number, and it must not require rounding. The
712 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
713 | Floating-Point Arithmetic.
714 *----------------------------------------------------------------------------*/
715
716 static floatx80
717 roundAndPackFloatx80(
718 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
719 STATUS_PARAM)
720 {
721 int8 roundingMode;
722 flag roundNearestEven, increment, isTiny;
723 int64 roundIncrement, roundMask, roundBits;
724
725 roundingMode = STATUS(float_rounding_mode);
726 roundNearestEven = ( roundingMode == float_round_nearest_even );
727 if ( roundingPrecision == 80 ) goto precision80;
728 if ( roundingPrecision == 64 ) {
729 roundIncrement = LIT64( 0x0000000000000400 );
730 roundMask = LIT64( 0x00000000000007FF );
731 }
732 else if ( roundingPrecision == 32 ) {
733 roundIncrement = LIT64( 0x0000008000000000 );
734 roundMask = LIT64( 0x000000FFFFFFFFFF );
735 }
736 else {
737 goto precision80;
738 }
739 zSig0 |= ( zSig1 != 0 );
740 switch (roundingMode) {
741 case float_round_nearest_even:
742 case float_round_ties_away:
743 break;
744 case float_round_to_zero:
745 roundIncrement = 0;
746 break;
747 case float_round_up:
748 roundIncrement = zSign ? 0 : roundMask;
749 break;
750 case float_round_down:
751 roundIncrement = zSign ? roundMask : 0;
752 break;
753 default:
754 abort();
755 }
756 roundBits = zSig0 & roundMask;
757 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
758 if ( ( 0x7FFE < zExp )
759 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
760 ) {
761 goto overflow;
762 }
763 if ( zExp <= 0 ) {
764 if (STATUS(flush_to_zero)) {
765 float_raise(float_flag_output_denormal STATUS_VAR);
766 return packFloatx80(zSign, 0, 0);
767 }
768 isTiny =
769 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
770 || ( zExp < 0 )
771 || ( zSig0 <= zSig0 + roundIncrement );
772 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
773 zExp = 0;
774 roundBits = zSig0 & roundMask;
775 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
776 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
777 zSig0 += roundIncrement;
778 if ( (int64_t) zSig0 < 0 ) zExp = 1;
779 roundIncrement = roundMask + 1;
780 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
781 roundMask |= roundIncrement;
782 }
783 zSig0 &= ~ roundMask;
784 return packFloatx80( zSign, zExp, zSig0 );
785 }
786 }
787 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
788 zSig0 += roundIncrement;
789 if ( zSig0 < roundIncrement ) {
790 ++zExp;
791 zSig0 = LIT64( 0x8000000000000000 );
792 }
793 roundIncrement = roundMask + 1;
794 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
795 roundMask |= roundIncrement;
796 }
797 zSig0 &= ~ roundMask;
798 if ( zSig0 == 0 ) zExp = 0;
799 return packFloatx80( zSign, zExp, zSig0 );
800 precision80:
801 switch (roundingMode) {
802 case float_round_nearest_even:
803 case float_round_ties_away:
804 increment = ((int64_t)zSig1 < 0);
805 break;
806 case float_round_to_zero:
807 increment = 0;
808 break;
809 case float_round_up:
810 increment = !zSign && zSig1;
811 break;
812 case float_round_down:
813 increment = zSign && zSig1;
814 break;
815 default:
816 abort();
817 }
818 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
819 if ( ( 0x7FFE < zExp )
820 || ( ( zExp == 0x7FFE )
821 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
822 && increment
823 )
824 ) {
825 roundMask = 0;
826 overflow:
827 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
828 if ( ( roundingMode == float_round_to_zero )
829 || ( zSign && ( roundingMode == float_round_up ) )
830 || ( ! zSign && ( roundingMode == float_round_down ) )
831 ) {
832 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
833 }
834 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
835 }
836 if ( zExp <= 0 ) {
837 isTiny =
838 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
839 || ( zExp < 0 )
840 || ! increment
841 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
842 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
843 zExp = 0;
844 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
845 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
846 switch (roundingMode) {
847 case float_round_nearest_even:
848 case float_round_ties_away:
849 increment = ((int64_t)zSig1 < 0);
850 break;
851 case float_round_to_zero:
852 increment = 0;
853 break;
854 case float_round_up:
855 increment = !zSign && zSig1;
856 break;
857 case float_round_down:
858 increment = zSign && zSig1;
859 break;
860 default:
861 abort();
862 }
863 if ( increment ) {
864 ++zSig0;
865 zSig0 &=
866 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
867 if ( (int64_t) zSig0 < 0 ) zExp = 1;
868 }
869 return packFloatx80( zSign, zExp, zSig0 );
870 }
871 }
872 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
873 if ( increment ) {
874 ++zSig0;
875 if ( zSig0 == 0 ) {
876 ++zExp;
877 zSig0 = LIT64( 0x8000000000000000 );
878 }
879 else {
880 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
881 }
882 }
883 else {
884 if ( zSig0 == 0 ) zExp = 0;
885 }
886 return packFloatx80( zSign, zExp, zSig0 );
887
888 }
889
890 /*----------------------------------------------------------------------------
891 | Takes an abstract floating-point value having sign `zSign', exponent
892 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
893 | and returns the proper extended double-precision floating-point value
894 | corresponding to the abstract input. This routine is just like
895 | `roundAndPackFloatx80' except that the input significand does not have to be
896 | normalized.
897 *----------------------------------------------------------------------------*/
898
899 static floatx80
900 normalizeRoundAndPackFloatx80(
901 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
902 STATUS_PARAM)
903 {
904 int8 shiftCount;
905
906 if ( zSig0 == 0 ) {
907 zSig0 = zSig1;
908 zSig1 = 0;
909 zExp -= 64;
910 }
911 shiftCount = countLeadingZeros64( zSig0 );
912 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
913 zExp -= shiftCount;
914 return
915 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
916
917 }
918
919 /*----------------------------------------------------------------------------
920 | Returns the least-significant 64 fraction bits of the quadruple-precision
921 | floating-point value `a'.
922 *----------------------------------------------------------------------------*/
923
924 static inline uint64_t extractFloat128Frac1( float128 a )
925 {
926
927 return a.low;
928
929 }
930
931 /*----------------------------------------------------------------------------
932 | Returns the most-significant 48 fraction bits of the quadruple-precision
933 | floating-point value `a'.
934 *----------------------------------------------------------------------------*/
935
936 static inline uint64_t extractFloat128Frac0( float128 a )
937 {
938
939 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
940
941 }
942
943 /*----------------------------------------------------------------------------
944 | Returns the exponent bits of the quadruple-precision floating-point value
945 | `a'.
946 *----------------------------------------------------------------------------*/
947
948 static inline int32 extractFloat128Exp( float128 a )
949 {
950
951 return ( a.high>>48 ) & 0x7FFF;
952
953 }
954
955 /*----------------------------------------------------------------------------
956 | Returns the sign bit of the quadruple-precision floating-point value `a'.
957 *----------------------------------------------------------------------------*/
958
959 static inline flag extractFloat128Sign( float128 a )
960 {
961
962 return a.high>>63;
963
964 }
965
966 /*----------------------------------------------------------------------------
967 | Normalizes the subnormal quadruple-precision floating-point value
968 | represented by the denormalized significand formed by the concatenation of
969 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
970 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
971 | significand are stored at the location pointed to by `zSig0Ptr', and the
972 | least significant 64 bits of the normalized significand are stored at the
973 | location pointed to by `zSig1Ptr'.
974 *----------------------------------------------------------------------------*/
975
976 static void
977 normalizeFloat128Subnormal(
978 uint64_t aSig0,
979 uint64_t aSig1,
980 int32 *zExpPtr,
981 uint64_t *zSig0Ptr,
982 uint64_t *zSig1Ptr
983 )
984 {
985 int8 shiftCount;
986
987 if ( aSig0 == 0 ) {
988 shiftCount = countLeadingZeros64( aSig1 ) - 15;
989 if ( shiftCount < 0 ) {
990 *zSig0Ptr = aSig1>>( - shiftCount );
991 *zSig1Ptr = aSig1<<( shiftCount & 63 );
992 }
993 else {
994 *zSig0Ptr = aSig1<<shiftCount;
995 *zSig1Ptr = 0;
996 }
997 *zExpPtr = - shiftCount - 63;
998 }
999 else {
1000 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1001 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1002 *zExpPtr = 1 - shiftCount;
1003 }
1004
1005 }
1006
1007 /*----------------------------------------------------------------------------
1008 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1009 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1010 | floating-point value, returning the result. After being shifted into the
1011 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1012 | added together to form the most significant 32 bits of the result. This
1013 | means that any integer portion of `zSig0' will be added into the exponent.
1014 | Since a properly normalized significand will have an integer portion equal
1015 | to 1, the `zExp' input should be 1 less than the desired result exponent
1016 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1017 | significand.
1018 *----------------------------------------------------------------------------*/
1019
1020 static inline float128
1021 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1022 {
1023 float128 z;
1024
1025 z.low = zSig1;
1026 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1027 return z;
1028
1029 }
1030
1031 /*----------------------------------------------------------------------------
1032 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1033 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1034 | and `zSig2', and returns the proper quadruple-precision floating-point value
1035 | corresponding to the abstract input. Ordinarily, the abstract value is
1036 | simply rounded and packed into the quadruple-precision format, with the
1037 | inexact exception raised if the abstract input cannot be represented
1038 | exactly. However, if the abstract value is too large, the overflow and
1039 | inexact exceptions are raised and an infinity or maximal finite value is
1040 | returned. If the abstract value is too small, the input value is rounded to
1041 | a subnormal number, and the underflow and inexact exceptions are raised if
1042 | the abstract input cannot be represented exactly as a subnormal quadruple-
1043 | precision floating-point number.
1044 | The input significand must be normalized or smaller. If the input
1045 | significand is not normalized, `zExp' must be 0; in that case, the result
1046 | returned is a subnormal number, and it must not require rounding. In the
1047 | usual case that the input significand is normalized, `zExp' must be 1 less
1048 | than the ``true'' floating-point exponent. The handling of underflow and
1049 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1050 *----------------------------------------------------------------------------*/
1051
1052 static float128
1053 roundAndPackFloat128(
1054 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1055 {
1056 int8 roundingMode;
1057 flag roundNearestEven, increment, isTiny;
1058
1059 roundingMode = STATUS(float_rounding_mode);
1060 roundNearestEven = ( roundingMode == float_round_nearest_even );
1061 switch (roundingMode) {
1062 case float_round_nearest_even:
1063 case float_round_ties_away:
1064 increment = ((int64_t)zSig2 < 0);
1065 break;
1066 case float_round_to_zero:
1067 increment = 0;
1068 break;
1069 case float_round_up:
1070 increment = !zSign && zSig2;
1071 break;
1072 case float_round_down:
1073 increment = zSign && zSig2;
1074 break;
1075 default:
1076 abort();
1077 }
1078 if ( 0x7FFD <= (uint32_t) zExp ) {
1079 if ( ( 0x7FFD < zExp )
1080 || ( ( zExp == 0x7FFD )
1081 && eq128(
1082 LIT64( 0x0001FFFFFFFFFFFF ),
1083 LIT64( 0xFFFFFFFFFFFFFFFF ),
1084 zSig0,
1085 zSig1
1086 )
1087 && increment
1088 )
1089 ) {
1090 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1091 if ( ( roundingMode == float_round_to_zero )
1092 || ( zSign && ( roundingMode == float_round_up ) )
1093 || ( ! zSign && ( roundingMode == float_round_down ) )
1094 ) {
1095 return
1096 packFloat128(
1097 zSign,
1098 0x7FFE,
1099 LIT64( 0x0000FFFFFFFFFFFF ),
1100 LIT64( 0xFFFFFFFFFFFFFFFF )
1101 );
1102 }
1103 return packFloat128( zSign, 0x7FFF, 0, 0 );
1104 }
1105 if ( zExp < 0 ) {
1106 if (STATUS(flush_to_zero)) {
1107 float_raise(float_flag_output_denormal STATUS_VAR);
1108 return packFloat128(zSign, 0, 0, 0);
1109 }
1110 isTiny =
1111 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1112 || ( zExp < -1 )
1113 || ! increment
1114 || lt128(
1115 zSig0,
1116 zSig1,
1117 LIT64( 0x0001FFFFFFFFFFFF ),
1118 LIT64( 0xFFFFFFFFFFFFFFFF )
1119 );
1120 shift128ExtraRightJamming(
1121 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1122 zExp = 0;
1123 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1124 switch (roundingMode) {
1125 case float_round_nearest_even:
1126 case float_round_ties_away:
1127 increment = ((int64_t)zSig2 < 0);
1128 break;
1129 case float_round_to_zero:
1130 increment = 0;
1131 break;
1132 case float_round_up:
1133 increment = !zSign && zSig2;
1134 break;
1135 case float_round_down:
1136 increment = zSign && zSig2;
1137 break;
1138 default:
1139 abort();
1140 }
1141 }
1142 }
1143 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1144 if ( increment ) {
1145 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1146 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1147 }
1148 else {
1149 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1150 }
1151 return packFloat128( zSign, zExp, zSig0, zSig1 );
1152
1153 }
1154
1155 /*----------------------------------------------------------------------------
1156 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1157 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1158 | returns the proper quadruple-precision floating-point value corresponding
1159 | to the abstract input. This routine is just like `roundAndPackFloat128'
1160 | except that the input significand has fewer bits and does not have to be
1161 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1162 | point exponent.
1163 *----------------------------------------------------------------------------*/
1164
1165 static float128
1166 normalizeRoundAndPackFloat128(
1167 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1168 {
1169 int8 shiftCount;
1170 uint64_t zSig2;
1171
1172 if ( zSig0 == 0 ) {
1173 zSig0 = zSig1;
1174 zSig1 = 0;
1175 zExp -= 64;
1176 }
1177 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1178 if ( 0 <= shiftCount ) {
1179 zSig2 = 0;
1180 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1181 }
1182 else {
1183 shift128ExtraRightJamming(
1184 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1185 }
1186 zExp -= shiftCount;
1187 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1188
1189 }
1190
1191 /*----------------------------------------------------------------------------
1192 | Returns the result of converting the 32-bit two's complement integer `a'
1193 | to the single-precision floating-point format. The conversion is performed
1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195 *----------------------------------------------------------------------------*/
1196
1197 float32 int32_to_float32(int32_t a STATUS_PARAM)
1198 {
1199 flag zSign;
1200
1201 if ( a == 0 ) return float32_zero;
1202 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1203 zSign = ( a < 0 );
1204 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1205
1206 }
1207
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 32-bit two's complement integer `a'
1210 | to the double-precision floating-point format. The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1213
1214 float64 int32_to_float64(int32_t a STATUS_PARAM)
1215 {
1216 flag zSign;
1217 uint32 absA;
1218 int8 shiftCount;
1219 uint64_t zSig;
1220
1221 if ( a == 0 ) return float64_zero;
1222 zSign = ( a < 0 );
1223 absA = zSign ? - a : a;
1224 shiftCount = countLeadingZeros32( absA ) + 21;
1225 zSig = absA;
1226 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1227
1228 }
1229
1230 /*----------------------------------------------------------------------------
1231 | Returns the result of converting the 32-bit two's complement integer `a'
1232 | to the extended double-precision floating-point format. The conversion
1233 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1234 | Arithmetic.
1235 *----------------------------------------------------------------------------*/
1236
1237 floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1238 {
1239 flag zSign;
1240 uint32 absA;
1241 int8 shiftCount;
1242 uint64_t zSig;
1243
1244 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1245 zSign = ( a < 0 );
1246 absA = zSign ? - a : a;
1247 shiftCount = countLeadingZeros32( absA ) + 32;
1248 zSig = absA;
1249 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1250
1251 }
1252
1253 /*----------------------------------------------------------------------------
1254 | Returns the result of converting the 32-bit two's complement integer `a' to
1255 | the quadruple-precision floating-point format. The conversion is performed
1256 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 *----------------------------------------------------------------------------*/
1258
1259 float128 int32_to_float128(int32_t a STATUS_PARAM)
1260 {
1261 flag zSign;
1262 uint32 absA;
1263 int8 shiftCount;
1264 uint64_t zSig0;
1265
1266 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1267 zSign = ( a < 0 );
1268 absA = zSign ? - a : a;
1269 shiftCount = countLeadingZeros32( absA ) + 17;
1270 zSig0 = absA;
1271 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1272
1273 }
1274
1275 /*----------------------------------------------------------------------------
1276 | Returns the result of converting the 64-bit two's complement integer `a'
1277 | to the single-precision floating-point format. The conversion is performed
1278 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1279 *----------------------------------------------------------------------------*/
1280
1281 float32 int64_to_float32(int64_t a STATUS_PARAM)
1282 {
1283 flag zSign;
1284 uint64 absA;
1285 int8 shiftCount;
1286
1287 if ( a == 0 ) return float32_zero;
1288 zSign = ( a < 0 );
1289 absA = zSign ? - a : a;
1290 shiftCount = countLeadingZeros64( absA ) - 40;
1291 if ( 0 <= shiftCount ) {
1292 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1293 }
1294 else {
1295 shiftCount += 7;
1296 if ( shiftCount < 0 ) {
1297 shift64RightJamming( absA, - shiftCount, &absA );
1298 }
1299 else {
1300 absA <<= shiftCount;
1301 }
1302 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1303 }
1304
1305 }
1306
1307 float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1308 {
1309 int8 shiftCount;
1310
1311 if ( a == 0 ) return float32_zero;
1312 shiftCount = countLeadingZeros64( a ) - 40;
1313 if ( 0 <= shiftCount ) {
1314 return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1315 }
1316 else {
1317 shiftCount += 7;
1318 if ( shiftCount < 0 ) {
1319 shift64RightJamming( a, - shiftCount, &a );
1320 }
1321 else {
1322 a <<= shiftCount;
1323 }
1324 return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1325 }
1326 }
1327
1328 /*----------------------------------------------------------------------------
1329 | Returns the result of converting the 64-bit two's complement integer `a'
1330 | to the double-precision floating-point format. The conversion is performed
1331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1332 *----------------------------------------------------------------------------*/
1333
1334 float64 int64_to_float64(int64_t a STATUS_PARAM)
1335 {
1336 flag zSign;
1337
1338 if ( a == 0 ) return float64_zero;
1339 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1340 return packFloat64( 1, 0x43E, 0 );
1341 }
1342 zSign = ( a < 0 );
1343 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1344
1345 }
1346
1347 float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1348 {
1349 int exp = 0x43C;
1350
1351 if (a == 0) {
1352 return float64_zero;
1353 }
1354 if ((int64_t)a < 0) {
1355 shift64RightJamming(a, 1, &a);
1356 exp += 1;
1357 }
1358 return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1359 }
1360
1361 /*----------------------------------------------------------------------------
1362 | Returns the result of converting the 64-bit two's complement integer `a'
1363 | to the extended double-precision floating-point format. The conversion
1364 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1365 | Arithmetic.
1366 *----------------------------------------------------------------------------*/
1367
1368 floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1369 {
1370 flag zSign;
1371 uint64 absA;
1372 int8 shiftCount;
1373
1374 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1375 zSign = ( a < 0 );
1376 absA = zSign ? - a : a;
1377 shiftCount = countLeadingZeros64( absA );
1378 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1379
1380 }
1381
1382 /*----------------------------------------------------------------------------
1383 | Returns the result of converting the 64-bit two's complement integer `a' to
1384 | the quadruple-precision floating-point format. The conversion is performed
1385 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1386 *----------------------------------------------------------------------------*/
1387
1388 float128 int64_to_float128(int64_t a STATUS_PARAM)
1389 {
1390 flag zSign;
1391 uint64 absA;
1392 int8 shiftCount;
1393 int32 zExp;
1394 uint64_t zSig0, zSig1;
1395
1396 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1397 zSign = ( a < 0 );
1398 absA = zSign ? - a : a;
1399 shiftCount = countLeadingZeros64( absA ) + 49;
1400 zExp = 0x406E - shiftCount;
1401 if ( 64 <= shiftCount ) {
1402 zSig1 = 0;
1403 zSig0 = absA;
1404 shiftCount -= 64;
1405 }
1406 else {
1407 zSig1 = absA;
1408 zSig0 = 0;
1409 }
1410 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1411 return packFloat128( zSign, zExp, zSig0, zSig1 );
1412
1413 }
1414
1415 float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1416 {
1417 if (a == 0) {
1418 return float128_zero;
1419 }
1420 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1421 }
1422
1423 /*----------------------------------------------------------------------------
1424 | Returns the result of converting the single-precision floating-point value
1425 | `a' to the 32-bit two's complement integer format. The conversion is
1426 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1427 | Arithmetic---which means in particular that the conversion is rounded
1428 | according to the current rounding mode. If `a' is a NaN, the largest
1429 | positive integer is returned. Otherwise, if the conversion overflows, the
1430 | largest integer with the same sign as `a' is returned.
1431 *----------------------------------------------------------------------------*/
1432
1433 int32 float32_to_int32( float32 a STATUS_PARAM )
1434 {
1435 flag aSign;
1436 int_fast16_t aExp, shiftCount;
1437 uint32_t aSig;
1438 uint64_t aSig64;
1439
1440 a = float32_squash_input_denormal(a STATUS_VAR);
1441 aSig = extractFloat32Frac( a );
1442 aExp = extractFloat32Exp( a );
1443 aSign = extractFloat32Sign( a );
1444 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1445 if ( aExp ) aSig |= 0x00800000;
1446 shiftCount = 0xAF - aExp;
1447 aSig64 = aSig;
1448 aSig64 <<= 32;
1449 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1450 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1451
1452 }
1453
1454 /*----------------------------------------------------------------------------
1455 | Returns the result of converting the single-precision floating-point value
1456 | `a' to the 32-bit two's complement integer format. The conversion is
1457 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1458 | Arithmetic, except that the conversion is always rounded toward zero.
1459 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1460 | the conversion overflows, the largest integer with the same sign as `a' is
1461 | returned.
1462 *----------------------------------------------------------------------------*/
1463
1464 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1465 {
1466 flag aSign;
1467 int_fast16_t aExp, shiftCount;
1468 uint32_t aSig;
1469 int32_t z;
1470 a = float32_squash_input_denormal(a STATUS_VAR);
1471
1472 aSig = extractFloat32Frac( a );
1473 aExp = extractFloat32Exp( a );
1474 aSign = extractFloat32Sign( a );
1475 shiftCount = aExp - 0x9E;
1476 if ( 0 <= shiftCount ) {
1477 if ( float32_val(a) != 0xCF000000 ) {
1478 float_raise( float_flag_invalid STATUS_VAR);
1479 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1480 }
1481 return (int32_t) 0x80000000;
1482 }
1483 else if ( aExp <= 0x7E ) {
1484 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1485 return 0;
1486 }
1487 aSig = ( aSig | 0x00800000 )<<8;
1488 z = aSig>>( - shiftCount );
1489 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1490 STATUS(float_exception_flags) |= float_flag_inexact;
1491 }
1492 if ( aSign ) z = - z;
1493 return z;
1494
1495 }
1496
1497 /*----------------------------------------------------------------------------
1498 | Returns the result of converting the single-precision floating-point value
1499 | `a' to the 16-bit two's complement integer format. The conversion is
1500 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 | Arithmetic, except that the conversion is always rounded toward zero.
1502 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1503 | the conversion overflows, the largest integer with the same sign as `a' is
1504 | returned.
1505 *----------------------------------------------------------------------------*/
1506
1507 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1508 {
1509 flag aSign;
1510 int_fast16_t aExp, shiftCount;
1511 uint32_t aSig;
1512 int32 z;
1513
1514 aSig = extractFloat32Frac( a );
1515 aExp = extractFloat32Exp( a );
1516 aSign = extractFloat32Sign( a );
1517 shiftCount = aExp - 0x8E;
1518 if ( 0 <= shiftCount ) {
1519 if ( float32_val(a) != 0xC7000000 ) {
1520 float_raise( float_flag_invalid STATUS_VAR);
1521 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522 return 0x7FFF;
1523 }
1524 }
1525 return (int32_t) 0xffff8000;
1526 }
1527 else if ( aExp <= 0x7E ) {
1528 if ( aExp | aSig ) {
1529 STATUS(float_exception_flags) |= float_flag_inexact;
1530 }
1531 return 0;
1532 }
1533 shiftCount -= 0x10;
1534 aSig = ( aSig | 0x00800000 )<<8;
1535 z = aSig>>( - shiftCount );
1536 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1537 STATUS(float_exception_flags) |= float_flag_inexact;
1538 }
1539 if ( aSign ) {
1540 z = - z;
1541 }
1542 return z;
1543
1544 }
1545
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the 64-bit two's complement integer format. The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic---which means in particular that the conversion is rounded
1551 | according to the current rounding mode. If `a' is a NaN, the largest
1552 | positive integer is returned. Otherwise, if the conversion overflows, the
1553 | largest integer with the same sign as `a' is returned.
1554 *----------------------------------------------------------------------------*/
1555
1556 int64 float32_to_int64( float32 a STATUS_PARAM )
1557 {
1558 flag aSign;
1559 int_fast16_t aExp, shiftCount;
1560 uint32_t aSig;
1561 uint64_t aSig64, aSigExtra;
1562 a = float32_squash_input_denormal(a STATUS_VAR);
1563
1564 aSig = extractFloat32Frac( a );
1565 aExp = extractFloat32Exp( a );
1566 aSign = extractFloat32Sign( a );
1567 shiftCount = 0xBE - aExp;
1568 if ( shiftCount < 0 ) {
1569 float_raise( float_flag_invalid STATUS_VAR);
1570 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1571 return LIT64( 0x7FFFFFFFFFFFFFFF );
1572 }
1573 return (int64_t) LIT64( 0x8000000000000000 );
1574 }
1575 if ( aExp ) aSig |= 0x00800000;
1576 aSig64 = aSig;
1577 aSig64 <<= 40;
1578 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1579 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1580
1581 }
1582
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the 64-bit unsigned integer format. The conversion is
1586 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic---which means in particular that the conversion is rounded
1588 | according to the current rounding mode. If `a' is a NaN, the largest
1589 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1590 | largest unsigned integer is returned. If the 'a' is negative, the result
1591 | is rounded and zero is returned; values that do not round to zero will
1592 | raise the inexact exception flag.
1593 *----------------------------------------------------------------------------*/
1594
1595 uint64 float32_to_uint64(float32 a STATUS_PARAM)
1596 {
1597 flag aSign;
1598 int_fast16_t aExp, shiftCount;
1599 uint32_t aSig;
1600 uint64_t aSig64, aSigExtra;
1601 a = float32_squash_input_denormal(a STATUS_VAR);
1602
1603 aSig = extractFloat32Frac(a);
1604 aExp = extractFloat32Exp(a);
1605 aSign = extractFloat32Sign(a);
1606 if ((aSign) && (aExp > 126)) {
1607 float_raise(float_flag_invalid STATUS_VAR);
1608 if (float32_is_any_nan(a)) {
1609 return LIT64(0xFFFFFFFFFFFFFFFF);
1610 } else {
1611 return 0;
1612 }
1613 }
1614 shiftCount = 0xBE - aExp;
1615 if (aExp) {
1616 aSig |= 0x00800000;
1617 }
1618 if (shiftCount < 0) {
1619 float_raise(float_flag_invalid STATUS_VAR);
1620 return LIT64(0xFFFFFFFFFFFFFFFF);
1621 }
1622
1623 aSig64 = aSig;
1624 aSig64 <<= 40;
1625 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1626 return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1627 }
1628
1629 /*----------------------------------------------------------------------------
1630 | Returns the result of converting the single-precision floating-point value
1631 | `a' to the 64-bit unsigned integer format. The conversion is
1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1633 | Arithmetic, except that the conversion is always rounded toward zero. If
1634 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1635 | conversion overflows, the largest unsigned integer is returned. If the
1636 | 'a' is negative, the result is rounded and zero is returned; values that do
1637 | not round to zero will raise the inexact flag.
1638 *----------------------------------------------------------------------------*/
1639
1640 uint64 float32_to_uint64_round_to_zero(float32 a STATUS_PARAM)
1641 {
1642 signed char current_rounding_mode = STATUS(float_rounding_mode);
1643 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
1644 int64_t v = float32_to_uint64(a STATUS_VAR);
1645 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
1646 return v;
1647 }
1648
1649 /*----------------------------------------------------------------------------
1650 | Returns the result of converting the single-precision floating-point value
1651 | `a' to the 64-bit two's complement integer format. The conversion is
1652 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1653 | Arithmetic, except that the conversion is always rounded toward zero. If
1654 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1655 | conversion overflows, the largest integer with the same sign as `a' is
1656 | returned.
1657 *----------------------------------------------------------------------------*/
1658
1659 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1660 {
1661 flag aSign;
1662 int_fast16_t aExp, shiftCount;
1663 uint32_t aSig;
1664 uint64_t aSig64;
1665 int64 z;
1666 a = float32_squash_input_denormal(a STATUS_VAR);
1667
1668 aSig = extractFloat32Frac( a );
1669 aExp = extractFloat32Exp( a );
1670 aSign = extractFloat32Sign( a );
1671 shiftCount = aExp - 0xBE;
1672 if ( 0 <= shiftCount ) {
1673 if ( float32_val(a) != 0xDF000000 ) {
1674 float_raise( float_flag_invalid STATUS_VAR);
1675 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1676 return LIT64( 0x7FFFFFFFFFFFFFFF );
1677 }
1678 }
1679 return (int64_t) LIT64( 0x8000000000000000 );
1680 }
1681 else if ( aExp <= 0x7E ) {
1682 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1683 return 0;
1684 }
1685 aSig64 = aSig | 0x00800000;
1686 aSig64 <<= 40;
1687 z = aSig64>>( - shiftCount );
1688 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1689 STATUS(float_exception_flags) |= float_flag_inexact;
1690 }
1691 if ( aSign ) z = - z;
1692 return z;
1693
1694 }
1695
1696 /*----------------------------------------------------------------------------
1697 | Returns the result of converting the single-precision floating-point value
1698 | `a' to the double-precision floating-point format. The conversion is
1699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1700 | Arithmetic.
1701 *----------------------------------------------------------------------------*/
1702
1703 float64 float32_to_float64( float32 a STATUS_PARAM )
1704 {
1705 flag aSign;
1706 int_fast16_t aExp;
1707 uint32_t aSig;
1708 a = float32_squash_input_denormal(a STATUS_VAR);
1709
1710 aSig = extractFloat32Frac( a );
1711 aExp = extractFloat32Exp( a );
1712 aSign = extractFloat32Sign( a );
1713 if ( aExp == 0xFF ) {
1714 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1715 return packFloat64( aSign, 0x7FF, 0 );
1716 }
1717 if ( aExp == 0 ) {
1718 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1719 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1720 --aExp;
1721 }
1722 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1723
1724 }
1725
1726 /*----------------------------------------------------------------------------
1727 | Returns the result of converting the single-precision floating-point value
1728 | `a' to the extended double-precision floating-point format. The conversion
1729 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1730 | Arithmetic.
1731 *----------------------------------------------------------------------------*/
1732
1733 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1734 {
1735 flag aSign;
1736 int_fast16_t aExp;
1737 uint32_t aSig;
1738
1739 a = float32_squash_input_denormal(a STATUS_VAR);
1740 aSig = extractFloat32Frac( a );
1741 aExp = extractFloat32Exp( a );
1742 aSign = extractFloat32Sign( a );
1743 if ( aExp == 0xFF ) {
1744 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1745 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1746 }
1747 if ( aExp == 0 ) {
1748 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1749 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1750 }
1751 aSig |= 0x00800000;
1752 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1753
1754 }
1755
1756 /*----------------------------------------------------------------------------
1757 | Returns the result of converting the single-precision floating-point value
1758 | `a' to the double-precision floating-point format. The conversion is
1759 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1760 | Arithmetic.
1761 *----------------------------------------------------------------------------*/
1762
1763 float128 float32_to_float128( float32 a STATUS_PARAM )
1764 {
1765 flag aSign;
1766 int_fast16_t aExp;
1767 uint32_t aSig;
1768
1769 a = float32_squash_input_denormal(a STATUS_VAR);
1770 aSig = extractFloat32Frac( a );
1771 aExp = extractFloat32Exp( a );
1772 aSign = extractFloat32Sign( a );
1773 if ( aExp == 0xFF ) {
1774 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1775 return packFloat128( aSign, 0x7FFF, 0, 0 );
1776 }
1777 if ( aExp == 0 ) {
1778 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1779 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1780 --aExp;
1781 }
1782 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1783
1784 }
1785
1786 /*----------------------------------------------------------------------------
1787 | Rounds the single-precision floating-point value `a' to an integer, and
1788 | returns the result as a single-precision floating-point value. The
1789 | operation is performed according to the IEC/IEEE Standard for Binary
1790 | Floating-Point Arithmetic.
1791 *----------------------------------------------------------------------------*/
1792
1793 float32 float32_round_to_int( float32 a STATUS_PARAM)
1794 {
1795 flag aSign;
1796 int_fast16_t aExp;
1797 uint32_t lastBitMask, roundBitsMask;
1798 uint32_t z;
1799 a = float32_squash_input_denormal(a STATUS_VAR);
1800
1801 aExp = extractFloat32Exp( a );
1802 if ( 0x96 <= aExp ) {
1803 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1804 return propagateFloat32NaN( a, a STATUS_VAR );
1805 }
1806 return a;
1807 }
1808 if ( aExp <= 0x7E ) {
1809 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1810 STATUS(float_exception_flags) |= float_flag_inexact;
1811 aSign = extractFloat32Sign( a );
1812 switch ( STATUS(float_rounding_mode) ) {
1813 case float_round_nearest_even:
1814 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1815 return packFloat32( aSign, 0x7F, 0 );
1816 }
1817 break;
1818 case float_round_ties_away:
1819 if (aExp == 0x7E) {
1820 return packFloat32(aSign, 0x7F, 0);
1821 }
1822 break;
1823 case float_round_down:
1824 return make_float32(aSign ? 0xBF800000 : 0);
1825 case float_round_up:
1826 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1827 }
1828 return packFloat32( aSign, 0, 0 );
1829 }
1830 lastBitMask = 1;
1831 lastBitMask <<= 0x96 - aExp;
1832 roundBitsMask = lastBitMask - 1;
1833 z = float32_val(a);
1834 switch (STATUS(float_rounding_mode)) {
1835 case float_round_nearest_even:
1836 z += lastBitMask>>1;
1837 if ((z & roundBitsMask) == 0) {
1838 z &= ~lastBitMask;
1839 }
1840 break;
1841 case float_round_ties_away:
1842 z += lastBitMask >> 1;
1843 break;
1844 case float_round_to_zero:
1845 break;
1846 case float_round_up:
1847 if (!extractFloat32Sign(make_float32(z))) {
1848 z += roundBitsMask;
1849 }
1850 break;
1851 case float_round_down:
1852 if (extractFloat32Sign(make_float32(z))) {
1853 z += roundBitsMask;
1854 }
1855 break;
1856 default:
1857 abort();
1858 }
1859 z &= ~ roundBitsMask;
1860 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1861 return make_float32(z);
1862
1863 }
1864
1865 /*----------------------------------------------------------------------------
1866 | Returns the result of adding the absolute values of the single-precision
1867 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1868 | before being returned. `zSign' is ignored if the result is a NaN.
1869 | The addition is performed according to the IEC/IEEE Standard for Binary
1870 | Floating-Point Arithmetic.
1871 *----------------------------------------------------------------------------*/
1872
1873 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1874 {
1875 int_fast16_t aExp, bExp, zExp;
1876 uint32_t aSig, bSig, zSig;
1877 int_fast16_t expDiff;
1878
1879 aSig = extractFloat32Frac( a );
1880 aExp = extractFloat32Exp( a );
1881 bSig = extractFloat32Frac( b );
1882 bExp = extractFloat32Exp( b );
1883 expDiff = aExp - bExp;
1884 aSig <<= 6;
1885 bSig <<= 6;
1886 if ( 0 < expDiff ) {
1887 if ( aExp == 0xFF ) {
1888 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1889 return a;
1890 }
1891 if ( bExp == 0 ) {
1892 --expDiff;
1893 }
1894 else {
1895 bSig |= 0x20000000;
1896 }
1897 shift32RightJamming( bSig, expDiff, &bSig );
1898 zExp = aExp;
1899 }
1900 else if ( expDiff < 0 ) {
1901 if ( bExp == 0xFF ) {
1902 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1903 return packFloat32( zSign, 0xFF, 0 );
1904 }
1905 if ( aExp == 0 ) {
1906 ++expDiff;
1907 }
1908 else {
1909 aSig |= 0x20000000;
1910 }
1911 shift32RightJamming( aSig, - expDiff, &aSig );
1912 zExp = bExp;
1913 }
1914 else {
1915 if ( aExp == 0xFF ) {
1916 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1917 return a;
1918 }
1919 if ( aExp == 0 ) {
1920 if (STATUS(flush_to_zero)) {
1921 if (aSig | bSig) {
1922 float_raise(float_flag_output_denormal STATUS_VAR);
1923 }
1924 return packFloat32(zSign, 0, 0);
1925 }
1926 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1927 }
1928 zSig = 0x40000000 + aSig + bSig;
1929 zExp = aExp;
1930 goto roundAndPack;
1931 }
1932 aSig |= 0x20000000;
1933 zSig = ( aSig + bSig )<<1;
1934 --zExp;
1935 if ( (int32_t) zSig < 0 ) {
1936 zSig = aSig + bSig;
1937 ++zExp;
1938 }
1939 roundAndPack:
1940 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1941
1942 }
1943
1944 /*----------------------------------------------------------------------------
1945 | Returns the result of subtracting the absolute values of the single-
1946 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1947 | difference is negated before being returned. `zSign' is ignored if the
1948 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1949 | Standard for Binary Floating-Point Arithmetic.
1950 *----------------------------------------------------------------------------*/
1951
1952 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1953 {
1954 int_fast16_t aExp, bExp, zExp;
1955 uint32_t aSig, bSig, zSig;
1956 int_fast16_t expDiff;
1957
1958 aSig = extractFloat32Frac( a );
1959 aExp = extractFloat32Exp( a );
1960 bSig = extractFloat32Frac( b );
1961 bExp = extractFloat32Exp( b );
1962 expDiff = aExp - bExp;
1963 aSig <<= 7;
1964 bSig <<= 7;
1965 if ( 0 < expDiff ) goto aExpBigger;
1966 if ( expDiff < 0 ) goto bExpBigger;
1967 if ( aExp == 0xFF ) {
1968 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1969 float_raise( float_flag_invalid STATUS_VAR);
1970 return float32_default_nan;
1971 }
1972 if ( aExp == 0 ) {
1973 aExp = 1;
1974 bExp = 1;
1975 }
1976 if ( bSig < aSig ) goto aBigger;
1977 if ( aSig < bSig ) goto bBigger;
1978 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1979 bExpBigger:
1980 if ( bExp == 0xFF ) {
1981 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1982 return packFloat32( zSign ^ 1, 0xFF, 0 );
1983 }
1984 if ( aExp == 0 ) {
1985 ++expDiff;
1986 }
1987 else {
1988 aSig |= 0x40000000;
1989 }
1990 shift32RightJamming( aSig, - expDiff, &aSig );
1991 bSig |= 0x40000000;
1992 bBigger:
1993 zSig = bSig - aSig;
1994 zExp = bExp;
1995 zSign ^= 1;
1996 goto normalizeRoundAndPack;
1997 aExpBigger:
1998 if ( aExp == 0xFF ) {
1999 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2000 return a;
2001 }
2002 if ( bExp == 0 ) {
2003 --expDiff;
2004 }
2005 else {
2006 bSig |= 0x40000000;
2007 }
2008 shift32RightJamming( bSig, expDiff, &bSig );
2009 aSig |= 0x40000000;
2010 aBigger:
2011 zSig = aSig - bSig;
2012 zExp = aExp;
2013 normalizeRoundAndPack:
2014 --zExp;
2015 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2016
2017 }
2018
2019 /*----------------------------------------------------------------------------
2020 | Returns the result of adding the single-precision floating-point values `a'
2021 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2022 | Binary Floating-Point Arithmetic.
2023 *----------------------------------------------------------------------------*/
2024
2025 float32 float32_add( float32 a, float32 b STATUS_PARAM )
2026 {
2027 flag aSign, bSign;
2028 a = float32_squash_input_denormal(a STATUS_VAR);
2029 b = float32_squash_input_denormal(b STATUS_VAR);
2030
2031 aSign = extractFloat32Sign( a );
2032 bSign = extractFloat32Sign( b );
2033 if ( aSign == bSign ) {
2034 return addFloat32Sigs( a, b, aSign STATUS_VAR);
2035 }
2036 else {
2037 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2038 }
2039
2040 }
2041
2042 /*----------------------------------------------------------------------------
2043 | Returns the result of subtracting the single-precision floating-point values
2044 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2045 | for Binary Floating-Point Arithmetic.
2046 *----------------------------------------------------------------------------*/
2047
2048 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
2049 {
2050 flag aSign, bSign;
2051 a = float32_squash_input_denormal(a STATUS_VAR);
2052 b = float32_squash_input_denormal(b STATUS_VAR);
2053
2054 aSign = extractFloat32Sign( a );
2055 bSign = extractFloat32Sign( b );
2056 if ( aSign == bSign ) {
2057 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2058 }
2059 else {
2060 return addFloat32Sigs( a, b, aSign STATUS_VAR );
2061 }
2062
2063 }
2064
2065 /*----------------------------------------------------------------------------
2066 | Returns the result of multiplying the single-precision floating-point values
2067 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2068 | for Binary Floating-Point Arithmetic.
2069 *----------------------------------------------------------------------------*/
2070
2071 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2072 {
2073 flag aSign, bSign, zSign;
2074 int_fast16_t aExp, bExp, zExp;
2075 uint32_t aSig, bSig;
2076 uint64_t zSig64;
2077 uint32_t zSig;
2078
2079 a = float32_squash_input_denormal(a STATUS_VAR);
2080 b = float32_squash_input_denormal(b STATUS_VAR);
2081
2082 aSig = extractFloat32Frac( a );
2083 aExp = extractFloat32Exp( a );
2084 aSign = extractFloat32Sign( a );
2085 bSig = extractFloat32Frac( b );
2086 bExp = extractFloat32Exp( b );
2087 bSign = extractFloat32Sign( b );
2088 zSign = aSign ^ bSign;
2089 if ( aExp == 0xFF ) {
2090 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2091 return propagateFloat32NaN( a, b STATUS_VAR );
2092 }
2093 if ( ( bExp | bSig ) == 0 ) {
2094 float_raise( float_flag_invalid STATUS_VAR);
2095 return float32_default_nan;
2096 }
2097 return packFloat32( zSign, 0xFF, 0 );
2098 }
2099 if ( bExp == 0xFF ) {
2100 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2101 if ( ( aExp | aSig ) == 0 ) {
2102 float_raise( float_flag_invalid STATUS_VAR);
2103 return float32_default_nan;
2104 }
2105 return packFloat32( zSign, 0xFF, 0 );
2106 }
2107 if ( aExp == 0 ) {
2108 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2109 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2110 }
2111 if ( bExp == 0 ) {
2112 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2113 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2114 }
2115 zExp = aExp + bExp - 0x7F;
2116 aSig = ( aSig | 0x00800000 )<<7;
2117 bSig = ( bSig | 0x00800000 )<<8;
2118 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2119 zSig = zSig64;
2120 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2121 zSig <<= 1;
2122 --zExp;
2123 }
2124 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2125
2126 }
2127
2128 /*----------------------------------------------------------------------------
2129 | Returns the result of dividing the single-precision floating-point value `a'
2130 | by the corresponding value `b'. The operation is performed according to the
2131 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2132 *----------------------------------------------------------------------------*/
2133
2134 float32 float32_div( float32 a, float32 b STATUS_PARAM )
2135 {
2136 flag aSign, bSign, zSign;
2137 int_fast16_t aExp, bExp, zExp;
2138 uint32_t aSig, bSig, zSig;
2139 a = float32_squash_input_denormal(a STATUS_VAR);
2140 b = float32_squash_input_denormal(b STATUS_VAR);
2141
2142 aSig = extractFloat32Frac( a );
2143 aExp = extractFloat32Exp( a );
2144 aSign = extractFloat32Sign( a );
2145 bSig = extractFloat32Frac( b );
2146 bExp = extractFloat32Exp( b );
2147 bSign = extractFloat32Sign( b );
2148 zSign = aSign ^ bSign;
2149 if ( aExp == 0xFF ) {
2150 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2151 if ( bExp == 0xFF ) {
2152 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2153 float_raise( float_flag_invalid STATUS_VAR);
2154 return float32_default_nan;
2155 }
2156 return packFloat32( zSign, 0xFF, 0 );
2157 }
2158 if ( bExp == 0xFF ) {
2159 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2160 return packFloat32( zSign, 0, 0 );
2161 }
2162 if ( bExp == 0 ) {
2163 if ( bSig == 0 ) {
2164 if ( ( aExp | aSig ) == 0 ) {
2165 float_raise( float_flag_invalid STATUS_VAR);
2166 return float32_default_nan;
2167 }
2168 float_raise( float_flag_divbyzero STATUS_VAR);
2169 return packFloat32( zSign, 0xFF, 0 );
2170 }
2171 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2172 }
2173 if ( aExp == 0 ) {
2174 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2175 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2176 }
2177 zExp = aExp - bExp + 0x7D;
2178 aSig = ( aSig | 0x00800000 )<<7;
2179 bSig = ( bSig | 0x00800000 )<<8;
2180 if ( bSig <= ( aSig + aSig ) ) {
2181 aSig >>= 1;
2182 ++zExp;
2183 }
2184 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2185 if ( ( zSig & 0x3F ) == 0 ) {
2186 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2187 }
2188 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2189
2190 }
2191
2192 /*----------------------------------------------------------------------------
2193 | Returns the remainder of the single-precision floating-point value `a'
2194 | with respect to the corresponding value `b'. The operation is performed
2195 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2196 *----------------------------------------------------------------------------*/
2197
2198 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2199 {
2200 flag aSign, zSign;
2201 int_fast16_t aExp, bExp, expDiff;
2202 uint32_t aSig, bSig;
2203 uint32_t q;
2204 uint64_t aSig64, bSig64, q64;
2205 uint32_t alternateASig;
2206 int32_t sigMean;
2207 a = float32_squash_input_denormal(a STATUS_VAR);
2208 b = float32_squash_input_denormal(b STATUS_VAR);
2209
2210 aSig = extractFloat32Frac( a );
2211 aExp = extractFloat32Exp( a );
2212 aSign = extractFloat32Sign( a );
2213 bSig = extractFloat32Frac( b );
2214 bExp = extractFloat32Exp( b );
2215 if ( aExp == 0xFF ) {
2216 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2217 return propagateFloat32NaN( a, b STATUS_VAR );
2218 }
2219 float_raise( float_flag_invalid STATUS_VAR);
2220 return float32_default_nan;
2221 }
2222 if ( bExp == 0xFF ) {
2223 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2224 return a;
2225 }
2226 if ( bExp == 0 ) {
2227 if ( bSig == 0 ) {
2228 float_raise( float_flag_invalid STATUS_VAR);
2229 return float32_default_nan;
2230 }
2231 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2232 }
2233 if ( aExp == 0 ) {
2234 if ( aSig == 0 ) return a;
2235 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2236 }
2237 expDiff = aExp - bExp;
2238 aSig |= 0x00800000;
2239 bSig |= 0x00800000;
2240 if ( expDiff < 32 ) {
2241 aSig <<= 8;
2242 bSig <<= 8;
2243 if ( expDiff < 0 ) {
2244 if ( expDiff < -1 ) return a;
2245 aSig >>= 1;
2246 }
2247 q = ( bSig <= aSig );
2248 if ( q ) aSig -= bSig;
2249 if ( 0 < expDiff ) {
2250 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2251 q >>= 32 - expDiff;
2252 bSig >>= 2;
2253 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2254 }
2255 else {
2256 aSig >>= 2;
2257 bSig >>= 2;
2258 }
2259 }
2260 else {
2261 if ( bSig <= aSig ) aSig -= bSig;
2262 aSig64 = ( (uint64_t) aSig )<<40;
2263 bSig64 = ( (uint64_t) bSig )<<40;
2264 expDiff -= 64;
2265 while ( 0 < expDiff ) {
2266 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2267 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2268 aSig64 = - ( ( bSig * q64 )<<38 );
2269 expDiff -= 62;
2270 }
2271 expDiff += 64;
2272 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2273 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2274 q = q64>>( 64 - expDiff );
2275 bSig <<= 6;
2276 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2277 }
2278 do {
2279 alternateASig = aSig;
2280 ++q;
2281 aSig -= bSig;
2282 } while ( 0 <= (int32_t) aSig );
2283 sigMean = aSig + alternateASig;
2284 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2285 aSig = alternateASig;
2286 }
2287 zSign = ( (int32_t) aSig < 0 );
2288 if ( zSign ) aSig = - aSig;
2289 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2290
2291 }
2292
2293 /*----------------------------------------------------------------------------
2294 | Returns the result of multiplying the single-precision floating-point values
2295 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2296 | multiplication. The operation is performed according to the IEC/IEEE
2297 | Standard for Binary Floating-Point Arithmetic 754-2008.
2298 | The flags argument allows the caller to select negation of the
2299 | addend, the intermediate product, or the final result. (The difference
2300 | between this and having the caller do a separate negation is that negating
2301 | externally will flip the sign bit on NaNs.)
2302 *----------------------------------------------------------------------------*/
2303
2304 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2305 {
2306 flag aSign, bSign, cSign, zSign;
2307 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2308 uint32_t aSig, bSig, cSig;
2309 flag pInf, pZero, pSign;
2310 uint64_t pSig64, cSig64, zSig64;
2311 uint32_t pSig;
2312 int shiftcount;
2313 flag signflip, infzero;
2314
2315 a = float32_squash_input_denormal(a STATUS_VAR);
2316 b = float32_squash_input_denormal(b STATUS_VAR);
2317 c = float32_squash_input_denormal(c STATUS_VAR);
2318 aSig = extractFloat32Frac(a);
2319 aExp = extractFloat32Exp(a);
2320 aSign = extractFloat32Sign(a);
2321 bSig = extractFloat32Frac(b);
2322 bExp = extractFloat32Exp(b);
2323 bSign = extractFloat32Sign(b);
2324 cSig = extractFloat32Frac(c);
2325 cExp = extractFloat32Exp(c);
2326 cSign = extractFloat32Sign(c);
2327
2328 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2329 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2330
2331 /* It is implementation-defined whether the cases of (0,inf,qnan)
2332 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2333 * they return if they do), so we have to hand this information
2334 * off to the target-specific pick-a-NaN routine.
2335 */
2336 if (((aExp == 0xff) && aSig) ||
2337 ((bExp == 0xff) && bSig) ||
2338 ((cExp == 0xff) && cSig)) {
2339 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2340 }
2341
2342 if (infzero) {
2343 float_raise(float_flag_invalid STATUS_VAR);
2344 return float32_default_nan;
2345 }
2346
2347 if (flags & float_muladd_negate_c) {
2348 cSign ^= 1;
2349 }
2350
2351 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2352
2353 /* Work out the sign and type of the product */
2354 pSign = aSign ^ bSign;
2355 if (flags & float_muladd_negate_product) {
2356 pSign ^= 1;
2357 }
2358 pInf = (aExp == 0xff) || (bExp == 0xff);
2359 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2360
2361 if (cExp == 0xff) {
2362 if (pInf && (pSign ^ cSign)) {
2363 /* addition of opposite-signed infinities => InvalidOperation */
2364 float_raise(float_flag_invalid STATUS_VAR);
2365 return float32_default_nan;
2366 }
2367 /* Otherwise generate an infinity of the same sign */
2368 return packFloat32(cSign ^ signflip, 0xff, 0);
2369 }
2370
2371 if (pInf) {
2372 return packFloat32(pSign ^ signflip, 0xff, 0);
2373 }
2374
2375 if (pZero) {
2376 if (cExp == 0) {
2377 if (cSig == 0) {
2378 /* Adding two exact zeroes */
2379 if (pSign == cSign) {
2380 zSign = pSign;
2381 } else if (STATUS(float_rounding_mode) == float_round_down) {
2382 zSign = 1;
2383 } else {
2384 zSign = 0;
2385 }
2386 return packFloat32(zSign ^ signflip, 0, 0);
2387 }
2388 /* Exact zero plus a denorm */
2389 if (STATUS(flush_to_zero)) {
2390 float_raise(float_flag_output_denormal STATUS_VAR);
2391 return packFloat32(cSign ^ signflip, 0, 0);
2392 }
2393 }
2394 /* Zero plus something non-zero : just return the something */
2395 if (flags & float_muladd_halve_result) {
2396 if (cExp == 0) {
2397 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2398 }
2399 /* Subtract one to halve, and one again because roundAndPackFloat32
2400 * wants one less than the true exponent.
2401 */
2402 cExp -= 2;
2403 cSig = (cSig | 0x00800000) << 7;
2404 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
2405 }
2406 return packFloat32(cSign ^ signflip, cExp, cSig);
2407 }
2408
2409 if (aExp == 0) {
2410 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2411 }
2412 if (bExp == 0) {
2413 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2414 }
2415
2416 /* Calculate the actual result a * b + c */
2417
2418 /* Multiply first; this is easy. */
2419 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2420 * because we want the true exponent, not the "one-less-than"
2421 * flavour that roundAndPackFloat32() takes.
2422 */
2423 pExp = aExp + bExp - 0x7e;
2424 aSig = (aSig | 0x00800000) << 7;
2425 bSig = (bSig | 0x00800000) << 8;
2426 pSig64 = (uint64_t)aSig * bSig;
2427 if ((int64_t)(pSig64 << 1) >= 0) {
2428 pSig64 <<= 1;
2429 pExp--;
2430 }
2431
2432 zSign = pSign ^ signflip;
2433
2434 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2435 * position 62.
2436 */
2437 if (cExp == 0) {
2438 if (!cSig) {
2439 /* Throw out the special case of c being an exact zero now */
2440 shift64RightJamming(pSig64, 32, &pSig64);
2441 pSig = pSig64;
2442 if (flags & float_muladd_halve_result) {
2443 pExp--;
2444 }
2445 return roundAndPackFloat32(zSign, pExp - 1,
2446 pSig STATUS_VAR);
2447 }
2448 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2449 }
2450
2451 cSig64 = (uint64_t)cSig << (62 - 23);
2452 cSig64 |= LIT64(0x4000000000000000);
2453 expDiff = pExp - cExp;
2454
2455 if (pSign == cSign) {
2456 /* Addition */
2457 if (expDiff > 0) {
2458 /* scale c to match p */
2459 shift64RightJamming(cSig64, expDiff, &cSig64);
2460 zExp = pExp;
2461 } else if (expDiff < 0) {
2462 /* scale p to match c */
2463 shift64RightJamming(pSig64, -expDiff, &pSig64);
2464 zExp = cExp;
2465 } else {
2466 /* no scaling needed */
2467 zExp = cExp;
2468 }
2469 /* Add significands and make sure explicit bit ends up in posn 62 */
2470 zSig64 = pSig64 + cSig64;
2471 if ((int64_t)zSig64 < 0) {
2472 shift64RightJamming(zSig64, 1, &zSig64);
2473 } else {
2474 zExp--;
2475 }
2476 } else {
2477 /* Subtraction */
2478 if (expDiff > 0) {
2479 shift64RightJamming(cSig64, expDiff, &cSig64);
2480 zSig64 = pSig64 - cSig64;
2481 zExp = pExp;
2482 } else if (expDiff < 0) {
2483 shift64RightJamming(pSig64, -expDiff, &pSig64);
2484 zSig64 = cSig64 - pSig64;
2485 zExp = cExp;
2486 zSign ^= 1;
2487 } else {
2488 zExp = pExp;
2489 if (cSig64 < pSig64) {
2490 zSig64 = pSig64 - cSig64;
2491 } else if (pSig64 < cSig64) {
2492 zSig64 = cSig64 - pSig64;
2493 zSign ^= 1;
2494 } else {
2495 /* Exact zero */
2496 zSign = signflip;
2497 if (STATUS(float_rounding_mode) == float_round_down) {
2498 zSign ^= 1;
2499 }
2500 return packFloat32(zSign, 0, 0);
2501 }
2502 }
2503 --zExp;
2504 /* Normalize to put the explicit bit back into bit 62. */
2505 shiftcount = countLeadingZeros64(zSig64) - 1;
2506 zSig64 <<= shiftcount;
2507 zExp -= shiftcount;
2508 }
2509 if (flags & float_muladd_halve_result) {
2510 zExp--;
2511 }
2512
2513 shift64RightJamming(zSig64, 32, &zSig64);
2514 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2515 }
2516
2517
2518 /*----------------------------------------------------------------------------
2519 | Returns the square root of the single-precision floating-point value `a'.
2520 | The operation is performed according to the IEC/IEEE Standard for Binary
2521 | Floating-Point Arithmetic.
2522 *----------------------------------------------------------------------------*/
2523
2524 float32 float32_sqrt( float32 a STATUS_PARAM )
2525 {
2526 flag aSign;
2527 int_fast16_t aExp, zExp;
2528 uint32_t aSig, zSig;
2529 uint64_t rem, term;
2530 a = float32_squash_input_denormal(a STATUS_VAR);
2531
2532 aSig = extractFloat32Frac( a );
2533 aExp = extractFloat32Exp( a );
2534 aSign = extractFloat32Sign( a );
2535 if ( aExp == 0xFF ) {
2536 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2537 if ( ! aSign ) return a;
2538 float_raise( float_flag_invalid STATUS_VAR);
2539 return float32_default_nan;
2540 }
2541 if ( aSign ) {
2542 if ( ( aExp | aSig ) == 0 ) return a;
2543 float_raise( float_flag_invalid STATUS_VAR);
2544 return float32_default_nan;
2545 }
2546 if ( aExp == 0 ) {
2547 if ( aSig == 0 ) return float32_zero;
2548 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2549 }
2550 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2551 aSig = ( aSig | 0x00800000 )<<8;
2552 zSig = estimateSqrt32( aExp, aSig ) + 2;
2553 if ( ( zSig & 0x7F ) <= 5 ) {
2554 if ( zSig < 2 ) {
2555 zSig = 0x7FFFFFFF;
2556 goto roundAndPack;
2557 }
2558 aSig >>= aExp & 1;
2559 term = ( (uint64_t) zSig ) * zSig;
2560 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2561 while ( (int64_t) rem < 0 ) {
2562 --zSig;
2563 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2564 }
2565 zSig |= ( rem != 0 );
2566 }
2567 shift32RightJamming( zSig, 1, &zSig );
2568 roundAndPack:
2569 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2570
2571 }
2572
2573 /*----------------------------------------------------------------------------
2574 | Returns the binary exponential of the single-precision floating-point value
2575 | `a'. The operation is performed according to the IEC/IEEE Standard for
2576 | Binary Floating-Point Arithmetic.
2577 |
2578 | Uses the following identities:
2579 |
2580 | 1. -------------------------------------------------------------------------
2581 | x x*ln(2)
2582 | 2 = e
2583 |
2584 | 2. -------------------------------------------------------------------------
2585 | 2 3 4 5 n
2586 | x x x x x x x
2587 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2588 | 1! 2! 3! 4! 5! n!
2589 *----------------------------------------------------------------------------*/
2590
2591 static const float64 float32_exp2_coefficients[15] =
2592 {
2593 const_float64( 0x3ff0000000000000ll ), /* 1 */
2594 const_float64( 0x3fe0000000000000ll ), /* 2 */
2595 const_float64( 0x3fc5555555555555ll ), /* 3 */
2596 const_float64( 0x3fa5555555555555ll ), /* 4 */
2597 const_float64( 0x3f81111111111111ll ), /* 5 */
2598 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2599 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2600 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2601 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2602 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2603 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2604 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2605 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2606 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2607 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2608 };
2609
2610 float32 float32_exp2( float32 a STATUS_PARAM )
2611 {
2612 flag aSign;
2613 int_fast16_t aExp;
2614 uint32_t aSig;
2615 float64 r, x, xn;
2616 int i;
2617 a = float32_squash_input_denormal(a STATUS_VAR);
2618
2619 aSig = extractFloat32Frac( a );
2620 aExp = extractFloat32Exp( a );
2621 aSign = extractFloat32Sign( a );
2622
2623 if ( aExp == 0xFF) {
2624 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2625 return (aSign) ? float32_zero : a;
2626 }
2627 if (aExp == 0) {
2628 if (aSig == 0) return float32_one;
2629 }
2630
2631 float_raise( float_flag_inexact STATUS_VAR);
2632
2633 /* ******************************* */
2634 /* using float64 for approximation */
2635 /* ******************************* */
2636 x = float32_to_float64(a STATUS_VAR);
2637 x = float64_mul(x, float64_ln2 STATUS_VAR);
2638
2639 xn = x;
2640 r = float64_one;
2641 for (i = 0 ; i < 15 ; i++) {
2642 float64 f;
2643
2644 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2645 r = float64_add(r, f STATUS_VAR);
2646
2647 xn = float64_mul(xn, x STATUS_VAR);
2648 }
2649
2650 return float64_to_float32(r, status);
2651 }
2652
2653 /*----------------------------------------------------------------------------
2654 | Returns the binary log of the single-precision floating-point value `a'.
2655 | The operation is performed according to the IEC/IEEE Standard for Binary
2656 | Floating-Point Arithmetic.
2657 *----------------------------------------------------------------------------*/
2658 float32 float32_log2( float32 a STATUS_PARAM )
2659 {
2660 flag aSign, zSign;
2661 int_fast16_t aExp;
2662 uint32_t aSig, zSig, i;
2663
2664 a = float32_squash_input_denormal(a STATUS_VAR);
2665 aSig = extractFloat32Frac( a );
2666 aExp = extractFloat32Exp( a );
2667 aSign = extractFloat32Sign( a );
2668
2669 if ( aExp == 0 ) {
2670 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2671 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2672 }
2673 if ( aSign ) {
2674 float_raise( float_flag_invalid STATUS_VAR);
2675 return float32_default_nan;
2676 }
2677 if ( aExp == 0xFF ) {
2678 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2679 return a;
2680 }
2681
2682 aExp -= 0x7F;
2683 aSig |= 0x00800000;
2684 zSign = aExp < 0;
2685 zSig = aExp << 23;
2686
2687 for (i = 1 << 22; i > 0; i >>= 1) {
2688 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2689 if ( aSig & 0x01000000 ) {
2690 aSig >>= 1;
2691 zSig |= i;
2692 }
2693 }
2694
2695 if ( zSign )
2696 zSig = -zSig;
2697
2698 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2699 }
2700
2701 /*----------------------------------------------------------------------------
2702 | Returns 1 if the single-precision floating-point value `a' is equal to
2703 | the corresponding value `b', and 0 otherwise. The invalid exception is
2704 | raised if either operand is a NaN. Otherwise, the comparison is performed
2705 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2706 *----------------------------------------------------------------------------*/
2707
2708 int float32_eq( float32 a, float32 b STATUS_PARAM )
2709 {
2710 uint32_t av, bv;
2711 a = float32_squash_input_denormal(a STATUS_VAR);
2712 b = float32_squash_input_denormal(b STATUS_VAR);
2713
2714 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2715 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2716 ) {
2717 float_raise( float_flag_invalid STATUS_VAR);
2718 return 0;
2719 }
2720 av = float32_val(a);
2721 bv = float32_val(b);
2722 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2723 }
2724
2725 /*----------------------------------------------------------------------------
2726 | Returns 1 if the single-precision floating-point value `a' is less than
2727 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2728 | exception is raised if either operand is a NaN. The comparison is performed
2729 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2730 *----------------------------------------------------------------------------*/
2731
2732 int float32_le( float32 a, float32 b STATUS_PARAM )
2733 {
2734 flag aSign, bSign;
2735 uint32_t av, bv;
2736 a = float32_squash_input_denormal(a STATUS_VAR);
2737 b = float32_squash_input_denormal(b STATUS_VAR);
2738
2739 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2740 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2741 ) {
2742 float_raise( float_flag_invalid STATUS_VAR);
2743 return 0;
2744 }
2745 aSign = extractFloat32Sign( a );
2746 bSign = extractFloat32Sign( b );
2747 av = float32_val(a);
2748 bv = float32_val(b);
2749 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2750 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2751
2752 }
2753
2754 /*----------------------------------------------------------------------------
2755 | Returns 1 if the single-precision floating-point value `a' is less than
2756 | the corresponding value `b', and 0 otherwise. The invalid exception is
2757 | raised if either operand is a NaN. The comparison is performed according
2758 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2759 *----------------------------------------------------------------------------*/
2760
2761 int float32_lt( float32 a, float32 b STATUS_PARAM )
2762 {
2763 flag aSign, bSign;
2764 uint32_t av, bv;
2765 a = float32_squash_input_denormal(a STATUS_VAR);
2766 b = float32_squash_input_denormal(b STATUS_VAR);
2767
2768 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2769 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2770 ) {
2771 float_raise( float_flag_invalid STATUS_VAR);
2772 return 0;
2773 }
2774 aSign = extractFloat32Sign( a );
2775 bSign = extractFloat32Sign( b );
2776 av = float32_val(a);
2777 bv = float32_val(b);
2778 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2779 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2780
2781 }
2782
2783 /*----------------------------------------------------------------------------
2784 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2785 | be compared, and 0 otherwise. The invalid exception is raised if either
2786 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2787 | Standard for Binary Floating-Point Arithmetic.
2788 *----------------------------------------------------------------------------*/
2789
2790 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2791 {
2792 a = float32_squash_input_denormal(a STATUS_VAR);
2793 b = float32_squash_input_denormal(b STATUS_VAR);
2794
2795 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2796 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2797 ) {
2798 float_raise( float_flag_invalid STATUS_VAR);
2799 return 1;
2800 }
2801 return 0;
2802 }
2803
2804 /*----------------------------------------------------------------------------
2805 | Returns 1 if the single-precision floating-point value `a' is equal to
2806 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2807 | exception. The comparison is performed according to the IEC/IEEE Standard
2808 | for Binary Floating-Point Arithmetic.
2809 *----------------------------------------------------------------------------*/
2810
2811 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2812 {
2813 a = float32_squash_input_denormal(a STATUS_VAR);
2814 b = float32_squash_input_denormal(b STATUS_VAR);
2815
2816 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2817 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2818 ) {
2819 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2820 float_raise( float_flag_invalid STATUS_VAR);
2821 }
2822 return 0;
2823 }
2824 return ( float32_val(a) == float32_val(b) ) ||
2825 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2826 }
2827
2828 /*----------------------------------------------------------------------------
2829 | Returns 1 if the single-precision floating-point value `a' is less than or
2830 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2831 | cause an exception. Otherwise, the comparison is performed according to the
2832 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2833 *----------------------------------------------------------------------------*/
2834
2835 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2836 {
2837 flag aSign, bSign;
2838 uint32_t av, bv;
2839 a = float32_squash_input_denormal(a STATUS_VAR);
2840 b = float32_squash_input_denormal(b STATUS_VAR);
2841
2842 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2843 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2844 ) {
2845 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2846 float_raise( float_flag_invalid STATUS_VAR);
2847 }
2848 return 0;
2849 }
2850 aSign = extractFloat32Sign( a );
2851 bSign = extractFloat32Sign( b );
2852 av = float32_val(a);
2853 bv = float32_val(b);
2854 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2855 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2856
2857 }
2858
2859 /*----------------------------------------------------------------------------
2860 | Returns 1 if the single-precision floating-point value `a' is less than
2861 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2862 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2863 | Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2865
2866 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2867 {
2868 flag aSign, bSign;
2869 uint32_t av, bv;
2870 a = float32_squash_input_denormal(a STATUS_VAR);
2871 b = float32_squash_input_denormal(b STATUS_VAR);
2872
2873 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2874 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2875 ) {
2876 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2877 float_raise( float_flag_invalid STATUS_VAR);
2878 }
2879 return 0;
2880 }
2881 aSign = extractFloat32Sign( a );
2882 bSign = extractFloat32Sign( b );
2883 av = float32_val(a);
2884 bv = float32_val(b);
2885 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2886 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2887
2888 }
2889
2890 /*----------------------------------------------------------------------------
2891 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2892 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2893 | comparison is performed according to the IEC/IEEE Standard for Binary
2894 | Floating-Point Arithmetic.
2895 *----------------------------------------------------------------------------*/
2896
2897 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2898 {
2899 a = float32_squash_input_denormal(a STATUS_VAR);
2900 b = float32_squash_input_denormal(b STATUS_VAR);
2901
2902 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2903 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2904 ) {
2905 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2906 float_raise( float_flag_invalid STATUS_VAR);
2907 }
2908 return 1;
2909 }
2910 return 0;
2911 }
2912
2913 /*----------------------------------------------------------------------------
2914 | Returns the result of converting the double-precision floating-point value
2915 | `a' to the 32-bit two's complement integer format. The conversion is
2916 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2917 | Arithmetic---which means in particular that the conversion is rounded
2918 | according to the current rounding mode. If `a' is a NaN, the largest
2919 | positive integer is returned. Otherwise, if the conversion overflows, the
2920 | largest integer with the same sign as `a' is returned.
2921 *----------------------------------------------------------------------------*/
2922
2923 int32 float64_to_int32( float64 a STATUS_PARAM )
2924 {
2925 flag aSign;
2926 int_fast16_t aExp, shiftCount;
2927 uint64_t aSig;
2928 a = float64_squash_input_denormal(a STATUS_VAR);
2929
2930 aSig = extractFloat64Frac( a );
2931 aExp = extractFloat64Exp( a );
2932 aSign = extractFloat64Sign( a );
2933 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2934 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2935 shiftCount = 0x42C - aExp;
2936 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2937 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2938
2939 }
2940
2941 /*----------------------------------------------------------------------------
2942 | Returns the result of converting the double-precision floating-point value
2943 | `a' to the 32-bit two's complement integer format. The conversion is
2944 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2945 | Arithmetic, except that the conversion is always rounded toward zero.
2946 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2947 | the conversion overflows, the largest integer with the same sign as `a' is
2948 | returned.
2949 *----------------------------------------------------------------------------*/
2950
2951 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2952 {
2953 flag aSign;
2954 int_fast16_t aExp, shiftCount;
2955 uint64_t aSig, savedASig;
2956 int32_t z;
2957 a = float64_squash_input_denormal(a STATUS_VAR);
2958
2959 aSig = extractFloat64Frac( a );
2960 aExp = extractFloat64Exp( a );
2961 aSign = extractFloat64Sign( a );
2962 if ( 0x41E < aExp ) {
2963 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2964 goto invalid;
2965 }
2966 else if ( aExp < 0x3FF ) {
2967 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2968 return 0;
2969 }
2970 aSig |= LIT64( 0x0010000000000000 );
2971 shiftCount = 0x433 - aExp;
2972 savedASig = aSig;
2973 aSig >>= shiftCount;
2974 z = aSig;
2975 if ( aSign ) z = - z;
2976 if ( ( z < 0 ) ^ aSign ) {
2977 invalid:
2978 float_raise( float_flag_invalid STATUS_VAR);
2979 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2980 }
2981 if ( ( aSig<<shiftCount ) != savedASig ) {
2982 STATUS(float_exception_flags) |= float_flag_inexact;
2983 }
2984 return z;
2985
2986 }
2987
2988 /*----------------------------------------------------------------------------
2989 | Returns the result of converting the double-precision floating-point value
2990 | `a' to the 16-bit two's complement integer format. The conversion is
2991 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2992 | Arithmetic, except that the conversion is always rounded toward zero.
2993 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2994 | the conversion overflows, the largest integer with the same sign as `a' is
2995 | returned.
2996 *----------------------------------------------------------------------------*/
2997
2998 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2999 {
3000 flag aSign;
3001 int_fast16_t aExp, shiftCount;
3002 uint64_t aSig, savedASig;
3003 int32 z;
3004
3005 aSig = extractFloat64Frac( a );
3006 aExp = extractFloat64Exp( a );
3007 aSign = extractFloat64Sign( a );
3008 if ( 0x40E < aExp ) {
3009 if ( ( aExp == 0x7FF ) && aSig ) {
3010 aSign = 0;
3011 }
3012 goto invalid;
3013 }
3014 else if ( aExp < 0x3FF ) {
3015 if ( aExp || aSig ) {
3016 STATUS(float_exception_flags) |= float_flag_inexact;
3017 }
3018 return 0;
3019 }
3020 aSig |= LIT64( 0x0010000000000000 );
3021 shiftCount = 0x433 - aExp;
3022 savedASig = aSig;
3023 aSig >>= shiftCount;
3024 z = aSig;
3025 if ( aSign ) {
3026 z = - z;
3027 }
3028 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3029 invalid:
3030 float_raise( float_flag_invalid STATUS_VAR);
3031 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3032 }
3033 if ( ( aSig<<shiftCount ) != savedASig ) {
3034 STATUS(float_exception_flags) |= float_flag_inexact;
3035 }
3036 return z;
3037 }
3038
3039 /*----------------------------------------------------------------------------
3040 | Returns the result of converting the double-precision floating-point value
3041 | `a' to the 64-bit two's complement integer format. The conversion is
3042 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3043 | Arithmetic---which means in particular that the conversion is rounded
3044 | according to the current rounding mode. If `a' is a NaN, the largest
3045 | positive integer is returned. Otherwise, if the conversion overflows, the
3046 | largest integer with the same sign as `a' is returned.
3047 *----------------------------------------------------------------------------*/
3048
3049 int64 float64_to_int64( float64 a STATUS_PARAM )
3050 {
3051 flag aSign;
3052 int_fast16_t aExp, shiftCount;
3053 uint64_t aSig, aSigExtra;
3054 a = float64_squash_input_denormal(a STATUS_VAR);
3055
3056 aSig = extractFloat64Frac( a );
3057 aExp = extractFloat64Exp( a );
3058 aSign = extractFloat64Sign( a );
3059 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3060 shiftCount = 0x433 - aExp;
3061 if ( shiftCount <= 0 ) {
3062 if ( 0x43E < aExp ) {
3063 float_raise( float_flag_invalid STATUS_VAR);
3064 if ( ! aSign
3065 || ( ( aExp == 0x7FF )
3066 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3067 ) {
3068 return LIT64( 0x7FFFFFFFFFFFFFFF );
3069 }
3070 return (int64_t) LIT64( 0x8000000000000000 );
3071 }
3072 aSigExtra = 0;
3073 aSig <<= - shiftCount;
3074 }
3075 else {
3076 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3077 }
3078 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3079
3080 }
3081
3082 /*----------------------------------------------------------------------------
3083 | Returns the result of converting the double-precision floating-point value
3084 | `a' to the 64-bit two's complement integer format. The conversion is
3085 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3086 | Arithmetic, except that the conversion is always rounded toward zero.
3087 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3088 | the conversion overflows, the largest integer with the same sign as `a' is
3089 | returned.
3090 *----------------------------------------------------------------------------*/
3091
3092 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3093 {
3094 flag aSign;
3095 int_fast16_t aExp, shiftCount;
3096 uint64_t aSig;
3097 int64 z;
3098 a = float64_squash_input_denormal(a STATUS_VAR);
3099
3100 aSig = extractFloat64Frac( a );
3101 aExp = extractFloat64Exp( a );
3102 aSign = extractFloat64Sign( a );
3103 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3104 shiftCount = aExp - 0x433;
3105 if ( 0 <= shiftCount ) {
3106 if ( 0x43E <= aExp ) {
3107 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3108 float_raise( float_flag_invalid STATUS_VAR);
3109 if ( ! aSign
3110 || ( ( aExp == 0x7FF )
3111 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3112 ) {
3113 return LIT64( 0x7FFFFFFFFFFFFFFF );
3114 }
3115 }
3116 return (int64_t) LIT64( 0x8000000000000000 );
3117 }
3118 z = aSig<<shiftCount;
3119 }
3120 else {
3121 if ( aExp < 0x3FE ) {
3122 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3123 return 0;
3124 }
3125 z = aSig>>( - shiftCount );
3126 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3127 STATUS(float_exception_flags) |= float_flag_inexact;
3128 }
3129 }
3130 if ( aSign ) z = - z;
3131 return z;
3132
3133 }
3134
3135 /*----------------------------------------------------------------------------
3136 | Returns the result of converting the double-precision floating-point value
3137 | `a' to the single-precision floating-point format. The conversion is
3138 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3139 | Arithmetic.
3140 *----------------------------------------------------------------------------*/
3141
3142 float32 float64_to_float32( float64 a STATUS_PARAM )
3143 {
3144 flag aSign;
3145 int_fast16_t aExp;
3146 uint64_t aSig;
3147 uint32_t zSig;
3148 a = float64_squash_input_denormal(a STATUS_VAR);
3149
3150 aSig = extractFloat64Frac( a );
3151 aExp = extractFloat64Exp( a );
3152 aSign = extractFloat64Sign( a );
3153 if ( aExp == 0x7FF ) {
3154 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3155 return packFloat32( aSign, 0xFF, 0 );
3156 }
3157 shift64RightJamming( aSig, 22, &aSig );
3158 zSig = aSig;
3159 if ( aExp || zSig ) {
3160 zSig |= 0x40000000;
3161 aExp -= 0x381;
3162 }
3163 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3164
3165 }
3166
3167
3168 /*----------------------------------------------------------------------------
3169 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3170 | half-precision floating-point value, returning the result. After being
3171 | shifted into the proper positions, the three fields are simply added
3172 | together to form the result. This means that any integer portion of `zSig'
3173 | will be added into the exponent. Since a properly normalized significand
3174 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3175 | than the desired result exponent whenever `zSig' is a complete, normalized
3176 | significand.
3177 *----------------------------------------------------------------------------*/
3178 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3179 {
3180 return make_float16(
3181 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3182 }
3183
3184 /*----------------------------------------------------------------------------
3185 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3186 | and significand `zSig', and returns the proper half-precision floating-
3187 | point value corresponding to the abstract input. Ordinarily, the abstract
3188 | value is simply rounded and packed into the half-precision format, with
3189 | the inexact exception raised if the abstract input cannot be represented
3190 | exactly. However, if the abstract value is too large, the overflow and
3191 | inexact exceptions are raised and an infinity or maximal finite value is
3192 | returned. If the abstract value is too small, the input value is rounded to
3193 | a subnormal number, and the underflow and inexact exceptions are raised if
3194 | the abstract input cannot be represented exactly as a subnormal half-
3195 | precision floating-point number.
3196 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3197 | ARM-style "alternative representation", which omits the NaN and Inf
3198 | encodings in order to raise the maximum representable exponent by one.
3199 | The input significand `zSig' has its binary point between bits 22
3200 | and 23, which is 13 bits to the left of the usual location. This shifted
3201 | significand must be normalized or smaller. If `zSig' is not normalized,
3202 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3203 | and it must not require rounding. In the usual case that `zSig' is
3204 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3205 | Note the slightly odd position of the binary point in zSig compared with the
3206 | other roundAndPackFloat functions. This should probably be fixed if we
3207 | need to implement more float16 routines than just conversion.
3208 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3209 | Binary Floating-Point Arithmetic.
3210 *----------------------------------------------------------------------------*/
3211
3212 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3213 uint32_t zSig, flag ieee STATUS_PARAM)
3214 {
3215 int maxexp = ieee ? 29 : 30;
3216 uint32_t mask;
3217 uint32_t increment;
3218 bool rounding_bumps_exp;
3219 bool is_tiny = false;
3220
3221 /* Calculate the mask of bits of the mantissa which are not
3222 * representable in half-precision and will be lost.
3223 */
3224 if (zExp < 1) {
3225 /* Will be denormal in halfprec */
3226 mask = 0x00ffffff;
3227 if (zExp >= -11) {
3228 mask >>= 11 + zExp;
3229 }
3230 } else {
3231 /* Normal number in halfprec */
3232 mask = 0x00001fff;
3233 }
3234
3235 switch (STATUS(float_rounding_mode)) {
3236 case float_round_nearest_even:
3237 increment = (mask + 1) >> 1;
3238 if ((zSig & mask) == increment) {
3239 increment = zSig & (increment << 1);
3240 }
3241 break;
3242 case float_round_ties_away:
3243 increment = (mask + 1) >> 1;
3244 break;
3245 case float_round_up:
3246 increment = zSign ? 0 : mask;
3247 break;
3248 case float_round_down:
3249 increment = zSign ? mask : 0;
3250 break;
3251 default: /* round_to_zero */
3252 increment = 0;
3253 break;
3254 }
3255
3256 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3257
3258 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3259 if (ieee) {
3260 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3261 return packFloat16(zSign, 0x1f, 0);
3262 } else {
3263 float_raise(float_flag_invalid STATUS_VAR);
3264 return packFloat16(zSign, 0x1f, 0x3ff);
3265 }
3266 }
3267
3268 if (zExp < 0) {
3269 /* Note that flush-to-zero does not affect half-precision results */
3270 is_tiny =
3271 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3272 || (zExp < -1)
3273 || (!rounding_bumps_exp);
3274 }
3275 if (zSig & mask) {
3276 float_raise(float_flag_inexact STATUS_VAR);
3277 if (is_tiny) {
3278 float_raise(float_flag_underflow STATUS_VAR);
3279 }
3280 }
3281
3282 zSig += increment;
3283 if (rounding_bumps_exp) {
3284 zSig >>= 1;
3285 zExp++;
3286 }
3287
3288 if (zExp < -10) {
3289 return packFloat16(zSign, 0, 0);
3290 }
3291 if (zExp < 0) {
3292 zSig >>= -zExp;
3293 zExp = 0;
3294 }
3295 return packFloat16(zSign, zExp, zSig >> 13);
3296 }
3297
3298 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
3299 uint32_t *zSigPtr)
3300 {
3301 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3302 *zSigPtr = aSig << shiftCount;
3303 *zExpPtr = 1 - shiftCount;
3304 }
3305
3306 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3307 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3308
3309 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3310 {
3311 flag aSign;
3312 int_fast16_t aExp;
3313 uint32_t aSig;
3314
3315 aSign = extractFloat16Sign(a);
3316 aExp = extractFloat16Exp(a);
3317 aSig = extractFloat16Frac(a);
3318
3319 if (aExp == 0x1f && ieee) {
3320 if (aSig) {
3321 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3322 }
3323 return packFloat32(aSign, 0xff, 0);
3324 }
3325 if (aExp == 0) {
3326 if (aSig == 0) {
3327 return packFloat32(aSign, 0, 0);
3328 }
3329
3330 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3331 aExp--;
3332 }
3333 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3334 }
3335
3336 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3337 {
3338 flag aSign;
3339 int_fast16_t aExp;
3340 uint32_t aSig;
3341
3342 a = float32_squash_input_denormal(a STATUS_VAR);
3343
3344 aSig = extractFloat32Frac( a );
3345 aExp = extractFloat32Exp( a );
3346 aSign = extractFloat32Sign( a );
3347 if ( aExp == 0xFF ) {
3348 if (aSig) {
3349 /* Input is a NaN */
3350 if (!ieee) {
3351 float_raise(float_flag_invalid STATUS_VAR);
3352 return packFloat16(aSign, 0, 0);
3353 }
3354 return commonNaNToFloat16(
3355 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3356 }
3357 /* Infinity */
3358 if (!ieee) {
3359 float_raise(float_flag_invalid STATUS_VAR);
3360 return packFloat16(aSign, 0x1f, 0x3ff);
3361 }
3362 return packFloat16(aSign, 0x1f, 0);
3363 }
3364 if (aExp == 0 && aSig == 0) {
3365 return packFloat16(aSign, 0, 0);
3366 }
3367 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3368 * even if the input is denormal; however this is harmless because
3369 * the largest possible single-precision denormal is still smaller
3370 * than the smallest representable half-precision denormal, and s