softfloat: export squash_input_denormal functions
[qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
6
7 /*============================================================================
8
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
11
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
21
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
35
36 =============================================================================*/
37
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
40 */
41 #include "config.h"
42
43 #include "fpu/softfloat.h"
44
45 /* We only need stdlib for abort() */
46 #include <stdlib.h>
47
48 /*----------------------------------------------------------------------------
49 | Primitive arithmetic functions, including multi-word arithmetic, and
50 | division and square root approximations. (Can be specialized to target if
51 | desired.)
52 *----------------------------------------------------------------------------*/
53 #include "softfloat-macros.h"
54
55 /*----------------------------------------------------------------------------
56 | Functions and definitions to determine: (1) whether tininess for underflow
57 | is detected before or after rounding by default, (2) what (if anything)
58 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
59 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60 | are propagated from function inputs to output. These details are target-
61 | specific.
62 *----------------------------------------------------------------------------*/
63 #include "softfloat-specialize.h"
64
65 /*----------------------------------------------------------------------------
66 | Returns the fraction bits of the half-precision floating-point value `a'.
67 *----------------------------------------------------------------------------*/
68
69 INLINE uint32_t extractFloat16Frac(float16 a)
70 {
71 return float16_val(a) & 0x3ff;
72 }
73
74 /*----------------------------------------------------------------------------
75 | Returns the exponent bits of the half-precision floating-point value `a'.
76 *----------------------------------------------------------------------------*/
77
78 INLINE int_fast16_t extractFloat16Exp(float16 a)
79 {
80 return (float16_val(a) >> 10) & 0x1f;
81 }
82
83 /*----------------------------------------------------------------------------
84 | Returns the sign bit of the single-precision floating-point value `a'.
85 *----------------------------------------------------------------------------*/
86
87 INLINE flag extractFloat16Sign(float16 a)
88 {
89 return float16_val(a)>>15;
90 }
91
92 /*----------------------------------------------------------------------------
93 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
94 | and 7, and returns the properly rounded 32-bit integer corresponding to the
95 | input. If `zSign' is 1, the input is negated before being converted to an
96 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
97 | is simply rounded to an integer, with the inexact exception raised if the
98 | input cannot be represented exactly as an integer. However, if the fixed-
99 | point input is too large, the invalid exception is raised and the largest
100 | positive or negative integer is returned.
101 *----------------------------------------------------------------------------*/
102
103 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
104 {
105 int8 roundingMode;
106 flag roundNearestEven;
107 int8 roundIncrement, roundBits;
108 int32_t z;
109
110 roundingMode = STATUS(float_rounding_mode);
111 roundNearestEven = ( roundingMode == float_round_nearest_even );
112 switch (roundingMode) {
113 case float_round_nearest_even:
114 case float_round_ties_away:
115 roundIncrement = 0x40;
116 break;
117 case float_round_to_zero:
118 roundIncrement = 0;
119 break;
120 case float_round_up:
121 roundIncrement = zSign ? 0 : 0x7f;
122 break;
123 case float_round_down:
124 roundIncrement = zSign ? 0x7f : 0;
125 break;
126 default:
127 abort();
128 }
129 roundBits = absZ & 0x7F;
130 absZ = ( absZ + roundIncrement )>>7;
131 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
132 z = absZ;
133 if ( zSign ) z = - z;
134 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
135 float_raise( float_flag_invalid STATUS_VAR);
136 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
137 }
138 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
139 return z;
140
141 }
142
143 /*----------------------------------------------------------------------------
144 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
145 | `absZ1', with binary point between bits 63 and 64 (between the input words),
146 | and returns the properly rounded 64-bit integer corresponding to the input.
147 | If `zSign' is 1, the input is negated before being converted to an integer.
148 | Ordinarily, the fixed-point input is simply rounded to an integer, with
149 | the inexact exception raised if the input cannot be represented exactly as
150 | an integer. However, if the fixed-point input is too large, the invalid
151 | exception is raised and the largest positive or negative integer is
152 | returned.
153 *----------------------------------------------------------------------------*/
154
155 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
156 {
157 int8 roundingMode;
158 flag roundNearestEven, increment;
159 int64_t z;
160
161 roundingMode = STATUS(float_rounding_mode);
162 roundNearestEven = ( roundingMode == float_round_nearest_even );
163 switch (roundingMode) {
164 case float_round_nearest_even:
165 case float_round_ties_away:
166 increment = ((int64_t) absZ1 < 0);
167 break;
168 case float_round_to_zero:
169 increment = 0;
170 break;
171 case float_round_up:
172 increment = !zSign && absZ1;
173 break;
174 case float_round_down:
175 increment = zSign && absZ1;
176 break;
177 default:
178 abort();
179 }
180 if ( increment ) {
181 ++absZ0;
182 if ( absZ0 == 0 ) goto overflow;
183 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
184 }
185 z = absZ0;
186 if ( zSign ) z = - z;
187 if ( z && ( ( z < 0 ) ^ zSign ) ) {
188 overflow:
189 float_raise( float_flag_invalid STATUS_VAR);
190 return
191 zSign ? (int64_t) LIT64( 0x8000000000000000 )
192 : LIT64( 0x7FFFFFFFFFFFFFFF );
193 }
194 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
195 return z;
196
197 }
198
199 /*----------------------------------------------------------------------------
200 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
201 | `absZ1', with binary point between bits 63 and 64 (between the input words),
202 | and returns the properly rounded 64-bit unsigned integer corresponding to the
203 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
204 | with the inexact exception raised if the input cannot be represented exactly
205 | as an integer. However, if the fixed-point input is too large, the invalid
206 | exception is raised and the largest unsigned integer is returned.
207 *----------------------------------------------------------------------------*/
208
209 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
210 uint64_t absZ1 STATUS_PARAM)
211 {
212 int8 roundingMode;
213 flag roundNearestEven, increment;
214
215 roundingMode = STATUS(float_rounding_mode);
216 roundNearestEven = (roundingMode == float_round_nearest_even);
217 switch (roundingMode) {
218 case float_round_nearest_even:
219 case float_round_ties_away:
220 increment = ((int64_t)absZ1 < 0);
221 break;
222 case float_round_to_zero:
223 increment = 0;
224 break;
225 case float_round_up:
226 increment = !zSign && absZ1;
227 break;
228 case float_round_down:
229 increment = zSign && absZ1;
230 break;
231 default:
232 abort();
233 }
234 if (increment) {
235 ++absZ0;
236 if (absZ0 == 0) {
237 float_raise(float_flag_invalid STATUS_VAR);
238 return LIT64(0xFFFFFFFFFFFFFFFF);
239 }
240 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
241 }
242
243 if (zSign && absZ0) {
244 float_raise(float_flag_invalid STATUS_VAR);
245 return 0;
246 }
247
248 if (absZ1) {
249 STATUS(float_exception_flags) |= float_flag_inexact;
250 }
251 return absZ0;
252 }
253
254 /*----------------------------------------------------------------------------
255 | Returns the fraction bits of the single-precision floating-point value `a'.
256 *----------------------------------------------------------------------------*/
257
258 INLINE uint32_t extractFloat32Frac( float32 a )
259 {
260
261 return float32_val(a) & 0x007FFFFF;
262
263 }
264
265 /*----------------------------------------------------------------------------
266 | Returns the exponent bits of the single-precision floating-point value `a'.
267 *----------------------------------------------------------------------------*/
268
269 INLINE int_fast16_t extractFloat32Exp(float32 a)
270 {
271
272 return ( float32_val(a)>>23 ) & 0xFF;
273
274 }
275
276 /*----------------------------------------------------------------------------
277 | Returns the sign bit of the single-precision floating-point value `a'.
278 *----------------------------------------------------------------------------*/
279
280 INLINE flag extractFloat32Sign( float32 a )
281 {
282
283 return float32_val(a)>>31;
284
285 }
286
287 /*----------------------------------------------------------------------------
288 | If `a' is denormal and we are in flush-to-zero mode then set the
289 | input-denormal exception and return zero. Otherwise just return the value.
290 *----------------------------------------------------------------------------*/
291 float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
292 {
293 if (STATUS(flush_inputs_to_zero)) {
294 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
295 float_raise(float_flag_input_denormal STATUS_VAR);
296 return make_float32(float32_val(a) & 0x80000000);
297 }
298 }
299 return a;
300 }
301
302 /*----------------------------------------------------------------------------
303 | Normalizes the subnormal single-precision floating-point value represented
304 | by the denormalized significand `aSig'. The normalized exponent and
305 | significand are stored at the locations pointed to by `zExpPtr' and
306 | `zSigPtr', respectively.
307 *----------------------------------------------------------------------------*/
308
309 static void
310 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
311 {
312 int8 shiftCount;
313
314 shiftCount = countLeadingZeros32( aSig ) - 8;
315 *zSigPtr = aSig<<shiftCount;
316 *zExpPtr = 1 - shiftCount;
317
318 }
319
320 /*----------------------------------------------------------------------------
321 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
322 | single-precision floating-point value, returning the result. After being
323 | shifted into the proper positions, the three fields are simply added
324 | together to form the result. This means that any integer portion of `zSig'
325 | will be added into the exponent. Since a properly normalized significand
326 | will have an integer portion equal to 1, the `zExp' input should be 1 less
327 | than the desired result exponent whenever `zSig' is a complete, normalized
328 | significand.
329 *----------------------------------------------------------------------------*/
330
331 INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
332 {
333
334 return make_float32(
335 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
336
337 }
338
339 /*----------------------------------------------------------------------------
340 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
341 | and significand `zSig', and returns the proper single-precision floating-
342 | point value corresponding to the abstract input. Ordinarily, the abstract
343 | value is simply rounded and packed into the single-precision format, with
344 | the inexact exception raised if the abstract input cannot be represented
345 | exactly. However, if the abstract value is too large, the overflow and
346 | inexact exceptions are raised and an infinity or maximal finite value is
347 | returned. If the abstract value is too small, the input value is rounded to
348 | a subnormal number, and the underflow and inexact exceptions are raised if
349 | the abstract input cannot be represented exactly as a subnormal single-
350 | precision floating-point number.
351 | The input significand `zSig' has its binary point between bits 30
352 | and 29, which is 7 bits to the left of the usual location. This shifted
353 | significand must be normalized or smaller. If `zSig' is not normalized,
354 | `zExp' must be 0; in that case, the result returned is a subnormal number,
355 | and it must not require rounding. In the usual case that `zSig' is
356 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
357 | The handling of underflow and overflow follows the IEC/IEEE Standard for
358 | Binary Floating-Point Arithmetic.
359 *----------------------------------------------------------------------------*/
360
361 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
362 {
363 int8 roundingMode;
364 flag roundNearestEven;
365 int8 roundIncrement, roundBits;
366 flag isTiny;
367
368 roundingMode = STATUS(float_rounding_mode);
369 roundNearestEven = ( roundingMode == float_round_nearest_even );
370 switch (roundingMode) {
371 case float_round_nearest_even:
372 case float_round_ties_away:
373 roundIncrement = 0x40;
374 break;
375 case float_round_to_zero:
376 roundIncrement = 0;
377 break;
378 case float_round_up:
379 roundIncrement = zSign ? 0 : 0x7f;
380 break;
381 case float_round_down:
382 roundIncrement = zSign ? 0x7f : 0;
383 break;
384 default:
385 abort();
386 break;
387 }
388 roundBits = zSig & 0x7F;
389 if ( 0xFD <= (uint16_t) zExp ) {
390 if ( ( 0xFD < zExp )
391 || ( ( zExp == 0xFD )
392 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
393 ) {
394 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
395 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
396 }
397 if ( zExp < 0 ) {
398 if (STATUS(flush_to_zero)) {
399 float_raise(float_flag_output_denormal STATUS_VAR);
400 return packFloat32(zSign, 0, 0);
401 }
402 isTiny =
403 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
404 || ( zExp < -1 )
405 || ( zSig + roundIncrement < 0x80000000 );
406 shift32RightJamming( zSig, - zExp, &zSig );
407 zExp = 0;
408 roundBits = zSig & 0x7F;
409 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
410 }
411 }
412 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
413 zSig = ( zSig + roundIncrement )>>7;
414 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
415 if ( zSig == 0 ) zExp = 0;
416 return packFloat32( zSign, zExp, zSig );
417
418 }
419
420 /*----------------------------------------------------------------------------
421 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
422 | and significand `zSig', and returns the proper single-precision floating-
423 | point value corresponding to the abstract input. This routine is just like
424 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
425 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
426 | floating-point exponent.
427 *----------------------------------------------------------------------------*/
428
429 static float32
430 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
431 {
432 int8 shiftCount;
433
434 shiftCount = countLeadingZeros32( zSig ) - 1;
435 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
436
437 }
438
439 /*----------------------------------------------------------------------------
440 | Returns the fraction bits of the double-precision floating-point value `a'.
441 *----------------------------------------------------------------------------*/
442
443 INLINE uint64_t extractFloat64Frac( float64 a )
444 {
445
446 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
447
448 }
449
450 /*----------------------------------------------------------------------------
451 | Returns the exponent bits of the double-precision floating-point value `a'.
452 *----------------------------------------------------------------------------*/
453
454 INLINE int_fast16_t extractFloat64Exp(float64 a)
455 {
456
457 return ( float64_val(a)>>52 ) & 0x7FF;
458
459 }
460
461 /*----------------------------------------------------------------------------
462 | Returns the sign bit of the double-precision floating-point value `a'.
463 *----------------------------------------------------------------------------*/
464
465 INLINE flag extractFloat64Sign( float64 a )
466 {
467
468 return float64_val(a)>>63;
469
470 }
471
472 /*----------------------------------------------------------------------------
473 | If `a' is denormal and we are in flush-to-zero mode then set the
474 | input-denormal exception and return zero. Otherwise just return the value.
475 *----------------------------------------------------------------------------*/
476 float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
477 {
478 if (STATUS(flush_inputs_to_zero)) {
479 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
480 float_raise(float_flag_input_denormal STATUS_VAR);
481 return make_float64(float64_val(a) & (1ULL << 63));
482 }
483 }
484 return a;
485 }
486
487 /*----------------------------------------------------------------------------
488 | Normalizes the subnormal double-precision floating-point value represented
489 | by the denormalized significand `aSig'. The normalized exponent and
490 | significand are stored at the locations pointed to by `zExpPtr' and
491 | `zSigPtr', respectively.
492 *----------------------------------------------------------------------------*/
493
494 static void
495 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
496 {
497 int8 shiftCount;
498
499 shiftCount = countLeadingZeros64( aSig ) - 11;
500 *zSigPtr = aSig<<shiftCount;
501 *zExpPtr = 1 - shiftCount;
502
503 }
504
505 /*----------------------------------------------------------------------------
506 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
507 | double-precision floating-point value, returning the result. After being
508 | shifted into the proper positions, the three fields are simply added
509 | together to form the result. This means that any integer portion of `zSig'
510 | will be added into the exponent. Since a properly normalized significand
511 | will have an integer portion equal to 1, the `zExp' input should be 1 less
512 | than the desired result exponent whenever `zSig' is a complete, normalized
513 | significand.
514 *----------------------------------------------------------------------------*/
515
516 INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
517 {
518
519 return make_float64(
520 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
521
522 }
523
524 /*----------------------------------------------------------------------------
525 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
526 | and significand `zSig', and returns the proper double-precision floating-
527 | point value corresponding to the abstract input. Ordinarily, the abstract
528 | value is simply rounded and packed into the double-precision format, with
529 | the inexact exception raised if the abstract input cannot be represented
530 | exactly. However, if the abstract value is too large, the overflow and
531 | inexact exceptions are raised and an infinity or maximal finite value is
532 | returned. If the abstract value is too small, the input value is rounded
533 | to a subnormal number, and the underflow and inexact exceptions are raised
534 | if the abstract input cannot be represented exactly as a subnormal double-
535 | precision floating-point number.
536 | The input significand `zSig' has its binary point between bits 62
537 | and 61, which is 10 bits to the left of the usual location. This shifted
538 | significand must be normalized or smaller. If `zSig' is not normalized,
539 | `zExp' must be 0; in that case, the result returned is a subnormal number,
540 | and it must not require rounding. In the usual case that `zSig' is
541 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
542 | The handling of underflow and overflow follows the IEC/IEEE Standard for
543 | Binary Floating-Point Arithmetic.
544 *----------------------------------------------------------------------------*/
545
546 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
547 {
548 int8 roundingMode;
549 flag roundNearestEven;
550 int_fast16_t roundIncrement, roundBits;
551 flag isTiny;
552
553 roundingMode = STATUS(float_rounding_mode);
554 roundNearestEven = ( roundingMode == float_round_nearest_even );
555 switch (roundingMode) {
556 case float_round_nearest_even:
557 case float_round_ties_away:
558 roundIncrement = 0x200;
559 break;
560 case float_round_to_zero:
561 roundIncrement = 0;
562 break;
563 case float_round_up:
564 roundIncrement = zSign ? 0 : 0x3ff;
565 break;
566 case float_round_down:
567 roundIncrement = zSign ? 0x3ff : 0;
568 break;
569 default:
570 abort();
571 }
572 roundBits = zSig & 0x3FF;
573 if ( 0x7FD <= (uint16_t) zExp ) {
574 if ( ( 0x7FD < zExp )
575 || ( ( zExp == 0x7FD )
576 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
577 ) {
578 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
579 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
580 }
581 if ( zExp < 0 ) {
582 if (STATUS(flush_to_zero)) {
583 float_raise(float_flag_output_denormal STATUS_VAR);
584 return packFloat64(zSign, 0, 0);
585 }
586 isTiny =
587 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
588 || ( zExp < -1 )
589 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
590 shift64RightJamming( zSig, - zExp, &zSig );
591 zExp = 0;
592 roundBits = zSig & 0x3FF;
593 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
594 }
595 }
596 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
597 zSig = ( zSig + roundIncrement )>>10;
598 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
599 if ( zSig == 0 ) zExp = 0;
600 return packFloat64( zSign, zExp, zSig );
601
602 }
603
604 /*----------------------------------------------------------------------------
605 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
606 | and significand `zSig', and returns the proper double-precision floating-
607 | point value corresponding to the abstract input. This routine is just like
608 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
609 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
610 | floating-point exponent.
611 *----------------------------------------------------------------------------*/
612
613 static float64
614 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
615 {
616 int8 shiftCount;
617
618 shiftCount = countLeadingZeros64( zSig ) - 1;
619 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
620
621 }
622
623 /*----------------------------------------------------------------------------
624 | Returns the fraction bits of the extended double-precision floating-point
625 | value `a'.
626 *----------------------------------------------------------------------------*/
627
628 INLINE uint64_t extractFloatx80Frac( floatx80 a )
629 {
630
631 return a.low;
632
633 }
634
635 /*----------------------------------------------------------------------------
636 | Returns the exponent bits of the extended double-precision floating-point
637 | value `a'.
638 *----------------------------------------------------------------------------*/
639
640 INLINE int32 extractFloatx80Exp( floatx80 a )
641 {
642
643 return a.high & 0x7FFF;
644
645 }
646
647 /*----------------------------------------------------------------------------
648 | Returns the sign bit of the extended double-precision floating-point value
649 | `a'.
650 *----------------------------------------------------------------------------*/
651
652 INLINE flag extractFloatx80Sign( floatx80 a )
653 {
654
655 return a.high>>15;
656
657 }
658
659 /*----------------------------------------------------------------------------
660 | Normalizes the subnormal extended double-precision floating-point value
661 | represented by the denormalized significand `aSig'. The normalized exponent
662 | and significand are stored at the locations pointed to by `zExpPtr' and
663 | `zSigPtr', respectively.
664 *----------------------------------------------------------------------------*/
665
666 static void
667 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
668 {
669 int8 shiftCount;
670
671 shiftCount = countLeadingZeros64( aSig );
672 *zSigPtr = aSig<<shiftCount;
673 *zExpPtr = 1 - shiftCount;
674
675 }
676
677 /*----------------------------------------------------------------------------
678 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
679 | extended double-precision floating-point value, returning the result.
680 *----------------------------------------------------------------------------*/
681
682 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
683 {
684 floatx80 z;
685
686 z.low = zSig;
687 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
688 return z;
689
690 }
691
692 /*----------------------------------------------------------------------------
693 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
694 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
695 | and returns the proper extended double-precision floating-point value
696 | corresponding to the abstract input. Ordinarily, the abstract value is
697 | rounded and packed into the extended double-precision format, with the
698 | inexact exception raised if the abstract input cannot be represented
699 | exactly. However, if the abstract value is too large, the overflow and
700 | inexact exceptions are raised and an infinity or maximal finite value is
701 | returned. If the abstract value is too small, the input value is rounded to
702 | a subnormal number, and the underflow and inexact exceptions are raised if
703 | the abstract input cannot be represented exactly as a subnormal extended
704 | double-precision floating-point number.
705 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
706 | number of bits as single or double precision, respectively. Otherwise, the
707 | result is rounded to the full precision of the extended double-precision
708 | format.
709 | The input significand must be normalized or smaller. If the input
710 | significand is not normalized, `zExp' must be 0; in that case, the result
711 | returned is a subnormal number, and it must not require rounding. The
712 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
713 | Floating-Point Arithmetic.
714 *----------------------------------------------------------------------------*/
715
716 static floatx80
717 roundAndPackFloatx80(
718 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
719 STATUS_PARAM)
720 {
721 int8 roundingMode;
722 flag roundNearestEven, increment, isTiny;
723 int64 roundIncrement, roundMask, roundBits;
724
725 roundingMode = STATUS(float_rounding_mode);
726 roundNearestEven = ( roundingMode == float_round_nearest_even );
727 if ( roundingPrecision == 80 ) goto precision80;
728 if ( roundingPrecision == 64 ) {
729 roundIncrement = LIT64( 0x0000000000000400 );
730 roundMask = LIT64( 0x00000000000007FF );
731 }
732 else if ( roundingPrecision == 32 ) {
733 roundIncrement = LIT64( 0x0000008000000000 );
734 roundMask = LIT64( 0x000000FFFFFFFFFF );
735 }
736 else {
737 goto precision80;
738 }
739 zSig0 |= ( zSig1 != 0 );
740 switch (roundingMode) {
741 case float_round_nearest_even:
742 case float_round_ties_away:
743 break;
744 case float_round_to_zero:
745 roundIncrement = 0;
746 break;
747 case float_round_up:
748 roundIncrement = zSign ? 0 : roundMask;
749 break;
750 case float_round_down:
751 roundIncrement = zSign ? roundMask : 0;
752 break;
753 default:
754 abort();
755 }
756 roundBits = zSig0 & roundMask;
757 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
758 if ( ( 0x7FFE < zExp )
759 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
760 ) {
761 goto overflow;
762 }
763 if ( zExp <= 0 ) {
764 if (STATUS(flush_to_zero)) {
765 float_raise(float_flag_output_denormal STATUS_VAR);
766 return packFloatx80(zSign, 0, 0);
767 }
768 isTiny =
769 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
770 || ( zExp < 0 )
771 || ( zSig0 <= zSig0 + roundIncrement );
772 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
773 zExp = 0;
774 roundBits = zSig0 & roundMask;
775 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
776 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
777 zSig0 += roundIncrement;
778 if ( (int64_t) zSig0 < 0 ) zExp = 1;
779 roundIncrement = roundMask + 1;
780 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
781 roundMask |= roundIncrement;
782 }
783 zSig0 &= ~ roundMask;
784 return packFloatx80( zSign, zExp, zSig0 );
785 }
786 }
787 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
788 zSig0 += roundIncrement;
789 if ( zSig0 < roundIncrement ) {
790 ++zExp;
791 zSig0 = LIT64( 0x8000000000000000 );
792 }
793 roundIncrement = roundMask + 1;
794 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
795 roundMask |= roundIncrement;
796 }
797 zSig0 &= ~ roundMask;
798 if ( zSig0 == 0 ) zExp = 0;
799 return packFloatx80( zSign, zExp, zSig0 );
800 precision80:
801 switch (roundingMode) {
802 case float_round_nearest_even:
803 case float_round_ties_away:
804 increment = ((int64_t)zSig1 < 0);
805 break;
806 case float_round_to_zero:
807 increment = 0;
808 break;
809 case float_round_up:
810 increment = !zSign && zSig1;
811 break;
812 case float_round_down:
813 increment = zSign && zSig1;
814 break;
815 default:
816 abort();
817 }
818 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
819 if ( ( 0x7FFE < zExp )
820 || ( ( zExp == 0x7FFE )
821 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
822 && increment
823 )
824 ) {
825 roundMask = 0;
826 overflow:
827 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
828 if ( ( roundingMode == float_round_to_zero )
829 || ( zSign && ( roundingMode == float_round_up ) )
830 || ( ! zSign && ( roundingMode == float_round_down ) )
831 ) {
832 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
833 }
834 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
835 }
836 if ( zExp <= 0 ) {
837 isTiny =
838 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
839 || ( zExp < 0 )
840 || ! increment
841 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
842 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
843 zExp = 0;
844 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
845 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
846 switch (roundingMode) {
847 case float_round_nearest_even:
848 case float_round_ties_away:
849 increment = ((int64_t)zSig1 < 0);
850 break;
851 case float_round_to_zero:
852 increment = 0;
853 break;
854 case float_round_up:
855 increment = !zSign && zSig1;
856 break;
857 case float_round_down:
858 increment = zSign && zSig1;
859 break;
860 default:
861 abort();
862 }
863 if ( increment ) {
864 ++zSig0;
865 zSig0 &=
866 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
867 if ( (int64_t) zSig0 < 0 ) zExp = 1;
868 }
869 return packFloatx80( zSign, zExp, zSig0 );
870 }
871 }
872 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
873 if ( increment ) {
874 ++zSig0;
875 if ( zSig0 == 0 ) {
876 ++zExp;
877 zSig0 = LIT64( 0x8000000000000000 );
878 }
879 else {
880 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
881 }
882 }
883 else {
884 if ( zSig0 == 0 ) zExp = 0;
885 }
886 return packFloatx80( zSign, zExp, zSig0 );
887
888 }
889
890 /*----------------------------------------------------------------------------
891 | Takes an abstract floating-point value having sign `zSign', exponent
892 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
893 | and returns the proper extended double-precision floating-point value
894 | corresponding to the abstract input. This routine is just like
895 | `roundAndPackFloatx80' except that the input significand does not have to be
896 | normalized.
897 *----------------------------------------------------------------------------*/
898
899 static floatx80
900 normalizeRoundAndPackFloatx80(
901 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
902 STATUS_PARAM)
903 {
904 int8 shiftCount;
905
906 if ( zSig0 == 0 ) {
907 zSig0 = zSig1;
908 zSig1 = 0;
909 zExp -= 64;
910 }
911 shiftCount = countLeadingZeros64( zSig0 );
912 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
913 zExp -= shiftCount;
914 return
915 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
916
917 }
918
919 /*----------------------------------------------------------------------------
920 | Returns the least-significant 64 fraction bits of the quadruple-precision
921 | floating-point value `a'.
922 *----------------------------------------------------------------------------*/
923
924 INLINE uint64_t extractFloat128Frac1( float128 a )
925 {
926
927 return a.low;
928
929 }
930
931 /*----------------------------------------------------------------------------
932 | Returns the most-significant 48 fraction bits of the quadruple-precision
933 | floating-point value `a'.
934 *----------------------------------------------------------------------------*/
935
936 INLINE uint64_t extractFloat128Frac0( float128 a )
937 {
938
939 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
940
941 }
942
943 /*----------------------------------------------------------------------------
944 | Returns the exponent bits of the quadruple-precision floating-point value
945 | `a'.
946 *----------------------------------------------------------------------------*/
947
948 INLINE int32 extractFloat128Exp( float128 a )
949 {
950
951 return ( a.high>>48 ) & 0x7FFF;
952
953 }
954
955 /*----------------------------------------------------------------------------
956 | Returns the sign bit of the quadruple-precision floating-point value `a'.
957 *----------------------------------------------------------------------------*/
958
959 INLINE flag extractFloat128Sign( float128 a )
960 {
961
962 return a.high>>63;
963
964 }
965
966 /*----------------------------------------------------------------------------
967 | Normalizes the subnormal quadruple-precision floating-point value
968 | represented by the denormalized significand formed by the concatenation of
969 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
970 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
971 | significand are stored at the location pointed to by `zSig0Ptr', and the
972 | least significant 64 bits of the normalized significand are stored at the
973 | location pointed to by `zSig1Ptr'.
974 *----------------------------------------------------------------------------*/
975
976 static void
977 normalizeFloat128Subnormal(
978 uint64_t aSig0,
979 uint64_t aSig1,
980 int32 *zExpPtr,
981 uint64_t *zSig0Ptr,
982 uint64_t *zSig1Ptr
983 )
984 {
985 int8 shiftCount;
986
987 if ( aSig0 == 0 ) {
988 shiftCount = countLeadingZeros64( aSig1 ) - 15;
989 if ( shiftCount < 0 ) {
990 *zSig0Ptr = aSig1>>( - shiftCount );
991 *zSig1Ptr = aSig1<<( shiftCount & 63 );
992 }
993 else {
994 *zSig0Ptr = aSig1<<shiftCount;
995 *zSig1Ptr = 0;
996 }
997 *zExpPtr = - shiftCount - 63;
998 }
999 else {
1000 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1001 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1002 *zExpPtr = 1 - shiftCount;
1003 }
1004
1005 }
1006
1007 /*----------------------------------------------------------------------------
1008 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1009 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1010 | floating-point value, returning the result. After being shifted into the
1011 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1012 | added together to form the most significant 32 bits of the result. This
1013 | means that any integer portion of `zSig0' will be added into the exponent.
1014 | Since a properly normalized significand will have an integer portion equal
1015 | to 1, the `zExp' input should be 1 less than the desired result exponent
1016 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1017 | significand.
1018 *----------------------------------------------------------------------------*/
1019
1020 INLINE float128
1021 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1022 {
1023 float128 z;
1024
1025 z.low = zSig1;
1026 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1027 return z;
1028
1029 }
1030
1031 /*----------------------------------------------------------------------------
1032 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1033 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1034 | and `zSig2', and returns the proper quadruple-precision floating-point value
1035 | corresponding to the abstract input. Ordinarily, the abstract value is
1036 | simply rounded and packed into the quadruple-precision format, with the
1037 | inexact exception raised if the abstract input cannot be represented
1038 | exactly. However, if the abstract value is too large, the overflow and
1039 | inexact exceptions are raised and an infinity or maximal finite value is
1040 | returned. If the abstract value is too small, the input value is rounded to
1041 | a subnormal number, and the underflow and inexact exceptions are raised if
1042 | the abstract input cannot be represented exactly as a subnormal quadruple-
1043 | precision floating-point number.
1044 | The input significand must be normalized or smaller. If the input
1045 | significand is not normalized, `zExp' must be 0; in that case, the result
1046 | returned is a subnormal number, and it must not require rounding. In the
1047 | usual case that the input significand is normalized, `zExp' must be 1 less
1048 | than the ``true'' floating-point exponent. The handling of underflow and
1049 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1050 *----------------------------------------------------------------------------*/
1051
1052 static float128
1053 roundAndPackFloat128(
1054 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1055 {
1056 int8 roundingMode;
1057 flag roundNearestEven, increment, isTiny;
1058
1059 roundingMode = STATUS(float_rounding_mode);
1060 roundNearestEven = ( roundingMode == float_round_nearest_even );
1061 switch (roundingMode) {
1062 case float_round_nearest_even:
1063 case float_round_ties_away:
1064 increment = ((int64_t)zSig2 < 0);
1065 break;
1066 case float_round_to_zero:
1067 increment = 0;
1068 break;
1069 case float_round_up:
1070 increment = !zSign && zSig2;
1071 break;
1072 case float_round_down:
1073 increment = zSign && zSig2;
1074 break;
1075 default:
1076 abort();
1077 }
1078 if ( 0x7FFD <= (uint32_t) zExp ) {
1079 if ( ( 0x7FFD < zExp )
1080 || ( ( zExp == 0x7FFD )
1081 && eq128(
1082 LIT64( 0x0001FFFFFFFFFFFF ),
1083 LIT64( 0xFFFFFFFFFFFFFFFF ),
1084 zSig0,
1085 zSig1
1086 )
1087 && increment
1088 )
1089 ) {
1090 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1091 if ( ( roundingMode == float_round_to_zero )
1092 || ( zSign && ( roundingMode == float_round_up ) )
1093 || ( ! zSign && ( roundingMode == float_round_down ) )
1094 ) {
1095 return
1096 packFloat128(
1097 zSign,
1098 0x7FFE,
1099 LIT64( 0x0000FFFFFFFFFFFF ),
1100 LIT64( 0xFFFFFFFFFFFFFFFF )
1101 );
1102 }
1103 return packFloat128( zSign, 0x7FFF, 0, 0 );
1104 }
1105 if ( zExp < 0 ) {
1106 if (STATUS(flush_to_zero)) {
1107 float_raise(float_flag_output_denormal STATUS_VAR);
1108 return packFloat128(zSign, 0, 0, 0);
1109 }
1110 isTiny =
1111 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1112 || ( zExp < -1 )
1113 || ! increment
1114 || lt128(
1115 zSig0,
1116 zSig1,
1117 LIT64( 0x0001FFFFFFFFFFFF ),
1118 LIT64( 0xFFFFFFFFFFFFFFFF )
1119 );
1120 shift128ExtraRightJamming(
1121 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1122 zExp = 0;
1123 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1124 switch (roundingMode) {
1125 case float_round_nearest_even:
1126 case float_round_ties_away:
1127 increment = ((int64_t)zSig2 < 0);
1128 break;
1129 case float_round_to_zero:
1130 increment = 0;
1131 break;
1132 case float_round_up:
1133 increment = !zSign && zSig2;
1134 break;
1135 case float_round_down:
1136 increment = zSign && zSig2;
1137 break;
1138 default:
1139 abort();
1140 }
1141 }
1142 }
1143 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1144 if ( increment ) {
1145 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1146 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1147 }
1148 else {
1149 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1150 }
1151 return packFloat128( zSign, zExp, zSig0, zSig1 );
1152
1153 }
1154
1155 /*----------------------------------------------------------------------------
1156 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1157 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1158 | returns the proper quadruple-precision floating-point value corresponding
1159 | to the abstract input. This routine is just like `roundAndPackFloat128'
1160 | except that the input significand has fewer bits and does not have to be
1161 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1162 | point exponent.
1163 *----------------------------------------------------------------------------*/
1164
1165 static float128
1166 normalizeRoundAndPackFloat128(
1167 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1168 {
1169 int8 shiftCount;
1170 uint64_t zSig2;
1171
1172 if ( zSig0 == 0 ) {
1173 zSig0 = zSig1;
1174 zSig1 = 0;
1175 zExp -= 64;
1176 }
1177 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1178 if ( 0 <= shiftCount ) {
1179 zSig2 = 0;
1180 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1181 }
1182 else {
1183 shift128ExtraRightJamming(
1184 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1185 }
1186 zExp -= shiftCount;
1187 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1188
1189 }
1190
1191 /*----------------------------------------------------------------------------
1192 | Returns the result of converting the 32-bit two's complement integer `a'
1193 | to the single-precision floating-point format. The conversion is performed
1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195 *----------------------------------------------------------------------------*/
1196
1197 float32 int32_to_float32(int32_t a STATUS_PARAM)
1198 {
1199 flag zSign;
1200
1201 if ( a == 0 ) return float32_zero;
1202 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1203 zSign = ( a < 0 );
1204 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1205
1206 }
1207
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 32-bit two's complement integer `a'
1210 | to the double-precision floating-point format. The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1213
1214 float64 int32_to_float64(int32_t a STATUS_PARAM)
1215 {
1216 flag zSign;
1217 uint32 absA;
1218 int8 shiftCount;
1219 uint64_t zSig;
1220
1221 if ( a == 0 ) return float64_zero;
1222 zSign = ( a < 0 );
1223 absA = zSign ? - a : a;
1224 shiftCount = countLeadingZeros32( absA ) + 21;
1225 zSig = absA;
1226 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1227
1228 }
1229
1230 /*----------------------------------------------------------------------------
1231 | Returns the result of converting the 32-bit two's complement integer `a'
1232 | to the extended double-precision floating-point format. The conversion
1233 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1234 | Arithmetic.
1235 *----------------------------------------------------------------------------*/
1236
1237 floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1238 {
1239 flag zSign;
1240 uint32 absA;
1241 int8 shiftCount;
1242 uint64_t zSig;
1243
1244 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1245 zSign = ( a < 0 );
1246 absA = zSign ? - a : a;
1247 shiftCount = countLeadingZeros32( absA ) + 32;
1248 zSig = absA;
1249 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1250
1251 }
1252
1253 /*----------------------------------------------------------------------------
1254 | Returns the result of converting the 32-bit two's complement integer `a' to
1255 | the quadruple-precision floating-point format. The conversion is performed
1256 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 *----------------------------------------------------------------------------*/
1258
1259 float128 int32_to_float128(int32_t a STATUS_PARAM)
1260 {
1261 flag zSign;
1262 uint32 absA;
1263 int8 shiftCount;
1264 uint64_t zSig0;
1265
1266 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1267 zSign = ( a < 0 );
1268 absA = zSign ? - a : a;
1269 shiftCount = countLeadingZeros32( absA ) + 17;
1270 zSig0 = absA;
1271 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1272
1273 }
1274
1275 /*----------------------------------------------------------------------------
1276 | Returns the result of converting the 64-bit two's complement integer `a'
1277 | to the single-precision floating-point format. The conversion is performed
1278 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1279 *----------------------------------------------------------------------------*/
1280
1281 float32 int64_to_float32(int64_t a STATUS_PARAM)
1282 {
1283 flag zSign;
1284 uint64 absA;
1285 int8 shiftCount;
1286
1287 if ( a == 0 ) return float32_zero;
1288 zSign = ( a < 0 );
1289 absA = zSign ? - a : a;
1290 shiftCount = countLeadingZeros64( absA ) - 40;
1291 if ( 0 <= shiftCount ) {
1292 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1293 }
1294 else {
1295 shiftCount += 7;
1296 if ( shiftCount < 0 ) {
1297 shift64RightJamming( absA, - shiftCount, &absA );
1298 }
1299 else {
1300 absA <<= shiftCount;
1301 }
1302 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1303 }
1304
1305 }
1306
1307 float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1308 {
1309 int8 shiftCount;
1310
1311 if ( a == 0 ) return float32_zero;
1312 shiftCount = countLeadingZeros64( a ) - 40;
1313 if ( 0 <= shiftCount ) {
1314 return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1315 }
1316 else {
1317 shiftCount += 7;
1318 if ( shiftCount < 0 ) {
1319 shift64RightJamming( a, - shiftCount, &a );
1320 }
1321 else {
1322 a <<= shiftCount;
1323 }
1324 return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1325 }
1326 }
1327
1328 /*----------------------------------------------------------------------------
1329 | Returns the result of converting the 64-bit two's complement integer `a'
1330 | to the double-precision floating-point format. The conversion is performed
1331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1332 *----------------------------------------------------------------------------*/
1333
1334 float64 int64_to_float64(int64_t a STATUS_PARAM)
1335 {
1336 flag zSign;
1337
1338 if ( a == 0 ) return float64_zero;
1339 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1340 return packFloat64( 1, 0x43E, 0 );
1341 }
1342 zSign = ( a < 0 );
1343 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1344
1345 }
1346
1347 float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1348 {
1349 int exp = 0x43C;
1350
1351 if (a == 0) {
1352 return float64_zero;
1353 }
1354 if ((int64_t)a < 0) {
1355 shift64RightJamming(a, 1, &a);
1356 exp += 1;
1357 }
1358 return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1359 }
1360
1361 /*----------------------------------------------------------------------------
1362 | Returns the result of converting the 64-bit two's complement integer `a'
1363 | to the extended double-precision floating-point format. The conversion
1364 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1365 | Arithmetic.
1366 *----------------------------------------------------------------------------*/
1367
1368 floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1369 {
1370 flag zSign;
1371 uint64 absA;
1372 int8 shiftCount;
1373
1374 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1375 zSign = ( a < 0 );
1376 absA = zSign ? - a : a;
1377 shiftCount = countLeadingZeros64( absA );
1378 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1379
1380 }
1381
1382 /*----------------------------------------------------------------------------
1383 | Returns the result of converting the 64-bit two's complement integer `a' to
1384 | the quadruple-precision floating-point format. The conversion is performed
1385 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1386 *----------------------------------------------------------------------------*/
1387
1388 float128 int64_to_float128(int64_t a STATUS_PARAM)
1389 {
1390 flag zSign;
1391 uint64 absA;
1392 int8 shiftCount;
1393 int32 zExp;
1394 uint64_t zSig0, zSig1;
1395
1396 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1397 zSign = ( a < 0 );
1398 absA = zSign ? - a : a;
1399 shiftCount = countLeadingZeros64( absA ) + 49;
1400 zExp = 0x406E - shiftCount;
1401 if ( 64 <= shiftCount ) {
1402 zSig1 = 0;
1403 zSig0 = absA;
1404 shiftCount -= 64;
1405 }
1406 else {
1407 zSig1 = absA;
1408 zSig0 = 0;
1409 }
1410 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1411 return packFloat128( zSign, zExp, zSig0, zSig1 );
1412
1413 }
1414
1415 float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1416 {
1417 if (a == 0) {
1418 return float128_zero;
1419 }
1420 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1421 }
1422
1423 /*----------------------------------------------------------------------------
1424 | Returns the result of converting the single-precision floating-point value
1425 | `a' to the 32-bit two's complement integer format. The conversion is
1426 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1427 | Arithmetic---which means in particular that the conversion is rounded
1428 | according to the current rounding mode. If `a' is a NaN, the largest
1429 | positive integer is returned. Otherwise, if the conversion overflows, the
1430 | largest integer with the same sign as `a' is returned.
1431 *----------------------------------------------------------------------------*/
1432
1433 int32 float32_to_int32( float32 a STATUS_PARAM )
1434 {
1435 flag aSign;
1436 int_fast16_t aExp, shiftCount;
1437 uint32_t aSig;
1438 uint64_t aSig64;
1439
1440 a = float32_squash_input_denormal(a STATUS_VAR);
1441 aSig = extractFloat32Frac( a );
1442 aExp = extractFloat32Exp( a );
1443 aSign = extractFloat32Sign( a );
1444 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1445 if ( aExp ) aSig |= 0x00800000;
1446 shiftCount = 0xAF - aExp;
1447 aSig64 = aSig;
1448 aSig64 <<= 32;
1449 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1450 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1451
1452 }
1453
1454 /*----------------------------------------------------------------------------
1455 | Returns the result of converting the single-precision floating-point value
1456 | `a' to the 32-bit two's complement integer format. The conversion is
1457 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1458 | Arithmetic, except that the conversion is always rounded toward zero.
1459 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1460 | the conversion overflows, the largest integer with the same sign as `a' is
1461 | returned.
1462 *----------------------------------------------------------------------------*/
1463
1464 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1465 {
1466 flag aSign;
1467 int_fast16_t aExp, shiftCount;
1468 uint32_t aSig;
1469 int32_t z;
1470 a = float32_squash_input_denormal(a STATUS_VAR);
1471
1472 aSig = extractFloat32Frac( a );
1473 aExp = extractFloat32Exp( a );
1474 aSign = extractFloat32Sign( a );
1475 shiftCount = aExp - 0x9E;
1476 if ( 0 <= shiftCount ) {
1477 if ( float32_val(a) != 0xCF000000 ) {
1478 float_raise( float_flag_invalid STATUS_VAR);
1479 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1480 }
1481 return (int32_t) 0x80000000;
1482 }
1483 else if ( aExp <= 0x7E ) {
1484 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1485 return 0;
1486 }
1487 aSig = ( aSig | 0x00800000 )<<8;
1488 z = aSig>>( - shiftCount );
1489 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1490 STATUS(float_exception_flags) |= float_flag_inexact;
1491 }
1492 if ( aSign ) z = - z;
1493 return z;
1494
1495 }
1496
1497 /*----------------------------------------------------------------------------
1498 | Returns the result of converting the single-precision floating-point value
1499 | `a' to the 16-bit two's complement integer format. The conversion is
1500 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 | Arithmetic, except that the conversion is always rounded toward zero.
1502 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1503 | the conversion overflows, the largest integer with the same sign as `a' is
1504 | returned.
1505 *----------------------------------------------------------------------------*/
1506
1507 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1508 {
1509 flag aSign;
1510 int_fast16_t aExp, shiftCount;
1511 uint32_t aSig;
1512 int32 z;
1513
1514 aSig = extractFloat32Frac( a );
1515 aExp = extractFloat32Exp( a );
1516 aSign = extractFloat32Sign( a );
1517 shiftCount = aExp - 0x8E;
1518 if ( 0 <= shiftCount ) {
1519 if ( float32_val(a) != 0xC7000000 ) {
1520 float_raise( float_flag_invalid STATUS_VAR);
1521 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522 return 0x7FFF;
1523 }
1524 }
1525 return (int32_t) 0xffff8000;
1526 }
1527 else if ( aExp <= 0x7E ) {
1528 if ( aExp | aSig ) {
1529 STATUS(float_exception_flags) |= float_flag_inexact;
1530 }
1531 return 0;
1532 }
1533 shiftCount -= 0x10;
1534 aSig = ( aSig | 0x00800000 )<<8;
1535 z = aSig>>( - shiftCount );
1536 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1537 STATUS(float_exception_flags) |= float_flag_inexact;
1538 }
1539 if ( aSign ) {
1540 z = - z;
1541 }
1542 return z;
1543
1544 }
1545
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the 64-bit two's complement integer format. The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic---which means in particular that the conversion is rounded
1551 | according to the current rounding mode. If `a' is a NaN, the largest
1552 | positive integer is returned. Otherwise, if the conversion overflows, the
1553 | largest integer with the same sign as `a' is returned.
1554 *----------------------------------------------------------------------------*/
1555
1556 int64 float32_to_int64( float32 a STATUS_PARAM )
1557 {
1558 flag aSign;
1559 int_fast16_t aExp, shiftCount;
1560 uint32_t aSig;
1561 uint64_t aSig64, aSigExtra;
1562 a = float32_squash_input_denormal(a STATUS_VAR);
1563
1564 aSig = extractFloat32Frac( a );
1565 aExp = extractFloat32Exp( a );
1566 aSign = extractFloat32Sign( a );
1567 shiftCount = 0xBE - aExp;
1568 if ( shiftCount < 0 ) {
1569 float_raise( float_flag_invalid STATUS_VAR);
1570 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1571 return LIT64( 0x7FFFFFFFFFFFFFFF );
1572 }
1573 return (int64_t) LIT64( 0x8000000000000000 );
1574 }
1575 if ( aExp ) aSig |= 0x00800000;
1576 aSig64 = aSig;
1577 aSig64 <<= 40;
1578 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1579 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1580
1581 }
1582
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the 64-bit unsigned integer format. The conversion is
1586 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic---which means in particular that the conversion is rounded
1588 | according to the current rounding mode. If `a' is a NaN, the largest
1589 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1590 | largest unsigned integer is returned. If the 'a' is negative, the result
1591 | is rounded and zero is returned; values that do not round to zero will
1592 | raise the inexact exception flag.
1593 *----------------------------------------------------------------------------*/
1594
1595 uint64 float32_to_uint64(float32 a STATUS_PARAM)
1596 {
1597 flag aSign;
1598 int_fast16_t aExp, shiftCount;
1599 uint32_t aSig;
1600 uint64_t aSig64, aSigExtra;
1601 a = float32_squash_input_denormal(a STATUS_VAR);
1602
1603 aSig = extractFloat32Frac(a);
1604 aExp = extractFloat32Exp(a);
1605 aSign = extractFloat32Sign(a);
1606 if ((aSign) && (aExp > 126)) {
1607 float_raise(float_flag_invalid STATUS_VAR);
1608 if (float32_is_any_nan(a)) {
1609 return LIT64(0xFFFFFFFFFFFFFFFF);
1610 } else {
1611 return 0;
1612 }
1613 }
1614 shiftCount = 0xBE - aExp;
1615 if (aExp) {
1616 aSig |= 0x00800000;
1617 }
1618 if (shiftCount < 0) {
1619 float_raise(float_flag_invalid STATUS_VAR);
1620 return LIT64(0xFFFFFFFFFFFFFFFF);
1621 }
1622
1623 aSig64 = aSig;
1624 aSig64 <<= 40;
1625 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1626 return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1627 }
1628
1629 /*----------------------------------------------------------------------------
1630 | Returns the result of converting the single-precision floating-point value
1631 | `a' to the 64-bit two's complement integer format. The conversion is
1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1633 | Arithmetic, except that the conversion is always rounded toward zero. If
1634 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1635 | conversion overflows, the largest integer with the same sign as `a' is
1636 | returned.
1637 *----------------------------------------------------------------------------*/
1638
1639 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1640 {
1641 flag aSign;
1642 int_fast16_t aExp, shiftCount;
1643 uint32_t aSig;
1644 uint64_t aSig64;
1645 int64 z;
1646 a = float32_squash_input_denormal(a STATUS_VAR);
1647
1648 aSig = extractFloat32Frac( a );
1649 aExp = extractFloat32Exp( a );
1650 aSign = extractFloat32Sign( a );
1651 shiftCount = aExp - 0xBE;
1652 if ( 0 <= shiftCount ) {
1653 if ( float32_val(a) != 0xDF000000 ) {
1654 float_raise( float_flag_invalid STATUS_VAR);
1655 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1656 return LIT64( 0x7FFFFFFFFFFFFFFF );
1657 }
1658 }
1659 return (int64_t) LIT64( 0x8000000000000000 );
1660 }
1661 else if ( aExp <= 0x7E ) {
1662 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1663 return 0;
1664 }
1665 aSig64 = aSig | 0x00800000;
1666 aSig64 <<= 40;
1667 z = aSig64>>( - shiftCount );
1668 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1669 STATUS(float_exception_flags) |= float_flag_inexact;
1670 }
1671 if ( aSign ) z = - z;
1672 return z;
1673
1674 }
1675
1676 /*----------------------------------------------------------------------------
1677 | Returns the result of converting the single-precision floating-point value
1678 | `a' to the double-precision floating-point format. The conversion is
1679 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1680 | Arithmetic.
1681 *----------------------------------------------------------------------------*/
1682
1683 float64 float32_to_float64( float32 a STATUS_PARAM )
1684 {
1685 flag aSign;
1686 int_fast16_t aExp;
1687 uint32_t aSig;
1688 a = float32_squash_input_denormal(a STATUS_VAR);
1689
1690 aSig = extractFloat32Frac( a );
1691 aExp = extractFloat32Exp( a );
1692 aSign = extractFloat32Sign( a );
1693 if ( aExp == 0xFF ) {
1694 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1695 return packFloat64( aSign, 0x7FF, 0 );
1696 }
1697 if ( aExp == 0 ) {
1698 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1699 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1700 --aExp;
1701 }
1702 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1703
1704 }
1705
1706 /*----------------------------------------------------------------------------
1707 | Returns the result of converting the single-precision floating-point value
1708 | `a' to the extended double-precision floating-point format. The conversion
1709 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1710 | Arithmetic.
1711 *----------------------------------------------------------------------------*/
1712
1713 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1714 {
1715 flag aSign;
1716 int_fast16_t aExp;
1717 uint32_t aSig;
1718
1719 a = float32_squash_input_denormal(a STATUS_VAR);
1720 aSig = extractFloat32Frac( a );
1721 aExp = extractFloat32Exp( a );
1722 aSign = extractFloat32Sign( a );
1723 if ( aExp == 0xFF ) {
1724 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1725 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1726 }
1727 if ( aExp == 0 ) {
1728 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1729 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1730 }
1731 aSig |= 0x00800000;
1732 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1733
1734 }
1735
1736 /*----------------------------------------------------------------------------
1737 | Returns the result of converting the single-precision floating-point value
1738 | `a' to the double-precision floating-point format. The conversion is
1739 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1740 | Arithmetic.
1741 *----------------------------------------------------------------------------*/
1742
1743 float128 float32_to_float128( float32 a STATUS_PARAM )
1744 {
1745 flag aSign;
1746 int_fast16_t aExp;
1747 uint32_t aSig;
1748
1749 a = float32_squash_input_denormal(a STATUS_VAR);
1750 aSig = extractFloat32Frac( a );
1751 aExp = extractFloat32Exp( a );
1752 aSign = extractFloat32Sign( a );
1753 if ( aExp == 0xFF ) {
1754 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1755 return packFloat128( aSign, 0x7FFF, 0, 0 );
1756 }
1757 if ( aExp == 0 ) {
1758 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1759 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1760 --aExp;
1761 }
1762 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1763
1764 }
1765
1766 /*----------------------------------------------------------------------------
1767 | Rounds the single-precision floating-point value `a' to an integer, and
1768 | returns the result as a single-precision floating-point value. The
1769 | operation is performed according to the IEC/IEEE Standard for Binary
1770 | Floating-Point Arithmetic.
1771 *----------------------------------------------------------------------------*/
1772
1773 float32 float32_round_to_int( float32 a STATUS_PARAM)
1774 {
1775 flag aSign;
1776 int_fast16_t aExp;
1777 uint32_t lastBitMask, roundBitsMask;
1778 uint32_t z;
1779 a = float32_squash_input_denormal(a STATUS_VAR);
1780
1781 aExp = extractFloat32Exp( a );
1782 if ( 0x96 <= aExp ) {
1783 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1784 return propagateFloat32NaN( a, a STATUS_VAR );
1785 }
1786 return a;
1787 }
1788 if ( aExp <= 0x7E ) {
1789 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1790 STATUS(float_exception_flags) |= float_flag_inexact;
1791 aSign = extractFloat32Sign( a );
1792 switch ( STATUS(float_rounding_mode) ) {
1793 case float_round_nearest_even:
1794 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1795 return packFloat32( aSign, 0x7F, 0 );
1796 }
1797 break;
1798 case float_round_ties_away:
1799 if (aExp == 0x7E) {
1800 return packFloat32(aSign, 0x7F, 0);
1801 }
1802 break;
1803 case float_round_down:
1804 return make_float32(aSign ? 0xBF800000 : 0);
1805 case float_round_up:
1806 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1807 }
1808 return packFloat32( aSign, 0, 0 );
1809 }
1810 lastBitMask = 1;
1811 lastBitMask <<= 0x96 - aExp;
1812 roundBitsMask = lastBitMask - 1;
1813 z = float32_val(a);
1814 switch (STATUS(float_rounding_mode)) {
1815 case float_round_nearest_even:
1816 z += lastBitMask>>1;
1817 if ((z & roundBitsMask) == 0) {
1818 z &= ~lastBitMask;
1819 }
1820 break;
1821 case float_round_ties_away:
1822 z += lastBitMask >> 1;
1823 break;
1824 case float_round_to_zero:
1825 break;
1826 case float_round_up:
1827 if (!extractFloat32Sign(make_float32(z))) {
1828 z += roundBitsMask;
1829 }
1830 break;
1831 case float_round_down:
1832 if (extractFloat32Sign(make_float32(z))) {
1833 z += roundBitsMask;
1834 }
1835 break;
1836 default:
1837 abort();
1838 }
1839 z &= ~ roundBitsMask;
1840 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1841 return make_float32(z);
1842
1843 }
1844
1845 /*----------------------------------------------------------------------------
1846 | Returns the result of adding the absolute values of the single-precision
1847 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1848 | before being returned. `zSign' is ignored if the result is a NaN.
1849 | The addition is performed according to the IEC/IEEE Standard for Binary
1850 | Floating-Point Arithmetic.
1851 *----------------------------------------------------------------------------*/
1852
1853 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1854 {
1855 int_fast16_t aExp, bExp, zExp;
1856 uint32_t aSig, bSig, zSig;
1857 int_fast16_t expDiff;
1858
1859 aSig = extractFloat32Frac( a );
1860 aExp = extractFloat32Exp( a );
1861 bSig = extractFloat32Frac( b );
1862 bExp = extractFloat32Exp( b );
1863 expDiff = aExp - bExp;
1864 aSig <<= 6;
1865 bSig <<= 6;
1866 if ( 0 < expDiff ) {
1867 if ( aExp == 0xFF ) {
1868 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1869 return a;
1870 }
1871 if ( bExp == 0 ) {
1872 --expDiff;
1873 }
1874 else {
1875 bSig |= 0x20000000;
1876 }
1877 shift32RightJamming( bSig, expDiff, &bSig );
1878 zExp = aExp;
1879 }
1880 else if ( expDiff < 0 ) {
1881 if ( bExp == 0xFF ) {
1882 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1883 return packFloat32( zSign, 0xFF, 0 );
1884 }
1885 if ( aExp == 0 ) {
1886 ++expDiff;
1887 }
1888 else {
1889 aSig |= 0x20000000;
1890 }
1891 shift32RightJamming( aSig, - expDiff, &aSig );
1892 zExp = bExp;
1893 }
1894 else {
1895 if ( aExp == 0xFF ) {
1896 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897 return a;
1898 }
1899 if ( aExp == 0 ) {
1900 if (STATUS(flush_to_zero)) {
1901 if (aSig | bSig) {
1902 float_raise(float_flag_output_denormal STATUS_VAR);
1903 }
1904 return packFloat32(zSign, 0, 0);
1905 }
1906 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1907 }
1908 zSig = 0x40000000 + aSig + bSig;
1909 zExp = aExp;
1910 goto roundAndPack;
1911 }
1912 aSig |= 0x20000000;
1913 zSig = ( aSig + bSig )<<1;
1914 --zExp;
1915 if ( (int32_t) zSig < 0 ) {
1916 zSig = aSig + bSig;
1917 ++zExp;
1918 }
1919 roundAndPack:
1920 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1921
1922 }
1923
1924 /*----------------------------------------------------------------------------
1925 | Returns the result of subtracting the absolute values of the single-
1926 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1927 | difference is negated before being returned. `zSign' is ignored if the
1928 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1929 | Standard for Binary Floating-Point Arithmetic.
1930 *----------------------------------------------------------------------------*/
1931
1932 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1933 {
1934 int_fast16_t aExp, bExp, zExp;
1935 uint32_t aSig, bSig, zSig;
1936 int_fast16_t expDiff;
1937
1938 aSig = extractFloat32Frac( a );
1939 aExp = extractFloat32Exp( a );
1940 bSig = extractFloat32Frac( b );
1941 bExp = extractFloat32Exp( b );
1942 expDiff = aExp - bExp;
1943 aSig <<= 7;
1944 bSig <<= 7;
1945 if ( 0 < expDiff ) goto aExpBigger;
1946 if ( expDiff < 0 ) goto bExpBigger;
1947 if ( aExp == 0xFF ) {
1948 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1949 float_raise( float_flag_invalid STATUS_VAR);
1950 return float32_default_nan;
1951 }
1952 if ( aExp == 0 ) {
1953 aExp = 1;
1954 bExp = 1;
1955 }
1956 if ( bSig < aSig ) goto aBigger;
1957 if ( aSig < bSig ) goto bBigger;
1958 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1959 bExpBigger:
1960 if ( bExp == 0xFF ) {
1961 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1962 return packFloat32( zSign ^ 1, 0xFF, 0 );
1963 }
1964 if ( aExp == 0 ) {
1965 ++expDiff;
1966 }
1967 else {
1968 aSig |= 0x40000000;
1969 }
1970 shift32RightJamming( aSig, - expDiff, &aSig );
1971 bSig |= 0x40000000;
1972 bBigger:
1973 zSig = bSig - aSig;
1974 zExp = bExp;
1975 zSign ^= 1;
1976 goto normalizeRoundAndPack;
1977 aExpBigger:
1978 if ( aExp == 0xFF ) {
1979 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1980 return a;
1981 }
1982 if ( bExp == 0 ) {
1983 --expDiff;
1984 }
1985 else {
1986 bSig |= 0x40000000;
1987 }
1988 shift32RightJamming( bSig, expDiff, &bSig );
1989 aSig |= 0x40000000;
1990 aBigger:
1991 zSig = aSig - bSig;
1992 zExp = aExp;
1993 normalizeRoundAndPack:
1994 --zExp;
1995 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1996
1997 }
1998
1999 /*----------------------------------------------------------------------------
2000 | Returns the result of adding the single-precision floating-point values `a'
2001 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2002 | Binary Floating-Point Arithmetic.
2003 *----------------------------------------------------------------------------*/
2004
2005 float32 float32_add( float32 a, float32 b STATUS_PARAM )
2006 {
2007 flag aSign, bSign;
2008 a = float32_squash_input_denormal(a STATUS_VAR);
2009 b = float32_squash_input_denormal(b STATUS_VAR);
2010
2011 aSign = extractFloat32Sign( a );
2012 bSign = extractFloat32Sign( b );
2013 if ( aSign == bSign ) {
2014 return addFloat32Sigs( a, b, aSign STATUS_VAR);
2015 }
2016 else {
2017 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2018 }
2019
2020 }
2021
2022 /*----------------------------------------------------------------------------
2023 | Returns the result of subtracting the single-precision floating-point values
2024 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2025 | for Binary Floating-Point Arithmetic.
2026 *----------------------------------------------------------------------------*/
2027
2028 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
2029 {
2030 flag aSign, bSign;
2031 a = float32_squash_input_denormal(a STATUS_VAR);
2032 b = float32_squash_input_denormal(b STATUS_VAR);
2033
2034 aSign = extractFloat32Sign( a );
2035 bSign = extractFloat32Sign( b );
2036 if ( aSign == bSign ) {
2037 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2038 }
2039 else {
2040 return addFloat32Sigs( a, b, aSign STATUS_VAR );
2041 }
2042
2043 }
2044
2045 /*----------------------------------------------------------------------------
2046 | Returns the result of multiplying the single-precision floating-point values
2047 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2048 | for Binary Floating-Point Arithmetic.
2049 *----------------------------------------------------------------------------*/
2050
2051 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2052 {
2053 flag aSign, bSign, zSign;
2054 int_fast16_t aExp, bExp, zExp;
2055 uint32_t aSig, bSig;
2056 uint64_t zSig64;
2057 uint32_t zSig;
2058
2059 a = float32_squash_input_denormal(a STATUS_VAR);
2060 b = float32_squash_input_denormal(b STATUS_VAR);
2061
2062 aSig = extractFloat32Frac( a );
2063 aExp = extractFloat32Exp( a );
2064 aSign = extractFloat32Sign( a );
2065 bSig = extractFloat32Frac( b );
2066 bExp = extractFloat32Exp( b );
2067 bSign = extractFloat32Sign( b );
2068 zSign = aSign ^ bSign;
2069 if ( aExp == 0xFF ) {
2070 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2071 return propagateFloat32NaN( a, b STATUS_VAR );
2072 }
2073 if ( ( bExp | bSig ) == 0 ) {
2074 float_raise( float_flag_invalid STATUS_VAR);
2075 return float32_default_nan;
2076 }
2077 return packFloat32( zSign, 0xFF, 0 );
2078 }
2079 if ( bExp == 0xFF ) {
2080 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2081 if ( ( aExp | aSig ) == 0 ) {
2082 float_raise( float_flag_invalid STATUS_VAR);
2083 return float32_default_nan;
2084 }
2085 return packFloat32( zSign, 0xFF, 0 );
2086 }
2087 if ( aExp == 0 ) {
2088 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2089 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2090 }
2091 if ( bExp == 0 ) {
2092 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2093 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2094 }
2095 zExp = aExp + bExp - 0x7F;
2096 aSig = ( aSig | 0x00800000 )<<7;
2097 bSig = ( bSig | 0x00800000 )<<8;
2098 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2099 zSig = zSig64;
2100 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2101 zSig <<= 1;
2102 --zExp;
2103 }
2104 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2105
2106 }
2107
2108 /*----------------------------------------------------------------------------
2109 | Returns the result of dividing the single-precision floating-point value `a'
2110 | by the corresponding value `b'. The operation is performed according to the
2111 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2112 *----------------------------------------------------------------------------*/
2113
2114 float32 float32_div( float32 a, float32 b STATUS_PARAM )
2115 {
2116 flag aSign, bSign, zSign;
2117 int_fast16_t aExp, bExp, zExp;
2118 uint32_t aSig, bSig, zSig;
2119 a = float32_squash_input_denormal(a STATUS_VAR);
2120 b = float32_squash_input_denormal(b STATUS_VAR);
2121
2122 aSig = extractFloat32Frac( a );
2123 aExp = extractFloat32Exp( a );
2124 aSign = extractFloat32Sign( a );
2125 bSig = extractFloat32Frac( b );
2126 bExp = extractFloat32Exp( b );
2127 bSign = extractFloat32Sign( b );
2128 zSign = aSign ^ bSign;
2129 if ( aExp == 0xFF ) {
2130 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2131 if ( bExp == 0xFF ) {
2132 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2133 float_raise( float_flag_invalid STATUS_VAR);
2134 return float32_default_nan;
2135 }
2136 return packFloat32( zSign, 0xFF, 0 );
2137 }
2138 if ( bExp == 0xFF ) {
2139 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2140 return packFloat32( zSign, 0, 0 );
2141 }
2142 if ( bExp == 0 ) {
2143 if ( bSig == 0 ) {
2144 if ( ( aExp | aSig ) == 0 ) {
2145 float_raise( float_flag_invalid STATUS_VAR);
2146 return float32_default_nan;
2147 }
2148 float_raise( float_flag_divbyzero STATUS_VAR);
2149 return packFloat32( zSign, 0xFF, 0 );
2150 }
2151 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2152 }
2153 if ( aExp == 0 ) {
2154 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2155 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2156 }
2157 zExp = aExp - bExp + 0x7D;
2158 aSig = ( aSig | 0x00800000 )<<7;
2159 bSig = ( bSig | 0x00800000 )<<8;
2160 if ( bSig <= ( aSig + aSig ) ) {
2161 aSig >>= 1;
2162 ++zExp;
2163 }
2164 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2165 if ( ( zSig & 0x3F ) == 0 ) {
2166 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2167 }
2168 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2169
2170 }
2171
2172 /*----------------------------------------------------------------------------
2173 | Returns the remainder of the single-precision floating-point value `a'
2174 | with respect to the corresponding value `b'. The operation is performed
2175 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176 *----------------------------------------------------------------------------*/
2177
2178 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2179 {
2180 flag aSign, zSign;
2181 int_fast16_t aExp, bExp, expDiff;
2182 uint32_t aSig, bSig;
2183 uint32_t q;
2184 uint64_t aSig64, bSig64, q64;
2185 uint32_t alternateASig;
2186 int32_t sigMean;
2187 a = float32_squash_input_denormal(a STATUS_VAR);
2188 b = float32_squash_input_denormal(b STATUS_VAR);
2189
2190 aSig = extractFloat32Frac( a );
2191 aExp = extractFloat32Exp( a );
2192 aSign = extractFloat32Sign( a );
2193 bSig = extractFloat32Frac( b );
2194 bExp = extractFloat32Exp( b );
2195 if ( aExp == 0xFF ) {
2196 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2197 return propagateFloat32NaN( a, b STATUS_VAR );
2198 }
2199 float_raise( float_flag_invalid STATUS_VAR);
2200 return float32_default_nan;
2201 }
2202 if ( bExp == 0xFF ) {
2203 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2204 return a;
2205 }
2206 if ( bExp == 0 ) {
2207 if ( bSig == 0 ) {
2208 float_raise( float_flag_invalid STATUS_VAR);
2209 return float32_default_nan;
2210 }
2211 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2212 }
2213 if ( aExp == 0 ) {
2214 if ( aSig == 0 ) return a;
2215 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2216 }
2217 expDiff = aExp - bExp;
2218 aSig |= 0x00800000;
2219 bSig |= 0x00800000;
2220 if ( expDiff < 32 ) {
2221 aSig <<= 8;
2222 bSig <<= 8;
2223 if ( expDiff < 0 ) {
2224 if ( expDiff < -1 ) return a;
2225 aSig >>= 1;
2226 }
2227 q = ( bSig <= aSig );
2228 if ( q ) aSig -= bSig;
2229 if ( 0 < expDiff ) {
2230 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2231 q >>= 32 - expDiff;
2232 bSig >>= 2;
2233 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2234 }
2235 else {
2236 aSig >>= 2;
2237 bSig >>= 2;
2238 }
2239 }
2240 else {
2241 if ( bSig <= aSig ) aSig -= bSig;
2242 aSig64 = ( (uint64_t) aSig )<<40;
2243 bSig64 = ( (uint64_t) bSig )<<40;
2244 expDiff -= 64;
2245 while ( 0 < expDiff ) {
2246 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2247 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2248 aSig64 = - ( ( bSig * q64 )<<38 );
2249 expDiff -= 62;
2250 }
2251 expDiff += 64;
2252 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2253 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2254 q = q64>>( 64 - expDiff );
2255 bSig <<= 6;
2256 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2257 }
2258 do {
2259 alternateASig = aSig;
2260 ++q;
2261 aSig -= bSig;
2262 } while ( 0 <= (int32_t) aSig );
2263 sigMean = aSig + alternateASig;
2264 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2265 aSig = alternateASig;
2266 }
2267 zSign = ( (int32_t) aSig < 0 );
2268 if ( zSign ) aSig = - aSig;
2269 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2270
2271 }
2272
2273 /*----------------------------------------------------------------------------
2274 | Returns the result of multiplying the single-precision floating-point values
2275 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2276 | multiplication. The operation is performed according to the IEC/IEEE
2277 | Standard for Binary Floating-Point Arithmetic 754-2008.
2278 | The flags argument allows the caller to select negation of the
2279 | addend, the intermediate product, or the final result. (The difference
2280 | between this and having the caller do a separate negation is that negating
2281 | externally will flip the sign bit on NaNs.)
2282 *----------------------------------------------------------------------------*/
2283
2284 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2285 {
2286 flag aSign, bSign, cSign, zSign;
2287 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2288 uint32_t aSig, bSig, cSig;
2289 flag pInf, pZero, pSign;
2290 uint64_t pSig64, cSig64, zSig64;
2291 uint32_t pSig;
2292 int shiftcount;
2293 flag signflip, infzero;
2294
2295 a = float32_squash_input_denormal(a STATUS_VAR);
2296 b = float32_squash_input_denormal(b STATUS_VAR);
2297 c = float32_squash_input_denormal(c STATUS_VAR);
2298 aSig = extractFloat32Frac(a);
2299 aExp = extractFloat32Exp(a);
2300 aSign = extractFloat32Sign(a);
2301 bSig = extractFloat32Frac(b);
2302 bExp = extractFloat32Exp(b);
2303 bSign = extractFloat32Sign(b);
2304 cSig = extractFloat32Frac(c);
2305 cExp = extractFloat32Exp(c);
2306 cSign = extractFloat32Sign(c);
2307
2308 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2309 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2310
2311 /* It is implementation-defined whether the cases of (0,inf,qnan)
2312 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2313 * they return if they do), so we have to hand this information
2314 * off to the target-specific pick-a-NaN routine.
2315 */
2316 if (((aExp == 0xff) && aSig) ||
2317 ((bExp == 0xff) && bSig) ||
2318 ((cExp == 0xff) && cSig)) {
2319 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2320 }
2321
2322 if (infzero) {
2323 float_raise(float_flag_invalid STATUS_VAR);
2324 return float32_default_nan;
2325 }
2326
2327 if (flags & float_muladd_negate_c) {
2328 cSign ^= 1;
2329 }
2330
2331 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2332
2333 /* Work out the sign and type of the product */
2334 pSign = aSign ^ bSign;
2335 if (flags & float_muladd_negate_product) {
2336 pSign ^= 1;
2337 }
2338 pInf = (aExp == 0xff) || (bExp == 0xff);
2339 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2340
2341 if (cExp == 0xff) {
2342 if (pInf && (pSign ^ cSign)) {
2343 /* addition of opposite-signed infinities => InvalidOperation */
2344 float_raise(float_flag_invalid STATUS_VAR);
2345 return float32_default_nan;
2346 }
2347 /* Otherwise generate an infinity of the same sign */
2348 return packFloat32(cSign ^ signflip, 0xff, 0);
2349 }
2350
2351 if (pInf) {
2352 return packFloat32(pSign ^ signflip, 0xff, 0);
2353 }
2354
2355 if (pZero) {
2356 if (cExp == 0) {
2357 if (cSig == 0) {
2358 /* Adding two exact zeroes */
2359 if (pSign == cSign) {
2360 zSign = pSign;
2361 } else if (STATUS(float_rounding_mode) == float_round_down) {
2362 zSign = 1;
2363 } else {
2364 zSign = 0;
2365 }
2366 return packFloat32(zSign ^ signflip, 0, 0);
2367 }
2368 /* Exact zero plus a denorm */
2369 if (STATUS(flush_to_zero)) {
2370 float_raise(float_flag_output_denormal STATUS_VAR);
2371 return packFloat32(cSign ^ signflip, 0, 0);
2372 }
2373 }
2374 /* Zero plus something non-zero : just return the something */
2375 if (flags & float_muladd_halve_result) {
2376 if (cExp == 0) {
2377 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2378 }
2379 /* Subtract one to halve, and one again because roundAndPackFloat32
2380 * wants one less than the true exponent.
2381 */
2382 cExp -= 2;
2383 cSig = (cSig | 0x00800000) << 7;
2384 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
2385 }
2386 return packFloat32(cSign ^ signflip, cExp, cSig);
2387 }
2388
2389 if (aExp == 0) {
2390 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2391 }
2392 if (bExp == 0) {
2393 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2394 }
2395
2396 /* Calculate the actual result a * b + c */
2397
2398 /* Multiply first; this is easy. */
2399 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2400 * because we want the true exponent, not the "one-less-than"
2401 * flavour that roundAndPackFloat32() takes.
2402 */
2403 pExp = aExp + bExp - 0x7e;
2404 aSig = (aSig | 0x00800000) << 7;
2405 bSig = (bSig | 0x00800000) << 8;
2406 pSig64 = (uint64_t)aSig * bSig;
2407 if ((int64_t)(pSig64 << 1) >= 0) {
2408 pSig64 <<= 1;
2409 pExp--;
2410 }
2411
2412 zSign = pSign ^ signflip;
2413
2414 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2415 * position 62.
2416 */
2417 if (cExp == 0) {
2418 if (!cSig) {
2419 /* Throw out the special case of c being an exact zero now */
2420 shift64RightJamming(pSig64, 32, &pSig64);
2421 pSig = pSig64;
2422 if (flags & float_muladd_halve_result) {
2423 pExp--;
2424 }
2425 return roundAndPackFloat32(zSign, pExp - 1,
2426 pSig STATUS_VAR);
2427 }
2428 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2429 }
2430
2431 cSig64 = (uint64_t)cSig << (62 - 23);
2432 cSig64 |= LIT64(0x4000000000000000);
2433 expDiff = pExp - cExp;
2434
2435 if (pSign == cSign) {
2436 /* Addition */
2437 if (expDiff > 0) {
2438 /* scale c to match p */
2439 shift64RightJamming(cSig64, expDiff, &cSig64);
2440 zExp = pExp;
2441 } else if (expDiff < 0) {
2442 /* scale p to match c */
2443 shift64RightJamming(pSig64, -expDiff, &pSig64);
2444 zExp = cExp;
2445 } else {
2446 /* no scaling needed */
2447 zExp = cExp;
2448 }
2449 /* Add significands and make sure explicit bit ends up in posn 62 */
2450 zSig64 = pSig64 + cSig64;
2451 if ((int64_t)zSig64 < 0) {
2452 shift64RightJamming(zSig64, 1, &zSig64);
2453 } else {
2454 zExp--;
2455 }
2456 } else {
2457 /* Subtraction */
2458 if (expDiff > 0) {
2459 shift64RightJamming(cSig64, expDiff, &cSig64);
2460 zSig64 = pSig64 - cSig64;
2461 zExp = pExp;
2462 } else if (expDiff < 0) {
2463 shift64RightJamming(pSig64, -expDiff, &pSig64);
2464 zSig64 = cSig64 - pSig64;
2465 zExp = cExp;
2466 zSign ^= 1;
2467 } else {
2468 zExp = pExp;
2469 if (cSig64 < pSig64) {
2470 zSig64 = pSig64 - cSig64;
2471 } else if (pSig64 < cSig64) {
2472 zSig64 = cSig64 - pSig64;
2473 zSign ^= 1;
2474 } else {
2475 /* Exact zero */
2476 zSign = signflip;
2477 if (STATUS(float_rounding_mode) == float_round_down) {
2478 zSign ^= 1;
2479 }
2480 return packFloat32(zSign, 0, 0);
2481 }
2482 }
2483 --zExp;
2484 /* Normalize to put the explicit bit back into bit 62. */
2485 shiftcount = countLeadingZeros64(zSig64) - 1;
2486 zSig64 <<= shiftcount;
2487 zExp -= shiftcount;
2488 }
2489 if (flags & float_muladd_halve_result) {
2490 zExp--;
2491 }
2492
2493 shift64RightJamming(zSig64, 32, &zSig64);
2494 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2495 }
2496
2497
2498 /*----------------------------------------------------------------------------
2499 | Returns the square root of the single-precision floating-point value `a'.
2500 | The operation is performed according to the IEC/IEEE Standard for Binary
2501 | Floating-Point Arithmetic.
2502 *----------------------------------------------------------------------------*/
2503
2504 float32 float32_sqrt( float32 a STATUS_PARAM )
2505 {
2506 flag aSign;
2507 int_fast16_t aExp, zExp;
2508 uint32_t aSig, zSig;
2509 uint64_t rem, term;
2510 a = float32_squash_input_denormal(a STATUS_VAR);
2511
2512 aSig = extractFloat32Frac( a );
2513 aExp = extractFloat32Exp( a );
2514 aSign = extractFloat32Sign( a );
2515 if ( aExp == 0xFF ) {
2516 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2517 if ( ! aSign ) return a;
2518 float_raise( float_flag_invalid STATUS_VAR);
2519 return float32_default_nan;
2520 }
2521 if ( aSign ) {
2522 if ( ( aExp | aSig ) == 0 ) return a;
2523 float_raise( float_flag_invalid STATUS_VAR);
2524 return float32_default_nan;
2525 }
2526 if ( aExp == 0 ) {
2527 if ( aSig == 0 ) return float32_zero;
2528 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2529 }
2530 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2531 aSig = ( aSig | 0x00800000 )<<8;
2532 zSig = estimateSqrt32( aExp, aSig ) + 2;
2533 if ( ( zSig & 0x7F ) <= 5 ) {
2534 if ( zSig < 2 ) {
2535 zSig = 0x7FFFFFFF;
2536 goto roundAndPack;
2537 }
2538 aSig >>= aExp & 1;
2539 term = ( (uint64_t) zSig ) * zSig;
2540 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2541 while ( (int64_t) rem < 0 ) {
2542 --zSig;
2543 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2544 }
2545 zSig |= ( rem != 0 );
2546 }
2547 shift32RightJamming( zSig, 1, &zSig );
2548 roundAndPack:
2549 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2550
2551 }
2552
2553 /*----------------------------------------------------------------------------
2554 | Returns the binary exponential of the single-precision floating-point value
2555 | `a'. The operation is performed according to the IEC/IEEE Standard for
2556 | Binary Floating-Point Arithmetic.
2557 |
2558 | Uses the following identities:
2559 |
2560 | 1. -------------------------------------------------------------------------
2561 | x x*ln(2)
2562 | 2 = e
2563 |
2564 | 2. -------------------------------------------------------------------------
2565 | 2 3 4 5 n
2566 | x x x x x x x
2567 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2568 | 1! 2! 3! 4! 5! n!
2569 *----------------------------------------------------------------------------*/
2570
2571 static const float64 float32_exp2_coefficients[15] =
2572 {
2573 const_float64( 0x3ff0000000000000ll ), /* 1 */
2574 const_float64( 0x3fe0000000000000ll ), /* 2 */
2575 const_float64( 0x3fc5555555555555ll ), /* 3 */
2576 const_float64( 0x3fa5555555555555ll ), /* 4 */
2577 const_float64( 0x3f81111111111111ll ), /* 5 */
2578 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2579 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2580 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2581 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2582 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2583 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2584 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2585 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2586 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2587 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2588 };
2589
2590 float32 float32_exp2( float32 a STATUS_PARAM )
2591 {
2592 flag aSign;
2593 int_fast16_t aExp;
2594 uint32_t aSig;
2595 float64 r, x, xn;
2596 int i;
2597 a = float32_squash_input_denormal(a STATUS_VAR);
2598
2599 aSig = extractFloat32Frac( a );
2600 aExp = extractFloat32Exp( a );
2601 aSign = extractFloat32Sign( a );
2602
2603 if ( aExp == 0xFF) {
2604 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2605 return (aSign) ? float32_zero : a;
2606 }
2607 if (aExp == 0) {
2608 if (aSig == 0) return float32_one;
2609 }
2610
2611 float_raise( float_flag_inexact STATUS_VAR);
2612
2613 /* ******************************* */
2614 /* using float64 for approximation */
2615 /* ******************************* */
2616 x = float32_to_float64(a STATUS_VAR);
2617 x = float64_mul(x, float64_ln2 STATUS_VAR);
2618
2619 xn = x;
2620 r = float64_one;
2621 for (i = 0 ; i < 15 ; i++) {
2622 float64 f;
2623
2624 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2625 r = float64_add(r, f STATUS_VAR);
2626
2627 xn = float64_mul(xn, x STATUS_VAR);
2628 }
2629
2630 return float64_to_float32(r, status);
2631 }
2632
2633 /*----------------------------------------------------------------------------
2634 | Returns the binary log of the single-precision floating-point value `a'.
2635 | The operation is performed according to the IEC/IEEE Standard for Binary
2636 | Floating-Point Arithmetic.
2637 *----------------------------------------------------------------------------*/
2638 float32 float32_log2( float32 a STATUS_PARAM )
2639 {
2640 flag aSign, zSign;
2641 int_fast16_t aExp;
2642 uint32_t aSig, zSig, i;
2643
2644 a = float32_squash_input_denormal(a STATUS_VAR);
2645 aSig = extractFloat32Frac( a );
2646 aExp = extractFloat32Exp( a );
2647 aSign = extractFloat32Sign( a );
2648
2649 if ( aExp == 0 ) {
2650 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2651 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2652 }
2653 if ( aSign ) {
2654 float_raise( float_flag_invalid STATUS_VAR);
2655 return float32_default_nan;
2656 }
2657 if ( aExp == 0xFF ) {
2658 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2659 return a;
2660 }
2661
2662 aExp -= 0x7F;
2663 aSig |= 0x00800000;
2664 zSign = aExp < 0;
2665 zSig = aExp << 23;
2666
2667 for (i = 1 << 22; i > 0; i >>= 1) {
2668 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2669 if ( aSig & 0x01000000 ) {
2670 aSig >>= 1;
2671 zSig |= i;
2672 }
2673 }
2674
2675 if ( zSign )
2676 zSig = -zSig;
2677
2678 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2679 }
2680
2681 /*----------------------------------------------------------------------------
2682 | Returns 1 if the single-precision floating-point value `a' is equal to
2683 | the corresponding value `b', and 0 otherwise. The invalid exception is
2684 | raised if either operand is a NaN. Otherwise, the comparison is performed
2685 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2686 *----------------------------------------------------------------------------*/
2687
2688 int float32_eq( float32 a, float32 b STATUS_PARAM )
2689 {
2690 uint32_t av, bv;
2691 a = float32_squash_input_denormal(a STATUS_VAR);
2692 b = float32_squash_input_denormal(b STATUS_VAR);
2693
2694 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2695 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2696 ) {
2697 float_raise( float_flag_invalid STATUS_VAR);
2698 return 0;
2699 }
2700 av = float32_val(a);
2701 bv = float32_val(b);
2702 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2703 }
2704
2705 /*----------------------------------------------------------------------------
2706 | Returns 1 if the single-precision floating-point value `a' is less than
2707 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2708 | exception is raised if either operand is a NaN. The comparison is performed
2709 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2710 *----------------------------------------------------------------------------*/
2711
2712 int float32_le( float32 a, float32 b STATUS_PARAM )
2713 {
2714 flag aSign, bSign;
2715 uint32_t av, bv;
2716 a = float32_squash_input_denormal(a STATUS_VAR);
2717 b = float32_squash_input_denormal(b STATUS_VAR);
2718
2719 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2720 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2721 ) {
2722 float_raise( float_flag_invalid STATUS_VAR);
2723 return 0;
2724 }
2725 aSign = extractFloat32Sign( a );
2726 bSign = extractFloat32Sign( b );
2727 av = float32_val(a);
2728 bv = float32_val(b);
2729 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2730 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2731
2732 }
2733
2734 /*----------------------------------------------------------------------------
2735 | Returns 1 if the single-precision floating-point value `a' is less than
2736 | the corresponding value `b', and 0 otherwise. The invalid exception is
2737 | raised if either operand is a NaN. The comparison is performed according
2738 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2739 *----------------------------------------------------------------------------*/
2740
2741 int float32_lt( float32 a, float32 b STATUS_PARAM )
2742 {
2743 flag aSign, bSign;
2744 uint32_t av, bv;
2745 a = float32_squash_input_denormal(a STATUS_VAR);
2746 b = float32_squash_input_denormal(b STATUS_VAR);
2747
2748 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2749 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2750 ) {
2751 float_raise( float_flag_invalid STATUS_VAR);
2752 return 0;
2753 }
2754 aSign = extractFloat32Sign( a );
2755 bSign = extractFloat32Sign( b );
2756 av = float32_val(a);
2757 bv = float32_val(b);
2758 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2759 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2760
2761 }
2762
2763 /*----------------------------------------------------------------------------
2764 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2765 | be compared, and 0 otherwise. The invalid exception is raised if either
2766 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2767 | Standard for Binary Floating-Point Arithmetic.
2768 *----------------------------------------------------------------------------*/
2769
2770 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2771 {
2772 a = float32_squash_input_denormal(a STATUS_VAR);
2773 b = float32_squash_input_denormal(b STATUS_VAR);
2774
2775 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2776 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2777 ) {
2778 float_raise( float_flag_invalid STATUS_VAR);
2779 return 1;
2780 }
2781 return 0;
2782 }
2783
2784 /*----------------------------------------------------------------------------
2785 | Returns 1 if the single-precision floating-point value `a' is equal to
2786 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2787 | exception. The comparison is performed according to the IEC/IEEE Standard
2788 | for Binary Floating-Point Arithmetic.
2789 *----------------------------------------------------------------------------*/
2790
2791 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2792 {
2793 a = float32_squash_input_denormal(a STATUS_VAR);
2794 b = float32_squash_input_denormal(b STATUS_VAR);
2795
2796 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2797 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2798 ) {
2799 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2800 float_raise( float_flag_invalid STATUS_VAR);
2801 }
2802 return 0;
2803 }
2804 return ( float32_val(a) == float32_val(b) ) ||
2805 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2806 }
2807
2808 /*----------------------------------------------------------------------------
2809 | Returns 1 if the single-precision floating-point value `a' is less than or
2810 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2811 | cause an exception. Otherwise, the comparison is performed according to the
2812 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2813 *----------------------------------------------------------------------------*/
2814
2815 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2816 {
2817 flag aSign, bSign;
2818 uint32_t av, bv;
2819 a = float32_squash_input_denormal(a STATUS_VAR);
2820 b = float32_squash_input_denormal(b STATUS_VAR);
2821
2822 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2823 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2824 ) {
2825 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2826 float_raise( float_flag_invalid STATUS_VAR);
2827 }
2828 return 0;
2829 }
2830 aSign = extractFloat32Sign( a );
2831 bSign = extractFloat32Sign( b );
2832 av = float32_val(a);
2833 bv = float32_val(b);
2834 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2835 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2836
2837 }
2838
2839 /*----------------------------------------------------------------------------
2840 | Returns 1 if the single-precision floating-point value `a' is less than
2841 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2842 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2843 | Standard for Binary Floating-Point Arithmetic.
2844 *----------------------------------------------------------------------------*/
2845
2846 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2847 {
2848 flag aSign, bSign;
2849 uint32_t av, bv;
2850 a = float32_squash_input_denormal(a STATUS_VAR);
2851 b = float32_squash_input_denormal(b STATUS_VAR);
2852
2853 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2854 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2855 ) {
2856 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2857 float_raise( float_flag_invalid STATUS_VAR);
2858 }
2859 return 0;
2860 }
2861 aSign = extractFloat32Sign( a );
2862 bSign = extractFloat32Sign( b );
2863 av = float32_val(a);
2864 bv = float32_val(b);
2865 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2866 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2867
2868 }
2869
2870 /*----------------------------------------------------------------------------
2871 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2872 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2873 | comparison is performed according to the IEC/IEEE Standard for Binary
2874 | Floating-Point Arithmetic.
2875 *----------------------------------------------------------------------------*/
2876
2877 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2878 {
2879 a = float32_squash_input_denormal(a STATUS_VAR);
2880 b = float32_squash_input_denormal(b STATUS_VAR);
2881
2882 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2883 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2884 ) {
2885 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2886 float_raise( float_flag_invalid STATUS_VAR);
2887 }
2888 return 1;
2889 }
2890 return 0;
2891 }
2892
2893 /*----------------------------------------------------------------------------
2894 | Returns the result of converting the double-precision floating-point value
2895 | `a' to the 32-bit two's complement integer format. The conversion is
2896 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2897 | Arithmetic---which means in particular that the conversion is rounded
2898 | according to the current rounding mode. If `a' is a NaN, the largest
2899 | positive integer is returned. Otherwise, if the conversion overflows, the
2900 | largest integer with the same sign as `a' is returned.
2901 *----------------------------------------------------------------------------*/
2902
2903 int32 float64_to_int32( float64 a STATUS_PARAM )
2904 {
2905 flag aSign;
2906 int_fast16_t aExp, shiftCount;
2907 uint64_t aSig;
2908 a = float64_squash_input_denormal(a STATUS_VAR);
2909
2910 aSig = extractFloat64Frac( a );
2911 aExp = extractFloat64Exp( a );
2912 aSign = extractFloat64Sign( a );
2913 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2914 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2915 shiftCount = 0x42C - aExp;
2916 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2917 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2918
2919 }
2920
2921 /*----------------------------------------------------------------------------
2922 | Returns the result of converting the double-precision floating-point value
2923 | `a' to the 32-bit two's complement integer format. The conversion is
2924 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2925 | Arithmetic, except that the conversion is always rounded toward zero.
2926 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2927 | the conversion overflows, the largest integer with the same sign as `a' is
2928 | returned.
2929 *----------------------------------------------------------------------------*/
2930
2931 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2932 {
2933 flag aSign;
2934 int_fast16_t aExp, shiftCount;
2935 uint64_t aSig, savedASig;
2936 int32_t z;
2937 a = float64_squash_input_denormal(a STATUS_VAR);
2938
2939 aSig = extractFloat64Frac( a );
2940 aExp = extractFloat64Exp( a );
2941 aSign = extractFloat64Sign( a );
2942 if ( 0x41E < aExp ) {
2943 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2944 goto invalid;
2945 }
2946 else if ( aExp < 0x3FF ) {
2947 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2948 return 0;
2949 }
2950 aSig |= LIT64( 0x0010000000000000 );
2951 shiftCount = 0x433 - aExp;
2952 savedASig = aSig;
2953 aSig >>= shiftCount;
2954 z = aSig;
2955 if ( aSign ) z = - z;
2956 if ( ( z < 0 ) ^ aSign ) {
2957 invalid:
2958 float_raise( float_flag_invalid STATUS_VAR);
2959 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2960 }
2961 if ( ( aSig<<shiftCount ) != savedASig ) {
2962 STATUS(float_exception_flags) |= float_flag_inexact;
2963 }
2964 return z;
2965
2966 }
2967
2968 /*----------------------------------------------------------------------------
2969 | Returns the result of converting the double-precision floating-point value
2970 | `a' to the 16-bit two's complement integer format. The conversion is
2971 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2972 | Arithmetic, except that the conversion is always rounded toward zero.
2973 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2974 | the conversion overflows, the largest integer with the same sign as `a' is
2975 | returned.
2976 *----------------------------------------------------------------------------*/
2977
2978 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2979 {
2980 flag aSign;
2981 int_fast16_t aExp, shiftCount;
2982 uint64_t aSig, savedASig;
2983 int32 z;
2984
2985 aSig = extractFloat64Frac( a );
2986 aExp = extractFloat64Exp( a );
2987 aSign = extractFloat64Sign( a );
2988 if ( 0x40E < aExp ) {
2989 if ( ( aExp == 0x7FF ) && aSig ) {
2990 aSign = 0;
2991 }
2992 goto invalid;
2993 }
2994 else if ( aExp < 0x3FF ) {
2995 if ( aExp || aSig ) {
2996 STATUS(float_exception_flags) |= float_flag_inexact;
2997 }
2998 return 0;
2999 }
3000 aSig |= LIT64( 0x0010000000000000 );
3001 shiftCount = 0x433 - aExp;
3002 savedASig = aSig;
3003 aSig >>= shiftCount;
3004 z = aSig;
3005 if ( aSign ) {
3006 z = - z;
3007 }
3008 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3009 invalid:
3010 float_raise( float_flag_invalid STATUS_VAR);
3011 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3012 }
3013 if ( ( aSig<<shiftCount ) != savedASig ) {
3014 STATUS(float_exception_flags) |= float_flag_inexact;
3015 }
3016 return z;
3017 }
3018
3019 /*----------------------------------------------------------------------------
3020 | Returns the result of converting the double-precision floating-point value
3021 | `a' to the 64-bit two's complement integer format. The conversion is
3022 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3023 | Arithmetic---which means in particular that the conversion is rounded
3024 | according to the current rounding mode. If `a' is a NaN, the largest
3025 | positive integer is returned. Otherwise, if the conversion overflows, the
3026 | largest integer with the same sign as `a' is returned.
3027 *----------------------------------------------------------------------------*/
3028
3029 int64 float64_to_int64( float64 a STATUS_PARAM )
3030 {
3031 flag aSign;
3032 int_fast16_t aExp, shiftCount;
3033 uint64_t aSig, aSigExtra;
3034 a = float64_squash_input_denormal(a STATUS_VAR);
3035
3036 aSig = extractFloat64Frac( a );
3037 aExp = extractFloat64Exp( a );
3038 aSign = extractFloat64Sign( a );
3039 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3040 shiftCount = 0x433 - aExp;
3041 if ( shiftCount <= 0 ) {
3042 if ( 0x43E < aExp ) {
3043 float_raise( float_flag_invalid STATUS_VAR);
3044 if ( ! aSign
3045 || ( ( aExp == 0x7FF )
3046 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3047 ) {
3048 return LIT64( 0x7FFFFFFFFFFFFFFF );
3049 }
3050 return (int64_t) LIT64( 0x8000000000000000 );
3051 }
3052 aSigExtra = 0;
3053 aSig <<= - shiftCount;
3054 }
3055 else {
3056 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3057 }
3058 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3059
3060 }
3061
3062 /*----------------------------------------------------------------------------
3063 | Returns the result of converting the double-precision floating-point value
3064 | `a' to the 64-bit two's complement integer format. The conversion is
3065 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3066 | Arithmetic, except that the conversion is always rounded toward zero.
3067 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3068 | the conversion overflows, the largest integer with the same sign as `a' is
3069 | returned.
3070 *----------------------------------------------------------------------------*/
3071
3072 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3073 {
3074 flag aSign;
3075 int_fast16_t aExp, shiftCount;
3076 uint64_t aSig;
3077 int64 z;
3078 a = float64_squash_input_denormal(a STATUS_VAR);
3079
3080 aSig = extractFloat64Frac( a );
3081 aExp = extractFloat64Exp( a );
3082 aSign = extractFloat64Sign( a );
3083 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3084 shiftCount = aExp - 0x433;
3085 if ( 0 <= shiftCount ) {
3086 if ( 0x43E <= aExp ) {
3087 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3088 float_raise( float_flag_invalid STATUS_VAR);
3089 if ( ! aSign
3090 || ( ( aExp == 0x7FF )
3091 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3092 ) {
3093 return LIT64( 0x7FFFFFFFFFFFFFFF );
3094 }
3095 }
3096 return (int64_t) LIT64( 0x8000000000000000 );
3097 }
3098 z = aSig<<shiftCount;
3099 }
3100 else {
3101 if ( aExp < 0x3FE ) {
3102 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3103 return 0;
3104 }
3105 z = aSig>>( - shiftCount );
3106 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3107 STATUS(float_exception_flags) |= float_flag_inexact;
3108 }
3109 }
3110 if ( aSign ) z = - z;
3111 return z;
3112
3113 }
3114
3115 /*----------------------------------------------------------------------------
3116 | Returns the result of converting the double-precision floating-point value
3117 | `a' to the single-precision floating-point format. The conversion is
3118 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3119 | Arithmetic.
3120 *----------------------------------------------------------------------------*/
3121
3122 float32 float64_to_float32( float64 a STATUS_PARAM )
3123 {
3124 flag aSign;
3125 int_fast16_t aExp;
3126 uint64_t aSig;
3127 uint32_t zSig;
3128 a = float64_squash_input_denormal(a STATUS_VAR);
3129
3130 aSig = extractFloat64Frac( a );
3131 aExp = extractFloat64Exp( a );
3132 aSign = extractFloat64Sign( a );
3133 if ( aExp == 0x7FF ) {
3134 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3135 return packFloat32( aSign, 0xFF, 0 );
3136 }
3137 shift64RightJamming( aSig, 22, &aSig );
3138 zSig = aSig;
3139 if ( aExp || zSig ) {
3140 zSig |= 0x40000000;
3141 aExp -= 0x381;
3142 }
3143 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3144
3145 }
3146
3147
3148 /*----------------------------------------------------------------------------
3149 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3150 | half-precision floating-point value, returning the result. After being
3151 | shifted into the proper positions, the three fields are simply added
3152 | together to form the result. This means that any integer portion of `zSig'
3153 | will be added into the exponent. Since a properly normalized significand
3154 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3155 | than the desired result exponent whenever `zSig' is a complete, normalized
3156 | significand.
3157 *----------------------------------------------------------------------------*/
3158 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3159 {
3160 return make_float16(
3161 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3162 }
3163
3164 /*----------------------------------------------------------------------------
3165 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3166 | and significand `zSig', and returns the proper half-precision floating-
3167 | point value corresponding to the abstract input. Ordinarily, the abstract
3168 | value is simply rounded and packed into the half-precision format, with
3169 | the inexact exception raised if the abstract input cannot be represented
3170 | exactly. However, if the abstract value is too large, the overflow and
3171 | inexact exceptions are raised and an infinity or maximal finite value is
3172 | returned. If the abstract value is too small, the input value is rounded to
3173 | a subnormal number, and the underflow and inexact exceptions are raised if
3174 | the abstract input cannot be represented exactly as a subnormal half-
3175 | precision floating-point number.
3176 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3177 | ARM-style "alternative representation", which omits the NaN and Inf
3178 | encodings in order to raise the maximum representable exponent by one.
3179 | The input significand `zSig' has its binary point between bits 22
3180 | and 23, which is 13 bits to the left of the usual location. This shifted
3181 | significand must be normalized or smaller. If `zSig' is not normalized,
3182 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3183 | and it must not require rounding. In the usual case that `zSig' is
3184 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3185 | Note the slightly odd position of the binary point in zSig compared with the
3186 | other roundAndPackFloat functions. This should probably be fixed if we
3187 | need to implement more float16 routines than just conversion.
3188 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3189 | Binary Floating-Point Arithmetic.
3190 *----------------------------------------------------------------------------*/
3191
3192 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3193 uint32_t zSig, flag ieee STATUS_PARAM)
3194 {
3195 int maxexp = ieee ? 29 : 30;
3196 uint32_t mask;
3197 uint32_t increment;
3198 bool rounding_bumps_exp;
3199 bool is_tiny = false;
3200
3201 /* Calculate the mask of bits of the mantissa which are not
3202 * representable in half-precision and will be lost.
3203 */
3204 if (zExp < 1) {
3205 /* Will be denormal in halfprec */
3206 mask = 0x00ffffff;
3207 if (zExp >= -11) {
3208 mask >>= 11 + zExp;
3209 }
3210 } else {
3211 /* Normal number in halfprec */
3212 mask = 0x00001fff;
3213 }
3214
3215 switch (STATUS(float_rounding_mode)) {
3216 case float_round_nearest_even:
3217 increment = (mask + 1) >> 1;
3218 if ((zSig & mask) == increment) {
3219 increment = zSig & (increment << 1);
3220 }
3221 break;
3222 case float_round_ties_away:
3223 increment = (mask + 1) >> 1;
3224 break;
3225 case float_round_up:
3226 increment = zSign ? 0 : mask;
3227 break;
3228 case float_round_down:
3229 increment = zSign ? mask : 0;
3230 break;
3231 default: /* round_to_zero */
3232 increment = 0;
3233 break;
3234 }
3235
3236 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3237
3238 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3239 if (ieee) {
3240 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3241 return packFloat16(zSign, 0x1f, 0);
3242 } else {
3243 float_raise(float_flag_invalid STATUS_VAR);
3244 return packFloat16(zSign, 0x1f, 0x3ff);
3245 }
3246 }
3247
3248 if (zExp < 0) {
3249 /* Note that flush-to-zero does not affect half-precision results */
3250 is_tiny =
3251 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3252 || (zExp < -1)
3253 || (!rounding_bumps_exp);
3254 }
3255 if (zSig & mask) {
3256 float_raise(float_flag_inexact STATUS_VAR);
3257 if (is_tiny) {
3258 float_raise(float_flag_underflow STATUS_VAR);
3259 }
3260 }
3261
3262 zSig += increment;
3263 if (rounding_bumps_exp) {
3264 zSig >>= 1;
3265 zExp++;
3266 }
3267
3268 if (zExp < -10) {
3269 return packFloat16(zSign, 0, 0);
3270 }
3271 if (zExp < 0) {
3272 zSig >>= -zExp;
3273 zExp = 0;
3274 }
3275 return packFloat16(zSign, zExp, zSig >> 13);
3276 }
3277
3278 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
3279 uint32_t *zSigPtr)
3280 {
3281 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3282 *zSigPtr = aSig << shiftCount;
3283 *zExpPtr = 1 - shiftCount;
3284 }
3285
3286 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3287 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3288
3289 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3290 {
3291 flag aSign;
3292 int_fast16_t aExp;
3293 uint32_t aSig;
3294
3295 aSign = extractFloat16Sign(a);
3296 aExp = extractFloat16Exp(a);
3297 aSig = extractFloat16Frac(a);
3298
3299 if (aExp == 0x1f && ieee) {
3300 if (aSig) {
3301 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3302 }
3303 return packFloat32(aSign, 0xff, 0);
3304 }
3305 if (aExp == 0) {
3306 if (aSig == 0) {
3307 return packFloat32(aSign, 0, 0);
3308 }
3309
3310 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3311 aExp--;
3312 }
3313 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3314 }
3315
3316 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3317 {
3318 flag aSign;
3319 int_fast16_t aExp;
3320 uint32_t aSig;
3321
3322 a = float32_squash_input_denormal(a STATUS_VAR);
3323
3324 aSig = extractFloat32Frac( a );
3325 aExp = extractFloat32Exp( a );
3326 aSign = extractFloat32Sign( a );
3327 if ( aExp == 0xFF ) {
3328 if (aSig) {
3329 /* Input is a NaN */
3330 if (!ieee) {
3331 float_raise(float_flag_invalid STATUS_VAR);
3332 return packFloat16(aSign, 0, 0);
3333 }
3334 return commonNaNToFloat16(
3335 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3336 }
3337 /* Infinity */
3338 if (!ieee) {
3339 float_raise(float_flag_invalid STATUS_VAR);
3340 return packFloat16(aSign, 0x1f, 0x3ff);
3341 }
3342 return packFloat16(aSign, 0x1f, 0);
3343 }
3344 if (aExp == 0 && aSig == 0) {
3345 return packFloat16(aSign, 0, 0);
3346 }
3347 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3348 * even if the input is denormal; however this is harmless because
3349 * the largest possible single-precision denormal is still smaller
3350 * than the smallest representable half-precision denormal, and so we
3351 * will end up ignoring aSig and returning via the "always return zero"
3352 * codepath.
3353 */
3354 aSig |= 0x00800000;
3355 aExp -= 0x71;
3356
3357 return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
3358 }
3359
3360 float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3361 {
3362 flag aSign;
3363 int_fast16_t aExp;
3364 uint32_t aSig;
3365
3366 aSign = extractFloat16Sign(a);
3367 aExp = extractFloat16Exp(a);
3368 aSig = extractFloat16Frac(a);
3369
3370 if (aExp == 0x1f && ieee) {
3371 if (aSig) {
3372 return commonNaNToFloat64(
3373 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3374 }
3375 return pa