linux-user: Add default configs for mips64[el]
[qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
6
7 /*============================================================================
8
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
11
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
21
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
35
36 =============================================================================*/
37
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
40 */
41 #include "config.h"
42
43 #include "softfloat.h"
44
45 /*----------------------------------------------------------------------------
46 | Primitive arithmetic functions, including multi-word arithmetic, and
47 | division and square root approximations. (Can be specialized to target if
48 | desired.)
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-macros.h"
51
52 /*----------------------------------------------------------------------------
53 | Functions and definitions to determine: (1) whether tininess for underflow
54 | is detected before or after rounding by default, (2) what (if anything)
55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57 | are propagated from function inputs to output. These details are target-
58 | specific.
59 *----------------------------------------------------------------------------*/
60 #include "softfloat-specialize.h"
61
62 void set_float_rounding_mode(int val STATUS_PARAM)
63 {
64 STATUS(float_rounding_mode) = val;
65 }
66
67 void set_float_exception_flags(int val STATUS_PARAM)
68 {
69 STATUS(float_exception_flags) = val;
70 }
71
72 void set_floatx80_rounding_precision(int val STATUS_PARAM)
73 {
74 STATUS(floatx80_rounding_precision) = val;
75 }
76
77 /*----------------------------------------------------------------------------
78 | Returns the fraction bits of the half-precision floating-point value `a'.
79 *----------------------------------------------------------------------------*/
80
81 INLINE uint32_t extractFloat16Frac(float16 a)
82 {
83 return float16_val(a) & 0x3ff;
84 }
85
86 /*----------------------------------------------------------------------------
87 | Returns the exponent bits of the half-precision floating-point value `a'.
88 *----------------------------------------------------------------------------*/
89
90 INLINE int16 extractFloat16Exp(float16 a)
91 {
92 return (float16_val(a) >> 10) & 0x1f;
93 }
94
95 /*----------------------------------------------------------------------------
96 | Returns the sign bit of the single-precision floating-point value `a'.
97 *----------------------------------------------------------------------------*/
98
99 INLINE flag extractFloat16Sign(float16 a)
100 {
101 return float16_val(a)>>15;
102 }
103
104 /*----------------------------------------------------------------------------
105 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106 | and 7, and returns the properly rounded 32-bit integer corresponding to the
107 | input. If `zSign' is 1, the input is negated before being converted to an
108 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
109 | is simply rounded to an integer, with the inexact exception raised if the
110 | input cannot be represented exactly as an integer. However, if the fixed-
111 | point input is too large, the invalid exception is raised and the largest
112 | positive or negative integer is returned.
113 *----------------------------------------------------------------------------*/
114
115 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
116 {
117 int8 roundingMode;
118 flag roundNearestEven;
119 int8 roundIncrement, roundBits;
120 int32 z;
121
122 roundingMode = STATUS(float_rounding_mode);
123 roundNearestEven = ( roundingMode == float_round_nearest_even );
124 roundIncrement = 0x40;
125 if ( ! roundNearestEven ) {
126 if ( roundingMode == float_round_to_zero ) {
127 roundIncrement = 0;
128 }
129 else {
130 roundIncrement = 0x7F;
131 if ( zSign ) {
132 if ( roundingMode == float_round_up ) roundIncrement = 0;
133 }
134 else {
135 if ( roundingMode == float_round_down ) roundIncrement = 0;
136 }
137 }
138 }
139 roundBits = absZ & 0x7F;
140 absZ = ( absZ + roundIncrement )>>7;
141 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
142 z = absZ;
143 if ( zSign ) z = - z;
144 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145 float_raise( float_flag_invalid STATUS_VAR);
146 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
147 }
148 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
149 return z;
150
151 }
152
153 /*----------------------------------------------------------------------------
154 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155 | `absZ1', with binary point between bits 63 and 64 (between the input words),
156 | and returns the properly rounded 64-bit integer corresponding to the input.
157 | If `zSign' is 1, the input is negated before being converted to an integer.
158 | Ordinarily, the fixed-point input is simply rounded to an integer, with
159 | the inexact exception raised if the input cannot be represented exactly as
160 | an integer. However, if the fixed-point input is too large, the invalid
161 | exception is raised and the largest positive or negative integer is
162 | returned.
163 *----------------------------------------------------------------------------*/
164
165 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
166 {
167 int8 roundingMode;
168 flag roundNearestEven, increment;
169 int64 z;
170
171 roundingMode = STATUS(float_rounding_mode);
172 roundNearestEven = ( roundingMode == float_round_nearest_even );
173 increment = ( (int64_t) absZ1 < 0 );
174 if ( ! roundNearestEven ) {
175 if ( roundingMode == float_round_to_zero ) {
176 increment = 0;
177 }
178 else {
179 if ( zSign ) {
180 increment = ( roundingMode == float_round_down ) && absZ1;
181 }
182 else {
183 increment = ( roundingMode == float_round_up ) && absZ1;
184 }
185 }
186 }
187 if ( increment ) {
188 ++absZ0;
189 if ( absZ0 == 0 ) goto overflow;
190 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
191 }
192 z = absZ0;
193 if ( zSign ) z = - z;
194 if ( z && ( ( z < 0 ) ^ zSign ) ) {
195 overflow:
196 float_raise( float_flag_invalid STATUS_VAR);
197 return
198 zSign ? (int64_t) LIT64( 0x8000000000000000 )
199 : LIT64( 0x7FFFFFFFFFFFFFFF );
200 }
201 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
202 return z;
203
204 }
205
206 /*----------------------------------------------------------------------------
207 | Returns the fraction bits of the single-precision floating-point value `a'.
208 *----------------------------------------------------------------------------*/
209
210 INLINE uint32_t extractFloat32Frac( float32 a )
211 {
212
213 return float32_val(a) & 0x007FFFFF;
214
215 }
216
217 /*----------------------------------------------------------------------------
218 | Returns the exponent bits of the single-precision floating-point value `a'.
219 *----------------------------------------------------------------------------*/
220
221 INLINE int16 extractFloat32Exp( float32 a )
222 {
223
224 return ( float32_val(a)>>23 ) & 0xFF;
225
226 }
227
228 /*----------------------------------------------------------------------------
229 | Returns the sign bit of the single-precision floating-point value `a'.
230 *----------------------------------------------------------------------------*/
231
232 INLINE flag extractFloat32Sign( float32 a )
233 {
234
235 return float32_val(a)>>31;
236
237 }
238
239 /*----------------------------------------------------------------------------
240 | If `a' is denormal and we are in flush-to-zero mode then set the
241 | input-denormal exception and return zero. Otherwise just return the value.
242 *----------------------------------------------------------------------------*/
243 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
244 {
245 if (STATUS(flush_inputs_to_zero)) {
246 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
247 float_raise(float_flag_input_denormal STATUS_VAR);
248 return make_float32(float32_val(a) & 0x80000000);
249 }
250 }
251 return a;
252 }
253
254 /*----------------------------------------------------------------------------
255 | Normalizes the subnormal single-precision floating-point value represented
256 | by the denormalized significand `aSig'. The normalized exponent and
257 | significand are stored at the locations pointed to by `zExpPtr' and
258 | `zSigPtr', respectively.
259 *----------------------------------------------------------------------------*/
260
261 static void
262 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
263 {
264 int8 shiftCount;
265
266 shiftCount = countLeadingZeros32( aSig ) - 8;
267 *zSigPtr = aSig<<shiftCount;
268 *zExpPtr = 1 - shiftCount;
269
270 }
271
272 /*----------------------------------------------------------------------------
273 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274 | single-precision floating-point value, returning the result. After being
275 | shifted into the proper positions, the three fields are simply added
276 | together to form the result. This means that any integer portion of `zSig'
277 | will be added into the exponent. Since a properly normalized significand
278 | will have an integer portion equal to 1, the `zExp' input should be 1 less
279 | than the desired result exponent whenever `zSig' is a complete, normalized
280 | significand.
281 *----------------------------------------------------------------------------*/
282
283 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
284 {
285
286 return make_float32(
287 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
288
289 }
290
291 /*----------------------------------------------------------------------------
292 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293 | and significand `zSig', and returns the proper single-precision floating-
294 | point value corresponding to the abstract input. Ordinarily, the abstract
295 | value is simply rounded and packed into the single-precision format, with
296 | the inexact exception raised if the abstract input cannot be represented
297 | exactly. However, if the abstract value is too large, the overflow and
298 | inexact exceptions are raised and an infinity or maximal finite value is
299 | returned. If the abstract value is too small, the input value is rounded to
300 | a subnormal number, and the underflow and inexact exceptions are raised if
301 | the abstract input cannot be represented exactly as a subnormal single-
302 | precision floating-point number.
303 | The input significand `zSig' has its binary point between bits 30
304 | and 29, which is 7 bits to the left of the usual location. This shifted
305 | significand must be normalized or smaller. If `zSig' is not normalized,
306 | `zExp' must be 0; in that case, the result returned is a subnormal number,
307 | and it must not require rounding. In the usual case that `zSig' is
308 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309 | The handling of underflow and overflow follows the IEC/IEEE Standard for
310 | Binary Floating-Point Arithmetic.
311 *----------------------------------------------------------------------------*/
312
313 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
314 {
315 int8 roundingMode;
316 flag roundNearestEven;
317 int8 roundIncrement, roundBits;
318 flag isTiny;
319
320 roundingMode = STATUS(float_rounding_mode);
321 roundNearestEven = ( roundingMode == float_round_nearest_even );
322 roundIncrement = 0x40;
323 if ( ! roundNearestEven ) {
324 if ( roundingMode == float_round_to_zero ) {
325 roundIncrement = 0;
326 }
327 else {
328 roundIncrement = 0x7F;
329 if ( zSign ) {
330 if ( roundingMode == float_round_up ) roundIncrement = 0;
331 }
332 else {
333 if ( roundingMode == float_round_down ) roundIncrement = 0;
334 }
335 }
336 }
337 roundBits = zSig & 0x7F;
338 if ( 0xFD <= (uint16_t) zExp ) {
339 if ( ( 0xFD < zExp )
340 || ( ( zExp == 0xFD )
341 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
342 ) {
343 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
344 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
345 }
346 if ( zExp < 0 ) {
347 if (STATUS(flush_to_zero)) {
348 float_raise(float_flag_output_denormal STATUS_VAR);
349 return packFloat32(zSign, 0, 0);
350 }
351 isTiny =
352 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
353 || ( zExp < -1 )
354 || ( zSig + roundIncrement < 0x80000000 );
355 shift32RightJamming( zSig, - zExp, &zSig );
356 zExp = 0;
357 roundBits = zSig & 0x7F;
358 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
359 }
360 }
361 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
362 zSig = ( zSig + roundIncrement )>>7;
363 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
364 if ( zSig == 0 ) zExp = 0;
365 return packFloat32( zSign, zExp, zSig );
366
367 }
368
369 /*----------------------------------------------------------------------------
370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371 | and significand `zSig', and returns the proper single-precision floating-
372 | point value corresponding to the abstract input. This routine is just like
373 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
374 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
375 | floating-point exponent.
376 *----------------------------------------------------------------------------*/
377
378 static float32
379 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
380 {
381 int8 shiftCount;
382
383 shiftCount = countLeadingZeros32( zSig ) - 1;
384 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
385
386 }
387
388 /*----------------------------------------------------------------------------
389 | Returns the fraction bits of the double-precision floating-point value `a'.
390 *----------------------------------------------------------------------------*/
391
392 INLINE uint64_t extractFloat64Frac( float64 a )
393 {
394
395 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
396
397 }
398
399 /*----------------------------------------------------------------------------
400 | Returns the exponent bits of the double-precision floating-point value `a'.
401 *----------------------------------------------------------------------------*/
402
403 INLINE int16 extractFloat64Exp( float64 a )
404 {
405
406 return ( float64_val(a)>>52 ) & 0x7FF;
407
408 }
409
410 /*----------------------------------------------------------------------------
411 | Returns the sign bit of the double-precision floating-point value `a'.
412 *----------------------------------------------------------------------------*/
413
414 INLINE flag extractFloat64Sign( float64 a )
415 {
416
417 return float64_val(a)>>63;
418
419 }
420
421 /*----------------------------------------------------------------------------
422 | If `a' is denormal and we are in flush-to-zero mode then set the
423 | input-denormal exception and return zero. Otherwise just return the value.
424 *----------------------------------------------------------------------------*/
425 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
426 {
427 if (STATUS(flush_inputs_to_zero)) {
428 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
429 float_raise(float_flag_input_denormal STATUS_VAR);
430 return make_float64(float64_val(a) & (1ULL << 63));
431 }
432 }
433 return a;
434 }
435
436 /*----------------------------------------------------------------------------
437 | Normalizes the subnormal double-precision floating-point value represented
438 | by the denormalized significand `aSig'. The normalized exponent and
439 | significand are stored at the locations pointed to by `zExpPtr' and
440 | `zSigPtr', respectively.
441 *----------------------------------------------------------------------------*/
442
443 static void
444 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
445 {
446 int8 shiftCount;
447
448 shiftCount = countLeadingZeros64( aSig ) - 11;
449 *zSigPtr = aSig<<shiftCount;
450 *zExpPtr = 1 - shiftCount;
451
452 }
453
454 /*----------------------------------------------------------------------------
455 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
456 | double-precision floating-point value, returning the result. After being
457 | shifted into the proper positions, the three fields are simply added
458 | together to form the result. This means that any integer portion of `zSig'
459 | will be added into the exponent. Since a properly normalized significand
460 | will have an integer portion equal to 1, the `zExp' input should be 1 less
461 | than the desired result exponent whenever `zSig' is a complete, normalized
462 | significand.
463 *----------------------------------------------------------------------------*/
464
465 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
466 {
467
468 return make_float64(
469 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
470
471 }
472
473 /*----------------------------------------------------------------------------
474 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
475 | and significand `zSig', and returns the proper double-precision floating-
476 | point value corresponding to the abstract input. Ordinarily, the abstract
477 | value is simply rounded and packed into the double-precision format, with
478 | the inexact exception raised if the abstract input cannot be represented
479 | exactly. However, if the abstract value is too large, the overflow and
480 | inexact exceptions are raised and an infinity or maximal finite value is
481 | returned. If the abstract value is too small, the input value is rounded
482 | to a subnormal number, and the underflow and inexact exceptions are raised
483 | if the abstract input cannot be represented exactly as a subnormal double-
484 | precision floating-point number.
485 | The input significand `zSig' has its binary point between bits 62
486 | and 61, which is 10 bits to the left of the usual location. This shifted
487 | significand must be normalized or smaller. If `zSig' is not normalized,
488 | `zExp' must be 0; in that case, the result returned is a subnormal number,
489 | and it must not require rounding. In the usual case that `zSig' is
490 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
491 | The handling of underflow and overflow follows the IEC/IEEE Standard for
492 | Binary Floating-Point Arithmetic.
493 *----------------------------------------------------------------------------*/
494
495 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
496 {
497 int8 roundingMode;
498 flag roundNearestEven;
499 int16 roundIncrement, roundBits;
500 flag isTiny;
501
502 roundingMode = STATUS(float_rounding_mode);
503 roundNearestEven = ( roundingMode == float_round_nearest_even );
504 roundIncrement = 0x200;
505 if ( ! roundNearestEven ) {
506 if ( roundingMode == float_round_to_zero ) {
507 roundIncrement = 0;
508 }
509 else {
510 roundIncrement = 0x3FF;
511 if ( zSign ) {
512 if ( roundingMode == float_round_up ) roundIncrement = 0;
513 }
514 else {
515 if ( roundingMode == float_round_down ) roundIncrement = 0;
516 }
517 }
518 }
519 roundBits = zSig & 0x3FF;
520 if ( 0x7FD <= (uint16_t) zExp ) {
521 if ( ( 0x7FD < zExp )
522 || ( ( zExp == 0x7FD )
523 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
524 ) {
525 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
526 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
527 }
528 if ( zExp < 0 ) {
529 if (STATUS(flush_to_zero)) {
530 float_raise(float_flag_output_denormal STATUS_VAR);
531 return packFloat64(zSign, 0, 0);
532 }
533 isTiny =
534 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
535 || ( zExp < -1 )
536 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
537 shift64RightJamming( zSig, - zExp, &zSig );
538 zExp = 0;
539 roundBits = zSig & 0x3FF;
540 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
541 }
542 }
543 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
544 zSig = ( zSig + roundIncrement )>>10;
545 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
546 if ( zSig == 0 ) zExp = 0;
547 return packFloat64( zSign, zExp, zSig );
548
549 }
550
551 /*----------------------------------------------------------------------------
552 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
553 | and significand `zSig', and returns the proper double-precision floating-
554 | point value corresponding to the abstract input. This routine is just like
555 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
556 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
557 | floating-point exponent.
558 *----------------------------------------------------------------------------*/
559
560 static float64
561 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
562 {
563 int8 shiftCount;
564
565 shiftCount = countLeadingZeros64( zSig ) - 1;
566 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
567
568 }
569
570 /*----------------------------------------------------------------------------
571 | Returns the fraction bits of the extended double-precision floating-point
572 | value `a'.
573 *----------------------------------------------------------------------------*/
574
575 INLINE uint64_t extractFloatx80Frac( floatx80 a )
576 {
577
578 return a.low;
579
580 }
581
582 /*----------------------------------------------------------------------------
583 | Returns the exponent bits of the extended double-precision floating-point
584 | value `a'.
585 *----------------------------------------------------------------------------*/
586
587 INLINE int32 extractFloatx80Exp( floatx80 a )
588 {
589
590 return a.high & 0x7FFF;
591
592 }
593
594 /*----------------------------------------------------------------------------
595 | Returns the sign bit of the extended double-precision floating-point value
596 | `a'.
597 *----------------------------------------------------------------------------*/
598
599 INLINE flag extractFloatx80Sign( floatx80 a )
600 {
601
602 return a.high>>15;
603
604 }
605
606 /*----------------------------------------------------------------------------
607 | Normalizes the subnormal extended double-precision floating-point value
608 | represented by the denormalized significand `aSig'. The normalized exponent
609 | and significand are stored at the locations pointed to by `zExpPtr' and
610 | `zSigPtr', respectively.
611 *----------------------------------------------------------------------------*/
612
613 static void
614 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
615 {
616 int8 shiftCount;
617
618 shiftCount = countLeadingZeros64( aSig );
619 *zSigPtr = aSig<<shiftCount;
620 *zExpPtr = 1 - shiftCount;
621
622 }
623
624 /*----------------------------------------------------------------------------
625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
626 | extended double-precision floating-point value, returning the result.
627 *----------------------------------------------------------------------------*/
628
629 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
630 {
631 floatx80 z;
632
633 z.low = zSig;
634 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
635 return z;
636
637 }
638
639 /*----------------------------------------------------------------------------
640 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
641 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
642 | and returns the proper extended double-precision floating-point value
643 | corresponding to the abstract input. Ordinarily, the abstract value is
644 | rounded and packed into the extended double-precision format, with the
645 | inexact exception raised if the abstract input cannot be represented
646 | exactly. However, if the abstract value is too large, the overflow and
647 | inexact exceptions are raised and an infinity or maximal finite value is
648 | returned. If the abstract value is too small, the input value is rounded to
649 | a subnormal number, and the underflow and inexact exceptions are raised if
650 | the abstract input cannot be represented exactly as a subnormal extended
651 | double-precision floating-point number.
652 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
653 | number of bits as single or double precision, respectively. Otherwise, the
654 | result is rounded to the full precision of the extended double-precision
655 | format.
656 | The input significand must be normalized or smaller. If the input
657 | significand is not normalized, `zExp' must be 0; in that case, the result
658 | returned is a subnormal number, and it must not require rounding. The
659 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
660 | Floating-Point Arithmetic.
661 *----------------------------------------------------------------------------*/
662
663 static floatx80
664 roundAndPackFloatx80(
665 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
666 STATUS_PARAM)
667 {
668 int8 roundingMode;
669 flag roundNearestEven, increment, isTiny;
670 int64 roundIncrement, roundMask, roundBits;
671
672 roundingMode = STATUS(float_rounding_mode);
673 roundNearestEven = ( roundingMode == float_round_nearest_even );
674 if ( roundingPrecision == 80 ) goto precision80;
675 if ( roundingPrecision == 64 ) {
676 roundIncrement = LIT64( 0x0000000000000400 );
677 roundMask = LIT64( 0x00000000000007FF );
678 }
679 else if ( roundingPrecision == 32 ) {
680 roundIncrement = LIT64( 0x0000008000000000 );
681 roundMask = LIT64( 0x000000FFFFFFFFFF );
682 }
683 else {
684 goto precision80;
685 }
686 zSig0 |= ( zSig1 != 0 );
687 if ( ! roundNearestEven ) {
688 if ( roundingMode == float_round_to_zero ) {
689 roundIncrement = 0;
690 }
691 else {
692 roundIncrement = roundMask;
693 if ( zSign ) {
694 if ( roundingMode == float_round_up ) roundIncrement = 0;
695 }
696 else {
697 if ( roundingMode == float_round_down ) roundIncrement = 0;
698 }
699 }
700 }
701 roundBits = zSig0 & roundMask;
702 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
703 if ( ( 0x7FFE < zExp )
704 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
705 ) {
706 goto overflow;
707 }
708 if ( zExp <= 0 ) {
709 if (STATUS(flush_to_zero)) {
710 float_raise(float_flag_output_denormal STATUS_VAR);
711 return packFloatx80(zSign, 0, 0);
712 }
713 isTiny =
714 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
715 || ( zExp < 0 )
716 || ( zSig0 <= zSig0 + roundIncrement );
717 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
718 zExp = 0;
719 roundBits = zSig0 & roundMask;
720 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
721 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
722 zSig0 += roundIncrement;
723 if ( (int64_t) zSig0 < 0 ) zExp = 1;
724 roundIncrement = roundMask + 1;
725 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
726 roundMask |= roundIncrement;
727 }
728 zSig0 &= ~ roundMask;
729 return packFloatx80( zSign, zExp, zSig0 );
730 }
731 }
732 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
733 zSig0 += roundIncrement;
734 if ( zSig0 < roundIncrement ) {
735 ++zExp;
736 zSig0 = LIT64( 0x8000000000000000 );
737 }
738 roundIncrement = roundMask + 1;
739 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
740 roundMask |= roundIncrement;
741 }
742 zSig0 &= ~ roundMask;
743 if ( zSig0 == 0 ) zExp = 0;
744 return packFloatx80( zSign, zExp, zSig0 );
745 precision80:
746 increment = ( (int64_t) zSig1 < 0 );
747 if ( ! roundNearestEven ) {
748 if ( roundingMode == float_round_to_zero ) {
749 increment = 0;
750 }
751 else {
752 if ( zSign ) {
753 increment = ( roundingMode == float_round_down ) && zSig1;
754 }
755 else {
756 increment = ( roundingMode == float_round_up ) && zSig1;
757 }
758 }
759 }
760 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
761 if ( ( 0x7FFE < zExp )
762 || ( ( zExp == 0x7FFE )
763 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
764 && increment
765 )
766 ) {
767 roundMask = 0;
768 overflow:
769 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
770 if ( ( roundingMode == float_round_to_zero )
771 || ( zSign && ( roundingMode == float_round_up ) )
772 || ( ! zSign && ( roundingMode == float_round_down ) )
773 ) {
774 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
775 }
776 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
777 }
778 if ( zExp <= 0 ) {
779 isTiny =
780 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
781 || ( zExp < 0 )
782 || ! increment
783 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
784 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
785 zExp = 0;
786 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
787 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
788 if ( roundNearestEven ) {
789 increment = ( (int64_t) zSig1 < 0 );
790 }
791 else {
792 if ( zSign ) {
793 increment = ( roundingMode == float_round_down ) && zSig1;
794 }
795 else {
796 increment = ( roundingMode == float_round_up ) && zSig1;
797 }
798 }
799 if ( increment ) {
800 ++zSig0;
801 zSig0 &=
802 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
803 if ( (int64_t) zSig0 < 0 ) zExp = 1;
804 }
805 return packFloatx80( zSign, zExp, zSig0 );
806 }
807 }
808 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
809 if ( increment ) {
810 ++zSig0;
811 if ( zSig0 == 0 ) {
812 ++zExp;
813 zSig0 = LIT64( 0x8000000000000000 );
814 }
815 else {
816 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
817 }
818 }
819 else {
820 if ( zSig0 == 0 ) zExp = 0;
821 }
822 return packFloatx80( zSign, zExp, zSig0 );
823
824 }
825
826 /*----------------------------------------------------------------------------
827 | Takes an abstract floating-point value having sign `zSign', exponent
828 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
829 | and returns the proper extended double-precision floating-point value
830 | corresponding to the abstract input. This routine is just like
831 | `roundAndPackFloatx80' except that the input significand does not have to be
832 | normalized.
833 *----------------------------------------------------------------------------*/
834
835 static floatx80
836 normalizeRoundAndPackFloatx80(
837 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
838 STATUS_PARAM)
839 {
840 int8 shiftCount;
841
842 if ( zSig0 == 0 ) {
843 zSig0 = zSig1;
844 zSig1 = 0;
845 zExp -= 64;
846 }
847 shiftCount = countLeadingZeros64( zSig0 );
848 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
849 zExp -= shiftCount;
850 return
851 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
852
853 }
854
855 /*----------------------------------------------------------------------------
856 | Returns the least-significant 64 fraction bits of the quadruple-precision
857 | floating-point value `a'.
858 *----------------------------------------------------------------------------*/
859
860 INLINE uint64_t extractFloat128Frac1( float128 a )
861 {
862
863 return a.low;
864
865 }
866
867 /*----------------------------------------------------------------------------
868 | Returns the most-significant 48 fraction bits of the quadruple-precision
869 | floating-point value `a'.
870 *----------------------------------------------------------------------------*/
871
872 INLINE uint64_t extractFloat128Frac0( float128 a )
873 {
874
875 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
876
877 }
878
879 /*----------------------------------------------------------------------------
880 | Returns the exponent bits of the quadruple-precision floating-point value
881 | `a'.
882 *----------------------------------------------------------------------------*/
883
884 INLINE int32 extractFloat128Exp( float128 a )
885 {
886
887 return ( a.high>>48 ) & 0x7FFF;
888
889 }
890
891 /*----------------------------------------------------------------------------
892 | Returns the sign bit of the quadruple-precision floating-point value `a'.
893 *----------------------------------------------------------------------------*/
894
895 INLINE flag extractFloat128Sign( float128 a )
896 {
897
898 return a.high>>63;
899
900 }
901
902 /*----------------------------------------------------------------------------
903 | Normalizes the subnormal quadruple-precision floating-point value
904 | represented by the denormalized significand formed by the concatenation of
905 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
906 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
907 | significand are stored at the location pointed to by `zSig0Ptr', and the
908 | least significant 64 bits of the normalized significand are stored at the
909 | location pointed to by `zSig1Ptr'.
910 *----------------------------------------------------------------------------*/
911
912 static void
913 normalizeFloat128Subnormal(
914 uint64_t aSig0,
915 uint64_t aSig1,
916 int32 *zExpPtr,
917 uint64_t *zSig0Ptr,
918 uint64_t *zSig1Ptr
919 )
920 {
921 int8 shiftCount;
922
923 if ( aSig0 == 0 ) {
924 shiftCount = countLeadingZeros64( aSig1 ) - 15;
925 if ( shiftCount < 0 ) {
926 *zSig0Ptr = aSig1>>( - shiftCount );
927 *zSig1Ptr = aSig1<<( shiftCount & 63 );
928 }
929 else {
930 *zSig0Ptr = aSig1<<shiftCount;
931 *zSig1Ptr = 0;
932 }
933 *zExpPtr = - shiftCount - 63;
934 }
935 else {
936 shiftCount = countLeadingZeros64( aSig0 ) - 15;
937 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
938 *zExpPtr = 1 - shiftCount;
939 }
940
941 }
942
943 /*----------------------------------------------------------------------------
944 | Packs the sign `zSign', the exponent `zExp', and the significand formed
945 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
946 | floating-point value, returning the result. After being shifted into the
947 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
948 | added together to form the most significant 32 bits of the result. This
949 | means that any integer portion of `zSig0' will be added into the exponent.
950 | Since a properly normalized significand will have an integer portion equal
951 | to 1, the `zExp' input should be 1 less than the desired result exponent
952 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
953 | significand.
954 *----------------------------------------------------------------------------*/
955
956 INLINE float128
957 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
958 {
959 float128 z;
960
961 z.low = zSig1;
962 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
963 return z;
964
965 }
966
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
969 | and extended significand formed by the concatenation of `zSig0', `zSig1',
970 | and `zSig2', and returns the proper quadruple-precision floating-point value
971 | corresponding to the abstract input. Ordinarily, the abstract value is
972 | simply rounded and packed into the quadruple-precision format, with the
973 | inexact exception raised if the abstract input cannot be represented
974 | exactly. However, if the abstract value is too large, the overflow and
975 | inexact exceptions are raised and an infinity or maximal finite value is
976 | returned. If the abstract value is too small, the input value is rounded to
977 | a subnormal number, and the underflow and inexact exceptions are raised if
978 | the abstract input cannot be represented exactly as a subnormal quadruple-
979 | precision floating-point number.
980 | The input significand must be normalized or smaller. If the input
981 | significand is not normalized, `zExp' must be 0; in that case, the result
982 | returned is a subnormal number, and it must not require rounding. In the
983 | usual case that the input significand is normalized, `zExp' must be 1 less
984 | than the ``true'' floating-point exponent. The handling of underflow and
985 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
986 *----------------------------------------------------------------------------*/
987
988 static float128
989 roundAndPackFloat128(
990 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
991 {
992 int8 roundingMode;
993 flag roundNearestEven, increment, isTiny;
994
995 roundingMode = STATUS(float_rounding_mode);
996 roundNearestEven = ( roundingMode == float_round_nearest_even );
997 increment = ( (int64_t) zSig2 < 0 );
998 if ( ! roundNearestEven ) {
999 if ( roundingMode == float_round_to_zero ) {
1000 increment = 0;
1001 }
1002 else {
1003 if ( zSign ) {
1004 increment = ( roundingMode == float_round_down ) && zSig2;
1005 }
1006 else {
1007 increment = ( roundingMode == float_round_up ) && zSig2;
1008 }
1009 }
1010 }
1011 if ( 0x7FFD <= (uint32_t) zExp ) {
1012 if ( ( 0x7FFD < zExp )
1013 || ( ( zExp == 0x7FFD )
1014 && eq128(
1015 LIT64( 0x0001FFFFFFFFFFFF ),
1016 LIT64( 0xFFFFFFFFFFFFFFFF ),
1017 zSig0,
1018 zSig1
1019 )
1020 && increment
1021 )
1022 ) {
1023 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024 if ( ( roundingMode == float_round_to_zero )
1025 || ( zSign && ( roundingMode == float_round_up ) )
1026 || ( ! zSign && ( roundingMode == float_round_down ) )
1027 ) {
1028 return
1029 packFloat128(
1030 zSign,
1031 0x7FFE,
1032 LIT64( 0x0000FFFFFFFFFFFF ),
1033 LIT64( 0xFFFFFFFFFFFFFFFF )
1034 );
1035 }
1036 return packFloat128( zSign, 0x7FFF, 0, 0 );
1037 }
1038 if ( zExp < 0 ) {
1039 if (STATUS(flush_to_zero)) {
1040 float_raise(float_flag_output_denormal STATUS_VAR);
1041 return packFloat128(zSign, 0, 0, 0);
1042 }
1043 isTiny =
1044 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045 || ( zExp < -1 )
1046 || ! increment
1047 || lt128(
1048 zSig0,
1049 zSig1,
1050 LIT64( 0x0001FFFFFFFFFFFF ),
1051 LIT64( 0xFFFFFFFFFFFFFFFF )
1052 );
1053 shift128ExtraRightJamming(
1054 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055 zExp = 0;
1056 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057 if ( roundNearestEven ) {
1058 increment = ( (int64_t) zSig2 < 0 );
1059 }
1060 else {
1061 if ( zSign ) {
1062 increment = ( roundingMode == float_round_down ) && zSig2;
1063 }
1064 else {
1065 increment = ( roundingMode == float_round_up ) && zSig2;
1066 }
1067 }
1068 }
1069 }
1070 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071 if ( increment ) {
1072 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074 }
1075 else {
1076 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077 }
1078 return packFloat128( zSign, zExp, zSig0, zSig1 );
1079
1080 }
1081
1082 /*----------------------------------------------------------------------------
1083 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1085 | returns the proper quadruple-precision floating-point value corresponding
1086 | to the abstract input. This routine is just like `roundAndPackFloat128'
1087 | except that the input significand has fewer bits and does not have to be
1088 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1089 | point exponent.
1090 *----------------------------------------------------------------------------*/
1091
1092 static float128
1093 normalizeRoundAndPackFloat128(
1094 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1095 {
1096 int8 shiftCount;
1097 uint64_t zSig2;
1098
1099 if ( zSig0 == 0 ) {
1100 zSig0 = zSig1;
1101 zSig1 = 0;
1102 zExp -= 64;
1103 }
1104 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105 if ( 0 <= shiftCount ) {
1106 zSig2 = 0;
1107 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108 }
1109 else {
1110 shift128ExtraRightJamming(
1111 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112 }
1113 zExp -= shiftCount;
1114 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1115
1116 }
1117
1118 /*----------------------------------------------------------------------------
1119 | Returns the result of converting the 32-bit two's complement integer `a'
1120 | to the single-precision floating-point format. The conversion is performed
1121 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122 *----------------------------------------------------------------------------*/
1123
1124 float32 int32_to_float32( int32 a STATUS_PARAM )
1125 {
1126 flag zSign;
1127
1128 if ( a == 0 ) return float32_zero;
1129 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1130 zSign = ( a < 0 );
1131 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1132
1133 }
1134
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 32-bit two's complement integer `a'
1137 | to the double-precision floating-point format. The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140
1141 float64 int32_to_float64( int32 a STATUS_PARAM )
1142 {
1143 flag zSign;
1144 uint32 absA;
1145 int8 shiftCount;
1146 uint64_t zSig;
1147
1148 if ( a == 0 ) return float64_zero;
1149 zSign = ( a < 0 );
1150 absA = zSign ? - a : a;
1151 shiftCount = countLeadingZeros32( absA ) + 21;
1152 zSig = absA;
1153 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1154
1155 }
1156
1157 /*----------------------------------------------------------------------------
1158 | Returns the result of converting the 32-bit two's complement integer `a'
1159 | to the extended double-precision floating-point format. The conversion
1160 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161 | Arithmetic.
1162 *----------------------------------------------------------------------------*/
1163
1164 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1165 {
1166 flag zSign;
1167 uint32 absA;
1168 int8 shiftCount;
1169 uint64_t zSig;
1170
1171 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172 zSign = ( a < 0 );
1173 absA = zSign ? - a : a;
1174 shiftCount = countLeadingZeros32( absA ) + 32;
1175 zSig = absA;
1176 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177
1178 }
1179
1180 /*----------------------------------------------------------------------------
1181 | Returns the result of converting the 32-bit two's complement integer `a' to
1182 | the quadruple-precision floating-point format. The conversion is performed
1183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184 *----------------------------------------------------------------------------*/
1185
1186 float128 int32_to_float128( int32 a STATUS_PARAM )
1187 {
1188 flag zSign;
1189 uint32 absA;
1190 int8 shiftCount;
1191 uint64_t zSig0;
1192
1193 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194 zSign = ( a < 0 );
1195 absA = zSign ? - a : a;
1196 shiftCount = countLeadingZeros32( absA ) + 17;
1197 zSig0 = absA;
1198 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199
1200 }
1201
1202 /*----------------------------------------------------------------------------
1203 | Returns the result of converting the 64-bit two's complement integer `a'
1204 | to the single-precision floating-point format. The conversion is performed
1205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206 *----------------------------------------------------------------------------*/
1207
1208 float32 int64_to_float32( int64 a STATUS_PARAM )
1209 {
1210 flag zSign;
1211 uint64 absA;
1212 int8 shiftCount;
1213
1214 if ( a == 0 ) return float32_zero;
1215 zSign = ( a < 0 );
1216 absA = zSign ? - a : a;
1217 shiftCount = countLeadingZeros64( absA ) - 40;
1218 if ( 0 <= shiftCount ) {
1219 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1220 }
1221 else {
1222 shiftCount += 7;
1223 if ( shiftCount < 0 ) {
1224 shift64RightJamming( absA, - shiftCount, &absA );
1225 }
1226 else {
1227 absA <<= shiftCount;
1228 }
1229 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230 }
1231
1232 }
1233
1234 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1235 {
1236 int8 shiftCount;
1237
1238 if ( a == 0 ) return float32_zero;
1239 shiftCount = countLeadingZeros64( a ) - 40;
1240 if ( 0 <= shiftCount ) {
1241 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1242 }
1243 else {
1244 shiftCount += 7;
1245 if ( shiftCount < 0 ) {
1246 shift64RightJamming( a, - shiftCount, &a );
1247 }
1248 else {
1249 a <<= shiftCount;
1250 }
1251 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1252 }
1253 }
1254
1255 /*----------------------------------------------------------------------------
1256 | Returns the result of converting the 64-bit two's complement integer `a'
1257 | to the double-precision floating-point format. The conversion is performed
1258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259 *----------------------------------------------------------------------------*/
1260
1261 float64 int64_to_float64( int64 a STATUS_PARAM )
1262 {
1263 flag zSign;
1264
1265 if ( a == 0 ) return float64_zero;
1266 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1267 return packFloat64( 1, 0x43E, 0 );
1268 }
1269 zSign = ( a < 0 );
1270 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1271
1272 }
1273
1274 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1275 {
1276 if ( a == 0 ) return float64_zero;
1277 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1278
1279 }
1280
1281 /*----------------------------------------------------------------------------
1282 | Returns the result of converting the 64-bit two's complement integer `a'
1283 | to the extended double-precision floating-point format. The conversion
1284 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1285 | Arithmetic.
1286 *----------------------------------------------------------------------------*/
1287
1288 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1289 {
1290 flag zSign;
1291 uint64 absA;
1292 int8 shiftCount;
1293
1294 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1295 zSign = ( a < 0 );
1296 absA = zSign ? - a : a;
1297 shiftCount = countLeadingZeros64( absA );
1298 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1299
1300 }
1301
1302 /*----------------------------------------------------------------------------
1303 | Returns the result of converting the 64-bit two's complement integer `a' to
1304 | the quadruple-precision floating-point format. The conversion is performed
1305 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1306 *----------------------------------------------------------------------------*/
1307
1308 float128 int64_to_float128( int64 a STATUS_PARAM )
1309 {
1310 flag zSign;
1311 uint64 absA;
1312 int8 shiftCount;
1313 int32 zExp;
1314 uint64_t zSig0, zSig1;
1315
1316 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1317 zSign = ( a < 0 );
1318 absA = zSign ? - a : a;
1319 shiftCount = countLeadingZeros64( absA ) + 49;
1320 zExp = 0x406E - shiftCount;
1321 if ( 64 <= shiftCount ) {
1322 zSig1 = 0;
1323 zSig0 = absA;
1324 shiftCount -= 64;
1325 }
1326 else {
1327 zSig1 = absA;
1328 zSig0 = 0;
1329 }
1330 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1331 return packFloat128( zSign, zExp, zSig0, zSig1 );
1332
1333 }
1334
1335 /*----------------------------------------------------------------------------
1336 | Returns the result of converting the single-precision floating-point value
1337 | `a' to the 32-bit two's complement integer format. The conversion is
1338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1339 | Arithmetic---which means in particular that the conversion is rounded
1340 | according to the current rounding mode. If `a' is a NaN, the largest
1341 | positive integer is returned. Otherwise, if the conversion overflows, the
1342 | largest integer with the same sign as `a' is returned.
1343 *----------------------------------------------------------------------------*/
1344
1345 int32 float32_to_int32( float32 a STATUS_PARAM )
1346 {
1347 flag aSign;
1348 int16 aExp, shiftCount;
1349 uint32_t aSig;
1350 uint64_t aSig64;
1351
1352 a = float32_squash_input_denormal(a STATUS_VAR);
1353 aSig = extractFloat32Frac( a );
1354 aExp = extractFloat32Exp( a );
1355 aSign = extractFloat32Sign( a );
1356 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1357 if ( aExp ) aSig |= 0x00800000;
1358 shiftCount = 0xAF - aExp;
1359 aSig64 = aSig;
1360 aSig64 <<= 32;
1361 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1362 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1363
1364 }
1365
1366 /*----------------------------------------------------------------------------
1367 | Returns the result of converting the single-precision floating-point value
1368 | `a' to the 32-bit two's complement integer format. The conversion is
1369 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1370 | Arithmetic, except that the conversion is always rounded toward zero.
1371 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1372 | the conversion overflows, the largest integer with the same sign as `a' is
1373 | returned.
1374 *----------------------------------------------------------------------------*/
1375
1376 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1377 {
1378 flag aSign;
1379 int16 aExp, shiftCount;
1380 uint32_t aSig;
1381 int32 z;
1382 a = float32_squash_input_denormal(a STATUS_VAR);
1383
1384 aSig = extractFloat32Frac( a );
1385 aExp = extractFloat32Exp( a );
1386 aSign = extractFloat32Sign( a );
1387 shiftCount = aExp - 0x9E;
1388 if ( 0 <= shiftCount ) {
1389 if ( float32_val(a) != 0xCF000000 ) {
1390 float_raise( float_flag_invalid STATUS_VAR);
1391 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1392 }
1393 return (int32_t) 0x80000000;
1394 }
1395 else if ( aExp <= 0x7E ) {
1396 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1397 return 0;
1398 }
1399 aSig = ( aSig | 0x00800000 )<<8;
1400 z = aSig>>( - shiftCount );
1401 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1402 STATUS(float_exception_flags) |= float_flag_inexact;
1403 }
1404 if ( aSign ) z = - z;
1405 return z;
1406
1407 }
1408
1409 /*----------------------------------------------------------------------------
1410 | Returns the result of converting the single-precision floating-point value
1411 | `a' to the 16-bit two's complement integer format. The conversion is
1412 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1413 | Arithmetic, except that the conversion is always rounded toward zero.
1414 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1415 | the conversion overflows, the largest integer with the same sign as `a' is
1416 | returned.
1417 *----------------------------------------------------------------------------*/
1418
1419 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1420 {
1421 flag aSign;
1422 int16 aExp, shiftCount;
1423 uint32_t aSig;
1424 int32 z;
1425
1426 aSig = extractFloat32Frac( a );
1427 aExp = extractFloat32Exp( a );
1428 aSign = extractFloat32Sign( a );
1429 shiftCount = aExp - 0x8E;
1430 if ( 0 <= shiftCount ) {
1431 if ( float32_val(a) != 0xC7000000 ) {
1432 float_raise( float_flag_invalid STATUS_VAR);
1433 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1434 return 0x7FFF;
1435 }
1436 }
1437 return (int32_t) 0xffff8000;
1438 }
1439 else if ( aExp <= 0x7E ) {
1440 if ( aExp | aSig ) {
1441 STATUS(float_exception_flags) |= float_flag_inexact;
1442 }
1443 return 0;
1444 }
1445 shiftCount -= 0x10;
1446 aSig = ( aSig | 0x00800000 )<<8;
1447 z = aSig>>( - shiftCount );
1448 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1449 STATUS(float_exception_flags) |= float_flag_inexact;
1450 }
1451 if ( aSign ) {
1452 z = - z;
1453 }
1454 return z;
1455
1456 }
1457
1458 /*----------------------------------------------------------------------------
1459 | Returns the result of converting the single-precision floating-point value
1460 | `a' to the 64-bit two's complement integer format. The conversion is
1461 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1462 | Arithmetic---which means in particular that the conversion is rounded
1463 | according to the current rounding mode. If `a' is a NaN, the largest
1464 | positive integer is returned. Otherwise, if the conversion overflows, the
1465 | largest integer with the same sign as `a' is returned.
1466 *----------------------------------------------------------------------------*/
1467
1468 int64 float32_to_int64( float32 a STATUS_PARAM )
1469 {
1470 flag aSign;
1471 int16 aExp, shiftCount;
1472 uint32_t aSig;
1473 uint64_t aSig64, aSigExtra;
1474 a = float32_squash_input_denormal(a STATUS_VAR);
1475
1476 aSig = extractFloat32Frac( a );
1477 aExp = extractFloat32Exp( a );
1478 aSign = extractFloat32Sign( a );
1479 shiftCount = 0xBE - aExp;
1480 if ( shiftCount < 0 ) {
1481 float_raise( float_flag_invalid STATUS_VAR);
1482 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1483 return LIT64( 0x7FFFFFFFFFFFFFFF );
1484 }
1485 return (int64_t) LIT64( 0x8000000000000000 );
1486 }
1487 if ( aExp ) aSig |= 0x00800000;
1488 aSig64 = aSig;
1489 aSig64 <<= 40;
1490 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1491 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1492
1493 }
1494
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the single-precision floating-point value
1497 | `a' to the 64-bit two's complement integer format. The conversion is
1498 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1499 | Arithmetic, except that the conversion is always rounded toward zero. If
1500 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1501 | conversion overflows, the largest integer with the same sign as `a' is
1502 | returned.
1503 *----------------------------------------------------------------------------*/
1504
1505 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1506 {
1507 flag aSign;
1508 int16 aExp, shiftCount;
1509 uint32_t aSig;
1510 uint64_t aSig64;
1511 int64 z;
1512 a = float32_squash_input_denormal(a STATUS_VAR);
1513
1514 aSig = extractFloat32Frac( a );
1515 aExp = extractFloat32Exp( a );
1516 aSign = extractFloat32Sign( a );
1517 shiftCount = aExp - 0xBE;
1518 if ( 0 <= shiftCount ) {
1519 if ( float32_val(a) != 0xDF000000 ) {
1520 float_raise( float_flag_invalid STATUS_VAR);
1521 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522 return LIT64( 0x7FFFFFFFFFFFFFFF );
1523 }
1524 }
1525 return (int64_t) LIT64( 0x8000000000000000 );
1526 }
1527 else if ( aExp <= 0x7E ) {
1528 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1529 return 0;
1530 }
1531 aSig64 = aSig | 0x00800000;
1532 aSig64 <<= 40;
1533 z = aSig64>>( - shiftCount );
1534 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1535 STATUS(float_exception_flags) |= float_flag_inexact;
1536 }
1537 if ( aSign ) z = - z;
1538 return z;
1539
1540 }
1541
1542 /*----------------------------------------------------------------------------
1543 | Returns the result of converting the single-precision floating-point value
1544 | `a' to the double-precision floating-point format. The conversion is
1545 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1546 | Arithmetic.
1547 *----------------------------------------------------------------------------*/
1548
1549 float64 float32_to_float64( float32 a STATUS_PARAM )
1550 {
1551 flag aSign;
1552 int16 aExp;
1553 uint32_t aSig;
1554 a = float32_squash_input_denormal(a STATUS_VAR);
1555
1556 aSig = extractFloat32Frac( a );
1557 aExp = extractFloat32Exp( a );
1558 aSign = extractFloat32Sign( a );
1559 if ( aExp == 0xFF ) {
1560 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1561 return packFloat64( aSign, 0x7FF, 0 );
1562 }
1563 if ( aExp == 0 ) {
1564 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1565 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1566 --aExp;
1567 }
1568 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1569
1570 }
1571
1572 /*----------------------------------------------------------------------------
1573 | Returns the result of converting the single-precision floating-point value
1574 | `a' to the extended double-precision floating-point format. The conversion
1575 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1576 | Arithmetic.
1577 *----------------------------------------------------------------------------*/
1578
1579 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1580 {
1581 flag aSign;
1582 int16 aExp;
1583 uint32_t aSig;
1584
1585 a = float32_squash_input_denormal(a STATUS_VAR);
1586 aSig = extractFloat32Frac( a );
1587 aExp = extractFloat32Exp( a );
1588 aSign = extractFloat32Sign( a );
1589 if ( aExp == 0xFF ) {
1590 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1591 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1592 }
1593 if ( aExp == 0 ) {
1594 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1595 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1596 }
1597 aSig |= 0x00800000;
1598 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1599
1600 }
1601
1602 /*----------------------------------------------------------------------------
1603 | Returns the result of converting the single-precision floating-point value
1604 | `a' to the double-precision floating-point format. The conversion is
1605 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1606 | Arithmetic.
1607 *----------------------------------------------------------------------------*/
1608
1609 float128 float32_to_float128( float32 a STATUS_PARAM )
1610 {
1611 flag aSign;
1612 int16 aExp;
1613 uint32_t aSig;
1614
1615 a = float32_squash_input_denormal(a STATUS_VAR);
1616 aSig = extractFloat32Frac( a );
1617 aExp = extractFloat32Exp( a );
1618 aSign = extractFloat32Sign( a );
1619 if ( aExp == 0xFF ) {
1620 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1621 return packFloat128( aSign, 0x7FFF, 0, 0 );
1622 }
1623 if ( aExp == 0 ) {
1624 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1625 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1626 --aExp;
1627 }
1628 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1629
1630 }
1631
1632 /*----------------------------------------------------------------------------
1633 | Rounds the single-precision floating-point value `a' to an integer, and
1634 | returns the result as a single-precision floating-point value. The
1635 | operation is performed according to the IEC/IEEE Standard for Binary
1636 | Floating-Point Arithmetic.
1637 *----------------------------------------------------------------------------*/
1638
1639 float32 float32_round_to_int( float32 a STATUS_PARAM)
1640 {
1641 flag aSign;
1642 int16 aExp;
1643 uint32_t lastBitMask, roundBitsMask;
1644 int8 roundingMode;
1645 uint32_t z;
1646 a = float32_squash_input_denormal(a STATUS_VAR);
1647
1648 aExp = extractFloat32Exp( a );
1649 if ( 0x96 <= aExp ) {
1650 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651 return propagateFloat32NaN( a, a STATUS_VAR );
1652 }
1653 return a;
1654 }
1655 if ( aExp <= 0x7E ) {
1656 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1657 STATUS(float_exception_flags) |= float_flag_inexact;
1658 aSign = extractFloat32Sign( a );
1659 switch ( STATUS(float_rounding_mode) ) {
1660 case float_round_nearest_even:
1661 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662 return packFloat32( aSign, 0x7F, 0 );
1663 }
1664 break;
1665 case float_round_down:
1666 return make_float32(aSign ? 0xBF800000 : 0);
1667 case float_round_up:
1668 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1669 }
1670 return packFloat32( aSign, 0, 0 );
1671 }
1672 lastBitMask = 1;
1673 lastBitMask <<= 0x96 - aExp;
1674 roundBitsMask = lastBitMask - 1;
1675 z = float32_val(a);
1676 roundingMode = STATUS(float_rounding_mode);
1677 if ( roundingMode == float_round_nearest_even ) {
1678 z += lastBitMask>>1;
1679 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1680 }
1681 else if ( roundingMode != float_round_to_zero ) {
1682 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1683 z += roundBitsMask;
1684 }
1685 }
1686 z &= ~ roundBitsMask;
1687 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1688 return make_float32(z);
1689
1690 }
1691
1692 /*----------------------------------------------------------------------------
1693 | Returns the result of adding the absolute values of the single-precision
1694 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1695 | before being returned. `zSign' is ignored if the result is a NaN.
1696 | The addition is performed according to the IEC/IEEE Standard for Binary
1697 | Floating-Point Arithmetic.
1698 *----------------------------------------------------------------------------*/
1699
1700 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1701 {
1702 int16 aExp, bExp, zExp;
1703 uint32_t aSig, bSig, zSig;
1704 int16 expDiff;
1705
1706 aSig = extractFloat32Frac( a );
1707 aExp = extractFloat32Exp( a );
1708 bSig = extractFloat32Frac( b );
1709 bExp = extractFloat32Exp( b );
1710 expDiff = aExp - bExp;
1711 aSig <<= 6;
1712 bSig <<= 6;
1713 if ( 0 < expDiff ) {
1714 if ( aExp == 0xFF ) {
1715 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1716 return a;
1717 }
1718 if ( bExp == 0 ) {
1719 --expDiff;
1720 }
1721 else {
1722 bSig |= 0x20000000;
1723 }
1724 shift32RightJamming( bSig, expDiff, &bSig );
1725 zExp = aExp;
1726 }
1727 else if ( expDiff < 0 ) {
1728 if ( bExp == 0xFF ) {
1729 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1730 return packFloat32( zSign, 0xFF, 0 );
1731 }
1732 if ( aExp == 0 ) {
1733 ++expDiff;
1734 }
1735 else {
1736 aSig |= 0x20000000;
1737 }
1738 shift32RightJamming( aSig, - expDiff, &aSig );
1739 zExp = bExp;
1740 }
1741 else {
1742 if ( aExp == 0xFF ) {
1743 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1744 return a;
1745 }
1746 if ( aExp == 0 ) {
1747 if (STATUS(flush_to_zero)) {
1748 if (aSig | bSig) {
1749 float_raise(float_flag_output_denormal STATUS_VAR);
1750 }
1751 return packFloat32(zSign, 0, 0);
1752 }
1753 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1754 }
1755 zSig = 0x40000000 + aSig + bSig;
1756 zExp = aExp;
1757 goto roundAndPack;
1758 }
1759 aSig |= 0x20000000;
1760 zSig = ( aSig + bSig )<<1;
1761 --zExp;
1762 if ( (int32_t) zSig < 0 ) {
1763 zSig = aSig + bSig;
1764 ++zExp;
1765 }
1766 roundAndPack:
1767 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1768
1769 }
1770
1771 /*----------------------------------------------------------------------------
1772 | Returns the result of subtracting the absolute values of the single-
1773 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1774 | difference is negated before being returned. `zSign' is ignored if the
1775 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1776 | Standard for Binary Floating-Point Arithmetic.
1777 *----------------------------------------------------------------------------*/
1778
1779 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1780 {
1781 int16 aExp, bExp, zExp;
1782 uint32_t aSig, bSig, zSig;
1783 int16 expDiff;
1784
1785 aSig = extractFloat32Frac( a );
1786 aExp = extractFloat32Exp( a );
1787 bSig = extractFloat32Frac( b );
1788 bExp = extractFloat32Exp( b );
1789 expDiff = aExp - bExp;
1790 aSig <<= 7;
1791 bSig <<= 7;
1792 if ( 0 < expDiff ) goto aExpBigger;
1793 if ( expDiff < 0 ) goto bExpBigger;
1794 if ( aExp == 0xFF ) {
1795 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1796 float_raise( float_flag_invalid STATUS_VAR);
1797 return float32_default_nan;
1798 }
1799 if ( aExp == 0 ) {
1800 aExp = 1;
1801 bExp = 1;
1802 }
1803 if ( bSig < aSig ) goto aBigger;
1804 if ( aSig < bSig ) goto bBigger;
1805 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1806 bExpBigger:
1807 if ( bExp == 0xFF ) {
1808 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1809 return packFloat32( zSign ^ 1, 0xFF, 0 );
1810 }
1811 if ( aExp == 0 ) {
1812 ++expDiff;
1813 }
1814 else {
1815 aSig |= 0x40000000;
1816 }
1817 shift32RightJamming( aSig, - expDiff, &aSig );
1818 bSig |= 0x40000000;
1819 bBigger:
1820 zSig = bSig - aSig;
1821 zExp = bExp;
1822 zSign ^= 1;
1823 goto normalizeRoundAndPack;
1824 aExpBigger:
1825 if ( aExp == 0xFF ) {
1826 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827 return a;
1828 }
1829 if ( bExp == 0 ) {
1830 --expDiff;
1831 }
1832 else {
1833 bSig |= 0x40000000;
1834 }
1835 shift32RightJamming( bSig, expDiff, &bSig );
1836 aSig |= 0x40000000;
1837 aBigger:
1838 zSig = aSig - bSig;
1839 zExp = aExp;
1840 normalizeRoundAndPack:
1841 --zExp;
1842 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1843
1844 }
1845
1846 /*----------------------------------------------------------------------------
1847 | Returns the result of adding the single-precision floating-point values `a'
1848 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1849 | Binary Floating-Point Arithmetic.
1850 *----------------------------------------------------------------------------*/
1851
1852 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1853 {
1854 flag aSign, bSign;
1855 a = float32_squash_input_denormal(a STATUS_VAR);
1856 b = float32_squash_input_denormal(b STATUS_VAR);
1857
1858 aSign = extractFloat32Sign( a );
1859 bSign = extractFloat32Sign( b );
1860 if ( aSign == bSign ) {
1861 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1862 }
1863 else {
1864 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1865 }
1866
1867 }
1868
1869 /*----------------------------------------------------------------------------
1870 | Returns the result of subtracting the single-precision floating-point values
1871 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1872 | for Binary Floating-Point Arithmetic.
1873 *----------------------------------------------------------------------------*/
1874
1875 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1876 {
1877 flag aSign, bSign;
1878 a = float32_squash_input_denormal(a STATUS_VAR);
1879 b = float32_squash_input_denormal(b STATUS_VAR);
1880
1881 aSign = extractFloat32Sign( a );
1882 bSign = extractFloat32Sign( b );
1883 if ( aSign == bSign ) {
1884 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1885 }
1886 else {
1887 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1888 }
1889
1890 }
1891
1892 /*----------------------------------------------------------------------------
1893 | Returns the result of multiplying the single-precision floating-point values
1894 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1895 | for Binary Floating-Point Arithmetic.
1896 *----------------------------------------------------------------------------*/
1897
1898 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1899 {
1900 flag aSign, bSign, zSign;
1901 int16 aExp, bExp, zExp;
1902 uint32_t aSig, bSig;
1903 uint64_t zSig64;
1904 uint32_t zSig;
1905
1906 a = float32_squash_input_denormal(a STATUS_VAR);
1907 b = float32_squash_input_denormal(b STATUS_VAR);
1908
1909 aSig = extractFloat32Frac( a );
1910 aExp = extractFloat32Exp( a );
1911 aSign = extractFloat32Sign( a );
1912 bSig = extractFloat32Frac( b );
1913 bExp = extractFloat32Exp( b );
1914 bSign = extractFloat32Sign( b );
1915 zSign = aSign ^ bSign;
1916 if ( aExp == 0xFF ) {
1917 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1918 return propagateFloat32NaN( a, b STATUS_VAR );
1919 }
1920 if ( ( bExp | bSig ) == 0 ) {
1921 float_raise( float_flag_invalid STATUS_VAR);
1922 return float32_default_nan;
1923 }
1924 return packFloat32( zSign, 0xFF, 0 );
1925 }
1926 if ( bExp == 0xFF ) {
1927 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1928 if ( ( aExp | aSig ) == 0 ) {
1929 float_raise( float_flag_invalid STATUS_VAR);
1930 return float32_default_nan;
1931 }
1932 return packFloat32( zSign, 0xFF, 0 );
1933 }
1934 if ( aExp == 0 ) {
1935 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1936 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1937 }
1938 if ( bExp == 0 ) {
1939 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1940 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1941 }
1942 zExp = aExp + bExp - 0x7F;
1943 aSig = ( aSig | 0x00800000 )<<7;
1944 bSig = ( bSig | 0x00800000 )<<8;
1945 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1946 zSig = zSig64;
1947 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1948 zSig <<= 1;
1949 --zExp;
1950 }
1951 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1952
1953 }
1954
1955 /*----------------------------------------------------------------------------
1956 | Returns the result of dividing the single-precision floating-point value `a'
1957 | by the corresponding value `b'. The operation is performed according to the
1958 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959 *----------------------------------------------------------------------------*/
1960
1961 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1962 {
1963 flag aSign, bSign, zSign;
1964 int16 aExp, bExp, zExp;
1965 uint32_t aSig, bSig, zSig;
1966 a = float32_squash_input_denormal(a STATUS_VAR);
1967 b = float32_squash_input_denormal(b STATUS_VAR);
1968
1969 aSig = extractFloat32Frac( a );
1970 aExp = extractFloat32Exp( a );
1971 aSign = extractFloat32Sign( a );
1972 bSig = extractFloat32Frac( b );
1973 bExp = extractFloat32Exp( b );
1974 bSign = extractFloat32Sign( b );
1975 zSign = aSign ^ bSign;
1976 if ( aExp == 0xFF ) {
1977 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1978 if ( bExp == 0xFF ) {
1979 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1980 float_raise( float_flag_invalid STATUS_VAR);
1981 return float32_default_nan;
1982 }
1983 return packFloat32( zSign, 0xFF, 0 );
1984 }
1985 if ( bExp == 0xFF ) {
1986 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987 return packFloat32( zSign, 0, 0 );
1988 }
1989 if ( bExp == 0 ) {
1990 if ( bSig == 0 ) {
1991 if ( ( aExp | aSig ) == 0 ) {
1992 float_raise( float_flag_invalid STATUS_VAR);
1993 return float32_default_nan;
1994 }
1995 float_raise( float_flag_divbyzero STATUS_VAR);
1996 return packFloat32( zSign, 0xFF, 0 );
1997 }
1998 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1999 }
2000 if ( aExp == 0 ) {
2001 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2002 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2003 }
2004 zExp = aExp - bExp + 0x7D;
2005 aSig = ( aSig | 0x00800000 )<<7;
2006 bSig = ( bSig | 0x00800000 )<<8;
2007 if ( bSig <= ( aSig + aSig ) ) {
2008 aSig >>= 1;
2009 ++zExp;
2010 }
2011 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2012 if ( ( zSig & 0x3F ) == 0 ) {
2013 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2014 }
2015 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2016
2017 }
2018
2019 /*----------------------------------------------------------------------------
2020 | Returns the remainder of the single-precision floating-point value `a'
2021 | with respect to the corresponding value `b'. The operation is performed
2022 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023 *----------------------------------------------------------------------------*/
2024
2025 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2026 {
2027 flag aSign, zSign;
2028 int16 aExp, bExp, expDiff;
2029 uint32_t aSig, bSig;
2030 uint32_t q;
2031 uint64_t aSig64, bSig64, q64;
2032 uint32_t alternateASig;
2033 int32_t sigMean;
2034 a = float32_squash_input_denormal(a STATUS_VAR);
2035 b = float32_squash_input_denormal(b STATUS_VAR);
2036
2037 aSig = extractFloat32Frac( a );
2038 aExp = extractFloat32Exp( a );
2039 aSign = extractFloat32Sign( a );
2040 bSig = extractFloat32Frac( b );
2041 bExp = extractFloat32Exp( b );
2042 if ( aExp == 0xFF ) {
2043 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2044 return propagateFloat32NaN( a, b STATUS_VAR );
2045 }
2046 float_raise( float_flag_invalid STATUS_VAR);
2047 return float32_default_nan;
2048 }
2049 if ( bExp == 0xFF ) {
2050 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2051 return a;
2052 }
2053 if ( bExp == 0 ) {
2054 if ( bSig == 0 ) {
2055 float_raise( float_flag_invalid STATUS_VAR);
2056 return float32_default_nan;
2057 }
2058 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2059 }
2060 if ( aExp == 0 ) {
2061 if ( aSig == 0 ) return a;
2062 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2063 }
2064 expDiff = aExp - bExp;
2065 aSig |= 0x00800000;
2066 bSig |= 0x00800000;
2067 if ( expDiff < 32 ) {
2068 aSig <<= 8;
2069 bSig <<= 8;
2070 if ( expDiff < 0 ) {
2071 if ( expDiff < -1 ) return a;
2072 aSig >>= 1;
2073 }
2074 q = ( bSig <= aSig );
2075 if ( q ) aSig -= bSig;
2076 if ( 0 < expDiff ) {
2077 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2078 q >>= 32 - expDiff;
2079 bSig >>= 2;
2080 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2081 }
2082 else {
2083 aSig >>= 2;
2084 bSig >>= 2;
2085 }
2086 }
2087 else {
2088 if ( bSig <= aSig ) aSig -= bSig;
2089 aSig64 = ( (uint64_t) aSig )<<40;
2090 bSig64 = ( (uint64_t) bSig )<<40;
2091 expDiff -= 64;
2092 while ( 0 < expDiff ) {
2093 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2094 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2095 aSig64 = - ( ( bSig * q64 )<<38 );
2096 expDiff -= 62;
2097 }
2098 expDiff += 64;
2099 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2100 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2101 q = q64>>( 64 - expDiff );
2102 bSig <<= 6;
2103 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2104 }
2105 do {
2106 alternateASig = aSig;
2107 ++q;
2108 aSig -= bSig;
2109 } while ( 0 <= (int32_t) aSig );
2110 sigMean = aSig + alternateASig;
2111 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2112 aSig = alternateASig;
2113 }
2114 zSign = ( (int32_t) aSig < 0 );
2115 if ( zSign ) aSig = - aSig;
2116 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2117
2118 }
2119
2120 /*----------------------------------------------------------------------------
2121 | Returns the result of multiplying the single-precision floating-point values
2122 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2123 | multiplication. The operation is performed according to the IEC/IEEE
2124 | Standard for Binary Floating-Point Arithmetic 754-2008.
2125 | The flags argument allows the caller to select negation of the
2126 | addend, the intermediate product, or the final result. (The difference
2127 | between this and having the caller do a separate negation is that negating
2128 | externally will flip the sign bit on NaNs.)
2129 *----------------------------------------------------------------------------*/
2130
2131 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2132 {
2133 flag aSign, bSign, cSign, zSign;
2134 int aExp, bExp, cExp, pExp, zExp, expDiff;
2135 uint32_t aSig, bSig, cSig;
2136 flag pInf, pZero, pSign;
2137 uint64_t pSig64, cSig64, zSig64;
2138 uint32_t pSig;
2139 int shiftcount;
2140 flag signflip, infzero;
2141
2142 a = float32_squash_input_denormal(a STATUS_VAR);
2143 b = float32_squash_input_denormal(b STATUS_VAR);
2144 c = float32_squash_input_denormal(c STATUS_VAR);
2145 aSig = extractFloat32Frac(a);
2146 aExp = extractFloat32Exp(a);
2147 aSign = extractFloat32Sign(a);
2148 bSig = extractFloat32Frac(b);
2149 bExp = extractFloat32Exp(b);
2150 bSign = extractFloat32Sign(b);
2151 cSig = extractFloat32Frac(c);
2152 cExp = extractFloat32Exp(c);
2153 cSign = extractFloat32Sign(c);
2154
2155 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2156 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2157
2158 /* It is implementation-defined whether the cases of (0,inf,qnan)
2159 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2160 * they return if they do), so we have to hand this information
2161 * off to the target-specific pick-a-NaN routine.
2162 */
2163 if (((aExp == 0xff) && aSig) ||
2164 ((bExp == 0xff) && bSig) ||
2165 ((cExp == 0xff) && cSig)) {
2166 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2167 }
2168
2169 if (infzero) {
2170 float_raise(float_flag_invalid STATUS_VAR);
2171 return float32_default_nan;
2172 }
2173
2174 if (flags & float_muladd_negate_c) {
2175 cSign ^= 1;
2176 }
2177
2178 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2179
2180 /* Work out the sign and type of the product */
2181 pSign = aSign ^ bSign;
2182 if (flags & float_muladd_negate_product) {
2183 pSign ^= 1;
2184 }
2185 pInf = (aExp == 0xff) || (bExp == 0xff);
2186 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2187
2188 if (cExp == 0xff) {
2189 if (pInf && (pSign ^ cSign)) {
2190 /* addition of opposite-signed infinities => InvalidOperation */
2191 float_raise(float_flag_invalid STATUS_VAR);
2192 return float32_default_nan;
2193 }
2194 /* Otherwise generate an infinity of the same sign */
2195 return packFloat32(cSign ^ signflip, 0xff, 0);
2196 }
2197
2198 if (pInf) {
2199 return packFloat32(pSign ^ signflip, 0xff, 0);
2200 }
2201
2202 if (pZero) {
2203 if (cExp == 0) {
2204 if (cSig == 0) {
2205 /* Adding two exact zeroes */
2206 if (pSign == cSign) {
2207 zSign = pSign;
2208 } else if (STATUS(float_rounding_mode) == float_round_down) {
2209 zSign = 1;
2210 } else {
2211 zSign = 0;
2212 }
2213 return packFloat32(zSign ^ signflip, 0, 0);
2214 }
2215 /* Exact zero plus a denorm */
2216 if (STATUS(flush_to_zero)) {
2217 float_raise(float_flag_output_denormal STATUS_VAR);
2218 return packFloat32(cSign ^ signflip, 0, 0);
2219 }
2220 }
2221 /* Zero plus something non-zero : just return the something */
2222 return c ^ (signflip << 31);
2223 }
2224
2225 if (aExp == 0) {
2226 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2227 }
2228 if (bExp == 0) {
2229 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2230 }
2231
2232 /* Calculate the actual result a * b + c */
2233
2234 /* Multiply first; this is easy. */
2235 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2236 * because we want the true exponent, not the "one-less-than"
2237 * flavour that roundAndPackFloat32() takes.
2238 */
2239 pExp = aExp + bExp - 0x7e;
2240 aSig = (aSig | 0x00800000) << 7;
2241 bSig = (bSig | 0x00800000) << 8;
2242 pSig64 = (uint64_t)aSig * bSig;
2243 if ((int64_t)(pSig64 << 1) >= 0) {
2244 pSig64 <<= 1;
2245 pExp--;
2246 }
2247
2248 zSign = pSign ^ signflip;
2249
2250 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2251 * position 62.
2252 */
2253 if (cExp == 0) {
2254 if (!cSig) {
2255 /* Throw out the special case of c being an exact zero now */
2256 shift64RightJamming(pSig64, 32, &pSig64);
2257 pSig = pSig64;
2258 return roundAndPackFloat32(zSign, pExp - 1,
2259 pSig STATUS_VAR);
2260 }
2261 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2262 }
2263
2264 cSig64 = (uint64_t)cSig << (62 - 23);
2265 cSig64 |= LIT64(0x4000000000000000);
2266 expDiff = pExp - cExp;
2267
2268 if (pSign == cSign) {
2269 /* Addition */
2270 if (expDiff > 0) {
2271 /* scale c to match p */
2272 shift64RightJamming(cSig64, expDiff, &cSig64);
2273 zExp = pExp;
2274 } else if (expDiff < 0) {
2275 /* scale p to match c */
2276 shift64RightJamming(pSig64, -expDiff, &pSig64);
2277 zExp = cExp;
2278 } else {
2279 /* no scaling needed */
2280 zExp = cExp;
2281 }
2282 /* Add significands and make sure explicit bit ends up in posn 62 */
2283 zSig64 = pSig64 + cSig64;
2284 if ((int64_t)zSig64 < 0) {
2285 shift64RightJamming(zSig64, 1, &zSig64);
2286 } else {
2287 zExp--;
2288 }
2289 } else {
2290 /* Subtraction */
2291 if (expDiff > 0) {
2292 shift64RightJamming(cSig64, expDiff, &cSig64);
2293 zSig64 = pSig64 - cSig64;
2294 zExp = pExp;
2295 } else if (expDiff < 0) {
2296 shift64RightJamming(pSig64, -expDiff, &pSig64);
2297 zSig64 = cSig64 - pSig64;
2298 zExp = cExp;
2299 zSign ^= 1;
2300 } else {
2301 zExp = pExp;
2302 if (cSig64 < pSig64) {
2303 zSig64 = pSig64 - cSig64;
2304 } else if (pSig64 < cSig64) {
2305 zSig64 = cSig64 - pSig64;
2306 zSign ^= 1;
2307 } else {
2308 /* Exact zero */
2309 zSign = signflip;
2310 if (STATUS(float_rounding_mode) == float_round_down) {
2311 zSign ^= 1;
2312 }
2313 return packFloat32(zSign, 0, 0);
2314 }
2315 }
2316 --zExp;
2317 /* Normalize to put the explicit bit back into bit 62. */
2318 shiftcount = countLeadingZeros64(zSig64) - 1;
2319 zSig64 <<= shiftcount;
2320 zExp -= shiftcount;
2321 }
2322 shift64RightJamming(zSig64, 32, &zSig64);
2323 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2324 }
2325
2326
2327 /*----------------------------------------------------------------------------
2328 | Returns the square root of the single-precision floating-point value `a'.
2329 | The operation is performed according to the IEC/IEEE Standard for Binary
2330 | Floating-Point Arithmetic.
2331 *----------------------------------------------------------------------------*/
2332
2333 float32 float32_sqrt( float32 a STATUS_PARAM )
2334 {
2335 flag aSign;
2336 int16 aExp, zExp;
2337 uint32_t aSig, zSig;
2338 uint64_t rem, term;
2339 a = float32_squash_input_denormal(a STATUS_VAR);
2340
2341 aSig = extractFloat32Frac( a );
2342 aExp = extractFloat32Exp( a );
2343 aSign = extractFloat32Sign( a );
2344 if ( aExp == 0xFF ) {
2345 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2346 if ( ! aSign ) return a;
2347 float_raise( float_flag_invalid STATUS_VAR);
2348 return float32_default_nan;
2349 }
2350 if ( aSign ) {
2351 if ( ( aExp | aSig ) == 0 ) return a;
2352 float_raise( float_flag_invalid STATUS_VAR);
2353 return float32_default_nan;
2354 }
2355 if ( aExp == 0 ) {
2356 if ( aSig == 0 ) return float32_zero;
2357 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2358 }
2359 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2360 aSig = ( aSig | 0x00800000 )<<8;
2361 zSig = estimateSqrt32( aExp, aSig ) + 2;
2362 if ( ( zSig & 0x7F ) <= 5 ) {
2363 if ( zSig < 2 ) {
2364 zSig = 0x7FFFFFFF;
2365 goto roundAndPack;
2366 }
2367 aSig >>= aExp & 1;
2368 term = ( (uint64_t) zSig ) * zSig;
2369 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2370 while ( (int64_t) rem < 0 ) {
2371 --zSig;
2372 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2373 }
2374 zSig |= ( rem != 0 );
2375 }
2376 shift32RightJamming( zSig, 1, &zSig );
2377 roundAndPack:
2378 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2379
2380 }
2381
2382 /*----------------------------------------------------------------------------
2383 | Returns the binary exponential of the single-precision floating-point value
2384 | `a'. The operation is performed according to the IEC/IEEE Standard for
2385 | Binary Floating-Point Arithmetic.
2386 |
2387 | Uses the following identities:
2388 |
2389 | 1. -------------------------------------------------------------------------
2390 | x x*ln(2)
2391 | 2 = e
2392 |
2393 | 2. -------------------------------------------------------------------------
2394 | 2 3 4 5 n
2395 | x x x x x x x
2396 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2397 | 1! 2! 3! 4! 5! n!
2398 *----------------------------------------------------------------------------*/
2399
2400 static const float64 float32_exp2_coefficients[15] =
2401 {
2402 const_float64( 0x3ff0000000000000ll ), /* 1 */
2403 const_float64( 0x3fe0000000000000ll ), /* 2 */
2404 const_float64( 0x3fc5555555555555ll ), /* 3 */
2405 const_float64( 0x3fa5555555555555ll ), /* 4 */
2406 const_float64( 0x3f81111111111111ll ), /* 5 */
2407 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2408 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2409 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2410 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2411 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2412 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2413 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2414 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2415 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2416 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2417 };
2418
2419 float32 float32_exp2( float32 a STATUS_PARAM )
2420 {
2421 flag aSign;
2422 int16 aExp;
2423 uint32_t aSig;
2424 float64 r, x, xn;
2425 int i;
2426 a = float32_squash_input_denormal(a STATUS_VAR);
2427
2428 aSig = extractFloat32Frac( a );
2429 aExp = extractFloat32Exp( a );
2430 aSign = extractFloat32Sign( a );
2431
2432 if ( aExp == 0xFF) {
2433 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2434 return (aSign) ? float32_zero : a;
2435 }
2436 if (aExp == 0) {
2437 if (aSig == 0) return float32_one;
2438 }
2439
2440 float_raise( float_flag_inexact STATUS_VAR);
2441
2442 /* ******************************* */
2443 /* using float64 for approximation */
2444 /* ******************************* */
2445 x = float32_to_float64(a STATUS_VAR);
2446 x = float64_mul(x, float64_ln2 STATUS_VAR);
2447
2448 xn = x;
2449 r = float64_one;
2450 for (i = 0 ; i < 15 ; i++) {
2451 float64 f;
2452
2453 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2454 r = float64_add(r, f STATUS_VAR);
2455
2456 xn = float64_mul(xn, x STATUS_VAR);
2457 }
2458
2459 return float64_to_float32(r, status);
2460 }
2461
2462 /*----------------------------------------------------------------------------
2463 | Returns the binary log of the single-precision floating-point value `a'.
2464 | The operation is performed according to the IEC/IEEE Standard for Binary
2465 | Floating-Point Arithmetic.
2466 *----------------------------------------------------------------------------*/
2467 float32 float32_log2( float32 a STATUS_PARAM )
2468 {
2469 flag aSign, zSign;
2470 int16 aExp;
2471 uint32_t aSig, zSig, i;
2472
2473 a = float32_squash_input_denormal(a STATUS_VAR);
2474 aSig = extractFloat32Frac( a );
2475 aExp = extractFloat32Exp( a );
2476 aSign = extractFloat32Sign( a );
2477
2478 if ( aExp == 0 ) {
2479 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2480 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2481 }
2482 if ( aSign ) {
2483 float_raise( float_flag_invalid STATUS_VAR);
2484 return float32_default_nan;
2485 }
2486 if ( aExp == 0xFF ) {
2487 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2488 return a;
2489 }
2490
2491 aExp -= 0x7F;
2492 aSig |= 0x00800000;
2493 zSign = aExp < 0;
2494 zSig = aExp << 23;
2495
2496 for (i = 1 << 22; i > 0; i >>= 1) {
2497 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2498 if ( aSig & 0x01000000 ) {
2499 aSig >>= 1;
2500 zSig |= i;
2501 }
2502 }
2503
2504 if ( zSign )
2505 zSig = -zSig;
2506
2507 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2508 }
2509
2510 /*----------------------------------------------------------------------------
2511 | Returns 1 if the single-precision floating-point value `a' is equal to
2512 | the corresponding value `b', and 0 otherwise. The invalid exception is
2513 | raised if either operand is a NaN. Otherwise, the comparison is performed
2514 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2515 *----------------------------------------------------------------------------*/
2516
2517 int float32_eq( float32 a, float32 b STATUS_PARAM )
2518 {
2519 uint32_t av, bv;
2520 a = float32_squash_input_denormal(a STATUS_VAR);
2521 b = float32_squash_input_denormal(b STATUS_VAR);
2522
2523 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2524 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2525 ) {
2526 float_raise( float_flag_invalid STATUS_VAR);
2527 return 0;
2528 }
2529 av = float32_val(a);
2530 bv = float32_val(b);
2531 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2532 }
2533
2534 /*----------------------------------------------------------------------------
2535 | Returns 1 if the single-precision floating-point value `a' is less than
2536 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2537 | exception is raised if either operand is a NaN. The comparison is performed
2538 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2539 *----------------------------------------------------------------------------*/
2540
2541 int float32_le( float32 a, float32 b STATUS_PARAM )
2542 {
2543 flag aSign, bSign;
2544 uint32_t av, bv;
2545 a = float32_squash_input_denormal(a STATUS_VAR);
2546 b = float32_squash_input_denormal(b STATUS_VAR);
2547
2548 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2549 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2550 ) {
2551 float_raise( float_flag_invalid STATUS_VAR);
2552 return 0;
2553 }
2554 aSign = extractFloat32Sign( a );
2555 bSign = extractFloat32Sign( b );
2556 av = float32_val(a);
2557 bv = float32_val(b);
2558 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2559 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2560
2561 }
2562
2563 /*----------------------------------------------------------------------------
2564 | Returns 1 if the single-precision floating-point value `a' is less than
2565 | the corresponding value `b', and 0 otherwise. The invalid exception is
2566 | raised if either operand is a NaN. The comparison is performed according
2567 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2568 *----------------------------------------------------------------------------*/
2569
2570 int float32_lt( float32 a, float32 b STATUS_PARAM )
2571 {
2572 flag aSign, bSign;
2573 uint32_t av, bv;
2574 a = float32_squash_input_denormal(a STATUS_VAR);
2575 b = float32_squash_input_denormal(b STATUS_VAR);
2576
2577 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2578 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2579 ) {
2580 float_raise( float_flag_invalid STATUS_VAR);
2581 return 0;
2582 }
2583 aSign = extractFloat32Sign( a );
2584 bSign = extractFloat32Sign( b );
2585 av = float32_val(a);
2586 bv = float32_val(b);
2587 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2588 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2589
2590 }
2591
2592 /*----------------------------------------------------------------------------
2593 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2594 | be compared, and 0 otherwise. The invalid exception is raised if either
2595 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2596 | Standard for Binary Floating-Point Arithmetic.
2597 *----------------------------------------------------------------------------*/
2598
2599 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2600 {
2601 a = float32_squash_input_denormal(a STATUS_VAR);
2602 b = float32_squash_input_denormal(b STATUS_VAR);
2603
2604 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2605 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2606 ) {
2607 float_raise( float_flag_invalid STATUS_VAR);
2608 return 1;
2609 }
2610 return 0;
2611 }
2612
2613 /*----------------------------------------------------------------------------
2614 | Returns 1 if the single-precision floating-point value `a' is equal to
2615 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2616 | exception. The comparison is performed according to the IEC/IEEE Standard
2617 | for Binary Floating-Point Arithmetic.
2618 *----------------------------------------------------------------------------*/
2619
2620 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2621 {
2622 a = float32_squash_input_denormal(a STATUS_VAR);
2623 b = float32_squash_input_denormal(b STATUS_VAR);
2624
2625 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2626 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2627 ) {
2628 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2629 float_raise( float_flag_invalid STATUS_VAR);
2630 }
2631 return 0;
2632 }
2633 return ( float32_val(a) == float32_val(b) ) ||
2634 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2635 }
2636
2637 /*----------------------------------------------------------------------------
2638 | Returns 1 if the single-precision floating-point value `a' is less than or
2639 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2640 | cause an exception. Otherwise, the comparison is performed according to the
2641 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2642 *----------------------------------------------------------------------------*/
2643
2644 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2645 {
2646 flag aSign, bSign;
2647 uint32_t av, bv;
2648 a = float32_squash_input_denormal(a STATUS_VAR);
2649 b = float32_squash_input_denormal(b STATUS_VAR);
2650
2651 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2652 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2653 ) {
2654 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2655 float_raise( float_flag_invalid STATUS_VAR);
2656 }
2657 return 0;
2658 }
2659 aSign = extractFloat32Sign( a );
2660 bSign = extractFloat32Sign( b );
2661 av = float32_val(a);
2662 bv = float32_val(b);
2663 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2664 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2665
2666 }
2667
2668 /*----------------------------------------------------------------------------
2669 | Returns 1 if the single-precision floating-point value `a' is less than
2670 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2671 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2672 | Standard for Binary Floating-Point Arithmetic.
2673 *----------------------------------------------------------------------------*/
2674
2675 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2676 {
2677 flag aSign, bSign;
2678 uint32_t av, bv;
2679 a = float32_squash_input_denormal(a STATUS_VAR);
2680 b = float32_squash_input_denormal(b STATUS_VAR);
2681
2682 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2683 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2684 ) {
2685 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2686 float_raise( float_flag_invalid STATUS_VAR);
2687 }
2688 return 0;
2689 }
2690 aSign = extractFloat32Sign( a );
2691 bSign = extractFloat32Sign( b );
2692 av = float32_val(a);
2693 bv = float32_val(b);
2694 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2695 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2696
2697 }
2698
2699 /*----------------------------------------------------------------------------
2700 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2701 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2702 | comparison is performed according to the IEC/IEEE Standard for Binary
2703 | Floating-Point Arithmetic.
2704 *----------------------------------------------------------------------------*/
2705
2706 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2707 {
2708 a = float32_squash_input_denormal(a STATUS_VAR);
2709 b = float32_squash_input_denormal(b STATUS_VAR);
2710
2711 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2712 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2713 ) {
2714 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2715 float_raise( float_flag_invalid STATUS_VAR);
2716 }
2717 return 1;
2718 }
2719 return 0;
2720 }
2721
2722 /*----------------------------------------------------------------------------
2723 | Returns the result of converting the double-precision floating-point value
2724 | `a' to the 32-bit two's complement integer format. The conversion is
2725 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2726 | Arithmetic---which means in particular that the conversion is rounded
2727 | according to the current rounding mode. If `a' is a NaN, the largest
2728 | positive integer is returned. Otherwise, if the conversion overflows, the
2729 | largest integer with the same sign as `a' is returned.
2730 *----------------------------------------------------------------------------*/
2731
2732 int32 float64_to_int32( float64 a STATUS_PARAM )
2733 {
2734 flag aSign;
2735 int16 aExp, shiftCount;
2736 uint64_t aSig;
2737 a = float64_squash_input_denormal(a STATUS_VAR);
2738
2739 aSig = extractFloat64Frac( a );
2740 aExp = extractFloat64Exp( a );
2741 aSign = extractFloat64Sign( a );
2742 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2743 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2744 shiftCount = 0x42C - aExp;
2745 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2746 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2747
2748 }
2749
2750 /*----------------------------------------------------------------------------
2751 | Returns the result of converting the double-precision floating-point value
2752 | `a' to the 32-bit two's complement integer format. The conversion is
2753 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2754 | Arithmetic, except that the conversion is always rounded toward zero.
2755 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2756 | the conversion overflows, the largest integer with the same sign as `a' is
2757 | returned.
2758 *----------------------------------------------------------------------------*/
2759
2760 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2761 {
2762 flag aSign;
2763 int16 aExp, shiftCount;
2764 uint64_t aSig, savedASig;
2765 int32 z;
2766 a = float64_squash_input_denormal(a STATUS_VAR);
2767
2768 aSig = extractFloat64Frac( a );
2769 aExp = extractFloat64Exp( a );
2770 aSign = extractFloat64Sign( a );
2771 if ( 0x41E < aExp ) {
2772 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2773 goto invalid;
2774 }
2775 else if ( aExp < 0x3FF ) {
2776 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2777 return 0;
2778 }
2779 aSig |= LIT64( 0x0010000000000000 );
2780 shiftCount = 0x433 - aExp;
2781 savedASig = aSig;
2782 aSig >>= shiftCount;
2783 z = aSig;
2784 if ( aSign ) z = - z;
2785 if ( ( z < 0 ) ^ aSign ) {
2786 invalid:
2787 float_raise( float_flag_invalid STATUS_VAR);
2788 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2789 }
2790 if ( ( aSig<<shiftCount ) != savedASig ) {
2791 STATUS(float_exception_flags) |= float_flag_inexact;
2792 }
2793 return z;
2794
2795 }
2796
2797 /*----------------------------------------------------------------------------
2798 | Returns the result of converting the double-precision floating-point value
2799 | `a' to the 16-bit two's complement integer format. The conversion is
2800 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2801 | Arithmetic, except that the conversion is always rounded toward zero.
2802 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2803 | the conversion overflows, the largest integer with the same sign as `a' is
2804 | returned.
2805 *----------------------------------------------------------------------------*/
2806
2807 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2808 {
2809 flag aSign;
2810 int16 aExp, shiftCount;
2811 uint64_t aSig, savedASig;
2812 int32 z;
2813
2814 aSig = extractFloat64Frac( a );
2815 aExp = extractFloat64Exp( a );
2816 aSign = extractFloat64Sign( a );
2817 if ( 0x40E < aExp ) {
2818 if ( ( aExp == 0x7FF ) && aSig ) {
2819 aSign = 0;
2820 }
2821 goto invalid;
2822 }
2823 else if ( aExp < 0x3FF ) {
2824 if ( aExp || aSig ) {
2825 STATUS(float_exception_flags) |= float_flag_inexact;
2826 }
2827 return 0;
2828 }
2829 aSig |= LIT64( 0x0010000000000000 );
2830 shiftCount = 0x433 - aExp;
2831 savedASig = aSig;
2832 aSig >>= shiftCount;
2833 z = aSig;
2834 if ( aSign ) {
2835 z = - z;
2836 }
2837 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2838 invalid:
2839 float_raise( float_flag_invalid STATUS_VAR);
2840 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2841 }
2842 if ( ( aSig<<shiftCount ) != savedASig ) {
2843 STATUS(float_exception_flags) |= float_flag_inexact;
2844 }
2845 return z;
2846 }
2847
2848 /*----------------------------------------------------------------------------
2849 | Returns the result of converting the double-precision floating-point value
2850 | `a' to the 64-bit two's complement integer format. The conversion is
2851 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2852 | Arithmetic---which means in particular that the conversion is rounded
2853 | according to the current rounding mode. If `a' is a NaN, the largest
2854 | positive integer is returned. Otherwise, if the conversion overflows, the
2855 | largest integer with the same sign as `a' is returned.
2856 *----------------------------------------------------------------------------*/
2857
2858 int64 float64_to_int64( float64 a STATUS_PARAM )
2859 {
2860 flag aSign;
2861 int16 aExp, shiftCount;
2862 uint64_t aSig, aSigExtra;
2863 a = float64_squash_input_denormal(a STATUS_VAR);
2864
2865 aSig = extractFloat64Frac( a );
2866 aExp = extractFloat64Exp( a );
2867 aSign = extractFloat64Sign( a );
2868 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2869 shiftCount = 0x433 - aExp;
2870 if ( shiftCount <= 0 ) {
2871 if ( 0x43E < aExp ) {
2872 float_raise( float_flag_invalid STATUS_VAR);
2873 if ( ! aSign
2874 || ( ( aExp == 0x7FF )
2875 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2876 ) {
2877 return LIT64( 0x7FFFFFFFFFFFFFFF );
2878 }
2879 return (int64_t) LIT64( 0x8000000000000000 );
2880 }
2881 aSigExtra = 0;
2882 aSig <<= - shiftCount;
2883 }
2884 else {
2885 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2886 }
2887 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2888
2889 }
2890
2891 /*----------------------------------------------------------------------------
2892 | Returns the result of converting the double-precision floating-point value
2893 | `a' to the 64-bit two's complement integer format. The conversion is
2894 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2895 | Arithmetic, except that the conversion is always rounded toward zero.
2896 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2897 | the conversion overflows, the largest integer with the same sign as `a' is
2898 | returned.
2899 *----------------------------------------------------------------------------*/
2900
2901 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2902 {
2903 flag aSign;
2904 int16 aExp, shiftCount;
2905 uint64_t aSig;
2906 int64 z;
2907 a = float64_squash_input_denormal(a STATUS_VAR);
2908
2909 aSig = extractFloat64Frac( a );
2910 aExp = extractFloat64Exp( a );
2911 aSign = extractFloat64Sign( a );
2912 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2913 shiftCount = aExp - 0x433;
2914 if ( 0 <= shiftCount ) {
2915 if ( 0x43E <= aExp ) {
2916 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2917 float_raise( float_flag_invalid STATUS_VAR);
2918 if ( ! aSign
2919 || ( ( aExp == 0x7FF )
2920 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2921 ) {
2922 return LIT64( 0x7FFFFFFFFFFFFFFF );
2923 }
2924 }
2925 return (int64_t) LIT64( 0x8000000000000000 );
2926 }
2927 z = aSig<<shiftCount;
2928 }
2929 else {
2930 if ( aExp < 0x3FE ) {
2931 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2932 return 0;
2933 }
2934 z = aSig>>( - shiftCount );
2935 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2936 STATUS(float_exception_flags) |= float_flag_inexact;
2937 }
2938 }
2939 if ( aSign ) z = - z;
2940 return z;
2941
2942 }
2943
2944 /*----------------------------------------------------------------------------
2945 | Returns the result of converting the double-precision floating-point value
2946 | `a' to the single-precision floating-point format. The conversion is
2947 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2948 | Arithmetic.
2949 *----------------------------------------------------------------------------*/
2950
2951 float32 float64_to_float32( float64 a STATUS_PARAM )
2952 {
2953 flag aSign;
2954 int16 aExp;
2955 uint64_t aSig;
2956 uint32_t zSig;
2957 a = float64_squash_input_denormal(a STATUS_VAR);
2958
2959 aSig = extractFloat64Frac( a );
2960 aExp = extractFloat64Exp( a );
2961 aSign = extractFloat64Sign( a );
2962 if ( aExp == 0x7FF ) {
2963 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2964 return packFloat32( aSign, 0xFF, 0 );
2965 }
2966 shift64RightJamming( aSig, 22, &aSig );
2967 zSig = aSig;
2968 if ( aExp || zSig ) {
2969 zSig |= 0x40000000;
2970 aExp -= 0x381;
2971 }
2972 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2973
2974 }
2975
2976
2977 /*----------------------------------------------------------------------------
2978 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2979 | half-precision floating-point value, returning the result. After being
2980 | shifted into the proper positions, the three fields are simply added
2981 | together to form the result. This means that any integer portion of `zSig'
2982 | will be added into the exponent. Since a properly normalized significand
2983 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2984 | than the desired result exponent whenever `zSig' is a complete, normalized
2985 | significand.
2986 *----------------------------------------------------------------------------*/
2987 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2988 {
2989 return make_float16(
2990 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2991 }
2992
2993 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2994 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2995
2996 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2997 {
2998 flag aSign;
2999 int16 aExp;
3000 uint32_t aSig;
3001
3002 aSign = extractFloat16Sign(a);
3003 aExp = extractFloat16Exp(a);
3004 aSig = extractFloat16Frac(a);
3005
3006 if (aExp == 0x1f && ieee) {
3007 if (aSig) {
3008 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3009 }
3010 return packFloat32(aSign, 0xff, aSig << 13);
3011 }
3012 if (aExp == 0) {
3013 int8 shiftCount;
3014
3015 if (aSig == 0) {
3016 return packFloat32(aSign, 0, 0);
3017 }
3018
3019 shiftCount = countLeadingZeros32( aSig ) - 21;
3020 aSig = aSig << shiftCount;
3021 aExp = -shiftCount;
3022 }
3023 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3024 }
3025
3026 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3027 {
3028 flag aSign;
3029 int16 aExp;
3030 uint32_t aSig;
3031 uint32_t mask;
3032 uint32_t increment;
3033 int8 roundingMode;
3034 a = float32_squash_input_denormal(a STATUS_VAR);
3035
3036 aSig = extractFloat32Frac( a );
3037 aExp = extractFloat32Exp( a );
3038 aSign = extractFloat32Sign( a );
3039 if ( aExp == 0xFF ) {
3040 if (aSig) {
3041 /* Input is a NaN */
3042 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3043 if (!ieee) {
3044 return packFloat16(aSign, 0, 0);
3045 }
3046 return r;
3047 }
3048 /* Infinity */
3049 if (!ieee) {
3050 float_raise(float_flag_invalid STATUS_VAR);
3051 return packFloat16(aSign, 0x1f, 0x3ff);
3052 }
3053 return packFloat16(aSign, 0x1f, 0);
3054 }
3055 if (aExp == 0 && aSig == 0) {
3056 return packFloat16(aSign, 0, 0);
3057 }
3058 /* Decimal point between bits 22 and 23. */
3059 aSig |= 0x00800000;
3060 aExp -= 0x7f;
3061 if (aExp < -14) {
3062 mask = 0x00ffffff;
3063 if (aExp >= -24) {
3064 mask >>= 25 + aExp;
3065 }
3066 } else {
3067 mask = 0x00001fff;
3068 }
3069 if (aSig & mask) {
3070 float_raise( float_flag_underflow STATUS_VAR );
3071 roundingMode = STATUS(float_rounding_mode);
3072 switch (roundingMode) {
3073 case float_round_nearest_even:
3074 increment = (mask + 1) >> 1;
3075 if ((aSig & mask) == increment) {
3076 increment = aSig & (increment << 1);
3077 }
3078 break;
3079 case float_round_up:
3080 increment = aSign ? 0 : mask;
3081 break;
3082 case float_round_down:
3083 increment = aSign ? mask : 0;
3084 break;
3085 default: /* round_to_zero */
3086 increment = 0;
3087 break;
3088 }
3089 aSig += increment;
3090 if (aSig >= 0x01000000) {
3091 aSig >>= 1;
3092 aExp++;
3093 }
3094 } else if (aExp < -14
3095 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
3096 float_raise( float_flag_underflow STATUS_VAR);
3097 }
3098
3099 if (ieee) {
3100 if (aExp > 15) {
3101 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
3102 return packFloat16(aSign, 0x1f, 0);
3103 }
3104 } else {
3105 if (aExp > 16) {
3106 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
3107 return packFloat16(aSign, 0x1f, 0x3ff);
3108 }
3109 }
3110 if (aExp < -24) {
3111 return packFloat16(aSign, 0, 0);
3112 }
3113 if (aExp < -14) {
3114 aSig >>= -14 - aExp;
3115 aExp = -14;
3116 }
3117 return packFloat16(aSign, aExp + 14, aSig >> 13);
3118 }
3119
3120 /*----------------------------------------------------------------------------
3121 | Returns the result of converting the double-precision floating-point value
3122 | `a' to the extended double-precision floating-point format. The conversion
3123 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3124 | Arithmetic.
3125 *----------------------------------------------------------------------------*/
3126
3127 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3128 {
3129 flag aSign;
3130 int16 aExp;
3131 uint64_t aSig;
3132
3133 a = float64_squash_input_denormal(a STATUS_VAR);
3134 aSig = extractFloat64Frac( a );
3135 aExp = extractFloat64Exp( a );
3136 aSign = extractFloat64Sign( a );
3137 if ( aExp == 0x7FF ) {
3138 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3139 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3140 }
3141 if ( aExp == 0 ) {
3142 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3143 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3144 }
3145 return
3146 packFloatx80(
3147 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3148
3149 }
3150
3151 /*----------------------------------------------------------------------------
3152 | Returns the result of converting the double-precision floating-point value
3153 | `a' to the quadruple-precision floating-point format. The conversion is
3154 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3155 | Arithmetic.
3156 *----------------------------------------------------------------------------*/
3157
3158 float128 float64_to_float128( float64 a STATUS_PARAM )
3159 {
3160 flag aSign;
3161 int16 aExp;
3162 uint64_t aSig, zSig0, zSig1;
3163
3164 a = float64_squash_input_denormal(a STATUS_VAR);
3165 aSig = extractFloat64Frac( a );
3166 aExp = extractFloat64Exp( a );
3167 aSign = extractFloat64Sign( a );
3168 if ( aExp == 0x7FF ) {
3169 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3170 return packFloat128( aSign, 0x7FFF, 0, 0 );
3171 }
3172 if ( aExp == 0 ) {
3173 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3174 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3175 --aExp;
3176 }
3177 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3178 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3179
3180 }
3181
3182 /*----------------------------------------------------------------------------
3183 | Rounds the double-precision floating-point value `a' to an integer, and
3184 | returns the result as a double-precision floating-point value. The
3185 | operation is performed according to the IEC/IEEE Standard for Binary
3186 | Floating-Point Arithmetic.
3187 *----------------------------------------------------------------------------*/
3188
3189 float64 float64_round_to_int( float64 a STATUS_PARAM )
3190 {
3191 flag aSign;
3192 int16 aExp;
3193 uint64_t lastBitMask, roundBitsMask;
3194 int8 roundingMode;
3195 uint64_t z;
3196 a = float64_squash_input_denormal(a STATUS_VAR);
3197
3198 aExp = extractFloat64Exp( a );
3199 if ( 0x433 <= aExp ) {
3200 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3201 return propagateFloat64NaN( a, a STATUS_VAR );
3202 }
3203 return a;
3204 }
3205 if ( aExp < 0x3FF ) {
3206 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3207 STATUS(float_exception_flags) |= float_flag_inexact;
3208 aSign = extractFloat64Sign( a );
3209 switch ( STATUS(float_rounding_mode) ) {
3210 case float_round_nearest_even:
3211 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3212 return packFloat64( aSign, 0x3FF, 0 );
3213 }
3214 break;
3215 case float_round_down:
3216 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3217 case float_round_up:
3218 return make_float64(
3219 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3220 }
3221 return packFloat64( aSign, 0, 0 );
3222 }
3223 lastBitMask = 1;
3224 lastBitMask <<= 0x433 - aExp;
3225 roundBitsMask = lastBitMask - 1;
3226 z = float64_val(a);
3227 roundingMode = STATUS(float_rounding_mode);
3228 if ( roundingMode == float_round_nearest_even ) {
3229 z += lastBitMask>>1;
3230 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3231 }
3232 else if ( roundingMode != float_round_to_zero ) {
3233 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3234 z += roundBitsMask;
3235 }
3236 }
3237 z &= ~ roundBitsMask;
3238 if ( z != float64_val(a) )
3239 STATUS(float_exception_flags) |= float_flag_inexact;
3240 return make_float64(z);
3241
3242 }
3243
3244 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3245 {
3246 int oldmode;
3247 float64 res;
3248 oldmode = STATUS(float_rounding_mode);
3249 STATUS(float_rounding_mode) = float_round_to_zero;
3250 res = float64_round_to_int(a STATUS_VAR);
3251 STATUS(float_rounding_mode) = oldmode;
3252 return res;
3253 }
3254
3255 /*----------------------------------------------------------------------------
3256 | Returns the result of adding the absolute values of the double-precision
3257 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3258 | before being returned. `zSign' is ignored if the result is a NaN.
3259 | The addition is performed according to the IEC/IEEE Standard for Binary
3260 | Floating-Point Arithmetic.
3261 *----------------------------------------------------------------------------*/
3262
3263 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3264 {
3265 int16 aExp, bExp, zExp;
3266 uint64_t aSig, bSig, zSig;
3267 int16 expDiff;
3268
3269 aSig = extractFloat64Frac( a );
3270 aExp = extractFloat64Exp( a );
3271 bSig = extractFloat64Frac( b );
3272 bExp = extractFloat64Exp( b );
3273 expDiff = aExp - bExp;
3274 aSig <<= 9;
3275 bSig <<= 9;
3276 if ( 0 < expDiff ) {
3277 if ( aExp == 0x7FF ) {
3278 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3279 return a;
3280 }
3281 if ( bExp == 0 ) {
3282 --expDiff;
3283 }
3284 else {
3285 bSig |= LIT64( 0x2000000000000000 );
3286 }
3287 shift64RightJamming( bSig, expDiff, &bSig );
3288 zExp = aExp;
3289 }
3290 else if ( expDiff < 0 ) {
3291 if ( bExp == 0x7FF ) {
3292 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3293 return packFloat64( zSign, 0x7FF, 0 );
3294 }
3295 if ( aExp == 0 ) {
3296 ++expDiff;
3297 }
3298 else {
3299 aSig |= LIT64( 0x2000000000000000 );
3300 }
3301 shift64RightJamming( aSig, - expDiff, &aSig );
3302 zExp = bExp;
3303 }
3304 else {
3305 if ( aExp == 0x7FF ) {
3306 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3307 return a;
3308 }
3309 if ( aExp == 0 ) {
3310 if (STATUS(flush_to_zero)) {
3311 if (aSig | bSig) {
3312 float_raise(float_flag_output_denormal STATUS_VAR);
3313 }
3314 return packFloat64(zSign, 0, 0);
3315 }
3316 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3317 }
3318 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3319 zExp = aExp;
3320 goto roundAndPack;
3321 }
3322 aSig |= LIT64( 0x2000000000000000 );
3323 zSig = ( aSig + bSig )<<1;
3324 --zExp;
3325 if ( (int64_t) zSig < 0 ) {
3326 zSig = aSig + bSig;
3327 ++zExp;
3328 }
3329 roundAndPack:
3330 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3331
3332 }
3333
3334 /*----------------------------------------------------------------------------
3335 | Returns the result of subtracting the absolute values of the double-
3336 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3337 | difference is negated before being returned. `zSign' is ignored if the
3338 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3339 | Standard for Binary Floating-Point Arithmetic.
3340 *----------------------------------------------------------------------------*/
3341
3342 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3343 {
3344 int16 aExp, bExp, zExp;
3345 uint64_t aSig, bSig, zSig;
3346 int16 expDiff;
3347
3348 aSig = extractFloat64Frac( a );
3349 aExp = extractFloat64Exp( a );
3350 bSig = extractFloat64Frac( b );
3351 bExp = extractFloat64Exp( b );
3352 expDiff = aExp - bExp;
3353 aSig <<= 10;
3354 bSig <<= 10;
3355 if ( 0 < expDiff ) goto aExpBigger;
3356 if ( expDiff <