tests: virtio-9p: add version operation test
[qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86
87 #include "fpu/softfloat.h"
88
89 /* We only need stdlib for abort() */
90
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114 return float16_val(a) & 0x3ff;
115 }
116
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120
121 static inline int extractFloat16Exp(float16 a)
122 {
123 return (float16_val(a) >> 10) & 0x1f;
124 }
125
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129
130 static inline flag extractFloat16Sign(float16 a)
131 {
132 return float16_val(a)>>15;
133 }
134
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input. If `zSign' is 1, the input is negated before being converted to an
139 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer. However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
145
146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
147 {
148 int8_t roundingMode;
149 flag roundNearestEven;
150 int8_t roundIncrement, roundBits;
151 int32_t z;
152
153 roundingMode = status->float_rounding_mode;
154 roundNearestEven = ( roundingMode == float_round_nearest_even );
155 switch (roundingMode) {
156 case float_round_nearest_even:
157 case float_round_ties_away:
158 roundIncrement = 0x40;
159 break;
160 case float_round_to_zero:
161 roundIncrement = 0;
162 break;
163 case float_round_up:
164 roundIncrement = zSign ? 0 : 0x7f;
165 break;
166 case float_round_down:
167 roundIncrement = zSign ? 0x7f : 0;
168 break;
169 default:
170 abort();
171 }
172 roundBits = absZ & 0x7F;
173 absZ = ( absZ + roundIncrement )>>7;
174 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
175 z = absZ;
176 if ( zSign ) z = - z;
177 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
178 float_raise(float_flag_invalid, status);
179 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
180 }
181 if (roundBits) {
182 status->float_exception_flags |= float_flag_inexact;
183 }
184 return z;
185
186 }
187
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer. However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
197 | returned.
198 *----------------------------------------------------------------------------*/
199
200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
201 float_status *status)
202 {
203 int8_t roundingMode;
204 flag roundNearestEven, increment;
205 int64_t z;
206
207 roundingMode = status->float_rounding_mode;
208 roundNearestEven = ( roundingMode == float_round_nearest_even );
209 switch (roundingMode) {
210 case float_round_nearest_even:
211 case float_round_ties_away:
212 increment = ((int64_t) absZ1 < 0);
213 break;
214 case float_round_to_zero:
215 increment = 0;
216 break;
217 case float_round_up:
218 increment = !zSign && absZ1;
219 break;
220 case float_round_down:
221 increment = zSign && absZ1;
222 break;
223 default:
224 abort();
225 }
226 if ( increment ) {
227 ++absZ0;
228 if ( absZ0 == 0 ) goto overflow;
229 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
230 }
231 z = absZ0;
232 if ( zSign ) z = - z;
233 if ( z && ( ( z < 0 ) ^ zSign ) ) {
234 overflow:
235 float_raise(float_flag_invalid, status);
236 return
237 zSign ? (int64_t) LIT64( 0x8000000000000000 )
238 : LIT64( 0x7FFFFFFFFFFFFFFF );
239 }
240 if (absZ1) {
241 status->float_exception_flags |= float_flag_inexact;
242 }
243 return z;
244
245 }
246
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer. However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
256
257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
258 uint64_t absZ1, float_status *status)
259 {
260 int8_t roundingMode;
261 flag roundNearestEven, increment;
262
263 roundingMode = status->float_rounding_mode;
264 roundNearestEven = (roundingMode == float_round_nearest_even);
265 switch (roundingMode) {
266 case float_round_nearest_even:
267 case float_round_ties_away:
268 increment = ((int64_t)absZ1 < 0);
269 break;
270 case float_round_to_zero:
271 increment = 0;
272 break;
273 case float_round_up:
274 increment = !zSign && absZ1;
275 break;
276 case float_round_down:
277 increment = zSign && absZ1;
278 break;
279 default:
280 abort();
281 }
282 if (increment) {
283 ++absZ0;
284 if (absZ0 == 0) {
285 float_raise(float_flag_invalid, status);
286 return LIT64(0xFFFFFFFFFFFFFFFF);
287 }
288 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
289 }
290
291 if (zSign && absZ0) {
292 float_raise(float_flag_invalid, status);
293 return 0;
294 }
295
296 if (absZ1) {
297 status->float_exception_flags |= float_flag_inexact;
298 }
299 return absZ0;
300 }
301
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
305
306 static inline uint32_t extractFloat32Frac( float32 a )
307 {
308
309 return float32_val(a) & 0x007FFFFF;
310
311 }
312
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
316
317 static inline int extractFloat32Exp(float32 a)
318 {
319
320 return ( float32_val(a)>>23 ) & 0xFF;
321
322 }
323
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
327
328 static inline flag extractFloat32Sign( float32 a )
329 {
330
331 return float32_val(a)>>31;
332
333 }
334
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32 float32_squash_input_denormal(float32 a, float_status *status)
340 {
341 if (status->flush_inputs_to_zero) {
342 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
343 float_raise(float_flag_input_denormal, status);
344 return make_float32(float32_val(a) & 0x80000000);
345 }
346 }
347 return a;
348 }
349
350 /*----------------------------------------------------------------------------
351 | Normalizes the subnormal single-precision floating-point value represented
352 | by the denormalized significand `aSig'. The normalized exponent and
353 | significand are stored at the locations pointed to by `zExpPtr' and
354 | `zSigPtr', respectively.
355 *----------------------------------------------------------------------------*/
356
357 static void
358 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
359 {
360 int8_t shiftCount;
361
362 shiftCount = countLeadingZeros32( aSig ) - 8;
363 *zSigPtr = aSig<<shiftCount;
364 *zExpPtr = 1 - shiftCount;
365
366 }
367
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result. After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result. This means that any integer portion of `zSig'
373 | will be added into the exponent. Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
376 | significand.
377 *----------------------------------------------------------------------------*/
378
379 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
380 {
381
382 return make_float32(
383 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
384
385 }
386
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input. Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly. However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned. If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 | The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location. This shifted
401 | significand must be normalized or smaller. If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding. In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
408
409 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
410 float_status *status)
411 {
412 int8_t roundingMode;
413 flag roundNearestEven;
414 int8_t roundIncrement, roundBits;
415 flag isTiny;
416
417 roundingMode = status->float_rounding_mode;
418 roundNearestEven = ( roundingMode == float_round_nearest_even );
419 switch (roundingMode) {
420 case float_round_nearest_even:
421 case float_round_ties_away:
422 roundIncrement = 0x40;
423 break;
424 case float_round_to_zero:
425 roundIncrement = 0;
426 break;
427 case float_round_up:
428 roundIncrement = zSign ? 0 : 0x7f;
429 break;
430 case float_round_down:
431 roundIncrement = zSign ? 0x7f : 0;
432 break;
433 default:
434 abort();
435 break;
436 }
437 roundBits = zSig & 0x7F;
438 if ( 0xFD <= (uint16_t) zExp ) {
439 if ( ( 0xFD < zExp )
440 || ( ( zExp == 0xFD )
441 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
442 ) {
443 float_raise(float_flag_overflow | float_flag_inexact, status);
444 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
445 }
446 if ( zExp < 0 ) {
447 if (status->flush_to_zero) {
448 float_raise(float_flag_output_denormal, status);
449 return packFloat32(zSign, 0, 0);
450 }
451 isTiny =
452 (status->float_detect_tininess
453 == float_tininess_before_rounding)
454 || ( zExp < -1 )
455 || ( zSig + roundIncrement < 0x80000000 );
456 shift32RightJamming( zSig, - zExp, &zSig );
457 zExp = 0;
458 roundBits = zSig & 0x7F;
459 if (isTiny && roundBits) {
460 float_raise(float_flag_underflow, status);
461 }
462 }
463 }
464 if (roundBits) {
465 status->float_exception_flags |= float_flag_inexact;
466 }
467 zSig = ( zSig + roundIncrement )>>7;
468 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
469 if ( zSig == 0 ) zExp = 0;
470 return packFloat32( zSign, zExp, zSig );
471
472 }
473
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input. This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
482
483 static float32
484 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
485 float_status *status)
486 {
487 int8_t shiftCount;
488
489 shiftCount = countLeadingZeros32( zSig ) - 1;
490 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
491 status);
492
493 }
494
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
498
499 static inline uint64_t extractFloat64Frac( float64 a )
500 {
501
502 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
503
504 }
505
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
509
510 static inline int extractFloat64Exp(float64 a)
511 {
512
513 return ( float64_val(a)>>52 ) & 0x7FF;
514
515 }
516
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
520
521 static inline flag extractFloat64Sign( float64 a )
522 {
523
524 return float64_val(a)>>63;
525
526 }
527
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64 float64_squash_input_denormal(float64 a, float_status *status)
533 {
534 if (status->flush_inputs_to_zero) {
535 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
536 float_raise(float_flag_input_denormal, status);
537 return make_float64(float64_val(a) & (1ULL << 63));
538 }
539 }
540 return a;
541 }
542
543 /*----------------------------------------------------------------------------
544 | Normalizes the subnormal double-precision floating-point value represented
545 | by the denormalized significand `aSig'. The normalized exponent and
546 | significand are stored at the locations pointed to by `zExpPtr' and
547 | `zSigPtr', respectively.
548 *----------------------------------------------------------------------------*/
549
550 static void
551 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
552 {
553 int8_t shiftCount;
554
555 shiftCount = countLeadingZeros64( aSig ) - 11;
556 *zSigPtr = aSig<<shiftCount;
557 *zExpPtr = 1 - shiftCount;
558
559 }
560
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result. After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result. This means that any integer portion of `zSig'
566 | will be added into the exponent. Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
569 | significand.
570 *----------------------------------------------------------------------------*/
571
572 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
573 {
574
575 return make_float64(
576 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
577
578 }
579
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input. Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly. However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned. If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 | The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location. This shifted
594 | significand must be normalized or smaller. If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding. In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
601
602 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
603 float_status *status)
604 {
605 int8_t roundingMode;
606 flag roundNearestEven;
607 int roundIncrement, roundBits;
608 flag isTiny;
609
610 roundingMode = status->float_rounding_mode;
611 roundNearestEven = ( roundingMode == float_round_nearest_even );
612 switch (roundingMode) {
613 case float_round_nearest_even:
614 case float_round_ties_away:
615 roundIncrement = 0x200;
616 break;
617 case float_round_to_zero:
618 roundIncrement = 0;
619 break;
620 case float_round_up:
621 roundIncrement = zSign ? 0 : 0x3ff;
622 break;
623 case float_round_down:
624 roundIncrement = zSign ? 0x3ff : 0;
625 break;
626 default:
627 abort();
628 }
629 roundBits = zSig & 0x3FF;
630 if ( 0x7FD <= (uint16_t) zExp ) {
631 if ( ( 0x7FD < zExp )
632 || ( ( zExp == 0x7FD )
633 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
634 ) {
635 float_raise(float_flag_overflow | float_flag_inexact, status);
636 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
637 }
638 if ( zExp < 0 ) {
639 if (status->flush_to_zero) {
640 float_raise(float_flag_output_denormal, status);
641 return packFloat64(zSign, 0, 0);
642 }
643 isTiny =
644 (status->float_detect_tininess
645 == float_tininess_before_rounding)
646 || ( zExp < -1 )
647 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
648 shift64RightJamming( zSig, - zExp, &zSig );
649 zExp = 0;
650 roundBits = zSig & 0x3FF;
651 if (isTiny && roundBits) {
652 float_raise(float_flag_underflow, status);
653 }
654 }
655 }
656 if (roundBits) {
657 status->float_exception_flags |= float_flag_inexact;
658 }
659 zSig = ( zSig + roundIncrement )>>10;
660 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
661 if ( zSig == 0 ) zExp = 0;
662 return packFloat64( zSign, zExp, zSig );
663
664 }
665
666 /*----------------------------------------------------------------------------
667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
668 | and significand `zSig', and returns the proper double-precision floating-
669 | point value corresponding to the abstract input. This routine is just like
670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
672 | floating-point exponent.
673 *----------------------------------------------------------------------------*/
674
675 static float64
676 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
677 float_status *status)
678 {
679 int8_t shiftCount;
680
681 shiftCount = countLeadingZeros64( zSig ) - 1;
682 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
683 status);
684
685 }
686
687 /*----------------------------------------------------------------------------
688 | Returns the fraction bits of the extended double-precision floating-point
689 | value `a'.
690 *----------------------------------------------------------------------------*/
691
692 static inline uint64_t extractFloatx80Frac( floatx80 a )
693 {
694
695 return a.low;
696
697 }
698
699 /*----------------------------------------------------------------------------
700 | Returns the exponent bits of the extended double-precision floating-point
701 | value `a'.
702 *----------------------------------------------------------------------------*/
703
704 static inline int32_t extractFloatx80Exp( floatx80 a )
705 {
706
707 return a.high & 0x7FFF;
708
709 }
710
711 /*----------------------------------------------------------------------------
712 | Returns the sign bit of the extended double-precision floating-point value
713 | `a'.
714 *----------------------------------------------------------------------------*/
715
716 static inline flag extractFloatx80Sign( floatx80 a )
717 {
718
719 return a.high>>15;
720
721 }
722
723 /*----------------------------------------------------------------------------
724 | Normalizes the subnormal extended double-precision floating-point value
725 | represented by the denormalized significand `aSig'. The normalized exponent
726 | and significand are stored at the locations pointed to by `zExpPtr' and
727 | `zSigPtr', respectively.
728 *----------------------------------------------------------------------------*/
729
730 static void
731 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
732 {
733 int8_t shiftCount;
734
735 shiftCount = countLeadingZeros64( aSig );
736 *zSigPtr = aSig<<shiftCount;
737 *zExpPtr = 1 - shiftCount;
738
739 }
740
741 /*----------------------------------------------------------------------------
742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
743 | extended double-precision floating-point value, returning the result.
744 *----------------------------------------------------------------------------*/
745
746 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
747 {
748 floatx80 z;
749
750 z.low = zSig;
751 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
752 return z;
753
754 }
755
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
758 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input. Ordinarily, the abstract value is
761 | rounded and packed into the extended double-precision format, with the
762 | inexact exception raised if the abstract input cannot be represented
763 | exactly. However, if the abstract value is too large, the overflow and
764 | inexact exceptions are raised and an infinity or maximal finite value is
765 | returned. If the abstract value is too small, the input value is rounded to
766 | a subnormal number, and the underflow and inexact exceptions are raised if
767 | the abstract input cannot be represented exactly as a subnormal extended
768 | double-precision floating-point number.
769 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
770 | number of bits as single or double precision, respectively. Otherwise, the
771 | result is rounded to the full precision of the extended double-precision
772 | format.
773 | The input significand must be normalized or smaller. If the input
774 | significand is not normalized, `zExp' must be 0; in that case, the result
775 | returned is a subnormal number, and it must not require rounding. The
776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
777 | Floating-Point Arithmetic.
778 *----------------------------------------------------------------------------*/
779
780 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
781 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
782 float_status *status)
783 {
784 int8_t roundingMode;
785 flag roundNearestEven, increment, isTiny;
786 int64_t roundIncrement, roundMask, roundBits;
787
788 roundingMode = status->float_rounding_mode;
789 roundNearestEven = ( roundingMode == float_round_nearest_even );
790 if ( roundingPrecision == 80 ) goto precision80;
791 if ( roundingPrecision == 64 ) {
792 roundIncrement = LIT64( 0x0000000000000400 );
793 roundMask = LIT64( 0x00000000000007FF );
794 }
795 else if ( roundingPrecision == 32 ) {
796 roundIncrement = LIT64( 0x0000008000000000 );
797 roundMask = LIT64( 0x000000FFFFFFFFFF );
798 }
799 else {
800 goto precision80;
801 }
802 zSig0 |= ( zSig1 != 0 );
803 switch (roundingMode) {
804 case float_round_nearest_even:
805 case float_round_ties_away:
806 break;
807 case float_round_to_zero:
808 roundIncrement = 0;
809 break;
810 case float_round_up:
811 roundIncrement = zSign ? 0 : roundMask;
812 break;
813 case float_round_down:
814 roundIncrement = zSign ? roundMask : 0;
815 break;
816 default:
817 abort();
818 }
819 roundBits = zSig0 & roundMask;
820 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
821 if ( ( 0x7FFE < zExp )
822 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
823 ) {
824 goto overflow;
825 }
826 if ( zExp <= 0 ) {
827 if (status->flush_to_zero) {
828 float_raise(float_flag_output_denormal, status);
829 return packFloatx80(zSign, 0, 0);
830 }
831 isTiny =
832 (status->float_detect_tininess
833 == float_tininess_before_rounding)
834 || ( zExp < 0 )
835 || ( zSig0 <= zSig0 + roundIncrement );
836 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
837 zExp = 0;
838 roundBits = zSig0 & roundMask;
839 if (isTiny && roundBits) {
840 float_raise(float_flag_underflow, status);
841 }
842 if (roundBits) {
843 status->float_exception_flags |= float_flag_inexact;
844 }
845 zSig0 += roundIncrement;
846 if ( (int64_t) zSig0 < 0 ) zExp = 1;
847 roundIncrement = roundMask + 1;
848 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
849 roundMask |= roundIncrement;
850 }
851 zSig0 &= ~ roundMask;
852 return packFloatx80( zSign, zExp, zSig0 );
853 }
854 }
855 if (roundBits) {
856 status->float_exception_flags |= float_flag_inexact;
857 }
858 zSig0 += roundIncrement;
859 if ( zSig0 < roundIncrement ) {
860 ++zExp;
861 zSig0 = LIT64( 0x8000000000000000 );
862 }
863 roundIncrement = roundMask + 1;
864 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
865 roundMask |= roundIncrement;
866 }
867 zSig0 &= ~ roundMask;
868 if ( zSig0 == 0 ) zExp = 0;
869 return packFloatx80( zSign, zExp, zSig0 );
870 precision80:
871 switch (roundingMode) {
872 case float_round_nearest_even:
873 case float_round_ties_away:
874 increment = ((int64_t)zSig1 < 0);
875 break;
876 case float_round_to_zero:
877 increment = 0;
878 break;
879 case float_round_up:
880 increment = !zSign && zSig1;
881 break;
882 case float_round_down:
883 increment = zSign && zSig1;
884 break;
885 default:
886 abort();
887 }
888 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
889 if ( ( 0x7FFE < zExp )
890 || ( ( zExp == 0x7FFE )
891 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
892 && increment
893 )
894 ) {
895 roundMask = 0;
896 overflow:
897 float_raise(float_flag_overflow | float_flag_inexact, status);
898 if ( ( roundingMode == float_round_to_zero )
899 || ( zSign && ( roundingMode == float_round_up ) )
900 || ( ! zSign && ( roundingMode == float_round_down ) )
901 ) {
902 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
903 }
904 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
905 }
906 if ( zExp <= 0 ) {
907 isTiny =
908 (status->float_detect_tininess
909 == float_tininess_before_rounding)
910 || ( zExp < 0 )
911 || ! increment
912 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
913 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
914 zExp = 0;
915 if (isTiny && zSig1) {
916 float_raise(float_flag_underflow, status);
917 }
918 if (zSig1) {
919 status->float_exception_flags |= float_flag_inexact;
920 }
921 switch (roundingMode) {
922 case float_round_nearest_even:
923 case float_round_ties_away:
924 increment = ((int64_t)zSig1 < 0);
925 break;
926 case float_round_to_zero:
927 increment = 0;
928 break;
929 case float_round_up:
930 increment = !zSign && zSig1;
931 break;
932 case float_round_down:
933 increment = zSign && zSig1;
934 break;
935 default:
936 abort();
937 }
938 if ( increment ) {
939 ++zSig0;
940 zSig0 &=
941 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
942 if ( (int64_t) zSig0 < 0 ) zExp = 1;
943 }
944 return packFloatx80( zSign, zExp, zSig0 );
945 }
946 }
947 if (zSig1) {
948 status->float_exception_flags |= float_flag_inexact;
949 }
950 if ( increment ) {
951 ++zSig0;
952 if ( zSig0 == 0 ) {
953 ++zExp;
954 zSig0 = LIT64( 0x8000000000000000 );
955 }
956 else {
957 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
958 }
959 }
960 else {
961 if ( zSig0 == 0 ) zExp = 0;
962 }
963 return packFloatx80( zSign, zExp, zSig0 );
964
965 }
966
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent
969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
970 | and returns the proper extended double-precision floating-point value
971 | corresponding to the abstract input. This routine is just like
972 | `roundAndPackFloatx80' except that the input significand does not have to be
973 | normalized.
974 *----------------------------------------------------------------------------*/
975
976 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
977 flag zSign, int32_t zExp,
978 uint64_t zSig0, uint64_t zSig1,
979 float_status *status)
980 {
981 int8_t shiftCount;
982
983 if ( zSig0 == 0 ) {
984 zSig0 = zSig1;
985 zSig1 = 0;
986 zExp -= 64;
987 }
988 shiftCount = countLeadingZeros64( zSig0 );
989 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
990 zExp -= shiftCount;
991 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
992 zSig0, zSig1, status);
993
994 }
995
996 /*----------------------------------------------------------------------------
997 | Returns the least-significant 64 fraction bits of the quadruple-precision
998 | floating-point value `a'.
999 *----------------------------------------------------------------------------*/
1000
1001 static inline uint64_t extractFloat128Frac1( float128 a )
1002 {
1003
1004 return a.low;
1005
1006 }
1007
1008 /*----------------------------------------------------------------------------
1009 | Returns the most-significant 48 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1012
1013 static inline uint64_t extractFloat128Frac0( float128 a )
1014 {
1015
1016 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1017
1018 }
1019
1020 /*----------------------------------------------------------------------------
1021 | Returns the exponent bits of the quadruple-precision floating-point value
1022 | `a'.
1023 *----------------------------------------------------------------------------*/
1024
1025 static inline int32_t extractFloat128Exp( float128 a )
1026 {
1027
1028 return ( a.high>>48 ) & 0x7FFF;
1029
1030 }
1031
1032 /*----------------------------------------------------------------------------
1033 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1034 *----------------------------------------------------------------------------*/
1035
1036 static inline flag extractFloat128Sign( float128 a )
1037 {
1038
1039 return a.high>>63;
1040
1041 }
1042
1043 /*----------------------------------------------------------------------------
1044 | Normalizes the subnormal quadruple-precision floating-point value
1045 | represented by the denormalized significand formed by the concatenation of
1046 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1047 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1048 | significand are stored at the location pointed to by `zSig0Ptr', and the
1049 | least significant 64 bits of the normalized significand are stored at the
1050 | location pointed to by `zSig1Ptr'.
1051 *----------------------------------------------------------------------------*/
1052
1053 static void
1054 normalizeFloat128Subnormal(
1055 uint64_t aSig0,
1056 uint64_t aSig1,
1057 int32_t *zExpPtr,
1058 uint64_t *zSig0Ptr,
1059 uint64_t *zSig1Ptr
1060 )
1061 {
1062 int8_t shiftCount;
1063
1064 if ( aSig0 == 0 ) {
1065 shiftCount = countLeadingZeros64( aSig1 ) - 15;
1066 if ( shiftCount < 0 ) {
1067 *zSig0Ptr = aSig1>>( - shiftCount );
1068 *zSig1Ptr = aSig1<<( shiftCount & 63 );
1069 }
1070 else {
1071 *zSig0Ptr = aSig1<<shiftCount;
1072 *zSig1Ptr = 0;
1073 }
1074 *zExpPtr = - shiftCount - 63;
1075 }
1076 else {
1077 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1078 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1079 *zExpPtr = 1 - shiftCount;
1080 }
1081
1082 }
1083
1084 /*----------------------------------------------------------------------------
1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087 | floating-point value, returning the result. After being shifted into the
1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089 | added together to form the most significant 32 bits of the result. This
1090 | means that any integer portion of `zSig0' will be added into the exponent.
1091 | Since a properly normalized significand will have an integer portion equal
1092 | to 1, the `zExp' input should be 1 less than the desired result exponent
1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1094 | significand.
1095 *----------------------------------------------------------------------------*/
1096
1097 static inline float128
1098 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1099 {
1100 float128 z;
1101
1102 z.low = zSig1;
1103 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1104 return z;
1105
1106 }
1107
1108 /*----------------------------------------------------------------------------
1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1111 | and `zSig2', and returns the proper quadruple-precision floating-point value
1112 | corresponding to the abstract input. Ordinarily, the abstract value is
1113 | simply rounded and packed into the quadruple-precision format, with the
1114 | inexact exception raised if the abstract input cannot be represented
1115 | exactly. However, if the abstract value is too large, the overflow and
1116 | inexact exceptions are raised and an infinity or maximal finite value is
1117 | returned. If the abstract value is too small, the input value is rounded to
1118 | a subnormal number, and the underflow and inexact exceptions are raised if
1119 | the abstract input cannot be represented exactly as a subnormal quadruple-
1120 | precision floating-point number.
1121 | The input significand must be normalized or smaller. If the input
1122 | significand is not normalized, `zExp' must be 0; in that case, the result
1123 | returned is a subnormal number, and it must not require rounding. In the
1124 | usual case that the input significand is normalized, `zExp' must be 1 less
1125 | than the ``true'' floating-point exponent. The handling of underflow and
1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1128
1129 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1130 uint64_t zSig0, uint64_t zSig1,
1131 uint64_t zSig2, float_status *status)
1132 {
1133 int8_t roundingMode;
1134 flag roundNearestEven, increment, isTiny;
1135
1136 roundingMode = status->float_rounding_mode;
1137 roundNearestEven = ( roundingMode == float_round_nearest_even );
1138 switch (roundingMode) {
1139 case float_round_nearest_even:
1140 case float_round_ties_away:
1141 increment = ((int64_t)zSig2 < 0);
1142 break;
1143 case float_round_to_zero:
1144 increment = 0;
1145 break;
1146 case float_round_up:
1147 increment = !zSign && zSig2;
1148 break;
1149 case float_round_down:
1150 increment = zSign && zSig2;
1151 break;
1152 default:
1153 abort();
1154 }
1155 if ( 0x7FFD <= (uint32_t) zExp ) {
1156 if ( ( 0x7FFD < zExp )
1157 || ( ( zExp == 0x7FFD )
1158 && eq128(
1159 LIT64( 0x0001FFFFFFFFFFFF ),
1160 LIT64( 0xFFFFFFFFFFFFFFFF ),
1161 zSig0,
1162 zSig1
1163 )
1164 && increment
1165 )
1166 ) {
1167 float_raise(float_flag_overflow | float_flag_inexact, status);
1168 if ( ( roundingMode == float_round_to_zero )
1169 || ( zSign && ( roundingMode == float_round_up ) )
1170 || ( ! zSign && ( roundingMode == float_round_down ) )
1171 ) {
1172 return
1173 packFloat128(
1174 zSign,
1175 0x7FFE,
1176 LIT64( 0x0000FFFFFFFFFFFF ),
1177 LIT64( 0xFFFFFFFFFFFFFFFF )
1178 );
1179 }
1180 return packFloat128( zSign, 0x7FFF, 0, 0 );
1181 }
1182 if ( zExp < 0 ) {
1183 if (status->flush_to_zero) {
1184 float_raise(float_flag_output_denormal, status);
1185 return packFloat128(zSign, 0, 0, 0);
1186 }
1187 isTiny =
1188 (status->float_detect_tininess
1189 == float_tininess_before_rounding)
1190 || ( zExp < -1 )
1191 || ! increment
1192 || lt128(
1193 zSig0,
1194 zSig1,
1195 LIT64( 0x0001FFFFFFFFFFFF ),
1196 LIT64( 0xFFFFFFFFFFFFFFFF )
1197 );
1198 shift128ExtraRightJamming(
1199 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1200 zExp = 0;
1201 if (isTiny && zSig2) {
1202 float_raise(float_flag_underflow, status);
1203 }
1204 switch (roundingMode) {
1205 case float_round_nearest_even:
1206 case float_round_ties_away:
1207 increment = ((int64_t)zSig2 < 0);
1208 break;
1209 case float_round_to_zero:
1210 increment = 0;
1211 break;
1212 case float_round_up:
1213 increment = !zSign && zSig2;
1214 break;
1215 case float_round_down:
1216 increment = zSign && zSig2;
1217 break;
1218 default:
1219 abort();
1220 }
1221 }
1222 }
1223 if (zSig2) {
1224 status->float_exception_flags |= float_flag_inexact;
1225 }
1226 if ( increment ) {
1227 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1228 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1229 }
1230 else {
1231 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1232 }
1233 return packFloat128( zSign, zExp, zSig0, zSig1 );
1234
1235 }
1236
1237 /*----------------------------------------------------------------------------
1238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1239 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1240 | returns the proper quadruple-precision floating-point value corresponding
1241 | to the abstract input. This routine is just like `roundAndPackFloat128'
1242 | except that the input significand has fewer bits and does not have to be
1243 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1244 | point exponent.
1245 *----------------------------------------------------------------------------*/
1246
1247 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1248 uint64_t zSig0, uint64_t zSig1,
1249 float_status *status)
1250 {
1251 int8_t shiftCount;
1252 uint64_t zSig2;
1253
1254 if ( zSig0 == 0 ) {
1255 zSig0 = zSig1;
1256 zSig1 = 0;
1257 zExp -= 64;
1258 }
1259 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1260 if ( 0 <= shiftCount ) {
1261 zSig2 = 0;
1262 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1263 }
1264 else {
1265 shift128ExtraRightJamming(
1266 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1267 }
1268 zExp -= shiftCount;
1269 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1270
1271 }
1272
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the 32-bit two's complement integer `a'
1275 | to the single-precision floating-point format. The conversion is performed
1276 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277 *----------------------------------------------------------------------------*/
1278
1279 float32 int32_to_float32(int32_t a, float_status *status)
1280 {
1281 flag zSign;
1282
1283 if ( a == 0 ) return float32_zero;
1284 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1285 zSign = ( a < 0 );
1286 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1287 }
1288
1289 /*----------------------------------------------------------------------------
1290 | Returns the result of converting the 32-bit two's complement integer `a'
1291 | to the double-precision floating-point format. The conversion is performed
1292 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293 *----------------------------------------------------------------------------*/
1294
1295 float64 int32_to_float64(int32_t a, float_status *status)
1296 {
1297 flag zSign;
1298 uint32_t absA;
1299 int8_t shiftCount;
1300 uint64_t zSig;
1301
1302 if ( a == 0 ) return float64_zero;
1303 zSign = ( a < 0 );
1304 absA = zSign ? - a : a;
1305 shiftCount = countLeadingZeros32( absA ) + 21;
1306 zSig = absA;
1307 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1308
1309 }
1310
1311 /*----------------------------------------------------------------------------
1312 | Returns the result of converting the 32-bit two's complement integer `a'
1313 | to the extended double-precision floating-point format. The conversion
1314 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1315 | Arithmetic.
1316 *----------------------------------------------------------------------------*/
1317
1318 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1319 {
1320 flag zSign;
1321 uint32_t absA;
1322 int8_t shiftCount;
1323 uint64_t zSig;
1324
1325 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1326 zSign = ( a < 0 );
1327 absA = zSign ? - a : a;
1328 shiftCount = countLeadingZeros32( absA ) + 32;
1329 zSig = absA;
1330 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1331
1332 }
1333
1334 /*----------------------------------------------------------------------------
1335 | Returns the result of converting the 32-bit two's complement integer `a' to
1336 | the quadruple-precision floating-point format. The conversion is performed
1337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338 *----------------------------------------------------------------------------*/
1339
1340 float128 int32_to_float128(int32_t a, float_status *status)
1341 {
1342 flag zSign;
1343 uint32_t absA;
1344 int8_t shiftCount;
1345 uint64_t zSig0;
1346
1347 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1348 zSign = ( a < 0 );
1349 absA = zSign ? - a : a;
1350 shiftCount = countLeadingZeros32( absA ) + 17;
1351 zSig0 = absA;
1352 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1353
1354 }
1355
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the 64-bit two's complement integer `a'
1358 | to the single-precision floating-point format. The conversion is performed
1359 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1360 *----------------------------------------------------------------------------*/
1361
1362 float32 int64_to_float32(int64_t a, float_status *status)
1363 {
1364 flag zSign;
1365 uint64_t absA;
1366 int8_t shiftCount;
1367
1368 if ( a == 0 ) return float32_zero;
1369 zSign = ( a < 0 );
1370 absA = zSign ? - a : a;
1371 shiftCount = countLeadingZeros64( absA ) - 40;
1372 if ( 0 <= shiftCount ) {
1373 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1374 }
1375 else {
1376 shiftCount += 7;
1377 if ( shiftCount < 0 ) {
1378 shift64RightJamming( absA, - shiftCount, &absA );
1379 }
1380 else {
1381 absA <<= shiftCount;
1382 }
1383 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1384 }
1385
1386 }
1387
1388 /*----------------------------------------------------------------------------
1389 | Returns the result of converting the 64-bit two's complement integer `a'
1390 | to the double-precision floating-point format. The conversion is performed
1391 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1392 *----------------------------------------------------------------------------*/
1393
1394 float64 int64_to_float64(int64_t a, float_status *status)
1395 {
1396 flag zSign;
1397
1398 if ( a == 0 ) return float64_zero;
1399 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1400 return packFloat64( 1, 0x43E, 0 );
1401 }
1402 zSign = ( a < 0 );
1403 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1404 }
1405
1406 /*----------------------------------------------------------------------------
1407 | Returns the result of converting the 64-bit two's complement integer `a'
1408 | to the extended double-precision floating-point format. The conversion
1409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1410 | Arithmetic.
1411 *----------------------------------------------------------------------------*/
1412
1413 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1414 {
1415 flag zSign;
1416 uint64_t absA;
1417 int8_t shiftCount;
1418
1419 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1420 zSign = ( a < 0 );
1421 absA = zSign ? - a : a;
1422 shiftCount = countLeadingZeros64( absA );
1423 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1424
1425 }
1426
1427 /*----------------------------------------------------------------------------
1428 | Returns the result of converting the 64-bit two's complement integer `a' to
1429 | the quadruple-precision floating-point format. The conversion is performed
1430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1431 *----------------------------------------------------------------------------*/
1432
1433 float128 int64_to_float128(int64_t a, float_status *status)
1434 {
1435 flag zSign;
1436 uint64_t absA;
1437 int8_t shiftCount;
1438 int32_t zExp;
1439 uint64_t zSig0, zSig1;
1440
1441 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1442 zSign = ( a < 0 );
1443 absA = zSign ? - a : a;
1444 shiftCount = countLeadingZeros64( absA ) + 49;
1445 zExp = 0x406E - shiftCount;
1446 if ( 64 <= shiftCount ) {
1447 zSig1 = 0;
1448 zSig0 = absA;
1449 shiftCount -= 64;
1450 }
1451 else {
1452 zSig1 = absA;
1453 zSig0 = 0;
1454 }
1455 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1456 return packFloat128( zSign, zExp, zSig0, zSig1 );
1457
1458 }
1459
1460 /*----------------------------------------------------------------------------
1461 | Returns the result of converting the 64-bit unsigned integer `a'
1462 | to the single-precision floating-point format. The conversion is performed
1463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1464 *----------------------------------------------------------------------------*/
1465
1466 float32 uint64_to_float32(uint64_t a, float_status *status)
1467 {
1468 int shiftcount;
1469
1470 if (a == 0) {
1471 return float32_zero;
1472 }
1473
1474 /* Determine (left) shift needed to put first set bit into bit posn 23
1475 * (since packFloat32() expects the binary point between bits 23 and 22);
1476 * this is the fast case for smallish numbers.
1477 */
1478 shiftcount = countLeadingZeros64(a) - 40;
1479 if (shiftcount >= 0) {
1480 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1481 }
1482 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1483 * expects the binary point between bits 30 and 29, hence the + 7.
1484 */
1485 shiftcount += 7;
1486 if (shiftcount < 0) {
1487 shift64RightJamming(a, -shiftcount, &a);
1488 } else {
1489 a <<= shiftcount;
1490 }
1491
1492 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1493 }
1494
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the 64-bit unsigned integer `a'
1497 | to the double-precision floating-point format. The conversion is performed
1498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499 *----------------------------------------------------------------------------*/
1500
1501 float64 uint64_to_float64(uint64_t a, float_status *status)
1502 {
1503 int exp = 0x43C;
1504 int shiftcount;
1505
1506 if (a == 0) {
1507 return float64_zero;
1508 }
1509
1510 shiftcount = countLeadingZeros64(a) - 1;
1511 if (shiftcount < 0) {
1512 shift64RightJamming(a, -shiftcount, &a);
1513 } else {
1514 a <<= shiftcount;
1515 }
1516 return roundAndPackFloat64(0, exp - shiftcount, a, status);
1517 }
1518
1519 /*----------------------------------------------------------------------------
1520 | Returns the result of converting the 64-bit unsigned integer `a'
1521 | to the quadruple-precision floating-point format. The conversion is performed
1522 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523 *----------------------------------------------------------------------------*/
1524
1525 float128 uint64_to_float128(uint64_t a, float_status *status)
1526 {
1527 if (a == 0) {
1528 return float128_zero;
1529 }
1530 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1531 }
1532
1533 /*----------------------------------------------------------------------------
1534 | Returns the result of converting the single-precision floating-point value
1535 | `a' to the 32-bit two's complement integer format. The conversion is
1536 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1537 | Arithmetic---which means in particular that the conversion is rounded
1538 | according to the current rounding mode. If `a' is a NaN, the largest
1539 | positive integer is returned. Otherwise, if the conversion overflows, the
1540 | largest integer with the same sign as `a' is returned.
1541 *----------------------------------------------------------------------------*/
1542
1543 int32_t float32_to_int32(float32 a, float_status *status)
1544 {
1545 flag aSign;
1546 int aExp;
1547 int shiftCount;
1548 uint32_t aSig;
1549 uint64_t aSig64;
1550
1551 a = float32_squash_input_denormal(a, status);
1552 aSig = extractFloat32Frac( a );
1553 aExp = extractFloat32Exp( a );
1554 aSign = extractFloat32Sign( a );
1555 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1556 if ( aExp ) aSig |= 0x00800000;
1557 shiftCount = 0xAF - aExp;
1558 aSig64 = aSig;
1559 aSig64 <<= 32;
1560 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1561 return roundAndPackInt32(aSign, aSig64, status);
1562
1563 }
1564
1565 /*----------------------------------------------------------------------------
1566 | Returns the result of converting the single-precision floating-point value
1567 | `a' to the 32-bit two's complement integer format. The conversion is
1568 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1569 | Arithmetic, except that the conversion is always rounded toward zero.
1570 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1571 | the conversion overflows, the largest integer with the same sign as `a' is
1572 | returned.
1573 *----------------------------------------------------------------------------*/
1574
1575 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1576 {
1577 flag aSign;
1578 int aExp;
1579 int shiftCount;
1580 uint32_t aSig;
1581 int32_t z;
1582 a = float32_squash_input_denormal(a, status);
1583
1584 aSig = extractFloat32Frac( a );
1585 aExp = extractFloat32Exp( a );
1586 aSign = extractFloat32Sign( a );
1587 shiftCount = aExp - 0x9E;
1588 if ( 0 <= shiftCount ) {
1589 if ( float32_val(a) != 0xCF000000 ) {
1590 float_raise(float_flag_invalid, status);
1591 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1592 }
1593 return (int32_t) 0x80000000;
1594 }
1595 else if ( aExp <= 0x7E ) {
1596 if (aExp | aSig) {
1597 status->float_exception_flags |= float_flag_inexact;
1598 }
1599 return 0;
1600 }
1601 aSig = ( aSig | 0x00800000 )<<8;
1602 z = aSig>>( - shiftCount );
1603 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1604 status->float_exception_flags |= float_flag_inexact;
1605 }
1606 if ( aSign ) z = - z;
1607 return z;
1608
1609 }
1610
1611 /*----------------------------------------------------------------------------
1612 | Returns the result of converting the single-precision floating-point value
1613 | `a' to the 16-bit two's complement integer format. The conversion is
1614 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1615 | Arithmetic, except that the conversion is always rounded toward zero.
1616 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1617 | the conversion overflows, the largest integer with the same sign as `a' is
1618 | returned.
1619 *----------------------------------------------------------------------------*/
1620
1621 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1622 {
1623 flag aSign;
1624 int aExp;
1625 int shiftCount;
1626 uint32_t aSig;
1627 int32_t z;
1628
1629 aSig = extractFloat32Frac( a );
1630 aExp = extractFloat32Exp( a );
1631 aSign = extractFloat32Sign( a );
1632 shiftCount = aExp - 0x8E;
1633 if ( 0 <= shiftCount ) {
1634 if ( float32_val(a) != 0xC7000000 ) {
1635 float_raise(float_flag_invalid, status);
1636 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1637 return 0x7FFF;
1638 }
1639 }
1640 return (int32_t) 0xffff8000;
1641 }
1642 else if ( aExp <= 0x7E ) {
1643 if ( aExp | aSig ) {
1644 status->float_exception_flags |= float_flag_inexact;
1645 }
1646 return 0;
1647 }
1648 shiftCount -= 0x10;
1649 aSig = ( aSig | 0x00800000 )<<8;
1650 z = aSig>>( - shiftCount );
1651 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1652 status->float_exception_flags |= float_flag_inexact;
1653 }
1654 if ( aSign ) {
1655 z = - z;
1656 }
1657 return z;
1658
1659 }
1660
1661 /*----------------------------------------------------------------------------
1662 | Returns the result of converting the single-precision floating-point value
1663 | `a' to the 64-bit two's complement integer format. The conversion is
1664 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1665 | Arithmetic---which means in particular that the conversion is rounded
1666 | according to the current rounding mode. If `a' is a NaN, the largest
1667 | positive integer is returned. Otherwise, if the conversion overflows, the
1668 | largest integer with the same sign as `a' is returned.
1669 *----------------------------------------------------------------------------*/
1670
1671 int64_t float32_to_int64(float32 a, float_status *status)
1672 {
1673 flag aSign;
1674 int aExp;
1675 int shiftCount;
1676 uint32_t aSig;
1677 uint64_t aSig64, aSigExtra;
1678 a = float32_squash_input_denormal(a, status);
1679
1680 aSig = extractFloat32Frac( a );
1681 aExp = extractFloat32Exp( a );
1682 aSign = extractFloat32Sign( a );
1683 shiftCount = 0xBE - aExp;
1684 if ( shiftCount < 0 ) {
1685 float_raise(float_flag_invalid, status);
1686 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1687 return LIT64( 0x7FFFFFFFFFFFFFFF );
1688 }
1689 return (int64_t) LIT64( 0x8000000000000000 );
1690 }
1691 if ( aExp ) aSig |= 0x00800000;
1692 aSig64 = aSig;
1693 aSig64 <<= 40;
1694 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1695 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1696
1697 }
1698
1699 /*----------------------------------------------------------------------------
1700 | Returns the result of converting the single-precision floating-point value
1701 | `a' to the 64-bit unsigned integer format. The conversion is
1702 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1703 | Arithmetic---which means in particular that the conversion is rounded
1704 | according to the current rounding mode. If `a' is a NaN, the largest
1705 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1706 | largest unsigned integer is returned. If the 'a' is negative, the result
1707 | is rounded and zero is returned; values that do not round to zero will
1708 | raise the inexact exception flag.
1709 *----------------------------------------------------------------------------*/
1710
1711 uint64_t float32_to_uint64(float32 a, float_status *status)
1712 {
1713 flag aSign;
1714 int aExp;
1715 int shiftCount;
1716 uint32_t aSig;
1717 uint64_t aSig64, aSigExtra;
1718 a = float32_squash_input_denormal(a, status);
1719
1720 aSig = extractFloat32Frac(a);
1721 aExp = extractFloat32Exp(a);
1722 aSign = extractFloat32Sign(a);
1723 if ((aSign) && (aExp > 126)) {
1724 float_raise(float_flag_invalid, status);
1725 if (float32_is_any_nan(a)) {
1726 return LIT64(0xFFFFFFFFFFFFFFFF);
1727 } else {
1728 return 0;
1729 }
1730 }
1731 shiftCount = 0xBE - aExp;
1732 if (aExp) {
1733 aSig |= 0x00800000;
1734 }
1735 if (shiftCount < 0) {
1736 float_raise(float_flag_invalid, status);
1737 return LIT64(0xFFFFFFFFFFFFFFFF);
1738 }
1739
1740 aSig64 = aSig;
1741 aSig64 <<= 40;
1742 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1743 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1744 }
1745
1746 /*----------------------------------------------------------------------------
1747 | Returns the result of converting the single-precision floating-point value
1748 | `a' to the 64-bit unsigned integer format. The conversion is
1749 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1750 | Arithmetic, except that the conversion is always rounded toward zero. If
1751 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1752 | conversion overflows, the largest unsigned integer is returned. If the
1753 | 'a' is negative, the result is rounded and zero is returned; values that do
1754 | not round to zero will raise the inexact flag.
1755 *----------------------------------------------------------------------------*/
1756
1757 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1758 {
1759 signed char current_rounding_mode = status->float_rounding_mode;
1760 set_float_rounding_mode(float_round_to_zero, status);
1761 int64_t v = float32_to_uint64(a, status);
1762 set_float_rounding_mode(current_rounding_mode, status);
1763 return v;
1764 }
1765
1766 /*----------------------------------------------------------------------------
1767 | Returns the result of converting the single-precision floating-point value
1768 | `a' to the 64-bit two's complement integer format. The conversion is
1769 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1770 | Arithmetic, except that the conversion is always rounded toward zero. If
1771 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1772 | conversion overflows, the largest integer with the same sign as `a' is
1773 | returned.
1774 *----------------------------------------------------------------------------*/
1775
1776 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1777 {
1778 flag aSign;
1779 int aExp;
1780 int shiftCount;
1781 uint32_t aSig;
1782 uint64_t aSig64;
1783 int64_t z;
1784 a = float32_squash_input_denormal(a, status);
1785
1786 aSig = extractFloat32Frac( a );
1787 aExp = extractFloat32Exp( a );
1788 aSign = extractFloat32Sign( a );
1789 shiftCount = aExp - 0xBE;
1790 if ( 0 <= shiftCount ) {
1791 if ( float32_val(a) != 0xDF000000 ) {
1792 float_raise(float_flag_invalid, status);
1793 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1794 return LIT64( 0x7FFFFFFFFFFFFFFF );
1795 }
1796 }
1797 return (int64_t) LIT64( 0x8000000000000000 );
1798 }
1799 else if ( aExp <= 0x7E ) {
1800 if (aExp | aSig) {
1801 status->float_exception_flags |= float_flag_inexact;
1802 }
1803 return 0;
1804 }
1805 aSig64 = aSig | 0x00800000;
1806 aSig64 <<= 40;
1807 z = aSig64>>( - shiftCount );
1808 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1809 status->float_exception_flags |= float_flag_inexact;
1810 }
1811 if ( aSign ) z = - z;
1812 return z;
1813
1814 }
1815
1816 /*----------------------------------------------------------------------------
1817 | Returns the result of converting the single-precision floating-point value
1818 | `a' to the double-precision floating-point format. The conversion is
1819 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1820 | Arithmetic.
1821 *----------------------------------------------------------------------------*/
1822
1823 float64 float32_to_float64(float32 a, float_status *status)
1824 {
1825 flag aSign;
1826 int aExp;
1827 uint32_t aSig;
1828 a = float32_squash_input_denormal(a, status);
1829
1830 aSig = extractFloat32Frac( a );
1831 aExp = extractFloat32Exp( a );
1832 aSign = extractFloat32Sign( a );
1833 if ( aExp == 0xFF ) {
1834 if (aSig) {
1835 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1836 }
1837 return packFloat64( aSign, 0x7FF, 0 );
1838 }
1839 if ( aExp == 0 ) {
1840 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1841 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1842 --aExp;
1843 }
1844 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1845
1846 }
1847
1848 /*----------------------------------------------------------------------------
1849 | Returns the result of converting the single-precision floating-point value
1850 | `a' to the extended double-precision floating-point format. The conversion
1851 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1852 | Arithmetic.
1853 *----------------------------------------------------------------------------*/
1854
1855 floatx80 float32_to_floatx80(float32 a, float_status *status)
1856 {
1857 flag aSign;
1858 int aExp;
1859 uint32_t aSig;
1860
1861 a = float32_squash_input_denormal(a, status);
1862 aSig = extractFloat32Frac( a );
1863 aExp = extractFloat32Exp( a );
1864 aSign = extractFloat32Sign( a );
1865 if ( aExp == 0xFF ) {
1866 if (aSig) {
1867 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1868 }
1869 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1870 }
1871 if ( aExp == 0 ) {
1872 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1873 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1874 }
1875 aSig |= 0x00800000;
1876 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1877
1878 }
1879
1880 /*----------------------------------------------------------------------------
1881 | Returns the result of converting the single-precision floating-point value
1882 | `a' to the double-precision floating-point format. The conversion is
1883 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1884 | Arithmetic.
1885 *----------------------------------------------------------------------------*/
1886
1887 float128 float32_to_float128(float32 a, float_status *status)
1888 {
1889 flag aSign;
1890 int aExp;
1891 uint32_t aSig;
1892
1893 a = float32_squash_input_denormal(a, status);
1894 aSig = extractFloat32Frac( a );
1895 aExp = extractFloat32Exp( a );
1896 aSign = extractFloat32Sign( a );
1897 if ( aExp == 0xFF ) {
1898 if (aSig) {
1899 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1900 }
1901 return packFloat128( aSign, 0x7FFF, 0, 0 );
1902 }
1903 if ( aExp == 0 ) {
1904 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1905 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1906 --aExp;
1907 }
1908 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1909
1910 }
1911
1912 /*----------------------------------------------------------------------------
1913 | Rounds the single-precision floating-point value `a' to an integer, and
1914 | returns the result as a single-precision floating-point value. The
1915 | operation is performed according to the IEC/IEEE Standard for Binary
1916 | Floating-Point Arithmetic.
1917 *----------------------------------------------------------------------------*/
1918
1919 float32 float32_round_to_int(float32 a, float_status *status)
1920 {
1921 flag aSign;
1922 int aExp;
1923 uint32_t lastBitMask, roundBitsMask;
1924 uint32_t z;
1925 a = float32_squash_input_denormal(a, status);
1926
1927 aExp = extractFloat32Exp( a );
1928 if ( 0x96 <= aExp ) {
1929 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1930 return propagateFloat32NaN(a, a, status);
1931 }
1932 return a;
1933 }
1934 if ( aExp <= 0x7E ) {
1935 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1936 status->float_exception_flags |= float_flag_inexact;
1937 aSign = extractFloat32Sign( a );
1938 switch (status->float_rounding_mode) {
1939 case float_round_nearest_even:
1940 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1941 return packFloat32( aSign, 0x7F, 0 );
1942 }
1943 break;
1944 case float_round_ties_away:
1945 if (aExp == 0x7E) {
1946 return packFloat32(aSign, 0x7F, 0);
1947 }
1948 break;
1949 case float_round_down:
1950 return make_float32(aSign ? 0xBF800000 : 0);
1951 case float_round_up:
1952 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1953 }
1954 return packFloat32( aSign, 0, 0 );
1955 }
1956 lastBitMask = 1;
1957 lastBitMask <<= 0x96 - aExp;
1958 roundBitsMask = lastBitMask - 1;
1959 z = float32_val(a);
1960 switch (status->float_rounding_mode) {
1961 case float_round_nearest_even:
1962 z += lastBitMask>>1;
1963 if ((z & roundBitsMask) == 0) {
1964 z &= ~lastBitMask;
1965 }
1966 break;
1967 case float_round_ties_away:
1968 z += lastBitMask >> 1;
1969 break;
1970 case float_round_to_zero:
1971 break;
1972 case float_round_up:
1973 if (!extractFloat32Sign(make_float32(z))) {
1974 z += roundBitsMask;
1975 }
1976 break;
1977 case float_round_down:
1978 if (extractFloat32Sign(make_float32(z))) {
1979 z += roundBitsMask;
1980 }
1981 break;
1982 default:
1983 abort();
1984 }
1985 z &= ~ roundBitsMask;
1986 if (z != float32_val(a)) {
1987 status->float_exception_flags |= float_flag_inexact;
1988 }
1989 return make_float32(z);
1990
1991 }
1992
1993 /*----------------------------------------------------------------------------
1994 | Returns the result of adding the absolute values of the single-precision
1995 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1996 | before being returned. `zSign' is ignored if the result is a NaN.
1997 | The addition is performed according to the IEC/IEEE Standard for Binary
1998 | Floating-Point Arithmetic.
1999 *----------------------------------------------------------------------------*/
2000
2001 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2002 float_status *status)
2003 {
2004 int aExp, bExp, zExp;
2005 uint32_t aSig, bSig, zSig;
2006 int expDiff;
2007
2008 aSig = extractFloat32Frac( a );
2009 aExp = extractFloat32Exp( a );
2010 bSig = extractFloat32Frac( b );
2011 bExp = extractFloat32Exp( b );
2012 expDiff = aExp - bExp;
2013 aSig <<= 6;
2014 bSig <<= 6;
2015 if ( 0 < expDiff ) {
2016 if ( aExp == 0xFF ) {
2017 if (aSig) {
2018 return propagateFloat32NaN(a, b, status);
2019 }
2020 return a;
2021 }
2022 if ( bExp == 0 ) {
2023 --expDiff;
2024 }
2025 else {
2026 bSig |= 0x20000000;
2027 }
2028 shift32RightJamming( bSig, expDiff, &bSig );
2029 zExp = aExp;
2030 }
2031 else if ( expDiff < 0 ) {
2032 if ( bExp == 0xFF ) {
2033 if (bSig) {
2034 return propagateFloat32NaN(a, b, status);
2035 }
2036 return packFloat32( zSign, 0xFF, 0 );
2037 }
2038 if ( aExp == 0 ) {
2039 ++expDiff;
2040 }
2041 else {
2042 aSig |= 0x20000000;
2043 }
2044 shift32RightJamming( aSig, - expDiff, &aSig );
2045 zExp = bExp;
2046 }
2047 else {
2048 if ( aExp == 0xFF ) {
2049 if (aSig | bSig) {
2050 return propagateFloat32NaN(a, b, status);
2051 }
2052 return a;
2053 }
2054 if ( aExp == 0 ) {
2055 if (status->flush_to_zero) {
2056 if (aSig | bSig) {
2057 float_raise(float_flag_output_denormal, status);
2058 }
2059 return packFloat32(zSign, 0, 0);
2060 }
2061 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2062 }
2063 zSig = 0x40000000 + aSig + bSig;
2064 zExp = aExp;
2065 goto roundAndPack;
2066 }
2067 aSig |= 0x20000000;
2068 zSig = ( aSig + bSig )<<1;
2069 --zExp;
2070 if ( (int32_t) zSig < 0 ) {
2071 zSig = aSig + bSig;
2072 ++zExp;
2073 }
2074 roundAndPack:
2075 return roundAndPackFloat32(zSign, zExp, zSig, status);
2076
2077 }
2078
2079 /*----------------------------------------------------------------------------
2080 | Returns the result of subtracting the absolute values of the single-
2081 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2082 | difference is negated before being returned. `zSign' is ignored if the
2083 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2084 | Standard for Binary Floating-Point Arithmetic.
2085 *----------------------------------------------------------------------------*/
2086
2087 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2088 float_status *status)
2089 {
2090 int aExp, bExp, zExp;
2091 uint32_t aSig, bSig, zSig;
2092 int expDiff;
2093
2094 aSig = extractFloat32Frac( a );
2095 aExp = extractFloat32Exp( a );
2096 bSig = extractFloat32Frac( b );
2097 bExp = extractFloat32Exp( b );
2098 expDiff = aExp - bExp;
2099 aSig <<= 7;
2100 bSig <<= 7;
2101 if ( 0 < expDiff ) goto aExpBigger;
2102 if ( expDiff < 0 ) goto bExpBigger;
2103 if ( aExp == 0xFF ) {
2104 if (aSig | bSig) {
2105 return propagateFloat32NaN(a, b, status);
2106 }
2107 float_raise(float_flag_invalid, status);
2108 return float32_default_nan(status);
2109 }
2110 if ( aExp == 0 ) {
2111 aExp = 1;
2112 bExp = 1;
2113 }
2114 if ( bSig < aSig ) goto aBigger;
2115 if ( aSig < bSig ) goto bBigger;
2116 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2117 bExpBigger:
2118 if ( bExp == 0xFF ) {
2119 if (bSig) {
2120 return propagateFloat32NaN(a, b, status);
2121 }
2122 return packFloat32( zSign ^ 1, 0xFF, 0 );
2123 }
2124 if ( aExp == 0 ) {
2125 ++expDiff;
2126 }
2127 else {
2128 aSig |= 0x40000000;
2129 }
2130 shift32RightJamming( aSig, - expDiff, &aSig );
2131 bSig |= 0x40000000;
2132 bBigger:
2133 zSig = bSig - aSig;
2134 zExp = bExp;
2135 zSign ^= 1;
2136 goto normalizeRoundAndPack;
2137 aExpBigger:
2138 if ( aExp == 0xFF ) {
2139 if (aSig) {
2140 return propagateFloat32NaN(a, b, status);
2141 }
2142 return a;
2143 }
2144 if ( bExp == 0 ) {
2145 --expDiff;
2146 }
2147 else {
2148 bSig |= 0x40000000;
2149 }
2150 shift32RightJamming( bSig, expDiff, &bSig );
2151 aSig |= 0x40000000;
2152 aBigger:
2153 zSig = aSig - bSig;
2154 zExp = aExp;
2155 normalizeRoundAndPack:
2156 --zExp;
2157 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2158
2159 }
2160
2161 /*----------------------------------------------------------------------------
2162 | Returns the result of adding the single-precision floating-point values `a'
2163 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2164 | Binary Floating-Point Arithmetic.
2165 *----------------------------------------------------------------------------*/
2166
2167 float32 float32_add(float32 a, float32 b, float_status *status)
2168 {
2169 flag aSign, bSign;
2170 a = float32_squash_input_denormal(a, status);
2171 b = float32_squash_input_denormal(b, status);
2172
2173 aSign = extractFloat32Sign( a );
2174 bSign = extractFloat32Sign( b );
2175 if ( aSign == bSign ) {
2176 return addFloat32Sigs(a, b, aSign, status);
2177 }
2178 else {
2179 return subFloat32Sigs(a, b, aSign, status);
2180 }
2181
2182 }
2183
2184 /*----------------------------------------------------------------------------
2185 | Returns the result of subtracting the single-precision floating-point values
2186 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2187 | for Binary Floating-Point Arithmetic.
2188 *----------------------------------------------------------------------------*/
2189
2190 float32 float32_sub(float32 a, float32 b, float_status *status)
2191 {
2192 flag aSign, bSign;
2193 a = float32_squash_input_denormal(a, status);
2194 b = float32_squash_input_denormal(b, status);
2195
2196 aSign = extractFloat32Sign( a );
2197 bSign = extractFloat32Sign( b );
2198 if ( aSign == bSign ) {
2199 return subFloat32Sigs(a, b, aSign, status);
2200 }
2201 else {
2202 return addFloat32Sigs(a, b, aSign, status);
2203 }
2204
2205 }
2206
2207 /*----------------------------------------------------------------------------
2208 | Returns the result of multiplying the single-precision floating-point values
2209 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2210 | for Binary Floating-Point Arithmetic.
2211 *----------------------------------------------------------------------------*/
2212
2213 float32 float32_mul(float32 a, float32 b, float_status *status)
2214 {
2215 flag aSign, bSign, zSign;
2216 int aExp, bExp, zExp;
2217 uint32_t aSig, bSig;
2218 uint64_t zSig64;
2219 uint32_t zSig;
2220
2221 a = float32_squash_input_denormal(a, status);
2222 b = float32_squash_input_denormal(b, status);
2223
2224 aSig = extractFloat32Frac( a );
2225 aExp = extractFloat32Exp( a );
2226 aSign = extractFloat32Sign( a );
2227 bSig = extractFloat32Frac( b );
2228 bExp = extractFloat32Exp( b );
2229 bSign = extractFloat32Sign( b );
2230 zSign = aSign ^ bSign;
2231 if ( aExp == 0xFF ) {
2232 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2233 return propagateFloat32NaN(a, b, status);
2234 }
2235 if ( ( bExp | bSig ) == 0 ) {
2236 float_raise(float_flag_invalid, status);
2237 return float32_default_nan(status);
2238 }
2239 return packFloat32( zSign, 0xFF, 0 );
2240 }
2241 if ( bExp == 0xFF ) {
2242 if (bSig) {
2243 return propagateFloat32NaN(a, b, status);
2244 }
2245 if ( ( aExp | aSig ) == 0 ) {
2246 float_raise(float_flag_invalid, status);
2247 return float32_default_nan(status);
2248 }
2249 return packFloat32( zSign, 0xFF, 0 );
2250 }
2251 if ( aExp == 0 ) {
2252 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2253 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2254 }
2255 if ( bExp == 0 ) {
2256 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2257 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2258 }
2259 zExp = aExp + bExp - 0x7F;
2260 aSig = ( aSig | 0x00800000 )<<7;
2261 bSig = ( bSig | 0x00800000 )<<8;
2262 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2263 zSig = zSig64;
2264 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2265 zSig <<= 1;
2266 --zExp;
2267 }
2268 return roundAndPackFloat32(zSign, zExp, zSig, status);
2269
2270 }
2271
2272 /*----------------------------------------------------------------------------
2273 | Returns the result of dividing the single-precision floating-point value `a'
2274 | by the corresponding value `b'. The operation is performed according to the
2275 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 *----------------------------------------------------------------------------*/
2277
2278 float32 float32_div(float32 a, float32 b, float_status *status)
2279 {
2280 flag aSign, bSign, zSign;
2281 int aExp, bExp, zExp;
2282 uint32_t aSig, bSig, zSig;
2283 a = float32_squash_input_denormal(a, status);
2284 b = float32_squash_input_denormal(b, status);
2285
2286 aSig = extractFloat32Frac( a );
2287 aExp = extractFloat32Exp( a );
2288 aSign = extractFloat32Sign( a );
2289 bSig = extractFloat32Frac( b );
2290 bExp = extractFloat32Exp( b );
2291 bSign = extractFloat32Sign( b );
2292 zSign = aSign ^ bSign;
2293 if ( aExp == 0xFF ) {
2294 if (aSig) {
2295 return propagateFloat32NaN(a, b, status);
2296 }
2297 if ( bExp == 0xFF ) {
2298 if (bSig) {
2299 return propagateFloat32NaN(a, b, status);
2300 }
2301 float_raise(float_flag_invalid, status);
2302 return float32_default_nan(status);
2303 }
2304 return packFloat32( zSign, 0xFF, 0 );
2305 }
2306 if ( bExp == 0xFF ) {
2307 if (bSig) {
2308 return propagateFloat32NaN(a, b, status);
2309 }
2310 return packFloat32( zSign, 0, 0 );
2311 }
2312 if ( bExp == 0 ) {
2313 if ( bSig == 0 ) {
2314 if ( ( aExp | aSig ) == 0 ) {
2315 float_raise(float_flag_invalid, status);
2316 return float32_default_nan(status);
2317 }
2318 float_raise(float_flag_divbyzero, status);
2319 return packFloat32( zSign, 0xFF, 0 );
2320 }
2321 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2322 }
2323 if ( aExp == 0 ) {
2324 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2325 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2326 }
2327 zExp = aExp - bExp + 0x7D;
2328 aSig = ( aSig | 0x00800000 )<<7;
2329 bSig = ( bSig | 0x00800000 )<<8;
2330 if ( bSig <= ( aSig + aSig ) ) {
2331 aSig >>= 1;
2332 ++zExp;
2333 }
2334 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2335 if ( ( zSig & 0x3F ) == 0 ) {
2336 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2337 }
2338 return roundAndPackFloat32(zSign, zExp, zSig, status);
2339
2340 }
2341
2342 /*----------------------------------------------------------------------------
2343 | Returns the remainder of the single-precision floating-point value `a'
2344 | with respect to the corresponding value `b'. The operation is performed
2345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2346 *----------------------------------------------------------------------------*/
2347
2348 float32 float32_rem(float32 a, float32 b, float_status *status)
2349 {
2350 flag aSign, zSign;
2351 int aExp, bExp, expDiff;
2352 uint32_t aSig, bSig;
2353 uint32_t q;
2354 uint64_t aSig64, bSig64, q64;
2355 uint32_t alternateASig;
2356 int32_t sigMean;
2357 a = float32_squash_input_denormal(a, status);
2358 b = float32_squash_input_denormal(b, status);
2359
2360 aSig = extractFloat32Frac( a );
2361 aExp = extractFloat32Exp( a );
2362 aSign = extractFloat32Sign( a );
2363 bSig = extractFloat32Frac( b );
2364 bExp = extractFloat32Exp( b );
2365 if ( aExp == 0xFF ) {
2366 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2367 return propagateFloat32NaN(a, b, status);
2368 }
2369 float_raise(float_flag_invalid, status);
2370 return float32_default_nan(status);
2371 }
2372 if ( bExp == 0xFF ) {
2373 if (bSig) {
2374 return propagateFloat32NaN(a, b, status);
2375 }
2376 return a;
2377 }
2378 if ( bExp == 0 ) {
2379 if ( bSig == 0 ) {
2380 float_raise(float_flag_invalid, status);
2381 return float32_default_nan(status);
2382 }
2383 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2384 }
2385 if ( aExp == 0 ) {
2386 if ( aSig == 0 ) return a;
2387 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2388 }
2389 expDiff = aExp - bExp;
2390 aSig |= 0x00800000;
2391 bSig |= 0x00800000;
2392 if ( expDiff < 32 ) {
2393 aSig <<= 8;
2394 bSig <<= 8;
2395 if ( expDiff < 0 ) {
2396 if ( expDiff < -1 ) return a;
2397 aSig >>= 1;
2398 }
2399 q = ( bSig <= aSig );
2400 if ( q ) aSig -= bSig;
2401 if ( 0 < expDiff ) {
2402 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2403 q >>= 32 - expDiff;
2404 bSig >>= 2;
2405 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2406 }
2407 else {
2408 aSig >>= 2;
2409 bSig >>= 2;
2410 }
2411 }
2412 else {
2413 if ( bSig <= aSig ) aSig -= bSig;
2414 aSig64 = ( (uint64_t) aSig )<<40;
2415 bSig64 = ( (uint64_t) bSig )<<40;
2416 expDiff -= 64;
2417 while ( 0 < expDiff ) {
2418 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2419 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2420 aSig64 = - ( ( bSig * q64 )<<38 );
2421 expDiff -= 62;
2422 }
2423 expDiff += 64;
2424 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2425 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2426 q = q64>>( 64 - expDiff );
2427 bSig <<= 6;
2428 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2429 }
2430 do {
2431 alternateASig = aSig;
2432 ++q;
2433 aSig -= bSig;
2434 } while ( 0 <= (int32_t) aSig );
2435 sigMean = aSig + alternateASig;
2436 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2437 aSig = alternateASig;
2438 }
2439 zSign = ( (int32_t) aSig < 0 );
2440 if ( zSign ) aSig = - aSig;
2441 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2442 }
2443
2444 /*----------------------------------------------------------------------------
2445 | Returns the result of multiplying the single-precision floating-point values
2446 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2447 | multiplication. The operation is performed according to the IEC/IEEE
2448 | Standard for Binary Floating-Point Arithmetic 754-2008.
2449 | The flags argument allows the caller to select negation of the
2450 | addend, the intermediate product, or the final result. (The difference
2451 | between this and having the caller do a separate negation is that negating
2452 | externally will flip the sign bit on NaNs.)
2453 *----------------------------------------------------------------------------*/
2454
2455 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2456 float_status *status)
2457 {
2458 flag aSign, bSign, cSign, zSign;
2459 int aExp, bExp, cExp, pExp, zExp, expDiff;
2460 uint32_t aSig, bSig, cSig;
2461 flag pInf, pZero, pSign;
2462 uint64_t pSig64, cSig64, zSig64;
2463 uint32_t pSig;
2464 int shiftcount;
2465 flag signflip, infzero;
2466
2467 a = float32_squash_input_denormal(a, status);
2468 b = float32_squash_input_denormal(b, status);
2469 c = float32_squash_input_denormal(c, status);
2470 aSig = extractFloat32Frac(a);
2471 aExp = extractFloat32Exp(a);
2472 aSign = extractFloat32Sign(a);
2473 bSig = extractFloat32Frac(b);
2474 bExp = extractFloat32Exp(b);
2475 bSign = extractFloat32Sign(b);
2476 cSig = extractFloat32Frac(c);
2477 cExp = extractFloat32Exp(c);
2478 cSign = extractFloat32Sign(c);
2479
2480 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2481 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2482
2483 /* It is implementation-defined whether the cases of (0,inf,qnan)
2484 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2485 * they return if they do), so we have to hand this information
2486 * off to the target-specific pick-a-NaN routine.
2487 */
2488 if (((aExp == 0xff) && aSig) ||
2489 ((bExp == 0xff) && bSig) ||
2490 ((cExp == 0xff) && cSig)) {
2491 return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2492 }
2493
2494 if (infzero) {
2495 float_raise(float_flag_invalid, status);
2496 return float32_default_nan(status);
2497 }
2498
2499 if (flags & float_muladd_negate_c) {
2500 cSign ^= 1;
2501 }
2502
2503 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2504
2505 /* Work out the sign and type of the product */
2506 pSign = aSign ^ bSign;
2507 if (flags & float_muladd_negate_product) {
2508 pSign ^= 1;
2509 }
2510 pInf = (aExp == 0xff) || (bExp == 0xff);
2511 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2512
2513 if (cExp == 0xff) {
2514 if (pInf && (pSign ^ cSign)) {
2515 /* addition of opposite-signed infinities => InvalidOperation */
2516 float_raise(float_flag_invalid, status);
2517 return float32_default_nan(status);
2518 }
2519 /* Otherwise generate an infinity of the same sign */
2520 return packFloat32(cSign ^ signflip, 0xff, 0);
2521 }
2522
2523 if (pInf) {
2524 return packFloat32(pSign ^ signflip, 0xff, 0);
2525 }
2526
2527 if (pZero) {
2528 if (cExp == 0) {
2529 if (cSig == 0) {
2530 /* Adding two exact zeroes */
2531 if (pSign == cSign) {
2532 zSign = pSign;
2533 } else if (status->float_rounding_mode == float_round_down) {
2534 zSign = 1;
2535 } else {
2536 zSign = 0;
2537 }
2538 return packFloat32(zSign ^ signflip, 0, 0);
2539 }
2540 /* Exact zero plus a denorm */
2541 if (status->flush_to_zero) {
2542 float_raise(float_flag_output_denormal, status);
2543 return packFloat32(cSign ^ signflip, 0, 0);
2544 }
2545 }
2546 /* Zero plus something non-zero : just return the something */
2547 if (flags & float_muladd_halve_result) {
2548 if (cExp == 0) {
2549 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2550 }
2551 /* Subtract one to halve, and one again because roundAndPackFloat32
2552 * wants one less than the true exponent.
2553 */
2554 cExp -= 2;
2555 cSig = (cSig | 0x00800000) << 7;
2556 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2557 }
2558 return packFloat32(cSign ^ signflip, cExp, cSig);
2559 }
2560
2561 if (aExp == 0) {
2562 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2563 }
2564 if (bExp == 0) {
2565 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2566 }
2567
2568 /* Calculate the actual result a * b + c */
2569
2570 /* Multiply first; this is easy. */
2571 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2572 * because we want the true exponent, not the "one-less-than"
2573 * flavour that roundAndPackFloat32() takes.
2574 */
2575 pExp = aExp + bExp - 0x7e;
2576 aSig = (aSig | 0x00800000) << 7;
2577 bSig = (bSig | 0x00800000) << 8;
2578 pSig64 = (uint64_t)aSig * bSig;
2579 if ((int64_t)(pSig64 << 1) >= 0) {
2580 pSig64 <<= 1;
2581 pExp--;
2582 }
2583
2584 zSign = pSign ^ signflip;
2585
2586 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2587 * position 62.
2588 */
2589 if (cExp == 0) {
2590 if (!cSig) {
2591 /* Throw out the special case of c being an exact zero now */
2592 shift64RightJamming(pSig64, 32, &pSig64);
2593 pSig = pSig64;
2594 if (flags & float_muladd_halve_result) {
2595 pExp--;
2596 }
2597 return roundAndPackFloat32(zSign, pExp - 1,
2598 pSig, status);
2599 }
2600 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2601 }
2602
2603 cSig64 = (uint64_t)cSig << (62 - 23);
2604 cSig64 |= LIT64(0x4000000000000000);
2605 expDiff = pExp - cExp;
2606
2607 if (pSign == cSign) {
2608 /* Addition */
2609 if (expDiff > 0) {
2610 /* scale c to match p */
2611 shift64RightJamming(cSig64, expDiff, &cSig64);
2612 zExp = pExp;
2613 } else if (expDiff < 0) {
2614 /* scale p to match c */
2615 shift64RightJamming(pSig64, -expDiff, &pSig64);
2616 zExp = cExp;
2617 } else {
2618 /* no scaling needed */
2619 zExp = cExp;
2620 }
2621 /* Add significands and make sure explicit bit ends up in posn 62 */
2622 zSig64 = pSig64 + cSig64;
2623 if ((int64_t)zSig64 < 0) {
2624 shift64RightJamming(zSig64, 1, &zSig64);
2625 } else {
2626 zExp--;
2627 }
2628 } else {
2629 /* Subtraction */
2630 if (expDiff > 0) {
2631 shift64RightJamming(cSig64, expDiff, &cSig64);
2632 zSig64 = pSig64 - cSig64;
2633 zExp = pExp;
2634 } else if (expDiff < 0) {
2635 shift64RightJamming(pSig64, -expDiff, &pSig64);
2636 zSig64 = cSig64 - pSig64;
2637 zExp = cExp;
2638 zSign ^= 1;
2639 } else {
2640 zExp = pExp;
2641 if (cSig64 < pSig64) {
2642 zSig64 = pSig64 - cSig64;
2643 } else if (pSig64 < cSig64) {
2644 zSig64 = cSig64 - pSig64;
2645 zSign ^= 1;
2646 } else {
2647 /* Exact zero */
2648 zSign = signflip;
2649 if (status->float_rounding_mode == float_round_down) {
2650 zSign ^= 1;
2651 }
2652 return packFloat32(zSign, 0, 0);
2653 }
2654 }
2655 --zExp;
2656 /* Normalize to put the explicit bit back into bit 62. */
2657 shiftcount = countLeadingZeros64(zSig64) - 1;
2658 zSig64 <<= shiftcount;
2659 zExp -= shiftcount;
2660 }
2661 if (flags & float_muladd_halve_result) {
2662 zExp--;
2663 }
2664
2665 shift64RightJamming(zSig64, 32, &zSig64);
2666 return roundAndPackFloat32(zSign, zExp, zSig64, status);
2667 }
2668
2669
2670 /*----------------------------------------------------------------------------
2671 | Returns the square root of the single-precision floating-point value `a'.
2672 | The operation is performed according to the IEC/IEEE Standard for Binary
2673 | Floating-Point Arithmetic.
2674 *----------------------------------------------------------------------------*/
2675
2676 float32 float32_sqrt(float32 a, float_status *status)
2677 {
2678 flag aSign;
2679 int aExp, zExp;
2680 uint32_t aSig, zSig;
2681 uint64_t rem, term;
2682 a = float32_squash_input_denormal(a, status);
2683
2684 aSig = extractFloat32Frac( a );
2685 aExp = extractFloat32Exp( a );
2686 aSign = extractFloat32Sign( a );
2687 if ( aExp == 0xFF ) {
2688 if (aSig) {
2689 return propagateFloat32NaN(a, float32_zero, status);
2690 }
2691 if ( ! aSign ) return a;
2692 float_raise(float_flag_invalid, status);
2693 return float32_default_nan(status);
2694 }
2695 if ( aSign ) {
2696 if ( ( aExp | aSig ) == 0 ) return a;
2697 float_raise(float_flag_invalid, status);
2698 return float32_default_nan(status);
2699 }
2700 if ( aExp == 0 ) {
2701 if ( aSig == 0 ) return float32_zero;
2702 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2703 }
2704 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2705 aSig = ( aSig | 0x00800000 )<<8;
2706 zSig = estimateSqrt32( aExp, aSig ) + 2;
2707 if ( ( zSig & 0x7F ) <= 5 ) {
2708 if ( zSig < 2 ) {
2709 zSig = 0x7FFFFFFF;
2710 goto roundAndPack;
2711 }
2712 aSig >>= aExp & 1;
2713 term = ( (uint64_t) zSig ) * zSig;
2714 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2715 while ( (int64_t) rem < 0 ) {
2716 --zSig;
2717 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2718 }
2719 zSig |= ( rem != 0 );
2720 }
2721 shift32RightJamming( zSig, 1, &zSig );
2722 roundAndPack:
2723 return roundAndPackFloat32(0, zExp, zSig, status);
2724
2725 }
2726
2727 /*----------------------------------------------------------------------------
2728 | Returns the binary exponential of the single-precision floating-point value
2729 | `a'. The operation is performed according to the IEC/IEEE Standard for
2730 | Binary Floating-Point Arithmetic.
2731 |
2732 | Uses the following identities:
2733 |
2734 | 1. -------------------------------------------------------------------------
2735 | x x*ln(2)
2736 | 2 = e
2737 |
2738 | 2. -------------------------------------------------------------------------
2739 | 2 3 4 5 n
2740 | x x x x x x x
2741 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2742 | 1! 2! 3! 4! 5! n!
2743 *----------------------------------------------------------------------------*/
2744
2745 static const float64 float32_exp2_coefficients[15] =
2746 {
2747 const_float64( 0x3ff0000000000000ll ), /* 1 */
2748 const_float64( 0x3fe0000000000000ll ), /* 2 */
2749 const_float64( 0x3fc5555555555555ll ), /* 3 */
2750 const_float64( 0x3fa5555555555555ll ), /* 4 */
2751 const_float64( 0x3f81111111111111ll ), /* 5 */
2752 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2753 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2754 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2755 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2756 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2757 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2758 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2759 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2760 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2761 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2762 };
2763
2764 float32 float32_exp2(float32 a, float_status *status)
2765 {
2766 flag aSign;
2767 int aExp;
2768 uint32_t aSig;
2769 float64 r, x, xn;
2770 int i;
2771 a = float32_squash_input_denormal(a, status);
2772
2773 aSig = extractFloat32Frac( a );
2774 aExp = extractFloat32Exp( a );
2775 aSign = extractFloat32Sign( a );
2776
2777 if ( aExp == 0xFF) {
2778 if (aSig) {
2779 return propagateFloat32NaN(a, float32_zero, status);
2780 }
2781 return (aSign) ? float32_zero : a;
2782 }
2783 if (aExp == 0) {
2784 if (aSig == 0) return float32_one;
2785 }
2786
2787 float_raise(float_flag_inexact, status);
2788
2789 /* ******************************* */
2790 /* using float64 for approximation */
2791 /* ******************************* */
2792 x = float32_to_float64(a, status);
2793 x = float64_mul(x, float64_ln2, status);
2794
2795 xn = x;
2796 r = float64_one;
2797 for (i = 0 ; i < 15 ; i++) {
2798 float64 f;
2799
2800 f = float64_mul(xn, float32_exp2_coefficients[i], status);
2801 r = float64_add(r, f, status);
2802
2803 xn = float64_mul(xn, x, status);
2804 }
2805
2806 return float64_to_float32(r, status);
2807 }
2808
2809 /*----------------------------------------------------------------------------
2810 | Returns the binary log of the single-precision floating-point value `a'.
2811 | The operation is performed according to the IEC/IEEE Standard for Binary
2812 | Floating-Point Arithmetic.
2813 *----------------------------------------------------------------------------*/
2814 float32 float32_log2(float32 a, float_status *status)
2815 {
2816 flag aSign, zSign;
2817 int aExp;
2818 uint32_t aSig, zSig, i;
2819
2820 a = float32_squash_input_denormal(a, status);
2821 aSig = extractFloat32Frac( a );
2822 aExp = extractFloat32Exp( a );
2823 aSign = extractFloat32Sign( a );
2824
2825 if ( aExp == 0 ) {
2826 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2827 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2828 }
2829 if ( aSign ) {
2830 float_raise(float_flag_invalid, status);
2831 return float32_default_nan(status);
2832 }
2833 if ( aExp == 0xFF ) {
2834 if (aSig) {
2835 return propagateFloat32NaN(a, float32_zero, status);
2836 }
2837 return a;
2838 }
2839
2840 aExp -= 0x7F;
2841 aSig |= 0x00800000;
2842 zSign = aExp < 0;
2843 zSig = aExp << 23;
2844
2845 for (i = 1 << 22; i > 0; i >>= 1) {
2846 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2847 if ( aSig & 0x01000000 ) {
2848 aSig >>= 1;
2849 zSig |= i;
2850 }
2851 }
2852
2853 if ( zSign )
2854 zSig = -zSig;
2855
2856 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2857 }
2858
2859 /*----------------------------------------------------------------------------
2860 | Returns 1 if the single-precision floating-point value `a' is equal to
2861 | the corresponding value `b', and 0 otherwise. The invalid exception is
2862 | raised if either operand is a NaN. Otherwise, the comparison is performed
2863 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2865
2866 int float32_eq(float32 a, float32 b, float_status *status)
2867 {
2868 uint32_t av, bv;
2869 a = float32_squash_input_denormal(a, status);
2870 b = float32_squash_input_denormal(b, status);
2871
2872 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2873 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2874 ) {
2875 float_raise(float_flag_invalid, status);
2876 return 0;
2877 }
2878 av = float32_val(a);
2879 bv = float32_val(b);
2880 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2881 }
2882
2883 /*----------------------------------------------------------------------------
2884 | Returns 1 if the single-precision floating-point value `a' is less than
2885 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2886 | exception is raised if either operand is a NaN. The comparison is performed
2887 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2888 *----------------------------------------------------------------------------*/
2889
2890 int float32_le(float32 a, float32 b, float_status *status)
2891 {
2892 flag aSign, bSign;
2893 uint32_t av, bv;
2894 a = float32_squash_input_denormal(a, status);
2895 b = float32_squash_input_denormal(b, status);
2896
2897 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2898 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2899 ) {
2900 float_raise(float_flag_invalid, status);
2901 return 0;
2902 }
2903 aSign = extractFloat32Sign( a );
2904 bSign = extractFloat32Sign( b );
2905 av = float32_val(a);
2906 bv = float32_val(b);
2907 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2908 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2909
2910 }
2911
2912 /*----------------------------------------------------------------------------
2913 | Returns 1 if the single-precision floating-point value `a' is less than
2914 | the corresponding value `b', and 0 otherwise. The invalid exception is
2915 | raised if either operand is a NaN. The comparison is performed according
2916 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917 *----------------------------------------------------------------------------*/
2918
2919 int float32_lt(float32 a, float32 b, float_status *status)
2920 {
2921 flag aSign, bSign;
2922 uint32_t av, bv;
2923 a = float32_squash_input_denormal(a, status);
2924 b = float32_squash_input_denormal(b, status);
2925
2926 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2927 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2928 ) {
2929 float_raise(float_flag_invalid, status);
2930 return 0;
2931 }
2932 aSign = extractFloat32Sign( a );
2933 bSign = extractFloat32Sign( b );
2934 av = float32_val(a);
2935 bv = float32_val(b);
2936 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2937 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2938
2939 }
2940
2941 /*----------------------------------------------------------------------------
2942 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2943 | be compared, and 0 otherwise. The invalid exception is raised if either
2944 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2945 | Standard for Binary Floating-Point Arithmetic.
2946 *----------------------------------------------------------------------------*/
2947
2948 int float32_unordered(float32 a, float32 b, float_status *status)
2949 {
2950 a = float32_squash_input_denormal(a, status);
2951 b = float32_squash_input_denormal(b, status);
2952
2953 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2954 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2955 ) {
2956 float_raise(float_flag_invalid, status);
2957 return 1;
2958 }
2959 return 0;
2960 }
2961
2962 /*----------------------------------------------------------------------------
2963 | Returns 1 if the single-precision floating-point value `a' is equal to
2964 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2965 | exception. The comparison is performed according to the IEC/IEEE Standard
2966 | for Binary Floating-Point Arithmetic.
2967 *----------------------------------------------------------------------------*/
2968
2969 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2970 {
2971 a = float32_squash_input_denormal(a, status);
2972 b = float32_squash_input_denormal(b, status);
2973
2974 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2975 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2976 ) {
2977 if (float32_is_signaling_nan(a, status)
2978 || float32_is_signaling_nan(b, status)) {
2979 float_raise(float_flag_invalid, status);
2980 }
2981 return 0;
2982 }
2983 return ( float32_val(a) == float32_val(b) ) ||
2984 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2985 }
2986
2987 /*----------------------------------------------------------------------------
2988 | Returns 1 if the single-precision floating-point value `a' is less than or
2989 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2990 | cause an exception. Otherwise, the comparison is performed according to the
2991 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2992 *----------------------------------------------------------------------------*/
2993
2994 int float32_le_quiet(float32 a, float32 b, float_status *status)
2995 {
2996 flag aSign, bSign;
2997 uint32_t av, bv;
2998 a = float32_squash_input_denormal(a, status);
2999 b = float32_squash_input_denormal(b, status);
3000
3001 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3002 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3003 ) {
3004 if (float32_is_signaling_nan(a, status)
3005 || float32_is_signaling_nan(b, status)) {
3006 float_raise(float_flag_invalid, status);
3007 }
3008 return 0;
3009 }
3010 aSign = extractFloat32Sign( a );
3011 bSign = extractFloat32Sign( b );
3012 av = float32_val(a);
3013 bv = float32_val(b);
3014 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3015 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3016
3017 }
3018
3019 /*----------------------------------------------------------------------------
3020 | Returns 1 if the single-precision floating-point value `a' is less than
3021 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3022 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3023 | Standard for Binary Floating-Point Arithmetic.
3024 *----------------------------------------------------------------------------*/
3025
3026 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3027 {
3028 flag aSign, bSign;
3029 uint32_t av, bv;
3030 a = float32_squash_input_denormal(a, status);
3031 b = float32_squash_input_denormal(b, status);
3032
3033 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3034 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3035 ) {
3036 if (float32_is_signaling_nan(a, status)
3037 || float32_is_signaling_nan(b, status)) {
3038 float_raise(float_flag_invalid, status);
3039 }
3040 return 0;
3041 }
3042 aSign = extractFloat32Sign( a );
3043 bSign = extractFloat32Sign( b );
3044 av = float32_val(a);
3045 bv = float32_val(b);
3046 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3047 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3048
3049 }
3050
3051 /*----------------------------------------------------------------------------
3052 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3053 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3054 | comparison is performed according to the IEC/IEEE Standard for Binary
3055 | Floating-Point Arithmetic.
3056 *----------------------------------------------------------------------------*/
3057
3058 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3059 {
3060 a = float32_squash_input_denormal(a, status);
3061 b = float32_squash_input_denormal(b, status);
3062
3063 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3064 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3065 ) {
3066 if (float32_is_signaling_nan(a, status)
3067 || float32_is_signaling_nan(b, status)) {
3068 float_raise(float_flag_invalid, status);
3069 }
3070 return 1;
3071 }
3072 return 0;
3073 }
3074
3075 /*----------------------------------------------------------------------------
3076 | Returns the result of converting the double-precision floating-point value
3077 | `a' to the 32-bit two's complement integer format. The conversion is
3078 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3079 | Arithmetic---which means in particular that the conversion is rounded
3080 | according to the current rounding mode. If `a' is a NaN, the largest
3081 | positive integer is returned. Otherwise, if the conversion overflows, the
3082 | largest integer with the same sign as `a' is returned.
3083 *----------------------------------------------------------------------------*/
3084
3085 int32_t float64_to_int32(float64 a, float_status *status)
3086 {
3087 flag aSign;
3088 int aExp;
3089 int shiftCount;
3090 uint64_t aSig;
3091 a = float64_squash_input_denormal(a, status);
3092
3093 aSig = extractFloat64Frac( a );
3094 aExp = extractFloat64Exp( a );
3095 aSign = extractFloat64Sign( a );
3096 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3097 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3098 shiftCount = 0x42C - aExp;
3099 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3100 return roundAndPackInt32(aSign, aSig, status);
3101
3102 }
3103
3104 /*----------------------------------------------------------------------------
3105 | Returns the result of converting the double-precision floating-point value
3106 | `a' to the 32-bit two's complement integer format. The conversion is
3107 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3108 | Arithmetic, except that the conversion is always rounded toward zero.
3109 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3110 | the conversion overflows, the largest integer with the same sign as `a' is
3111 | returned.
3112 *----------------------------------------------------------------------------*/
3113
3114 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3115 {
3116 flag aSign;
3117 int aExp;
3118 int shiftCount;
3119 uint64_t aSig, savedASig;
3120 int32_t z;
3121 a = float64_squash_input_denormal(a, status);
3122
3123 aSig = extractFloat64Frac( a );
3124 aExp = extractFloat64Exp( a );
3125 aSign = extractFloat64Sign( a );
3126 if ( 0x41E < aExp ) {
3127 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3128 goto invalid;
3129 }
3130 else if ( aExp < 0x3FF ) {
3131 if (aExp || aSig) {
3132 status->float_exception_flags |= float_flag_inexact;
3133 }
3134 return 0;
3135 }
3136 aSig |= LIT64( 0x0010000000000000 );
3137 shiftCount = 0x433 - aExp;
3138 savedASig = aSig;
3139 aSig >>= shiftCount;
3140 z = aSig;
3141 if ( aSign ) z = - z;
3142 if ( ( z < 0 ) ^ aSign ) {
3143 invalid:
3144 float_raise(float_flag_invalid, status);
3145 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3146 }
3147 if ( ( aSig<<shiftCount ) != savedASig ) {
3148 status->float_exception_flags |= float_flag_inexact;
3149 }
3150 return z;
3151
3152 }
3153
3154 /*----------------------------------------------------------------------------
3155 | Returns the result of converting the double-precision floating-point value
3156 | `a' to the 16-bit two's complement integer format. The conversion is
3157 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3158 | Arithmetic, except that the conversion is always rounded toward zero.
3159 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3160 | the conversion overflows, the largest integer with the same sign as `a' is
3161 | returned.
3162 *----------------------------------------------------------------------------*/
3163
3164 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3165 {
3166 flag aSign;
3167 int aExp;
3168 int shiftCount;
3169 uint64_t aSig, savedASig;
3170 int32_t z;
3171
3172 aSig = extractFloat64Frac( a );
3173 aExp = extractFloat64Exp( a );
3174 aSign = extractFloat64Sign( a );
3175 if ( 0x40E < aExp ) {
3176 if ( ( aExp == 0x7FF ) && aSig ) {
3177 aSign = 0;
3178 }
3179 goto invalid;
3180 }
3181 else if ( aExp < 0x3FF ) {
3182 if ( aExp || aSig ) {
3183 status->float_exception_flags |= float_flag_inexact;
3184 }
3185 return 0;
3186 }
3187 aSig |= LIT64( 0x0010000000000000 );
3188 shiftCount = 0x433 - aExp;
3189 savedASig = aSig;
3190 aSig >>= shiftCount;
3191 z = aSig;
3192 if ( aSign ) {
3193 z = - z;
3194 }
3195 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3196 invalid:
3197 float_raise(float_flag_invalid, status);
3198 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3199 }
3200 if ( ( aSig<<shiftCount ) != savedASig ) {
3201 status->float_exception_flags |= float_flag_inexact;
3202 }
3203 return z;
3204 }
3205
3206 /*----------------------------------------------------------------------------
3207 | Returns the result of converting the double-precision floating-point value
3208 | `a' to the 64-bit two's complement integer format. The conversion is
3209 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3210 | Arithmetic---which means in particular that the conversion is rounded
3211 | according to the current rounding mode. If `a' is a NaN, the largest
3212 | positive integer is returned. Otherwise, if the conversion overflows, the
3213 | largest integer with the same sign as `a' is returned.
3214 *----------------------------------------------------------------------------*/
3215
3216 int64_t float64_to_int64(float64 a, float_status *status)
3217 {
3218 flag aSign;
3219 int aExp;
3220 int shiftCount;
3221 uint64_t aSig, aSigExtra;
3222 a = float64_squash_input_denormal(a, status);
3223
3224 aSig = extractFloat64Frac( a );
3225 aExp = extractFloat64Exp( a );
3226 aSign = extractFloat64Sign( a );
3227 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3228 shiftCount = 0x433 - aExp;
3229 if ( shiftCount <= 0 ) {
3230 if ( 0x43E < aExp ) {
3231 float_raise(float_flag_invalid, status);
3232 if ( ! aSign
3233 || ( ( aExp == 0x7FF )
3234 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3235 ) {
3236 return LIT64( 0x7FFFFFFFFFFFFFFF );
3237 }
3238 return (int64_t) LIT64( 0x8000000000000000 );
3239 }
3240 aSigExtra = 0;
3241 aSig <<= - shiftCount;
3242 }
3243 else {
3244 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3245 }
3246 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3247
3248 }
3249
3250 /*----------------------------------------------------------------------------
3251 | Returns the result of converting the double-precision floating-point value
3252 | `a' to the 64-bit two's complement integer format. The conversion is
3253 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3254 | Arithmetic, except that the conversion is always rounded toward zero.
3255 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3256 | the conversion overflows, the largest integer with the same sign as `a' is
3257 | returned.
3258 *----------------------------------------------------------------------------*/
3259
3260 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3261 {
3262 flag aSign;
3263 int aExp;
3264 int shiftCount;
3265 uint64_t aSig;
3266 int64_t z;
3267 a = float64_squash_input_denormal(a, status);
3268
3269 aSig = extractFloat64Frac( a );
3270 aExp = extractFloat64Exp( a );
3271 aSign = extractFloat64Sign( a );
3272 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3273 shiftCount = aExp - 0x433;
3274 if ( 0 <= shiftCount ) {
3275 if ( 0x43E <= aExp ) {
3276 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3277 float_raise(float_flag_invalid, status);
3278 if ( ! aSign
3279 || ( ( aExp == 0x7FF )
3280 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3281 ) {
3282 return LIT64( 0x7FFFFFFFFFFFFFFF );
3283 }
3284 }
3285 return (int64_t) LIT64( 0x8000000000000000 );
3286 }
3287 z = aSig<<shiftCount;
3288 }
3289 else {
3290 if ( aExp < 0x3FE ) {
3291 if (aExp | aSig) {
3292 status->float_exception_flags |= float_flag_inexact;
3293 }
3294 return 0;
3295 }
3296 z = aSig>>( - shiftCount );
3297 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3298 status->float_exception_flags |= float_flag_inexact;
3299 }
3300 }
3301 if ( aSign ) z = - z;
3302 return z;
3303
3304 }
3305
3306 /*----------------------------------------------------------------------------
3307 | Returns the result of converting the double-precision floating-point value
3308 | `a' to the single-precision floating-point format. The conversion is
3309 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3310 | Arithmetic.
3311 *----------------------------------------------------------------------------*/
3312
3313 float32 float64_to_float32(float64 a, float_status *status)
3314 {
3315 flag aSign;
3316 int aExp;
3317 uint64_t aSig;
3318 uint32_t zSig;
3319 a = float64_squash_input_denormal(a, status);
3320
3321 aSig = extractFloat64Frac( a );
3322 aExp = extractFloat64Exp( a );
3323 aSign = extractFloat64Sign( a );
3324 if ( aExp == 0x7FF ) {
3325 if (aSig) {
3326 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3327 }
3328 return packFloat32( aSign, 0xFF, 0 );
3329 }
3330 shift64RightJamming( aSig, 22, &aSig );
3331 zSig = aSig;
3332 if ( aExp || zSig ) {
3333 zSig |= 0x40000000;
3334 aExp -= 0x381;
3335 }
3336 return roundAndPackFloat32(aSign, aExp, zSig, status);
3337
3338 }
3339
3340
3341 /*----------------------------------------------------------------------------
3342 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3343 | half-precision floating-point value, returning the result. After being
3344 | shifted into the proper positions, the three fields are simply added
3345 | together to form the result. This means that any integer portion of `zSig'
3346 | will be added into the exponent. Since a properly normalized significand
3347 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3348 | than the desired result exponent whenever `zSig' is a complete, normalized
3349 | significand.
3350 *----------------------------------------------------------------------------*/
3351 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3352 {
3353 return make_float16(
3354 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3355 }
3356
3357 /*----------------------------------------------------------------------------
3358 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3359 | and significand `zSig', and returns the proper half-precision floating-
3360 | point value corresponding to the abstract input. Ordinarily, the abstract
3361 | value is simply rounded and packed into the half-precision format, with
3362 | the inexact exception raised if the abstract input cannot be represented
3363 | exactly. However, if the abstract value is too large, the overflow and
3364 | inexact exceptions are raised and an infinity or maximal finite value is
3365 | returned. If the abstract value is too small, the input value is rounded to
3366 | a subnormal number, and the underflow and inexact exceptions are raised if
3367 | the abstract input cannot be represented exactly as a subnormal half-
3368 | precision floating-point number.
3369 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3370 | ARM-style "alternative representation", which omits the NaN and Inf
3371 | encodings in order to raise the maximum representable exponent by one.
3372 | The input significand `zSig' has its binary point between bits 22
3373 | and 23, which is 13 bits to the left of the usual location. This shifted
3374 | significand must be normalized or smaller. If `zSig' is not normalized,
3375