diff options
author | Anas Nashif <anas.nashif@intel.com> | 2012-11-06 07:50:24 -0800 |
---|---|---|
committer | Anas Nashif <anas.nashif@intel.com> | 2012-11-06 07:50:24 -0800 |
commit | 060629c6ef0b7e5c267d84c91600113264d33120 (patch) | |
tree | 18fcb144ac71b9c4d08ee5d1dc58e2b16c109a5a /fpu | |
download | qemu-060629c6ef0b7e5c267d84c91600113264d33120.tar.gz qemu-060629c6ef0b7e5c267d84c91600113264d33120.tar.bz2 qemu-060629c6ef0b7e5c267d84c91600113264d33120.zip |
Imported Upstream version 1.2.0upstream/1.2.0
Diffstat (limited to 'fpu')
-rw-r--r-- | fpu/softfloat-macros.h | 749 | ||||
-rw-r--r-- | fpu/softfloat-specialize.h | 1066 | ||||
-rw-r--r-- | fpu/softfloat.c | 6862 | ||||
-rw-r--r-- | fpu/softfloat.h | 633 |
4 files changed, 9310 insertions, 0 deletions
diff --git a/fpu/softfloat-macros.h b/fpu/softfloat-macros.h new file mode 100644 index 000000000..b5164af7f --- /dev/null +++ b/fpu/softfloat-macros.h @@ -0,0 +1,749 @@ +/* + * QEMU float support macros + * + * Derived from SoftFloat. + */ + +/*============================================================================ + +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| This macro tests for minimum version of the GNU C compiler. +*----------------------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__GNUC_MINOR__) +# define SOFTFLOAT_GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +#else +# define SOFTFLOAT_GNUC_PREREQ(maj, min) 0 +#endif + + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 32, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +| The result is stored in the location pointed to by `zPtr'. +*----------------------------------------------------------------------------*/ + +INLINE void shift32RightJamming(uint32_t a, int_fast16_t count, uint32_t *zPtr) +{ + uint32_t z; + + if ( count == 0 ) { + z = a; + } + else if ( count < 32 ) { + z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 ); + } + else { + z = ( a != 0 ); + } + *zPtr = z; + +} + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 64, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +| The result is stored in the location pointed to by `zPtr'. +*----------------------------------------------------------------------------*/ + +INLINE void shift64RightJamming(uint64_t a, int_fast16_t count, uint64_t *zPtr) +{ + uint64_t z; + + if ( count == 0 ) { + z = a; + } + else if ( count < 64 ) { + z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 ); + } + else { + z = ( a != 0 ); + } + *zPtr = z; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64 +| _plus_ the number of bits given in `count'. The shifted result is at most +| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The +| bits shifted off form a second 64-bit result as follows: The _last_ bit +| shifted off is the most-significant bit of the extra result, and the other +| 63 bits of the extra result are all zero if and only if _all_but_the_last_ +| bits shifted off were all zero. This extra result is stored in the location +| pointed to by `z1Ptr'. The value of `count' can be arbitrarily large. +| (This routine makes more sense if `a0' and `a1' are considered to form +| a fixed-point value with binary point between `a0' and `a1'. This fixed- +| point value is shifted right by the number of bits given in `count', and +| the integer part of the result is returned at the location pointed to by +| `z0Ptr'. The fractional part of the result may be slightly corrupted as +| described above, and is returned at the location pointed to by `z1Ptr'.) +*----------------------------------------------------------------------------*/ + +INLINE void + shift64ExtraRightJamming( + uint64_t a0, uint64_t a1, int_fast16_t count, uint64_t *z0Ptr, uint64_t *z1Ptr) +{ + uint64_t z0, z1; + int8 negCount = ( - count ) & 63; + + if ( count == 0 ) { + z1 = a1; + z0 = a0; + } + else if ( count < 64 ) { + z1 = ( a0<<negCount ) | ( a1 != 0 ); + z0 = a0>>count; + } + else { + if ( count == 64 ) { + z1 = a0 | ( a1 != 0 ); + } + else { + z1 = ( ( a0 | a1 ) != 0 ); + } + z0 = 0; + } + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the +| number of bits given in `count'. Any bits shifted off are lost. The value +| of `count' can be arbitrarily large; in particular, if `count' is greater +| than 128, the result will be 0. The result is broken into two 64-bit pieces +| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shift128Right( + uint64_t a0, uint64_t a1, int_fast16_t count, uint64_t *z0Ptr, uint64_t *z1Ptr) +{ + uint64_t z0, z1; + int8 negCount = ( - count ) & 63; + + if ( count == 0 ) { + z1 = a1; + z0 = a0; + } + else if ( count < 64 ) { + z1 = ( a0<<negCount ) | ( a1>>count ); + z0 = a0>>count; + } + else { + z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0; + z0 = 0; + } + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the +| number of bits given in `count'. If any nonzero bits are shifted off, they +| are ``jammed'' into the least significant bit of the result by setting the +| least significant bit to 1. The value of `count' can be arbitrarily large; +| in particular, if `count' is greater than 128, the result will be either +| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or +| nonzero. The result is broken into two 64-bit pieces which are stored at +| the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shift128RightJamming( + uint64_t a0, uint64_t a1, int_fast16_t count, uint64_t *z0Ptr, uint64_t *z1Ptr) +{ + uint64_t z0, z1; + int8 negCount = ( - count ) & 63; + + if ( count == 0 ) { + z1 = a1; + z0 = a0; + } + else if ( count < 64 ) { + z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 ); + z0 = a0>>count; + } + else { + if ( count == 64 ) { + z1 = a0 | ( a1 != 0 ); + } + else if ( count < 128 ) { + z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 ); + } + else { + z1 = ( ( a0 | a1 ) != 0 ); + } + z0 = 0; + } + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right +| by 64 _plus_ the number of bits given in `count'. The shifted result is +| at most 128 nonzero bits; these are broken into two 64-bit pieces which are +| stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted +| off form a third 64-bit result as follows: The _last_ bit shifted off is +| the most-significant bit of the extra result, and the other 63 bits of the +| extra result are all zero if and only if _all_but_the_last_ bits shifted off +| were all zero. This extra result is stored in the location pointed to by +| `z2Ptr'. The value of `count' can be arbitrarily large. +| (This routine makes more sense if `a0', `a1', and `a2' are considered +| to form a fixed-point value with binary point between `a1' and `a2'. This +| fixed-point value is shifted right by the number of bits given in `count', +| and the integer part of the result is returned at the locations pointed to +| by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly +| corrupted as described above, and is returned at the location pointed to by +| `z2Ptr'.) +*----------------------------------------------------------------------------*/ + +INLINE void + shift128ExtraRightJamming( + uint64_t a0, + uint64_t a1, + uint64_t a2, + int_fast16_t count, + uint64_t *z0Ptr, + uint64_t *z1Ptr, + uint64_t *z2Ptr + ) +{ + uint64_t z0, z1, z2; + int8 negCount = ( - count ) & 63; + + if ( count == 0 ) { + z2 = a2; + z1 = a1; + z0 = a0; + } + else { + if ( count < 64 ) { + z2 = a1<<negCount; + z1 = ( a0<<negCount ) | ( a1>>count ); + z0 = a0>>count; + } + else { + if ( count == 64 ) { + z2 = a1; + z1 = a0; + } + else { + a2 |= a1; + if ( count < 128 ) { + z2 = a0<<negCount; + z1 = a0>>( count & 63 ); + } + else { + z2 = ( count == 128 ) ? a0 : ( a0 != 0 ); + z1 = 0; + } + } + z0 = 0; + } + z2 |= ( a2 != 0 ); + } + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the +| number of bits given in `count'. Any bits shifted off are lost. The value +| of `count' must be less than 64. The result is broken into two 64-bit +| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shortShift128Left( + uint64_t a0, uint64_t a1, int_fast16_t count, uint64_t *z0Ptr, uint64_t *z1Ptr) +{ + + *z1Ptr = a1<<count; + *z0Ptr = + ( count == 0 ) ? a0 : ( a0<<count ) | ( a1>>( ( - count ) & 63 ) ); + +} + +/*---------------------------------------------------------------------------- +| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left +| by the number of bits given in `count'. Any bits shifted off are lost. +| The value of `count' must be less than 64. The result is broken into three +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr', +| `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shortShift192Left( + uint64_t a0, + uint64_t a1, + uint64_t a2, + int_fast16_t count, + uint64_t *z0Ptr, + uint64_t *z1Ptr, + uint64_t *z2Ptr + ) +{ + uint64_t z0, z1, z2; + int8 negCount; + + z2 = a2<<count; + z1 = a1<<count; + z0 = a0<<count; + if ( 0 < count ) { + negCount = ( ( - count ) & 63 ); + z1 |= a2>>negCount; + z0 |= a1>>negCount; + } + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit +| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so +| any carry out is lost. The result is broken into two 64-bit pieces which +| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + add128( + uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr ) +{ + uint64_t z1; + + z1 = a1 + b1; + *z1Ptr = z1; + *z0Ptr = a0 + b0 + ( z1 < a1 ); + +} + +/*---------------------------------------------------------------------------- +| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the +| 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is +| modulo 2^192, so any carry out is lost. The result is broken into three +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr', +| `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + add192( + uint64_t a0, + uint64_t a1, + uint64_t a2, + uint64_t b0, + uint64_t b1, + uint64_t b2, + uint64_t *z0Ptr, + uint64_t *z1Ptr, + uint64_t *z2Ptr + ) +{ + uint64_t z0, z1, z2; + int8 carry0, carry1; + + z2 = a2 + b2; + carry1 = ( z2 < a2 ); + z1 = a1 + b1; + carry0 = ( z1 < a1 ); + z0 = a0 + b0; + z1 += carry1; + z0 += ( z1 < carry1 ); + z0 += carry0; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the +| 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo +| 2^128, so any borrow out (carry out) is lost. The result is broken into two +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and +| `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + sub128( + uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr ) +{ + + *z1Ptr = a1 - b1; + *z0Ptr = a0 - b0 - ( a1 < b1 ); + +} + +/*---------------------------------------------------------------------------- +| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2' +| from the 192-bit value formed by concatenating `a0', `a1', and `a2'. +| Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The +| result is broken into three 64-bit pieces which are stored at the locations +| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + sub192( + uint64_t a0, + uint64_t a1, + uint64_t a2, + uint64_t b0, + uint64_t b1, + uint64_t b2, + uint64_t *z0Ptr, + uint64_t *z1Ptr, + uint64_t *z2Ptr + ) +{ + uint64_t z0, z1, z2; + int8 borrow0, borrow1; + + z2 = a2 - b2; + borrow1 = ( a2 < b2 ); + z1 = a1 - b1; + borrow0 = ( a1 < b1 ); + z0 = a0 - b0; + z0 -= ( z1 < borrow1 ); + z1 -= borrow1; + z0 -= borrow0; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies `a' by `b' to obtain a 128-bit product. The product is broken +| into two 64-bit pieces which are stored at the locations pointed to by +| `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr ) +{ + uint32_t aHigh, aLow, bHigh, bLow; + uint64_t z0, zMiddleA, zMiddleB, z1; + + aLow = a; + aHigh = a>>32; + bLow = b; + bHigh = b>>32; + z1 = ( (uint64_t) aLow ) * bLow; + zMiddleA = ( (uint64_t) aLow ) * bHigh; + zMiddleB = ( (uint64_t) aHigh ) * bLow; + z0 = ( (uint64_t) aHigh ) * bHigh; + zMiddleA += zMiddleB; + z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 ); + zMiddleA <<= 32; + z1 += zMiddleA; + z0 += ( z1 < zMiddleA ); + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by +| `b' to obtain a 192-bit product. The product is broken into three 64-bit +| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and +| `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + mul128By64To192( + uint64_t a0, + uint64_t a1, + uint64_t b, + uint64_t *z0Ptr, + uint64_t *z1Ptr, + uint64_t *z2Ptr + ) +{ + uint64_t z0, z1, z2, more1; + + mul64To128( a1, b, &z1, &z2 ); + mul64To128( a0, b, &z0, &more1 ); + add128( z0, more1, 0, z1, &z0, &z1 ); + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the +| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit +| product. The product is broken into four 64-bit pieces which are stored at +| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + mul128To256( + uint64_t a0, + uint64_t a1, + uint64_t b0, + uint64_t b1, + uint64_t *z0Ptr, + uint64_t *z1Ptr, + uint64_t *z2Ptr, + uint64_t *z3Ptr + ) +{ + uint64_t z0, z1, z2, z3; + uint64_t more1, more2; + + mul64To128( a1, b1, &z2, &z3 ); + mul64To128( a1, b0, &z1, &more2 ); + add128( z1, more2, 0, z2, &z1, &z2 ); + mul64To128( a0, b0, &z0, &more1 ); + add128( z0, more1, 0, z1, &z0, &z1 ); + mul64To128( a0, b1, &more1, &more2 ); + add128( more1, more2, 0, z2, &more1, &z2 ); + add128( z0, z1, 0, more1, &z0, &z1 ); + *z3Ptr = z3; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Returns an approximation to the 64-bit integer quotient obtained by dividing +| `b' into the 128-bit value formed by concatenating `a0' and `a1'. The +| divisor `b' must be at least 2^63. If q is the exact quotient truncated +| toward zero, the approximation returned lies between q and q + 2 inclusive. +| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit +| unsigned integer is returned. +*----------------------------------------------------------------------------*/ + +static uint64_t estimateDiv128To64( uint64_t a0, uint64_t a1, uint64_t b ) +{ + uint64_t b0, b1; + uint64_t rem0, rem1, term0, term1; + uint64_t z; + + if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF ); + b0 = b>>32; + z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32; + mul64To128( b, z, &term0, &term1 ); + sub128( a0, a1, term0, term1, &rem0, &rem1 ); + while ( ( (int64_t) rem0 ) < 0 ) { + z -= LIT64( 0x100000000 ); + b1 = b<<32; + add128( rem0, rem1, b0, b1, &rem0, &rem1 ); + } + rem0 = ( rem0<<32 ) | ( rem1>>32 ); + z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns an approximation to the square root of the 32-bit significand given +| by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of +| `aExp' (the least significant bit) is 1, the integer returned approximates +| 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp' +| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either +| case, the approximation returned lies strictly within +/-2 of the exact +| value. +*----------------------------------------------------------------------------*/ + +static uint32_t estimateSqrt32(int_fast16_t aExp, uint32_t a) +{ + static const uint16_t sqrtOddAdjustments[] = { + 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0, + 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67 + }; + static const uint16_t sqrtEvenAdjustments[] = { + 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E, + 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002 + }; + int8 index; + uint32_t z; + + index = ( a>>27 ) & 15; + if ( aExp & 1 ) { + z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ]; + z = ( ( a / z )<<14 ) + ( z<<15 ); + a >>= 1; + } + else { + z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ]; + z = a / z + z; + z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); + if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 ); + } + return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 32 is returned. +*----------------------------------------------------------------------------*/ + +static int8 countLeadingZeros32( uint32_t a ) +{ +#if SOFTFLOAT_GNUC_PREREQ(3, 4) + if (a) { + return __builtin_clz(a); + } else { + return 32; + } +#else + static const int8 countLeadingZerosHigh[] = { + 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + int8 shiftCount; + + shiftCount = 0; + if ( a < 0x10000 ) { + shiftCount += 16; + a <<= 16; + } + if ( a < 0x1000000 ) { + shiftCount += 8; + a <<= 8; + } + shiftCount += countLeadingZerosHigh[ a>>24 ]; + return shiftCount; +#endif +} + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 64 is returned. +*----------------------------------------------------------------------------*/ + +static int8 countLeadingZeros64( uint64_t a ) +{ +#if SOFTFLOAT_GNUC_PREREQ(3, 4) + if (a) { + return __builtin_clzll(a); + } else { + return 64; + } +#else + int8 shiftCount; + + shiftCount = 0; + if ( a < ( (uint64_t) 1 )<<32 ) { + shiftCount += 32; + } + else { + a >>= 32; + } + shiftCount += countLeadingZeros32( a ); + return shiftCount; +#endif +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' +| is equal to the 128-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag eq128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) +{ + + return ( a0 == b0 ) && ( a1 == b1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less +| than or equal to the 128-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag le128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) +{ + + return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less +| than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise, +| returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag lt128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) +{ + + return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is +| not equal to the 128-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag ne128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 ) +{ + + return ( a0 != b0 ) || ( a1 != b1 ); + +} diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h new file mode 100644 index 000000000..490245004 --- /dev/null +++ b/fpu/softfloat-specialize.h @@ -0,0 +1,1066 @@ +/* + * QEMU float support + * + * Derived from SoftFloat. + */ + +/*============================================================================ + +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +#if defined(TARGET_MIPS) || defined(TARGET_SH4) || defined(TARGET_UNICORE32) +#define SNAN_BIT_IS_ONE 1 +#else +#define SNAN_BIT_IS_ONE 0 +#endif + +/*---------------------------------------------------------------------------- +| The pattern for a default generated half-precision NaN. +*----------------------------------------------------------------------------*/ +#if defined(TARGET_ARM) +const float16 float16_default_nan = const_float16(0x7E00); +#elif SNAN_BIT_IS_ONE +const float16 float16_default_nan = const_float16(0x7DFF); +#else +const float16 float16_default_nan = const_float16(0xFE00); +#endif + +/*---------------------------------------------------------------------------- +| The pattern for a default generated single-precision NaN. +*----------------------------------------------------------------------------*/ +#if defined(TARGET_SPARC) +const float32 float32_default_nan = const_float32(0x7FFFFFFF); +#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) +const float32 float32_default_nan = const_float32(0x7FC00000); +#elif SNAN_BIT_IS_ONE +const float32 float32_default_nan = const_float32(0x7FBFFFFF); +#else +const float32 float32_default_nan = const_float32(0xFFC00000); +#endif + +/*---------------------------------------------------------------------------- +| The pattern for a default generated double-precision NaN. +*----------------------------------------------------------------------------*/ +#if defined(TARGET_SPARC) +const float64 float64_default_nan = const_float64(LIT64( 0x7FFFFFFFFFFFFFFF )); +#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) +const float64 float64_default_nan = const_float64(LIT64( 0x7FF8000000000000 )); +#elif SNAN_BIT_IS_ONE +const float64 float64_default_nan = const_float64(LIT64( 0x7FF7FFFFFFFFFFFF )); +#else +const float64 float64_default_nan = const_float64(LIT64( 0xFFF8000000000000 )); +#endif + +/*---------------------------------------------------------------------------- +| The pattern for a default generated extended double-precision NaN. +*----------------------------------------------------------------------------*/ +#if SNAN_BIT_IS_ONE +#define floatx80_default_nan_high 0x7FFF +#define floatx80_default_nan_low LIT64( 0xBFFFFFFFFFFFFFFF ) +#else +#define floatx80_default_nan_high 0xFFFF +#define floatx80_default_nan_low LIT64( 0xC000000000000000 ) +#endif + +const floatx80 floatx80_default_nan + = make_floatx80_init(floatx80_default_nan_high, floatx80_default_nan_low); + +/*---------------------------------------------------------------------------- +| The pattern for a default generated quadruple-precision NaN. The `high' and +| `low' values hold the most- and least-significant bits, respectively. +*----------------------------------------------------------------------------*/ +#if SNAN_BIT_IS_ONE +#define float128_default_nan_high LIT64( 0x7FFF7FFFFFFFFFFF ) +#define float128_default_nan_low LIT64( 0xFFFFFFFFFFFFFFFF ) +#else +#define float128_default_nan_high LIT64( 0xFFFF800000000000 ) +#define float128_default_nan_low LIT64( 0x0000000000000000 ) +#endif + +const float128 float128_default_nan + = make_float128_init(float128_default_nan_high, float128_default_nan_low); + +/*---------------------------------------------------------------------------- +| Raises the exceptions specified by `flags'. Floating-point traps can be +| defined here if desired. It is currently not possible for such a trap +| to substitute a result value. If traps are not implemented, this routine +| should be simply `float_exception_flags |= flags;'. +*----------------------------------------------------------------------------*/ + +void float_raise( int8 flags STATUS_PARAM ) +{ + STATUS(float_exception_flags) |= flags; +} + +/*---------------------------------------------------------------------------- +| Internal canonical NaN format. +*----------------------------------------------------------------------------*/ +typedef struct { + flag sign; + uint64_t high, low; +} commonNaNT; + +/*---------------------------------------------------------------------------- +| Returns 1 if the half-precision floating-point value `a' is a quiet +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float16_is_quiet_nan(float16 a_) +{ + uint16_t a = float16_val(a_); +#if SNAN_BIT_IS_ONE + return (((a >> 9) & 0x3F) == 0x3E) && (a & 0x1FF); +#else + return ((a & ~0x8000) >= 0x7c80); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the half-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float16_is_signaling_nan(float16 a_) +{ + uint16_t a = float16_val(a_); +#if SNAN_BIT_IS_ONE + return ((a & ~0x8000) >= 0x7c80); +#else + return (((a >> 9) & 0x3F) == 0x3E) && (a & 0x1FF); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns a quiet NaN if the half-precision floating point value `a' is a +| signaling NaN; otherwise returns `a'. +*----------------------------------------------------------------------------*/ +float16 float16_maybe_silence_nan(float16 a_) +{ + if (float16_is_signaling_nan(a_)) { +#if SNAN_BIT_IS_ONE +# if defined(TARGET_MIPS) || defined(TARGET_SH4) || defined(TARGET_UNICORE32) + return float16_default_nan; +# else +# error Rules for silencing a signaling NaN are target-specific +# endif +#else + uint16_t a = float16_val(a_); + a |= (1 << 9); + return make_float16(a); +#endif + } + return a_; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the half-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float16ToCommonNaN( float16 a STATUS_PARAM ) +{ + commonNaNT z; + + if ( float16_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR ); + z.sign = float16_val(a) >> 15; + z.low = 0; + z.high = ((uint64_t) float16_val(a))<<54; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the half- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float16 commonNaNToFloat16(commonNaNT a STATUS_PARAM) +{ + uint16_t mantissa = a.high>>54; + + if (STATUS(default_nan_mode)) { + return float16_default_nan; + } + + if (mantissa) { + return make_float16(((((uint16_t) a.sign) << 15) + | (0x1F << 10) | mantissa)); + } else { + return float16_default_nan; + } +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is a quiet +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float32_is_quiet_nan( float32 a_ ) +{ + uint32_t a = float32_val(a_); +#if SNAN_BIT_IS_ONE + return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF ); +#else + return ( 0xFF800000 <= (uint32_t) ( a<<1 ) ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float32_is_signaling_nan( float32 a_ ) +{ + uint32_t a = float32_val(a_); +#if SNAN_BIT_IS_ONE + return ( 0xFF800000 <= (uint32_t) ( a<<1 ) ); +#else + return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns a quiet NaN if the single-precision floating point value `a' is a +| signaling NaN; otherwise returns `a'. +*----------------------------------------------------------------------------*/ + +float32 float32_maybe_silence_nan( float32 a_ ) +{ + if (float32_is_signaling_nan(a_)) { +#if SNAN_BIT_IS_ONE +# if defined(TARGET_MIPS) || defined(TARGET_SH4) || defined(TARGET_UNICORE32) + return float32_default_nan; +# else +# error Rules for silencing a signaling NaN are target-specific +# endif +#else + uint32_t a = float32_val(a_); + a |= (1 << 22); + return make_float32(a); +#endif + } + return a_; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float32ToCommonNaN( float32 a STATUS_PARAM ) +{ + commonNaNT z; + + if ( float32_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR ); + z.sign = float32_val(a)>>31; + z.low = 0; + z.high = ( (uint64_t) float32_val(a) )<<41; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the single- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float32 commonNaNToFloat32( commonNaNT a STATUS_PARAM) +{ + uint32_t mantissa = a.high>>41; + + if ( STATUS(default_nan_mode) ) { + return float32_default_nan; + } + + if ( mantissa ) + return make_float32( + ( ( (uint32_t) a.sign )<<31 ) | 0x7F800000 | ( a.high>>41 ) ); + else + return float32_default_nan; +} + +/*---------------------------------------------------------------------------- +| Select which NaN to propagate for a two-input operation. +| IEEE754 doesn't specify all the details of this, so the +| algorithm is target-specific. +| The routine is passed various bits of information about the +| two NaNs and should return 0 to select NaN a and 1 for NaN b. +| Note that signalling NaNs are always squashed to quiet NaNs +| by the caller, by calling floatXX_maybe_silence_nan() before +| returning them. +| +| aIsLargerSignificand is only valid if both a and b are NaNs +| of some kind, and is true if a has the larger significand, +| or if both a and b have the same significand but a is +| positive but b is negative. It is only needed for the x87 +| tie-break rule. +*----------------------------------------------------------------------------*/ + +#if defined(TARGET_ARM) +static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, + flag aIsLargerSignificand) +{ + /* ARM mandated NaN propagation rules: take the first of: + * 1. A if it is signaling + * 2. B if it is signaling + * 3. A (quiet) + * 4. B (quiet) + * A signaling NaN is always quietened before returning it. + */ + if (aIsSNaN) { + return 0; + } else if (bIsSNaN) { + return 1; + } else if (aIsQNaN) { + return 0; + } else { + return 1; + } +} +#elif defined(TARGET_MIPS) +static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, + flag aIsLargerSignificand) +{ + /* According to MIPS specifications, if one of the two operands is + * a sNaN, a new qNaN has to be generated. This is done in + * floatXX_maybe_silence_nan(). For qNaN inputs the specifications + * says: "When possible, this QNaN result is one of the operand QNaN + * values." In practice it seems that most implementations choose + * the first operand if both operands are qNaN. In short this gives + * the following rules: + * 1. A if it is signaling + * 2. B if it is signaling + * 3. A (quiet) + * 4. B (quiet) + * A signaling NaN is always silenced before returning it. + */ + if (aIsSNaN) { + return 0; + } else if (bIsSNaN) { + return 1; + } else if (aIsQNaN) { + return 0; + } else { + return 1; + } +} +#elif defined(TARGET_PPC) +static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, + flag aIsLargerSignificand) +{ + /* PowerPC propagation rules: + * 1. A if it sNaN or qNaN + * 2. B if it sNaN or qNaN + * A signaling NaN is always silenced before returning it. + */ + if (aIsSNaN || aIsQNaN) { + return 0; + } else { + return 1; + } +} +#else +static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, + flag aIsLargerSignificand) +{ + /* This implements x87 NaN propagation rules: + * SNaN + QNaN => return the QNaN + * two SNaNs => return the one with the larger significand, silenced + * two QNaNs => return the one with the larger significand + * SNaN and a non-NaN => return the SNaN, silenced + * QNaN and a non-NaN => return the QNaN + * + * If we get down to comparing significands and they are the same, + * return the NaN with the positive sign bit (if any). + */ + if (aIsSNaN) { + if (bIsSNaN) { + return aIsLargerSignificand ? 0 : 1; + } + return bIsQNaN ? 1 : 0; + } + else if (aIsQNaN) { + if (bIsSNaN || !bIsQNaN) + return 0; + else { + return aIsLargerSignificand ? 0 : 1; + } + } else { + return 1; + } +} +#endif + +/*---------------------------------------------------------------------------- +| Select which NaN to propagate for a three-input operation. +| For the moment we assume that no CPU needs the 'larger significand' +| information. +| Return values : 0 : a; 1 : b; 2 : c; 3 : default-NaN +*----------------------------------------------------------------------------*/ +#if defined(TARGET_ARM) +static int pickNaNMulAdd(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, + flag cIsQNaN, flag cIsSNaN, flag infzero STATUS_PARAM) +{ + /* For ARM, the (inf,zero,qnan) case sets InvalidOp and returns + * the default NaN + */ + if (infzero && cIsQNaN) { + float_raise(float_flag_invalid STATUS_VAR); + return 3; + } + + /* This looks different from the ARM ARM pseudocode, because the ARM ARM + * puts the operands to a fused mac operation (a*b)+c in the order c,a,b. + */ + if (cIsSNaN) { + return 2; + } else if (aIsSNaN) { + return 0; + } else if (bIsSNaN) { + return 1; + } else if (cIsQNaN) { + return 2; + } else if (aIsQNaN) { + return 0; + } else { + return 1; + } +} +#elif defined(TARGET_PPC) +static int pickNaNMulAdd(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, + flag cIsQNaN, flag cIsSNaN, flag infzero STATUS_PARAM) +{ + /* For PPC, the (inf,zero,qnan) case sets InvalidOp, but we prefer + * to return an input NaN if we have one (ie c) rather than generating + * a default NaN + */ + if (infzero) { + float_raise(float_flag_invalid STATUS_VAR); + return 2; + } + + /* If fRA is a NaN return it; otherwise if fRB is a NaN return it; + * otherwise return fRC. Note that muladd on PPC is (fRA * fRC) + frB + */ + if (aIsSNaN || aIsQNaN) { + return 0; + } else if (cIsSNaN || cIsQNaN) { + return 2; + } else { + return 1; + } +} +#else +/* A default implementation: prefer a to b to c. + * This is unlikely to actually match any real implementation. + */ +static int pickNaNMulAdd(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, + flag cIsQNaN, flag cIsSNaN, flag infzero STATUS_PARAM) +{ + if (aIsSNaN || aIsQNaN) { + return 0; + } else if (bIsSNaN || bIsQNaN) { + return 1; + } else { + return 2; + } +} +#endif + +/*---------------------------------------------------------------------------- +| Takes two single-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static float32 propagateFloat32NaN( float32 a, float32 b STATUS_PARAM) +{ + flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN; + flag aIsLargerSignificand; + uint32_t av, bv; + + aIsQuietNaN = float32_is_quiet_nan( a ); + aIsSignalingNaN = float32_is_signaling_nan( a ); + bIsQuietNaN = float32_is_quiet_nan( b ); + bIsSignalingNaN = float32_is_signaling_nan( b ); + av = float32_val(a); + bv = float32_val(b); + + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR); + + if ( STATUS(default_nan_mode) ) + return float32_default_nan; + + if ((uint32_t)(av<<1) < (uint32_t)(bv<<1)) { + aIsLargerSignificand = 0; + } else if ((uint32_t)(bv<<1) < (uint32_t)(av<<1)) { + aIsLargerSignificand = 1; + } else { + aIsLargerSignificand = (av < bv) ? 1 : 0; + } + + if (pickNaN(aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN, + aIsLargerSignificand)) { + return float32_maybe_silence_nan(b); + } else { + return float32_maybe_silence_nan(a); + } +} + +/*---------------------------------------------------------------------------- +| Takes three single-precision floating-point values `a', `b' and `c', one of +| which is a NaN, and returns the appropriate NaN result. If any of `a', +| `b' or `c' is a signaling NaN, the invalid exception is raised. +| The input infzero indicates whether a*b was 0*inf or inf*0 (in which case +| obviously c is a NaN, and whether to propagate c or some other NaN is +| implementation defined). +*----------------------------------------------------------------------------*/ + +static float32 propagateFloat32MulAddNaN(float32 a, float32 b, + float32 c, flag infzero STATUS_PARAM) +{ + flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN, + cIsQuietNaN, cIsSignalingNaN; + int which; + + aIsQuietNaN = float32_is_quiet_nan(a); + aIsSignalingNaN = float32_is_signaling_nan(a); + bIsQuietNaN = float32_is_quiet_nan(b); + bIsSignalingNaN = float32_is_signaling_nan(b); + cIsQuietNaN = float32_is_quiet_nan(c); + cIsSignalingNaN = float32_is_signaling_nan(c); + + if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) { + float_raise(float_flag_invalid STATUS_VAR); + } + + which = pickNaNMulAdd(aIsQuietNaN, aIsSignalingNaN, + bIsQuietNaN, bIsSignalingNaN, + cIsQuietNaN, cIsSignalingNaN, infzero STATUS_VAR); + + if (STATUS(default_nan_mode)) { + /* Note that this check is after pickNaNMulAdd so that function + * has an opportunity to set the Invalid flag. + */ + return float32_default_nan; + } + + switch (which) { + case 0: + return float32_maybe_silence_nan(a); + case 1: + return float32_maybe_silence_nan(b); + case 2: + return float32_maybe_silence_nan(c); + case 3: + default: + return float32_default_nan; + } +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a quiet +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float64_is_quiet_nan( float64 a_ ) +{ + uint64_t a = float64_val(a_); +#if SNAN_BIT_IS_ONE + return + ( ( ( a>>51 ) & 0xFFF ) == 0xFFE ) + && ( a & LIT64( 0x0007FFFFFFFFFFFF ) ); +#else + return ( LIT64( 0xFFF0000000000000 ) <= (uint64_t) ( a<<1 ) ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float64_is_signaling_nan( float64 a_ ) +{ + uint64_t a = float64_val(a_); +#if SNAN_BIT_IS_ONE + return ( LIT64( 0xFFF0000000000000 ) <= (uint64_t) ( a<<1 ) ); +#else + return + ( ( ( a>>51 ) & 0xFFF ) == 0xFFE ) + && ( a & LIT64( 0x0007FFFFFFFFFFFF ) ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns a quiet NaN if the double-precision floating point value `a' is a +| signaling NaN; otherwise returns `a'. +*----------------------------------------------------------------------------*/ + +float64 float64_maybe_silence_nan( float64 a_ ) +{ + if (float64_is_signaling_nan(a_)) { +#if SNAN_BIT_IS_ONE +# if defined(TARGET_MIPS) || defined(TARGET_SH4) || defined(TARGET_UNICORE32) + return float64_default_nan; +# else +# error Rules for silencing a signaling NaN are target-specific +# endif +#else + uint64_t a = float64_val(a_); + a |= LIT64( 0x0008000000000000 ); + return make_float64(a); +#endif + } + return a_; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float64ToCommonNaN( float64 a STATUS_PARAM) +{ + commonNaNT z; + + if ( float64_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR); + z.sign = float64_val(a)>>63; + z.low = 0; + z.high = float64_val(a)<<12; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the double- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float64 commonNaNToFloat64( commonNaNT a STATUS_PARAM) +{ + uint64_t mantissa = a.high>>12; + + if ( STATUS(default_nan_mode) ) { + return float64_default_nan; + } + + if ( mantissa ) + return make_float64( + ( ( (uint64_t) a.sign )<<63 ) + | LIT64( 0x7FF0000000000000 ) + | ( a.high>>12 )); + else + return float64_default_nan; +} + +/*---------------------------------------------------------------------------- +| Takes two double-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static float64 propagateFloat64NaN( float64 a, float64 b STATUS_PARAM) +{ + flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN; + flag aIsLargerSignificand; + uint64_t av, bv; + + aIsQuietNaN = float64_is_quiet_nan( a ); + aIsSignalingNaN = float64_is_signaling_nan( a ); + bIsQuietNaN = float64_is_quiet_nan( b ); + bIsSignalingNaN = float64_is_signaling_nan( b ); + av = float64_val(a); + bv = float64_val(b); + + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR); + + if ( STATUS(default_nan_mode) ) + return float64_default_nan; + + if ((uint64_t)(av<<1) < (uint64_t)(bv<<1)) { + aIsLargerSignificand = 0; + } else if ((uint64_t)(bv<<1) < (uint64_t)(av<<1)) { + aIsLargerSignificand = 1; + } else { + aIsLargerSignificand = (av < bv) ? 1 : 0; + } + + if (pickNaN(aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN, + aIsLargerSignificand)) { + return float64_maybe_silence_nan(b); + } else { + return float64_maybe_silence_nan(a); + } +} + +/*---------------------------------------------------------------------------- +| Takes three double-precision floating-point values `a', `b' and `c', one of +| which is a NaN, and returns the appropriate NaN result. If any of `a', +| `b' or `c' is a signaling NaN, the invalid exception is raised. +| The input infzero indicates whether a*b was 0*inf or inf*0 (in which case +| obviously c is a NaN, and whether to propagate c or some other NaN is +| implementation defined). +*----------------------------------------------------------------------------*/ + +static float64 propagateFloat64MulAddNaN(float64 a, float64 b, + float64 c, flag infzero STATUS_PARAM) +{ + flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN, + cIsQuietNaN, cIsSignalingNaN; + int which; + + aIsQuietNaN = float64_is_quiet_nan(a); + aIsSignalingNaN = float64_is_signaling_nan(a); + bIsQuietNaN = float64_is_quiet_nan(b); + bIsSignalingNaN = float64_is_signaling_nan(b); + cIsQuietNaN = float64_is_quiet_nan(c); + cIsSignalingNaN = float64_is_signaling_nan(c); + + if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) { + float_raise(float_flag_invalid STATUS_VAR); + } + + which = pickNaNMulAdd(aIsQuietNaN, aIsSignalingNaN, + bIsQuietNaN, bIsSignalingNaN, + cIsQuietNaN, cIsSignalingNaN, infzero STATUS_VAR); + + if (STATUS(default_nan_mode)) { + /* Note that this check is after pickNaNMulAdd so that function + * has an opportunity to set the Invalid flag. + */ + return float64_default_nan; + } + + switch (which) { + case 0: + return float64_maybe_silence_nan(a); + case 1: + return float64_maybe_silence_nan(b); + case 2: + return float64_maybe_silence_nan(c); + case 3: + default: + return float64_default_nan; + } +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is a +| quiet NaN; otherwise returns 0. This slightly differs from the same +| function for other types as floatx80 has an explicit bit. +*----------------------------------------------------------------------------*/ + +int floatx80_is_quiet_nan( floatx80 a ) +{ +#if SNAN_BIT_IS_ONE + uint64_t aLow; + + aLow = a.low & ~ LIT64( 0x4000000000000000 ); + return + ( ( a.high & 0x7FFF ) == 0x7FFF ) + && (uint64_t) ( aLow<<1 ) + && ( a.low == aLow ); +#else + return ( ( a.high & 0x7FFF ) == 0x7FFF ) + && (LIT64( 0x8000000000000000 ) <= ((uint64_t) ( a.low<<1 ))); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is a +| signaling NaN; otherwise returns 0. This slightly differs from the same +| function for other types as floatx80 has an explicit bit. +*----------------------------------------------------------------------------*/ + +int floatx80_is_signaling_nan( floatx80 a ) +{ +#if SNAN_BIT_IS_ONE + return ( ( a.high & 0x7FFF ) == 0x7FFF ) + && (LIT64( 0x8000000000000000 ) <= ((uint64_t) ( a.low<<1 ))); +#else + uint64_t aLow; + + aLow = a.low & ~ LIT64( 0x4000000000000000 ); + return + ( ( a.high & 0x7FFF ) == 0x7FFF ) + && (uint64_t) ( aLow<<1 ) + && ( a.low == aLow ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns a quiet NaN if the extended double-precision floating point value +| `a' is a signaling NaN; otherwise returns `a'. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_maybe_silence_nan( floatx80 a ) +{ + if (floatx80_is_signaling_nan(a)) { +#if SNAN_BIT_IS_ONE +# if defined(TARGET_MIPS) || defined(TARGET_SH4) || defined(TARGET_UNICORE32) + a.low = floatx80_default_nan_low; + a.high = floatx80_default_nan_high; +# else +# error Rules for silencing a signaling NaN are target-specific +# endif +#else + a.low |= LIT64( 0xC000000000000000 ); + return a; +#endif + } + return a; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the +| invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT floatx80ToCommonNaN( floatx80 a STATUS_PARAM) +{ + commonNaNT z; + + if ( floatx80_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR); + if ( a.low >> 63 ) { + z.sign = a.high >> 15; + z.low = 0; + z.high = a.low << 1; + } else { + z.sign = floatx80_default_nan_high >> 15; + z.low = 0; + z.high = floatx80_default_nan_low << 1; + } + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the extended +| double-precision floating-point format. +*----------------------------------------------------------------------------*/ + +static floatx80 commonNaNToFloatx80( commonNaNT a STATUS_PARAM) +{ + floatx80 z; + + if ( STATUS(default_nan_mode) ) { + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + + if (a.high >> 1) { + z.low = LIT64( 0x8000000000000000 ) | a.high >> 1; + z.high = ( ( (uint16_t) a.sign )<<15 ) | 0x7FFF; + } else { + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + } + + return z; +} + +/*---------------------------------------------------------------------------- +| Takes two extended double-precision floating-point values `a' and `b', one +| of which is a NaN, and returns the appropriate NaN result. If either `a' or +| `b' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static floatx80 propagateFloatx80NaN( floatx80 a, floatx80 b STATUS_PARAM) +{ + flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN; + flag aIsLargerSignificand; + + aIsQuietNaN = floatx80_is_quiet_nan( a ); + aIsSignalingNaN = floatx80_is_signaling_nan( a ); + bIsQuietNaN = floatx80_is_quiet_nan( b ); + bIsSignalingNaN = floatx80_is_signaling_nan( b ); + + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR); + + if ( STATUS(default_nan_mode) ) { + a.low = floatx80_default_nan_low; + a.high = floatx80_default_nan_high; + return a; + } + + if (a.low < b.low) { + aIsLargerSignificand = 0; + } else if (b.low < a.low) { + aIsLargerSignificand = 1; + } else { + aIsLargerSignificand = (a.high < b.high) ? 1 : 0; + } + + if (pickNaN(aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN, + aIsLargerSignificand)) { + return floatx80_maybe_silence_nan(b); + } else { + return floatx80_maybe_silence_nan(a); + } +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is a quiet +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float128_is_quiet_nan( float128 a ) +{ +#if SNAN_BIT_IS_ONE + return + ( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE ) + && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) ); +#else + return + ( LIT64( 0xFFFE000000000000 ) <= (uint64_t) ( a.high<<1 ) ) + && ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is a +| signaling NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float128_is_signaling_nan( float128 a ) +{ +#if SNAN_BIT_IS_ONE + return + ( LIT64( 0xFFFE000000000000 ) <= (uint64_t) ( a.high<<1 ) ) + && ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) ); +#else + return + ( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE ) + && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns a quiet NaN if the quadruple-precision floating point value `a' is +| a signaling NaN; otherwise returns `a'. +*----------------------------------------------------------------------------*/ + +float128 float128_maybe_silence_nan( float128 a ) +{ + if (float128_is_signaling_nan(a)) { +#if SNAN_BIT_IS_ONE +# if defined(TARGET_MIPS) || defined(TARGET_SH4) || defined(TARGET_UNICORE32) + a.low = float128_default_nan_low; + a.high = float128_default_nan_high; +# else +# error Rules for silencing a signaling NaN are target-specific +# endif +#else + a.high |= LIT64( 0x0000800000000000 ); + return a; +#endif + } + return a; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float128ToCommonNaN( float128 a STATUS_PARAM) +{ + commonNaNT z; + + if ( float128_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR); + z.sign = a.high>>63; + shortShift128Left( a.high, a.low, 16, &z.high, &z.low ); + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the quadruple- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float128 commonNaNToFloat128( commonNaNT a STATUS_PARAM) +{ + float128 z; + + if ( STATUS(default_nan_mode) ) { + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + + shift128Right( a.high, a.low, 16, &z.high, &z.low ); + z.high |= ( ( (uint64_t) a.sign )<<63 ) | LIT64( 0x7FFF000000000000 ); + return z; +} + +/*---------------------------------------------------------------------------- +| Takes two quadruple-precision floating-point values `a' and `b', one of +| which is a NaN, and returns the appropriate NaN result. If either `a' or +| `b' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static float128 propagateFloat128NaN( float128 a, float128 b STATUS_PARAM) +{ + flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN; + flag aIsLargerSignificand; + + aIsQuietNaN = float128_is_quiet_nan( a ); + aIsSignalingNaN = float128_is_signaling_nan( a ); + bIsQuietNaN = float128_is_quiet_nan( b ); + bIsSignalingNaN = float128_is_signaling_nan( b ); + + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR); + + if ( STATUS(default_nan_mode) ) { + a.low = float128_default_nan_low; + a.high = float128_default_nan_high; + return a; + } + + if (lt128(a.high<<1, a.low, b.high<<1, b.low)) { + aIsLargerSignificand = 0; + } else if (lt128(b.high<<1, b.low, a.high<<1, a.low)) { + aIsLargerSignificand = 1; + } else { + aIsLargerSignificand = (a.high < b.high) ? 1 : 0; + } + + if (pickNaN(aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN, + aIsLargerSignificand)) { + return float128_maybe_silence_nan(b); + } else { + return float128_maybe_silence_nan(a); + } +} + diff --git a/fpu/softfloat.c b/fpu/softfloat.c new file mode 100644 index 000000000..b29256a8e --- /dev/null +++ b/fpu/softfloat.c @@ -0,0 +1,6862 @@ +/* + * QEMU float support + * + * Derived from SoftFloat. + */ + +/*============================================================================ + +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/* softfloat (and in particular the code in softfloat-specialize.h) is + * target-dependent and needs the TARGET_* macros. + */ +#include "config.h" + +#include "softfloat.h" + +/*---------------------------------------------------------------------------- +| Primitive arithmetic functions, including multi-word arithmetic, and +| division and square root approximations. (Can be specialized to target if +| desired.) +*----------------------------------------------------------------------------*/ +#include "softfloat-macros.h" + +/*---------------------------------------------------------------------------- +| Functions and definitions to determine: (1) whether tininess for underflow +| is detected before or after rounding by default, (2) what (if anything) +| happens when exceptions are raised, (3) how signaling NaNs are distinguished +| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs +| are propagated from function inputs to output. These details are target- +| specific. +*----------------------------------------------------------------------------*/ +#include "softfloat-specialize.h" + +void set_float_rounding_mode(int val STATUS_PARAM) +{ + STATUS(float_rounding_mode) = val; +} + +void set_float_exception_flags(int val STATUS_PARAM) +{ + STATUS(float_exception_flags) = val; +} + +void set_floatx80_rounding_precision(int val STATUS_PARAM) +{ + STATUS(floatx80_rounding_precision) = val; +} + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the half-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE uint32_t extractFloat16Frac(float16 a) +{ + return float16_val(a) & 0x3ff; +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the half-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int_fast16_t extractFloat16Exp(float16 a) +{ + return (float16_val(a) >> 10) & 0x1f; +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat16Sign(float16 a) +{ + return float16_val(a)>>15; +} + +/*---------------------------------------------------------------------------- +| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 +| and 7, and returns the properly rounded 32-bit integer corresponding to the +| input. If `zSign' is 1, the input is negated before being converted to an +| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input +| is simply rounded to an integer, with the inexact exception raised if the +| input cannot be represented exactly as an integer. However, if the fixed- +| point input is too large, the invalid exception is raised and the largest +| positive or negative integer is returned. +*----------------------------------------------------------------------------*/ + +static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven; + int8 roundIncrement, roundBits; + int32_t z; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + roundIncrement = 0x40; + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + roundIncrement = 0; + } + else { + roundIncrement = 0x7F; + if ( zSign ) { + if ( roundingMode == float_round_up ) roundIncrement = 0; + } + else { + if ( roundingMode == float_round_down ) roundIncrement = 0; + } + } + } + roundBits = absZ & 0x7F; + absZ = ( absZ + roundIncrement )>>7; + absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); + z = absZ; + if ( zSign ) z = - z; + if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { + float_raise( float_flag_invalid STATUS_VAR); + return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; + } + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and +| `absZ1', with binary point between bits 63 and 64 (between the input words), +| and returns the properly rounded 64-bit integer corresponding to the input. +| If `zSign' is 1, the input is negated before being converted to an integer. +| Ordinarily, the fixed-point input is simply rounded to an integer, with +| the inexact exception raised if the input cannot be represented exactly as +| an integer. However, if the fixed-point input is too large, the invalid +| exception is raised and the largest positive or negative integer is +| returned. +*----------------------------------------------------------------------------*/ + +static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven, increment; + int64_t z; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + increment = ( (int64_t) absZ1 < 0 ); + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + increment = 0; + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && absZ1; + } + else { + increment = ( roundingMode == float_round_up ) && absZ1; + } + } + } + if ( increment ) { + ++absZ0; + if ( absZ0 == 0 ) goto overflow; + absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); + } + z = absZ0; + if ( zSign ) z = - z; + if ( z && ( ( z < 0 ) ^ zSign ) ) { + overflow: + float_raise( float_flag_invalid STATUS_VAR); + return + zSign ? (int64_t) LIT64( 0x8000000000000000 ) + : LIT64( 0x7FFFFFFFFFFFFFFF ); + } + if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE uint32_t extractFloat32Frac( float32 a ) +{ + + return float32_val(a) & 0x007FFFFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int_fast16_t extractFloat32Exp(float32 a) +{ + + return ( float32_val(a)>>23 ) & 0xFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat32Sign( float32 a ) +{ + + return float32_val(a)>>31; + +} + +/*---------------------------------------------------------------------------- +| If `a' is denormal and we are in flush-to-zero mode then set the +| input-denormal exception and return zero. Otherwise just return the value. +*----------------------------------------------------------------------------*/ +static float32 float32_squash_input_denormal(float32 a STATUS_PARAM) +{ + if (STATUS(flush_inputs_to_zero)) { + if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { + float_raise(float_flag_input_denormal STATUS_VAR); + return make_float32(float32_val(a) & 0x80000000); + } + } + return a; +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal single-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros32( aSig ) - 8; + *zSigPtr = aSig<<shiftCount; + *zExpPtr = 1 - shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| single-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig) +{ + + return make_float32( + ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper single-precision floating- +| point value corresponding to the abstract input. Ordinarily, the abstract +| value is simply rounded and packed into the single-precision format, with +| the inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal single- +| precision floating-point number. +| The input significand `zSig' has its binary point between bits 30 +| and 29, which is 7 bits to the left of the usual location. This shifted +| significand must be normalized or smaller. If `zSig' is not normalized, +| `zExp' must be 0; in that case, the result returned is a subnormal number, +| and it must not require rounding. In the usual case that `zSig' is +| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. +| The handling of underflow and overflow follows the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven; + int8 roundIncrement, roundBits; + flag isTiny; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + roundIncrement = 0x40; + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + roundIncrement = 0; + } + else { + roundIncrement = 0x7F; + if ( zSign ) { + if ( roundingMode == float_round_up ) roundIncrement = 0; + } + else { + if ( roundingMode == float_round_down ) roundIncrement = 0; + } + } + } + roundBits = zSig & 0x7F; + if ( 0xFD <= (uint16_t) zExp ) { + if ( ( 0xFD < zExp ) + || ( ( zExp == 0xFD ) + && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) + ) { + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); + } + if ( zExp < 0 ) { + if (STATUS(flush_to_zero)) { + float_raise(float_flag_output_denormal STATUS_VAR); + return packFloat32(zSign, 0, 0); + } + isTiny = + ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) + || ( zExp < -1 ) + || ( zSig + roundIncrement < 0x80000000 ); + shift32RightJamming( zSig, - zExp, &zSig ); + zExp = 0; + roundBits = zSig & 0x7F; + if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); + } + } + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; + zSig = ( zSig + roundIncrement )>>7; + zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); + if ( zSig == 0 ) zExp = 0; + return packFloat32( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper single-precision floating- +| point value corresponding to the abstract input. This routine is just like +| `roundAndPackFloat32' except that `zSig' does not have to be normalized. +| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' +| floating-point exponent. +*----------------------------------------------------------------------------*/ + +static float32 + normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros32( zSig ) - 1; + return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); + +} + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE uint64_t extractFloat64Frac( float64 a ) +{ + + return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int_fast16_t extractFloat64Exp(float64 a) +{ + + return ( float64_val(a)>>52 ) & 0x7FF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat64Sign( float64 a ) +{ + + return float64_val(a)>>63; + +} + +/*---------------------------------------------------------------------------- +| If `a' is denormal and we are in flush-to-zero mode then set the +| input-denormal exception and return zero. Otherwise just return the value. +*----------------------------------------------------------------------------*/ +static float64 float64_squash_input_denormal(float64 a STATUS_PARAM) +{ + if (STATUS(flush_inputs_to_zero)) { + if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { + float_raise(float_flag_input_denormal STATUS_VAR); + return make_float64(float64_val(a) & (1ULL << 63)); + } + } + return a; +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal double-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros64( aSig ) - 11; + *zSigPtr = aSig<<shiftCount; + *zExpPtr = 1 - shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| double-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig) +{ + + return make_float64( + ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper double-precision floating- +| point value corresponding to the abstract input. Ordinarily, the abstract +| value is simply rounded and packed into the double-precision format, with +| the inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded +| to a subnormal number, and the underflow and inexact exceptions are raised +| if the abstract input cannot be represented exactly as a subnormal double- +| precision floating-point number. +| The input significand `zSig' has its binary point between bits 62 +| and 61, which is 10 bits to the left of the usual location. This shifted +| significand must be normalized or smaller. If `zSig' is not normalized, +| `zExp' must be 0; in that case, the result returned is a subnormal number, +| and it must not require rounding. In the usual case that `zSig' is +| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. +| The handling of underflow and overflow follows the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven; + int_fast16_t roundIncrement, roundBits; + flag isTiny; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + roundIncrement = 0x200; + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + roundIncrement = 0; + } + else { + roundIncrement = 0x3FF; + if ( zSign ) { + if ( roundingMode == float_round_up ) roundIncrement = 0; + } + else { + if ( roundingMode == float_round_down ) roundIncrement = 0; + } + } + } + roundBits = zSig & 0x3FF; + if ( 0x7FD <= (uint16_t) zExp ) { + if ( ( 0x7FD < zExp ) + || ( ( zExp == 0x7FD ) + && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) + ) { + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); + } + if ( zExp < 0 ) { + if (STATUS(flush_to_zero)) { + float_raise(float_flag_output_denormal STATUS_VAR); + return packFloat64(zSign, 0, 0); + } + isTiny = + ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) + || ( zExp < -1 ) + || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); + shift64RightJamming( zSig, - zExp, &zSig ); + zExp = 0; + roundBits = zSig & 0x3FF; + if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); + } + } + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; + zSig = ( zSig + roundIncrement )>>10; + zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); + if ( zSig == 0 ) zExp = 0; + return packFloat64( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper double-precision floating- +| point value corresponding to the abstract input. This routine is just like +| `roundAndPackFloat64' except that `zSig' does not have to be normalized. +| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' +| floating-point exponent. +*----------------------------------------------------------------------------*/ + +static float64 + normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros64( zSig ) - 1; + return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); + +} + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +INLINE uint64_t extractFloatx80Frac( floatx80 a ) +{ + + return a.low; + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int32 extractFloatx80Exp( floatx80 a ) +{ + + return a.high & 0x7FFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the extended double-precision floating-point value +| `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloatx80Sign( floatx80 a ) +{ + + return a.high>>15; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal extended double-precision floating-point value +| represented by the denormalized significand `aSig'. The normalized exponent +| and significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr ) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros64( aSig ); + *zSigPtr = aSig<<shiftCount; + *zExpPtr = 1 - shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an +| extended double-precision floating-point value, returning the result. +*----------------------------------------------------------------------------*/ + +INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig ) +{ + floatx80 z; + + z.low = zSig; + z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; + return z; + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and extended significand formed by the concatenation of `zSig0' and `zSig1', +| and returns the proper extended double-precision floating-point value +| corresponding to the abstract input. Ordinarily, the abstract value is +| rounded and packed into the extended double-precision format, with the +| inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal extended +| double-precision floating-point number. +| If `roundingPrecision' is 32 or 64, the result is rounded to the same +| number of bits as single or double precision, respectively. Otherwise, the +| result is rounded to the full precision of the extended double-precision +| format. +| The input significand must be normalized or smaller. If the input +| significand is not normalized, `zExp' must be 0; in that case, the result +| returned is a subnormal number, and it must not require rounding. The +| handling of underflow and overflow follows the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static floatx80 + roundAndPackFloatx80( + int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 + STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven, increment, isTiny; + int64 roundIncrement, roundMask, roundBits; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + if ( roundingPrecision == 80 ) goto precision80; + if ( roundingPrecision == 64 ) { + roundIncrement = LIT64( 0x0000000000000400 ); + roundMask = LIT64( 0x00000000000007FF ); + } + else if ( roundingPrecision == 32 ) { + roundIncrement = LIT64( 0x0000008000000000 ); + roundMask = LIT64( 0x000000FFFFFFFFFF ); + } + else { + goto precision80; + } + zSig0 |= ( zSig1 != 0 ); + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + roundIncrement = 0; + } + else { + roundIncrement = roundMask; + if ( zSign ) { + if ( roundingMode == float_round_up ) roundIncrement = 0; + } + else { + if ( roundingMode == float_round_down ) roundIncrement = 0; + } + } + } + roundBits = zSig0 & roundMask; + if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { + if ( ( 0x7FFE < zExp ) + || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) + ) { + goto overflow; + } + if ( zExp <= 0 ) { + if (STATUS(flush_to_zero)) { + float_raise(float_flag_output_denormal STATUS_VAR); + return packFloatx80(zSign, 0, 0); + } + isTiny = + ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) + || ( zExp < 0 ) + || ( zSig0 <= zSig0 + roundIncrement ); + shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); + zExp = 0; + roundBits = zSig0 & roundMask; + if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; + zSig0 += roundIncrement; + if ( (int64_t) zSig0 < 0 ) zExp = 1; + roundIncrement = roundMask + 1; + if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { + roundMask |= roundIncrement; + } + zSig0 &= ~ roundMask; + return packFloatx80( zSign, zExp, zSig0 ); + } + } + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; + zSig0 += roundIncrement; + if ( zSig0 < roundIncrement ) { + ++zExp; + zSig0 = LIT64( 0x8000000000000000 ); + } + roundIncrement = roundMask + 1; + if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { + roundMask |= roundIncrement; + } + zSig0 &= ~ roundMask; + if ( zSig0 == 0 ) zExp = 0; + return packFloatx80( zSign, zExp, zSig0 ); + precision80: + increment = ( (int64_t) zSig1 < 0 ); + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + increment = 0; + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && zSig1; + } + else { + increment = ( roundingMode == float_round_up ) && zSig1; + } + } + } + if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { + if ( ( 0x7FFE < zExp ) + || ( ( zExp == 0x7FFE ) + && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) + && increment + ) + ) { + roundMask = 0; + overflow: + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + if ( ( roundingMode == float_round_to_zero ) + || ( zSign && ( roundingMode == float_round_up ) ) + || ( ! zSign && ( roundingMode == float_round_down ) ) + ) { + return packFloatx80( zSign, 0x7FFE, ~ roundMask ); + } + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( zExp <= 0 ) { + isTiny = + ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) + || ( zExp < 0 ) + || ! increment + || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); + shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); + zExp = 0; + if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR); + if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; + if ( roundNearestEven ) { + increment = ( (int64_t) zSig1 < 0 ); + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && zSig1; + } + else { + increment = ( roundingMode == float_round_up ) && zSig1; + } + } + if ( increment ) { + ++zSig0; + zSig0 &= + ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); + if ( (int64_t) zSig0 < 0 ) zExp = 1; + } + return packFloatx80( zSign, zExp, zSig0 ); + } + } + if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; + if ( increment ) { + ++zSig0; + if ( zSig0 == 0 ) { + ++zExp; + zSig0 = LIT64( 0x8000000000000000 ); + } + else { + zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); + } + } + else { + if ( zSig0 == 0 ) zExp = 0; + } + return packFloatx80( zSign, zExp, zSig0 ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent +| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', +| and returns the proper extended double-precision floating-point value +| corresponding to the abstract input. This routine is just like +| `roundAndPackFloatx80' except that the input significand does not have to be +| normalized. +*----------------------------------------------------------------------------*/ + +static floatx80 + normalizeRoundAndPackFloatx80( + int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 + STATUS_PARAM) +{ + int8 shiftCount; + + if ( zSig0 == 0 ) { + zSig0 = zSig1; + zSig1 = 0; + zExp -= 64; + } + shiftCount = countLeadingZeros64( zSig0 ); + shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); + zExp -= shiftCount; + return + roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR); + +} + +/*---------------------------------------------------------------------------- +| Returns the least-significant 64 fraction bits of the quadruple-precision +| floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE uint64_t extractFloat128Frac1( float128 a ) +{ + + return a.low; + +} + +/*---------------------------------------------------------------------------- +| Returns the most-significant 48 fraction bits of the quadruple-precision +| floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE uint64_t extractFloat128Frac0( float128 a ) +{ + + return a.high & LIT64( 0x0000FFFFFFFFFFFF ); + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the quadruple-precision floating-point value +| `a'. +*----------------------------------------------------------------------------*/ + +INLINE int32 extractFloat128Exp( float128 a ) +{ + + return ( a.high>>48 ) & 0x7FFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the quadruple-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat128Sign( float128 a ) +{ + + return a.high>>63; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal quadruple-precision floating-point value +| represented by the denormalized significand formed by the concatenation of +| `aSig0' and `aSig1'. The normalized exponent is stored at the location +| pointed to by `zExpPtr'. The most significant 49 bits of the normalized +| significand are stored at the location pointed to by `zSig0Ptr', and the +| least significant 64 bits of the normalized significand are stored at the +| location pointed to by `zSig1Ptr'. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloat128Subnormal( + uint64_t aSig0, + uint64_t aSig1, + int32 *zExpPtr, + uint64_t *zSig0Ptr, + uint64_t *zSig1Ptr + ) +{ + int8 shiftCount; + + if ( aSig0 == 0 ) { + shiftCount = countLeadingZeros64( aSig1 ) - 15; + if ( shiftCount < 0 ) { + *zSig0Ptr = aSig1>>( - shiftCount ); + *zSig1Ptr = aSig1<<( shiftCount & 63 ); + } + else { + *zSig0Ptr = aSig1<<shiftCount; + *zSig1Ptr = 0; + } + *zExpPtr = - shiftCount - 63; + } + else { + shiftCount = countLeadingZeros64( aSig0 ) - 15; + shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); + *zExpPtr = 1 - shiftCount; + } + +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', the exponent `zExp', and the significand formed +| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision +| floating-point value, returning the result. After being shifted into the +| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply +| added together to form the most significant 32 bits of the result. This +| means that any integer portion of `zSig0' will be added into the exponent. +| Since a properly normalized significand will have an integer portion equal +| to 1, the `zExp' input should be 1 less than the desired result exponent +| whenever `zSig0' and `zSig1' concatenated form a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +INLINE float128 + packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 ) +{ + float128 z; + + z.low = zSig1; + z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; + return z; + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and extended significand formed by the concatenation of `zSig0', `zSig1', +| and `zSig2', and returns the proper quadruple-precision floating-point value +| corresponding to the abstract input. Ordinarily, the abstract value is +| simply rounded and packed into the quadruple-precision format, with the +| inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal quadruple- +| precision floating-point number. +| The input significand must be normalized or smaller. If the input +| significand is not normalized, `zExp' must be 0; in that case, the result +| returned is a subnormal number, and it must not require rounding. In the +| usual case that the input significand is normalized, `zExp' must be 1 less +| than the ``true'' floating-point exponent. The handling of underflow and +| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float128 + roundAndPackFloat128( + flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven, increment, isTiny; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + increment = ( (int64_t) zSig2 < 0 ); + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + increment = 0; + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && zSig2; + } + else { + increment = ( roundingMode == float_round_up ) && zSig2; + } + } + } + if ( 0x7FFD <= (uint32_t) zExp ) { + if ( ( 0x7FFD < zExp ) + || ( ( zExp == 0x7FFD ) + && eq128( + LIT64( 0x0001FFFFFFFFFFFF ), + LIT64( 0xFFFFFFFFFFFFFFFF ), + zSig0, + zSig1 + ) + && increment + ) + ) { + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + if ( ( roundingMode == float_round_to_zero ) + || ( zSign && ( roundingMode == float_round_up ) ) + || ( ! zSign && ( roundingMode == float_round_down ) ) + ) { + return + packFloat128( + zSign, + 0x7FFE, + LIT64( 0x0000FFFFFFFFFFFF ), + LIT64( 0xFFFFFFFFFFFFFFFF ) + ); + } + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( zExp < 0 ) { + if (STATUS(flush_to_zero)) { + float_raise(float_flag_output_denormal STATUS_VAR); + return packFloat128(zSign, 0, 0, 0); + } + isTiny = + ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) + || ( zExp < -1 ) + || ! increment + || lt128( + zSig0, + zSig1, + LIT64( 0x0001FFFFFFFFFFFF ), + LIT64( 0xFFFFFFFFFFFFFFFF ) + ); + shift128ExtraRightJamming( + zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); + zExp = 0; + if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR); + if ( roundNearestEven ) { + increment = ( (int64_t) zSig2 < 0 ); + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && zSig2; + } + else { + increment = ( roundingMode == float_round_up ) && zSig2; + } + } + } + } + if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact; + if ( increment ) { + add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); + zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); + } + else { + if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; + } + return packFloat128( zSign, zExp, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand formed by the concatenation of `zSig0' and `zSig1', and +| returns the proper quadruple-precision floating-point value corresponding +| to the abstract input. This routine is just like `roundAndPackFloat128' +| except that the input significand has fewer bits and does not have to be +| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- +| point exponent. +*----------------------------------------------------------------------------*/ + +static float128 + normalizeRoundAndPackFloat128( + flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM) +{ + int8 shiftCount; + uint64_t zSig2; + + if ( zSig0 == 0 ) { + zSig0 = zSig1; + zSig1 = 0; + zExp -= 64; + } + shiftCount = countLeadingZeros64( zSig0 ) - 15; + if ( 0 <= shiftCount ) { + zSig2 = 0; + shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); + } + else { + shift128ExtraRightJamming( + zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); + } + zExp -= shiftCount; + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 32-bit two's complement integer `a' +| to the single-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 int32_to_float32( int32 a STATUS_PARAM ) +{ + flag zSign; + + if ( a == 0 ) return float32_zero; + if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); + zSign = ( a < 0 ); + return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 32-bit two's complement integer `a' +| to the double-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 int32_to_float64( int32 a STATUS_PARAM ) +{ + flag zSign; + uint32 absA; + int8 shiftCount; + uint64_t zSig; + + if ( a == 0 ) return float64_zero; + zSign = ( a < 0 ); + absA = zSign ? - a : a; + shiftCount = countLeadingZeros32( absA ) + 21; + zSig = absA; + return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 32-bit two's complement integer `a' +| to the extended double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 int32_to_floatx80( int32 a STATUS_PARAM ) +{ + flag zSign; + uint32 absA; + int8 shiftCount; + uint64_t zSig; + + if ( a == 0 ) return packFloatx80( 0, 0, 0 ); + zSign = ( a < 0 ); + absA = zSign ? - a : a; + shiftCount = countLeadingZeros32( absA ) + 32; + zSig = absA; + return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 32-bit two's complement integer `a' to +| the quadruple-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 int32_to_float128( int32 a STATUS_PARAM ) +{ + flag zSign; + uint32 absA; + int8 shiftCount; + uint64_t zSig0; + + if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); + zSign = ( a < 0 ); + absA = zSign ? - a : a; + shiftCount = countLeadingZeros32( absA ) + 17; + zSig0 = absA; + return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 64-bit two's complement integer `a' +| to the single-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 int64_to_float32( int64 a STATUS_PARAM ) +{ + flag zSign; + uint64 absA; + int8 shiftCount; + + if ( a == 0 ) return float32_zero; + zSign = ( a < 0 ); + absA = zSign ? - a : a; + shiftCount = countLeadingZeros64( absA ) - 40; + if ( 0 <= shiftCount ) { + return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); + } + else { + shiftCount += 7; + if ( shiftCount < 0 ) { + shift64RightJamming( absA, - shiftCount, &absA ); + } + else { + absA <<= shiftCount; + } + return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR ); + } + +} + +float32 uint64_to_float32( uint64 a STATUS_PARAM ) +{ + int8 shiftCount; + + if ( a == 0 ) return float32_zero; + shiftCount = countLeadingZeros64( a ) - 40; + if ( 0 <= shiftCount ) { + return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount ); + } + else { + shiftCount += 7; + if ( shiftCount < 0 ) { + shift64RightJamming( a, - shiftCount, &a ); + } + else { + a <<= shiftCount; + } + return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR ); + } +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 64-bit two's complement integer `a' +| to the double-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 int64_to_float64( int64 a STATUS_PARAM ) +{ + flag zSign; + + if ( a == 0 ) return float64_zero; + if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { + return packFloat64( 1, 0x43E, 0 ); + } + zSign = ( a < 0 ); + return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR ); + +} + +float64 uint64_to_float64( uint64 a STATUS_PARAM ) +{ + if ( a == 0 ) return float64_zero; + return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 64-bit two's complement integer `a' +| to the extended double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 int64_to_floatx80( int64 a STATUS_PARAM ) +{ + flag zSign; + uint64 absA; + int8 shiftCount; + + if ( a == 0 ) return packFloatx80( 0, 0, 0 ); + zSign = ( a < 0 ); + absA = zSign ? - a : a; + shiftCount = countLeadingZeros64( absA ); + return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 64-bit two's complement integer `a' to +| the quadruple-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 int64_to_float128( int64 a STATUS_PARAM ) +{ + flag zSign; + uint64 absA; + int8 shiftCount; + int32 zExp; + uint64_t zSig0, zSig1; + + if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); + zSign = ( a < 0 ); + absA = zSign ? - a : a; + shiftCount = countLeadingZeros64( absA ) + 49; + zExp = 0x406E - shiftCount; + if ( 64 <= shiftCount ) { + zSig1 = 0; + zSig0 = absA; + shiftCount -= 64; + } + else { + zSig1 = absA; + zSig0 = 0; + } + shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); + return packFloat128( zSign, zExp, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 float32_to_int32( float32 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp, shiftCount; + uint32_t aSig; + uint64_t aSig64; + + a = float32_squash_input_denormal(a STATUS_VAR); + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( ( aExp == 0xFF ) && aSig ) aSign = 0; + if ( aExp ) aSig |= 0x00800000; + shiftCount = 0xAF - aExp; + aSig64 = aSig; + aSig64 <<= 32; + if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); + return roundAndPackInt32( aSign, aSig64 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp, shiftCount; + uint32_t aSig; + int32_t z; + a = float32_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + shiftCount = aExp - 0x9E; + if ( 0 <= shiftCount ) { + if ( float32_val(a) != 0xCF000000 ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; + } + return (int32_t) 0x80000000; + } + else if ( aExp <= 0x7E ) { + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + aSig = ( aSig | 0x00800000 )<<8; + z = aSig>>( - shiftCount ); + if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 16-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM) +{ + flag aSign; + int_fast16_t aExp, shiftCount; + uint32_t aSig; + int32 z; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + shiftCount = aExp - 0x8E; + if ( 0 <= shiftCount ) { + if ( float32_val(a) != 0xC7000000 ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { + return 0x7FFF; + } + } + return (int32_t) 0xffff8000; + } + else if ( aExp <= 0x7E ) { + if ( aExp | aSig ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return 0; + } + shiftCount -= 0x10; + aSig = ( aSig | 0x00800000 )<<8; + z = aSig>>( - shiftCount ); + if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + if ( aSign ) { + z = - z; + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int64 float32_to_int64( float32 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp, shiftCount; + uint32_t aSig; + uint64_t aSig64, aSigExtra; + a = float32_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + shiftCount = 0xBE - aExp; + if ( shiftCount < 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + return (int64_t) LIT64( 0x8000000000000000 ); + } + if ( aExp ) aSig |= 0x00800000; + aSig64 = aSig; + aSig64 <<= 40; + shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); + return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. If +| `a' is a NaN, the largest positive integer is returned. Otherwise, if the +| conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp, shiftCount; + uint32_t aSig; + uint64_t aSig64; + int64 z; + a = float32_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + shiftCount = aExp - 0xBE; + if ( 0 <= shiftCount ) { + if ( float32_val(a) != 0xDF000000 ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + } + return (int64_t) LIT64( 0x8000000000000000 ); + } + else if ( aExp <= 0x7E ) { + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + aSig64 = aSig | 0x00800000; + aSig64 <<= 40; + z = aSig64>>( - shiftCount ); + if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the double-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float32_to_float64( float32 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp; + uint32_t aSig; + a = float32_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + return packFloat64( aSign, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + --aExp; + } + return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the extended double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 float32_to_floatx80( float32 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp; + uint32_t aSig; + + a = float32_squash_input_denormal(a STATUS_VAR); + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + aSig |= 0x00800000; + return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the double-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float32_to_float128( float32 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp; + uint32_t aSig; + + a = float32_squash_input_denormal(a STATUS_VAR); + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + return packFloat128( aSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + --aExp; + } + return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); + +} + +/*---------------------------------------------------------------------------- +| Rounds the single-precision floating-point value `a' to an integer, and +| returns the result as a single-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_round_to_int( float32 a STATUS_PARAM) +{ + flag aSign; + int_fast16_t aExp; + uint32_t lastBitMask, roundBitsMask; + int8 roundingMode; + uint32_t z; + a = float32_squash_input_denormal(a STATUS_VAR); + + aExp = extractFloat32Exp( a ); + if ( 0x96 <= aExp ) { + if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { + return propagateFloat32NaN( a, a STATUS_VAR ); + } + return a; + } + if ( aExp <= 0x7E ) { + if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; + STATUS(float_exception_flags) |= float_flag_inexact; + aSign = extractFloat32Sign( a ); + switch ( STATUS(float_rounding_mode) ) { + case float_round_nearest_even: + if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { + return packFloat32( aSign, 0x7F, 0 ); + } + break; + case float_round_down: + return make_float32(aSign ? 0xBF800000 : 0); + case float_round_up: + return make_float32(aSign ? 0x80000000 : 0x3F800000); + } + return packFloat32( aSign, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x96 - aExp; + roundBitsMask = lastBitMask - 1; + z = float32_val(a); + roundingMode = STATUS(float_rounding_mode); + if ( roundingMode == float_round_nearest_even ) { + z += lastBitMask>>1; + if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) { + z += roundBitsMask; + } + } + z &= ~ roundBitsMask; + if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact; + return make_float32(z); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the single-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) +{ + int_fast16_t aExp, bExp, zExp; + uint32_t aSig, bSig, zSig; + int_fast16_t expDiff; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + expDiff = aExp - bExp; + aSig <<= 6; + bSig <<= 6; + if ( 0 < expDiff ) { + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= 0x20000000; + } + shift32RightJamming( bSig, expDiff, &bSig ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return packFloat32( zSign, 0xFF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= 0x20000000; + } + shift32RightJamming( aSig, - expDiff, &aSig ); + zExp = bExp; + } + else { + if ( aExp == 0xFF ) { + if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return a; + } + if ( aExp == 0 ) { + if (STATUS(flush_to_zero)) { + if (aSig | bSig) { + float_raise(float_flag_output_denormal STATUS_VAR); + } + return packFloat32(zSign, 0, 0); + } + return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); + } + zSig = 0x40000000 + aSig + bSig; + zExp = aExp; + goto roundAndPack; + } + aSig |= 0x20000000; + zSig = ( aSig + bSig )<<1; + --zExp; + if ( (int32_t) zSig < 0 ) { + zSig = aSig + bSig; + ++zExp; + } + roundAndPack: + return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the single- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) +{ + int_fast16_t aExp, bExp, zExp; + uint32_t aSig, bSig, zSig; + int_fast16_t expDiff; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + expDiff = aExp - bExp; + aSig <<= 7; + bSig <<= 7; + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0xFF ) { + if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + if ( bSig < aSig ) goto aBigger; + if ( aSig < bSig ) goto bBigger; + return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); + bExpBigger: + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return packFloat32( zSign ^ 1, 0xFF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= 0x40000000; + } + shift32RightJamming( aSig, - expDiff, &aSig ); + bSig |= 0x40000000; + bBigger: + zSig = bSig - aSig; + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= 0x40000000; + } + shift32RightJamming( bSig, expDiff, &bSig ); + aSig |= 0x40000000; + aBigger: + zSig = aSig - bSig; + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the single-precision floating-point values `a' +| and `b'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_add( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign == bSign ) { + return addFloat32Sigs( a, b, aSign STATUS_VAR); + } + else { + return subFloat32Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the single-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_sub( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign == bSign ) { + return subFloat32Sigs( a, b, aSign STATUS_VAR ); + } + else { + return addFloat32Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the single-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_mul( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int_fast16_t aExp, bExp, zExp; + uint32_t aSig, bSig; + uint64_t zSig64; + uint32_t zSig; + + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + bSign = extractFloat32Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0xFF ) { + if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { + return propagateFloat32NaN( a, b STATUS_VAR ); + } + if ( ( bExp | bSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + zExp = aExp + bExp - 0x7F; + aSig = ( aSig | 0x00800000 )<<7; + bSig = ( bSig | 0x00800000 )<<8; + shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 ); + zSig = zSig64; + if ( 0 <= (int32_t) ( zSig<<1 ) ) { + zSig <<= 1; + --zExp; + } + return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the single-precision floating-point value `a' +| by the corresponding value `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_div( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int_fast16_t aExp, bExp, zExp; + uint32_t aSig, bSig, zSig; + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + bSign = extractFloat32Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return packFloat32( zSign, 0, 0 ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + float_raise( float_flag_divbyzero STATUS_VAR); + return packFloat32( zSign, 0xFF, 0 ); + } + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + zExp = aExp - bExp + 0x7D; + aSig = ( aSig | 0x00800000 )<<7; + bSig = ( bSig | 0x00800000 )<<8; + if ( bSig <= ( aSig + aSig ) ) { + aSig >>= 1; + ++zExp; + } + zSig = ( ( (uint64_t) aSig )<<32 ) / bSig; + if ( ( zSig & 0x3F ) == 0 ) { + zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 ); + } + return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the single-precision floating-point value `a' +| with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_rem( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, zSign; + int_fast16_t aExp, bExp, expDiff; + uint32_t aSig, bSig; + uint32_t q; + uint64_t aSig64, bSig64, q64; + uint32_t alternateASig; + int32_t sigMean; + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + if ( aExp == 0xFF ) { + if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { + return propagateFloat32NaN( a, b STATUS_VAR ); + } + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return a; + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + expDiff = aExp - bExp; + aSig |= 0x00800000; + bSig |= 0x00800000; + if ( expDiff < 32 ) { + aSig <<= 8; + bSig <<= 8; + if ( expDiff < 0 ) { + if ( expDiff < -1 ) return a; + aSig >>= 1; + } + q = ( bSig <= aSig ); + if ( q ) aSig -= bSig; + if ( 0 < expDiff ) { + q = ( ( (uint64_t) aSig )<<32 ) / bSig; + q >>= 32 - expDiff; + bSig >>= 2; + aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; + } + else { + aSig >>= 2; + bSig >>= 2; + } + } + else { + if ( bSig <= aSig ) aSig -= bSig; + aSig64 = ( (uint64_t) aSig )<<40; + bSig64 = ( (uint64_t) bSig )<<40; + expDiff -= 64; + while ( 0 < expDiff ) { + q64 = estimateDiv128To64( aSig64, 0, bSig64 ); + q64 = ( 2 < q64 ) ? q64 - 2 : 0; + aSig64 = - ( ( bSig * q64 )<<38 ); + expDiff -= 62; + } + expDiff += 64; + q64 = estimateDiv128To64( aSig64, 0, bSig64 ); + q64 = ( 2 < q64 ) ? q64 - 2 : 0; + q = q64>>( 64 - expDiff ); + bSig <<= 6; + aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; + } + do { + alternateASig = aSig; + ++q; + aSig -= bSig; + } while ( 0 <= (int32_t) aSig ); + sigMean = aSig + alternateASig; + if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { + aSig = alternateASig; + } + zSign = ( (int32_t) aSig < 0 ); + if ( zSign ) aSig = - aSig; + return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the single-precision floating-point values +| `a' and `b' then adding 'c', with no intermediate rounding step after the +| multiplication. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic 754-2008. +| The flags argument allows the caller to select negation of the +| addend, the intermediate product, or the final result. (The difference +| between this and having the caller do a separate negation is that negating +| externally will flip the sign bit on NaNs.) +*----------------------------------------------------------------------------*/ + +float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM) +{ + flag aSign, bSign, cSign, zSign; + int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff; + uint32_t aSig, bSig, cSig; + flag pInf, pZero, pSign; + uint64_t pSig64, cSig64, zSig64; + uint32_t pSig; + int shiftcount; + flag signflip, infzero; + + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + c = float32_squash_input_denormal(c STATUS_VAR); + aSig = extractFloat32Frac(a); + aExp = extractFloat32Exp(a); + aSign = extractFloat32Sign(a); + bSig = extractFloat32Frac(b); + bExp = extractFloat32Exp(b); + bSign = extractFloat32Sign(b); + cSig = extractFloat32Frac(c); + cExp = extractFloat32Exp(c); + cSign = extractFloat32Sign(c); + + infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) || + (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0)); + + /* It is implementation-defined whether the cases of (0,inf,qnan) + * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN + * they return if they do), so we have to hand this information + * off to the target-specific pick-a-NaN routine. + */ + if (((aExp == 0xff) && aSig) || + ((bExp == 0xff) && bSig) || + ((cExp == 0xff) && cSig)) { + return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR); + } + + if (infzero) { + float_raise(float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + + if (flags & float_muladd_negate_c) { + cSign ^= 1; + } + + signflip = (flags & float_muladd_negate_result) ? 1 : 0; + + /* Work out the sign and type of the product */ + pSign = aSign ^ bSign; + if (flags & float_muladd_negate_product) { + pSign ^= 1; + } + pInf = (aExp == 0xff) || (bExp == 0xff); + pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); + + if (cExp == 0xff) { + if (pInf && (pSign ^ cSign)) { + /* addition of opposite-signed infinities => InvalidOperation */ + float_raise(float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + /* Otherwise generate an infinity of the same sign */ + return packFloat32(cSign ^ signflip, 0xff, 0); + } + + if (pInf) { + return packFloat32(pSign ^ signflip, 0xff, 0); + } + + if (pZero) { + if (cExp == 0) { + if (cSig == 0) { + /* Adding two exact zeroes */ + if (pSign == cSign) { + zSign = pSign; + } else if (STATUS(float_rounding_mode) == float_round_down) { + zSign = 1; + } else { + zSign = 0; + } + return packFloat32(zSign ^ signflip, 0, 0); + } + /* Exact zero plus a denorm */ + if (STATUS(flush_to_zero)) { + float_raise(float_flag_output_denormal STATUS_VAR); + return packFloat32(cSign ^ signflip, 0, 0); + } + } + /* Zero plus something non-zero : just return the something */ + return make_float32(float32_val(c) ^ (signflip << 31)); + } + + if (aExp == 0) { + normalizeFloat32Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + normalizeFloat32Subnormal(bSig, &bExp, &bSig); + } + + /* Calculate the actual result a * b + c */ + + /* Multiply first; this is easy. */ + /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f + * because we want the true exponent, not the "one-less-than" + * flavour that roundAndPackFloat32() takes. + */ + pExp = aExp + bExp - 0x7e; + aSig = (aSig | 0x00800000) << 7; + bSig = (bSig | 0x00800000) << 8; + pSig64 = (uint64_t)aSig * bSig; + if ((int64_t)(pSig64 << 1) >= 0) { + pSig64 <<= 1; + pExp--; + } + + zSign = pSign ^ signflip; + + /* Now pSig64 is the significand of the multiply, with the explicit bit in + * position 62. + */ + if (cExp == 0) { + if (!cSig) { + /* Throw out the special case of c being an exact zero now */ + shift64RightJamming(pSig64, 32, &pSig64); + pSig = pSig64; + return roundAndPackFloat32(zSign, pExp - 1, + pSig STATUS_VAR); + } + normalizeFloat32Subnormal(cSig, &cExp, &cSig); + } + + cSig64 = (uint64_t)cSig << (62 - 23); + cSig64 |= LIT64(0x4000000000000000); + expDiff = pExp - cExp; + + if (pSign == cSign) { + /* Addition */ + if (expDiff > 0) { + /* scale c to match p */ + shift64RightJamming(cSig64, expDiff, &cSig64); + zExp = pExp; + } else if (expDiff < 0) { + /* scale p to match c */ + shift64RightJamming(pSig64, -expDiff, &pSig64); + zExp = cExp; + } else { + /* no scaling needed */ + zExp = cExp; + } + /* Add significands and make sure explicit bit ends up in posn 62 */ + zSig64 = pSig64 + cSig64; + if ((int64_t)zSig64 < 0) { + shift64RightJamming(zSig64, 1, &zSig64); + } else { + zExp--; + } + } else { + /* Subtraction */ + if (expDiff > 0) { + shift64RightJamming(cSig64, expDiff, &cSig64); + zSig64 = pSig64 - cSig64; + zExp = pExp; + } else if (expDiff < 0) { + shift64RightJamming(pSig64, -expDiff, &pSig64); + zSig64 = cSig64 - pSig64; + zExp = cExp; + zSign ^= 1; + } else { + zExp = pExp; + if (cSig64 < pSig64) { + zSig64 = pSig64 - cSig64; + } else if (pSig64 < cSig64) { + zSig64 = cSig64 - pSig64; + zSign ^= 1; + } else { + /* Exact zero */ + zSign = signflip; + if (STATUS(float_rounding_mode) == float_round_down) { + zSign ^= 1; + } + return packFloat32(zSign, 0, 0); + } + } + --zExp; + /* Normalize to put the explicit bit back into bit 62. */ + shiftcount = countLeadingZeros64(zSig64) - 1; + zSig64 <<= shiftcount; + zExp -= shiftcount; + } + shift64RightJamming(zSig64, 32, &zSig64); + return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR); +} + + +/*---------------------------------------------------------------------------- +| Returns the square root of the single-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_sqrt( float32 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp, zExp; + uint32_t aSig, zSig; + uint64_t rem, term; + a = float32_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); + if ( ! aSign ) return a; + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + if ( aSign ) { + if ( ( aExp | aSig ) == 0 ) return a; + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return float32_zero; + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; + aSig = ( aSig | 0x00800000 )<<8; + zSig = estimateSqrt32( aExp, aSig ) + 2; + if ( ( zSig & 0x7F ) <= 5 ) { + if ( zSig < 2 ) { + zSig = 0x7FFFFFFF; + goto roundAndPack; + } + aSig >>= aExp & 1; + term = ( (uint64_t) zSig ) * zSig; + rem = ( ( (uint64_t) aSig )<<32 ) - term; + while ( (int64_t) rem < 0 ) { + --zSig; + rem += ( ( (uint64_t) zSig )<<1 ) | 1; + } + zSig |= ( rem != 0 ); + } + shift32RightJamming( zSig, 1, &zSig ); + roundAndPack: + return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the binary exponential of the single-precision floating-point value +| `a'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +| +| Uses the following identities: +| +| 1. ------------------------------------------------------------------------- +| x x*ln(2) +| 2 = e +| +| 2. ------------------------------------------------------------------------- +| 2 3 4 5 n +| x x x x x x x +| e = 1 + --- + --- + --- + --- + --- + ... + --- + ... +| 1! 2! 3! 4! 5! n! +*----------------------------------------------------------------------------*/ + +static const float64 float32_exp2_coefficients[15] = +{ + const_float64( 0x3ff0000000000000ll ), /* 1 */ + const_float64( 0x3fe0000000000000ll ), /* 2 */ + const_float64( 0x3fc5555555555555ll ), /* 3 */ + const_float64( 0x3fa5555555555555ll ), /* 4 */ + const_float64( 0x3f81111111111111ll ), /* 5 */ + const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ + const_float64( 0x3f2a01a01a01a01all ), /* 7 */ + const_float64( 0x3efa01a01a01a01all ), /* 8 */ + const_float64( 0x3ec71de3a556c734ll ), /* 9 */ + const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ + const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ + const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ + const_float64( 0x3de6124613a86d09ll ), /* 13 */ + const_float64( 0x3da93974a8c07c9dll ), /* 14 */ + const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ +}; + +float32 float32_exp2( float32 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp; + uint32_t aSig; + float64 r, x, xn; + int i; + a = float32_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + + if ( aExp == 0xFF) { + if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); + return (aSign) ? float32_zero : a; + } + if (aExp == 0) { + if (aSig == 0) return float32_one; + } + + float_raise( float_flag_inexact STATUS_VAR); + + /* ******************************* */ + /* using float64 for approximation */ + /* ******************************* */ + x = float32_to_float64(a STATUS_VAR); + x = float64_mul(x, float64_ln2 STATUS_VAR); + + xn = x; + r = float64_one; + for (i = 0 ; i < 15 ; i++) { + float64 f; + + f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR); + r = float64_add(r, f STATUS_VAR); + + xn = float64_mul(xn, x STATUS_VAR); + } + + return float64_to_float32(r, status); +} + +/*---------------------------------------------------------------------------- +| Returns the binary log of the single-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ +float32 float32_log2( float32 a STATUS_PARAM ) +{ + flag aSign, zSign; + int_fast16_t aExp; + uint32_t aSig, zSig, i; + + a = float32_squash_input_denormal(a STATUS_VAR); + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + if ( aSign ) { + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); + return a; + } + + aExp -= 0x7F; + aSig |= 0x00800000; + zSign = aExp < 0; + zSig = aExp << 23; + + for (i = 1 << 22; i > 0; i >>= 1) { + aSig = ( (uint64_t)aSig * aSig ) >> 23; + if ( aSig & 0x01000000 ) { + aSig >>= 1; + zSig |= i; + } + } + + if ( zSign ) + zSig = -zSig; + + return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR ); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_eq( float32 a, float32 b STATUS_PARAM ) +{ + uint32_t av, bv; + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + av = float32_val(a); + bv = float32_val(b); + return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| or equal to the corresponding value `b', and 0 otherwise. The invalid +| exception is raised if either operand is a NaN. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_le( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + uint32_t av, bv; + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + av = float32_val(a); + bv = float32_val(b); + if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); + return ( av == bv ) || ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. The comparison is performed according +| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_lt( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + uint32_t av, bv; + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + av = float32_val(a); + bv = float32_val(b); + if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); + return ( av != bv ) && ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point values `a' and `b' cannot +| be compared, and 0 otherwise. The invalid exception is raised if either +| operand is a NaN. The comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_unordered( float32 a, float32 b STATUS_PARAM ) +{ + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 1; + } + return 0; +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. The comparison is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_eq_quiet( float32 a, float32 b STATUS_PARAM ) +{ + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + return ( float32_val(a) == float32_val(b) ) || + ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than or +| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_le_quiet( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + uint32_t av, bv; + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + av = float32_val(a); + bv = float32_val(b); + if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); + return ( av == bv ) || ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. Otherwise, the comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_lt_quiet( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + uint32_t av, bv; + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + av = float32_val(a); + bv = float32_val(b); + if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); + return ( av != bv ) && ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point values `a' and `b' cannot +| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The +| comparison is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM ) +{ + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 1; + } + return 0; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 float64_to_int32( float64 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp, shiftCount; + uint64_t aSig; + a = float64_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; + if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); + shiftCount = 0x42C - aExp; + if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); + return roundAndPackInt32( aSign, aSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp, shiftCount; + uint64_t aSig, savedASig; + int32_t z; + a = float64_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( 0x41E < aExp ) { + if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; + goto invalid; + } + else if ( aExp < 0x3FF ) { + if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + aSig |= LIT64( 0x0010000000000000 ); + shiftCount = 0x433 - aExp; + savedASig = aSig; + aSig >>= shiftCount; + z = aSig; + if ( aSign ) z = - z; + if ( ( z < 0 ) ^ aSign ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; + } + if ( ( aSig<<shiftCount ) != savedASig ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 16-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM) +{ + flag aSign; + int_fast16_t aExp, shiftCount; + uint64_t aSig, savedASig; + int32 z; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( 0x40E < aExp ) { + if ( ( aExp == 0x7FF ) && aSig ) { + aSign = 0; + } + goto invalid; + } + else if ( aExp < 0x3FF ) { + if ( aExp || aSig ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return 0; + } + aSig |= LIT64( 0x0010000000000000 ); + shiftCount = 0x433 - aExp; + savedASig = aSig; + aSig >>= shiftCount; + z = aSig; + if ( aSign ) { + z = - z; + } + if ( ( (int16_t)z < 0 ) ^ aSign ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + return aSign ? (int32_t) 0xffff8000 : 0x7FFF; + } + if ( ( aSig<<shiftCount ) != savedASig ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int64 float64_to_int64( float64 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp, shiftCount; + uint64_t aSig, aSigExtra; + a = float64_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); + shiftCount = 0x433 - aExp; + if ( shiftCount <= 0 ) { + if ( 0x43E < aExp ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign + || ( ( aExp == 0x7FF ) + && ( aSig != LIT64( 0x0010000000000000 ) ) ) + ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + return (int64_t) LIT64( 0x8000000000000000 ); + } + aSigExtra = 0; + aSig <<= - shiftCount; + } + else { + shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); + } + return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp, shiftCount; + uint64_t aSig; + int64 z; + a = float64_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); + shiftCount = aExp - 0x433; + if ( 0 <= shiftCount ) { + if ( 0x43E <= aExp ) { + if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign + || ( ( aExp == 0x7FF ) + && ( aSig != LIT64( 0x0010000000000000 ) ) ) + ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + } + return (int64_t) LIT64( 0x8000000000000000 ); + } + z = aSig<<shiftCount; + } + else { + if ( aExp < 0x3FE ) { + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + z = aSig>>( - shiftCount ); + if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the single-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float64_to_float32( float64 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp; + uint64_t aSig; + uint32_t zSig; + a = float64_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + return packFloat32( aSign, 0xFF, 0 ); + } + shift64RightJamming( aSig, 22, &aSig ); + zSig = aSig; + if ( aExp || zSig ) { + zSig |= 0x40000000; + aExp -= 0x381; + } + return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); + +} + + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| half-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ +static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig) +{ + return make_float16( + (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); +} + +/* Half precision floats come in two formats: standard IEEE and "ARM" format. + The latter gains extra exponent range by omitting the NaN/Inf encodings. */ + +float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM) +{ + flag aSign; + int_fast16_t aExp; + uint32_t aSig; + + aSign = extractFloat16Sign(a); + aExp = extractFloat16Exp(a); + aSig = extractFloat16Frac(a); + + if (aExp == 0x1f && ieee) { + if (aSig) { + return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR); + } + return packFloat32(aSign, 0xff, aSig << 13); + } + if (aExp == 0) { + int8 shiftCount; + + if (aSig == 0) { + return packFloat32(aSign, 0, 0); + } + + shiftCount = countLeadingZeros32( aSig ) - 21; + aSig = aSig << shiftCount; + aExp = -shiftCount; + } + return packFloat32( aSign, aExp + 0x70, aSig << 13); +} + +float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM) +{ + flag aSign; + int_fast16_t aExp; + uint32_t aSig; + uint32_t mask; + uint32_t increment; + int8 roundingMode; + a = float32_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if (aSig) { + /* Input is a NaN */ + float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + if (!ieee) { + return packFloat16(aSign, 0, 0); + } + return r; + } + /* Infinity */ + if (!ieee) { + float_raise(float_flag_invalid STATUS_VAR); + return packFloat16(aSign, 0x1f, 0x3ff); + } + return packFloat16(aSign, 0x1f, 0); + } + if (aExp == 0 && aSig == 0) { + return packFloat16(aSign, 0, 0); + } + /* Decimal point between bits 22 and 23. */ + aSig |= 0x00800000; + aExp -= 0x7f; + if (aExp < -14) { + mask = 0x00ffffff; + if (aExp >= -24) { + mask >>= 25 + aExp; + } + } else { + mask = 0x00001fff; + } + if (aSig & mask) { + float_raise( float_flag_underflow STATUS_VAR ); + roundingMode = STATUS(float_rounding_mode); + switch (roundingMode) { + case float_round_nearest_even: + increment = (mask + 1) >> 1; + if ((aSig & mask) == increment) { + increment = aSig & (increment << 1); + } + break; + case float_round_up: + increment = aSign ? 0 : mask; + break; + case float_round_down: + increment = aSign ? mask : 0; + break; + default: /* round_to_zero */ + increment = 0; + break; + } + aSig += increment; + if (aSig >= 0x01000000) { + aSig >>= 1; + aExp++; + } + } else if (aExp < -14 + && STATUS(float_detect_tininess) == float_tininess_before_rounding) { + float_raise( float_flag_underflow STATUS_VAR); + } + + if (ieee) { + if (aExp > 15) { + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + return packFloat16(aSign, 0x1f, 0); + } + } else { + if (aExp > 16) { + float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR); + return packFloat16(aSign, 0x1f, 0x3ff); + } + } + if (aExp < -24) { + return packFloat16(aSign, 0, 0); + } + if (aExp < -14) { + aSig >>= -14 - aExp; + aExp = -14; + } + return packFloat16(aSign, aExp + 14, aSig >> 13); +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the extended double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 float64_to_floatx80( float64 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp; + uint64_t aSig; + + a = float64_squash_input_denormal(a STATUS_VAR); + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + return + packFloatx80( + aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the quadruple-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float64_to_float128( float64 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp; + uint64_t aSig, zSig0, zSig1; + + a = float64_squash_input_denormal(a STATUS_VAR); + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + return packFloat128( aSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + --aExp; + } + shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); + return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Rounds the double-precision floating-point value `a' to an integer, and +| returns the result as a double-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_round_to_int( float64 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp; + uint64_t lastBitMask, roundBitsMask; + int8 roundingMode; + uint64_t z; + a = float64_squash_input_denormal(a STATUS_VAR); + + aExp = extractFloat64Exp( a ); + if ( 0x433 <= aExp ) { + if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { + return propagateFloat64NaN( a, a STATUS_VAR ); + } + return a; + } + if ( aExp < 0x3FF ) { + if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; + STATUS(float_exception_flags) |= float_flag_inexact; + aSign = extractFloat64Sign( a ); + switch ( STATUS(float_rounding_mode) ) { + case float_round_nearest_even: + if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { + return packFloat64( aSign, 0x3FF, 0 ); + } + break; + case float_round_down: + return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); + case float_round_up: + return make_float64( + aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); + } + return packFloat64( aSign, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x433 - aExp; + roundBitsMask = lastBitMask - 1; + z = float64_val(a); + roundingMode = STATUS(float_rounding_mode); + if ( roundingMode == float_round_nearest_even ) { + z += lastBitMask>>1; + if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) { + z += roundBitsMask; + } + } + z &= ~ roundBitsMask; + if ( z != float64_val(a) ) + STATUS(float_exception_flags) |= float_flag_inexact; + return make_float64(z); + +} + +float64 float64_trunc_to_int( float64 a STATUS_PARAM) +{ + int oldmode; + float64 res; + oldmode = STATUS(float_rounding_mode); + STATUS(float_rounding_mode) = float_round_to_zero; + res = float64_round_to_int(a STATUS_VAR); + STATUS(float_rounding_mode) = oldmode; + return res; +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the double-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) +{ + int_fast16_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig; + int_fast16_t expDiff; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + expDiff = aExp - bExp; + aSig <<= 9; + bSig <<= 9; + if ( 0 < expDiff ) { + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= LIT64( 0x2000000000000000 ); + } + shift64RightJamming( bSig, expDiff, &bSig ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= LIT64( 0x2000000000000000 ); + } + shift64RightJamming( aSig, - expDiff, &aSig ); + zExp = bExp; + } + else { + if ( aExp == 0x7FF ) { + if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return a; + } + if ( aExp == 0 ) { + if (STATUS(flush_to_zero)) { + if (aSig | bSig) { + float_raise(float_flag_output_denormal STATUS_VAR); + } + return packFloat64(zSign, 0, 0); + } + return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); + } + zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; + zExp = aExp; + goto roundAndPack; + } + aSig |= LIT64( 0x2000000000000000 ); + zSig = ( aSig + bSig )<<1; + --zExp; + if ( (int64_t) zSig < 0 ) { + zSig = aSig + bSig; + ++zExp; + } + roundAndPack: + return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the double- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) +{ + int_fast16_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig; + int_fast16_t expDiff; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + expDiff = aExp - bExp; + aSig <<= 10; + bSig <<= 10; + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0x7FF ) { + if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + if ( bSig < aSig ) goto aBigger; + if ( aSig < bSig ) goto bBigger; + return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); + bExpBigger: + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return packFloat64( zSign ^ 1, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= LIT64( 0x4000000000000000 ); + } + shift64RightJamming( aSig, - expDiff, &aSig ); + bSig |= LIT64( 0x4000000000000000 ); + bBigger: + zSig = bSig - aSig; + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= LIT64( 0x4000000000000000 ); + } + shift64RightJamming( bSig, expDiff, &bSig ); + aSig |= LIT64( 0x4000000000000000 ); + aBigger: + zSig = aSig - bSig; + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the double-precision floating-point values `a' +| and `b'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_add( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign == bSign ) { + return addFloat64Sigs( a, b, aSign STATUS_VAR ); + } + else { + return subFloat64Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the double-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_sub( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign == bSign ) { + return subFloat64Sigs( a, b, aSign STATUS_VAR ); + } + else { + return addFloat64Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the double-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_mul( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int_fast16_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig0, zSig1; + + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + bSign = extractFloat64Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FF ) { + if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { + return propagateFloat64NaN( a, b STATUS_VAR ); + } + if ( ( bExp | bSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); + normalizeFloat64Subnormal( bSig, &bExp, &bSig ); + } + zExp = aExp + bExp - 0x3FF; + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; + mul64To128( aSig, bSig, &zSig0, &zSig1 ); + zSig0 |= ( zSig1 != 0 ); + if ( 0 <= (int64_t) ( zSig0<<1 ) ) { + zSig0 <<= 1; + --zExp; + } + return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the double-precision floating-point value `a' +| by the corresponding value `b'. The operation is performed according to +| the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_div( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int_fast16_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig; + uint64_t rem0, rem1; + uint64_t term0, term1; + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + bSign = extractFloat64Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return packFloat64( zSign, 0, 0 ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + float_raise( float_flag_divbyzero STATUS_VAR); + return packFloat64( zSign, 0x7FF, 0 ); + } + normalizeFloat64Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + zExp = aExp - bExp + 0x3FD; + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; + if ( bSig <= ( aSig + aSig ) ) { + aSig >>= 1; + ++zExp; + } + zSig = estimateDiv128To64( aSig, 0, bSig ); + if ( ( zSig & 0x1FF ) <= 2 ) { + mul64To128( bSig, zSig, &term0, &term1 ); + sub128( aSig, 0, term0, term1, &rem0, &rem1 ); + while ( (int64_t) rem0 < 0 ) { + --zSig; + add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); + } + zSig |= ( rem1 != 0 ); + } + return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the double-precision floating-point value `a' +| with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_rem( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, zSign; + int_fast16_t aExp, bExp, expDiff; + uint64_t aSig, bSig; + uint64_t q, alternateASig; + int64_t sigMean; + + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + if ( aExp == 0x7FF ) { + if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { + return propagateFloat64NaN( a, b STATUS_VAR ); + } + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + normalizeFloat64Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return a; + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + expDiff = aExp - bExp; + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; + if ( expDiff < 0 ) { + if ( expDiff < -1 ) return a; + aSig >>= 1; + } + q = ( bSig <= aSig ); + if ( q ) aSig -= bSig; + expDiff -= 64; + while ( 0 < expDiff ) { + q = estimateDiv128To64( aSig, 0, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + aSig = - ( ( bSig>>2 ) * q ); + expDiff -= 62; + } + expDiff += 64; + if ( 0 < expDiff ) { + q = estimateDiv128To64( aSig, 0, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + q >>= 64 - expDiff; + bSig >>= 2; + aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; + } + else { + aSig >>= 2; + bSig >>= 2; + } + do { + alternateASig = aSig; + ++q; + aSig -= bSig; + } while ( 0 <= (int64_t) aSig ); + sigMean = aSig + alternateASig; + if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { + aSig = alternateASig; + } + zSign = ( (int64_t) aSig < 0 ); + if ( zSign ) aSig = - aSig; + return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the double-precision floating-point values +| `a' and `b' then adding 'c', with no intermediate rounding step after the +| multiplication. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic 754-2008. +| The flags argument allows the caller to select negation of the +| addend, the intermediate product, or the final result. (The difference +| between this and having the caller do a separate negation is that negating +| externally will flip the sign bit on NaNs.) +*----------------------------------------------------------------------------*/ + +float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM) +{ + flag aSign, bSign, cSign, zSign; + int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff; + uint64_t aSig, bSig, cSig; + flag pInf, pZero, pSign; + uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1; + int shiftcount; + flag signflip, infzero; + + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + c = float64_squash_input_denormal(c STATUS_VAR); + aSig = extractFloat64Frac(a); + aExp = extractFloat64Exp(a); + aSign = extractFloat64Sign(a); + bSig = extractFloat64Frac(b); + bExp = extractFloat64Exp(b); + bSign = extractFloat64Sign(b); + cSig = extractFloat64Frac(c); + cExp = extractFloat64Exp(c); + cSign = extractFloat64Sign(c); + + infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) || + (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0)); + + /* It is implementation-defined whether the cases of (0,inf,qnan) + * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN + * they return if they do), so we have to hand this information + * off to the target-specific pick-a-NaN routine. + */ + if (((aExp == 0x7ff) && aSig) || + ((bExp == 0x7ff) && bSig) || + ((cExp == 0x7ff) && cSig)) { + return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR); + } + + if (infzero) { + float_raise(float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + + if (flags & float_muladd_negate_c) { + cSign ^= 1; + } + + signflip = (flags & float_muladd_negate_result) ? 1 : 0; + + /* Work out the sign and type of the product */ + pSign = aSign ^ bSign; + if (flags & float_muladd_negate_product) { + pSign ^= 1; + } + pInf = (aExp == 0x7ff) || (bExp == 0x7ff); + pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); + + if (cExp == 0x7ff) { + if (pInf && (pSign ^ cSign)) { + /* addition of opposite-signed infinities => InvalidOperation */ + float_raise(float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + /* Otherwise generate an infinity of the same sign */ + return packFloat64(cSign ^ signflip, 0x7ff, 0); + } + + if (pInf) { + return packFloat64(pSign ^ signflip, 0x7ff, 0); + } + + if (pZero) { + if (cExp == 0) { + if (cSig == 0) { + /* Adding two exact zeroes */ + if (pSign == cSign) { + zSign = pSign; + } else if (STATUS(float_rounding_mode) == float_round_down) { + zSign = 1; + } else { + zSign = 0; + } + return packFloat64(zSign ^ signflip, 0, 0); + } + /* Exact zero plus a denorm */ + if (STATUS(flush_to_zero)) { + float_raise(float_flag_output_denormal STATUS_VAR); + return packFloat64(cSign ^ signflip, 0, 0); + } + } + /* Zero plus something non-zero : just return the something */ + return make_float64(float64_val(c) ^ ((uint64_t)signflip << 63)); + } + + if (aExp == 0) { + normalizeFloat64Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + normalizeFloat64Subnormal(bSig, &bExp, &bSig); + } + + /* Calculate the actual result a * b + c */ + + /* Multiply first; this is easy. */ + /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff + * because we want the true exponent, not the "one-less-than" + * flavour that roundAndPackFloat64() takes. + */ + pExp = aExp + bExp - 0x3fe; + aSig = (aSig | LIT64(0x0010000000000000))<<10; + bSig = (bSig | LIT64(0x0010000000000000))<<11; + mul64To128(aSig, bSig, &pSig0, &pSig1); + if ((int64_t)(pSig0 << 1) >= 0) { + shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1); + pExp--; + } + + zSign = pSign ^ signflip; + + /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit + * bit in position 126. + */ + if (cExp == 0) { + if (!cSig) { + /* Throw out the special case of c being an exact zero now */ + shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1); + return roundAndPackFloat64(zSign, pExp - 1, + pSig1 STATUS_VAR); + } + normalizeFloat64Subnormal(cSig, &cExp, &cSig); + } + + /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the + * significand of the addend, with the explicit bit in position 126. + */ + cSig0 = cSig << (126 - 64 - 52); + cSig1 = 0; + cSig0 |= LIT64(0x4000000000000000); + expDiff = pExp - cExp; + + if (pSign == cSign) { + /* Addition */ + if (expDiff > 0) { + /* scale c to match p */ + shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); + zExp = pExp; + } else if (expDiff < 0) { + /* scale p to match c */ + shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); + zExp = cExp; + } else { + /* no scaling needed */ + zExp = cExp; + } + /* Add significands and make sure explicit bit ends up in posn 126 */ + add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); + if ((int64_t)zSig0 < 0) { + shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1); + } else { + zExp--; + } + shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1); + return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR); + } else { + /* Subtraction */ + if (expDiff > 0) { + shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); + sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); + zExp = pExp; + } else if (expDiff < 0) { + shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); + sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); + zExp = cExp; + zSign ^= 1; + } else { + zExp = pExp; + if (lt128(cSig0, cSig1, pSig0, pSig1)) { + sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); + } else if (lt128(pSig0, pSig1, cSig0, cSig1)) { + sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); + zSign ^= 1; + } else { + /* Exact zero */ + zSign = signflip; + if (STATUS(float_rounding_mode) == float_round_down) { + zSign ^= 1; + } + return packFloat64(zSign, 0, 0); + } + } + --zExp; + /* Do the equivalent of normalizeRoundAndPackFloat64() but + * starting with the significand in a pair of uint64_t. + */ + if (zSig0) { + shiftcount = countLeadingZeros64(zSig0) - 1; + shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1); + if (zSig1) { + zSig0 |= 1; + } + zExp -= shiftcount; + } else { + shiftcount = countLeadingZeros64(zSig1) - 1; + zSig0 = zSig1 << shiftcount; + zExp -= (shiftcount + 64); + } + return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR); + } +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the double-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_sqrt( float64 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp, zExp; + uint64_t aSig, zSig, doubleZSig; + uint64_t rem0, rem1, term0, term1; + a = float64_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR ); + if ( ! aSign ) return a; + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + if ( aSign ) { + if ( ( aExp | aSig ) == 0 ) return a; + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return float64_zero; + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; + aSig |= LIT64( 0x0010000000000000 ); + zSig = estimateSqrt32( aExp, aSig>>21 ); + aSig <<= 9 - ( aExp & 1 ); + zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); + if ( ( zSig & 0x1FF ) <= 5 ) { + doubleZSig = zSig<<1; + mul64To128( zSig, zSig, &term0, &term1 ); + sub128( aSig, 0, term0, term1, &rem0, &rem1 ); + while ( (int64_t) rem0 < 0 ) { + --zSig; + doubleZSig -= 2; + add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); + } + zSig |= ( ( rem0 | rem1 ) != 0 ); + } + return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the binary log of the double-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ +float64 float64_log2( float64 a STATUS_PARAM ) +{ + flag aSign, zSign; + int_fast16_t aExp; + uint64_t aSig, aSig0, aSig1, zSig, i; + a = float64_squash_input_denormal(a STATUS_VAR); + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + if ( aSign ) { + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR ); + return a; + } + + aExp -= 0x3FF; + aSig |= LIT64( 0x0010000000000000 ); + zSign = aExp < 0; + zSig = (uint64_t)aExp << 52; + for (i = 1LL << 51; i > 0; i >>= 1) { + mul64To128( aSig, aSig, &aSig0, &aSig1 ); + aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); + if ( aSig & LIT64( 0x0020000000000000 ) ) { + aSig >>= 1; + zSig |= i; + } + } + + if ( zSign ) + zSig = -zSig; + return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR ); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is equal to the +| corresponding value `b', and 0 otherwise. The invalid exception is raised +| if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_eq( float64 a, float64 b STATUS_PARAM ) +{ + uint64_t av, bv; + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + av = float64_val(a); + bv = float64_val(b); + return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than or +| equal to the corresponding value `b', and 0 otherwise. The invalid +| exception is raised if either operand is a NaN. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_le( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + uint64_t av, bv; + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + av = float64_val(a); + bv = float64_val(b); + if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); + return ( av == bv ) || ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. The comparison is performed according +| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_lt( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + uint64_t av, bv; + + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + av = float64_val(a); + bv = float64_val(b); + if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); + return ( av != bv ) && ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point values `a' and `b' cannot +| be compared, and 0 otherwise. The invalid exception is raised if either +| operand is a NaN. The comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_unordered( float64 a, float64 b STATUS_PARAM ) +{ + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 1; + } + return 0; +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is equal to the +| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception.The comparison is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_eq_quiet( float64 a, float64 b STATUS_PARAM ) +{ + uint64_t av, bv; + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + av = float64_val(a); + bv = float64_val(b); + return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than or +| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_le_quiet( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + uint64_t av, bv; + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + av = float64_val(a); + bv = float64_val(b); + if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); + return ( av == bv ) || ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. Otherwise, the comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_lt_quiet( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + uint64_t av, bv; + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + av = float64_val(a); + bv = float64_val(b); + if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); + return ( av != bv ) && ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point values `a' and `b' cannot +| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The +| comparison is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM ) +{ + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 1; + } + return 0; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 32-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic---which means in particular that the conversion +| is rounded according to the current rounding mode. If `a' is a NaN, the +| largest positive integer is returned. Otherwise, if the conversion +| overflows, the largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 floatx80_to_int32( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + uint64_t aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; + shiftCount = 0x4037 - aExp; + if ( shiftCount <= 0 ) shiftCount = 1; + shift64RightJamming( aSig, shiftCount, &aSig ); + return roundAndPackInt32( aSign, aSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 32-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic, except that the conversion is always rounded +| toward zero. If `a' is a NaN, the largest positive integer is returned. +| Otherwise, if the conversion overflows, the largest integer with the same +| sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + uint64_t aSig, savedASig; + int32_t z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( 0x401E < aExp ) { + if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; + goto invalid; + } + else if ( aExp < 0x3FFF ) { + if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + shiftCount = 0x403E - aExp; + savedASig = aSig; + aSig >>= shiftCount; + z = aSig; + if ( aSign ) z = - z; + if ( ( z < 0 ) ^ aSign ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; + } + if ( ( aSig<<shiftCount ) != savedASig ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 64-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic---which means in particular that the conversion +| is rounded according to the current rounding mode. If `a' is a NaN, +| the largest positive integer is returned. Otherwise, if the conversion +| overflows, the largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int64 floatx80_to_int64( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + uint64_t aSig, aSigExtra; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + shiftCount = 0x403E - aExp; + if ( shiftCount <= 0 ) { + if ( shiftCount ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign + || ( ( aExp == 0x7FFF ) + && ( aSig != LIT64( 0x8000000000000000 ) ) ) + ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + return (int64_t) LIT64( 0x8000000000000000 ); + } + aSigExtra = 0; + } + else { + shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); + } + return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 64-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic, except that the conversion is always rounded +| toward zero. If `a' is a NaN, the largest positive integer is returned. +| Otherwise, if the conversion overflows, the largest integer with the same +| sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + uint64_t aSig; + int64 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + shiftCount = aExp - 0x403E; + if ( 0 <= shiftCount ) { + aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); + if ( ( a.high != 0xC03E ) || aSig ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + } + return (int64_t) LIT64( 0x8000000000000000 ); + } + else if ( aExp < 0x3FFF ) { + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + z = aSig>>( - shiftCount ); + if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the single-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 floatx80_to_float32( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + uint64_t aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( aExp == 0x7FFF ) { + if ( (uint64_t) ( aSig<<1 ) ) { + return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + } + return packFloat32( aSign, 0xFF, 0 ); + } + shift64RightJamming( aSig, 33, &aSig ); + if ( aExp || aSig ) aExp -= 0x3F81; + return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the double-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 floatx80_to_float64( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + uint64_t aSig, zSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( aExp == 0x7FFF ) { + if ( (uint64_t) ( aSig<<1 ) ) { + return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + } + return packFloat64( aSign, 0x7FF, 0 ); + } + shift64RightJamming( aSig, 1, &zSig ); + if ( aExp || aSig ) aExp -= 0x3C01; + return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the quadruple-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 floatx80_to_float128( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int_fast16_t aExp; + uint64_t aSig, zSig0, zSig1; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { + return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + } + shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); + return packFloat128( aSign, aExp, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Rounds the extended double-precision floating-point value `a' to an integer, +| and returns the result as an extended quadruple-precision floating-point +| value. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + uint64_t lastBitMask, roundBitsMask; + int8 roundingMode; + floatx80 z; + + aExp = extractFloatx80Exp( a ); + if ( 0x403E <= aExp ) { + if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { + return propagateFloatx80NaN( a, a STATUS_VAR ); + } + return a; + } + if ( aExp < 0x3FFF ) { + if ( ( aExp == 0 ) + && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { + return a; + } + STATUS(float_exception_flags) |= float_flag_inexact; + aSign = extractFloatx80Sign( a ); + switch ( STATUS(float_rounding_mode) ) { + case float_round_nearest_even: + if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) + ) { + return + packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); + } + break; + case float_round_down: + return + aSign ? + packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) + : packFloatx80( 0, 0, 0 ); + case float_round_up: + return + aSign ? packFloatx80( 1, 0, 0 ) + : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); + } + return packFloatx80( aSign, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x403E - aExp; + roundBitsMask = lastBitMask - 1; + z = a; + roundingMode = STATUS(float_rounding_mode); + if ( roundingMode == float_round_nearest_even ) { + z.low += lastBitMask>>1; + if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { + z.low += roundBitsMask; + } + } + z.low &= ~ roundBitsMask; + if ( z.low == 0 ) { + ++z.high; + z.low = LIT64( 0x8000000000000000 ); + } + if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the extended double- +| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is +| negated before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM) +{ + int32 aExp, bExp, zExp; + uint64_t aSig, bSig, zSig0, zSig1; + int32 expDiff; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + expDiff = aExp - bExp; + if ( 0 < expDiff ) { + if ( aExp == 0x7FFF ) { + if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) --expDiff; + shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0x7FFF ) { + if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) ++expDiff; + shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); + zExp = bExp; + } + else { + if ( aExp == 0x7FFF ) { + if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { + return propagateFloatx80NaN( a, b STATUS_VAR ); + } + return a; + } + zSig1 = 0; + zSig0 = aSig + bSig; + if ( aExp == 0 ) { + normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); + goto roundAndPack; + } + zExp = aExp; + goto shiftRight1; + } + zSig0 = aSig + bSig; + if ( (int64_t) zSig0 < 0 ) goto roundAndPack; + shiftRight1: + shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); + zSig0 |= LIT64( 0x8000000000000000 ); + ++zExp; + roundAndPack: + return + roundAndPackFloatx80( + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the extended +| double-precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM ) +{ + int32 aExp, bExp, zExp; + uint64_t aSig, bSig, zSig0, zSig1; + int32 expDiff; + floatx80 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + expDiff = aExp - bExp; + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0x7FFF ) { + if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { + return propagateFloatx80NaN( a, b STATUS_VAR ); + } + float_raise( float_flag_invalid STATUS_VAR); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + zSig1 = 0; + if ( bSig < aSig ) goto aBigger; + if ( aSig < bSig ) goto bBigger; + return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); + bExpBigger: + if ( bExp == 0x7FFF ) { + if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) ++expDiff; + shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); + bBigger: + sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0x7FFF ) { + if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) --expDiff; + shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); + aBigger: + sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); + zExp = aExp; + normalizeRoundAndPack: + return + normalizeRoundAndPackFloatx80( + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the extended double-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign == bSign ) { + return addFloatx80Sigs( a, b, aSign STATUS_VAR ); + } + else { + return subFloatx80Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the extended double-precision floating- +| point values `a' and `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign == bSign ) { + return subFloatx80Sigs( a, b, aSign STATUS_VAR ); + } + else { + return addFloatx80Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the extended double-precision floating- +| point values `a' and `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + uint64_t aSig, bSig, zSig0, zSig1; + floatx80 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + bSign = extractFloatx80Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( (uint64_t) ( aSig<<1 ) + || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { + return propagateFloatx80NaN( a, b STATUS_VAR ); + } + if ( ( bExp | bSig ) == 0 ) goto invalid; + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( bExp == 0x7FFF ) { + if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + if ( ( aExp | aSig ) == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); + normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); + normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); + } + zExp = aExp + bExp - 0x3FFE; + mul64To128( aSig, bSig, &zSig0, &zSig1 ); + if ( 0 < (int64_t) zSig0 ) { + shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); + --zExp; + } + return + roundAndPackFloatx80( + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the extended double-precision floating-point +| value `a' by the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + uint64_t aSig, bSig, zSig0, zSig1; + uint64_t rem0, rem1, rem2, term0, term1, term2; + floatx80 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + bSign = extractFloatx80Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + if ( bExp == 0x7FFF ) { + if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + goto invalid; + } + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( bExp == 0x7FFF ) { + if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return packFloatx80( zSign, 0, 0 ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + if ( ( aExp | aSig ) == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + float_raise( float_flag_divbyzero STATUS_VAR); + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); + normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); + } + zExp = aExp - bExp + 0x3FFE; + rem1 = 0; + if ( bSig <= aSig ) { + shift128Right( aSig, 0, 1, &aSig, &rem1 ); + ++zExp; + } + zSig0 = estimateDiv128To64( aSig, rem1, bSig ); + mul64To128( bSig, zSig0, &term0, &term1 ); + sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); + while ( (int64_t) rem0 < 0 ) { + --zSig0; + add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); + } + zSig1 = estimateDiv128To64( rem1, 0, bSig ); + if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { + mul64To128( bSig, zSig1, &term1, &term2 ); + sub128( rem1, 0, term1, term2, &rem1, &rem2 ); + while ( (int64_t) rem1 < 0 ) { + --zSig1; + add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); + } + zSig1 |= ( ( rem1 | rem2 ) != 0 ); + } + return + roundAndPackFloatx80( + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the extended double-precision floating-point value +| `a' with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, zSign; + int32 aExp, bExp, expDiff; + uint64_t aSig0, aSig1, bSig; + uint64_t q, term0, term1, alternateASig0, alternateASig1; + floatx80 z; + + aSig0 = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + if ( aExp == 0x7FFF ) { + if ( (uint64_t) ( aSig0<<1 ) + || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { + return propagateFloatx80NaN( a, b STATUS_VAR ); + } + goto invalid; + } + if ( bExp == 0x7FFF ) { + if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; + normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); + } + bSig |= LIT64( 0x8000000000000000 ); + zSign = aSign; + expDiff = aExp - bExp; + aSig1 = 0; + if ( expDiff < 0 ) { + if ( expDiff < -1 ) return a; + shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); + expDiff = 0; + } + q = ( bSig <= aSig0 ); + if ( q ) aSig0 -= bSig; + expDiff -= 64; + while ( 0 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + mul64To128( bSig, q, &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); + shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); + expDiff -= 62; + } + expDiff += 64; + if ( 0 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + q >>= 64 - expDiff; + mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); + shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); + while ( le128( term0, term1, aSig0, aSig1 ) ) { + ++q; + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); + } + } + else { + term1 = 0; + term0 = bSig; + } + sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); + if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) + || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) + && ( q & 1 ) ) + ) { + aSig0 = alternateASig0; + aSig1 = alternateASig1; + zSign = ! zSign; + } + return + normalizeRoundAndPackFloatx80( + 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the extended double-precision floating-point +| value `a'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, zExp; + uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; + uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; + floatx80 z; + + aSig0 = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( aExp == 0x7FFF ) { + if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR ); + if ( ! aSign ) return a; + goto invalid; + } + if ( aSign ) { + if ( ( aExp | aSig0 ) == 0 ) return a; + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + if ( aExp == 0 ) { + if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); + normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); + } + zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; + zSig0 = estimateSqrt32( aExp, aSig0>>32 ); + shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); + zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); + doubleZSig0 = zSig0<<1; + mul64To128( zSig0, zSig0, &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); + while ( (int64_t) rem0 < 0 ) { + --zSig0; + doubleZSig0 -= 2; + add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); + } + zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); + if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { + if ( zSig1 == 0 ) zSig1 = 1; + mul64To128( doubleZSig0, zSig1, &term1, &term2 ); + sub128( rem1, 0, term1, term2, &rem1, &rem2 ); + mul64To128( zSig1, zSig1, &term2, &term3 ); + sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); + while ( (int64_t) rem1 < 0 ) { + --zSig1; + shortShift128Left( 0, zSig1, 1, &term2, &term3 ); + term3 |= 1; + term2 |= doubleZSig0; + add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); + } + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); + } + shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); + zSig0 |= doubleZSig0; + return + roundAndPackFloatx80( + STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is equal +| to the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM ) +{ + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is +| less than or equal to the corresponding value `b', and 0 otherwise. The +| invalid exception is raised if either operand is a NaN. The comparison is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is +| less than the corresponding value `b', and 0 otherwise. The invalid +| exception is raised if either operand is a NaN. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point values `a' and `b' +| cannot be compared, and 0 otherwise. The invalid exception is raised if +| either operand is a NaN. The comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ +int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM ) +{ + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 1; + } + return 0; +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is +| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. The comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM ) +{ + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) + ) { + if ( floatx80_is_signaling_nan( a ) + || floatx80_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is less +| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs +| do not cause an exception. Otherwise, the comparison is performed according +| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) + ) { + if ( floatx80_is_signaling_nan( a ) + || floatx80_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is less +| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause +| an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) + ) { + if ( floatx80_is_signaling_nan( a ) + || floatx80_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point values `a' and `b' +| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. +| The comparison is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ +int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM ) +{ + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) + ) { + if ( floatx80_is_signaling_nan( a ) + || floatx80_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 1; + } + return 0; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the 32-bit two's complement integer format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 float128_to_int32( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + uint64_t aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; + if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); + aSig0 |= ( aSig1 != 0 ); + shiftCount = 0x4028 - aExp; + if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); + return roundAndPackInt32( aSign, aSig0 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the 32-bit two's complement integer format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. If +| `a' is a NaN, the largest positive integer is returned. Otherwise, if the +| conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + uint64_t aSig0, aSig1, savedASig; + int32_t z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + aSig0 |= ( aSig1 != 0 ); + if ( 0x401E < aExp ) { + if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; + goto invalid; + } + else if ( aExp < 0x3FFF ) { + if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + aSig0 |= LIT64( 0x0001000000000000 ); + shiftCount = 0x402F - aExp; + savedASig = aSig0; + aSig0 >>= shiftCount; + z = aSig0; + if ( aSign ) z = - z; + if ( ( z < 0 ) ^ aSign ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; + } + if ( ( aSig0<<shiftCount ) != savedASig ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the 64-bit two's complement integer format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int64 float128_to_int64( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + uint64_t aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); + shiftCount = 0x402F - aExp; + if ( shiftCount <= 0 ) { + if ( 0x403E < aExp ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign + || ( ( aExp == 0x7FFF ) + && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) + ) + ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + return (int64_t) LIT64( 0x8000000000000000 ); + } + shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); + } + else { + shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); + } + return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the 64-bit two's complement integer format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + uint64_t aSig0, aSig1; + int64 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); + shiftCount = aExp - 0x402F; + if ( 0 < shiftCount ) { + if ( 0x403E <= aExp ) { + aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); + if ( ( a.high == LIT64( 0xC03E000000000000 ) ) + && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { + if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; + } + else { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + } + return (int64_t) LIT64( 0x8000000000000000 ); + } + z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); + if ( (uint64_t) ( aSig1<<shiftCount ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + } + else { + if ( aExp < 0x3FFF ) { + if ( aExp | aSig0 | aSig1 ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return 0; + } + z = aSig0>>( - shiftCount ); + if ( aSig1 + || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the single-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float128_to_float32( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + uint64_t aSig0, aSig1; + uint32_t zSig; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) { + return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + } + return packFloat32( aSign, 0xFF, 0 ); + } + aSig0 |= ( aSig1 != 0 ); + shift64RightJamming( aSig0, 18, &aSig0 ); + zSig = aSig0; + if ( aExp || zSig ) { + zSig |= 0x40000000; + aExp -= 0x3F81; + } + return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float128_to_float64( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + uint64_t aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) { + return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + } + return packFloat64( aSign, 0x7FF, 0 ); + } + shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); + aSig0 |= ( aSig1 != 0 ); + if ( aExp || aSig0 ) { + aSig0 |= LIT64( 0x4000000000000000 ); + aExp -= 0x3C01; + } + return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the extended double-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 float128_to_floatx80( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + uint64_t aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) { + return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); + } + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + else { + aSig0 |= LIT64( 0x0001000000000000 ); + } + shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); + return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Rounds the quadruple-precision floating-point value `a' to an integer, and +| returns the result as a quadruple-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_round_to_int( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + uint64_t lastBitMask, roundBitsMask; + int8 roundingMode; + float128 z; + + aExp = extractFloat128Exp( a ); + if ( 0x402F <= aExp ) { + if ( 0x406F <= aExp ) { + if ( ( aExp == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) + ) { + return propagateFloat128NaN( a, a STATUS_VAR ); + } + return a; + } + lastBitMask = 1; + lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; + roundBitsMask = lastBitMask - 1; + z = a; + roundingMode = STATUS(float_rounding_mode); + if ( roundingMode == float_round_nearest_even ) { + if ( lastBitMask ) { + add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); + if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; + } + else { + if ( (int64_t) z.low < 0 ) { + ++z.high; + if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; + } + } + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat128Sign( z ) + ^ ( roundingMode == float_round_up ) ) { + add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); + } + } + z.low &= ~ roundBitsMask; + } + else { + if ( aExp < 0x3FFF ) { + if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; + STATUS(float_exception_flags) |= float_flag_inexact; + aSign = extractFloat128Sign( a ); + switch ( STATUS(float_rounding_mode) ) { + case float_round_nearest_even: + if ( ( aExp == 0x3FFE ) + && ( extractFloat128Frac0( a ) + | extractFloat128Frac1( a ) ) + ) { + return packFloat128( aSign, 0x3FFF, 0, 0 ); + } + break; + case float_round_down: + return + aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) + : packFloat128( 0, 0, 0, 0 ); + case float_round_up: + return + aSign ? packFloat128( 1, 0, 0, 0 ) + : packFloat128( 0, 0x3FFF, 0, 0 ); + } + return packFloat128( aSign, 0, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x402F - aExp; + roundBitsMask = lastBitMask - 1; + z.low = 0; + z.high = a.high; + roundingMode = STATUS(float_rounding_mode); + if ( roundingMode == float_round_nearest_even ) { + z.high += lastBitMask>>1; + if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { + z.high &= ~ lastBitMask; + } + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat128Sign( z ) + ^ ( roundingMode == float_round_up ) ) { + z.high |= ( a.low != 0 ); + z.high += roundBitsMask; + } + } + z.high &= ~ roundBitsMask; + } + if ( ( z.low != a.low ) || ( z.high != a.high ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the quadruple-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) +{ + int32 aExp, bExp, zExp; + uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; + int32 expDiff; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + expDiff = aExp - bExp; + if ( 0 < expDiff ) { + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig0 |= LIT64( 0x0001000000000000 ); + } + shift128ExtraRightJamming( + bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig0 |= LIT64( 0x0001000000000000 ); + } + shift128ExtraRightJamming( + aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); + zExp = bExp; + } + else { + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 | bSig0 | bSig1 ) { + return propagateFloat128NaN( a, b STATUS_VAR ); + } + return a; + } + add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + if ( aExp == 0 ) { + if (STATUS(flush_to_zero)) { + if (zSig0 | zSig1) { + float_raise(float_flag_output_denormal STATUS_VAR); + } + return packFloat128(zSign, 0, 0, 0); + } + return packFloat128( zSign, 0, zSig0, zSig1 ); + } + zSig2 = 0; + zSig0 |= LIT64( 0x0002000000000000 ); + zExp = aExp; + goto shiftRight1; + } + aSig0 |= LIT64( 0x0001000000000000 ); + add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + --zExp; + if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; + ++zExp; + shiftRight1: + shift128ExtraRightJamming( + zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); + roundAndPack: + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the quadruple- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) +{ + int32 aExp, bExp, zExp; + uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; + int32 expDiff; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + expDiff = aExp - bExp; + shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); + shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 | bSig0 | bSig1 ) { + return propagateFloat128NaN( a, b STATUS_VAR ); + } + float_raise( float_flag_invalid STATUS_VAR); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + if ( bSig0 < aSig0 ) goto aBigger; + if ( aSig0 < bSig0 ) goto bBigger; + if ( bSig1 < aSig1 ) goto aBigger; + if ( aSig1 < bSig1 ) goto bBigger; + return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 ); + bExpBigger: + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig0 |= LIT64( 0x4000000000000000 ); + } + shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); + bSig0 |= LIT64( 0x4000000000000000 ); + bBigger: + sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig0 |= LIT64( 0x4000000000000000 ); + } + shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); + aSig0 |= LIT64( 0x4000000000000000 ); + aBigger: + sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the quadruple-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_add( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign == bSign ) { + return addFloat128Sigs( a, b, aSign STATUS_VAR ); + } + else { + return subFloat128Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the quadruple-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_sub( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign == bSign ) { + return subFloat128Sigs( a, b, aSign STATUS_VAR ); + } + else { + return addFloat128Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the quadruple-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_mul( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + bSign = extractFloat128Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( ( aSig0 | aSig1 ) + || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { + return propagateFloat128NaN( a, b STATUS_VAR ); + } + if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + if ( ( aExp | aSig0 | aSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + zExp = aExp + bExp - 0x4000; + aSig0 |= LIT64( 0x0001000000000000 ); + shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); + mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); + add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); + zSig2 |= ( zSig3 != 0 ); + if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { + shift128ExtraRightJamming( + zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); + ++zExp; + } + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the quadruple-precision floating-point value +| `a' by the corresponding value `b'. The operation is performed according to +| the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_div( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; + uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + bSign = extractFloat128Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + goto invalid; + } + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return packFloat128( zSign, 0, 0, 0 ); + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) { + if ( ( aExp | aSig0 | aSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + float_raise( float_flag_divbyzero STATUS_VAR); + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + zExp = aExp - bExp + 0x3FFD; + shortShift128Left( + aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); + shortShift128Left( + bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); + if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { + shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); + ++zExp; + } + zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); + mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); + sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); + while ( (int64_t) rem0 < 0 ) { + --zSig0; + add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); + } + zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); + if ( ( zSig1 & 0x3FFF ) <= 4 ) { + mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); + sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); + while ( (int64_t) rem1 < 0 ) { + --zSig1; + add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); + } + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); + } + shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the quadruple-precision floating-point value `a' +| with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_rem( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, zSign; + int32 aExp, bExp, expDiff; + uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; + uint64_t allZero, alternateASig0, alternateASig1, sigMean1; + int64_t sigMean0; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + if ( aExp == 0x7FFF ) { + if ( ( aSig0 | aSig1 ) + || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { + return propagateFloat128NaN( a, b STATUS_VAR ); + } + goto invalid; + } + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return a; + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + expDiff = aExp - bExp; + if ( expDiff < -1 ) return a; + shortShift128Left( + aSig0 | LIT64( 0x0001000000000000 ), + aSig1, + 15 - ( expDiff < 0 ), + &aSig0, + &aSig1 + ); + shortShift128Left( + bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); + q = le128( bSig0, bSig1, aSig0, aSig1 ); + if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); + expDiff -= 64; + while ( 0 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig0 ); + q = ( 4 < q ) ? q - 4 : 0; + mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); + shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); + shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); + sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); + expDiff -= 61; + } + if ( -64 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig0 ); + q = ( 4 < q ) ? q - 4 : 0; + q >>= - expDiff; + shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); + expDiff += 52; + if ( expDiff < 0 ) { + shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); + } + else { + shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); + } + mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); + sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); + } + else { + shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); + shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); + } + do { + alternateASig0 = aSig0; + alternateASig1 = aSig1; + ++q; + sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); + } while ( 0 <= (int64_t) aSig0 ); + add128( + aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); + if ( ( sigMean0 < 0 ) + || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { + aSig0 = alternateASig0; + aSig1 = alternateASig1; + } + zSign = ( (int64_t) aSig0 < 0 ); + if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); + return + normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the quadruple-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_sqrt( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, zExp; + uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; + uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR ); + if ( ! aSign ) return a; + goto invalid; + } + if ( aSign ) { + if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; + aSig0 |= LIT64( 0x0001000000000000 ); + zSig0 = estimateSqrt32( aExp, aSig0>>17 ); + shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); + zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); + doubleZSig0 = zSig0<<1; + mul64To128( zSig0, zSig0, &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); + while ( (int64_t) rem0 < 0 ) { + --zSig0; + doubleZSig0 -= 2; + add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); + } + zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); + if ( ( zSig1 & 0x1FFF ) <= 5 ) { + if ( zSig1 == 0 ) zSig1 = 1; + mul64To128( doubleZSig0, zSig1, &term1, &term2 ); + sub128( rem1, 0, term1, term2, &rem1, &rem2 ); + mul64To128( zSig1, zSig1, &term2, &term3 ); + sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); + while ( (int64_t) rem1 < 0 ) { + --zSig1; + shortShift128Left( 0, zSig1, 1, &term2, &term3 ); + term3 |= 1; + term2 |= doubleZSig0; + add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); + } + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); + } + shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); + return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_eq( float128 a, float128 b STATUS_PARAM ) +{ + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| or equal to the corresponding value `b', and 0 otherwise. The invalid +| exception is raised if either operand is a NaN. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_le( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. The comparison is performed according +| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_lt( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot +| be compared, and 0 otherwise. The invalid exception is raised if either +| operand is a NaN. The comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_unordered( float128 a, float128 b STATUS_PARAM ) +{ + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 1; + } + return 0; +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. The comparison is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_eq_quiet( float128 a, float128 b STATUS_PARAM ) +{ + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + if ( float128_is_signaling_nan( a ) + || float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_le_quiet( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + if ( float128_is_signaling_nan( a ) + || float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. Otherwise, the comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_lt_quiet( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + if ( float128_is_signaling_nan( a ) + || float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot +| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The +| comparison is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM ) +{ + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + if ( float128_is_signaling_nan( a ) + || float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 1; + } + return 0; +} + +/* misc functions */ +float32 uint32_to_float32( uint32 a STATUS_PARAM ) +{ + return int64_to_float32(a STATUS_VAR); +} + +float64 uint32_to_float64( uint32 a STATUS_PARAM ) +{ + return int64_to_float64(a STATUS_VAR); +} + +uint32 float32_to_uint32( float32 a STATUS_PARAM ) +{ + int64_t v; + uint32 res; + + v = float32_to_int64(a STATUS_VAR); + if (v < 0) { + res = 0; + float_raise( float_flag_invalid STATUS_VAR); + } else if (v > 0xffffffff) { + res = 0xffffffff; + float_raise( float_flag_invalid STATUS_VAR); + } else { + res = v; + } + return res; +} + +uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM ) +{ + int64_t v; + uint32 res; + + v = float32_to_int64_round_to_zero(a STATUS_VAR); + if (v < 0) { + res = 0; + float_raise( float_flag_invalid STATUS_VAR); + } else if (v > 0xffffffff) { + res = 0xffffffff; + float_raise( float_flag_invalid STATUS_VAR); + } else { + res = v; + } + return res; +} + +uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM) +{ + int64_t v; + uint_fast16_t res; + + v = float32_to_int64_round_to_zero(a STATUS_VAR); + if (v < 0) { + res = 0; + float_raise( float_flag_invalid STATUS_VAR); + } else if (v > 0xffff) { + res = 0xffff; + float_raise( float_flag_invalid STATUS_VAR); + } else { + res = v; + } + return res; +} + +uint32 float64_to_uint32( float64 a STATUS_PARAM ) +{ + int64_t v; + uint32 res; + + v = float64_to_int64(a STATUS_VAR); + if (v < 0) { + res = 0; + float_raise( float_flag_invalid STATUS_VAR); + } else if (v > 0xffffffff) { + res = 0xffffffff; + float_raise( float_flag_invalid STATUS_VAR); + } else { + res = v; + } + return res; +} + +uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM ) +{ + int64_t v; + uint32 res; + + v = float64_to_int64_round_to_zero(a STATUS_VAR); + if (v < 0) { + res = 0; + float_raise( float_flag_invalid STATUS_VAR); + } else if (v > 0xffffffff) { + res = 0xffffffff; + float_raise( float_flag_invalid STATUS_VAR); + } else { + res = v; + } + return res; +} + +uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM) +{ + int64_t v; + uint_fast16_t res; + + v = float64_to_int64_round_to_zero(a STATUS_VAR); + if (v < 0) { + res = 0; + float_raise( float_flag_invalid STATUS_VAR); + } else if (v > 0xffff) { + res = 0xffff; + float_raise( float_flag_invalid STATUS_VAR); + } else { + res = v; + } + return res; +} + +/* FIXME: This looks broken. */ +uint64_t float64_to_uint64 (float64 a STATUS_PARAM) +{ + int64_t v; + + v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); + v += float64_val(a); + v = float64_to_int64(make_float64(v) STATUS_VAR); + + return v - INT64_MIN; +} + +uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM) +{ + int64_t v; + + v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); + v += float64_val(a); + v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR); + + return v - INT64_MIN; +} + +#define COMPARE(s, nan_exp) \ +INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \ + int is_quiet STATUS_PARAM ) \ +{ \ + flag aSign, bSign; \ + uint ## s ## _t av, bv; \ + a = float ## s ## _squash_input_denormal(a STATUS_VAR); \ + b = float ## s ## _squash_input_denormal(b STATUS_VAR); \ + \ + if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ + extractFloat ## s ## Frac( a ) ) || \ + ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ + extractFloat ## s ## Frac( b ) )) { \ + if (!is_quiet || \ + float ## s ## _is_signaling_nan( a ) || \ + float ## s ## _is_signaling_nan( b ) ) { \ + float_raise( float_flag_invalid STATUS_VAR); \ + } \ + return float_relation_unordered; \ + } \ + aSign = extractFloat ## s ## Sign( a ); \ + bSign = extractFloat ## s ## Sign( b ); \ + av = float ## s ## _val(a); \ + bv = float ## s ## _val(b); \ + if ( aSign != bSign ) { \ + if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ + /* zero case */ \ + return float_relation_equal; \ + } else { \ + return 1 - (2 * aSign); \ + } \ + } else { \ + if (av == bv) { \ + return float_relation_equal; \ + } else { \ + return 1 - 2 * (aSign ^ ( av < bv )); \ + } \ + } \ +} \ + \ +int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \ +{ \ + return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \ +} \ + \ +int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \ +{ \ + return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \ +} + +COMPARE(32, 0xff) +COMPARE(64, 0x7ff) + +INLINE int floatx80_compare_internal( floatx80 a, floatx80 b, + int is_quiet STATUS_PARAM ) +{ + flag aSign, bSign; + + if (( ( extractFloatx80Exp( a ) == 0x7fff ) && + ( extractFloatx80Frac( a )<<1 ) ) || + ( ( extractFloatx80Exp( b ) == 0x7fff ) && + ( extractFloatx80Frac( b )<<1 ) )) { + if (!is_quiet || + floatx80_is_signaling_nan( a ) || + floatx80_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return float_relation_unordered; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + + if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && + ( ( a.low | b.low ) == 0 ) ) { + /* zero case */ + return float_relation_equal; + } else { + return 1 - (2 * aSign); + } + } else { + if (a.low == b.low && a.high == b.high) { + return float_relation_equal; + } else { + return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); + } + } +} + +int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM ) +{ + return floatx80_compare_internal(a, b, 0 STATUS_VAR); +} + +int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM ) +{ + return floatx80_compare_internal(a, b, 1 STATUS_VAR); +} + +INLINE int float128_compare_internal( float128 a, float128 b, + int is_quiet STATUS_PARAM ) +{ + flag aSign, bSign; + + if (( ( extractFloat128Exp( a ) == 0x7fff ) && + ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || + ( ( extractFloat128Exp( b ) == 0x7fff ) && + ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { + if (!is_quiet || + float128_is_signaling_nan( a ) || + float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return float_relation_unordered; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { + /* zero case */ + return float_relation_equal; + } else { + return 1 - (2 * aSign); + } + } else { + if (a.low == b.low && a.high == b.high) { + return float_relation_equal; + } else { + return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); + } + } +} + +int float128_compare( float128 a, float128 b STATUS_PARAM ) +{ + return float128_compare_internal(a, b, 0 STATUS_VAR); +} + +int float128_compare_quiet( float128 a, float128 b STATUS_PARAM ) +{ + return float128_compare_internal(a, b, 1 STATUS_VAR); +} + +/* min() and max() functions. These can't be implemented as + * 'compare and pick one input' because that would mishandle + * NaNs and +0 vs -0. + */ +#define MINMAX(s, nan_exp) \ +INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \ + int ismin STATUS_PARAM ) \ +{ \ + flag aSign, bSign; \ + uint ## s ## _t av, bv; \ + a = float ## s ## _squash_input_denormal(a STATUS_VAR); \ + b = float ## s ## _squash_input_denormal(b STATUS_VAR); \ + if (float ## s ## _is_any_nan(a) || \ + float ## s ## _is_any_nan(b)) { \ + return propagateFloat ## s ## NaN(a, b STATUS_VAR); \ + } \ + aSign = extractFloat ## s ## Sign(a); \ + bSign = extractFloat ## s ## Sign(b); \ + av = float ## s ## _val(a); \ + bv = float ## s ## _val(b); \ + if (aSign != bSign) { \ + if (ismin) { \ + return aSign ? a : b; \ + } else { \ + return aSign ? b : a; \ + } \ + } else { \ + if (ismin) { \ + return (aSign ^ (av < bv)) ? a : b; \ + } else { \ + return (aSign ^ (av < bv)) ? b : a; \ + } \ + } \ +} \ + \ +float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \ +{ \ + return float ## s ## _minmax(a, b, 1 STATUS_VAR); \ +} \ + \ +float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \ +{ \ + return float ## s ## _minmax(a, b, 0 STATUS_VAR); \ +} + +MINMAX(32, 0xff) +MINMAX(64, 0x7ff) + + +/* Multiply A by 2 raised to the power N. */ +float32 float32_scalbn( float32 a, int n STATUS_PARAM ) +{ + flag aSign; + int16_t aExp; + uint32_t aSig; + + a = float32_squash_input_denormal(a STATUS_VAR); + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + + if ( aExp == 0xFF ) { + if ( aSig ) { + return propagateFloat32NaN( a, a STATUS_VAR ); + } + return a; + } + if ( aExp != 0 ) + aSig |= 0x00800000; + else if ( aSig == 0 ) + return a; + + if (n > 0x200) { + n = 0x200; + } else if (n < -0x200) { + n = -0x200; + } + + aExp += n - 1; + aSig <<= 7; + return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); +} + +float64 float64_scalbn( float64 a, int n STATUS_PARAM ) +{ + flag aSign; + int16_t aExp; + uint64_t aSig; + + a = float64_squash_input_denormal(a STATUS_VAR); + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + + if ( aExp == 0x7FF ) { + if ( aSig ) { + return propagateFloat64NaN( a, a STATUS_VAR ); + } + return a; + } + if ( aExp != 0 ) + aSig |= LIT64( 0x0010000000000000 ); + else if ( aSig == 0 ) + return a; + + if (n > 0x1000) { + n = 0x1000; + } else if (n < -0x1000) { + n = -0x1000; + } + + aExp += n - 1; + aSig <<= 10; + return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR ); +} + +floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM ) +{ + flag aSign; + int32_t aExp; + uint64_t aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + + if ( aExp == 0x7FFF ) { + if ( aSig<<1 ) { + return propagateFloatx80NaN( a, a STATUS_VAR ); + } + return a; + } + + if (aExp == 0 && aSig == 0) + return a; + + if (n > 0x10000) { + n = 0x10000; + } else if (n < -0x10000) { + n = -0x10000; + } + + aExp += n; + return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision), + aSign, aExp, aSig, 0 STATUS_VAR ); +} + +float128 float128_scalbn( float128 a, int n STATUS_PARAM ) +{ + flag aSign; + int32_t aExp; + uint64_t aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) { + return propagateFloat128NaN( a, a STATUS_VAR ); + } + return a; + } + if ( aExp != 0 ) + aSig0 |= LIT64( 0x0001000000000000 ); + else if ( aSig0 == 0 && aSig1 == 0 ) + return a; + + if (n > 0x10000) { + n = 0x10000; + } else if (n < -0x10000) { + n = -0x10000; + } + + aExp += n - 1; + return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 + STATUS_VAR ); + +} diff --git a/fpu/softfloat.h b/fpu/softfloat.h new file mode 100644 index 000000000..feec3a180 --- /dev/null +++ b/fpu/softfloat.h @@ -0,0 +1,633 @@ +/* + * QEMU float support + * + * Derived from SoftFloat. + */ + +/*============================================================================ + +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +#ifndef SOFTFLOAT_H +#define SOFTFLOAT_H + +#if defined(CONFIG_SOLARIS) && defined(CONFIG_NEEDS_LIBSUNMATH) +#include <sunmath.h> +#endif + +#include <inttypes.h> +#include "config-host.h" +#include "osdep.h" + +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines the most convenient type that holds +| integers of at least as many bits as specified. For example, `uint8' should +| be the most convenient type that can hold unsigned integers of as many as +| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most +| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed +| to the same as `int'. +*----------------------------------------------------------------------------*/ +typedef uint8_t flag; +typedef uint8_t uint8; +typedef int8_t int8; +typedef unsigned int uint32; +typedef signed int int32; +typedef uint64_t uint64; +typedef int64_t int64; + +#define LIT64( a ) a##LL +#define INLINE static inline + +#define STATUS_PARAM , float_status *status +#define STATUS(field) status->field +#define STATUS_VAR , status + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point ordering relations +*----------------------------------------------------------------------------*/ +enum { + float_relation_less = -1, + float_relation_equal = 0, + float_relation_greater = 1, + float_relation_unordered = 2 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point types. +*----------------------------------------------------------------------------*/ +/* Use structures for soft-float types. This prevents accidentally mixing + them with native int/float types. A sufficiently clever compiler and + sane ABI should be able to see though these structs. However + x86/gcc 3.x seems to struggle a bit, so leave them disabled by default. */ +//#define USE_SOFTFLOAT_STRUCT_TYPES +#ifdef USE_SOFTFLOAT_STRUCT_TYPES +typedef struct { + uint16_t v; +} float16; +#define float16_val(x) (((float16)(x)).v) +#define make_float16(x) __extension__ ({ float16 f16_val = {x}; f16_val; }) +#define const_float16(x) { x } +typedef struct { + uint32_t v; +} float32; +/* The cast ensures an error if the wrong type is passed. */ +#define float32_val(x) (((float32)(x)).v) +#define make_float32(x) __extension__ ({ float32 f32_val = {x}; f32_val; }) +#define const_float32(x) { x } +typedef struct { + uint64_t v; +} float64; +#define float64_val(x) (((float64)(x)).v) +#define make_float64(x) __extension__ ({ float64 f64_val = {x}; f64_val; }) +#define const_float64(x) { x } +#else +typedef uint16_t float16; +typedef uint32_t float32; +typedef uint64_t float64; +#define float16_val(x) (x) +#define float32_val(x) (x) +#define float64_val(x) (x) +#define make_float16(x) (x) +#define make_float32(x) (x) +#define make_float64(x) (x) +#define const_float16(x) (x) +#define const_float32(x) (x) +#define const_float64(x) (x) +#endif +typedef struct { + uint64_t low; + uint16_t high; +} floatx80; +#define make_floatx80(exp, mant) ((floatx80) { mant, exp }) +#define make_floatx80_init(exp, mant) { .low = mant, .high = exp } +typedef struct { +#ifdef HOST_WORDS_BIGENDIAN + uint64_t high, low; +#else + uint64_t low, high; +#endif +} float128; +#define make_float128(high_, low_) ((float128) { .high = high_, .low = low_ }) +#define make_float128_init(high_, low_) { .high = high_, .low = low_ } + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point underflow tininess-detection mode. +*----------------------------------------------------------------------------*/ +enum { + float_tininess_after_rounding = 0, + float_tininess_before_rounding = 1 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point rounding mode. +*----------------------------------------------------------------------------*/ +enum { + float_round_nearest_even = 0, + float_round_down = 1, + float_round_up = 2, + float_round_to_zero = 3 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point exception flags. +*----------------------------------------------------------------------------*/ +enum { + float_flag_invalid = 1, + float_flag_divbyzero = 4, + float_flag_overflow = 8, + float_flag_underflow = 16, + float_flag_inexact = 32, + float_flag_input_denormal = 64, + float_flag_output_denormal = 128 +}; + +typedef struct float_status { + signed char float_detect_tininess; + signed char float_rounding_mode; + signed char float_exception_flags; + signed char floatx80_rounding_precision; + /* should denormalised results go to zero and set the inexact flag? */ + flag flush_to_zero; + /* should denormalised inputs go to zero and set the input_denormal flag? */ + flag flush_inputs_to_zero; + flag default_nan_mode; +} float_status; + +void set_float_rounding_mode(int val STATUS_PARAM); +void set_float_exception_flags(int val STATUS_PARAM); +INLINE void set_float_detect_tininess(int val STATUS_PARAM) +{ + STATUS(float_detect_tininess) = val; +} +INLINE void set_flush_to_zero(flag val STATUS_PARAM) +{ + STATUS(flush_to_zero) = val; +} +INLINE void set_flush_inputs_to_zero(flag val STATUS_PARAM) +{ + STATUS(flush_inputs_to_zero) = val; +} +INLINE void set_default_nan_mode(flag val STATUS_PARAM) +{ + STATUS(default_nan_mode) = val; +} +INLINE int get_float_exception_flags(float_status *status) +{ + return STATUS(float_exception_flags); +} +void set_floatx80_rounding_precision(int val STATUS_PARAM); + +/*---------------------------------------------------------------------------- +| Routine to raise any or all of the software IEC/IEEE floating-point +| exception flags. +*----------------------------------------------------------------------------*/ +void float_raise( int8 flags STATUS_PARAM); + +/*---------------------------------------------------------------------------- +| Options to indicate which negations to perform in float*_muladd() +| Using these differs from negating an input or output before calling +| the muladd function in that this means that a NaN doesn't have its +| sign bit inverted before it is propagated. +*----------------------------------------------------------------------------*/ +enum { + float_muladd_negate_c = 1, + float_muladd_negate_product = 2, + float_muladd_negate_result = 3, +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE integer-to-floating-point conversion routines. +*----------------------------------------------------------------------------*/ +float32 int32_to_float32( int32 STATUS_PARAM ); +float64 int32_to_float64( int32 STATUS_PARAM ); +float32 uint32_to_float32( uint32 STATUS_PARAM ); +float64 uint32_to_float64( uint32 STATUS_PARAM ); +floatx80 int32_to_floatx80( int32 STATUS_PARAM ); +float128 int32_to_float128( int32 STATUS_PARAM ); +float32 int64_to_float32( int64 STATUS_PARAM ); +float32 uint64_to_float32( uint64 STATUS_PARAM ); +float64 int64_to_float64( int64 STATUS_PARAM ); +float64 uint64_to_float64( uint64 STATUS_PARAM ); +floatx80 int64_to_floatx80( int64 STATUS_PARAM ); +float128 int64_to_float128( int64 STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software half-precision conversion routines. +*----------------------------------------------------------------------------*/ +float16 float32_to_float16( float32, flag STATUS_PARAM ); +float32 float16_to_float32( float16, flag STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software half-precision operations. +*----------------------------------------------------------------------------*/ +int float16_is_quiet_nan( float16 ); +int float16_is_signaling_nan( float16 ); +float16 float16_maybe_silence_nan( float16 ); + +/*---------------------------------------------------------------------------- +| The pattern for a default generated half-precision NaN. +*----------------------------------------------------------------------------*/ +extern const float16 float16_default_nan; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision conversion routines. +*----------------------------------------------------------------------------*/ +int_fast16_t float32_to_int16_round_to_zero(float32 STATUS_PARAM); +uint_fast16_t float32_to_uint16_round_to_zero(float32 STATUS_PARAM); +int32 float32_to_int32( float32 STATUS_PARAM ); +int32 float32_to_int32_round_to_zero( float32 STATUS_PARAM ); +uint32 float32_to_uint32( float32 STATUS_PARAM ); +uint32 float32_to_uint32_round_to_zero( float32 STATUS_PARAM ); +int64 float32_to_int64( float32 STATUS_PARAM ); +int64 float32_to_int64_round_to_zero( float32 STATUS_PARAM ); +float64 float32_to_float64( float32 STATUS_PARAM ); +floatx80 float32_to_floatx80( float32 STATUS_PARAM ); +float128 float32_to_float128( float32 STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision operations. +*----------------------------------------------------------------------------*/ +float32 float32_round_to_int( float32 STATUS_PARAM ); +float32 float32_add( float32, float32 STATUS_PARAM ); +float32 float32_sub( float32, float32 STATUS_PARAM ); +float32 float32_mul( float32, float32 STATUS_PARAM ); +float32 float32_div( float32, float32 STATUS_PARAM ); +float32 float32_rem( float32, float32 STATUS_PARAM ); +float32 float32_muladd(float32, float32, float32, int STATUS_PARAM); +float32 float32_sqrt( float32 STATUS_PARAM ); +float32 float32_exp2( float32 STATUS_PARAM ); +float32 float32_log2( float32 STATUS_PARAM ); +int float32_eq( float32, float32 STATUS_PARAM ); +int float32_le( float32, float32 STATUS_PARAM ); +int float32_lt( float32, float32 STATUS_PARAM ); +int float32_unordered( float32, float32 STATUS_PARAM ); +int float32_eq_quiet( float32, float32 STATUS_PARAM ); +int float32_le_quiet( float32, float32 STATUS_PARAM ); +int float32_lt_quiet( float32, float32 STATUS_PARAM ); +int float32_unordered_quiet( float32, float32 STATUS_PARAM ); +int float32_compare( float32, float32 STATUS_PARAM ); +int float32_compare_quiet( float32, float32 STATUS_PARAM ); +float32 float32_min(float32, float32 STATUS_PARAM); +float32 float32_max(float32, float32 STATUS_PARAM); +int float32_is_quiet_nan( float32 ); +int float32_is_signaling_nan( float32 ); +float32 float32_maybe_silence_nan( float32 ); +float32 float32_scalbn( float32, int STATUS_PARAM ); + +INLINE float32 float32_abs(float32 a) +{ + /* Note that abs does *not* handle NaN specially, nor does + * it flush denormal inputs to zero. + */ + return make_float32(float32_val(a) & 0x7fffffff); +} + +INLINE float32 float32_chs(float32 a) +{ + /* Note that chs does *not* handle NaN specially, nor does + * it flush denormal inputs to zero. + */ + return make_float32(float32_val(a) ^ 0x80000000); +} + +INLINE int float32_is_infinity(float32 a) +{ + return (float32_val(a) & 0x7fffffff) == 0x7f800000; +} + +INLINE int float32_is_neg(float32 a) +{ + return float32_val(a) >> 31; +} + +INLINE int float32_is_zero(float32 a) +{ + return (float32_val(a) & 0x7fffffff) == 0; +} + +INLINE int float32_is_any_nan(float32 a) +{ + return ((float32_val(a) & ~(1 << 31)) > 0x7f800000UL); +} + +INLINE int float32_is_zero_or_denormal(float32 a) +{ + return (float32_val(a) & 0x7f800000) == 0; +} + +INLINE float32 float32_set_sign(float32 a, int sign) +{ + return make_float32((float32_val(a) & 0x7fffffff) | (sign << 31)); +} + +#define float32_zero make_float32(0) +#define float32_one make_float32(0x3f800000) +#define float32_ln2 make_float32(0x3f317218) +#define float32_pi make_float32(0x40490fdb) +#define float32_half make_float32(0x3f000000) +#define float32_infinity make_float32(0x7f800000) + + +/*---------------------------------------------------------------------------- +| The pattern for a default generated single-precision NaN. +*----------------------------------------------------------------------------*/ +extern const float32 float32_default_nan; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int_fast16_t float64_to_int16_round_to_zero(float64 STATUS_PARAM); +uint_fast16_t float64_to_uint16_round_to_zero(float64 STATUS_PARAM); +int32 float64_to_int32( float64 STATUS_PARAM ); +int32 float64_to_int32_round_to_zero( float64 STATUS_PARAM ); +uint32 float64_to_uint32( float64 STATUS_PARAM ); +uint32 float64_to_uint32_round_to_zero( float64 STATUS_PARAM ); +int64 float64_to_int64( float64 STATUS_PARAM ); +int64 float64_to_int64_round_to_zero( float64 STATUS_PARAM ); +uint64 float64_to_uint64 (float64 a STATUS_PARAM); +uint64 float64_to_uint64_round_to_zero (float64 a STATUS_PARAM); +float32 float64_to_float32( float64 STATUS_PARAM ); +floatx80 float64_to_floatx80( float64 STATUS_PARAM ); +float128 float64_to_float128( float64 STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision operations. +*----------------------------------------------------------------------------*/ +float64 float64_round_to_int( float64 STATUS_PARAM ); +float64 float64_trunc_to_int( float64 STATUS_PARAM ); +float64 float64_add( float64, float64 STATUS_PARAM ); +float64 float64_sub( float64, float64 STATUS_PARAM ); +float64 float64_mul( float64, float64 STATUS_PARAM ); +float64 float64_div( float64, float64 STATUS_PARAM ); +float64 float64_rem( float64, float64 STATUS_PARAM ); +float64 float64_muladd(float64, float64, float64, int STATUS_PARAM); +float64 float64_sqrt( float64 STATUS_PARAM ); +float64 float64_log2( float64 STATUS_PARAM ); +int float64_eq( float64, float64 STATUS_PARAM ); +int float64_le( float64, float64 STATUS_PARAM ); +int float64_lt( float64, float64 STATUS_PARAM ); +int float64_unordered( float64, float64 STATUS_PARAM ); +int float64_eq_quiet( float64, float64 STATUS_PARAM ); +int float64_le_quiet( float64, float64 STATUS_PARAM ); +int float64_lt_quiet( float64, float64 STATUS_PARAM ); +int float64_unordered_quiet( float64, float64 STATUS_PARAM ); +int float64_compare( float64, float64 STATUS_PARAM ); +int float64_compare_quiet( float64, float64 STATUS_PARAM ); +float64 float64_min(float64, float64 STATUS_PARAM); +float64 float64_max(float64, float64 STATUS_PARAM); +int float64_is_quiet_nan( float64 a ); +int float64_is_signaling_nan( float64 ); +float64 float64_maybe_silence_nan( float64 ); +float64 float64_scalbn( float64, int STATUS_PARAM ); + +INLINE float64 float64_abs(float64 a) +{ + /* Note that abs does *not* handle NaN specially, nor does + * it flush denormal inputs to zero. + */ + return make_float64(float64_val(a) & 0x7fffffffffffffffLL); +} + +INLINE float64 float64_chs(float64 a) +{ + /* Note that chs does *not* handle NaN specially, nor does + * it flush denormal inputs to zero. + */ + return make_float64(float64_val(a) ^ 0x8000000000000000LL); +} + +INLINE int float64_is_infinity(float64 a) +{ + return (float64_val(a) & 0x7fffffffffffffffLL ) == 0x7ff0000000000000LL; +} + +INLINE int float64_is_neg(float64 a) +{ + return float64_val(a) >> 63; +} + +INLINE int float64_is_zero(float64 a) +{ + return (float64_val(a) & 0x7fffffffffffffffLL) == 0; +} + +INLINE int float64_is_any_nan(float64 a) +{ + return ((float64_val(a) & ~(1ULL << 63)) > 0x7ff0000000000000ULL); +} + +INLINE int float64_is_zero_or_denormal(float64 a) +{ + return (float64_val(a) & 0x7ff0000000000000LL) == 0; +} + +INLINE float64 float64_set_sign(float64 a, int sign) +{ + return make_float64((float64_val(a) & 0x7fffffffffffffffULL) + | ((int64_t)sign << 63)); +} + +#define float64_zero make_float64(0) +#define float64_one make_float64(0x3ff0000000000000LL) +#define float64_ln2 make_float64(0x3fe62e42fefa39efLL) +#define float64_pi make_float64(0x400921fb54442d18LL) +#define float64_half make_float64(0x3fe0000000000000LL) +#define float64_infinity make_float64(0x7ff0000000000000LL) + +/*---------------------------------------------------------------------------- +| The pattern for a default generated double-precision NaN. +*----------------------------------------------------------------------------*/ +extern const float64 float64_default_nan; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32 floatx80_to_int32( floatx80 STATUS_PARAM ); +int32 floatx80_to_int32_round_to_zero( floatx80 STATUS_PARAM ); +int64 floatx80_to_int64( floatx80 STATUS_PARAM ); +int64 floatx80_to_int64_round_to_zero( floatx80 STATUS_PARAM ); +float32 floatx80_to_float32( floatx80 STATUS_PARAM ); +float64 floatx80_to_float64( floatx80 STATUS_PARAM ); +float128 floatx80_to_float128( floatx80 STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision operations. +*----------------------------------------------------------------------------*/ +floatx80 floatx80_round_to_int( floatx80 STATUS_PARAM ); +floatx80 floatx80_add( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_sub( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_mul( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_div( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_rem( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_sqrt( floatx80 STATUS_PARAM ); +int floatx80_eq( floatx80, floatx80 STATUS_PARAM ); +int floatx80_le( floatx80, floatx80 STATUS_PARAM ); +int floatx80_lt( floatx80, floatx80 STATUS_PARAM ); +int floatx80_unordered( floatx80, floatx80 STATUS_PARAM ); +int floatx80_eq_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_le_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_unordered_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_compare( floatx80, floatx80 STATUS_PARAM ); +int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_is_quiet_nan( floatx80 ); +int floatx80_is_signaling_nan( floatx80 ); +floatx80 floatx80_maybe_silence_nan( floatx80 ); +floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM ); + +INLINE floatx80 floatx80_abs(floatx80 a) +{ + a.high &= 0x7fff; + return a; +} + +INLINE floatx80 floatx80_chs(floatx80 a) +{ + a.high ^= 0x8000; + return a; +} + +INLINE int floatx80_is_infinity(floatx80 a) +{ + return (a.high & 0x7fff) == 0x7fff && a.low == 0x8000000000000000LL; +} + +INLINE int floatx80_is_neg(floatx80 a) +{ + return a.high >> 15; +} + +INLINE int floatx80_is_zero(floatx80 a) +{ + return (a.high & 0x7fff) == 0 && a.low == 0; +} + +INLINE int floatx80_is_zero_or_denormal(floatx80 a) +{ + return (a.high & 0x7fff) == 0; +} + +INLINE int floatx80_is_any_nan(floatx80 a) +{ + return ((a.high & 0x7fff) == 0x7fff) && (a.low<<1); +} + +#define floatx80_zero make_floatx80(0x0000, 0x0000000000000000LL) +#define floatx80_one make_floatx80(0x3fff, 0x8000000000000000LL) +#define floatx80_ln2 make_floatx80(0x3ffe, 0xb17217f7d1cf79acLL) +#define floatx80_pi make_floatx80(0x4000, 0xc90fdaa22168c235LL) +#define floatx80_half make_floatx80(0x3ffe, 0x8000000000000000LL) +#define floatx80_infinity make_floatx80(0x7fff, 0x8000000000000000LL) + +/*---------------------------------------------------------------------------- +| The pattern for a default generated extended double-precision NaN. +*----------------------------------------------------------------------------*/ +extern const floatx80 floatx80_default_nan; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE quadruple-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32 float128_to_int32( float128 STATUS_PARAM ); +int32 float128_to_int32_round_to_zero( float128 STATUS_PARAM ); +int64 float128_to_int64( float128 STATUS_PARAM ); +int64 float128_to_int64_round_to_zero( float128 STATUS_PARAM ); +float32 float128_to_float32( float128 STATUS_PARAM ); +float64 float128_to_float64( float128 STATUS_PARAM ); +floatx80 float128_to_floatx80( float128 STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE quadruple-precision operations. +*----------------------------------------------------------------------------*/ +float128 float128_round_to_int( float128 STATUS_PARAM ); +float128 float128_add( float128, float128 STATUS_PARAM ); +float128 float128_sub( float128, float128 STATUS_PARAM ); +float128 float128_mul( float128, float128 STATUS_PARAM ); +float128 float128_div( float128, float128 STATUS_PARAM ); +float128 float128_rem( float128, float128 STATUS_PARAM ); +float128 float128_sqrt( float128 STATUS_PARAM ); +int float128_eq( float128, float128 STATUS_PARAM ); +int float128_le( float128, float128 STATUS_PARAM ); +int float128_lt( float128, float128 STATUS_PARAM ); +int float128_unordered( float128, float128 STATUS_PARAM ); +int float128_eq_quiet( float128, float128 STATUS_PARAM ); +int float128_le_quiet( float128, float128 STATUS_PARAM ); +int float128_lt_quiet( float128, float128 STATUS_PARAM ); +int float128_unordered_quiet( float128, float128 STATUS_PARAM ); +int float128_compare( float128, float128 STATUS_PARAM ); +int float128_compare_quiet( float128, float128 STATUS_PARAM ); +int float128_is_quiet_nan( float128 ); +int float128_is_signaling_nan( float128 ); +float128 float128_maybe_silence_nan( float128 ); +float128 float128_scalbn( float128, int STATUS_PARAM ); + +INLINE float128 float128_abs(float128 a) +{ + a.high &= 0x7fffffffffffffffLL; + return a; +} + +INLINE float128 float128_chs(float128 a) +{ + a.high ^= 0x8000000000000000LL; + return a; +} + +INLINE int float128_is_infinity(float128 a) +{ + return (a.high & 0x7fffffffffffffffLL) == 0x7fff000000000000LL && a.low == 0; +} + +INLINE int float128_is_neg(float128 a) +{ + return a.high >> 63; +} + +INLINE int float128_is_zero(float128 a) +{ + return (a.high & 0x7fffffffffffffffLL) == 0 && a.low == 0; +} + +INLINE int float128_is_zero_or_denormal(float128 a) +{ + return (a.high & 0x7fff000000000000LL) == 0; +} + +INLINE int float128_is_any_nan(float128 a) +{ + return ((a.high >> 48) & 0x7fff) == 0x7fff && + ((a.low != 0) || ((a.high & 0xffffffffffffLL) != 0)); +} + +/*---------------------------------------------------------------------------- +| The pattern for a default generated quadruple-precision NaN. +*----------------------------------------------------------------------------*/ +extern const float128 float128_default_nan; + +#endif /* !SOFTFLOAT_H */ |