diff options
author | Lei Zhang <antiagainst@google.com> | 2017-03-20 15:40:54 -0400 |
---|---|---|
committer | Alexander Galazin <Alexander.Galazin@arm.com> | 2017-06-18 10:30:45 -0400 |
commit | 47046481073b94d1b5ae3d5afae0ed40fa3753a3 (patch) | |
tree | 2c0998679bfa6bf7eb6fd240f14d501f427284c1 /framework | |
parent | 55dd4426673bd260dde56addcfea802f21c31304 (diff) | |
download | VK-GL-CTS-47046481073b94d1b5ae3d5afae0ed40fa3753a3.tar.gz VK-GL-CTS-47046481073b94d1b5ae3d5afae0ed40fa3753a3.tar.bz2 VK-GL-CTS-47046481073b94d1b5ae3d5afae0ed40fa3753a3.zip |
Add 32 to 16 bit float conversion w.r.t rounding mode
Component: Framework
Change-Id: I9428f0dc725484b8a2f213157100326a69754cd8
(cherry picked from commit 7c0f5bea1da74f70e549e9a735c9d702c9396084)
Diffstat (limited to 'framework')
-rw-r--r-- | framework/common/tcuApp.cpp | 2 | ||||
-rw-r--r-- | framework/delibs/debase/CMakeLists.txt | 1 | ||||
-rw-r--r-- | framework/delibs/debase/deFloat16.c | 157 | ||||
-rw-r--r-- | framework/delibs/debase/deFloat16.h | 5 | ||||
-rw-r--r-- | framework/delibs/debase/deFloat16Test.c | 335 | ||||
-rw-r--r-- | framework/delibs/debase/deMath.c | 8 | ||||
-rw-r--r-- | framework/delibs/debase/deMath.h | 2 |
7 files changed, 503 insertions, 7 deletions
diff --git a/framework/common/tcuApp.cpp b/framework/common/tcuApp.cpp
index 409a6774f..6227089b8 100644
--- a/framework/common/tcuApp.cpp
+++ b/framework/common/tcuApp.cpp
@@ -89,7 +89,7 @@ App::App (Platform& platform, Archive& archive, TestLog& log, const CommandLine&
 	print("dEQP Core %s (0x%08x) starting..\n", qpGetReleaseName(), qpGetReleaseId());
 	print(" target implementation = '%s'\n", qpGetTargetName());
 
-	if (!deSetRoundingMode(DE_ROUNDINGMODE_TO_NEAREST))
+	if (!deSetRoundingMode(DE_ROUNDINGMODE_TO_NEAREST_EVEN))
 		qpPrintf("WARNING: Failed to set floating-point rounding mode!\n");
 
 	try
diff --git a/framework/delibs/debase/CMakeLists.txt b/framework/delibs/debase/CMakeLists.txt
index 2f9d8d013..636d73ecf 100644
--- a/framework/delibs/debase/CMakeLists.txt
+++ b/framework/delibs/debase/CMakeLists.txt
@@ -9,6 +9,7 @@ set(DEBASE_SRCS
 	deDefs.h
 	deFloat16.c
 	deFloat16.h
+	deFloat16Test.c
 	deInt32.c
 	deInt32.h
 	deInt32Test.c
diff --git a/framework/delibs/debase/deFloat16.c b/framework/delibs/debase/deFloat16.c
index 6460f0587..e8a1057e0 100644
--- a/framework/delibs/debase/deFloat16.c
+++ b/framework/delibs/debase/deFloat16.c
@@ -99,6 +99,163 @@ deFloat16 deFloat32To16 (float val32)
 	}
 }
 
+/*--------------------------------------------------------------------*//*!
+ * \brief Round the given number `val` to nearest even by discarding
+ *        the last `numBitsToDiscard` bits.
+ * \param val value to round
+ * \param numBitsToDiscard number of (least significant) bits to discard, in [1, 31]
+ * \return The rounded value with the last `numBitsToDiscard` removed
+ *//*--------------------------------------------------------------------*/
+static deUint32 roundToNearestEven (deUint32 val, const deUint32 numBitsToDiscard)
+{
+	deUint32	halfBit;	/* Weight of the most significant discarded bit. */
+	deUint32	lastBits;	/* Value of all the discarded bits. */
+
+	/* Validate the shift count before it is used: shifting by 32 or more is undefined. */
+	DE_ASSERT(numBitsToDiscard > 0 && numBitsToDiscard < 32);
+
+	/* Unsigned literals throughout: `1 << 31` would overflow a signed int. */
+	halfBit		= 1u << (numBitsToDiscard - 1u);
+	lastBits	= val & ((halfBit << 1u) - 1u);
+	val			>>= numBitsToDiscard;
+
+	/* Round up when above the halfway point, or exactly halfway with an odd kept part. */
+	if (lastBits > halfBit || (lastBits == halfBit && (val & 0x1u) != 0u))
+		return val + 1u;
+
+	return val;
+}
+
+/*--------------------------------------------------------------------*//*!
+ * \brief Convert 32-bit floating point number to 16 bit using the given rounding mode.
+ * \param val32	Input value.
+ * \param mode	Rounding mode. Only DE_ROUNDINGMODE_TO_ZERO and
+ *				DE_ROUNDINGMODE_TO_NEAREST_EVEN are supported (asserted).
+ * \return Converted 16-bit floating-point value.
+ *
+ * \note 32-bit denormals become signed zero; Inf stays Inf and NaN stays NaN.
+ *//*--------------------------------------------------------------------*/
+deFloat16 deFloat32To16Round (float val32, deRoundingMode mode)
+{
+	union
+	{
+		float		f;		/* Interpret as 32-bit float */
+		deUint32	u;		/* Interpret as 32-bit unsigned integer */
+	} x;
+	deUint32	sign;		/* sign : 0000 0000 0000 0000 X000 0000 0000 0000 */
+	deUint32	exp32;		/* exp32: biased exponent for 32-bit floats */
+	int			exp16;		/* exp16: biased exponent for 16-bit floats */
+	deUint32	mantissa;
+
+	/* We only support these two rounding modes for now */
+	DE_ASSERT(mode == DE_ROUNDINGMODE_TO_ZERO || mode == DE_ROUNDINGMODE_TO_NEAREST_EVEN);
+
+	x.f			= val32;
+	sign		= (x.u >> 16u) & 0x00008000u;
+	exp32		= (x.u >> 23u) & 0x000000ffu;
+	exp16		= (int) (exp32) - 127 + 15;	/* 15/127: exponent bias for 16-bit/32-bit floats */
+	mantissa	= x.u & 0x007fffffu;
+
+	/* Case: zero and denormalized floats */
+	if (exp32 == 0)
+	{
+		/* Denormalized floats are < 2^(1-127), not representable in 16-bit floats, rounding to zero. */
+		return (deFloat16) sign;
+	}
+	/* Case: Inf and NaN */
+	else if (exp32 == 0x000000ffu)
+	{
+		if (mantissa == 0u)
+		{
+			/* Inf */
+			return (deFloat16) (sign | 0x7c00u);
+		}
+		else
+		{
+			/* NaN */
+			mantissa >>= 13u;	/* 16-bit floats have 10 bits of mantissa, 13 bits fewer than 32-bit floats. */
+			/* Make sure we don't turn NaN into Inf (all-zero mantissa) by | (mantissa == 0). */
+			return (deFloat16) (sign | 0x7c00u | mantissa | (mantissa == 0u));
+		}
+	}
+	/* The following are cases for normalized floats.
+	 *
+	 * * If exp16 is less than 0, we are experiencing underflow for the exponent. To encode this underflowed exponent,
+	 *   we can only shift the mantissa further right.
+	 *   The real exponent is exp16 - 15. A denormalized 16-bit float can represent -14 via its exponent.
+	 *   Note that the most significant bit in the mantissa of a denormalized float is already -1 as for exponent.
+	 *   So, we just need to right shift the mantissa -exp16 bits.
+	 * * If exp16 is 0, mantissa shifting requirement is similar to the above.
+	 * * If exp16 is greater than 30 (0b11110), we are experiencing overflow for the exponent of 16-bit normalized floats.
+	 */
+	/* Case: normalized floats -> zero */
+	else if (exp16 < -10)
+	{
+		/* 16-bit floats have only 10 bits for mantissa. Minimal 16-bit denormalized float is (2^-10) * (2^-14). */
+		/* Expecting a number < (2^-10) * (2^-14) here, not representable, round to zero. */
+		return (deFloat16) sign;
+	}
+	/* Case: normalized floats -> zero and denormalized halfs */
+	else if (exp16 <= 0)
+	{
+		/* Add the implicit leading 1 in normalized float to mantissa. */
+		mantissa |= 0x00800000u;
+		/* We have a (23 + 1)-bit mantissa, but 16-bit floats only expect 10-bit mantissa.
+		 * Need to discard the last 14-bits considering rounding mode.
+		 * We also need to shift right -exp16 bits to encode the underflowed exponent.
+		 */
+		if (mode == DE_ROUNDINGMODE_TO_ZERO)
+		{
+			mantissa >>= (14 - exp16);
+		}
+		else
+		{
+			/* mantissa in the above may exceed 10-bits, in which case overflow happens.
+			 * The overflowed bit is automatically carried to exponent then.
+			 */
+			mantissa = roundToNearestEven(mantissa, 14 - exp16);
+		}
+		return (deFloat16) (sign | mantissa);
+	}
+	/* Case: normalized floats -> normalized floats */
+	else if (exp16 <= 30)
+	{
+		if (mode == DE_ROUNDINGMODE_TO_ZERO)
+		{
+			return (deFloat16) (sign | ((deUint32)exp16 << 10u) | (mantissa >> 13u));
+		}
+		else
+		{
+			mantissa	= roundToNearestEven(mantissa, 13);
+			/* Handle overflow. exp16 may overflow (and become Inf) itself, but that's correct. */
+			exp16		= (exp16 << 10u) + (mantissa & (1u << 10));
+			mantissa	&= (1u << 10) - 1;
+			return (deFloat16) (sign | ((deUint32) exp16) | mantissa);
+		}
+	}
+	/* Case: normalized floats (too large to be representable as 16-bit floats) */
+	else
+	{
+		/* According to IEEE Std 754-2008 Section 7.4,
+		 * * roundTiesToEven and roundTiesToAway carry all overflows to Inf with the sign
+		 *   of the intermediate result.
+		 * * roundTowardZero carries all overflows to the format's largest finite number
+		 *   with the sign of the intermediate result.
+		 */
+		if (mode == DE_ROUNDINGMODE_TO_ZERO)
+		{
+			return (deFloat16) (sign | 0x7bffu); /* 111 1011 1111 1111 */
+		}
+		else
+		{
+			return (deFloat16) (sign | (0x1f << 10));
+		}
+	}
+
+	/* Make compiler happy */
+	return (deFloat16) 0;
+}
+
 float deFloat16To32 (deFloat16 val16)
 {
 	deUint32 sign;
diff --git a/framework/delibs/debase/deFloat16.h b/framework/delibs/debase/deFloat16.h
index d2d71dc29..ab81199e4 100644
--- a/framework/delibs/debase/deFloat16.h
+++ b/framework/delibs/debase/deFloat16.h
@@ -24,6 +24,7 @@
  *//*--------------------------------------------------------------------*/
 
 #include "deDefs.h"
+#include "deMath.h"
 
 DE_BEGIN_EXTERN_C
 
@@ -38,7 +39,9 @@ typedef deFloat16 DEfloat16;
  * \param val32 Input value.
  * \return Converted 16-bit floating-point value.
  *//*--------------------------------------------------------------------*/
-deFloat16 deFloat32To16 (float val32);
+deFloat16	deFloat32To16		(float val32);
+deFloat16	deFloat32To16Round	(float val32, deRoundingMode mode);
+void		deFloat16_selfTest	(void);
 
 /*--------------------------------------------------------------------*//*!
  * \brief Convert 16-bit floating point number to 32 bit.
diff --git a/framework/delibs/debase/deFloat16Test.c b/framework/delibs/debase/deFloat16Test.c new file mode 100644 index 000000000..ea5d21707 --- /dev/null +++ b/framework/delibs/debase/deFloat16Test.c @@ -0,0 +1,335 @@ +/*------------------------------------------------------------------------- + * drawElements Base Portability Library + * ------------------------------------- + * + * Copyright 2017 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + *//*! + * \file + * \brief Testing of deFloat16 functions.
+ *//*--------------------------------------------------------------------*/ + +#include "deFloat16.h" +#include "deRandom.h" + +DE_BEGIN_EXTERN_C + +/* Build a 32-bit float from its raw sign, biased exponent and mantissa bit fields. */ static float getFloat32 (deUint32 sign, deUint32 biased_exponent, deUint32 mantissa) +{ + union + { + float f; + deUint32 u; + } x; + + x.u = (sign << 31) | (biased_exponent << 23) | mantissa; + + return x.f; +} + +/* Build a 16-bit float from its raw sign, biased exponent and mantissa bit fields. */ static deFloat16 getFloat16 (deUint16 sign, deUint16 biased_exponent, deUint16 mantissa) +{ + return (deFloat16) ((sign << 15) | (biased_exponent << 10) | mantissa); +} + + +static deFloat16 deFloat32To16RTZ (float val32) +{ + return deFloat32To16Round(val32, DE_ROUNDINGMODE_TO_ZERO); +} + +static deFloat16 deFloat32To16RTE (float val32) +{ + return deFloat32To16Round(val32, DE_ROUNDINGMODE_TO_NEAREST_EVEN); +} + +void deFloat16_selfTest (void) +{ + /* 16-bit: 1 5 (0x00--0x1f) 10 (0x000--0x3ff) + * 32-bit: 1 8 (0x00--0xff) 23 (0x000000--0x7fffff) + */ + deRandom rnd; + int idx; + + deRandom_init(&rnd, 0xdeadbeefu-1); + + /* --- For rounding mode RTZ --- */ + + /* Zero */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0, 0)) == getFloat16(0, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0, 0)) == getFloat16(1, 0, 0)); + + /* Inf */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0)) == getFloat16(0, 0x1f, 0)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0)) == getFloat16(1, 0x1f, 0)); + + /* SNaN */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 1)) == getFloat16(0, 0x1f, 1)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 1)) == getFloat16(1, 0x1f, 1)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x3fffff)) == getFloat16(0, 0x1f, 0x1ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x3fffff)) == getFloat16(1, 0x1f, 0x1ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x0003ff)) == getFloat16(0, 0x1f, 1)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x0003ff)) == getFloat16(1, 0x1f, 1)); +
DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x123456)) == getFloat16(0, 0x1f, 0x123456 >> 13)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x123456)) == getFloat16(1, 0x1f, 0x123456 >> 13)); + + /* QNaN */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x400000)) == getFloat16(0, 0x1f, 0x200)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x400000)) == getFloat16(1, 0x1f, 0x200)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x7fffff)) == getFloat16(0, 0x1f, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x7fffff)) == getFloat16(1, 0x1f, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x4003ff)) == getFloat16(0, 0x1f, 0x200)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x4003ff)) == getFloat16(1, 0x1f, 0x200)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x723456)) == getFloat16(0, 0x1f, 0x723456 >> 13)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x723456)) == getFloat16(1, 0x1f, 0x723456 >> 13)); + + /* Denormalized */ + for (idx = 0; idx < 256; ++idx) + { + deUint32 mantissa = deRandom_getUint32(&rnd); + + mantissa &= 0x7fffffu; /* Take the last 23 bits */ + mantissa |= (mantissa == 0); /* Make sure it is not zero */ + + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0, mantissa)) == getFloat16(0, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0, mantissa)) == getFloat16(1, 0, 0)); + } + + /* Normalized -> zero */ + /* Absolute value: minimal 32-bit normalized */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 1, 0)) == getFloat16(0, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 1, 0)) == getFloat16(1, 0, 0)); + /* Absolute value: 2^-24 - e, extremely near minimal 16-bit denormalized */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 25, 0x7fffff)) == getFloat16(0, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 25, 0x7fffff)) == getFloat16(1, 0, 0)); + for (idx = 0; idx < 256; ++idx) + { + deUint32 exponent =
deRandom_getUint32(&rnd); + deUint32 mantissa = deRandom_getUint32(&rnd); + + exponent = exponent % (127 - 25) + 1; /* Make sure >= 1, <= 127 - 25 */ + mantissa &= 0x7fffffu; /* Take the last 23 bits */ + + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0, 0)); + } + + /* Normalized -> denormalized */ + /* Absolute value: 2^-24, minimal 16-bit denormalized */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 24, 0)) == getFloat16(0, 0, 1)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 24, 0)) == getFloat16(1, 0, 1)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 24, 1)) == getFloat16(0, 0, 1)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 24, 1)) == getFloat16(1, 0, 1)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 20, 0x123456)) == getFloat16(0, 0, 0x12)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 20, 0x123456)) == getFloat16(1, 0, 0x12)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 18, 0x654321)) == getFloat16(0, 0, 0x72)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 18, 0x654321)) == getFloat16(1, 0, 0x72)); + /* Absolute value: 2^-14 - 2^-24 = (2 - 2^-9) * 2^-15, maximal 16-bit denormalized */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fc000)) == getFloat16(0, 0, 0x3ff)); /* 0x7fc000: 0111 1111 1100 0000 0000 0000 */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fc000)) == getFloat16(1, 0, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fc000 - 1)) == getFloat16(0, 0, 0x3fe)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fc000 - 1)) == getFloat16(1, 0, 0x3fe)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fc000 + 1)) == getFloat16(0, 0, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fc000 + 1)) == getFloat16(1, 0, 0x3ff)); +
DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fffff)) == getFloat16(0, 0, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fffff)) == getFloat16(1, 0, 0x3ff)); + + /* Normalized -> normalized */ + /* Absolute value: 2^-14, minimal 16-bit normalized */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 14, 0)) == getFloat16(0, 1, 0)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 14, 0)) == getFloat16(1, 1, 0)); + /* Absolute value: 65504 - 2^-23, extremely near maximal 16-bit normalized */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(0, 0x1e, 0x3fe)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(1, 0x1e, 0x3fe)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) - 0x456)) == getFloat16(0, 0x1e, 0x3fe)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) - 0x456)) == getFloat16(1, 0x1e, 0x3fe)); + /* Absolute value: 65504, maximal 16-bit normalized */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, 0x3ff << 13)) == getFloat16(0, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, 0x3ff << 13)) == getFloat16(1, 0x1e, 0x3ff)); + for (idx = 0; idx < 256; ++idx) + { + deUint32 exponent = deRandom_getUint32(&rnd); + deUint32 mantissa = deRandom_getUint32(&rnd); + + exponent = exponent % ((127 + 14) - (127 -14) + 1) + (127 - 14); /* Make sure >= 127 - 14, <= 127 + 14 */ + mantissa &= 0x7fffffu; /* Take the last 23 bits */ + + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, exponent, mantissa)) == getFloat16(0, (deUint16) (exponent + 15 - 127), (deUint16) (mantissa >> 13))); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, exponent, mantissa)) == getFloat16(1, (deUint16) (exponent + 15 - 127), (deUint16) (mantissa >> 13))); + } + + /* Normalized -> minimal/maximal normalized */ + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) + 1)) ==
getFloat16(0, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) + 1)) == getFloat16(1, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) + 0x123)) == getFloat16(0, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) + 0x123)) == getFloat16(1, 0x1e, 0x3ff)); + for (idx = 0; idx < 256; ++idx) + { + deUint32 exponent = deRandom_getUint32(&rnd); + deUint32 mantissa = deRandom_getUint32(&rnd); + + exponent = exponent % (0xfe - (127 + 16) + 1) + (127 + 16); /* Make sure >= 127 + 16, <= 0xfe */ + mantissa &= 0x7fffffu; /* Take the last 23 bits */ + + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0x1e, 0x3ff)); + } + + /* --- For rounding mode RTE --- */ + + /* Zero */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0, 0)) == getFloat16(0, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0, 0)) == getFloat16(1, 0, 0)); + + /* Inf */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0)) == getFloat16(0, 0x1f, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0)) == getFloat16(1, 0x1f, 0)); + + /* SNaN */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 1)) == getFloat16(0, 0x1f, 1)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 1)) == getFloat16(1, 0x1f, 1)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x3fffff)) == getFloat16(0, 0x1f, 0x1ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x3fffff)) == getFloat16(1, 0x1f, 0x1ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x0003ff)) == getFloat16(0, 0x1f, 1)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x0003ff)) == getFloat16(1, 0x1f, 1)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x123456)) == getFloat16(0, 0x1f, 0x123456 >> 13)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff,
0x123456)) == getFloat16(1, 0x1f, 0x123456 >> 13)); + + /* QNaN */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x400000)) == getFloat16(0, 0x1f, 0x200)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x400000)) == getFloat16(1, 0x1f, 0x200)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x7fffff)) == getFloat16(0, 0x1f, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x7fffff)) == getFloat16(1, 0x1f, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x4003ff)) == getFloat16(0, 0x1f, 0x200)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x4003ff)) == getFloat16(1, 0x1f, 0x200)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x723456)) == getFloat16(0, 0x1f, 0x723456 >> 13)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x723456)) == getFloat16(1, 0x1f, 0x723456 >> 13)); + + /* Denormalized */ + for (idx = 0; idx < 256; ++idx) + { + deUint32 mantissa = deRandom_getUint32(&rnd); + + mantissa &= 0x7fffffu; /* Take the last 23 bits */ + mantissa |= (mantissa == 0); /* Make sure it is not zero */ + + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0, mantissa)) == getFloat16(0, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0, mantissa)) == getFloat16(1, 0, 0)); + } + + /* Normalized -> zero and denormalized */ + /* Absolute value: minimal 32-bit normalized */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 1, 0)) == getFloat16(0, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 1, 0)) == getFloat16(1, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 42, 0x7abcde)) == getFloat16(0, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 42, 0x7abcde)) == getFloat16(1, 0, 0)); + for (idx = 0; idx < 256; ++idx) + { + deUint32 exponent = deRandom_getUint32(&rnd); + deUint32 mantissa = deRandom_getUint32(&rnd); + + exponent = exponent % (127 - 26) + 1; /* Make sure >= 1, <= 127 - 26 */ + mantissa &= 0x7fffffu; /* Take the last 23 bits */ + +
DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0, 0)); + } + /* Absolute value: 2^-25, minimal 16-bit denormalized: 2^-24 */ + /* The following six cases need to right shift mantissa (with leading 1) 10 bits --------------------> to here */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 25, 0)) == getFloat16(0, 0, 0)); /* XX XXXX XXXX 1 000 0000 0000 0000 0000 0000 */ + /* Take the first 10 bits with RTE ------ 00 0000 0000 */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 25, 0)) == getFloat16(1, 0, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 25, 1)) == getFloat16(0, 0, 1)); /* XX XXXX XXXX 1 000 0000 0000 0000 0000 0001 */ + /* Take the first 10 bits with RTE ------ 00 0000 0001 */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 25, 1)) == getFloat16(1, 0, 1)); + /* Absolute value: 2^-24 - e, extremely near minimal 16-bit denormalized */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 25, 0x7fffff)) == getFloat16(0, 0, 1)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 25, 0x7fffff)) == getFloat16(1, 0, 1)); + /* Absolute value: 2^-24, minimal 16-bit denormalized */ + /* The following (127 - 24) cases need to right shift mantissa (with leading 1) 9 bits -----------------> to here */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0)) == getFloat16(0, 0, 1)); /* X XXXX XXXX 1 000 0000 0000 0000 0000 0000 */ + /* Take the first 10 bits with RTE ---------- 0 0000 0000 1 */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0)) == getFloat16(1, 0, 1)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 1)) == getFloat16(0, 0, 1)); /* X XXXX XXXX 1 000 0000 0000 0000 0000 0001 */ + /* Take the first 10 bits with RTE ---------- 0 0000 0000 1 */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 1)) == getFloat16(1, 0, 1)); +
DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0x400000)) == getFloat16(0, 0, 2)); /* X XXXX XXXX 1 100 0000 0000 0000 0000 0000 */ + /* Take the first 10 bits with RTE ---------- 0 0000 0001 0 */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0x400000)) == getFloat16(1, 0, 2)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0x400001)) == getFloat16(0, 0, 2)); /* X XXXX XXXX 1 100 0000 0000 0000 0000 0001 */ + /* Take the first 10 bits with RTE ---------- 0 0000 0001 0 */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0x400001)) == getFloat16(1, 0, 2)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0x4fffff)) == getFloat16(0, 0, 2)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0x4fffff)) == getFloat16(1, 0, 2)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 20, 0x123456)) == getFloat16(0, 0, 0x12)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 20, 0x123456)) == getFloat16(1, 0, 0x12)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 18, 0x654321)) == getFloat16(0, 0, 0x73)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 18, 0x654321)) == getFloat16(1, 0, 0x73)); + /* Absolute value: 2^-14 - 2^-24 = (2 - 2^-9) * 2^-15, maximal 16-bit denormalized */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fc000)) == getFloat16(0, 0, 0x3ff)); /* 0x7fc000: 0111 1111 1100 0000 0000 0000 */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fc000)) == getFloat16(1, 0, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fc000 - 1)) == getFloat16(0, 0, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fc000 - 1)) == getFloat16(1, 0, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fc000 + 1)) == getFloat16(0, 0, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fc000 + 1)) == getFloat16(1, 0, 0x3ff)); + + /* Normalized -> normalized */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0,
127 - 15, 0x7fe000)) == getFloat16(0, 1, 0)); /* 0x7fe000: 0111 1111 1110 0000 0000 0000 */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fe000)) == getFloat16(1, 1, 0)); + /* Absolute value: (2 - 2^-23) * 2^-15, extremely near 2^-14, minimal 16-bit normalized */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fffff)) == getFloat16(0, 1, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fffff)) == getFloat16(1, 1, 0)); + /* Absolute value: 2^-14, minimal 16-bit normalized */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 14, 0)) == getFloat16(0, 1, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 14, 0)) == getFloat16(1, 1, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3fe << 13) + (1 << 12))) == getFloat16(0, 0x1e, 0x3fe)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3fe << 13) + (1 << 12))) == getFloat16(1, 0x1e, 0x3fe)); + + /* Normalized -> minimal/maximal normalized */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3fe << 13) + (1 << 12) + 1)) == getFloat16(0, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3fe << 13) + (1 << 12) + 1)) == getFloat16(1, 0x1e, 0x3ff)); + /* Absolute value: 65504 - 2^-23, extremely near maximal 16-bit normalized */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(0, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(1, 0x1e, 0x3ff)); + /* Absolute value: 65504, maximal 16-bit normalized */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, 0x3ff << 13)) == getFloat16(0, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, 0x3ff << 13)) == getFloat16(1, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + 1)) == getFloat16(0, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + 1)) == getFloat16(1,
0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + 0x456)) == getFloat16(0, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + 0x456)) == getFloat16(1, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + (1 << 12) - 1)) == getFloat16(0, 0x1e, 0x3ff)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + (1 << 12) - 1)) == getFloat16(1, 0x1e, 0x3ff)); + + /* Normalized -> Inf */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + (1 << 12))) == getFloat16(0, 0x1f, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + (1 << 12))) == getFloat16(1, 0x1f, 0)); + /* Absolute value: 65536 - 2^-8, maximal 32-bit float with unbiased exponent 15 (not the 32-bit maximum) */ + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, 0x7fffff)) == getFloat16(0, 0x1f, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, 0x7fffff)) == getFloat16(1, 0x1f, 0)); + for (idx = 0; idx < 256; ++idx) + { + deUint32 exponent = deRandom_getUint32(&rnd); + deUint32 mantissa = deRandom_getUint32(&rnd); + + exponent = exponent % (0xfe - (127 + 16) + 1) + (127 + 16); /* Make sure >= 127 + 16, <= 0xfe */ + mantissa &= 0x7fffffu; /* Take the last 23 bits */ + + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0x1f, 0)); + DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0x1f, 0)); + } +} + +DE_END_EXTERN_C diff --git a/framework/delibs/debase/deMath.c b/framework/delibs/debase/deMath.c index 26e2aef67..c76175140 100644 --- a/framework/delibs/debase/deMath.c +++ b/framework/delibs/debase/deMath.c @@ -46,7 +46,7 @@ deRoundingMode deGetRoundingMode (void) case _RC_CHOP: return DE_ROUNDINGMODE_TO_ZERO; case _RC_UP: return DE_ROUNDINGMODE_TO_POSITIVE_INF; case _RC_DOWN: return DE_ROUNDINGMODE_TO_NEGATIVE_INF; - case _RC_NEAR: return DE_ROUNDINGMODE_TO_NEAREST; + case _RC_NEAR: return
DE_ROUNDINGMODE_TO_NEAREST_EVEN; default: return DE_ROUNDINGMODE_LAST; } #elif (DE_COMPILER == DE_COMPILER_GCC) || (DE_COMPILER == DE_COMPILER_CLANG) @@ -56,7 +56,7 @@ deRoundingMode deGetRoundingMode (void) case FE_TOWARDZERO: return DE_ROUNDINGMODE_TO_ZERO; case FE_UPWARD: return DE_ROUNDINGMODE_TO_POSITIVE_INF; case FE_DOWNWARD: return DE_ROUNDINGMODE_TO_NEGATIVE_INF; - case FE_TONEAREST: return DE_ROUNDINGMODE_TO_NEAREST; + case FE_TONEAREST: return DE_ROUNDINGMODE_TO_NEAREST_EVEN; default: return DE_ROUNDINGMODE_LAST; } #else @@ -76,7 +76,7 @@ deBool deSetRoundingMode (deRoundingMode mode) case DE_ROUNDINGMODE_TO_ZERO: flag = _RC_CHOP; break; case DE_ROUNDINGMODE_TO_POSITIVE_INF: flag = _RC_UP; break; case DE_ROUNDINGMODE_TO_NEGATIVE_INF: flag = _RC_DOWN; break; - case DE_ROUNDINGMODE_TO_NEAREST: flag = _RC_NEAR; break; + case DE_ROUNDINGMODE_TO_NEAREST_EVEN: flag = _RC_NEAR; break; default: DE_ASSERT(DE_FALSE); } @@ -92,7 +92,7 @@ deBool deSetRoundingMode (deRoundingMode mode) case DE_ROUNDINGMODE_TO_ZERO: flag = FE_TOWARDZERO; break; case DE_ROUNDINGMODE_TO_POSITIVE_INF: flag = FE_UPWARD; break; case DE_ROUNDINGMODE_TO_NEGATIVE_INF: flag = FE_DOWNWARD; break; - case DE_ROUNDINGMODE_TO_NEAREST: flag = FE_TONEAREST; break; + case DE_ROUNDINGMODE_TO_NEAREST_EVEN: flag = FE_TONEAREST; break; default: DE_ASSERT(DE_FALSE); } diff --git a/framework/delibs/debase/deMath.h b/framework/delibs/debase/deMath.h index 4ab86fb87..61f2d1483 100644 --- a/framework/delibs/debase/deMath.h +++ b/framework/delibs/debase/deMath.h @@ -45,7 +45,7 @@ DE_BEGIN_EXTERN_C typedef enum deRoundingMode_e { - DE_ROUNDINGMODE_TO_NEAREST = 0, + DE_ROUNDINGMODE_TO_NEAREST_EVEN = 0, DE_ROUNDINGMODE_TO_ZERO, DE_ROUNDINGMODE_TO_POSITIVE_INF, DE_ROUNDINGMODE_TO_NEGATIVE_INF, |