summaryrefslogtreecommitdiff
path: root/framework
diff options
context:
space:
mode:
authorLei Zhang <antiagainst@google.com>2017-03-20 15:40:54 -0400
committerAlexander Galazin <Alexander.Galazin@arm.com>2017-06-18 10:30:45 -0400
commit47046481073b94d1b5ae3d5afae0ed40fa3753a3 (patch)
tree2c0998679bfa6bf7eb6fd240f14d501f427284c1 /framework
parent55dd4426673bd260dde56addcfea802f21c31304 (diff)
downloadVK-GL-CTS-47046481073b94d1b5ae3d5afae0ed40fa3753a3.tar.gz
VK-GL-CTS-47046481073b94d1b5ae3d5afae0ed40fa3753a3.tar.bz2
VK-GL-CTS-47046481073b94d1b5ae3d5afae0ed40fa3753a3.zip
Add 32 to 16 bit float conversion w.r.t rounding mode
Component: Framework Change-Id: I9428f0dc725484b8a2f213157100326a69754cd8 (cherry picked from commit 7c0f5bea1da74f70e549e9a735c9d702c9396084)
Diffstat (limited to 'framework')
-rw-r--r--framework/common/tcuApp.cpp2
-rw-r--r--framework/delibs/debase/CMakeLists.txt1
-rw-r--r--framework/delibs/debase/deFloat16.c157
-rw-r--r--framework/delibs/debase/deFloat16.h5
-rw-r--r--framework/delibs/debase/deFloat16Test.c335
-rw-r--r--framework/delibs/debase/deMath.c8
-rw-r--r--framework/delibs/debase/deMath.h2
7 files changed, 503 insertions, 7 deletions
diff --git a/framework/common/tcuApp.cpp b/framework/common/tcuApp.cpp
index 409a6774f..6227089b8 100644
--- a/framework/common/tcuApp.cpp
+++ b/framework/common/tcuApp.cpp
@@ -89,7 +89,7 @@ App::App (Platform& platform, Archive& archive, TestLog& log, const CommandLine&
print("dEQP Core %s (0x%08x) starting..\n", qpGetReleaseName(), qpGetReleaseId());
print(" target implementation = '%s'\n", qpGetTargetName());
- if (!deSetRoundingMode(DE_ROUNDINGMODE_TO_NEAREST))
+ if (!deSetRoundingMode(DE_ROUNDINGMODE_TO_NEAREST_EVEN))
qpPrintf("WARNING: Failed to set floating-point rounding mode!\n");
try
diff --git a/framework/delibs/debase/CMakeLists.txt b/framework/delibs/debase/CMakeLists.txt
index 2f9d8d013..636d73ecf 100644
--- a/framework/delibs/debase/CMakeLists.txt
+++ b/framework/delibs/debase/CMakeLists.txt
@@ -9,6 +9,7 @@ set(DEBASE_SRCS
deDefs.h
deFloat16.c
deFloat16.h
+ deFloat16Test.c
deInt32.c
deInt32.h
deInt32Test.c
diff --git a/framework/delibs/debase/deFloat16.c b/framework/delibs/debase/deFloat16.c
index 6460f0587..e8a1057e0 100644
--- a/framework/delibs/debase/deFloat16.c
+++ b/framework/delibs/debase/deFloat16.c
@@ -99,6 +99,163 @@ deFloat16 deFloat32To16 (float val32)
}
}
+/*--------------------------------------------------------------------*//*!
+ * \brief Drop the lowest `numBitsToDiscard` bits of `val`, rounding the
+ * remaining value to nearest, with ties going to the even result.
+ * \param val value to round
+ * \param numBitsToDiscard number of (least significant) bits to discard; asserted to be in [1, 31]
+ * \return `val` shifted right by `numBitsToDiscard`, incremented by one when rounding up
+ *//*--------------------------------------------------------------------*/
+static deUint32 roundToNearestEven (deUint32 val, const deUint32 numBitsToDiscard)
+{
+ const deUint32 lastBits = val & ((1 << numBitsToDiscard) - 1); /* all the bits that are about to be discarded */
+ const deUint32 headBit = val & (1 << (numBitsToDiscard - 1)); /* most significant discarded bit, i.e. the "one half" bit */
+
+ DE_ASSERT(numBitsToDiscard > 0 && numBitsToDiscard < 32); /* Make sure no overflow. NOTE(review): the masks above are built from a signed 1 before this check runs; the callers in this file pass 13..24. */
+ val >>= numBitsToDiscard;
+
+ if (headBit == 0) /* discarded part is below one half: round down */
+ {
+ return val;
+ }
+ else if (headBit == lastBits) /* discarded part is exactly one half: a tie */
+ {
+ if ((val & 0x1) == 0x1) /* kept value is odd: move up to the even neighbour */
+ {
+ return val + 1;
+ }
+ else /* kept value is already even: keep it */
+ {
+ return val;
+ }
+ }
+ else /* discarded part is above one half: round up */
+ {
+ return val + 1;
+ }
+}
+/* Convert a 32-bit float to a 16-bit float bit pattern, rounding toward zero or to nearest even as selected by `mode`. */
+deFloat16 deFloat32To16Round (float val32, deRoundingMode mode)
+{
+ union
+ {
+ float f; /* Interpret as 32-bit float */
+ deUint32 u; /* Interpret as 32-bit unsigned integer */
+ } x;
+ deUint32 sign; /* sign : 0000 0000 0000 0000 X000 0000 0000 0000 */
+ deUint32 exp32; /* exp32: biased exponent for 32-bit floats */
+ int exp16; /* exp16: biased exponent for 16-bit floats */
+ deUint32 mantissa; /* mantissa: 23-bit fraction field of the input, narrowed to 10 bits below */
+
+ /* We only support these two rounding modes for now */
+ DE_ASSERT(mode == DE_ROUNDINGMODE_TO_ZERO || mode == DE_ROUNDINGMODE_TO_NEAREST_EVEN);
+
+ x.f = val32;
+ sign = (x.u >> 16u) & 0x00008000u; /* sign bit moved from bit 31 down to bit 15 */
+ exp32 = (x.u >> 23u) & 0x000000ffu;
+ exp16 = (int) (exp32) - 127 + 15; /* 15/127: exponent bias for 16-bit/32-bit floats */
+ mantissa = x.u & 0x007fffffu;
+
+ /* Case: zero and denormalized floats */
+ if (exp32 == 0)
+ {
+ /* 32-bit denormalized floats are < 2^(1-127), not representable in 16-bit floats, so they become zero (sign kept). */
+ return (deFloat16) sign;
+ }
+ /* Case: Inf and NaN */
+ else if (exp32 == 0x000000ffu)
+ {
+ if (mantissa == 0u)
+ {
+ /* Inf */
+ return (deFloat16) (sign | 0x7c00u);
+ }
+ else
+ {
+ /* NaN */
+ mantissa >>= 13u; /* 16-bit floats have a 10-bit mantissa, 13 bits fewer than 32-bit floats. */
+ /* Make sure we don't turn NaN into Inf (all-zero mantissa) by | (mantissa == 0). */
+ return (deFloat16) (sign | 0x7c00u | mantissa | (mantissa == 0u));
+ }
+ }
+ /* The following are cases for normalized floats.
+ *
+ * * If exp16 is less than 0, the exponent underflows. To encode this underflowed exponent,
+ * we can only shift the mantissa further right.
+ * The real exponent is exp16 - 15. A denormalized 16-bit float can represent -14 via its exponent.
+ * Note that the most significant bit in the mantissa of a denormalized float already counts as -1 for the exponent.
+ * So, we just need to right shift the mantissa by -exp16 bits.
+ * * If exp16 is 0, the mantissa shifting requirement is similar to the above.
+ * * If exp16 is greater than 30 (0b11110), the exponent overflows the range of 16-bit normalized floats.
+ */
+ /* Case: normalized floats -> zero */
+ else if (exp16 < -10)
+ {
+ /* 16-bit floats have only 10 bits for mantissa. Minimal 16-bit denormalized float is (2^-10) * (2^-14). */
+ /* exp16 < -10 means a magnitude < 2^-25, below half of that minimum, so both supported modes give zero. */
+ return (deFloat16) sign;
+ }
+ /* Case: normalized floats -> zero and denormalized halfs */
+ else if (exp16 <= 0)
+ {
+ /* Add the implicit leading 1 of a normalized float to the mantissa. */
+ mantissa |= 0x00800000u;
+ /* We have a (23 + 1)-bit mantissa, but 16-bit floats only expect a 10-bit mantissa.
+ * Need to discard the last 14 bits considering rounding mode.
+ * We also need to shift right by -exp16 bits to encode the underflowed exponent.
+ */
+ if (mode == DE_ROUNDINGMODE_TO_ZERO)
+ {
+ mantissa >>= (14 - exp16);
+ }
+ else
+ {
+ /* The rounded mantissa may exceed 10 bits (reach 0x400), in which case overflow happens.
+ * The overflowed bit lands in bit 10, the lowest exponent bit, giving the minimal normalized half.
+ */
+ mantissa = roundToNearestEven(mantissa, 14 - exp16);
+ }
+ return (deFloat16) (sign | mantissa);
+ }
+ /* Case: normalized floats -> normalized floats */
+ else if (exp16 <= 30)
+ {
+ if (mode == DE_ROUNDINGMODE_TO_ZERO)
+ {
+ return (deFloat16) (sign | ((deUint32)exp16 << 10u) | (mantissa >> 13u)); /* plain truncation of the low 13 bits */
+ }
+ else
+ {
+ mantissa = roundToNearestEven(mantissa, 13);
+ /* Handle overflow: a carry out of the 10-bit mantissa (bit 10) is added to the exponent field. exp16 may overflow (and become Inf) itself, but that's correct. */
+ exp16 = (exp16 << 10u) + (mantissa & (1 << 10));
+ mantissa &= (1u << 10) - 1;
+ return (deFloat16) (sign | ((deUint32) exp16) | mantissa);
+ }
+ }
+ /* Case: normalized floats (too large to be representable as 16-bit floats) */
+ else
+ {
+ /* According to IEEE Std 754-2008 Section 7.4,
+ * * roundTiesToEven and roundTiesToAway carry all overflows to Inf with the sign
+ * of the intermediate result.
+ * * roundTowardZero carries all overflows to the format's largest finite number
+ * with the sign of the intermediate result.
+ */
+ if (mode == DE_ROUNDINGMODE_TO_ZERO)
+ {
+ return (deFloat16) (sign | 0x7bffu); /* 0111 1011 1111 1111: exponent 0x1e, mantissa 0x3ff */
+ }
+ else
+ {
+ return (deFloat16) (sign | (0x1f << 10)); /* exponent all ones, mantissa zero: Inf */
+ }
+ }
+
+ /* Not reached, every branch above returns. Make compiler happy */
+ return (deFloat16) 0;
+}
+
float deFloat16To32 (deFloat16 val16)
{
deUint32 sign;
diff --git a/framework/delibs/debase/deFloat16.h b/framework/delibs/debase/deFloat16.h
index d2d71dc29..ab81199e4 100644
--- a/framework/delibs/debase/deFloat16.h
+++ b/framework/delibs/debase/deFloat16.h
@@ -24,6 +24,7 @@
*//*--------------------------------------------------------------------*/
#include "deDefs.h"
+#include "deMath.h"
DE_BEGIN_EXTERN_C
@@ -38,7 +39,9 @@ typedef deFloat16 DEfloat16;
* \param val32 Input value.
* \return Converted 16-bit floating-point value.
*//*--------------------------------------------------------------------*/
-deFloat16 deFloat32To16 (float val32);
+deFloat16 deFloat32To16 (float val32);
+deFloat16 deFloat32To16Round (float val32, deRoundingMode mode);
+void deFloat16_selfTest (void);
/*--------------------------------------------------------------------*//*!
* \brief Convert 16-bit floating point number to 32 bit.
diff --git a/framework/delibs/debase/deFloat16Test.c b/framework/delibs/debase/deFloat16Test.c
new file mode 100644
index 000000000..ea5d21707
--- /dev/null
+++ b/framework/delibs/debase/deFloat16Test.c
@@ -0,0 +1,335 @@
+/*-------------------------------------------------------------------------
+ * drawElements Base Portability Library
+ * -------------------------------------
+ *
+ * Copyright 2017 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *//*!
+ * \file
+ * \brief Testing of deFloat16 functions.
+ *//*--------------------------------------------------------------------*/
+
+#include "deFloat16.h"
+#include "deRandom.h"
+
+DE_BEGIN_EXTERN_C
+/* Assemble a 32-bit float from its raw fields: sign at bit 31, biased exponent from bit 23, mantissa in the low bits. Arguments are not masked. */
+static float getFloat32 (deUint32 sign, deUint32 biased_exponent, deUint32 mantissa)
+{
+ union
+ {
+ float f;
+ deUint32 u;
+ } x; /* lets the assembled bit pattern be read back as a float */
+
+ x.u = (sign << 31) | (biased_exponent << 23) | mantissa;
+
+ return x.f;
+}
+/* Assemble a 16-bit float bit pattern from its raw fields: sign at bit 15, biased exponent from bit 10, mantissa in the low bits. Arguments are not masked. */
+static deFloat16 getFloat16 (deUint16 sign, deUint16 biased_exponent, deUint16 mantissa)
+{
+ return (deFloat16) ((sign << 15) | (biased_exponent << 10) | mantissa);
+}
+
+/* Shorthand for deFloat32To16Round() with DE_ROUNDINGMODE_TO_ZERO (RTZ). */
+static deFloat16 deFloat32To16RTZ (float val32)
+{
+ return deFloat32To16Round(val32, DE_ROUNDINGMODE_TO_ZERO);
+}
+/* Shorthand for deFloat32To16Round() with DE_ROUNDINGMODE_TO_NEAREST_EVEN (RTE). */
+static deFloat16 deFloat32To16RTE (float val32)
+{
+ return deFloat32To16Round(val32, DE_ROUNDINGMODE_TO_NEAREST_EVEN);
+}
+/* Self-test for deFloat32To16Round(): checks zero, Inf, NaN, denormalized and normalized inputs in both RTZ and RTE modes via DE_TEST_ASSERT. */
+void deFloat16_selfTest (void)
+{
+ /* Field widths (sign, exponent, mantissa) and value ranges: 16-bit: 1 5 (0x00--0x1f) 10 (0x000--0x3ff)
+ * 32-bit: 1 8 (0x00--0xff) 23 (0x000000--0x7fffff)
+ */
+ deRandom rnd;
+ int idx;
+
+ deRandom_init(&rnd, 0xdeadbeefu-1);
+
+ /* --- For rounding mode RTZ --- */
+
+ /* Zero */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0, 0)) == getFloat16(0, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0, 0)) == getFloat16(1, 0, 0));
+
+ /* Inf */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0)) == getFloat16(0, 0x1f, 0));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0)) == getFloat16(1, 0x1f, 0));
+
+ /* SNaN */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 1)) == getFloat16(0, 0x1f, 1));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 1)) == getFloat16(1, 0x1f, 1));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x3fffff)) == getFloat16(0, 0x1f, 0x1ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x3fffff)) == getFloat16(1, 0x1f, 0x1ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x0003ff)) == getFloat16(0, 0x1f, 1));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x0003ff)) == getFloat16(1, 0x1f, 1));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x123456)) == getFloat16(0, 0x1f, 0x123456 >> 13));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x123456)) == getFloat16(1, 0x1f, 0x123456 >> 13));
+
+ /* QNaN */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x400000)) == getFloat16(0, 0x1f, 0x200));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x400000)) == getFloat16(1, 0x1f, 0x200));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x7fffff)) == getFloat16(0, 0x1f, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x7fffff)) == getFloat16(1, 0x1f, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x4003ff)) == getFloat16(0, 0x1f, 0x200));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x4003ff)) == getFloat16(1, 0x1f, 0x200));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0xff, 0x723456)) == getFloat16(0, 0x1f, 0x723456 >> 13));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0xff, 0x723456)) == getFloat16(1, 0x1f, 0x723456 >> 13));
+
+ /* Denormalized 32-bit inputs -> signed zero */
+ for (idx = 0; idx < 256; ++idx)
+ {
+ deUint32 mantissa = deRandom_getUint32(&rnd);
+
+ mantissa &= 0x7fffffu; /* Take the last 23 bits */
+ mantissa |= (mantissa == 0); /* Make sure it is not zero */
+
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 0, mantissa)) == getFloat16(0, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 0, mantissa)) == getFloat16(1, 0, 0));
+ }
+
+ /* Normalized -> zero */
+ /* Absolute value: minimal 32-bit normalized */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 1, 0)) == getFloat16(0, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 1, 0)) == getFloat16(1, 0, 0));
+ /* Absolute value: 2^-24 - e, extremely near minimal 16-bit denormalized */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 25, 0x7fffff)) == getFloat16(0, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 25, 0x7fffff)) == getFloat16(1, 0, 0));
+ for (idx = 0; idx < 256; ++idx)
+ {
+ deUint32 exponent = deRandom_getUint32(&rnd);
+ deUint32 mantissa = deRandom_getUint32(&rnd);
+
+ exponent = exponent % (127 - 25) + 1; /* Make sure >= 1, <= 127 - 25 */
+ mantissa &= 0x7fffffu; /* Take the last 23 bits */
+
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0, 0));
+ }
+
+ /* Normalized -> denormalized */
+ /* Absolute value: 2^-24, minimal 16-bit denormalized */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 24, 0)) == getFloat16(0, 0, 1));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 24, 0)) == getFloat16(1, 0, 1));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 24, 1)) == getFloat16(0, 0, 1));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 24, 1)) == getFloat16(1, 0, 1));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 20, 0x123456)) == getFloat16(0, 0, 0x12));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 20, 0x123456)) == getFloat16(1, 0, 0x12));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 18, 0x654321)) == getFloat16(0, 0, 0x72));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 18, 0x654321)) == getFloat16(1, 0, 0x72));
+ /* Absolute value: 2^-14 - 2^-24 = (2 - 2^-9) * 2^-15, maximal 16-bit denormalized */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fc000)) == getFloat16(0, 0, 0x3ff)); /* 0x7fc000: 0111 1111 1100 0000 0000 0000 */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fc000)) == getFloat16(1, 0, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fc000 - 1)) == getFloat16(0, 0, 0x3fe));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fc000 - 1)) == getFloat16(1, 0, 0x3fe));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fc000 + 1)) == getFloat16(0, 0, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fc000 + 1)) == getFloat16(1, 0, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 15, 0x7fffff)) == getFloat16(0, 0, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 15, 0x7fffff)) == getFloat16(1, 0, 0x3ff));
+
+ /* Normalized -> normalized */
+ /* Absolute value: 2^-14, minimal 16-bit normalized */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 - 14, 0)) == getFloat16(0, 1, 0));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 - 14, 0)) == getFloat16(1, 1, 0));
+ /* Absolute value: 65504 - 2^-8 (one 32-bit ulp below 65504), extremely near maximal 16-bit normalized */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(0, 0x1e, 0x3fe));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(1, 0x1e, 0x3fe));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) - 0x456)) == getFloat16(0, 0x1e, 0x3fe));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) - 0x456)) == getFloat16(1, 0x1e, 0x3fe));
+ /* Absolute value: 65504, maximal 16-bit normalized */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, 0x3ff << 13)) == getFloat16(0, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, 0x3ff << 13)) == getFloat16(1, 0x1e, 0x3ff));
+ for (idx = 0; idx < 256; ++idx)
+ {
+ deUint32 exponent = deRandom_getUint32(&rnd);
+ deUint32 mantissa = deRandom_getUint32(&rnd);
+
+ exponent = exponent % ((127 + 14) - (127 -14) + 1) + (127 - 14); /* Make sure >= 127 - 14, <= 127 + 14 */
+ mantissa &= 0x7fffffu; /* Take the last 23 bits */
+
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, exponent, mantissa)) == getFloat16(0, (deUint16) (exponent + 15 - 127), (deUint16) (mantissa >> 13)));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, exponent, mantissa)) == getFloat16(1, (deUint16) (exponent + 15 - 127), (deUint16) (mantissa >> 13)));
+ }
+
+ /* Normalized, above 65504 -> maximal normalized (RTZ clamps instead of producing Inf) */
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) + 1)) == getFloat16(0, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) + 1)) == getFloat16(1, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, 127 + 15, (0x3ff << 13) + 0x123)) == getFloat16(0, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, 127 + 15, (0x3ff << 13) + 0x123)) == getFloat16(1, 0x1e, 0x3ff));
+ for (idx = 0; idx < 256; ++idx)
+ {
+ deUint32 exponent = deRandom_getUint32(&rnd);
+ deUint32 mantissa = deRandom_getUint32(&rnd);
+
+ exponent = exponent % (0xfe - (127 + 16) + 1) + (127 + 16); /* Make sure >= 127 + 16, <= 0xfe */
+ mantissa &= 0x7fffffu; /* Take the last 23 bits */
+
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTZ(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0x1e, 0x3ff));
+ }
+
+ /* --- For rounding mode RTE --- */
+
+ /* Zero */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0, 0)) == getFloat16(0, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0, 0)) == getFloat16(1, 0, 0));
+
+ /* Inf */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0)) == getFloat16(0, 0x1f, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0)) == getFloat16(1, 0x1f, 0));
+
+ /* SNaN */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 1)) == getFloat16(0, 0x1f, 1));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 1)) == getFloat16(1, 0x1f, 1));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x3fffff)) == getFloat16(0, 0x1f, 0x1ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x3fffff)) == getFloat16(1, 0x1f, 0x1ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x0003ff)) == getFloat16(0, 0x1f, 1));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x0003ff)) == getFloat16(1, 0x1f, 1));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x123456)) == getFloat16(0, 0x1f, 0x123456 >> 13));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x123456)) == getFloat16(1, 0x1f, 0x123456 >> 13));
+
+ /* QNaN */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x400000)) == getFloat16(0, 0x1f, 0x200));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x400000)) == getFloat16(1, 0x1f, 0x200));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x7fffff)) == getFloat16(0, 0x1f, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x7fffff)) == getFloat16(1, 0x1f, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x4003ff)) == getFloat16(0, 0x1f, 0x200));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x4003ff)) == getFloat16(1, 0x1f, 0x200));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0xff, 0x723456)) == getFloat16(0, 0x1f, 0x723456 >> 13));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0xff, 0x723456)) == getFloat16(1, 0x1f, 0x723456 >> 13));
+
+ /* Denormalized 32-bit inputs -> signed zero */
+ for (idx = 0; idx < 256; ++idx)
+ {
+ deUint32 mantissa = deRandom_getUint32(&rnd);
+
+ mantissa &= 0x7fffffu; /* Take the last 23 bits */
+ mantissa |= (mantissa == 0); /* Make sure it is not zero */
+
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 0, mantissa)) == getFloat16(0, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 0, mantissa)) == getFloat16(1, 0, 0));
+ }
+
+ /* Normalized -> zero and denormalized */
+ /* Absolute value: minimal 32-bit normalized */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 1, 0)) == getFloat16(0, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 1, 0)) == getFloat16(1, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 42, 0x7abcde)) == getFloat16(0, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 42, 0x7abcde)) == getFloat16(1, 0, 0));
+ for (idx = 0; idx < 256; ++idx)
+ {
+ deUint32 exponent = deRandom_getUint32(&rnd);
+ deUint32 mantissa = deRandom_getUint32(&rnd);
+
+ exponent = exponent % (127 - 26) + 1; /* Make sure >= 1, <= 127 - 26 */
+ mantissa &= 0x7fffffu; /* Take the last 23 bits */
+
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0, 0));
+ }
+ /* Absolute value: 2^-25, exactly half of the minimal 16-bit denormalized (2^-24): a tie, which rounds to even (zero) */
+ /* The following six cases need to right shift mantissa (with leading 1) 10 bits --------------------> to here */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 25, 0)) == getFloat16(0, 0, 0)); /* XX XXXX XXXX 1 000 0000 0000 0000 0000 0000 */
+ /* Take the first 10 bits with RTE ------ 00 0000 0000 */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 25, 0)) == getFloat16(1, 0, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 25, 1)) == getFloat16(0, 0, 1)); /* XX XXXX XXXX 1 000 0000 0000 0000 0000 0001 */
+ /* Take the first 10 bits with RTE ------ 00 0000 0001 */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 25, 1)) == getFloat16(1, 0, 1));
+ /* Absolute value: 2^-24 - e, extremely near minimal 16-bit denormalized */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 25, 0x7fffff)) == getFloat16(0, 0, 1));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 25, 0x7fffff)) == getFloat16(1, 0, 1));
+ /* Absolute value: 2^-24, minimal 16-bit denormalized */
+ /* The following (127 - 24) cases need to right shift mantissa (with leading 1) 9 bits -----------------> to here */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0)) == getFloat16(0, 0, 1)); /* X XXXX XXXX 1 000 0000 0000 0000 0000 0000 */
+ /* Take the first 10 bits with RTE ---------- 0 0000 0000 1 */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0)) == getFloat16(1, 0, 1));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 1)) == getFloat16(0, 0, 1)); /* X XXXX XXXX 1 000 0000 0000 0000 0000 0001 */
+ /* Take the first 10 bits with RTE ---------- 0 0000 0000 1 */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 1)) == getFloat16(1, 0, 1));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0x400000)) == getFloat16(0, 0, 2)); /* X XXXX XXXX 1 100 0000 0000 0000 0000 0000 */
+ /* Take the first 10 bits with RTE ---------- 0 0000 0000 2 */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0x400000)) == getFloat16(1, 0, 2));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0x400001)) == getFloat16(0, 0, 2)); /* X XXXX XXXX 1 100 0000 0000 0000 0000 0001 */
+ /* Take the first 10 bits with RTE ---------- 0 0000 0000 2 */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0x400001)) == getFloat16(1, 0, 2));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 24, 0x4fffff)) == getFloat16(0, 0, 2));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 24, 0x4fffff)) == getFloat16(1, 0, 2));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 20, 0x123456)) == getFloat16(0, 0, 0x12));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 20, 0x123456)) == getFloat16(1, 0, 0x12));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 18, 0x654321)) == getFloat16(0, 0, 0x73));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 18, 0x654321)) == getFloat16(1, 0, 0x73));
+ /* Absolute value: 2^-14 - 2^-24 = (2 - 2^-9) * 2^-15, maximal 16-bit denormalized */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fc000)) == getFloat16(0, 0, 0x3ff)); /* 0x7fc000: 0111 1111 1100 0000 0000 0000 */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fc000)) == getFloat16(1, 0, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fc000 - 1)) == getFloat16(0, 0, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fc000 - 1)) == getFloat16(1, 0, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fc000 + 1)) == getFloat16(0, 0, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fc000 + 1)) == getFloat16(1, 0, 0x3ff));
+
+ /* Normalized -> normalized */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fe000)) == getFloat16(0, 1, 0)); /* 0x7fe000: 0111 1111 1110 0000 0000 0000 */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fe000)) == getFloat16(1, 1, 0));
+ /* Absolute value: (2 - 2^-23) * 2^-15, extremely near 2^-14, minimal 16-bit normalized */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 15, 0x7fffff)) == getFloat16(0, 1, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 15, 0x7fffff)) == getFloat16(1, 1, 0));
+ /* Absolute value: 2^-14, minimal 16-bit normalized */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 - 14, 0)) == getFloat16(0, 1, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 - 14, 0)) == getFloat16(1, 1, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3fe << 13) + (1 << 12))) == getFloat16(0, 0x1e, 0x3fe));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3fe << 13) + (1 << 12))) == getFloat16(1, 0x1e, 0x3fe));
+
+ /* Normalized -> minimal/maximal normalized */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3fe << 13) + (1 << 12) + 1)) == getFloat16(0, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3fe << 13) + (1 << 12) + 1)) == getFloat16(1, 0x1e, 0x3ff));
+ /* Absolute value: 65504 - 2^-8 (one 32-bit ulp below 65504), extremely near maximal 16-bit normalized */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(0, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) - 1)) == getFloat16(1, 0x1e, 0x3ff));
+ /* Absolute value: 65504, maximal 16-bit normalized */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, 0x3ff << 13)) == getFloat16(0, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, 0x3ff << 13)) == getFloat16(1, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + 1)) == getFloat16(0, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + 1)) == getFloat16(1, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + 0x456)) == getFloat16(0, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + 0x456)) == getFloat16(1, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + (1 << 12) - 1)) == getFloat16(0, 0x1e, 0x3ff));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + (1 << 12) - 1)) == getFloat16(1, 0x1e, 0x3ff));
+
+ /* Normalized -> Inf */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, (0x3ff << 13) + (1 << 12))) == getFloat16(0, 0x1f, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, (0x3ff << 13) + (1 << 12))) == getFloat16(1, 0x1f, 0));
+ /* Absolute value: 2^16 - 2^-8, the largest 32-bit float with unbiased exponent 15 */
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, 127 + 15, 0x7fffff)) == getFloat16(0, 0x1f, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, 127 + 15, 0x7fffff)) == getFloat16(1, 0x1f, 0));
+ for (idx = 0; idx < 256; ++idx)
+ {
+ deUint32 exponent = deRandom_getUint32(&rnd);
+ deUint32 mantissa = deRandom_getUint32(&rnd);
+
+ exponent = exponent % (0xfe - (127 + 16) + 1) + (127 + 16); /* Make sure >= 127 + 16, <= 0xfe */
+ mantissa &= 0x7fffffu; /* Take the last 23 bits */
+
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(0, exponent, mantissa)) == getFloat16(0, 0x1f, 0));
+ DE_TEST_ASSERT(deFloat32To16RTE(getFloat32(1, exponent, mantissa)) == getFloat16(1, 0x1f, 0));
+ }
+}
+
+DE_END_EXTERN_C
diff --git a/framework/delibs/debase/deMath.c b/framework/delibs/debase/deMath.c
index 26e2aef67..c76175140 100644
--- a/framework/delibs/debase/deMath.c
+++ b/framework/delibs/debase/deMath.c
@@ -46,7 +46,7 @@ deRoundingMode deGetRoundingMode (void)
case _RC_CHOP: return DE_ROUNDINGMODE_TO_ZERO;
case _RC_UP: return DE_ROUNDINGMODE_TO_POSITIVE_INF;
case _RC_DOWN: return DE_ROUNDINGMODE_TO_NEGATIVE_INF;
- case _RC_NEAR: return DE_ROUNDINGMODE_TO_NEAREST;
+ case _RC_NEAR: return DE_ROUNDINGMODE_TO_NEAREST_EVEN;
default: return DE_ROUNDINGMODE_LAST;
}
#elif (DE_COMPILER == DE_COMPILER_GCC) || (DE_COMPILER == DE_COMPILER_CLANG)
@@ -56,7 +56,7 @@ deRoundingMode deGetRoundingMode (void)
case FE_TOWARDZERO: return DE_ROUNDINGMODE_TO_ZERO;
case FE_UPWARD: return DE_ROUNDINGMODE_TO_POSITIVE_INF;
case FE_DOWNWARD: return DE_ROUNDINGMODE_TO_NEGATIVE_INF;
- case FE_TONEAREST: return DE_ROUNDINGMODE_TO_NEAREST;
+ case FE_TONEAREST: return DE_ROUNDINGMODE_TO_NEAREST_EVEN;
default: return DE_ROUNDINGMODE_LAST;
}
#else
@@ -76,7 +76,7 @@ deBool deSetRoundingMode (deRoundingMode mode)
case DE_ROUNDINGMODE_TO_ZERO: flag = _RC_CHOP; break;
case DE_ROUNDINGMODE_TO_POSITIVE_INF: flag = _RC_UP; break;
case DE_ROUNDINGMODE_TO_NEGATIVE_INF: flag = _RC_DOWN; break;
- case DE_ROUNDINGMODE_TO_NEAREST: flag = _RC_NEAR; break;
+ case DE_ROUNDINGMODE_TO_NEAREST_EVEN: flag = _RC_NEAR; break;
default:
DE_ASSERT(DE_FALSE);
}
@@ -92,7 +92,7 @@ deBool deSetRoundingMode (deRoundingMode mode)
case DE_ROUNDINGMODE_TO_ZERO: flag = FE_TOWARDZERO; break;
case DE_ROUNDINGMODE_TO_POSITIVE_INF: flag = FE_UPWARD; break;
case DE_ROUNDINGMODE_TO_NEGATIVE_INF: flag = FE_DOWNWARD; break;
- case DE_ROUNDINGMODE_TO_NEAREST: flag = FE_TONEAREST; break;
+ case DE_ROUNDINGMODE_TO_NEAREST_EVEN: flag = FE_TONEAREST; break;
default:
DE_ASSERT(DE_FALSE);
}
diff --git a/framework/delibs/debase/deMath.h b/framework/delibs/debase/deMath.h
index 4ab86fb87..61f2d1483 100644
--- a/framework/delibs/debase/deMath.h
+++ b/framework/delibs/debase/deMath.h
@@ -45,7 +45,7 @@ DE_BEGIN_EXTERN_C
typedef enum deRoundingMode_e
{
- DE_ROUNDINGMODE_TO_NEAREST = 0,
+ DE_ROUNDINGMODE_TO_NEAREST_EVEN = 0,
DE_ROUNDINGMODE_TO_ZERO,
DE_ROUNDINGMODE_TO_POSITIVE_INF,
DE_ROUNDINGMODE_TO_NEGATIVE_INF,