diff options
author | Matt Brown <matthew.brown.dev@gmail.com> | 2017-06-14 16:23:20 +1000 |
---|---|---|
committer | Matt Brown <matthew.brown.dev@gmail.com> | 2017-06-14 16:59:13 +1000 |
commit | 6f4eca5ea4ab00726199277bb7a079900d20d388 (patch) | |
tree | e0011237ff1ba9498bad24adfd808636903d1749 | |
parent | be55f96cbdc919c7c7da2da2f7a2c6c47336a9f6 (diff) | |
download | openblas-6f4eca5ea4ab00726199277bb7a079900d20d388.tar.gz openblas-6f4eca5ea4ab00726199277bb7a079900d20d388.tar.bz2 openblas-6f4eca5ea4ab00726199277bb7a079900d20d388.zip |
Optimise sswap for POWER9
Use lxvd2x instruction instead of lxvw4x.
lxvd2x performs far better on the new POWER architecture than lxvw4x.
-rw-r--r-- | kernel/power/sswap_microk_power8.c | 64 |
1 files changed, 32 insertions, 32 deletions
diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c index d44f16765..cfefdd6ef 100644 --- a/kernel/power/sswap_microk_power8.c +++ b/kernel/power/sswap_microk_power8.c @@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "lxvw4x 32, 0, %4 \n\t" - "lxvw4x 33, %5, %4 \n\t" - "lxvw4x 34, %6, %4 \n\t" - "lxvw4x 35, %7, %4 \n\t" - "lxvw4x 36, %8, %4 \n\t" - "lxvw4x 37, %9, %4 \n\t" - "lxvw4x 38, %10, %4 \n\t" - "lxvw4x 39, %11, %4 \n\t" + "lxvd2x 32, 0, %4 \n\t" + "lxvd2x 33, %5, %4 \n\t" + "lxvd2x 34, %6, %4 \n\t" + "lxvd2x 35, %7, %4 \n\t" + "lxvd2x 36, %8, %4 \n\t" + "lxvd2x 37, %9, %4 \n\t" + "lxvd2x 38, %10, %4 \n\t" + "lxvd2x 39, %11, %4 \n\t" - "lxvw4x 40, 0, %3 \n\t" - "lxvw4x 41, %5, %3 \n\t" - "lxvw4x 42, %6, %3 \n\t" - "lxvw4x 43, %7, %3 \n\t" - "lxvw4x 44, %8, %3 \n\t" - "lxvw4x 45, %9, %3 \n\t" - "lxvw4x 46, %10, %3 \n\t" - "lxvw4x 47, %11, %3 \n\t" + "lxvd2x 40, 0, %3 \n\t" + "lxvd2x 41, %5, %3 \n\t" + "lxvd2x 42, %6, %3 \n\t" + "lxvd2x 43, %7, %3 \n\t" + "lxvd2x 44, %8, %3 \n\t" + "lxvd2x 45, %9, %3 \n\t" + "lxvd2x 46, %10, %3 \n\t" + "lxvd2x 47, %11, %3 \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %4 \n\t" - "stxvw4x 41, %5, %4 \n\t" - "stxvw4x 42, %6, %4 \n\t" - "stxvw4x 43, %7, %4 \n\t" - "stxvw4x 44, %8, %4 \n\t" - "stxvw4x 45, %9, %4 \n\t" - "stxvw4x 46, %10, %4 \n\t" - "stxvw4x 47, %11, %4 \n\t" + "stxvd2x 40, 0, %4 \n\t" + "stxvd2x 41, %5, %4 \n\t" + "stxvd2x 42, %6, %4 \n\t" + "stxvd2x 43, %7, %4 \n\t" + "stxvd2x 44, %8, %4 \n\t" + "stxvd2x 45, %9, %4 \n\t" + "stxvd2x 46, %10, %4 \n\t" + "stxvd2x 47, %11, %4 \n\t" "addi %4, %4, 128 \n\t" |