author    Zhang Xianyi <traits.zhang@gmail.com>  2017-07-03 13:48:29 +0800
committer GitHub <noreply@github.com>            2017-07-03 13:48:29 +0800
commit    a6515bb858c0f0b01abde0e0908366755ece2ef0 (patch)
tree      d81144cc14f6ead3f0266b47f0395e35af50fa9e
parent    c66b842d66c5516e52804bf5a0544d18b1da1b44 (diff)
parent    bd831a03a80d642693c786f7a65265ad40a50fc0 (diff)
Merge pull request #1218 from m-brow/power9
Optimise loads on Power9 LE
-rw-r--r--  kernel/power/casum_microk_power8.c   32
-rw-r--r--  kernel/power/ccopy_microk_power8.c  128
-rw-r--r--  kernel/power/cswap_microk_power8.c  128
-rw-r--r--  kernel/power/sasum_microk_power8.c   32
-rw-r--r--  kernel/power/scopy_microk_power8.c   64
-rw-r--r--  kernel/power/sdot_microk_power8.c    64
-rw-r--r--  kernel/power/srot_microk_power8.c    64
-rw-r--r--  kernel/power/sscal_microk_power8.c   80
-rw-r--r--  kernel/power/sswap_microk_power8.c   64
9 files changed, 328 insertions, 328 deletions
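Note on the change below: every word-ordered VSX load/store (lxvw4x/stxvw4x) in the single-precision and single-complex POWER8 microkernels is replaced by the doubleword form (lxvd2x/stxvd2x). Both forms move the same 16 bytes; on little-endian machines they differ only in the lane order the data takes inside the vector register. These kernels do nothing but per-lane arithmetic (xvabssp, xvaddsp, xvmulsp, xvmaddasp), full-lane reductions, or a store with the matching store form applied to both operands, so the lane permutation cancels out and the numerical results are unchanged, while (per the PR title) the d2x forms load faster on POWER9 LE. The sketch below is a plain-C scalar model, not the kernel itself: it only illustrates why a lane permutation cannot change an asum-style result. The particular permutation shown (a doubleword swap) is an assumption used for illustration.

/* Scalar model: a lane-permuted "load" gives the same abs-sum as an
 * in-order one, because |x| is applied per lane and all lanes are summed.
 * Build with e.g. `cc -O2 model.c -lm` (hypothetical file name). */
#include <math.h>
#include <stdio.h>

/* one possible lane ordering: natural order */
static void load_natural(const float *p, float v[4])
{
    for (int i = 0; i < 4; i++) v[i] = p[i];
}

/* a permuted lane ordering: doubleword swap (assumed here to stand in for
 * the difference between the w4x and d2x lane orders on LE) */
static void load_permuted(const float *p, float v[4])
{
    v[0] = p[2]; v[1] = p[3]; v[2] = p[0]; v[3] = p[1];
}

/* per-lane |x| followed by a full reduction, as in casum/sasum */
static float abs_sum(const float v[4])
{
    return fabsf(v[0]) + fabsf(v[1]) + fabsf(v[2]) + fabsf(v[3]);
}

int main(void)
{
    const float x[4] = { -1.0f, 2.5f, -3.25f, 4.0f };
    float a[4], b[4];

    load_natural(x, a);
    load_permuted(x, b);

    /* both orderings reduce to the same sum of absolute values (10.75) */
    printf("natural: %g  permuted: %g\n", abs_sum(a), abs_sum(b));
    return 0;
}

The same cancellation argument covers the other kernels: copy and swap store back through the matching permuted store, and dot/rot apply the identical permutation to both input vectors, so lanewise products and sums still pair the right elements.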
diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c
index 93ba50660..7d12c9885 100644
--- a/kernel/power/casum_microk_power8.c
+++ b/kernel/power/casum_microk_power8.c
@@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %8, %2 \n\t"
- "lxvw4x 42, %9, %2 \n\t"
- "lxvw4x 43, %10, %2 \n\t"
- "lxvw4x 44, %11, %2 \n\t"
- "lxvw4x 45, %12, %2 \n\t"
- "lxvw4x 46, %13, %2 \n\t"
- "lxvw4x 47, %14, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t"
@@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x)
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %8, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
"xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t"
- "lxvw4x 42, %9, %2 \n\t"
- "lxvw4x 43, %10, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
"xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t"
- "lxvw4x 44, %11, %2 \n\t"
- "lxvw4x 45, %12, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"
- "lxvw4x 46, %13, %2 \n\t"
- "lxvw4x 47, %14, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"
diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c
index b2b1bead1..613c4d286 100644
--- a/kernel/power/ccopy_microk_power8.c
+++ b/kernel/power/ccopy_microk_power8.c
@@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
{
__asm__
(
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %5, %2 \n\t"
- "lxvw4x 34, %6, %2 \n\t"
- "lxvw4x 35, %7, %2 \n\t"
- "lxvw4x 36, %8, %2 \n\t"
- "lxvw4x 37, %9, %2 \n\t"
- "lxvw4x 38, %10, %2 \n\t"
- "lxvw4x 39, %11, %2 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
@@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %5, %2 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "lxvw4x 34, %6, %2 \n\t"
- "lxvw4x 35, %7, %2 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "lxvw4x 36, %8, %2 \n\t"
- "lxvw4x 37, %9, %2 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
- "lxvw4x 38, %10, %2 \n\t"
- "lxvw4x 39, %11, %2 \n\t"
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
@@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
"2: \n\t"
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n"
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:
diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c
index 1dd03dc88..8d7d0c0b9 100644
--- a/kernel/power/cswap_microk_power8.c
+++ b/kernel/power/cswap_microk_power8.c
@@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
- "lxvw4x 32, 0, %4 \n\t"
- "lxvw4x 33, %5, %4 \n\t"
- "lxvw4x 34, %6, %4 \n\t"
- "lxvw4x 35, %7, %4 \n\t"
- "lxvw4x 36, %8, %4 \n\t"
- "lxvw4x 37, %9, %4 \n\t"
- "lxvw4x 38, %10, %4 \n\t"
- "lxvw4x 39, %11, %4 \n\t"
+ "lxvd2x 32, 0, %4 \n\t"
+ "lxvd2x 33, %5, %4 \n\t"
+ "lxvd2x 34, %6, %4 \n\t"
+ "lxvd2x 35, %7, %4 \n\t"
+ "lxvd2x 36, %8, %4 \n\t"
+ "lxvd2x 37, %9, %4 \n\t"
+ "lxvd2x 38, %10, %4 \n\t"
+ "lxvd2x 39, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
- "lxvw4x 40, 0, %4 \n\t"
- "lxvw4x 41, %5, %4 \n\t"
- "lxvw4x 42, %6, %4 \n\t"
- "lxvw4x 43, %7, %4 \n\t"
- "lxvw4x 44, %8, %4 \n\t"
- "lxvw4x 45, %9, %4 \n\t"
- "lxvw4x 46, %10, %4 \n\t"
- "lxvw4x 47, %11, %4 \n\t"
+ "lxvd2x 40, 0, %4 \n\t"
+ "lxvd2x 41, %5, %4 \n\t"
+ "lxvd2x 42, %6, %4 \n\t"
+ "lxvd2x 43, %7, %4 \n\t"
+ "lxvd2x 44, %8, %4 \n\t"
+ "lxvd2x 45, %9, %4 \n\t"
+ "lxvd2x 46, %10, %4 \n\t"
+ "lxvd2x 47, %11, %4 \n\t"
"addi %4, %4, -128 \n\t"
- "lxvw4x 48, 0, %3 \n\t"
- "lxvw4x 49, %5, %3 \n\t"
- "lxvw4x 50, %6, %3 \n\t"
- "lxvw4x 51, %7, %3 \n\t"
- "lxvw4x 0, %8, %3 \n\t"
- "lxvw4x 1, %9, %3 \n\t"
- "lxvw4x 2, %10, %3 \n\t"
- "lxvw4x 3, %11, %3 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 49, %5, %3 \n\t"
+ "lxvd2x 50, %6, %3 \n\t"
+ "lxvd2x 51, %7, %3 \n\t"
+ "lxvd2x 0, %8, %3 \n\t"
+ "lxvd2x 1, %9, %3 \n\t"
+ "lxvd2x 2, %10, %3 \n\t"
+ "lxvd2x 3, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
- "lxvw4x 4, 0, %3 \n\t"
- "lxvw4x 5, %5, %3 \n\t"
- "lxvw4x 6, %6, %3 \n\t"
- "lxvw4x 7, %7, %3 \n\t"
- "lxvw4x 8, %8, %3 \n\t"
- "lxvw4x 9, %9, %3 \n\t"
- "lxvw4x 10, %10, %3 \n\t"
- "lxvw4x 11, %11, %3 \n\t"
+ "lxvd2x 4, 0, %3 \n\t"
+ "lxvd2x 5, %5, %3 \n\t"
+ "lxvd2x 6, %6, %3 \n\t"
+ "lxvd2x 7, %7, %3 \n\t"
+ "lxvd2x 8, %8, %3 \n\t"
+ "lxvd2x 9, %9, %3 \n\t"
+ "lxvd2x 10, %10, %3 \n\t"
+ "lxvd2x 11, %11, %3 \n\t"
"addi %3, %3, -128 \n\t"
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n\t"
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
- "stxvw4x 48, 0, %4 \n\t"
- "stxvw4x 49, %5, %4 \n\t"
- "stxvw4x 50, %6, %4 \n\t"
- "stxvw4x 51, %7, %4 \n\t"
- "stxvw4x 0, %8, %4 \n\t"
- "stxvw4x 1, %9, %4 \n\t"
- "stxvw4x 2, %10, %4 \n\t"
- "stxvw4x 3, %11, %4 \n\t"
+ "stxvd2x 48, 0, %4 \n\t"
+ "stxvd2x 49, %5, %4 \n\t"
+ "stxvd2x 50, %6, %4 \n\t"
+ "stxvd2x 51, %7, %4 \n\t"
+ "stxvd2x 0, %8, %4 \n\t"
+ "stxvd2x 1, %9, %4 \n\t"
+ "stxvd2x 2, %10, %4 \n\t"
+ "stxvd2x 3, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
- "stxvw4x 4, 0, %4 \n\t"
- "stxvw4x 5, %5, %4 \n\t"
- "stxvw4x 6, %6, %4 \n\t"
- "stxvw4x 7, %7, %4 \n\t"
- "stxvw4x 8, %8, %4 \n\t"
- "stxvw4x 9, %9, %4 \n\t"
- "stxvw4x 10, %10, %4 \n\t"
- "stxvw4x 11, %11, %4 \n\t"
+ "stxvd2x 4, 0, %4 \n\t"
+ "stxvd2x 5, %5, %4 \n\t"
+ "stxvd2x 6, %6, %4 \n\t"
+ "stxvd2x 7, %7, %4 \n\t"
+ "stxvd2x 8, %8, %4 \n\t"
+ "stxvd2x 9, %9, %4 \n\t"
+ "stxvd2x 10, %10, %4 \n\t"
+ "stxvd2x 11, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c
index 08a766f80..4bb515de8 100644
--- a/kernel/power/sasum_microk_power8.c
+++ b/kernel/power/sasum_microk_power8.c
@@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %8, %2 \n\t"
- "lxvw4x 42, %9, %2 \n\t"
- "lxvw4x 43, %10, %2 \n\t"
- "lxvw4x 44, %11, %2 \n\t"
- "lxvw4x 45, %12, %2 \n\t"
- "lxvw4x 46, %13, %2 \n\t"
- "lxvw4x 47, %14, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t"
@@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x)
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %8, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
"xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t"
- "lxvw4x 42, %9, %2 \n\t"
- "lxvw4x 43, %10, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
"xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t"
- "lxvw4x 44, %11, %2 \n\t"
- "lxvw4x 45, %12, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"
- "lxvw4x 46, %13, %2 \n\t"
- "lxvw4x 47, %14, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"
diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c
index 444a6d4d5..7a54d5e1e 100644
--- a/kernel/power/scopy_microk_power8.c
+++ b/kernel/power/scopy_microk_power8.c
@@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
{
__asm__
(
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
@@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
@@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
"2: \n\t"
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n"
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:
diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c
index 7f7ccfac3..bfe100c8b 100644
--- a/kernel/power/sdot_microk_power8.c
+++ b/kernel/power/sdot_microk_power8.c
@@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 48, 0, %3 \n\t"
- "lxvw4x 41, %10, %2 \n\t"
- "lxvw4x 49, %10, %3 \n\t"
- "lxvw4x 42, %11, %2 \n\t"
- "lxvw4x 50, %11, %3 \n\t"
- "lxvw4x 43, %12, %2 \n\t"
- "lxvw4x 51, %12, %3 \n\t"
- "lxvw4x 44, %13, %2 \n\t"
- "lxvw4x %x4, %13, %3 \n\t"
- "lxvw4x 45, %14, %2 \n\t"
- "lxvw4x %x5, %14, %3 \n\t"
- "lxvw4x 46, %15, %2 \n\t"
- "lxvw4x %x6, %15, %3 \n\t"
- "lxvw4x 47, %16, %2 \n\t"
- "lxvw4x %x7, %16, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 41, %10, %2 \n\t"
+ "lxvd2x 49, %10, %3 \n\t"
+ "lxvd2x 42, %11, %2 \n\t"
+ "lxvd2x 50, %11, %3 \n\t"
+ "lxvd2x 43, %12, %2 \n\t"
+ "lxvd2x 51, %12, %3 \n\t"
+ "lxvd2x 44, %13, %2 \n\t"
+ "lxvd2x %x4, %13, %3 \n\t"
+ "lxvd2x 45, %14, %2 \n\t"
+ "lxvd2x %x5, %14, %3 \n\t"
+ "lxvd2x 46, %15, %2 \n\t"
+ "lxvd2x %x6, %15, %3 \n\t"
+ "lxvd2x 47, %16, %2 \n\t"
+ "lxvd2x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
@@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"1: \n\t"
"xvmaddasp 32, 40, 48 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 48, 0, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
"xvmaddasp 33, 41, 49 \n\t"
- "lxvw4x 41, %10, %2 \n\t"
- "lxvw4x 49, %10, %3 \n\t"
+ "lxvd2x 41, %10, %2 \n\t"
+ "lxvd2x 49, %10, %3 \n\t"
"xvmaddasp 34, 42, 50 \n\t"
- "lxvw4x 42, %11, %2 \n\t"
- "lxvw4x 50, %11, %3 \n\t"
+ "lxvd2x 42, %11, %2 \n\t"
+ "lxvd2x 50, %11, %3 \n\t"
"xvmaddasp 35, 43, 51 \n\t"
- "lxvw4x 43, %12, %2 \n\t"
- "lxvw4x 51, %12, %3 \n\t"
+ "lxvd2x 43, %12, %2 \n\t"
+ "lxvd2x 51, %12, %3 \n\t"
"xvmaddasp 36, 44, %x4 \n\t"
- "lxvw4x 44, %13, %2 \n\t"
- "lxvw4x %x4, %13, %3 \n\t"
+ "lxvd2x 44, %13, %2 \n\t"
+ "lxvd2x %x4, %13, %3 \n\t"
"xvmaddasp 37, 45, %x5 \n\t"
- "lxvw4x 45, %14, %2 \n\t"
- "lxvw4x %x5, %14, %3 \n\t"
+ "lxvd2x 45, %14, %2 \n\t"
+ "lxvd2x %x5, %14, %3 \n\t"
"xvmaddasp 38, 46, %x6 \n\t"
- "lxvw4x 46, %15, %2 \n\t"
- "lxvw4x %x6, %15, %3 \n\t"
+ "lxvd2x 46, %15, %2 \n\t"
+ "lxvd2x %x6, %15, %3 \n\t"
"xvmaddasp 39, 47, %x7 \n\t"
- "lxvw4x 47, %16, %2 \n\t"
- "lxvw4x %x7, %16, %3 \n\t"
+ "lxvd2x 47, %16, %2 \n\t"
+ "lxvd2x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c
index 0a18c16e0..6eecb60a1 100644
--- a/kernel/power/srot_microk_power8.c
+++ b/kernel/power/srot_microk_power8.c
@@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xscvdpspn 37, %x14 \n\t" // load s to all words
"xxspltw 37, 37, 0 \n\t"
- "lxvw4x 32, 0, %3 \n\t" // load x
- "lxvw4x 33, %15, %3 \n\t"
- "lxvw4x 34, %16, %3 \n\t"
- "lxvw4x 35, %17, %3 \n\t"
+ "lxvd2x 32, 0, %3 \n\t" // load x
+ "lxvd2x 33, %15, %3 \n\t"
+ "lxvd2x 34, %16, %3 \n\t"
+ "lxvd2x 35, %17, %3 \n\t"
- "lxvw4x 48, 0, %4 \n\t" // load y
- "lxvw4x 49, %15, %4 \n\t"
- "lxvw4x 50, %16, %4 \n\t"
- "lxvw4x 51, %17, %4 \n\t"
+ "lxvd2x 48, 0, %4 \n\t" // load y
+ "lxvd2x 49, %15, %4 \n\t"
+ "lxvd2x 50, %16, %4 \n\t"
+ "lxvd2x 51, %17, %4 \n\t"
"addi %3, %3, 64 \n\t"
"addi %4, %4, 64 \n\t"
@@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
- "lxvw4x 32, 0, %3 \n\t" // load x
- "lxvw4x 33, %15, %3 \n\t"
+ "lxvd2x 32, 0, %3 \n\t" // load x
+ "lxvd2x 33, %15, %3 \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
- "lxvw4x 34, %16, %3 \n\t"
- "lxvw4x 35, %17, %3 \n\t"
+ "lxvd2x 34, %16, %3 \n\t"
+ "lxvd2x 35, %17, %3 \n\t"
"xvmulsp %x9, 48, 37 \n\t" // s * y
"xvmulsp %x10, 49, 37 \n\t"
- "lxvw4x 48, 0, %4 \n\t" // load y
- "lxvw4x 49, %15, %4 \n\t"
+ "lxvd2x 48, 0, %4 \n\t" // load y
+ "lxvd2x 49, %15, %4 \n\t"
"xvmulsp %x11, 50, 37 \n\t"
"xvmulsp %x12, 51, 37 \n\t"
- "lxvw4x 50, %16, %4 \n\t"
- "lxvw4x 51, %17, %4 \n\t"
+ "lxvd2x 50, %16, %4 \n\t"
+ "lxvd2x 51, %17, %4 \n\t"
"xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
"xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
@@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
- "stxvw4x 40, 0, %3 \n\t" // store x
- "stxvw4x 41, %15, %3 \n\t"
- "stxvw4x 42, %16, %3 \n\t"
- "stxvw4x 43, %17, %3 \n\t"
+ "stxvd2x 40, 0, %3 \n\t" // store x
+ "stxvd2x 41, %15, %3 \n\t"
+ "stxvd2x 42, %16, %3 \n\t"
+ "stxvd2x 43, %17, %3 \n\t"
- "stxvw4x %x5, 0, %4 \n\t" // store y
- "stxvw4x %x6, %15, %4 \n\t"
- "stxvw4x %x7, %16, %4 \n\t"
- "stxvw4x %x8, %17, %4 \n\t"
+ "stxvd2x %x5, 0, %4 \n\t" // store y
+ "stxvd2x %x6, %15, %4 \n\t"
+ "stxvd2x %x7, %16, %4 \n\t"
+ "stxvd2x %x8, %17, %4 \n\t"
"addi %3, %3, 128 \n\t"
"addi %4, %4, 128 \n\t"
@@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
- "stxvw4x 40, 0, %3 \n\t" // store x
- "stxvw4x 41, %15, %3 \n\t"
- "stxvw4x 42, %16, %3 \n\t"
- "stxvw4x 43, %17, %3 \n\t"
+ "stxvd2x 40, 0, %3 \n\t" // store x
+ "stxvd2x 41, %15, %3 \n\t"
+ "stxvd2x 42, %16, %3 \n\t"
+ "stxvd2x 43, %17, %3 \n\t"
- "stxvw4x %x5, 0, %4 \n\t" // store y
- "stxvw4x %x6, %15, %4 \n\t"
- "stxvw4x %x7, %16, %4 \n\t"
- "stxvw4x %x8, %17, %4 \n"
+ "stxvd2x %x5, 0, %4 \n\t" // store y
+ "stxvd2x %x6, %15, %4 \n\t"
+ "stxvd2x %x7, %16, %4 \n\t"
+ "stxvd2x %x8, %17, %4 \n"
"#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
"#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"
diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c
index 49862a329..058ff3399 100644
--- a/kernel/power/sscal_microk_power8.c
+++ b/kernel/power/sscal_microk_power8.c
@@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xscvdpspn %x3, %x3 \n\t"
"xxspltw %x3, %x3, 0 \n\t"
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %4, %2 \n\t"
- "lxvw4x 34, %5, %2 \n\t"
- "lxvw4x 35, %6, %2 \n\t"
- "lxvw4x 36, %7, %2 \n\t"
- "lxvw4x 37, %8, %2 \n\t"
- "lxvw4x 38, %9, %2 \n\t"
- "lxvw4x 39, %10, %2 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %4, %2 \n\t"
+ "lxvd2x 34, %5, %2 \n\t"
+ "lxvd2x 35, %6, %2 \n\t"
+ "lxvd2x 36, %7, %2 \n\t"
+ "lxvd2x 37, %8, %2 \n\t"
+ "lxvd2x 38, %9, %2 \n\t"
+ "lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, 128 \n\t"
@@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t"
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %4, %2 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %4, %2 \n\t"
"xvmulsp 42, 34, %x3 \n\t"
"xvmulsp 43, 35, %x3 \n\t"
- "lxvw4x 34, %5, %2 \n\t"
- "lxvw4x 35, %6, %2 \n\t"
+ "lxvd2x 34, %5, %2 \n\t"
+ "lxvd2x 35, %6, %2 \n\t"
"xvmulsp 44, 36, %x3 \n\t"
"xvmulsp 45, 37, %x3 \n\t"
- "lxvw4x 36, %7, %2 \n\t"
- "lxvw4x 37, %8, %2 \n\t"
+ "lxvd2x 36, %7, %2 \n\t"
+ "lxvd2x 37, %8, %2 \n\t"
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
- "lxvw4x 38, %9, %2 \n\t"
- "lxvw4x 39, %10, %2 \n\t"
+ "lxvd2x 38, %9, %2 \n\t"
+ "lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, -128 \n\t"
- "stxvw4x 40, 0, %2 \n\t"
- "stxvw4x 41, %4, %2 \n\t"
- "stxvw4x 42, %5, %2 \n\t"
- "stxvw4x 43, %6, %2 \n\t"
- "stxvw4x 44, %7, %2 \n\t"
- "stxvw4x 45, %8, %2 \n\t"
- "stxvw4x 46, %9, %2 \n\t"
- "stxvw4x 47, %10, %2 \n\t"
+ "stxvd2x 40, 0, %2 \n\t"
+ "stxvd2x 41, %4, %2 \n\t"
+ "stxvd2x 42, %5, %2 \n\t"
+ "stxvd2x 43, %6, %2 \n\t"
+ "stxvd2x 44, %7, %2 \n\t"
+ "stxvd2x 45, %8, %2 \n\t"
+ "stxvd2x 46, %9, %2 \n\t"
+ "stxvd2x 47, %10, %2 \n\t"
"addi %2, %2, 256 \n\t"
@@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
- "stxvw4x 40, 0, %2 \n\t"
- "stxvw4x 41, %4, %2 \n\t"
- "stxvw4x 42, %5, %2 \n\t"
- "stxvw4x 43, %6, %2 \n\t"
- "stxvw4x 44, %7, %2 \n\t"
- "stxvw4x 45, %8, %2 \n\t"
- "stxvw4x 46, %9, %2 \n\t"
- "stxvw4x 47, %10, %2 \n"
+ "stxvd2x 40, 0, %2 \n\t"
+ "stxvd2x 41, %4, %2 \n\t"
+ "stxvd2x 42, %5, %2 \n\t"
+ "stxvd2x 43, %6, %2 \n\t"
+ "stxvd2x 44, %7, %2 \n\t"
+ "stxvd2x 45, %8, %2 \n\t"
+ "stxvd2x 46, %9, %2 \n\t"
+ "stxvd2x 47, %10, %2 \n"
"#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
:
@@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x)
".p2align 5 \n"
"1: \n\t"
- "stxvw4x %x3, 0, %2 \n\t"
- "stxvw4x %x3, %4, %2 \n\t"
- "stxvw4x %x3, %5, %2 \n\t"
- "stxvw4x %x3, %6, %2 \n\t"
- "stxvw4x %x3, %7, %2 \n\t"
- "stxvw4x %x3, %8, %2 \n\t"
- "stxvw4x %x3, %9, %2 \n\t"
- "stxvw4x %x3, %10, %2 \n\t"
+ "stxvd2x %x3, 0, %2 \n\t"
+ "stxvd2x %x3, %4, %2 \n\t"
+ "stxvd2x %x3, %5, %2 \n\t"
+ "stxvd2x %x3, %6, %2 \n\t"
+ "stxvd2x %x3, %7, %2 \n\t"
+ "stxvd2x %x3, %8, %2 \n\t"
+ "stxvd2x %x3, %9, %2 \n\t"
+ "stxvd2x %x3, %10, %2 \n\t"
"addi %2, %2, 128 \n\t"
diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c
index d44f16765..cfefdd6ef 100644
--- a/kernel/power/sswap_microk_power8.c
+++ b/kernel/power/sswap_microk_power8.c
@@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
- "lxvw4x 32, 0, %4 \n\t"
- "lxvw4x 33, %5, %4 \n\t"
- "lxvw4x 34, %6, %4 \n\t"
- "lxvw4x 35, %7, %4 \n\t"
- "lxvw4x 36, %8, %4 \n\t"
- "lxvw4x 37, %9, %4 \n\t"
- "lxvw4x 38, %10, %4 \n\t"
- "lxvw4x 39, %11, %4 \n\t"
+ "lxvd2x 32, 0, %4 \n\t"
+ "lxvd2x 33, %5, %4 \n\t"
+ "lxvd2x 34, %6, %4 \n\t"
+ "lxvd2x 35, %7, %4 \n\t"
+ "lxvd2x 36, %8, %4 \n\t"
+ "lxvd2x 37, %9, %4 \n\t"
+ "lxvd2x 38, %10, %4 \n\t"
+ "lxvd2x 39, %11, %4 \n\t"
- "lxvw4x 40, 0, %3 \n\t"
- "lxvw4x 41, %5, %3 \n\t"
- "lxvw4x 42, %6, %3 \n\t"
- "lxvw4x 43, %7, %3 \n\t"
- "lxvw4x 44, %8, %3 \n\t"
- "lxvw4x 45, %9, %3 \n\t"
- "lxvw4x 46, %10, %3 \n\t"
- "lxvw4x 47, %11, %3 \n\t"
+ "lxvd2x 40, 0, %3 \n\t"
+ "lxvd2x 41, %5, %3 \n\t"
+ "lxvd2x 42, %6, %3 \n\t"
+ "lxvd2x 43, %7, %3 \n\t"
+ "lxvd2x 44, %8, %3 \n\t"
+ "lxvd2x 45, %9, %3 \n\t"
+ "lxvd2x 46, %10, %3 \n\t"
+ "lxvd2x 47, %11, %3 \n\t"
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
- "stxvw4x 40, 0, %4 \n\t"
- "stxvw4x 41, %5, %4 \n\t"
- "stxvw4x 42, %6, %4 \n\t"
- "stxvw4x 43, %7, %4 \n\t"
- "stxvw4x 44, %8, %4 \n\t"
- "stxvw4x 45, %9, %4 \n\t"
- "stxvw4x 46, %10, %4 \n\t"
- "stxvw4x 47, %11, %4 \n\t"
+ "stxvd2x 40, 0, %4 \n\t"
+ "stxvd2x 41, %5, %4 \n\t"
+ "stxvd2x 42, %6, %4 \n\t"
+ "stxvd2x 43, %7, %4 \n\t"
+ "stxvd2x 44, %8, %4 \n\t"
+ "stxvd2x 45, %9, %4 \n\t"
+ "stxvd2x 46, %10, %4 \n\t"
+ "stxvd2x 47, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"