summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ashwin Sekhar T K <ashwin.sekhar@cavium.com> 2017-07-02 02:00:48 +0530
committer Ashwin Sekhar T K <ashwin.sekhar@cavium.com> 2017-07-02 02:00:48 +0530
commit eda9e8632ab7d94d609006612a4b760214dfa847 (patch)
tree 1e58b54c1e35eee3b050e66dceaab69f65b930ac
parent 8f83d3f961f57fb002d8c5359c32a8db50dcab5d (diff)
download openblas-eda9e8632ab7d94d609006612a4b760214dfa847.tar.gz
openblas-eda9e8632ab7d94d609006612a4b760214dfa847.tar.bz2
openblas-eda9e8632ab7d94d609006612a4b760214dfa847.zip
generic: Bug fixes in generic 4x2 and 4x4 gemm kernels
-rw-r--r-- kernel/generic/gemmkernel_4x2.c 30
-rw-r--r-- kernel/generic/gemmkernel_4x4.c 104
2 files changed, 67 insertions, 67 deletions
diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c
index 1d15de1d7..8c784e2f1 100644
--- a/kernel/generic/gemmkernel_4x2.c
+++ b/kernel/generic/gemmkernel_4x2.c
@@ -154,11 +154,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res1_0 *= alpha;
res1_1 *= alpha;
- C0[0] = res0_0;
- C0[1] = res0_1;
+ C0[0] += res0_0;
+ C0[1] += res0_1;
- C1[0] = res1_0;
- C1[1] = res1_1;
+ C1[0] += res1_0;
+ C1[1] += res1_1;
C0 = C0+2;
C1 = C1+2;
@@ -190,12 +190,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res1_0 *= alpha;
- C0[0] = res0_0;
+ C0[0] += res0_0;
- C1[0] = res1_0;
+ C1[0] += res1_0;
- C0 = C0+1;
- C1 = C1+1;
+ C0 += C0+1;
+ C1 += C1+1;
}
@@ -245,10 +245,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res0_2 *= alpha;
res0_3 *= alpha;
- C0[0] = res0_0;
- C0[1] = res0_1;
- C0[2] = res0_2;
- C0[3] = res0_3;
+ C0[0] += res0_0;
+ C0[1] += res0_1;
+ C0[2] += res0_2;
+ C0[3] += res0_3;
C0 = C0+4;
@@ -278,8 +278,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res0_0 *= alpha;
res0_1 *= alpha;
- C0[0] = res0_0;
- C0[1] = res0_1;
+ C0[0] += res0_0;
+ C0[1] += res0_1;
C0 = C0+2;
@@ -306,7 +306,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[0] = res0_0;
- C0 = C0+1;
+ C0 += C0+1;
}
k = (bk<<0);
diff --git a/kernel/generic/gemmkernel_4x4.c b/kernel/generic/gemmkernel_4x4.c
index bd67b3fc8..99bd9c1ef 100644
--- a/kernel/generic/gemmkernel_4x4.c
+++ b/kernel/generic/gemmkernel_4x4.c
@@ -152,25 +152,25 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res3_2 *= alpha;
res3_3 *= alpha;
- C0[0] = res0_0;
- C0[1] = res0_1;
- C0[2] = res0_2;
- C0[3] = res0_3;
-
- C1[0] = res1_0;
- C1[1] = res1_1;
- C1[2] = res1_2;
- C1[3] = res1_3;
-
- C2[0] = res2_0;
- C2[1] = res2_1;
- C2[2] = res2_2;
- C2[3] = res2_3;
-
- C3[0] = res3_0;
- C3[1] = res3_1;
- C3[2] = res3_2;
- C3[3] = res3_3;
+ C0[0] += res0_0;
+ C0[1] += res0_1;
+ C0[2] += res0_2;
+ C0[3] += res0_3;
+
+ C1[0] += res1_0;
+ C1[1] += res1_1;
+ C1[2] += res1_2;
+ C1[3] += res1_3;
+
+ C2[0] += res2_0;
+ C2[1] += res2_1;
+ C2[2] += res2_2;
+ C2[3] += res2_3;
+
+ C3[0] += res3_0;
+ C3[1] += res3_1;
+ C3[2] += res3_2;
+ C3[3] += res3_3;
C0 = C0+4;
C1 = C1+4;
@@ -230,17 +230,17 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res3_0 *= alpha;
res3_1 *= alpha;
- C0[0] = res0_0;
- C0[1] = res0_1;
+ C0[0] += res0_0;
+ C0[1] += res0_1;
- C1[0] = res1_0;
- C1[1] = res1_1;
+ C1[0] += res1_0;
+ C1[1] += res1_1;
- C2[0] = res2_0;
- C2[1] = res2_1;
+ C2[0] += res2_0;
+ C2[1] += res2_1;
- C3[0] = res3_0;
- C3[1] = res3_1;
+ C3[0] += res3_0;
+ C3[1] += res3_1;
C0 = C0+2;
C1 = C1+2;
@@ -283,13 +283,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res3_0 *= alpha;
- C0[0] = res0_0;
+ C0[0] += res0_0;
- C1[0] = res1_0;
+ C1[0] += res1_0;
- C2[0] = res2_0;
+ C2[0] += res2_0;
- C3[0] = res3_0;
+ C3[0] += res3_0;
C0 = C0+1;
C1 = C1+1;
@@ -360,15 +360,15 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res1_2 *= alpha;
res1_3 *= alpha;
- C0[0] = res0_0;
- C0[1] = res0_1;
- C0[2] = res0_2;
- C0[3] = res0_3;
+ C0[0] += res0_0;
+ C0[1] += res0_1;
+ C0[2] += res0_2;
+ C0[3] += res0_3;
- C1[0] = res1_0;
- C1[1] = res1_1;
- C1[2] = res1_2;
- C1[3] = res1_3;
+ C1[0] += res1_0;
+ C1[1] += res1_1;
+ C1[2] += res1_2;
+ C1[3] += res1_3;
C0 = C0+4;
C1 = C1+4;
@@ -408,11 +408,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res1_0 *= alpha;
res1_1 *= alpha;
- C0[0] = res0_0;
- C0[1] = res0_1;
+ C0[0] += res0_0;
+ C0[1] += res0_1;
- C1[0] = res1_0;
- C1[1] = res1_1;
+ C1[0] += res1_0;
+ C1[1] += res1_1;
C0 = C0+2;
C1 = C1+2;
@@ -444,9 +444,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res1_0 *= alpha;
- C0[0] = res0_0;
+ C0[0] += res0_0;
- C1[0] = res1_0;
+ C1[0] += res1_0;
C0 = C0+1;
C1 = C1+1;
@@ -499,10 +499,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res0_2 *= alpha;
res0_3 *= alpha;
- C0[0] = res0_0;
- C0[1] = res0_1;
- C0[2] = res0_2;
- C0[3] = res0_3;
+ C0[0] += res0_0;
+ C0[1] += res0_1;
+ C0[2] += res0_2;
+ C0[3] += res0_3;
C0 = C0+4;
@@ -532,8 +532,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res0_0 *= alpha;
res0_1 *= alpha;
- C0[0] = res0_0;
- C0[1] = res0_1;
+ C0[0] += res0_0;
+ C0[1] += res0_1;
C0 = C0+2;
@@ -558,7 +558,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res0_0 *= alpha;
- C0[0] = res0_0;
+ C0[0] += res0_0;
C0 = C0+1;