summaryrefslogtreecommitdiff
path: root/symcopy.h
diff options
context:
space:
mode:
authorXianyi Zhang <traits.zhang@gmail.com>2011-01-24 14:54:24 +0000
committerXianyi Zhang <traits.zhang@gmail.com>2011-01-24 14:54:24 +0000
commit342bbc3871d1b43f548e9d1ae9d380a1d4989cb3 (patch)
tree385fc6d27e282ae1bb06d685833c5325615c8169 /symcopy.h
downloadopenblas-342bbc3871d1b43f548e9d1ae9d380a1d4989cb3.tar.gz
openblas-342bbc3871d1b43f548e9d1ae9d380a1d4989cb3.tar.bz2
openblas-342bbc3871d1b43f548e9d1ae9d380a1d4989cb3.zip
Import GotoBLAS2 1.13 BSD version codes.
Diffstat (limited to 'symcopy.h')
-rw-r--r--symcopy.h1873
1 files changed, 1873 insertions, 0 deletions
diff --git a/symcopy.h b/symcopy.h
new file mode 100644
index 000000000..ed6e5b417
--- /dev/null
+++ b/symcopy.h
@@ -0,0 +1,1873 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+/* This implementation is completely wrong. I'll rewrite this */
+
+#ifndef SYMCOPY_H
+#define SYMCOPY_H
+
+#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
+
+/*
+ * SYMCOPY_L: build a dense m-by-m matrix in b from the lower triangle of a.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; only entries on or below the diagonal are read
+ *   lda - distance between consecutive columns of a, in FLOATs
+ *   b   - column-major destination whose column stride is m
+ *
+ * Columns are handled two at a time.  Every entry read strictly below the
+ * diagonal is stored at its own position and at the transposed position.
+ */
+static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  BLASLONG i, j;
+  FLOAT *src0, *src1;   /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;   /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;   /* columns i and i+1 of b, starting at row j */
+  FLOAT x00, x10, x01, x11;
+
+  for (j = 0; j < m; j += 2) {
+    src0 = a + j * lda;
+    dst0 = b + j * m;
+
+    if (m - j == 1) {
+      /* Odd m: the last column contributes only its diagonal entry. */
+      x00 = src0[j];
+      dst0[j] = x00;
+      continue;
+    }
+
+    src1 = src0 + lda;
+    dst1 = dst0 + m;
+
+    /* 2x2 diagonal block; a(j, j+1) is never read. */
+    x00 = src0[j];
+    x10 = src0[j + 1];
+    x11 = src1[j + 1];
+
+    dst0[j]     = x00;
+    dst0[j + 1] = x10;
+    dst1[j]     = x10;
+    dst1[j + 1] = x11;
+
+    /* Full 2x2 blocks below the diagonal block. */
+    for (i = j + 2; i + 1 < m; i += 2) {
+      x00 = src0[i];
+      x10 = src0[i + 1];
+      x01 = src1[i];
+      x11 = src1[i + 1];
+
+      dst0[i]     = x00;
+      dst0[i + 1] = x10;
+      dst1[i]     = x01;
+      dst1[i + 1] = x11;
+
+      mir0 = b + i * m + j;
+      mir1 = mir0 + m;
+
+      mir0[0] = x00;
+      mir0[1] = x01;
+      mir1[0] = x10;
+      mir1[1] = x11;
+    }
+
+    /* Odd m leaves a single 1x2 strip in the last row. */
+    if (i < m) {
+      x00 = src0[i];
+      x01 = src1[i];
+
+      dst0[i] = x00;
+      dst1[i] = x01;
+
+      mir0 = b + i * m + j;
+      mir0[0] = x00;
+      mir0[1] = x01;
+    }
+  }
+}
+
+/*
+ * SYMCOPY_U: build a dense m-by-m matrix in b from the upper triangle of a.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; only entries on or above the diagonal are read
+ *   lda - distance between consecutive columns of a, in FLOATs
+ *   b   - column-major destination whose column stride is m
+ *
+ * Columns are handled two at a time.  Every entry read strictly above the
+ * diagonal is stored at its own position and at the transposed position.
+ */
+static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  BLASLONG i, j;
+  FLOAT *src0, *src1;   /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;   /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;   /* columns i and i+1 of b, starting at row j */
+  FLOAT x00, x10, x01, x11;
+
+  for (j = 0; j < m; j += 2) {
+    src0 = a + j * lda;
+    dst0 = b + j * m;
+
+    if (m - j == 1) {
+      /* Odd m: the last column is processed on its own. */
+      for (i = 0; i < j; i += 2) {
+        x00 = src0[i];
+        x10 = src0[i + 1];
+
+        dst0[i]     = x00;
+        dst0[i + 1] = x10;
+
+        mir0 = b + i * m + j;
+        mir1 = mir0 + m;
+        mir0[0] = x00;
+        mir1[0] = x10;
+      }
+
+      x00 = src0[j];
+      dst0[j] = x00;
+      continue;
+    }
+
+    src1 = src0 + lda;
+    dst1 = dst0 + m;
+
+    /* Full 2x2 blocks above the diagonal block. */
+    for (i = 0; i < j; i += 2) {
+      x00 = src0[i];
+      x10 = src0[i + 1];
+      x01 = src1[i];
+      x11 = src1[i + 1];
+
+      dst0[i]     = x00;
+      dst0[i + 1] = x10;
+      dst1[i]     = x01;
+      dst1[i + 1] = x11;
+
+      mir0 = b + i * m + j;
+      mir1 = mir0 + m;
+
+      mir0[0] = x00;
+      mir0[1] = x01;
+      mir1[0] = x10;
+      mir1[1] = x11;
+    }
+
+    /* 2x2 diagonal block; a(j+1, j) is never read. */
+    x00 = src0[j];
+    x01 = src1[j];
+    x11 = src1[j + 1];
+
+    dst0[j]     = x00;
+    dst0[j + 1] = x01;
+    dst1[j]     = x01;
+    dst1[j + 1] = x11;
+  }
+}
+
+
+/*
+ * ZSYMCOPY_L: two-FLOATs-per-entry counterpart of the lower-triangle
+ * symmetric expansion.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; each entry occupies two consecutive FLOATs
+ *         (presumably real and imaginary parts); only entries on or below
+ *         the diagonal are read
+ *   lda - distance between consecutive columns of a, counted in entries
+ *   b   - column-major destination with a column stride of m entries
+ *
+ * Entries read strictly below the diagonal are stored unchanged at their own
+ * position and at the transposed position.
+ */
+static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  const BLASLONG astride = 2 * lda;   /* FLOATs between columns of a */
+  const BLASLONG bstride = 2 * m;     /* FLOATs between columns of b */
+  BLASLONG i, j, r, d;
+  FLOAT *src0, *src1;     /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;     /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;     /* columns i and i+1 of b, starting at row j */
+  FLOAT p0, p1, p2, p3;   /* two consecutive entries of column j */
+  FLOAT q0, q1, q2, q3;   /* two consecutive entries of column j+1 */
+
+  for (j = 0; j < m; j += 2) {
+    d    = 2 * j;           /* FLOAT offset of row j inside a column */
+    src0 = a + j * astride;
+    dst0 = b + j * bstride;
+
+    if (m - j == 1) {
+      /* Odd m: the last column contributes only its diagonal entry. */
+      p0 = src0[d];  p1 = src0[d + 1];
+      dst0[d]     = p0;
+      dst0[d + 1] = p1;
+      continue;
+    }
+
+    src1 = src0 + astride;
+    dst1 = dst0 + bstride;
+
+    /* 2x2 diagonal block; entry (j, j+1) of a is never read. */
+    p0 = src0[d];      p1 = src0[d + 1];
+    p2 = src0[d + 2];  p3 = src0[d + 3];
+    q2 = src1[d + 2];  q3 = src1[d + 3];
+
+    dst0[d]     = p0;  dst0[d + 1] = p1;
+    dst0[d + 2] = p2;  dst0[d + 3] = p3;
+
+    dst1[d]     = p2;  dst1[d + 1] = p3;
+    dst1[d + 2] = q2;  dst1[d + 3] = q3;
+
+    /* Full 2x2 blocks below the diagonal block. */
+    for (i = j + 2; i + 1 < m; i += 2) {
+      r = 2 * i;
+
+      p0 = src0[r];      p1 = src0[r + 1];
+      p2 = src0[r + 2];  p3 = src0[r + 3];
+
+      q0 = src1[r];      q1 = src1[r + 1];
+      q2 = src1[r + 2];  q3 = src1[r + 3];
+
+      dst0[r]     = p0;  dst0[r + 1] = p1;
+      dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+      dst1[r]     = q0;  dst1[r + 1] = q1;
+      dst1[r + 2] = q2;  dst1[r + 3] = q3;
+
+      mir0 = b + i * bstride + d;
+      mir1 = mir0 + bstride;
+
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+
+      mir1[0] = p2;  mir1[1] = p3;
+      mir1[2] = q2;  mir1[3] = q3;
+    }
+
+    /* Odd m leaves a single 1x2 strip in the last row. */
+    if (i < m) {
+      r = 2 * i;
+
+      p0 = src0[r];  p1 = src0[r + 1];
+      q0 = src1[r];  q1 = src1[r + 1];
+
+      dst0[r] = p0;  dst0[r + 1] = p1;
+      dst1[r] = q0;  dst1[r + 1] = q1;
+
+      mir0 = b + i * bstride + d;
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+    }
+  }
+}
+
+/*
+ * ZSYMCOPY_U: two-FLOATs-per-entry counterpart of the upper-triangle
+ * symmetric expansion.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; each entry occupies two consecutive FLOATs
+ *         (presumably real and imaginary parts); only entries on or above
+ *         the diagonal are read
+ *   lda - distance between consecutive columns of a, counted in entries
+ *   b   - column-major destination with a column stride of m entries
+ *
+ * Entries read strictly above the diagonal are stored unchanged at their own
+ * position and at the transposed position.
+ */
+static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  const BLASLONG astride = 2 * lda;   /* FLOATs between columns of a */
+  const BLASLONG bstride = 2 * m;     /* FLOATs between columns of b */
+  BLASLONG i, j, r, d;
+  FLOAT *src0, *src1;     /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;     /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;     /* columns i and i+1 of b, starting at row j */
+  FLOAT p0, p1, p2, p3;   /* two consecutive entries of column j */
+  FLOAT q0, q1, q2, q3;   /* two consecutive entries of column j+1 */
+
+  for (j = 0; j < m; j += 2) {
+    d    = 2 * j;           /* FLOAT offset of row j inside a column */
+    src0 = a + j * astride;
+    dst0 = b + j * bstride;
+
+    if (m - j == 1) {
+      /* Odd m: the last column is processed on its own. */
+      for (i = 0; i < j; i += 2) {
+        r = 2 * i;
+
+        p0 = src0[r];      p1 = src0[r + 1];
+        p2 = src0[r + 2];  p3 = src0[r + 3];
+
+        dst0[r]     = p0;  dst0[r + 1] = p1;
+        dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+        mir0 = b + i * bstride + d;
+        mir1 = mir0 + bstride;
+
+        mir0[0] = p0;  mir0[1] = p1;
+        mir1[0] = p2;  mir1[1] = p3;
+      }
+
+      p0 = src0[d];  p1 = src0[d + 1];
+      dst0[d]     = p0;
+      dst0[d + 1] = p1;
+      continue;
+    }
+
+    src1 = src0 + astride;
+    dst1 = dst0 + bstride;
+
+    /* Full 2x2 blocks above the diagonal block. */
+    for (i = 0; i < j; i += 2) {
+      r = 2 * i;
+
+      p0 = src0[r];      p1 = src0[r + 1];
+      p2 = src0[r + 2];  p3 = src0[r + 3];
+
+      q0 = src1[r];      q1 = src1[r + 1];
+      q2 = src1[r + 2];  q3 = src1[r + 3];
+
+      dst0[r]     = p0;  dst0[r + 1] = p1;
+      dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+      dst1[r]     = q0;  dst1[r + 1] = q1;
+      dst1[r + 2] = q2;  dst1[r + 3] = q3;
+
+      mir0 = b + i * bstride + d;
+      mir1 = mir0 + bstride;
+
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+
+      mir1[0] = p2;  mir1[1] = p3;
+      mir1[2] = q2;  mir1[3] = q3;
+    }
+
+    /* 2x2 diagonal block; entry (j+1, j) of a is never read. */
+    p0 = src0[d];      p1 = src0[d + 1];
+
+    q0 = src1[d];      q1 = src1[d + 1];
+    q2 = src1[d + 2];  q3 = src1[d + 3];
+
+    dst0[d]     = p0;  dst0[d + 1] = p1;
+    dst0[d + 2] = q0;  dst0[d + 3] = q1;
+
+    dst1[d]     = q0;  dst1[d + 1] = q1;
+    dst1[d + 2] = q2;  dst1[d + 3] = q3;
+  }
+}
+
+/*
+ * ZHEMCOPY_L: expand the lower triangle of a into a dense m-by-m matrix b,
+ * Hermitian style.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; each entry occupies two consecutive FLOATs
+ *         (presumably real and imaginary parts); only entries on or below
+ *         the diagonal are read
+ *   lda - distance between consecutive columns of a, counted in entries
+ *   b   - column-major destination with a column stride of m entries
+ *
+ * Entries strictly below the diagonal are copied as read; the transposed
+ * position receives the same entry with its second FLOAT negated.  For
+ * diagonal entries only the first FLOAT is read and the second FLOAT is
+ * written as zero.
+ */
+static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  const BLASLONG astride = 2 * lda;   /* FLOATs between columns of a */
+  const BLASLONG bstride = 2 * m;     /* FLOATs between columns of b */
+  BLASLONG i, j, r, d;
+  FLOAT *src0, *src1;     /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;     /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;     /* columns i and i+1 of b, starting at row j */
+  FLOAT p0, p1, p2, p3;   /* two consecutive entries of column j */
+  FLOAT q0, q1, q2, q3;   /* two consecutive entries of column j+1 */
+
+  for (j = 0; j < m; j += 2) {
+    d    = 2 * j;           /* FLOAT offset of row j inside a column */
+    src0 = a + j * astride;
+    dst0 = b + j * bstride;
+
+    if (m - j == 1) {
+      /* Odd m: the last column contributes only its diagonal entry. */
+      p0 = src0[d];
+      dst0[d]     = p0;
+      dst0[d + 1] = 0.;
+      continue;
+    }
+
+    src1 = src0 + astride;
+    dst1 = dst0 + bstride;
+
+    /* 2x2 diagonal block: second FLOAT of both diagonal entries is skipped. */
+    p0 = src0[d];
+    p2 = src0[d + 2];  p3 = src0[d + 3];
+    q2 = src1[d + 2];
+
+    dst0[d]     = p0;  dst0[d + 1] = 0.;
+    dst0[d + 2] = p2;  dst0[d + 3] = p3;
+
+    dst1[d]     = p2;  dst1[d + 1] = -p3;
+    dst1[d + 2] = q2;  dst1[d + 3] = 0.;
+
+    /* Full 2x2 blocks below the diagonal block. */
+    for (i = j + 2; i + 1 < m; i += 2) {
+      r = 2 * i;
+
+      p0 = src0[r];      p1 = src0[r + 1];
+      p2 = src0[r + 2];  p3 = src0[r + 3];
+
+      q0 = src1[r];      q1 = src1[r + 1];
+      q2 = src1[r + 2];  q3 = src1[r + 3];
+
+      dst0[r]     = p0;  dst0[r + 1] = p1;
+      dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+      dst1[r]     = q0;  dst1[r + 1] = q1;
+      dst1[r + 2] = q2;  dst1[r + 3] = q3;
+
+      mir0 = b + i * bstride + d;
+      mir1 = mir0 + bstride;
+
+      mir0[0] = p0;  mir0[1] = -p1;
+      mir0[2] = q0;  mir0[3] = -q1;
+
+      mir1[0] = p2;  mir1[1] = -p3;
+      mir1[2] = q2;  mir1[3] = -q3;
+    }
+
+    /* Odd m leaves a single 1x2 strip in the last row. */
+    if (i < m) {
+      r = 2 * i;
+
+      p0 = src0[r];  p1 = src0[r + 1];
+      q0 = src1[r];  q1 = src1[r + 1];
+
+      dst0[r] = p0;  dst0[r + 1] = p1;
+      dst1[r] = q0;  dst1[r + 1] = q1;
+
+      mir0 = b + i * bstride + d;
+      mir0[0] = p0;  mir0[1] = -p1;
+      mir0[2] = q0;  mir0[3] = -q1;
+    }
+  }
+}
+
+/*
+ * ZHEMCOPY_U: expand the upper triangle of a into a dense m-by-m matrix b,
+ * Hermitian style.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; each entry occupies two consecutive FLOATs
+ *         (presumably real and imaginary parts); only entries on or above
+ *         the diagonal are read
+ *   lda - distance between consecutive columns of a, counted in entries
+ *   b   - column-major destination with a column stride of m entries
+ *
+ * Entries strictly above the diagonal are copied as read; the transposed
+ * position receives the same entry with its second FLOAT negated.  For
+ * diagonal entries only the first FLOAT is read and the second FLOAT is
+ * written as zero.
+ */
+static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  const BLASLONG astride = 2 * lda;   /* FLOATs between columns of a */
+  const BLASLONG bstride = 2 * m;     /* FLOATs between columns of b */
+  BLASLONG i, j, r, d;
+  FLOAT *src0, *src1;     /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;     /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;     /* columns i and i+1 of b, starting at row j */
+  FLOAT p0, p1, p2, p3;   /* two consecutive entries of column j */
+  FLOAT q0, q1, q2, q3;   /* two consecutive entries of column j+1 */
+
+  for (j = 0; j < m; j += 2) {
+    d    = 2 * j;           /* FLOAT offset of row j inside a column */
+    src0 = a + j * astride;
+    dst0 = b + j * bstride;
+
+    if (m - j == 1) {
+      /* Odd m: the last column is processed on its own. */
+      for (i = 0; i < j; i += 2) {
+        r = 2 * i;
+
+        p0 = src0[r];      p1 = src0[r + 1];
+        p2 = src0[r + 2];  p3 = src0[r + 3];
+
+        dst0[r]     = p0;  dst0[r + 1] = p1;
+        dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+        mir0 = b + i * bstride + d;
+        mir1 = mir0 + bstride;
+
+        mir0[0] = p0;  mir0[1] = -p1;
+        mir1[0] = p2;  mir1[1] = -p3;
+      }
+
+      p0 = src0[d];
+      dst0[d]     = p0;
+      dst0[d + 1] = 0.;
+      continue;
+    }
+
+    src1 = src0 + astride;
+    dst1 = dst0 + bstride;
+
+    /* Full 2x2 blocks above the diagonal block. */
+    for (i = 0; i < j; i += 2) {
+      r = 2 * i;
+
+      p0 = src0[r];      p1 = src0[r + 1];
+      p2 = src0[r + 2];  p3 = src0[r + 3];
+
+      q0 = src1[r];      q1 = src1[r + 1];
+      q2 = src1[r + 2];  q3 = src1[r + 3];
+
+      dst0[r]     = p0;  dst0[r + 1] = p1;
+      dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+      dst1[r]     = q0;  dst1[r + 1] = q1;
+      dst1[r + 2] = q2;  dst1[r + 3] = q3;
+
+      mir0 = b + i * bstride + d;
+      mir1 = mir0 + bstride;
+
+      mir0[0] = p0;  mir0[1] = -p1;
+      mir0[2] = q0;  mir0[3] = -q1;
+
+      mir1[0] = p2;  mir1[1] = -p3;
+      mir1[2] = q2;  mir1[3] = -q3;
+    }
+
+    /* 2x2 diagonal block: second FLOAT of both diagonal entries is skipped. */
+    p0 = src0[d];
+
+    q0 = src1[d];      q1 = src1[d + 1];
+    q2 = src1[d + 2];
+
+    dst0[d]     = p0;  dst0[d + 1] = 0.;
+    dst0[d + 2] = q0;  dst0[d + 3] = -q1;
+
+    dst1[d]     = q0;  dst1[d + 1] = q1;
+    dst1[d + 2] = q2;  dst1[d + 3] = 0.;
+  }
+}
+
+
+/*
+ * ZHEMCOPY_M: expand the lower triangle of a into a dense m-by-m matrix b,
+ * with the sign pattern reversed relative to a plain copy of the triangle.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; each entry occupies two consecutive FLOATs
+ *         (presumably real and imaginary parts); only entries on or below
+ *         the diagonal are read
+ *   lda - distance between consecutive columns of a, counted in entries
+ *   b   - column-major destination with a column stride of m entries
+ *
+ * Entries strictly below the diagonal are stored at their own position with
+ * the second FLOAT negated, and at the transposed position exactly as read.
+ * For diagonal entries only the first FLOAT is read and the second FLOAT is
+ * written as zero.
+ */
+static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  const BLASLONG astride = 2 * lda;   /* FLOATs between columns of a */
+  const BLASLONG bstride = 2 * m;     /* FLOATs between columns of b */
+  BLASLONG i, j, r, d;
+  FLOAT *src0, *src1;     /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;     /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;     /* columns i and i+1 of b, starting at row j */
+  FLOAT p0, p1, p2, p3;   /* two consecutive entries of column j */
+  FLOAT q0, q1, q2, q3;   /* two consecutive entries of column j+1 */
+
+  for (j = 0; j < m; j += 2) {
+    d    = 2 * j;           /* FLOAT offset of row j inside a column */
+    src0 = a + j * astride;
+    dst0 = b + j * bstride;
+
+    if (m - j == 1) {
+      /* Odd m: the last column contributes only its diagonal entry. */
+      p0 = src0[d];
+      dst0[d]     = p0;
+      dst0[d + 1] = 0.;
+      continue;
+    }
+
+    src1 = src0 + astride;
+    dst1 = dst0 + bstride;
+
+    /* 2x2 diagonal block: second FLOAT of both diagonal entries is skipped. */
+    p0 = src0[d];
+    p2 = src0[d + 2];  p3 = src0[d + 3];
+    q2 = src1[d + 2];
+
+    dst0[d]     = p0;  dst0[d + 1] = 0.;
+    dst0[d + 2] = p2;  dst0[d + 3] = -p3;
+
+    dst1[d]     = p2;  dst1[d + 1] = p3;
+    dst1[d + 2] = q2;  dst1[d + 3] = 0.;
+
+    /* Full 2x2 blocks below the diagonal block. */
+    for (i = j + 2; i + 1 < m; i += 2) {
+      r = 2 * i;
+
+      p0 = src0[r];      p1 = src0[r + 1];
+      p2 = src0[r + 2];  p3 = src0[r + 3];
+
+      q0 = src1[r];      q1 = src1[r + 1];
+      q2 = src1[r + 2];  q3 = src1[r + 3];
+
+      dst0[r]     = p0;  dst0[r + 1] = -p1;
+      dst0[r + 2] = p2;  dst0[r + 3] = -p3;
+
+      dst1[r]     = q0;  dst1[r + 1] = -q1;
+      dst1[r + 2] = q2;  dst1[r + 3] = -q3;
+
+      mir0 = b + i * bstride + d;
+      mir1 = mir0 + bstride;
+
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+
+      mir1[0] = p2;  mir1[1] = p3;
+      mir1[2] = q2;  mir1[3] = q3;
+    }
+
+    /* Odd m leaves a single 1x2 strip in the last row. */
+    if (i < m) {
+      r = 2 * i;
+
+      p0 = src0[r];  p1 = src0[r + 1];
+      q0 = src1[r];  q1 = src1[r + 1];
+
+      dst0[r] = p0;  dst0[r + 1] = -p1;
+      dst1[r] = q0;  dst1[r + 1] = -q1;
+
+      mir0 = b + i * bstride + d;
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+    }
+  }
+}
+
+/*
+ * ZHEMCOPY_V: expand the upper triangle of a into a dense m-by-m matrix b,
+ * with the sign pattern reversed relative to a plain copy of the triangle.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; each entry occupies two consecutive FLOATs
+ *         (presumably real and imaginary parts); only entries on or above
+ *         the diagonal are read
+ *   lda - distance between consecutive columns of a, counted in entries
+ *   b   - column-major destination with a column stride of m entries
+ *
+ * Entries strictly above the diagonal are stored at their own position with
+ * the second FLOAT negated, and at the transposed position exactly as read.
+ * For diagonal entries only the first FLOAT is read and the second FLOAT is
+ * written as zero.
+ */
+static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  const BLASLONG astride = 2 * lda;   /* FLOATs between columns of a */
+  const BLASLONG bstride = 2 * m;     /* FLOATs between columns of b */
+  BLASLONG i, j, r, d;
+  FLOAT *src0, *src1;     /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;     /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;     /* columns i and i+1 of b, starting at row j */
+  FLOAT p0, p1, p2, p3;   /* two consecutive entries of column j */
+  FLOAT q0, q1, q2, q3;   /* two consecutive entries of column j+1 */
+
+  for (j = 0; j < m; j += 2) {
+    d    = 2 * j;           /* FLOAT offset of row j inside a column */
+    src0 = a + j * astride;
+    dst0 = b + j * bstride;
+
+    if (m - j == 1) {
+      /* Odd m: the last column is processed on its own. */
+      for (i = 0; i < j; i += 2) {
+        r = 2 * i;
+
+        p0 = src0[r];      p1 = src0[r + 1];
+        p2 = src0[r + 2];  p3 = src0[r + 3];
+
+        dst0[r]     = p0;  dst0[r + 1] = -p1;
+        dst0[r + 2] = p2;  dst0[r + 3] = -p3;
+
+        mir0 = b + i * bstride + d;
+        mir1 = mir0 + bstride;
+
+        mir0[0] = p0;  mir0[1] = p1;
+        mir1[0] = p2;  mir1[1] = p3;
+      }
+
+      p0 = src0[d];
+      dst0[d]     = p0;
+      dst0[d + 1] = 0.;
+      continue;
+    }
+
+    src1 = src0 + astride;
+    dst1 = dst0 + bstride;
+
+    /* Full 2x2 blocks above the diagonal block. */
+    for (i = 0; i < j; i += 2) {
+      r = 2 * i;
+
+      p0 = src0[r];      p1 = src0[r + 1];
+      p2 = src0[r + 2];  p3 = src0[r + 3];
+
+      q0 = src1[r];      q1 = src1[r + 1];
+      q2 = src1[r + 2];  q3 = src1[r + 3];
+
+      dst0[r]     = p0;  dst0[r + 1] = -p1;
+      dst0[r + 2] = p2;  dst0[r + 3] = -p3;
+
+      dst1[r]     = q0;  dst1[r + 1] = -q1;
+      dst1[r + 2] = q2;  dst1[r + 3] = -q3;
+
+      mir0 = b + i * bstride + d;
+      mir1 = mir0 + bstride;
+
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+
+      mir1[0] = p2;  mir1[1] = p3;
+      mir1[2] = q2;  mir1[3] = q3;
+    }
+
+    /* 2x2 diagonal block: second FLOAT of both diagonal entries is skipped. */
+    p0 = src0[d];
+
+    q0 = src1[d];      q1 = src1[d + 1];
+    q2 = src1[d + 2];
+
+    dst0[d]     = p0;  dst0[d + 1] = 0.;
+    dst0[d + 2] = q0;  dst0[d + 3] = q1;
+
+    dst1[d]     = q0;  dst1[d + 1] = -q1;
+    dst1[d + 2] = q2;  dst1[d + 3] = 0.;
+  }
+}
+
+
+/*
+ * TRMCOPY_NL: copy the lower triangle of a into the dense m-by-m buffer b
+ * and mirror it across the diagonal.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; only entries on or below the diagonal are read
+ *   lda - distance between consecutive columns of a, in FLOATs
+ *   b   - column-major destination whose column stride is m
+ *
+ * NOTE(review): nothing in this body zeroes the opposite triangle or treats
+ * the diagonal specially; entries read below the diagonal are written at
+ * both their own and the transposed position.
+ */
+static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  BLASLONG i, j;
+  FLOAT *src0, *src1;   /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;   /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;   /* columns i and i+1 of b, starting at row j */
+  FLOAT x00, x10, x01, x11;
+
+  for (j = 0; j < m; j += 2) {
+    src0 = a + j * lda;
+    dst0 = b + j * m;
+
+    if (m - j == 1) {
+      /* Odd m: the last column contributes only its diagonal entry. */
+      x00 = src0[j];
+      dst0[j] = x00;
+      continue;
+    }
+
+    src1 = src0 + lda;
+    dst1 = dst0 + m;
+
+    /* 2x2 diagonal block; a(j, j+1) is never read. */
+    x00 = src0[j];
+    x10 = src0[j + 1];
+    x11 = src1[j + 1];
+
+    dst0[j]     = x00;
+    dst0[j + 1] = x10;
+    dst1[j]     = x10;
+    dst1[j + 1] = x11;
+
+    /* Full 2x2 blocks below the diagonal block. */
+    for (i = j + 2; i + 1 < m; i += 2) {
+      x00 = src0[i];
+      x10 = src0[i + 1];
+      x01 = src1[i];
+      x11 = src1[i + 1];
+
+      dst0[i]     = x00;
+      dst0[i + 1] = x10;
+      dst1[i]     = x01;
+      dst1[i + 1] = x11;
+
+      mir0 = b + i * m + j;
+      mir1 = mir0 + m;
+
+      mir0[0] = x00;
+      mir0[1] = x01;
+      mir1[0] = x10;
+      mir1[1] = x11;
+    }
+
+    /* Odd m leaves a single 1x2 strip in the last row. */
+    if (i < m) {
+      x00 = src0[i];
+      x01 = src1[i];
+
+      dst0[i] = x00;
+      dst1[i] = x01;
+
+      mir0 = b + i * m + j;
+      mir0[0] = x00;
+      mir0[1] = x01;
+    }
+  }
+}
+
+/*
+ * TRMCOPY_TL: copy the lower triangle of a into the dense m-by-m buffer b
+ * and mirror it across the diagonal.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; only entries on or below the diagonal are read
+ *   lda - distance between consecutive columns of a, in FLOATs
+ *   b   - column-major destination whose column stride is m
+ *
+ * NOTE(review): no transposition-specific handling is visible in this body;
+ * entries read below the diagonal are written at both their own and the
+ * transposed position, and neither triangle is zeroed.
+ */
+static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  BLASLONG i, j;
+  FLOAT *src0, *src1;   /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;   /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;   /* columns i and i+1 of b, starting at row j */
+  FLOAT x00, x10, x01, x11;
+
+  for (j = 0; j < m; j += 2) {
+    src0 = a + j * lda;
+    dst0 = b + j * m;
+
+    if (m - j == 1) {
+      /* Odd m: the last column contributes only its diagonal entry. */
+      x00 = src0[j];
+      dst0[j] = x00;
+      continue;
+    }
+
+    src1 = src0 + lda;
+    dst1 = dst0 + m;
+
+    /* 2x2 diagonal block; a(j, j+1) is never read. */
+    x00 = src0[j];
+    x10 = src0[j + 1];
+    x11 = src1[j + 1];
+
+    dst0[j]     = x00;
+    dst0[j + 1] = x10;
+    dst1[j]     = x10;
+    dst1[j + 1] = x11;
+
+    /* Full 2x2 blocks below the diagonal block. */
+    for (i = j + 2; i + 1 < m; i += 2) {
+      x00 = src0[i];
+      x10 = src0[i + 1];
+      x01 = src1[i];
+      x11 = src1[i + 1];
+
+      dst0[i]     = x00;
+      dst0[i + 1] = x10;
+      dst1[i]     = x01;
+      dst1[i + 1] = x11;
+
+      mir0 = b + i * m + j;
+      mir1 = mir0 + m;
+
+      mir0[0] = x00;
+      mir0[1] = x01;
+      mir1[0] = x10;
+      mir1[1] = x11;
+    }
+
+    /* Odd m leaves a single 1x2 strip in the last row. */
+    if (i < m) {
+      x00 = src0[i];
+      x01 = src1[i];
+
+      dst0[i] = x00;
+      dst1[i] = x01;
+
+      mir0 = b + i * m + j;
+      mir0[0] = x00;
+      mir0[1] = x01;
+    }
+  }
+}
+
+/*
+ * TRMCOPY_NU: copy the upper triangle of a into the dense m-by-m buffer b
+ * and mirror it across the diagonal.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; only entries on or above the diagonal are read
+ *   lda - distance between consecutive columns of a, in FLOATs
+ *   b   - column-major destination whose column stride is m
+ *
+ * NOTE(review): nothing in this body zeroes the opposite triangle or treats
+ * the diagonal specially; entries read above the diagonal are written at
+ * both their own and the transposed position.
+ */
+static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  BLASLONG i, j;
+  FLOAT *src0, *src1;   /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;   /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;   /* columns i and i+1 of b, starting at row j */
+  FLOAT x00, x10, x01, x11;
+
+  for (j = 0; j < m; j += 2) {
+    src0 = a + j * lda;
+    dst0 = b + j * m;
+
+    if (m - j == 1) {
+      /* Odd m: the last column is processed on its own. */
+      for (i = 0; i < j; i += 2) {
+        x00 = src0[i];
+        x10 = src0[i + 1];
+
+        dst0[i]     = x00;
+        dst0[i + 1] = x10;
+
+        mir0 = b + i * m + j;
+        mir1 = mir0 + m;
+        mir0[0] = x00;
+        mir1[0] = x10;
+      }
+
+      x00 = src0[j];
+      dst0[j] = x00;
+      continue;
+    }
+
+    src1 = src0 + lda;
+    dst1 = dst0 + m;
+
+    /* Full 2x2 blocks above the diagonal block. */
+    for (i = 0; i < j; i += 2) {
+      x00 = src0[i];
+      x10 = src0[i + 1];
+      x01 = src1[i];
+      x11 = src1[i + 1];
+
+      dst0[i]     = x00;
+      dst0[i + 1] = x10;
+      dst1[i]     = x01;
+      dst1[i + 1] = x11;
+
+      mir0 = b + i * m + j;
+      mir1 = mir0 + m;
+
+      mir0[0] = x00;
+      mir0[1] = x01;
+      mir1[0] = x10;
+      mir1[1] = x11;
+    }
+
+    /* 2x2 diagonal block; a(j+1, j) is never read. */
+    x00 = src0[j];
+    x01 = src1[j];
+    x11 = src1[j + 1];
+
+    dst0[j]     = x00;
+    dst0[j + 1] = x01;
+    dst1[j]     = x01;
+    dst1[j + 1] = x11;
+  }
+}
+
+/*
+ * TRMCOPY_TU: copy the upper triangle of a into the dense m-by-m buffer b
+ * and mirror it across the diagonal.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; only entries on or above the diagonal are read
+ *   lda - distance between consecutive columns of a, in FLOATs
+ *   b   - column-major destination whose column stride is m
+ *
+ * NOTE(review): no transposition-specific handling is visible in this body;
+ * entries read above the diagonal are written at both their own and the
+ * transposed position, and neither triangle is zeroed.
+ */
+static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  BLASLONG i, j;
+  FLOAT *src0, *src1;   /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;   /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;   /* columns i and i+1 of b, starting at row j */
+  FLOAT x00, x10, x01, x11;
+
+  for (j = 0; j < m; j += 2) {
+    src0 = a + j * lda;
+    dst0 = b + j * m;
+
+    if (m - j == 1) {
+      /* Odd m: the last column is processed on its own. */
+      for (i = 0; i < j; i += 2) {
+        x00 = src0[i];
+        x10 = src0[i + 1];
+
+        dst0[i]     = x00;
+        dst0[i + 1] = x10;
+
+        mir0 = b + i * m + j;
+        mir1 = mir0 + m;
+        mir0[0] = x00;
+        mir1[0] = x10;
+      }
+
+      x00 = src0[j];
+      dst0[j] = x00;
+      continue;
+    }
+
+    src1 = src0 + lda;
+    dst1 = dst0 + m;
+
+    /* Full 2x2 blocks above the diagonal block. */
+    for (i = 0; i < j; i += 2) {
+      x00 = src0[i];
+      x10 = src0[i + 1];
+      x01 = src1[i];
+      x11 = src1[i + 1];
+
+      dst0[i]     = x00;
+      dst0[i + 1] = x10;
+      dst1[i]     = x01;
+      dst1[i + 1] = x11;
+
+      mir0 = b + i * m + j;
+      mir1 = mir0 + m;
+
+      mir0[0] = x00;
+      mir0[1] = x01;
+      mir1[0] = x10;
+      mir1[1] = x11;
+    }
+
+    /* 2x2 diagonal block; a(j+1, j) is never read. */
+    x00 = src0[j];
+    x01 = src1[j];
+    x11 = src1[j + 1];
+
+    dst0[j]     = x00;
+    dst0[j + 1] = x01;
+    dst1[j]     = x01;
+    dst1[j + 1] = x11;
+  }
+}
+
+/*
+ * ZTRMCOPY_NL: copy the lower triangle of a (two FLOATs per entry) into the
+ * dense m-by-m buffer b and mirror it across the diagonal.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; each entry occupies two consecutive FLOATs
+ *         (presumably real and imaginary parts); only entries on or below
+ *         the diagonal are read
+ *   lda - distance between consecutive columns of a, counted in entries
+ *   b   - column-major destination with a column stride of m entries
+ *
+ * NOTE(review): nothing in this body zeroes the opposite triangle or treats
+ * the diagonal specially; entries are mirrored unchanged.
+ */
+static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  const BLASLONG astride = 2 * lda;   /* FLOATs between columns of a */
+  const BLASLONG bstride = 2 * m;     /* FLOATs between columns of b */
+  BLASLONG i, j, r, d;
+  FLOAT *src0, *src1;     /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;     /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;     /* columns i and i+1 of b, starting at row j */
+  FLOAT p0, p1, p2, p3;   /* two consecutive entries of column j */
+  FLOAT q0, q1, q2, q3;   /* two consecutive entries of column j+1 */
+
+  for (j = 0; j < m; j += 2) {
+    d    = 2 * j;           /* FLOAT offset of row j inside a column */
+    src0 = a + j * astride;
+    dst0 = b + j * bstride;
+
+    if (m - j == 1) {
+      /* Odd m: the last column contributes only its diagonal entry. */
+      p0 = src0[d];  p1 = src0[d + 1];
+      dst0[d]     = p0;
+      dst0[d + 1] = p1;
+      continue;
+    }
+
+    src1 = src0 + astride;
+    dst1 = dst0 + bstride;
+
+    /* 2x2 diagonal block; entry (j, j+1) of a is never read. */
+    p0 = src0[d];      p1 = src0[d + 1];
+    p2 = src0[d + 2];  p3 = src0[d + 3];
+    q2 = src1[d + 2];  q3 = src1[d + 3];
+
+    dst0[d]     = p0;  dst0[d + 1] = p1;
+    dst0[d + 2] = p2;  dst0[d + 3] = p3;
+
+    dst1[d]     = p2;  dst1[d + 1] = p3;
+    dst1[d + 2] = q2;  dst1[d + 3] = q3;
+
+    /* Full 2x2 blocks below the diagonal block. */
+    for (i = j + 2; i + 1 < m; i += 2) {
+      r = 2 * i;
+
+      p0 = src0[r];      p1 = src0[r + 1];
+      p2 = src0[r + 2];  p3 = src0[r + 3];
+
+      q0 = src1[r];      q1 = src1[r + 1];
+      q2 = src1[r + 2];  q3 = src1[r + 3];
+
+      dst0[r]     = p0;  dst0[r + 1] = p1;
+      dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+      dst1[r]     = q0;  dst1[r + 1] = q1;
+      dst1[r + 2] = q2;  dst1[r + 3] = q3;
+
+      mir0 = b + i * bstride + d;
+      mir1 = mir0 + bstride;
+
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+
+      mir1[0] = p2;  mir1[1] = p3;
+      mir1[2] = q2;  mir1[3] = q3;
+    }
+
+    /* Odd m leaves a single 1x2 strip in the last row. */
+    if (i < m) {
+      r = 2 * i;
+
+      p0 = src0[r];  p1 = src0[r + 1];
+      q0 = src1[r];  q1 = src1[r + 1];
+
+      dst0[r] = p0;  dst0[r + 1] = p1;
+      dst1[r] = q0;  dst1[r + 1] = q1;
+
+      mir0 = b + i * bstride + d;
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+    }
+  }
+}
+
+/*
+ * ZTRMCOPY_TL: copy the lower triangle of a (two FLOATs per entry) into the
+ * dense m-by-m buffer b and mirror it across the diagonal.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; each entry occupies two consecutive FLOATs
+ *         (presumably real and imaginary parts); only entries on or below
+ *         the diagonal are read
+ *   lda - distance between consecutive columns of a, counted in entries
+ *   b   - column-major destination with a column stride of m entries
+ *
+ * NOTE(review): no transposition-specific handling is visible in this body;
+ * entries are mirrored unchanged and neither triangle is zeroed.
+ */
+static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  const BLASLONG astride = 2 * lda;   /* FLOATs between columns of a */
+  const BLASLONG bstride = 2 * m;     /* FLOATs between columns of b */
+  BLASLONG i, j, r, d;
+  FLOAT *src0, *src1;     /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;     /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;     /* columns i and i+1 of b, starting at row j */
+  FLOAT p0, p1, p2, p3;   /* two consecutive entries of column j */
+  FLOAT q0, q1, q2, q3;   /* two consecutive entries of column j+1 */
+
+  for (j = 0; j < m; j += 2) {
+    d    = 2 * j;           /* FLOAT offset of row j inside a column */
+    src0 = a + j * astride;
+    dst0 = b + j * bstride;
+
+    if (m - j == 1) {
+      /* Odd m: the last column contributes only its diagonal entry. */
+      p0 = src0[d];  p1 = src0[d + 1];
+      dst0[d]     = p0;
+      dst0[d + 1] = p1;
+      continue;
+    }
+
+    src1 = src0 + astride;
+    dst1 = dst0 + bstride;
+
+    /* 2x2 diagonal block; entry (j, j+1) of a is never read. */
+    p0 = src0[d];      p1 = src0[d + 1];
+    p2 = src0[d + 2];  p3 = src0[d + 3];
+    q2 = src1[d + 2];  q3 = src1[d + 3];
+
+    dst0[d]     = p0;  dst0[d + 1] = p1;
+    dst0[d + 2] = p2;  dst0[d + 3] = p3;
+
+    dst1[d]     = p2;  dst1[d + 1] = p3;
+    dst1[d + 2] = q2;  dst1[d + 3] = q3;
+
+    /* Full 2x2 blocks below the diagonal block. */
+    for (i = j + 2; i + 1 < m; i += 2) {
+      r = 2 * i;
+
+      p0 = src0[r];      p1 = src0[r + 1];
+      p2 = src0[r + 2];  p3 = src0[r + 3];
+
+      q0 = src1[r];      q1 = src1[r + 1];
+      q2 = src1[r + 2];  q3 = src1[r + 3];
+
+      dst0[r]     = p0;  dst0[r + 1] = p1;
+      dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+      dst1[r]     = q0;  dst1[r + 1] = q1;
+      dst1[r + 2] = q2;  dst1[r + 3] = q3;
+
+      mir0 = b + i * bstride + d;
+      mir1 = mir0 + bstride;
+
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+
+      mir1[0] = p2;  mir1[1] = p3;
+      mir1[2] = q2;  mir1[3] = q3;
+    }
+
+    /* Odd m leaves a single 1x2 strip in the last row. */
+    if (i < m) {
+      r = 2 * i;
+
+      p0 = src0[r];  p1 = src0[r + 1];
+      q0 = src1[r];  q1 = src1[r + 1];
+
+      dst0[r] = p0;  dst0[r + 1] = p1;
+      dst1[r] = q0;  dst1[r + 1] = q1;
+
+      mir0 = b + i * bstride + d;
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+    }
+  }
+}
+
+/*
+ * ZTRMCOPY_NU: copy the upper triangle of a (two FLOATs per entry) into the
+ * dense m-by-m buffer b and mirror it across the diagonal.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; each entry occupies two consecutive FLOATs
+ *         (presumably real and imaginary parts); only entries on or above
+ *         the diagonal are read
+ *   lda - distance between consecutive columns of a, counted in entries
+ *   b   - column-major destination with a column stride of m entries
+ *
+ * NOTE(review): nothing in this body zeroes the opposite triangle or treats
+ * the diagonal specially; entries are mirrored unchanged.
+ */
+static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  const BLASLONG astride = 2 * lda;   /* FLOATs between columns of a */
+  const BLASLONG bstride = 2 * m;     /* FLOATs between columns of b */
+  BLASLONG i, j, r, d;
+  FLOAT *src0, *src1;     /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;     /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;     /* columns i and i+1 of b, starting at row j */
+  FLOAT p0, p1, p2, p3;   /* two consecutive entries of column j */
+  FLOAT q0, q1, q2, q3;   /* two consecutive entries of column j+1 */
+
+  for (j = 0; j < m; j += 2) {
+    d    = 2 * j;           /* FLOAT offset of row j inside a column */
+    src0 = a + j * astride;
+    dst0 = b + j * bstride;
+
+    if (m - j == 1) {
+      /* Odd m: the last column is processed on its own. */
+      for (i = 0; i < j; i += 2) {
+        r = 2 * i;
+
+        p0 = src0[r];      p1 = src0[r + 1];
+        p2 = src0[r + 2];  p3 = src0[r + 3];
+
+        dst0[r]     = p0;  dst0[r + 1] = p1;
+        dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+        mir0 = b + i * bstride + d;
+        mir1 = mir0 + bstride;
+
+        mir0[0] = p0;  mir0[1] = p1;
+        mir1[0] = p2;  mir1[1] = p3;
+      }
+
+      p0 = src0[d];  p1 = src0[d + 1];
+      dst0[d]     = p0;
+      dst0[d + 1] = p1;
+      continue;
+    }
+
+    src1 = src0 + astride;
+    dst1 = dst0 + bstride;
+
+    /* Full 2x2 blocks above the diagonal block. */
+    for (i = 0; i < j; i += 2) {
+      r = 2 * i;
+
+      p0 = src0[r];      p1 = src0[r + 1];
+      p2 = src0[r + 2];  p3 = src0[r + 3];
+
+      q0 = src1[r];      q1 = src1[r + 1];
+      q2 = src1[r + 2];  q3 = src1[r + 3];
+
+      dst0[r]     = p0;  dst0[r + 1] = p1;
+      dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+      dst1[r]     = q0;  dst1[r + 1] = q1;
+      dst1[r + 2] = q2;  dst1[r + 3] = q3;
+
+      mir0 = b + i * bstride + d;
+      mir1 = mir0 + bstride;
+
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+
+      mir1[0] = p2;  mir1[1] = p3;
+      mir1[2] = q2;  mir1[3] = q3;
+    }
+
+    /* 2x2 diagonal block; entry (j+1, j) of a is never read. */
+    p0 = src0[d];      p1 = src0[d + 1];
+
+    q0 = src1[d];      q1 = src1[d + 1];
+    q2 = src1[d + 2];  q3 = src1[d + 3];
+
+    dst0[d]     = p0;  dst0[d + 1] = p1;
+    dst0[d + 2] = q0;  dst0[d + 3] = q1;
+
+    dst1[d]     = q0;  dst1[d + 1] = q1;
+    dst1[d + 2] = q2;  dst1[d + 3] = q3;
+  }
+}
+
+/*
+ * ZTRMCOPY_TU: copy the upper triangle of a (two FLOATs per entry) into the
+ * dense m-by-m buffer b and mirror it across the diagonal.
+ *
+ *   m   - order of the matrix
+ *   a   - column-major source; each entry occupies two consecutive FLOATs
+ *         (presumably real and imaginary parts); only entries on or above
+ *         the diagonal are read
+ *   lda - distance between consecutive columns of a, counted in entries
+ *   b   - column-major destination with a column stride of m entries
+ *
+ * NOTE(review): no transposition-specific handling is visible in this body;
+ * entries are mirrored unchanged and neither triangle is zeroed.
+ */
+static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
+  const BLASLONG astride = 2 * lda;   /* FLOATs between columns of a */
+  const BLASLONG bstride = 2 * m;     /* FLOATs between columns of b */
+  BLASLONG i, j, r, d;
+  FLOAT *src0, *src1;     /* columns j and j+1 of a */
+  FLOAT *dst0, *dst1;     /* columns j and j+1 of b */
+  FLOAT *mir0, *mir1;     /* columns i and i+1 of b, starting at row j */
+  FLOAT p0, p1, p2, p3;   /* two consecutive entries of column j */
+  FLOAT q0, q1, q2, q3;   /* two consecutive entries of column j+1 */
+
+  for (j = 0; j < m; j += 2) {
+    d    = 2 * j;           /* FLOAT offset of row j inside a column */
+    src0 = a + j * astride;
+    dst0 = b + j * bstride;
+
+    if (m - j == 1) {
+      /* Odd m: the last column is processed on its own. */
+      for (i = 0; i < j; i += 2) {
+        r = 2 * i;
+
+        p0 = src0[r];      p1 = src0[r + 1];
+        p2 = src0[r + 2];  p3 = src0[r + 3];
+
+        dst0[r]     = p0;  dst0[r + 1] = p1;
+        dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+        mir0 = b + i * bstride + d;
+        mir1 = mir0 + bstride;
+
+        mir0[0] = p0;  mir0[1] = p1;
+        mir1[0] = p2;  mir1[1] = p3;
+      }
+
+      p0 = src0[d];  p1 = src0[d + 1];
+      dst0[d]     = p0;
+      dst0[d + 1] = p1;
+      continue;
+    }
+
+    src1 = src0 + astride;
+    dst1 = dst0 + bstride;
+
+    /* Full 2x2 blocks above the diagonal block. */
+    for (i = 0; i < j; i += 2) {
+      r = 2 * i;
+
+      p0 = src0[r];      p1 = src0[r + 1];
+      p2 = src0[r + 2];  p3 = src0[r + 3];
+
+      q0 = src1[r];      q1 = src1[r + 1];
+      q2 = src1[r + 2];  q3 = src1[r + 3];
+
+      dst0[r]     = p0;  dst0[r + 1] = p1;
+      dst0[r + 2] = p2;  dst0[r + 3] = p3;
+
+      dst1[r]     = q0;  dst1[r + 1] = q1;
+      dst1[r + 2] = q2;  dst1[r + 3] = q3;
+
+      mir0 = b + i * bstride + d;
+      mir1 = mir0 + bstride;
+
+      mir0[0] = p0;  mir0[1] = p1;
+      mir0[2] = q0;  mir0[3] = q1;
+
+      mir1[0] = p2;  mir1[1] = p3;
+      mir1[2] = q2;  mir1[3] = q3;
+    }
+
+    /* 2x2 diagonal block; entry (j+1, j) of a is never read. */
+    p0 = src0[d];      p1 = src0[d + 1];
+
+    q0 = src1[d];      q1 = src1[d + 1];
+    q2 = src1[d + 2];  q3 = src1[d + 3];
+
+    dst0[d]     = p0;  dst0[d + 1] = p1;
+    dst0[d + 2] = q0;  dst0[d + 3] = q1;
+
+    dst1[d]     = q0;  dst1[d + 1] = q1;
+    dst1[d + 2] = q2;  dst1[d + 3] = q3;
+  }
+}
+
+#endif
+#endif
+