diff options
author | Xianyi Zhang <traits.zhang@gmail.com> | 2011-01-24 14:54:24 +0000 |
---|---|---|
committer | Xianyi Zhang <traits.zhang@gmail.com> | 2011-01-24 14:54:24 +0000 |
commit | 342bbc3871d1b43f548e9d1ae9d380a1d4989cb3 (patch) | |
tree | 385fc6d27e282ae1bb06d685833c5325615c8169 /symcopy.h | |
download | openblas-342bbc3871d1b43f548e9d1ae9d380a1d4989cb3.tar.gz openblas-342bbc3871d1b43f548e9d1ae9d380a1d4989cb3.tar.bz2 openblas-342bbc3871d1b43f548e9d1ae9d380a1d4989cb3.zip |
Import GotoBLAS2 1.13 BSD version codes.
Diffstat (limited to 'symcopy.h')
-rw-r--r-- | symcopy.h | 1873 |
1 files changed, 1873 insertions, 0 deletions
diff --git a/symcopy.h b/symcopy.h new file mode 100644 index 000000000..ed6e5b417 --- /dev/null +++ b/symcopy.h @@ -0,0 +1,1873 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/* This implementation is completely wrong. I'll rewrite this */ + +#ifndef SYMCOPY_H +#define SYMCOPY_H + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + +static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 2; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m + 2; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2 * m + 2; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a21; + *(bb2 + 1) = a22; + aa1 += 2; + aa2 += 2; + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is --; + } + + is = ((m - js - 2) & 1); + + if (is == 1){ + a11 = *(aa1 + 0); + a12 = *(aa2 + 0); + + *(bb1 + 0) = a11; + *(bb2 + 0) = a12; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + + } +} + +static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a12; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + aa1 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(cc1 + 0) = a11; + *(cc2 + 0) = a21; + bb1 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + } +} + + +static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + a22 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = a22; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + + } +} + +static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + } +} + +static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = -a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = 0.; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = -a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = -a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = -a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = -a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + + } +} + +static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = -a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = -a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = -a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a12; + *(bb1 + 3) = -a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = 0.; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = -a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + } +} + + +static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = 0.; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = -a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + + } +} + +static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = -a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = 0.; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + } +} + + +static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 2; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m + 2; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2 * m + 2; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a21; + *(bb2 + 1) = a22; + aa1 += 2; + aa2 += 2; + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is --; + } + + is = ((m - js - 2) & 1); + + if (is == 1){ + a11 = *(aa1 + 0); + a12 = *(aa2 + 0); + + *(bb1 + 0) = a11; + *(bb2 + 0) = a12; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + + } +} + +static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 2; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m + 2; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2 * m + 2; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a21; + *(bb2 + 1) = a22; + aa1 += 2; + aa2 += 2; + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is --; + } + + is = ((m - js - 2) & 1); + + if (is == 1){ + a11 = *(aa1 + 0); + a12 = *(aa2 + 0); + + *(bb1 + 0) = a11; + *(bb2 + 0) = a12; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + + } +} + +static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a12; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + aa1 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(cc1 + 0) = a11; + *(cc2 + 0) = a21; + bb1 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + } +} + +static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a12; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + aa1 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(cc1 + 0) = a11; + *(cc2 + 0) = a21; + bb1 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + } +} + +static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + a22 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = a22; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + + } +} + +static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + a22 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = a22; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + + } +} + +static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + } +} + +static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + } +} + +#endif +#endif + |