diff options
Diffstat (limited to 'kernel/sparc/zgemm_ncopy.S')
-rw-r--r-- | kernel/sparc/zgemm_ncopy.S | 250 |
1 files changed, 250 insertions, 0 deletions
diff --git a/kernel/sparc/zgemm_ncopy.S b/kernel/sparc/zgemm_ncopy.S new file mode 100644 index 000000000..2b0c398bf --- /dev/null +++ b/kernel/sparc/zgemm_ncopy.S @@ -0,0 +1,250 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 + +#define I %l4 +#define J %l5 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, ZBASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + sra M, 2, I + cmp I, 0 + + ble,pn %icc, .LL15 + add A2, LDA, A + +#define PREFETCHSIZE 36 +#define WPREFETCHSIZE 20 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + LDF [A1 + 2 * SIZE], c05 + LDF [A1 + 3 * SIZE], c06 + LDF [A2 + 2 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c09 + LDF [A1 + 5 * SIZE], c10 + LDF [A2 + 4 * SIZE], c11 + LDF [A2 + 5 * SIZE], c12 + + LDF [A1 + 6 * SIZE], c13 + LDF [A1 + 7 * SIZE], c14 + LDF [A2 + 6 * SIZE], c15 + LDF [A2 + 7 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + + STF c01, [B + 0 * SIZE] + add A1, 8 * SIZE, A1 + STF c02, [B + 1 * SIZE] + add A2, 8 * SIZE, A2 + STF c03, [B + 2 * SIZE] + add I, -1, I + STF c04, [B + 3 * SIZE] + cmp I, 0 + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] +#ifdef DOUBLE + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 +#endif + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + bg,pt %icc, .LL12 + add B, 16 * SIZE, B + +.LL15: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL16: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + add A1, 2 * SIZE, A1 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + add A2, 2 * SIZE, A2 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + bg,pt %icc, .LL16 + add B, 4 * SIZE, B + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL111: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL115 + mov A, A1 + + +.LL112: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + LDF [A1 + 4 * SIZE], c05 + LDF [A1 + 5 * SIZE], c06 + LDF [A1 + 6 * SIZE], c07 + LDF [A1 + 7 * SIZE], c08 + add A1, 8 * SIZE, A1 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + bg,pt %icc, .LL112 + add B, 8 * SIZE, B + +.LL115: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +.LL116: + LDF [A1 + 0 * SIZE], c01 + add I, -1, I + LDF [A1 + 1 * SIZE], c02 + add A1, 2 * SIZE, A1 + cmp I, 0 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + bg,pt %icc, .LL116 + add B, 2 * SIZE, B + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE |